Source code for speechbrain.utils.check_url

"""Libraries for automatic finding URLs in the files and checking if they are
reachable.

Authors
 * Mirco Ravanelli 2022
"""
import os
import re
import time
import requests
from tqdm.contrib import tqdm
from speechbrain.utils.data_utils import get_all_files


def get_url(path):
    """Search the specified file for URLs.

    Arguments
    ---------
    path: path
        Path of the file where to search for URLs.

    Returns
    -------
    urls: list
        De-duplicated list (in arbitrary order) of the regex matches found in
        the file. Each match is a tuple of the pattern's capture groups; the
        first element of the tuple is the full URL text. The list is empty
        when the file does not exist.
    """
    # A missing file is reported and contributes no URLs. Returning an empty
    # list (instead of False) keeps the return type uniform, so callers can
    # always iterate over the result.
    if not os.path.exists(path):
        print("File %s not found!" % (path))
        return []

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale; undecodable bytes are replaced rather than aborting
    # the scan.
    with open(path, "r", encoding="utf-8", errors="replace") as file:
        text = file.read()

    # Set up Regex for URL detection.
    # NOTE(review): inside the character class, `\+-=` is a range from "+" to
    # "=", so it also admits ",", ".", "/", digits, ":", ";" and "<".
    url_regex = re.compile(
        r"((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)",
        re.DOTALL,
    )

    # The pattern contains several groups, so findall yields one tuple per
    # match; the set removes duplicates found within the same file.
    urls = re.findall(url_regex, text)

    return list(set(urls))


def get_all_urls(file_lst, avoid_urls):
    """Search for all the URLs in the specified file list.

    Arguments
    ---------
    file_lst: list
        List of the files where to search for URLs.
    avoid_urls: list
        List of URLs to avoid. A cleaned-up URL that is equal to one of
        these entries is skipped.

    Returns
    -------
    urls: dict
        A dictionary where the keys are the detected (cleaned-up) URLs and
        the values are the lists of files where each URL was found.
    """
    all_urls = {}

    for path in file_lst:
        # Guard against a falsy, non-iterable result (e.g. False for a file
        # that could not be found): treat it as "no URLs in this file"
        # instead of failing while iterating.
        matches = get_url(path) or []

        for match in matches:
            # Each match is a tuple of regex groups; element 0 is the whole
            # URL. Cut at the first ")" so that a link written as
            # "(https://...)" does not keep the closing parenthesis.
            url = match[0].split(")")[0]

            # Drop ONE trailing punctuation/separator character. Only a
            # single character is removed on purpose: e.g. "http://" becomes
            # "http:/", which callers can list in avoid_urls.
            if url.endswith((".", ",", " ", "/")):
                url = url[:-1]

            if url in avoid_urls:
                continue

            all_urls.setdefault(url, []).append(path)
    return all_urls


def check_url(url, timeout=10):
    """Check if a URL is broken.

    A HEAD request is sent to the URL. The URL is considered broken when the
    request fails (connection error, timeout, malformed URL, ...) or when the
    server answers with status 404 or any status above 499. Every other
    status code is treated as reachable.

    Arguments
    ---------
    url: string
        URL to check.
    timeout: float
        Maximum number of seconds to wait for the server before giving up.
        Without a timeout a non-responding host could block forever.

    Returns
    -------
    Bool
        False if the URL is broken, True otherwise.
    """
    try:
        response = requests.head(url, timeout=timeout)
    except requests.RequestException:
        # Base class of the requests exceptions: covers connection errors as
        # well as timeouts, invalid URLs and redirect loops, so a single bad
        # link cannot abort a whole scan.
        return False

    status = response.status_code
    return not (status == 404 or status > 499)


def test_links(
    folder=".",
    match_or=[".py", ".md", ".txt"],
    exclude_or=[".pyc"],
    avoid_files=[""],
    avoid_urls=["http:/", "http://", "https:/", "https://"],
):
    """This test checks if the files in the specified folders contain broken URLs

    Arguments
    ---------
    folder: path
        The top Folder for searching for the files.
    match_or: list
        Used to specify the extensions of the files to check.
    exclude_or: list
        Used to avoid some file extensions.
    avoid_files: list
        Used to avoid testing some specific file. Entries are compared with
        the paths of the discovered files after path normalization; empty
        entries are ignored.
    avoid_urls: list
        URLs that must not be checked (by default, bare scheme prefixes that
        the URL detection may produce).

    Returns
    -------
    Bool
        True if all the checked URLs are reachable, False if at least one
        of them is down.
    """
    # NOTE: the list defaults above are only read, never mutated, in this
    # function.
    check_test = True

    # Find all the files that potentially contain urls
    file_lst = get_all_files(folder, match_or=match_or, exclude_or=exclude_or)

    # Drop the files the caller asked to skip. Paths are normalized on both
    # sides so that "./a/b.py" and "a/b.py" are considered the same file.
    # NOTE(review): assumes get_all_files returns path strings - confirm.
    skipped = {os.path.normpath(name) for name in (avoid_files or []) if name}
    if skipped:
        file_lst = [
            path for path in file_lst if os.path.normpath(path) not in skipped
        ]

    # Get urls for the list of files - unique list
    all_urls = get_all_urls(file_lst, avoid_urls)

    # Check all the urls
    with tqdm(all_urls) as all_urls_progressbar:
        for url in all_urls_progressbar:
            # Pause between requests (presumably to avoid hammering the
            # remote servers).
            time.sleep(1)
            if not check_url(url):
                check_test = False
                print("WARNING: %s is DOWN!" % (url))
                for path in all_urls[url]:
                    print("\t link detected in %s" % (path))
    return check_test