modm_data.dl.store

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4import os
 5import shutil
 6import logging
 7import tempfile
 8import subprocess
 9from pathlib import Path
10from urllib.request import urlopen, Request
11
12
# Module-level logger, named after this module.
13LOGGER = logging.getLogger(__name__)
# HTTP request headers that download_data() and download_file() hand to curl,
# one `-H 'key: value'` option per entry. The User-Agent string identifies as
# desktop Safari on macOS.
# NOTE(review): 'Host' is fixed to www.st.com and is sent for every URL passed
# to the download functions -- confirm that only www.st.com URLs are fetched
# through this module.
14_hdr = {
15    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
16    'Sec-Fetch-Site': 'none',
17    'Accept-Encoding': 'identity',
18    'Sec-Fetch-Mode': 'navigate',
19    'Host': 'www.st.com',
20    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
21    'Accept-Language': 'en-GB,en;q=0.9',
22    'Sec-Fetch-Dest': 'document',
23    'Connection': 'keep-alive',
24}
25
26
def download_data(url: str, encoding: str = None, errors: str = None) -> str:
    """
    Download and decode the data of a URL.

    The transfer is done by the external `curl` executable, which follows
    redirects, sends the module's `_hdr` request headers and is limited to
    120 seconds. curl's exit status is not checked: whatever curl wrote to
    its standard output is decoded and returned.

    :param url: URL to download
    :param encoding: optional encoding to apply (default is `utf-8`)
    :param errors: optional error handling (default is `ignore`)
    :return: The data as a decoded string.
    """
    LOGGER.debug(f"Downloading data from {url}")
    # Build an argument list and run curl without a shell: a URL or header
    # containing quotes, spaces or shell metacharacters can then neither break
    # the command line nor inject commands. `--url` keeps a URL that starts
    # with a dash from being parsed as a curl option.
    cmd = ["curl", "--url", url, "-L", "-s", "--max-time", "120", "-o", "-"]
    for key, value in _hdr.items():
        cmd += ["-H", f"{key}: {value}"]
    try:
        data = subprocess.run(cmd, stdout=subprocess.PIPE).stdout
    except OSError as error:
        # Without a shell a missing curl executable raises instead of merely
        # producing no output; keep the "no data" result callers already get.
        LOGGER.error(f"Unable to run curl: {error}")
        data = b""
    return data.decode(encoding=encoding or "utf-8", errors=errors or "ignore")
40
41
def download_file(url: str, path: Path, overwrite: bool = False) -> bool:
    """
    Download a file from a URL and copy it to a path, potentially overwriting an
    existing file there. Creates directories if necessary.

    The transfer is done by the external `curl` executable, which follows
    redirects, sends the module's `_hdr` request headers and is limited to
    60 seconds.

    :param url: File URL to download.
    :param path: Path to copy the downloaded file to. A plain string is
                 accepted as well and converted to a `Path`.
    :param overwrite: If the file already exists, overwrite it.
    :return: Whether the file was downloaded and copied, i.e. whether curl
             exited with status 0. `False` is also returned if the file
             already exists and `overwrite` is not set.
    """
    # Normalise first so that the existence check below also works when a
    # string is passed instead of a Path.
    path = Path(path)
    if not overwrite and path.exists():
        LOGGER.error(f"File {path} already exists!")
        return False
    path.parent.mkdir(parents=True, exist_ok=True)
    LOGGER.debug(f"Downloading file from {url} to {path}")
    # Build an argument list and run curl without a shell: URLs or paths
    # containing quotes, spaces or shell metacharacters can then neither break
    # the command line nor inject commands. `--url` keeps a URL that starts
    # with a dash from being parsed as a curl option.
    # curl is used on purpose: an earlier urllib-based implementation did not
    # work with all PDFs (possibly because of redirects).
    cmd = ["curl", "--url", url, "-L", "-s", "--max-time", "60", "-o", str(path)]
    for key, value in _hdr.items():
        cmd += ["-H", f"{key}: {value}"]
    try:
        return subprocess.call(cmd) == 0
    except OSError as error:
        # Without a shell a missing curl executable raises instead of giving a
        # non-zero exit status; keep reporting that as a failed download.
        LOGGER.error(f"Unable to run curl: {error}")
        return False
LOGGER = <Logger modm_data.dl.store (WARNING)>
def download_data(url: str, encoding: str = None, errors: str = None) -> str:
def download_data(url: str, encoding: str = None, errors: str = None) -> str:
    """
    Download and decode the data of a URL.

    The transfer is done by the external `curl` executable, which follows
    redirects, sends the module's `_hdr` request headers and is limited to
    120 seconds. curl's exit status is not checked: whatever curl wrote to
    its standard output is decoded and returned.

    :param url: URL to download
    :param encoding: optional encoding to apply (default is `utf-8`)
    :param errors: optional error handling (default is `ignore`)
    :return: The data as a decoded string.
    """
    LOGGER.debug(f"Downloading data from {url}")
    # Build an argument list and run curl without a shell: a URL or header
    # containing quotes, spaces or shell metacharacters can then neither break
    # the command line nor inject commands. `--url` keeps a URL that starts
    # with a dash from being parsed as a curl option.
    cmd = ["curl", "--url", url, "-L", "-s", "--max-time", "120", "-o", "-"]
    for key, value in _hdr.items():
        cmd += ["-H", f"{key}: {value}"]
    try:
        data = subprocess.run(cmd, stdout=subprocess.PIPE).stdout
    except OSError as error:
        # Without a shell a missing curl executable raises instead of merely
        # producing no output; keep the "no data" result callers already get.
        LOGGER.error(f"Unable to run curl: {error}")
        data = b""
    return data.decode(encoding=encoding or "utf-8", errors=errors or "ignore")

Download and decode the data of a URL.

Parameters
  • url: URL to download
  • encoding: optional encoding to apply (default is utf-8)
  • errors: optional error handling (default is ignore)
Returns

The data as a decoded string.

def download_file(url: str, path: pathlib.Path, overwrite: bool = False) -> bool:
def download_file(url: str, path: Path, overwrite: bool = False) -> bool:
    """
    Download a file from a URL and copy it to a path, potentially overwriting an
    existing file there. Creates directories if necessary.

    The transfer is done by the external `curl` executable, which follows
    redirects, sends the module's `_hdr` request headers and is limited to
    60 seconds.

    :param url: File URL to download.
    :param path: Path to copy the downloaded file to. A plain string is
                 accepted as well and converted to a `Path`.
    :param overwrite: If the file already exists, overwrite it.
    :return: Whether the file was downloaded and copied, i.e. whether curl
             exited with status 0. `False` is also returned if the file
             already exists and `overwrite` is not set.
    """
    # Normalise first so that the existence check below also works when a
    # string is passed instead of a Path.
    path = Path(path)
    if not overwrite and path.exists():
        LOGGER.error(f"File {path} already exists!")
        return False
    path.parent.mkdir(parents=True, exist_ok=True)
    LOGGER.debug(f"Downloading file from {url} to {path}")
    # Build an argument list and run curl without a shell: URLs or paths
    # containing quotes, spaces or shell metacharacters can then neither break
    # the command line nor inject commands. `--url` keeps a URL that starts
    # with a dash from being parsed as a curl option.
    # curl is used on purpose: an earlier urllib-based implementation did not
    # work with all PDFs (possibly because of redirects).
    cmd = ["curl", "--url", url, "-L", "-s", "--max-time", "60", "-o", str(path)]
    for key, value in _hdr.items():
        cmd += ["-H", f"{key}: {value}"]
    try:
        return subprocess.call(cmd) == 0
    except OSError as error:
        # Without a shell a missing curl executable raises instead of giving a
        # non-zero exit status; keep reporting that as a failed download.
        LOGGER.error(f"Unable to run curl: {error}")
        return False

Download a file from a URL and copy it to a path, potentially overwriting an existing file there. Creates directories if necessary.

Parameters
  • url: File URL to download.
  • path: Path to copy the downloaded file to.
  • overwrite: If the file already exists, overwrite it.
Returns

Whether the file was downloaded and copied.