"""Scrape the CDC MMWR "Past Volumes" index and build a download queue.

Walks every issue page linked from the index, records the issue page itself
plus every PDF it links to, and appends [url, folder, filename] triples to
``download_queue`` for the aria2 hand-off that follows.
"""
from bs4 import BeautifulSoup
from time import sleep
import requests, os
from urllib.parse import urlparse, urljoin

INDEX_URL = "https://www.cdc.gov/mmwr/mmwr_wk/wk_pvol.html"

page = requests.get(INDEX_URL)
soup = BeautifulSoup(page.content, 'html.parser')

# Every <a> inside the volume list container is one weekly-report issue page.
issues = soup.select_one(".match-height").find_all('a')
download_queue = []
sleep(1)  # be polite to cdc.gov between requests

for issue in issues:
    folder = issue.text  # the issue title doubles as the destination folder name
    # Default to '' rather than the original [] — a missing href with a list
    # default raised TypeError in urljoin/concatenation.
    href = issue.get('href', '')
    pg_url = urljoin(INDEX_URL, href)
    page = requests.get(pg_url)
    issue_soup = BeautifulSoup(page.content, 'html.parser')

    # Queue the issue page itself. Using the urljoin-resolved pg_url fixes the
    # original 'https://www.cdc.gov' + href concatenation, which produced
    # broken URLs for anything but root-relative hrefs.
    filename = os.path.basename(urlparse(pg_url).path)
    download_queue.append([pg_url, folder, filename])

    # Queue every PDF linked from this issue page.
    for link in issue_soup.find_all('a'):
        link_href = link.get('href', '')
        if '.pdf' in link_href:
            filename = os.path.basename(urlparse(link_href).path)
            # Bug fix: the log line printed the literal "(unknown)" instead of
            # the resolved filename.
            print(f"Queueing {link_href} as {folder}/{filename}")
            url = urljoin(pg_url, link_href)
            download_queue.append([url, folder, filename])
    sleep(1)  # throttle: one issue page per second
# aria2p is only needed for the hand-off step, so it is imported here, after
# the (long) scraping phase has finished.
import aria2p

# Connect to a locally running aria2c daemon over its RPC interface.
# These are aria2's default connection values.
aria2 = aria2p.API(
    aria2p.Client(
        host="http://localhost",
        port=6800,
        secret=""
    )
)

DOWNLOAD_ROOT = "/redacted/MMWR/"
# Byte-identical UA string from the original, split only for line length.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
)

# Each queue entry is [url, folder, filename] as built by the scraper above.
for url, folder, filename in download_queue:
    target_dir = DOWNLOAD_ROOT + folder
    # Bug fix: the original created `folder` relative to the CWD, while aria2
    # downloads into DOWNLOAD_ROOT/<folder> — so the pre-created directory was
    # never the one actually written to. (If the script deliberately relied on
    # CWD == DOWNLOAD_ROOT this is equivalent; TODO confirm.)
    os.makedirs(target_dir, exist_ok=True)
    aria2.add(url, {
        "dir": target_dir,
        "out": filename,
        "user-agent": USER_AGENT,
    })
# NOTE(review): removed stray support-contact text accidentally pasted at the
# end of the file — it was not code and broke the Python parse.