from bs4 import BeautifulSoup
from time import sleep
import requests, os
from urllib.parse import urlparse, urljoin

# Scrape the CDC MMWR "past volumes" index: for each volume listed under the
# ".match-height" container, fetch the volume's index page, queue that page
# itself, then queue every PDF linked from it.
# Each download_queue entry is [url, folder, filename].
INDEX_URL = "https://www.cdc.gov/mmwr/mmwr_wk/wk_pvol.html"

page = requests.get(INDEX_URL, timeout=30)
soup = BeautifulSoup(page.content, 'html.parser')

volume_list = soup.select_one(".match-height")
# Fail with a clear message if the page layout changed, rather than an
# opaque AttributeError on None.
if volume_list is None:
    raise RuntimeError("Volume list (.match-height) not found on index page")

issues = volume_list.find_all('a')
download_queue = []

sleep(1)  # be polite to the server before the per-volume requests

for issue in issues:
    folder = issue.text
    # Default must be '' (a string), not []: the href is passed to urljoin
    # and urlparse below, which expect str.
    href = issue.get('href', '')
    pg_url = urljoin(INDEX_URL, href)

    page = requests.get(pg_url, timeout=30)
    issue_soup = BeautifulSoup(page.content, 'html.parser')

    # Queue the volume index page itself alongside its PDFs.  urljoin is
    # used instead of 'https://www.cdc.gov' + href so relative hrefs work
    # and a missing href cannot raise TypeError (str + list).
    filename = os.path.basename(urlparse(pg_url).path)
    download_queue.append([pg_url, folder, filename])

    for link in issue_soup.find_all('a'):
        link_href = link.get('href', '')
        if '.pdf' in link_href:
            filename = os.path.basename(urlparse(link_href).path)
            # Report the actual filename, not a hard-coded "(unknown)".
            print(f"Queueing {link_href} as {folder}/{filename}")
            url = urljoin(pg_url, link_href)
            download_queue.append([url, folder, filename])

    sleep(1)  # rate-limit between volume pages


import aria2p

# Connect to a locally running aria2c JSON-RPC server.
# initialization, these are the default values
aria2 = aria2p.API(
    aria2p.Client(
        host="http://localhost",
        port=6800,
        secret=""
    )
)

# aria2 writes everything under this base directory.
BASE_DIR = "/redacted/MMWR/"
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
)

# Hand every queued [url, folder, filename] entry to aria2.
for url, folder, filename in download_queue:
    # Create the directory aria2 will actually save into.  The original
    # created `folder` relative to the script's CWD, which is not the
    # download target (BASE_DIR + folder).
    os.makedirs(BASE_DIR + folder, exist_ok=True)
    aria2.add(url, {
        "dir": BASE_DIR + folder,
        "out": filename,
        "user-agent": USER_AGENT,
    })
