
How To Write Code To Read Output File To Figure Out How Far It Got In Scraping Website And Then Starting From Where It Left Off

I'm writing a program to scrape the article title, date, and body text from each article in this website's archive and export them to a CSV file. The website seems to block me at some point, so I'd like the script to read its own output file, figure out how far it got, and pick up from where it left off.

Solution 1:

You can count the number of articles already saved in the CSV and integer-divide by 10, the number of articles per archive page: page = 1 + records // 10 (the + 1 is because page numbering starts at 1). That gives the page to resume from.
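As a quick sanity check of that arithmetic (a minimal sketch, assuming exactly 10 articles per archive page):

records = 480              # rows already in the CSV
articles_per_page = 10
page = 1 + records // articles_per_page
print(page)                # 49: pages 1..48 are complete, resume on page 49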

I've refactored your code like this:

import csv
import time
from random import randint
from urllib.request import urlopen

from bs4 import BeautifulSoup

HEADERS = ["Date", "Title", "Article"]


def count_rows(csv_path: str) -> int:
    with open(csv_path) as f:
        reader = csv.DictReader(f)
        return len(list(reader))


def write_articles(csv_path: str, articles: list):
    # note the append mode, write mode would delete everything and start fresh
    with open(csv_path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f,
                                quoting=csv.QUOTE_MINIMAL,
                                fieldnames=HEADERS)
        writer.writerows(articles)


def init_csv(csv_path: str):
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=HEADERS, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()


def get_page_soup(url: str) -> BeautifulSoup:
    response = urlopen(url)
    html = response.read()

    soup = BeautifulSoup(html, "lxml")
    return soup


def scrape_article(url: str) -> dict:
    article_soup = get_page_soup(url)

    # Limits to div class = story-text tag (where article text is)
    story_el = article_soup.select_one('.story-text')

    # find date
    date = story_el.select_one('.timestamp time')['datetime']

    # find title
    title = story_el.find('h1').text

    # find body text
    article_text = ''
    for p in story_el.find_all('p'):
        article_text += p.text + ' '

    return {
        'Title': title,
        'Date': date,
        'Article': article_text
    }


def main():
    csvfile = "test.csv"
    try:
        record_count = count_rows(csvfile)
    except FileNotFoundError:
        init_csv(csvfile)
        print('Initialized CSV file')
        record_count = 0

    article_per_page = 10
    page = 1 + record_count // article_per_page

    print('Continuing from page', page)

    for p in range(page, 413):
        url = "https://www.politico.com/newsletters/playbook/archive/%d" % p
        soup = get_page_soup(url)
        article_links = soup.select('article.story-frag.format-l')

        # collect this page's articles, then append them to the CSV
        # (reset per page so rows are not written twice)
        articles = []

        # Each article link on page
        for article in article_links:
            link = article.select_one('a[target=_top]')['href']
            scraped_article = scrape_article(link)
            print(scraped_article)
            articles.append(scraped_article)

        write_articles(csvfile, articles)
        print('Finished page', p)
        time.sleep(randint(3, 8))


if __name__ == '__main__':
    main()

This gives you output like this:

Finished page 48
{'Title': 'Playbook: Scalise takes several Republicans to ...
{'Title': 'Playbook: Four unfolding events that show the  ...
{'Title': 'Playbook: Texas kicks off primary season, as D ...
{'Title': 'Playbook: The next gen: McCarthy and Crowley’s ...
{'Title': 'INSIDE THE GRIDIRON DINNER: What Trump said an ...
{'Title': 'DEMS spending millions already to boost vulner ...
{'Title': 'Playbook: Inside the Republican super PAC mone ...
{'Title': 'Playbook: Who would want to be White House com ...
{'Title': "Playbook: Jared Kushner's bad day", 'Date': '2 ...
{'Title': 'Playbook: Gun control quickly stalls in the Se ...
Finished page 49
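If the blocking is just temporary rate-limiting, you could also wrap urlopen in a small retry helper and call it from get_page_soup. This is only a sketch on top of the answer above; open_with_retry and its retries/backoff parameters are names I made up, not part of the original code:

import time
from urllib.error import HTTPError, URLError
from urllib.request import urlopen


def open_with_retry(url: str, retries: int = 3, backoff: int = 30):
    # try the request a few times, sleeping longer after each failure,
    # and re-raise the last error so the caller still sees it
    for attempt in range(retries):
        try:
            return urlopen(url)
        except (HTTPError, URLError):
            if attempt == retries - 1:
                raise
            time.sleep(backoff * (attempt + 1))

Because the script already resumes from the CSV row count, a run that still gets blocked can simply be restarted later and will continue from the last fully written page.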
