How To Write Code To Read Output File To Figure Out How Far It Got In Scraping Website And Then Starting From Where It Left Off
I'm writing a program to scrape the article title, date, and body text from each article in this website's archive and export them to a CSV file. The website seems to block me at some point, so I need to resume from where I left off.
Solution 1:
You can count the number of articles you've already saved in the CSV and integer-divide that count by 10 to get the last page you were at: `page = 1 + records // 10` (the `+ 1` accounts for the first page).
I've refactored your code like this:
import csv
import time
from random import randint
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Column order for the output CSV; matches the keys of the dicts
# returned by scrape_article() and consumed by csv.DictWriter.
HEADERS = ["Date", "Title", "Article"]
def count_rows(csv_path: str) -> int:
    """Return the number of data rows (excluding the header) in the CSV file.

    Raises FileNotFoundError if the file does not exist yet; the caller
    uses that to decide whether to initialize a fresh CSV.
    """
    with open(csv_path) as f:
        reader = csv.DictReader(f)
        # DictReader consumes the header line, so this counts data rows only.
        # sum() streams the file instead of materializing every row in memory.
        return sum(1 for _ in reader)
def write_articles(csv_path: str, articles: list):
    """Append scraped article dicts (keys matching HEADERS) to the CSV file.

    Note the append mode: write mode ('w') would delete everything
    and start fresh, losing the progress we want to resume from.
    """
    with open(csv_path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f,
                                quoting=csv.QUOTE_MINIMAL,
                                fieldnames=HEADERS)
        writer.writerows(articles)
def init_csv(csv_path: str):
    """Create (or truncate) the CSV file and write the header row only."""
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=HEADERS, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
def get_page_soup(url: str) -> BeautifulSoup:
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree."""
    response = urlopen(url)
    html = response.read()
    # "lxml" parser must be installed alongside beautifulsoup4.
    return BeautifulSoup(html, "lxml")
def scrape_article(url: str) -> dict:
    """Scrape one article page and return its date, title and body text.

    Returns a dict whose keys match HEADERS so it can be passed
    straight to csv.DictWriter.
    """
    article_soup = get_page_soup(url)
    # Limit all searches to the div with class "story-text",
    # which is where the article content lives.
    story_el = article_soup.select_one('.story-text')
    # Publication date is the datetime attribute of the <time> tag.
    date = story_el.select_one('.timestamp time')['datetime']
    title = story_el.find('h1').text
    # Concatenate every paragraph, separated (and terminated) by a space;
    # join() avoids the quadratic cost of repeated string +=.
    article_text = ''.join(p.text + ' ' for p in story_el.find_all('p'))
    return {
        'Title': title,
        'Date': date,
        'Article': article_text,
    }
def main():
    """Scrape the archive, resuming from the page implied by the CSV row count.

    Bug fix vs. the original: the `articles` buffer was created once
    *before* the page loop, so every page re-appended all previously
    scraped articles to the CSV, duplicating earlier pages' rows.
    It is now reset for each page.
    """
    csvfile = "test.csv"
    try:
        record_count = count_rows(csvfile)
    except FileNotFoundError:
        # First run: create the file with just the header row.
        init_csv(csvfile)
        print('Initialized CSV file')
        record_count = 0

    articles_per_page = 10
    # +1 because page numbering starts at 1, not 0.
    page = 1 + record_count // articles_per_page
    print('Continuing from page', page)

    for p in range(page, 413):
        url = "https://www.politico.com/newsletters/playbook/archive/%d" % p
        soup = get_page_soup(url)
        article_links = soup.select('article.story-frag.format-l')
        # Fresh buffer per page so rows are appended exactly once.
        articles = []
        # Scrape each article linked from this archive page.
        for article in article_links:
            link = article.select_one('a[target=_top]')['href']
            scraped_article = scrape_article(link)
            print(scraped_article)
            articles.append(scraped_article)
        write_articles(csvfile, articles)
        print('Finished page', p)
        # Random delay to look less like a bot and avoid being blocked.
        time.sleep(randint(3, 8))


if __name__ == '__main__':
    main()
This gives you output like the following:
Finished page 48
{'Title': 'Playbook: Scalise takes several Republicans to ...
{'Title': 'Playbook: Four unfolding events that show the ...
{'Title': 'Playbook: Texas kicks off primary season, as D ...
{'Title': 'Playbook: The next gen: McCarthy and Crowley’s ...
{'Title': 'INSIDE THE GRIDIRON DINNER: What Trump said an ...
{'Title': 'DEMS spending millions already to boost vulner ...
{'Title': 'Playbook: Inside the Republican super PAC mone ...
{'Title': 'Playbook: Who would want to be White House com ...
{'Title': "Playbook: Jared Kushner's bad day", 'Date': '2 ...
{'Title': 'Playbook: Gun control quickly stalls in the Se ...
Finished page 49
Post a Comment for "How To Write Code To Read Output File To Figure Out How Far It Got In Scraping Website And Then Starting From Where It Left Off"