
Trouble Running A Parser Created Using Scrapy With Selenium

I've written a scraper in Python Scrapy in combination with Selenium to scrape some titles from a website. The CSS selectors defined within my scraper are flawless. I wish my scrape…

Solution 1:

Your initial code was almost correct, with one key piece missing from it: you were always using the same response object. The response object needs to be rebuilt from the latest page source.

Also, you were loading the link again on every click_nextpage call, which reset the site to page 1 each time. That is why you only ever got pages 1 and 2 (at most). You need to load the url only once, in the parse stage, and then let the next-page clicks happen on their own.

Below is the final working code:

import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link):
        # Do NOT call self.driver.get(link) here; reloading the url resets
        # the site to page 1, so it keeps clicking the same link over and over.
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # Click the "Next" arrow, then wait until the old element goes stale,
        # which signals that the next page has replaced it.
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        self.driver.get(response.url)

        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}

            try:
                self.click_nextpage(response.url)  # initiate the method to do the clicking
                response = response.replace(body=self.driver.page_source)
            except TimeoutException:
                break

After that change it works perfectly.

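If you want to try this spider outside of a full Scrapy project, it can be launched with CrawlerProcess, the same runner Solution 3 uses below; a minimal sketch, assuming the IncomeTaxSpider class above is defined in the same file:

from scrapy.crawler import CrawlerProcess

# Minimal runner sketch for the spider defined above.
process = CrawlerProcess()
process.crawl(IncomeTaxSpider)
process.start()  # blocks until the crawl finishes and the spider closes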

Solution 2:

In case you need a pure Selenium solution:

driver.get("https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")

whileTrue:
    for item in wait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='arrowex']"))):
        print(item.text)
    try:
        driver.find_element_by_xpath("//input[@text='Next' and not(contains(@class, 'disabledImageButton'))]").click()
    except NoSuchElementException:
        break
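One thing the snippet above leaves out is browser cleanup; wrapping the loop in try/finally guarantees the driver quits even if the scrape fails partway through. A minimal sketch:

from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.get("https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
    # ... run the pagination loop shown above ...
finally:
    driver.quit()  # always shut the browser down, even after an exception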

Solution 3:

import scrapy
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.crawler import CrawlerProcess

class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

        link = 'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx'
        self.driver.get(link)

    def click_nextpage(self):
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # Click the "Next" arrow, wait for the old element to go stale, then
        # give the page a few extra seconds to settle.
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))
        time.sleep(4)

    def parse(self, response):
        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}

            try:
                self.click_nextpage()  # initiate the method to do the clicking
                response = response.replace(body=self.driver.page_source)  # refresh from the live page, as in Solution 1
            except TimeoutException:
                break

process = CrawlerProcess()

process.crawl(IncomeTaxSpider)
process.start()
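If the yielded {"Name": ...} items should end up in a file rather than just the crawl log, CrawlerProcess accepts settings for a feed export; a minimal sketch using the legacy FEED_FORMAT/FEED_URI settings (newer Scrapy versions use the FEEDS dict instead):

from scrapy.crawler import CrawlerProcess

# Sketch: write all yielded items to names.json.
process = CrawlerProcess(settings={
    'FEED_FORMAT': 'json',
    'FEED_URI': 'names.json',
})
process.crawl(IncomeTaxSpider)
process.start()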

Solution 4:

Whenever the page gets loaded via the 'Next Page' arrow (using Selenium), it gets reset back to page 1. I'm not sure of the reason for this (maybe the JavaScript). Hence I changed the approach: enter the needed page number into the input field and hit the ENTER key to navigate.

Here is the modified code. Hope it is useful for you.

import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys

class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"
    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Firefox()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link, number):
        self.driver.get(link)
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # Instead of clicking the 'Next' arrow (which resets to page 1),
        # type the wanted page number into the paging input and press ENTER.
        inputElement = self.driver.find_element_by_xpath("//input[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_txtPageNumber']")
        inputElement.clear()
        inputElement.send_keys(number)
        inputElement.send_keys(Keys.ENTER)
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        number = 1
        while number < 10412:  # Website shows it has 10411 pages.
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
                print(name)

            try:
                number += 1
                self.click_nextpage(response.url, number)  # initiate the method to do the clicking
                response = response.replace(body=self.driver.page_source)  # refresh from the live page, as in Solution 1
            except TimeoutException:
                break
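None of the spiders above ever closes the browser it launches. Scrapy calls a spider's closed() method when the crawl finishes, which is a convenient place for that cleanup; a minimal sketch:

import scrapy
from selenium import webdriver

class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    def __init__(self):
        self.driver = webdriver.Firefox()

    def closed(self, reason):
        # Scrapy invokes this hook when the spider finishes, so no orphaned
        # browser process is left behind.
        self.driver.quit()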

Solution 5:

Create a self.page_num counter, or something similar:

def parse(self, response):
    # Read the total page count from the footer text (format like "... of 10411]").
    self.pages = self.driver.find_element_by_css_selector("#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_totalRecordsDiv.act_search_footer span")
    self.pages = int(self.pages.text.split('of ')[1].split(']')[0])

    self.page_num = 1
    while self.page_num <= self.pages:
        for item in response.css("h1.faqsno-heading"):
            name = item.css("div[id^='arrowex']::text").extract_first()
            yield {"Name": name}

        try:
            self.click_nextpage(response.url)  # initiate the method to do the clicking
        except TimeoutException:
            break

def click_nextpage(self, link):
    self.driver.get(link)
    elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

    # Id of the numbered page link for the next page; kept from the original
    # snippet, though the click below still uses the 'Next' arrow.
    page_link = 'ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_lnkBtn_' + str(self.page_num)
    self.page_num = self.page_num + 1

    self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
    self.wait.until(EC.staleness_of(elem))
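The string surgery that extracts the page count is easier to follow with a concrete value; a sketch assuming the footer text looks roughly like "[Page 1 of 10411]" (the exact wording is an assumption):

# Hypothetical footer text; the real page is assumed to show the total in this shape.
footer_text = "[Page 1 of 10411]"
pages = int(footer_text.split('of ')[1].split(']')[0])
print(pages)  # -> 10411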
