Trouble Running A Parser Created Using Scrapy With Selenium
Solution 1:
Your initial code was almost correct, with one key piece missing: you were always reusing the same response object, while the response needs to be rebuilt from the latest page source.
You were also re-opening the link on every "next page" click, which reset the site back to page 1 each time; that is why you only ever got pages 1 and 2. Get the URL only once, in the parse stage, and then let the next-page clicks do the navigation.
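The core of the fix, shown in isolation (both lines are taken from the full spider below):

    self.click_nextpage(response.url)                           # only click "Next", no fresh driver.get()
    response = response.replace(body=self.driver.page_source)   # rebuild the response from the new page source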
Below is the final code, working fine:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link):
        # self.driver.get(link)  <-- removed: re-opening the link reset the site to page 1
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # click the "Next" arrow and wait until the old content goes stale
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        self.driver.get(response.url)

        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}

            try:
                self.click_nextpage(response.url)  # initiate the method to do the clicking
                response = response.replace(body=self.driver.page_source)  # re-sync with the latest page source
            except TimeoutException:
                break
After that change it works perfectly.
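One small addition worth making (not part of the original answer): Scrapy calls a spider's closed() method when the crawl finishes, which is a convenient place to quit the Chrome instance so no browser is left running. Added to the spider class:

    def closed(self, reason):
        # called by Scrapy once the spider finishes; shut down the Selenium browser
        self.driver.quit()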
Solution 2:
In case you need a pure Selenium solution:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()
driver.get("https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")

while True:
    for item in wait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='arrowex']"))):
        print(item.text)
    try:
        driver.find_element_by_xpath("//input[@text='Next' and not(contains(@class, 'disabledImageButton'))]").click()
    except NoSuchElementException:
        break
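Note (not part of the original answer): Selenium 4 removed the find_element_by_* helpers, so on a current driver the same click would be written with the By-based call, keeping the answer's XPath unchanged:

    driver.find_element(
        By.XPATH,
        "//input[@text='Next' and not(contains(@class, 'disabledImageButton'))]"
    ).click()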
Solution 3:
import scrapy
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.crawler import CrawlerProcess
class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)
        link = 'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx'
        self.driver.get(link)

    def click_nextpage(self):
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # click the "Next" arrow and wait until the old content goes stale
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))
        time.sleep(4)

    def parse(self, response):
        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}

            try:
                self.click_nextpage()  # initiate the method to do the clicking
                response = response.replace(body=self.driver.page_source)  # re-sync the response (same fix as Solution 1)
            except TimeoutException:
                break
process = CrawlerProcess()
process.crawl(IncomeTaxSpider)
process.start()
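If the yielded items should be written to a file, CrawlerProcess also accepts feed-export settings; a minimal sketch, assuming Scrapy 2.1+ for the FEEDS setting and a hypothetical output file name:

    process = CrawlerProcess(settings={
        "FEEDS": {"names.json": {"format": "json"}},  # hypothetical output file
    })
    process.crawl(IncomeTaxSpider)
    process.start()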
Solution 4:
Whenever the page is loaded using the 'Next Page' arrow (via Selenium), it gets reset back to page 1. I am not sure of the reason for this (possibly the JavaScript on the page). Hence the approach was changed to enter the required page number into the page-number input field and hit the ENTER key to navigate.
Here is the modified code. Hope it is useful for you.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Firefox()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link, number):
        self.driver.get(link)
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # type the required page number into the pager input and press ENTER
        inputElement = self.driver.find_element_by_xpath("//input[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_txtPageNumber']")
        inputElement.clear()
        inputElement.send_keys(number)
        inputElement.send_keys(Keys.ENTER)
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        number = 1
        while number < 10412:  # the website shows it has 10411 pages
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
                print(name)

            try:
                number += 1
                self.click_nextpage(response.url, number)  # initiate the method to do the clicking
                response = response.replace(body=self.driver.page_source)  # re-sync the response (same fix as Solution 1)
            except TimeoutException:
                break
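Rather than hard-coding 10412, the total page count could be read from the results footer, as Solution 5 below does; a sketch, assuming the footer span keeps the "... of 10411]" text format used there:

    footer = self.driver.find_element_by_css_selector(
        "div[id$='_totalRecordsDiv'] span").text        # assumed to end with e.g. "of 10411]"
    total_pages = int(footer.split('of ')[1].split(']')[0])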
Solution 5:
Create a self.page_num counter, or something similar:
def parse(self, response):
    self.pages = self.driver.find_element_by_css_selector(
        "#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_totalRecordsDiv.act_search_footer span")
    self.pages = int(self.pages.text.split('of ')[1].split(']')[0])
    self.page_num = 1

    while self.page_num <= self.pages:
        for item in response.css("h1.faqsno-heading"):
            name = item.css("div[id^='arrowex']::text").extract_first()
            yield {"Name": name}

        try:
            self.click_nextpage(response.url)  # initiate the method to do the clicking
        except TimeoutException:
            break

def click_nextpage(self, link):
    self.driver.get(link)
    elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

    # id of the numbered pager link for the next page (built here but not used below)
    page_link = 'ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_lnkBtn_' + str(self.page_num)
    self.page_num = self.page_num + 1

    self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
    self.wait.until(EC.staleness_of(elem))
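As written, page_link is built but never used; if the intent was to click that numbered pager link instead of the Next arrow, it could be used like this (an untested sketch; the element id comes from the snippet above):

    # hypothetical: click the numbered pager link rather than the "Next" arrow
    self.wait.until(EC.element_to_be_clickable((By.ID, page_link))).click()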