Accessing The Contents On Links Provided On A Webpage While Webscraping
Solution 1:
Everything can be done with Requests and BeautifulSoup, without Selenium. Here is the code to get the data along with the details:
import requests
from bs4 import BeautifulSoup
# Search the NIST enzyme database for one EC number and, for every hit,
# fetch its details page and print the thermodynamic data table.
base_url = 'https://randr.nist.gov'
ec_name = 'enzyme'
search_term = '1.1.1.1'
url = f'{base_url}/{ec_name}/'


def cell_text(row, id_fragment):
    """Return the stripped text of the element in ``row`` whose id contains ``id_fragment``."""
    return row.select_one(f"[id*='{id_fragment}']").text.strip()


def detail_value(page, label):
    """Return the text of the cell that follows the ``label`` span, or None.

    Finds a <span> whose text is exactly ``label`` and reads the <td> that is
    the next sibling of the span's parent <td>. Returns None when no such
    span exists on the page.
    """
    span = page.find("span", string=label)
    if span is None:
        return None
    return span.find_parent("td").find_next_sibling("td").text.strip()


with requests.Session() as session:
    # Read __VIEWSTATE, __VIEWSTATEGENERATOR and __EVENTVALIDATION from the
    # search page so they can be sent back as POST parameters.
    response = session.get(url)
    response.raise_for_status()  # fail with the HTTP error, not a parse error
    page = BeautifulSoup(response.text, "html.parser")
    view_state = page.find(id="__VIEWSTATE")["value"]
    view_state_generator = page.find(id="__VIEWSTATEGENERATOR")["value"]
    event_validation = page.find(id="__EVENTVALIDATION")["value"]

    data = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE': view_state,
        '__VIEWSTATEGENERATOR': view_state_generator,
        '__SCROLLPOSITIONX': '0',
        '__SCROLLPOSITIONY': '0',
        '__EVENTVALIDATION': event_validation,
        'ctl00$MainBody$txtSrchAutoFill': search_term,
        'ctl00$MainBody$repoList': 'Enzyme_thermo',
        'ctl00$MainBody$ImgSrch.x': '0',
        'ctl00$MainBody$ImgSrch.y': '0',
    }
    response = session.post(url, data=data)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # The first row of the result table is the header; slicing (instead of
    # removing rows[0]) also copes with an empty result.
    rows = page.select("#MainBody_gvSearch tr")[1:]
    for row in rows:
        # Every column of the search result is available; only some are printed.
        reference_id = cell_text(row, 'lbSearch')
        ec_number = cell_text(row, 'lblECNumber')
        method = cell_text(row, 'lblMethod')
        buffer = cell_text(row, 'lblBuffer')
        reaction = cell_text(row, 'lblReaction')
        enzyme = cell_text(row, 'lblEnzyme')
        cofactor = cell_text(row, 'lblCofactor')
        evaluation = cell_text(row, 'lblEvaluation')
        print(f"EC Number: {ec_number}, Reference Id: {reference_id}, Evaluation: {evaluation}")

        # get details
        params = (
            ('ID', reference_id),
            ('finalterm', search_term),
            ('data', ec_name),
        )
        response = session.get(f'{url}DataDetails.aspx', params=params)
        details = BeautifulSoup(response.text, "html.parser")

        # parse general information (None when the label is not on the page)
        reference = detail_value(details, 'Reference:')
        ph = detail_value(details, 'pH:')

        # parse table: <td> cells arrive as one flat list, so they are cut
        # into chunks of len(headers) and paired with the header names.
        extra_data = []
        table_headers = [x.text.strip() for x in details.select("#MainBody_extraData th")]
        table_data = [x.text.strip() for x in details.select("#MainBody_extraData td")]
        headers_count = len(table_headers)
        if not headers_count:
            print("No details table found")
        else:
            for i in range(0, len(table_data), headers_count):
                record = dict(zip(table_headers, table_data[i:i + headers_count]))
                extra_data.append(record)
                try:
                    print("T(K): {}, pH: {}, K': {}".format(record["T(K)"], record["pH"], record["K'"]))
                except KeyError as ex:
                    print(f"Details table has no {ex} column")
        print("")
Output of some values:
EC Number: 1.1.1.1, Reference Id: 36EUL/ADL_7, Evaluation: C
T(K): 298.15, pH: 6.4, K': 1.3E-5
T(K): 298.15, pH: 7.0, K': 5.3E-5
T(K): 298.15, pH: 7.7, K': 1.3E-4

EC Number: 1.1.1.1, Reference Id: 37ADL/SRE_8, Evaluation: D
T(K): 298.15, pH: 6.05, K': 6.0E-6
T(K): 298.15, pH: 7.25, K': 7.7E-5
T(K): 298.15, pH: 8.0, K': 1.2E-5

EC Number: 1.1.1.1, Reference Id: 37NEG/WUL_9, Evaluation: C
T(K): 293.15, pH: 7.9, K': 7.41E-4

EC Number: 1.1.1.1, Reference Id: 38SCH/HEL_10, Evaluation: C
T(K): 298.15, pH: 6.30, K': 2.6E-5
T(K): 298.15, pH: 6.85, K': 8.8E-5
T(K): 298.15, pH: 7.15, K': 1.9E-4
T(K): 298.15, pH: 7.34, K': 3.0E-4
T(K): 298.15, pH: 7.61, K': 5.1E-4
T(K): 298.15, pH: 7.77, K': 8.0E-4
T(K): 298.15, pH: 8.17, K': 2.2E-3

EC Number: 1.1.1.1, Reference Id: 38SCH/HEL_23, Evaluation: C
T(K): 298.15, pH: 6.39, K': 9.1E-6
T(K): 298.15, pH: 6.60, K': 3.0E-5
T(K): 298.15, pH: 6.85, K': 5.1E-5
T(K): 298.15, pH: 7.18, K': 1.5E-4
T(K): 298.15, pH: 7.31, K': 2.3E-4
T(K): 298.15, pH: 7.69, K': 5.6E-4
T(K): 298.15, pH: 8.06, K': 1.1E-3
Solution 2:
I presume you would like to click each link in the table and then grab the contents of the details page after navigating to it.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Search the NIST enzyme database in a real browser, open every result link
# in turn, and print the label/value rows of each details page.
search_input = '1.1.1.1'

# Locators used more than once are named here so they are spelled only once.
RESULT_LINKS = (By.XPATH, "//table[@id='MainBody_gvSearch']//tr/td[1]/a")
DETAIL_ROWS = (By.XPATH, "//table[@id='MainBody_DataList1']//table//tr")

# NOTE(review): passing the driver path positionally matches older Selenium
# releases; newer ones expect a Service object -- confirm against the
# installed version.
driver = webdriver.Chrome('path of the chrome driver')
try:
    driver.get('https://randr.nist.gov/enzyme/Default.aspx')
    driver.find_element(By.ID, 'MainBody_txtSrchAutoFill').send_keys(search_input)
    driver.find_element(By.ID, 'MainBody_ImgSrch').click()

    wait = WebDriverWait(driver, 20)
    # The first lookup is only needed to know how many links there are.
    link_count = len(wait.until(EC.visibility_of_all_elements_located(RESULT_LINKS)))

    for index in range(link_count):
        # The page is left and re-entered on every iteration, so the links
        # are located again instead of reusing earlier element references.
        links = wait.until(EC.visibility_of_all_elements_located(RESULT_LINKS))
        print("################################")
        print(links[index].text)
        print("################################")
        links[index].click()
        try:
            itemrows = wait.until(EC.visibility_of_all_elements_located(DETAIL_ROWS))
            for row in itemrows:
                print(row.find_element(By.XPATH, "./td[1]").text + " " + row.find_element(By.XPATH, "./td[2]").text)
        except (TimeoutException, NoSuchElementException):
            # No details rows could be read: show the page's error label instead.
            print(driver.find_element(By.ID, "MainBody_lblErrorDetails").text)
        driver.back()
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()
Output on console:
################################36EUL/ADL_7################################Reference:vonEuler,H.;Adler,E.;Hellstrvm,H.;Hoppe-Seyler'sZ.Physiol.Chem.;241,239(1936).Reference ID:36EUL/ADL_7EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphatepH:6.4-7.7Evaluation:C################################37ADL/SRE_8################################Reference:Adler,E.;Sreenivasaya,M.;Hoppe-Seyler'sZ.Physiol.Chem.;249,24(1937).Reference ID:37ADL/SRE_8EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pH:6.05-8.0Evaluation:D################################37NEG/WUL_9################################Reference:Negelein,E.;Wulff,H.-J.;Biochem.Z.;293,351(1937).Reference ID:37NEG/WUL_9EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphatepH:7.9Evaluation:C################################38SCH/HEL_10################################Reference:Schlenk,F.;Hellstrvm,H.;vonEuler,H.;Ber.Dtsch.Chem.Ges.;71,1471(1938).Reference ID:38SCH/HEL_10EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pH:6.30-8.17Evaluation:C################################38SCH/HEL_23################################Reference:Schlenk,F.;Hellstrvm,H.;vonEuler,H.;Ber.Dtsch.Chem.Ges.;71,1471(1938).Reference ID:38SCH/HEL_23EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pH:6.39-8.06Evaluation:C################################50RAC_11################################Reference:Racker,E.;J.Biol.Chem.;184,313(1950).Reference ID:50RAC_11EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pyrophosphate(0.01moldm-3)pH:7.4-9.5Evaluation:B################################51BLA_65################################Reference:Blakley,R.L.;Biochem.J.;49,257(1951).Reference ID:51BLA_65EC Value:1.1.1.14(ENZYME|KEGG)Method:spectrophotometryandchemicalanalysisBuffer:potassiumphosphate(0.03moldm-3)pH:8.0Evaluation:B################################51BLI_35################################Reference:Bliss,A.F.;Arch.Biochem.Biophys.;31,197(1951).Reference ID:51BLI_35EC 
Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:sodiumpyrophosphate(0.015moldm-3)pH:6.6-9.5Evaluation:C################################51THE/BON_12################################Reference:Theorell,H.;Bonnichsen,R.;ActaChem.Scand.;5,1105(1951).Reference ID:51THE/BON_12EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphate(0.05moldm-3)and {glycine(0.10moldm-3)+NaOH}
pH:7.0-10.0Evaluation:B################################52BUR_29################################Reference:Burton,K.;Biochim.Biophys.Acta;8,114(1952).Reference ID:52BUR_29EC Value:1.1.1.1(ENZYME|KEGG)Method:Buffer:pH:Evaluation:B################################53BUR/WIL_30################################Reference:Burton,K.;Wilson,T.H.;Biochem.J.;54,86(1953).Reference ID:53BUR/WIL_30EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pyrophosphate(0.0055moldm-3)pH:7.03-8.83Evaluation:A################################54WIL/BAN_66################################Reference:Williams-Ashman,H.G.;Banks,J.;Arch.Biochem.Biophys.;50,513(1954).Reference ID:54WIL/BAN_66EC Value:1.1.1.14(ENZYME|KEGG)Method:spectrophotometryBuffer:pH:Evaluation:C################################55WOL/KAP_75################################Reference:Wolff,J.B.;Kaplan,N.O.;MethodsEnzymol.;1,346(1955).Reference ID:55WOL/KAP_75EC Value:1.1.1.17(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphate(0.1moldm-3)orbicarbonate(0.1moldm-3)pH:7.0Evaluation:C################################56KAP/CIO_13################################Reference:Kaplan,N.O.;Ciotti,M.M.;Stolzenbach,F.E.;J.Biol.Chem.;221,833(1956).Reference ID:56KAP/CIO_13EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphate(0.1moldm-3)pH:6.51-8.07Evaluation:C################################56KAP/CIO_22################################Reference:Kaplan,N.O.;Ciotti,M.M.;Stolzenbach,F.E.;J.Biol.Chem.;221,833(1956).Reference ID:56KAP/CIO_22EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphate(0.1moldm-3)pH:6.51-8.07Evaluation:C################################56LAR/JAC_77################################Reference:Larner,J.;Jackson,W.T.;Graves,D.J.;Stamer,J.R.;Arch.Biochem.Biophys.;60,352(1956).Reference ID:56LAR/JAC_77EC 
Value:1.1.1.18(ENZYME|KEGG)Method:spectrophotometryBuffer:pyrophosphate(0.01moldm-3)pH:8.10-8.92Evaluation:B################################56WOL/KAP_76################################Reference:Wolff,J.B.;Kaplan,N.O.;J.Biol.Chem.;218,849(1956).Reference ID:56WOL/KAP_76EC Value:1.1.1.17(ENZYME|KEGG)Method:chemicalanalysisandspectrophotometryBuffer:pH:6-10Evaluation:C################################57HOL/TOU_56################################Reference:Hollmann,S.;Touster,O.;J.Biol.Chem.;225,87(1957).Reference ID:57HOL/TOU_56EC Value:1.1.1.10(ENZYME|KEGG)Method:spectrophotometryBuffer:Tris(0.05moldm-3)pH:6.95-8.70Evaluation:Bsoon.......................
Post a Comment for "Accessing The Contents On Links Provided On A Webpage While Webscraping"