Skip to content Skip to sidebar Skip to footer

Accessing The Contents On Links Provided On A Webpage While Webscraping

This is a follow-up to my previous question. I am trying to access the contents of a webpage. I can search for content on the webpage; however, I am not sure how to access the contents of the links provided on that webpage.

Solution 1:

All of this can be done using Requests and BeautifulSoup, without Selenium. Here is code that gets the search results together with their details:

import requests
from bs4 import BeautifulSoup

# Scrape the NIST enzyme thermodynamics search results for one search term,
# then fetch and print the details page of every result row.
base_url = 'https://randr.nist.gov'
ec_name = 'enzyme'
search_term = '1.1.1.1'

# Search form URL; the details page (DataDetails.aspx) is requested under the same path.
url = f'{base_url}/{ec_name}/'
# Seconds to wait for each HTTP response, so a stalled server cannot hang the script.
request_timeout = 30

with requests.Session() as session:
    # get __VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION parameters to use them in POST parameters
    response = session.get(url, timeout=request_timeout)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")
    view_state = page.find(id="__VIEWSTATE")["value"]
    view_state_generator = page.find(id="__VIEWSTATEGENERATOR")["value"]
    event_validation = page.find(id="__EVENTVALIDATION")["value"]

    # Form fields posted back to the search page; the hidden-field values
    # are echoed from the GET response above.
    data = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE': view_state,
        '__VIEWSTATEGENERATOR': view_state_generator,
        '__SCROLLPOSITIONX': '0',
        '__SCROLLPOSITIONY': '0',
        '__EVENTVALIDATION': event_validation,
        'ctl00$MainBody$txtSrchAutoFill': search_term,
        'ctl00$MainBody$repoList': 'Enzyme_thermo',
        'ctl00$MainBody$ImgSrch.x': '0',
        'ctl00$MainBody$ImgSrch.y': '0'
    }
    response = session.post(url, data=data, timeout=request_timeout)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # get all rows; the first row is the header, so skip it
    # (slicing also copes with an empty result table)
    rows = page.select("#MainBody_gvSearch tr")[1:]

    for row in rows:
        reference_id = row.select_one("[id*='lbSearch']").text.strip()
        ec_number = row.select_one("[id*='lblECNumber']").text.strip()
        method = row.select_one("[id*='lblMethod']").text.strip()
        buffer = row.select_one("[id*='lblBuffer']").text.strip()
        reaction = row.select_one("[id*='lblReaction']").text.strip()
        enzyme = row.select_one("[id*='lblEnzyme']").text.strip()
        cofactor = row.select_one("[id*='lblCofactor']").text.strip()
        evaluation = row.select_one("[id*='lblEvaluation']").text.strip()

        print(f"EC Number: {ec_number}, Reference Id: {reference_id}, Evaluation: {evaluation}")

        # get details
        params = (
            ('ID', reference_id),
            ('finalterm', search_term),
            ('data', ec_name),
        )
        response = session.get(f'{url}DataDetails.aspx', params=params, timeout=request_timeout)
        response.raise_for_status()
        details_page = BeautifulSoup(response.text, "html.parser")

        # parse general information; reset per record so a missing label
        # never leaves the previous record's value behind
        reference = None
        ph = None
        reference_label = details_page.find("span", string='Reference:')
        if reference_label:
            reference = reference_label.find_parent("td").find_next_sibling("td").text.strip()
        ph_label = details_page.find("span", string='pH:')
        if ph_label:
            ph = ph_label.find_parent("td").find_next_sibling("td").text.strip()

        # parse table: the flat list of cells is cut into chunks of
        # len(table_headers) and each chunk becomes one dict keyed by header
        extra_data = []
        table_headers = [x.text.strip() for x in details_page.select("#MainBody_extraData th")]
        table_data = [x.text.strip() for x in details_page.select("#MainBody_extraData td")]

        headers_count = len(table_headers)
        if headers_count == 0:
            # Explicit guard: range() with a step of zero raises ValueError.
            print("No details table found")
        else:
            for i in range(0, len(table_data), headers_count):
                details_row = dict(zip(table_headers, table_data[i:i + headers_count]))
                extra_data.append(details_row)
                try:
                    print("T(K): {}, pH: {}, K': {}".format(
                        details_row["T(K)"], details_row["pH"], details_row["K'"]))
                except KeyError as ex:
                    # The table exists but lacks one of the printed columns.
                    print(f"Details table has no column {ex}")

        print("")

Output of some values:

EC Number: 1.1.1.1, Reference Id: 36EUL/ADL_7, Evaluation: C
T(K): 298.15, pH: 6.4, K': 1.3E-5
T(K): 298.15, pH: 7.0, K': 5.3E-5
T(K): 298.15, pH: 7.7, K': 1.3E-4

EC Number: 1.1.1.1, Reference Id: 37ADL/SRE_8, Evaluation: D
T(K): 298.15, pH: 6.05, K': 6.0E-6
T(K): 298.15, pH: 7.25, K': 7.7E-5
T(K): 298.15, pH: 8.0, K': 1.2E-5

EC Number: 1.1.1.1, Reference Id: 37NEG/WUL_9, Evaluation: C
T(K): 293.15, pH: 7.9, K': 7.41E-4

EC Number: 1.1.1.1, Reference Id: 38SCH/HEL_10, Evaluation: C
T(K): 298.15, pH: 6.30, K': 2.6E-5
T(K): 298.15, pH: 6.85, K': 8.8E-5
T(K): 298.15, pH: 7.15, K': 1.9E-4
T(K): 298.15, pH: 7.34, K': 3.0E-4
T(K): 298.15, pH: 7.61, K': 5.1E-4
T(K): 298.15, pH: 7.77, K': 8.0E-4
T(K): 298.15, pH: 8.17, K': 2.2E-3

EC Number: 1.1.1.1, Reference Id: 38SCH/HEL_23, Evaluation: C
T(K): 298.15, pH: 6.39, K': 9.1E-6
T(K): 298.15, pH: 6.60, K': 3.0E-5
T(K): 298.15, pH: 6.85, K': 5.1E-5
T(K): 298.15, pH: 7.18, K': 1.5E-4
T(K): 298.15, pH: 7.31, K': 2.3E-4
T(K): 298.15, pH: 7.69, K': 5.6E-4
T(K): 298.15, pH: 8.06, K': 1.1E-3

Solution 2:

I presume you would like to click each link in the table and then grab the contents of the details page after navigating to it.

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Search the NIST enzyme page in a real browser, open every result link in
# turn, print the label/value rows of its details page, then go back.
search_input = '1.1.1.1'

# Locators shared by the waits below.
links_locator = (By.XPATH, "//table[@id='MainBody_gvSearch']//tr/td[1]/a")
details_rows_locator = (By.XPATH, "//table[@id='MainBody_DataList1']//table//tr")

# NOTE(review): passing the driver path positionally is the older Selenium
# calling convention; confirm it matches the installed Selenium version.
driver = webdriver.Chrome('path of the chrome driver')
try:
    driver.get('https://randr.nist.gov/enzyme/Default.aspx')
    driver.find_element(By.ID, 'MainBody_txtSrchAutoFill').send_keys(search_input)
    driver.find_element(By.ID, 'MainBody_ImgSrch').click()

    # Located once up front only to learn how many result links there are.
    links = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located(links_locator))
    for index in range(len(links)):
        # Look the links up again on every pass: the loop navigates away and
        # back, so element handles from an earlier page load are presumably
        # no longer usable. That is why this iterates by index.
        links = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located(links_locator))
        print("################################")
        print(links[index].text)
        print("################################")
        links[index].click()
        try:
            itemrows = WebDriverWait(driver, 20).until(
                EC.visibility_of_all_elements_located(details_rows_locator))
            for row in itemrows:
                print(row.find_element(By.XPATH, "./td[1]").text + " " + row.find_element(By.XPATH, "./td[2]").text)
        except (TimeoutException, NoSuchElementException):
            # The details table did not show up (or a row lacked the expected
            # cells): fall back to the page's error label.
            print(driver.find_element(By.ID, "MainBody_lblErrorDetails").text)

        driver.back()
finally:
    # Always shut the browser down, even when scraping fails part-way.
    driver.quit()

Output on console:

################################36EUL/ADL_7################################Reference:vonEuler,H.;Adler,E.;Hellstrvm,H.;Hoppe-Seyler'sZ.Physiol.Chem.;241,239(1936).Reference ID:36EUL/ADL_7EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphatepH:6.4-7.7Evaluation:C################################37ADL/SRE_8################################Reference:Adler,E.;Sreenivasaya,M.;Hoppe-Seyler'sZ.Physiol.Chem.;249,24(1937).Reference ID:37ADL/SRE_8EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pH:6.05-8.0Evaluation:D################################37NEG/WUL_9################################Reference:Negelein,E.;Wulff,H.-J.;Biochem.Z.;293,351(1937).Reference ID:37NEG/WUL_9EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphatepH:7.9Evaluation:C################################38SCH/HEL_10################################Reference:Schlenk,F.;Hellstrvm,H.;vonEuler,H.;Ber.Dtsch.Chem.Ges.;71,1471(1938).Reference ID:38SCH/HEL_10EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pH:6.30-8.17Evaluation:C################################38SCH/HEL_23################################Reference:Schlenk,F.;Hellstrvm,H.;vonEuler,H.;Ber.Dtsch.Chem.Ges.;71,1471(1938).Reference ID:38SCH/HEL_23EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pH:6.39-8.06Evaluation:C################################50RAC_11################################Reference:Racker,E.;J.Biol.Chem.;184,313(1950).Reference ID:50RAC_11EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pyrophosphate(0.01moldm-3)pH:7.4-9.5Evaluation:B################################51BLA_65################################Reference:Blakley,R.L.;Biochem.J.;49,257(1951).Reference ID:51BLA_65EC Value:1.1.1.14(ENZYME|KEGG)Method:spectrophotometryandchemicalanalysisBuffer:potassiumphosphate(0.03moldm-3)pH:8.0Evaluation:B################################51BLI_35################################Reference:Bliss,A.F.;Arch.Biochem.Biophys.;31,197(1951).Reference ID:51BLI_35EC 
Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:sodiumpyrophosphate(0.015moldm-3)pH:6.6-9.5Evaluation:C################################51THE/BON_12################################Reference:Theorell,H.;Bonnichsen,R.;ActaChem.Scand.;5,1105(1951).Reference ID:51THE/BON_12EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphate(0.05moldm-3)and {glycine(0.10moldm-3)+NaOH}
pH:7.0-10.0Evaluation:B################################52BUR_29################################Reference:Burton,K.;Biochim.Biophys.Acta;8,114(1952).Reference ID:52BUR_29EC Value:1.1.1.1(ENZYME|KEGG)Method:Buffer:pH:Evaluation:B################################53BUR/WIL_30################################Reference:Burton,K.;Wilson,T.H.;Biochem.J.;54,86(1953).Reference ID:53BUR/WIL_30EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:pyrophosphate(0.0055moldm-3)pH:7.03-8.83Evaluation:A################################54WIL/BAN_66################################Reference:Williams-Ashman,H.G.;Banks,J.;Arch.Biochem.Biophys.;50,513(1954).Reference ID:54WIL/BAN_66EC Value:1.1.1.14(ENZYME|KEGG)Method:spectrophotometryBuffer:pH:Evaluation:C################################55WOL/KAP_75################################Reference:Wolff,J.B.;Kaplan,N.O.;MethodsEnzymol.;1,346(1955).Reference ID:55WOL/KAP_75EC Value:1.1.1.17(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphate(0.1moldm-3)orbicarbonate(0.1moldm-3)pH:7.0Evaluation:C################################56KAP/CIO_13################################Reference:Kaplan,N.O.;Ciotti,M.M.;Stolzenbach,F.E.;J.Biol.Chem.;221,833(1956).Reference ID:56KAP/CIO_13EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphate(0.1moldm-3)pH:6.51-8.07Evaluation:C################################56KAP/CIO_22################################Reference:Kaplan,N.O.;Ciotti,M.M.;Stolzenbach,F.E.;J.Biol.Chem.;221,833(1956).Reference ID:56KAP/CIO_22EC Value:1.1.1.1(ENZYME|KEGG)Method:spectrophotometryBuffer:phosphate(0.1moldm-3)pH:6.51-8.07Evaluation:C################################56LAR/JAC_77################################Reference:Larner,J.;Jackson,W.T.;Graves,D.J.;Stamer,J.R.;Arch.Biochem.Biophys.;60,352(1956).Reference ID:56LAR/JAC_77EC 
Value:1.1.1.18(ENZYME|KEGG)Method:spectrophotometryBuffer:pyrophosphate(0.01moldm-3)pH:8.10-8.92Evaluation:B################################56WOL/KAP_76################################Reference:Wolff,J.B.;Kaplan,N.O.;J.Biol.Chem.;218,849(1956).Reference ID:56WOL/KAP_76EC Value:1.1.1.17(ENZYME|KEGG)Method:chemicalanalysisandspectrophotometryBuffer:pH:6-10Evaluation:C################################57HOL/TOU_56################################Reference:Hollmann,S.;Touster,O.;J.Biol.Chem.;225,87(1957).Reference ID:57HOL/TOU_56EC Value:1.1.1.10(ENZYME|KEGG)Method:spectrophotometryBuffer:Tris(0.05moldm-3)pH:6.95-8.70Evaluation:Bsoon.......................

Post a Comment for "Accessing The Contents On Links Provided On A Webpage While Webscraping"