Get Authors Name And Url For Tag From Google Scholar
Solution 1:
I'm not writing the code for you.. but I'll give you an outline for how you can.
Look at the bottom of the page. See the next button? Search for it: the containing div has an id of `gsc_authors_bottom_pag`, which should be easy to find. I'd do this with Selenium: find the next button (the right-hand one) and click it. Wait for the page to load, scrape, repeat. Handle edge cases (running out of pages, etc.).
If the `after_author=*` bit didn't change in the URL you could just increment the URL's `start` value, but unless you want to try to crack that code (unlikely), just click the next button.
Solution 2:
This page uses `<button>` instead of `<a>` for the links to the next/previous page.
The button to the next page has `aria-label="Następna"`.
There are two buttons to the next page, but you can use either of them.
The button has JavaScript code that redirects to the new page, `window.location='url_to_next_page'`, but it is plain text, so you can use slicing to extract only the URL:
import urllib.request
from bs4 import BeautifulSoup
# Entry page: Google Scholar author search for the "security" label.
url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security"

# Keep following the "next page" button until a page no longer has one.
while True:
    # Use the response as a context manager so the connection is closed
    # as soon as the page has been parsed.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, 'lxml')

    # ... do something on page ...

    # find buttons to next page
    buttons = soup.findAll("button", {"aria-label": "Następna"})

    # exit if no buttons
    if not buttons:
        break

    # The button's `onclick` holds plain JavaScript text of the form
    # window.location='<relative url>'
    on_click = buttons[0].get('onclick')
    print('javascript:', on_click)

    # add `domain` and remove `window.location='` (17 chars) and `'` at the end
    url = 'http://scholar.google.pl' + on_click[17:-1]
    # converting some codes to chars (the JavaScript text contains escape
    # sequences that have to be turned back into real characters)
    url = url.encode('utf-8').decode('unicode_escape')
    print('url:', url)
BTW: if you speak Polish then you can visit on Facebook: Python Poland or Python: pierwsze kroki
Solution 3:
Since furas has already answered how to loop through all pages, this is a complementary answer to his. The script below scrapes much more than your question asks for and saves the results to a `.csv` file.
from bs4 import BeautifulSoup
import requests, lxml, os, csv
# Request headers passed to requests.get(): a desktop-browser User-agent
# string is sent with the request.
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
# Proxy mapping passed to requests.get(); the 'http' entry is read from the
# HTTP_PROXY environment variable (os.getenv returns None when it is unset).
proxies = {
'http': os.getenv('HTTP_PROXY')
}
def get_profiles_to_csv():
    """Scrape author names and profile URLs into ``awesome_file.csv``.

    Requests the Google Scholar author-search page for ``label:security``,
    reads the name and profile link from every ``.gs_ai_chpr`` result
    container, and writes them to a CSV file with the columns ``Author``
    and ``URL``. Only the single requested page is processed.
    """
    html = requests.get(
        'http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security',
        headers=headers,
        proxies=proxies,
        timeout=30,  # do not hang forever if the server never answers
    ).text
    soup = BeautifulSoup(html, 'lxml')

    # creating CSV File
    # newline='' is what the csv module expects (prevents blank rows on
    # Windows); an explicit encoding keeps non-ASCII author names intact.
    with open('awesome_file.csv', mode='w', newline='', encoding='utf-8') as csv_file:
        # defining column names
        fieldnames = ['Author', 'URL']
        # defining .csv writer
        # https://docs.python.org/3/library/csv.html#csv.DictWriter
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        # writing (creating) columns
        writer.writeheader()

        # collecting scraped data
        author_data = []

        # Selecting container where all data located
        for result in soup.select('.gs_ai_chpr'):
            # One lookup serves both the author name and the profile link.
            profile_anchor = result.select_one('.gs_ai_name a')
            name = profile_anchor.text
            link = profile_anchor['href']

            # Further fields that can be extracted the same way:
            # https://stackoverflow.com/a/6633693/15164646
            # id = link
            # id_identifer = 'user='
            # before_keyword, keyword, after_keyword = id.partition(id_identifer)
            # author_id = after_keyword
            # affiliations = result.select_one('.gs_ai_aff').text
            # email = result.select_one('.gs_ai_eml').text
            # try:
            #     interests = result.select_one('.gs_ai_one_int').text
            # except:
            #     interests = None
            # "Cited by 107390" = getting text string -> splitting by a space
            # -> ['Cited', 'by', '21180'] and taking [2] index which is the number.
            # cited_by = result.select_one('.gs_ai_cby').text.split(' ')[2]

            # because we have a csv.DictWriter() we converting to the required format
            # dict() keys should be exactly the same as fieldnames, otherwise it will throw an error
            author_data.append({
                'Author': name,
                'URL': f'https://scholar.google.com{link}',
            })

        # writing every collected author dict() to the .csv
        writer.writerows(author_data)

        # print(f'{name}\nhttps://scholar.google.com{link}\n{author_id}\n{affiliations}\n{email}\n{interests}\n{cited_by}\n')


# output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=pl&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=pl&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=pl&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=pl&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=pl&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=pl&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=pl&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=pl&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=pl&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=pl&user=ty7wIXoAAAAJ
'''
Alternatively, you can do the same thing using Google Scholar Profiles API from SerpApi. It's a paid API with a free trial of 5,000 searches.
Code to integrate:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import csv, os
def get_profiles_to_csv():
    """Write every ``label:security`` author profile returned by SerpApi to a CSV.

    Queries the ``google_scholar_profiles`` engine, writes each profile's
    name and link to ``awesome_serpapi_file_pagination.csv`` (columns
    ``Author`` and ``URL``), and keeps requesting the next page until the
    response contains no profiles or no further page link. The API key is
    read from the ``API_KEY`` environment variable.
    """
    # newline='' is what the csv module expects (prevents blank rows on
    # Windows); an explicit encoding keeps non-ASCII author names intact.
    with open('awesome_serpapi_file_pagination.csv', mode='w', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['Author', 'URL']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        params = {
            "api_key": os.getenv("API_KEY"),
            "engine": "google_scholar_profiles",
            "mauthors": "label:security"
        }

        search = GoogleSearch(params)

        while True:
            results = search.get_dict()

            try:
                for result in results['profiles']:
                    name = result['name']
                    link = result['link']
                    writer.writerow({'Author': name, 'URL': link})
            except KeyError:
                # The response has no 'profiles' (or a profile lacks a
                # field): treat it as the end of the data. Only KeyError is
                # caught so that real errors and Ctrl-C are not swallowed.
                break

            # Stop when EITHER 'pagination' or its 'next' link is missing.
            # The original test joined the two checks with `and`, which
            # raised KeyError in exactly the cases it was meant to detect.
            next_page = results.get('pagination', {}).get('next')
            if not next_page:
                break

            # Copy the query parameters of the next-page URL into the search
            # so the following get_dict() call fetches that page.
            search.params_dict.update(dict(parse_qsl(urlsplit(next_page).query)))

        print('Done')


get_profiles_to_csv()

# part of the output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=en&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=en&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=en&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=en&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=en&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=en&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=en&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=en&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=en&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=en&user=ty7wIXoAAAAJ
'''
Disclaimer, I work for SerpApi.
Post a Comment for "Get Authors Name And Url For Tag From Google Scholar"