├── Google-Scraper-API-1090x275.png
├── main.py
└── README.md

/Google-Scraper-API-1090x275.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oxylabs/how-to-scrape-google-scholar/HEAD/Google-Scraper-API-1090x275.png
--------------------------------------------------------------------------------

/main.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup


# Oxylabs Web Scraper API credentials.
USERNAME = "USERNAME"
PASSWORD = "PASSWORD"


def get_html_for_page(url):
    """Fetch the HTML of a page through the Oxylabs Web Scraper API."""
    payload = {
        "url": url,
        "source": "google",
    }
    response = requests.post(
        "https://realtime.oxylabs.io/v1/queries",
        auth=(USERNAME, PASSWORD),
        json=payload,
    )
    response.raise_for_status()
    return response.json()["results"][0]["content"]


def get_citations(article_id):
    """Collect the formatted citation strings for a single article."""
    url = f"https://scholar.google.com/scholar?q=info:{article_id}:scholar.google.com&output=cite"
    html = get_html_for_page(url)
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for citation in soup.find_all("tr"):
        title = citation.find("th", {"class": "gs_cith"}).get_text(strip=True)
        content = citation.find("div", {"class": "gs_citr"}).get_text(strip=True)
        data.append({
            "title": title,
            "content": content,
        })
    return data


def parse_data_from_article(article):
    """Extract the title, authors, URL, and citations from one search result."""
    title_elem = article.find("h3", {"class": "gs_rt"})
    title = title_elem.get_text()
    title_anchor_elem = article.select("a")[0]
    url = title_anchor_elem["href"]
    article_id = title_anchor_elem["id"]
    authors = article.find("div", {"class": "gs_a"}).get_text()
    return {
        "title": title,
        "authors": authors,
        "url": url,
        "citations": get_citations(article_id),
    }


def get_url_for_page(url, page_index):
    """Append the pagination offset to the search URL."""
    return url + f"&start={page_index}"


def get_data_from_page(url):
    """Parse every search result container on a single results page."""
    html = get_html_for_page(url)
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("div", {"class": "gs_ri"})
    return [parse_data_from_article(article) for article in articles]


data = []
url = "https://scholar.google.com/scholar?q=global+warming&hl=en&as_sdt=0,5"

NUM_OF_PAGES = 1
page_index = 0
for _ in range(NUM_OF_PAGES):
    page_url = get_url_for_page(url, page_index)
    entries = get_data_from_page(page_url)
    data.extend(entries)
    page_index += 10  # Google Scholar offsets result pages in steps of 10.

print(data)
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# How to Scrape Google Scholar

[![Oxylabs promo code](https://raw.githubusercontent.com/oxylabs/how-to-scrape-google-scholar/refs/heads/main/Google-Scraper-API-1090x275.png)](https://oxylabs.io/products/scraper-api/serp/google?utm_source=877&utm_medium=affiliate&groupid=877&utm_content=how-to-scrape-google-scholar-github&transaction_id=102c8d36f7f0d0e5797b8f26152160)

[![](https://dcbadge.limes.pink/api/server/Pds3gBmKMH?style=for-the-badge&theme=discord)](https://discord.gg/Pds3gBmKMH) [![YouTube](https://img.shields.io/badge/YouTube-Oxylabs-red?style=for-the-badge&logo=youtube&logoColor=white)](https://www.youtube.com/@oxylabs)

This tutorial shows how to get titles, authors, and citations from [Google Scholar](https://scholar.google.com/) using Oxylabs [SERP Scraper API](https://oxylabs.io/products/scraper-api/serp) (a part of Web Scraper API) and Python. You can get a **1-week free trial** by registering on the [dashboard](https://dashboard.oxylabs.io/).

For a detailed walkthrough with explanations and visuals, check our [blog post](https://oxylabs.io/blog/how-to-scrape-google-scholar). Also, do not hesitate to check out this [Best SERP APIs](https://medium.com/@oxylabs.io/the-10-best-serp-apis-in-2025-22bf7f91f8f0) list.

## The complete code

```python
import requests
from bs4 import BeautifulSoup


# Oxylabs Web Scraper API credentials.
USERNAME = "USERNAME"
PASSWORD = "PASSWORD"


def get_html_for_page(url):
    """Fetch the HTML of a page through the Oxylabs Web Scraper API."""
    payload = {
        "url": url,
        "source": "google",
    }
    response = requests.post(
        "https://realtime.oxylabs.io/v1/queries",
        auth=(USERNAME, PASSWORD),
        json=payload,
    )
    response.raise_for_status()
    return response.json()["results"][0]["content"]


def get_citations(article_id):
    """Collect the formatted citation strings for a single article."""
    url = f"https://scholar.google.com/scholar?q=info:{article_id}:scholar.google.com&output=cite"
    html = get_html_for_page(url)
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for citation in soup.find_all("tr"):
        title = citation.find("th", {"class": "gs_cith"}).get_text(strip=True)
        content = citation.find("div", {"class": "gs_citr"}).get_text(strip=True)
        data.append({
            "title": title,
            "content": content,
        })
    return data


def parse_data_from_article(article):
    """Extract the title, authors, URL, and citations from one search result."""
    title_elem = article.find("h3", {"class": "gs_rt"})
    title = title_elem.get_text()
    title_anchor_elem = article.select("a")[0]
    url = title_anchor_elem["href"]
    article_id = title_anchor_elem["id"]
    authors = article.find("div", {"class": "gs_a"}).get_text()
    return {
        "title": title,
        "authors": authors,
        "url": url,
        "citations": get_citations(article_id),
    }


def get_url_for_page(url, page_index):
    """Append the pagination offset to the search URL."""
    return url + f"&start={page_index}"


def get_data_from_page(url):
    """Parse every search result container on a single results page."""
    html = get_html_for_page(url)
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("div", {"class": "gs_ri"})
    return [parse_data_from_article(article) for article in articles]


data = []
url = "https://scholar.google.com/scholar?q=global+warming&hl=en&as_sdt=0,5"

NUM_OF_PAGES = 1
page_index = 0
for _ in range(NUM_OF_PAGES):
    page_url = get_url_for_page(url, page_index)
    entries = get_data_from_page(page_url)
    data.extend(entries)
    page_index += 10  # Google Scholar offsets result pages in steps of 10.

print(data)
```
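
Running the script prints the scraped articles as a list of Python dictionaries. If you want to keep the results, here is a minimal optional sketch (not part of the original tutorial) that writes the `data` list to a JSON file using only the standard library; it assumes it runs right after the pagination loop above:

```python
import json

# Persist the scraped entries so they can be inspected or reused later.
# Assumes `data` is the list of article dictionaries built by the loop above.
with open("scholar_results.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
```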
## Final word

If you have any questions, feel free to contact us at support@oxylabs.io.

Check our [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/google) for more details on the API parameters and variables used in this tutorial.
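
For instance, the request payload built in `get_html_for_page()` can carry more fields than the `url` and `source` used above. The sketch below is illustrative only; the `geo_location` field and its value are an assumption here, so confirm the exact parameter names and accepted values in the documentation before relying on them:

```python
# Illustrative only: an extended payload for get_html_for_page().
# Verify parameter names and values against the Oxylabs documentation.
payload = {
    "url": "https://scholar.google.com/scholar?q=global+warming&hl=en&as_sdt=0,5",
    "source": "google",
    "geo_location": "United States",  # Assumed: serve results as seen from this location.
}
```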
Read more Google scraping related repositories: [Google Sheets for Basic Web Scraping](https://github.com/oxylabs/web-scraping-google-sheets), [How to Scrape Google Shopping Results](https://github.com/oxylabs/scrape-google-shopping), [Google Play Scraper](https://github.com/oxylabs/google-play-scraper), [How To Scrape Google Jobs](https://github.com/oxylabs/how-to-scrape-google-jobs), [Google News Scraper](https://github.com/oxylabs/google-news-scraper), [How to Scrape Google Flights with Python](https://github.com/oxylabs/how-to-scrape-google-flights), [How To Scrape Google Images](https://github.com/oxylabs/how-to-scrape-google-images), [Scrape Google Search Results](https://github.com/oxylabs/scrape-google-python), [Scrape Google Trends](https://github.com/oxylabs/how-to-scrape-google-trends)
--------------------------------------------------------------------------------