├── .github ├── octo-reminder.yml └── workflows │ ├── codecov-workflow.yml │ └── potential-duplicates.yml ├── .gitignore ├── .gitpod.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── example_usage.py ├── google_scholar_py ├── __init__.py ├── custom_backend │ ├── author_info_all_articles.py │ ├── cite_results.py │ ├── google_scholar_cited_by_public_access_author.py │ ├── organic_search.py │ ├── profiles_results.py │ ├── top_mandates_metrics.py │ ├── top_publications_article.py │ ├── top_publications_article_citation.py │ └── top_publications_metrics.py └── serpapi_backend │ ├── author_results.py │ ├── organic_cite_results.py │ ├── organic_results.py │ └── profile_results.py ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tests ├── __init__.py └── test_custom_profile.py /.github/octo-reminder.yml: -------------------------------------------------------------------------------- 1 | # Octo Reminder Configuration 2 | 3 | ## Command Prefix 4 | ### Define the prefix of your custom command. 5 | ### Type: '/' | '!' 6 | ### Default: '@' (works only in combination with command_name 'set-reminder') 7 | command_prefix: '@' 8 | 9 | ## Command Name 10 | ### Define the name of your custom command. 11 | ### Type: String 12 | ### Default: 'set-reminder' 13 | command_name: 'set-reminder' 14 | 15 | ## Language 16 | ### Define the language. 17 | ### Type: 'en' | 'fr' | 'de' | 'pt' | 'nl' | 'ja' 18 | ### Default: 'en' 19 | language: 'en' 20 | 21 | ## Timezone 22 | ### Define the timezone. 23 | ### Type: String (see also https://github.com/moment/moment-timezone/blob/develop/data/packed/latest.json) 24 | ### Default: 'Europe/London' 25 | timezone: 'Europe/London' 26 | 27 | ## Default Hour 28 | ### Define the hour that will be used, when no time is specified. 29 | ### Type: Number 30 | default_hour: 0 31 | 32 | ## Default Minute 33 | ### Define the minute that will be used, when no time is specified. 
34 | ### Type: Number 35 | default_minute: 0 36 | -------------------------------------------------------------------------------- /.github/workflows/codecov-workflow.yml: -------------------------------------------------------------------------------- 1 | name: Test API coverage 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths-ignore: 7 | - 'README.md' 8 | - 'MANIFEST.in' 9 | - 'LICENSE' 10 | - '.gitignore' 11 | - '.gitpod.yaml' 12 | pull_request: 13 | branches: [ main ] 14 | paths-ignore: 15 | - 'README.md' 16 | - 'MANIFEST.in' 17 | - 'LICENSE' 18 | - '.gitignore' 19 | - '.gitpod.yaml' 20 | 21 | jobs: 22 | build: 23 | runs-on: ubuntu-latest 24 | name: Set up Python 3.11 25 | steps: 26 | - uses: actions/checkout@v3 27 | - uses: actions/setup-python@v2 28 | with: 29 | python-version: '3.11' 30 | 31 | - name: Install requirements 32 | run: pip install -r requirements.txt && pip install -r requirements-dev.txt 33 | 34 | - name: Run tests and collect coverage 35 | run: pytest --cov=./ --cov-report=xml:coverage.xml 36 | 37 | - name: Upload coverage reports to Codecov with GitHub Action 38 | uses: codecov/codecov-action@v3 39 | with: 40 | token: ${{ secrets.CODECOV_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/potential-duplicates.yml: -------------------------------------------------------------------------------- 1 | name: Potential Duplicates 2 | on: 3 | issues: 4 | types: [opened, edited] 5 | jobs: 6 | run: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: wow-actions/potential-duplicates@v1 10 | with: 11 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 12 | # Issue title filter work with anymatch https://www.npmjs.com/package/anymatch. 13 | # Any matched issue will stop detection immediately. 14 | # You can specify multi filters in each line. 15 | filter: '' 16 | # Exclude keywords in title before detecting. 17 | exclude: '' 18 | # Label to set, when potential duplicates are detected. 19 | label: potential-duplicate 20 | # Get issues with state to compare. Supported state: 'all', 'closed', 'open'. 21 | state: all 22 | # If similarity is higher than this threshold([0,1]), issue will be marked as duplicate. 23 | threshold: 0.6 24 | # Reactions to be add to comment when potential duplicates are detected. 25 | # Available reactions: "-1", "+1", "confused", "laugh", "heart", "hooray", "rocket", "eyes" 26 | reactions: 'eyes' 27 | # Comment to post when potential duplicates are detected. 
28 | comment: > 29 | Potential duplicates: {{#issues}} 30 | - [#{{ number }}] {{ title }} ({{ accuracy }}%) 31 | {{/issues}} 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | env/ 3 | *.egg-info/ 4 | docs/ 5 | __pycache__/ -------------------------------------------------------------------------------- /.gitpod.yaml: -------------------------------------------------------------------------------- 1 | tasks: 2 | - name: Update Linux and Install other Chrome Dependencies 3 | init: | 4 | sudo apt-get update -y && sudo apt-get upgrade -y && sudo apt-get install -y libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1 5 | - name: Create VirtualEnv, Install Python Dependencies 6 | init: | 7 | python -m venv env 8 | source env/bin/activate 9 | pip install -r requirements.txt 10 | github: 11 | prebuilds: 12 | addBadge: true 13 | vscode: 14 | extensions: 15 | - usernamehw.errorlens 16 | - vscode-icons-team.vscode-icons 17 | - bierner.markdown-preview-github-styles 18 | - ms-python.python 19 | - ms-toolsai.jupyter 20 | - KevinRose.vsc-python-indent 21 | - eamodio.gitlens 22 | - Gruntfuggly.todo-tree -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Dmitiry Zub☀️ 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE requirements.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

Special thanks to:

3 |
4 | SerpApi 5 |
6 | 7 | API to get search engine results with ease. 8 | 9 |
10 | 11 | ____ 12 | 13 |

14 | Scrape data from all Google Scholar pages from a single Python module. 15 |

16 | 17 |
18 | scrape-google-scholar-py-logo 19 |
20 | 21 | 22 |
23 | 24 | ![Downloads](https://static.pepy.tech/badge/scrape-google-scholar-py/month) 25 | ![licence](https://img.shields.io/github/license/dimitryzub/scrape-google-scholar-py?color=blue) 26 | [![codecov](https://codecov.io/github/dimitryzub/scrape-google-scholar-py/branch/main/graph/badge.svg?token=OIQKN0O3B9)](https://codecov.io/github/dimitryzub/scrape-google-scholar-py) 27 | 28 |
29 | 30 | > NOTE: As of now (2025), I'm no longer maintaining this repo. This could change later. To fix common issues, the Chrome driver and CSS selectors might need an update. 31 | 32 |
33 | 🧐 Why two backends? 34 | 35 | 1. If you don't want to pay for the API. However, I'm not 100% sure if [`selenium-stealth`](https://pypi.org/project/selenium-stealth/) can handle all CAPTCHAs (it does handle the Cloudflare CAPTCHA) and similar blocks. 36 | 2. If you know about SerpApi but don't want to figure out pagination. 37 | 38 | The SerpApi backend is more reliable because of: 39 | - a dedicated team of maintainers 40 | - a pool of proxies 41 | - CAPTCHA solvers 42 | - the legal part of scraping, and more. 43 | 44 |
45 | 46 | 47 |
48 | 🧩 Custom backend supports 49 | 50 | 1. [Organic results](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=blizzard&btnG=&oq=blizz) (with pagination). 51 | 2. [Profile results](https://scholar.google.com/citations?view_op=search_authors&mauthors=blizzard&hl=en&oi=drw) (with pagination). 52 | 3. [Author + author articles](https://scholar.google.com/citations?user=6IQ8pQwAAAAJ&hl=en&oi=sra) (with pagination), everything except the "cited by" graph. 53 | 4. [Public access mandates metrics](https://scholar.google.com/citations?view_op=mandates_leaderboard&hl=en). Yes, you can download the CSV with one click; however, it doesn't contain a funder link. The script here has it and saves to CSV/JSON. 54 | 5. [Top publications metrics](https://scholar.google.com/citations?view_op=top_venues&hl=en). Categories are also supported (as a function argument). Saves to CSV/JSON. Sub-categories are not yet supported. 55 | 6. [Journal articles](https://github.com/dimitryzub/scrape-google-scholar/issues/2) (with pagination). See the usage sketch right after this list. 56 | 57 | You can use [`scholarly`](https://github.com/scholarly-python-package/scholarly) to parse the data instead. However, it only extracts the first 3 points above (organic, profile, author results). 58 | 59 |
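For example, the journal/top-publication articles scraper from point 6 can be called like this (this mirrors `example_usage.py` from this repo; the venue link is just an illustrative Google Scholar journal URL):

```python
from google_scholar_py import CustomGoogleScholarTopPublicationArticle
import json

# scrape all articles of a specific journal listed on the top publications page
articles = CustomGoogleScholarTopPublicationArticle().scrape_google_scholar_top_publication_articles(
    journal_publications_link='https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=TdhLrHqKTh8J.2022',
    pagination=True,
    save_to_csv=False,
    save_to_json=False
)
print(json.dumps(articles, indent=2, ensure_ascii=False))
```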
60 | Things custom backend doesn't support yet 61 | 62 | 1. Organic results filters (case law, sorting, period ranges). You can add those URL parameters yourself ([if installing from source](https://github.com/dimitryzub/scrape-google-scholar-py#installing)) easily in the `google_scholar_py/custom_backend/organic_search.py` file (line [`147`](https://github.com/dimitryzub/scrape-google-scholar-py/blob/a6b3b39042eabdc84851e3c1ca3c246e55bf19d1/google_scholar_py/custom_backend/organic_search.py#L147) or [`136`](https://github.com/dimitryzub/scrape-google-scholar-py/blob/a6b3b39042eabdc84851e3c1ca3c246e55bf19d1/google_scholar_py/custom_backend/organic_search.py#L160)), where `driver.get()` is called; see the sketch at the end of this list. 63 | 2. Author page -> cited by graph. 64 | 3. Extracting the [journal articles page](https://scholar.google.com/citations?hl=uk&vq=en&view_op=list_hcore&venue=9oNLl9DgMnQJ.2022). The [issue to add this page is open](https://github.com/dimitryzub/scrape-google-scholar/issues/2). 65 | 4. [Top publications metrics page](https://scholar.google.com/citations?view_op=top_venues&hl=en). Subcategories are not yet supported; it's on the TODO list. 66 | 5. Update [cite results](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=blizzard+effects+xanax&oq=blizzard+effects+x#d=gs_cit&t=1674718593252&u=%2Fscholar%3Fq%3Dinfo%3Alm-jhjzd72UJ%3Ascholar.google.com%2F%26output%3Dcite%26scirp%3D7%26hl%3Den) page extraction.
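A minimal sketch of point 1, assuming you only want a period-range filter: extend the URL that `organic_search.py` passes to `driver.get()` with Google Scholar's `as_ylo`/`as_yhi` ("from year"/"to year") URL parameters. `driver`, `query`, and `page_num` are the names already used in that file; treat this as an untested illustration, not a patch.

```python
# sketch: add a period-range filter to the organic results URL in organic_search.py
year_from, year_to = 2015, 2020  # illustrative values
driver.get(
    f'https://scholar.google.com/scholar?q={query}&hl=en&gl=us&start={page_num}'
    f'&as_ylo={year_from}&as_yhi={year_to}'  # as_ylo/as_yhi are standard Scholar URL params
)
```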
68 |
69 | 70 |
71 | 🔮 SerpApi backend supports 72 | 73 | - [Google Scholar Organic](https://serpapi.com/google-scholar-organic-results) 74 | - [Google Scholar Profiles](https://serpapi.com/google-scholar-profilesapi) 75 | - [Google Scholar Author](https://serpapi.com/google-scholar-author-api) 76 | - [Google Scholar Cite](https://serpapi.com/google-scholar-cite-api) 77 |
78 | 79 |
80 | 🏗 Custom backend depends on 81 | 82 | - [`selenium-stealth`](https://github.com/diprajpatra/selenium-stealth) - to bypass CAPTCHAs and render some HTML (like cite results from an organic result). 83 | - [`selectolax`](https://github.com/rushter/selectolax) - to parse HTML fast. It's the fastest Python parser, a wrapper around [`lexbor`](https://github.com/lexbor/lexbor) (a parser written in pure C). 84 | - [`pandas`](https://pandas.pydata.org/) - to save extracted data to CSV or JSON, or to analyze the data right away. The save options are used in the organic results, top publications, and public access mandates pages for now. 85 | 86 | All scripts use headless [`selenium-stealth`](https://github.com/diprajpatra/selenium-stealth) to bypass the CAPTCHA that appears on Google Scholar, so you need a `chromedriver`. If you're on Linux, you may need to do additional troubleshooting if `chromedriver` won't run properly. 87 |
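Roughly, every custom-backend module wires these pieces together like this (condensed from `organic_search.py` in this repo; `webdriver_manager` downloads a matching `chromedriver` automatically, so there is usually nothing to install by hand):

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager
from selectolax.lexbor import LexborHTMLParser

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # no visible browser window

# webdriver_manager fetches a chromedriver binary that matches the installed Chrome
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
stealth(driver, languages=['en-US', 'en'], vendor='Google Inc.', platform='Win32',
        webgl_vendor='Intel Inc.', renderer='Intel Iris OpenGL Engine', fix_hairline=True)

driver.get('https://scholar.google.com/scholar?q=blizzard&hl=en')
parser = LexborHTMLParser(driver.page_source)  # fast lexbor-based parsing of the rendered page
driver.quit()
```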
88 | 89 | ## 📥Installing 90 | 91 | Install via `pip`: 92 | 93 | ```bash 94 | $ pip install scrape-google-scholar-py 95 | ``` 96 | 97 | Install from source (single piped command): 98 | 99 | ```bash 100 | git clone https://github.com/dimitryzub/scrape-google-scholar-py.git \ 101 | && cd scrape-google-scholar-py \ 102 | && python -m venv env && source env/Scripts/activate \ 103 | && pip install -r requirements.txt 104 | ``` 105 | 106 | ### Possible errors that you might encounter 107 | 108 |
109 | LINUX USERS: If it throws "Web-driver exits unexpectedly" error 110 | 111 | Try installing extra dependencies to run `chromedriver`: 112 | ```bash 113 | $ apt-get install -y libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1 114 | ``` 115 | 116 | See resolved issue: [[Linux] Web-driver exits unexpectedly using CustomGoogleScholarOrganic() #7](https://github.com/dimitryzub/scrape-google-scholar-py/issues/7) 117 |
118 | 119 | 120 |
121 | For MAC users, possible issues and fixes 122 | 123 | - ✅ [(resolved question): Wheels failed to build while pip installing](https://github.com/dimitryzub/scrape-google-scholar-py/issues/12#issuecomment-1554266222) 124 |
125 | 126 | 127 |
128 | If it throws an error with `selenium-stealth` 129 | 130 | ```bash 131 | error: The 'selenium' distribution was not found and is required by selenium-stealth 132 | ``` 133 | 134 | Use: 135 | 136 | ```bash 137 | $ pip install selenium-stealth 138 | ``` 139 |
140 | 141 | ## 📝Example usage custom backend 142 | 143 | ```python 144 | from google_scholar_py import CustomGoogleScholarProfiles 145 | import json 146 | 147 | parser = CustomGoogleScholarProfiles() 148 | data = parser.scrape_google_scholar_profiles( 149 | query='blizzard', 150 | pagination=False, 151 | save_to_csv=False, 152 | save_to_json=False 153 | ) 154 | print(json.dumps(data, indent=2)) 155 | ``` 156 | 157 |
158 | Google Scholar search operators can also be used 159 | 160 | ```lang-none 161 | label:computer_vision "Michigan State University"|"U.Michigan" 162 | ``` 163 | 164 | This query searches for profiles from the two universities that match the "computer vision" label. 165 |
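A hedged sketch of passing such an operator query to the profiles parser (same call as the example above; only the `query` string changes):

```python
from google_scholar_py import CustomGoogleScholarProfiles
import json

# operator query: "computer vision" profiles at two specific universities
data = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(
    query='label:computer_vision "Michigan State University"|"U.Michigan"',
    pagination=False,
    save_to_csv=False,
    save_to_json=False
)
print(json.dumps(data, indent=2))
```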
166 | 167 | 168 |
169 | JSON output 170 | 171 | ```json 172 | [ 173 | { 174 | "name": "Adam Lobel", 175 | "link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ", 176 | "affiliations": "Blizzard Entertainment", 177 | "interests": [ 178 | "Gaming", 179 | "Emotion regulation" 180 | ], 181 | "email": "Verified email at AdamLobel.com", 182 | "cited_by_count": 3593 183 | }, 184 | { 185 | "name": "Daniel Blizzard", 186 | "link": "https://scholar.google.com/citations?hl=en&user=dk4LWEgAAAAJ", 187 | "affiliations": "", 188 | "interests": null, 189 | "email": null, 190 | "cited_by_count": 1041 191 | }, 192 | { 193 | "name": "Shuo Chen", 194 | "link": "https://scholar.google.com/citations?hl=en&user=OBf4YnkAAAAJ", 195 | "affiliations": "Senior Data Scientist, Blizzard Entertainment", 196 | "interests": [ 197 | "Machine Learning", 198 | "Data Mining", 199 | "Artificial Intelligence" 200 | ], 201 | "email": "Verified email at cs.cornell.edu", 202 | "cited_by_count": 725 203 | }, 204 | { 205 | "name": "Ian Livingston", 206 | "link": "https://scholar.google.com/citations?hl=en&user=xBHVqNIAAAAJ", 207 | "affiliations": "Blizzard Entertainment", 208 | "interests": [ 209 | "Human-computer interaction", 210 | "User Experience", 211 | "Player Experience", 212 | "User Research", 213 | "Games" 214 | ], 215 | "email": "Verified email at usask.ca", 216 | "cited_by_count": 652 217 | }, 218 | { 219 | "name": "Minli Xu", 220 | "link": "https://scholar.google.com/citations?hl=en&user=QST5iogAAAAJ", 221 | "affiliations": "Blizzard Entertainment", 222 | "interests": [ 223 | "Game", 224 | "Machine Learning", 225 | "Data Science", 226 | "Bioinformatics" 227 | ], 228 | "email": "Verified email at blizzard.com", 229 | "cited_by_count": 541 230 | }, 231 | { 232 | "name": "Je Seok Lee", 233 | "link": "https://scholar.google.com/citations?hl=en&user=vuvtlzQAAAAJ", 234 | "affiliations": "Blizzard Entertainment", 235 | "interests": [ 236 | "HCI", 237 | "Player Experience", 238 | "Games", 239 | "Esports" 240 | ], 241 | "email": "Verified email at uci.edu", 242 | "cited_by_count": 386 243 | }, 244 | { 245 | "name": "Alisha Ness", 246 | "link": "https://scholar.google.com/citations?hl=en&user=xQuwVfkAAAAJ", 247 | "affiliations": "Activision Blizzard", 248 | "interests": null, 249 | "email": null, 250 | "cited_by_count": 324 251 | }, 252 | { 253 | "name": "Xingyu (Alfred) Liu", 254 | "link": "https://scholar.google.com/citations?hl=en&user=VW9ukOwAAAAJ", 255 | "affiliations": "Blizzard Entertainment", 256 | "interests": [ 257 | "Machine Learning in Game Development" 258 | ], 259 | "email": null, 260 | "cited_by_count": 256 261 | }, 262 | { 263 | "name": "Amanda LL Cullen", 264 | "link": "https://scholar.google.com/citations?hl=en&user=oqna6OgAAAAJ", 265 | "affiliations": "Blizzard Entertainment", 266 | "interests": [ 267 | "Games Studies", 268 | "Fan Studies", 269 | "Live Streaming" 270 | ], 271 | "email": null, 272 | "cited_by_count": 247 273 | }, 274 | { 275 | "name": "Nicole \"Nikki\" Crenshaw", 276 | "link": "https://scholar.google.com/citations?hl=en&user=zmRH6E0AAAAJ", 277 | "affiliations": "Blizzard Entertainment", 278 | "interests": [ 279 | "MMOs", 280 | "Neoliberalism", 281 | "Social Affordances", 282 | "Identity", 283 | "Accessibility" 284 | ], 285 | "email": "Verified email at uci.edu", 286 | "cited_by_count": 202 287 | } 288 | ] 289 | ``` 290 | 291 |
292 | 293 | 294 | ## 📝Example usage SerpApi backend 295 | 296 | ```python 297 | from google_scholar_py import SerpApiGoogleScholarOrganic 298 | import json 299 | 300 | organic_parser = SerpApiGoogleScholarOrganic() 301 | data = organic_parser.scrape_google_scholar_organic_results( 302 | query='minecraft', 303 | api_key='your-serpapi-api-key', # https://serpapi.com/manage-api-key 304 | pagination=False, 305 | # other params 306 | ) 307 | print(json.dumps(data, indent=2)) 308 | ``` 309 | 310 |
311 | JSON output 312 | 313 | ```json 314 | [ 315 | { 316 | "position": 0, 317 | "title": "Mining learning and crafting scientific experiments: a literature review on the use of minecraft in education and research", 318 | "result_id": "61OUs-3P374J", 319 | "link": "https://www.jstor.org/stable/pdf/jeductechsoci.19.2.355.pdf?&seq=1", 320 | "snippet": "\u2026 Minecraft have aroused the attention of teachers and researchers alike. To gain insights into the applicability of Minecraft, \u2026 our own considerable experience with Minecraft in courses on \u2026", 321 | "publication_info": { 322 | "summary": "S Nebel, S Schneider, GD Rey - Journal of Educational Technology & \u2026, 2016 - JSTOR", 323 | "authors": [ 324 | { 325 | "name": "S Nebel", 326 | "link": "https://scholar.google.com/citations?user=_WTrwUwAAAAJ&hl=en&oi=sra", 327 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=_WTrwUwAAAAJ&engine=google_scholar_author&hl=en", 328 | "author_id": "_WTrwUwAAAAJ" 329 | }, 330 | { 331 | "name": "S Schneider", 332 | "link": "https://scholar.google.com/citations?user=6Lh4FBMAAAAJ&hl=en&oi=sra", 333 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=6Lh4FBMAAAAJ&engine=google_scholar_author&hl=en", 334 | "author_id": "6Lh4FBMAAAAJ" 335 | }, 336 | { 337 | "name": "GD Rey", 338 | "link": "https://scholar.google.com/citations?user=jCilMQoAAAAJ&hl=en&oi=sra", 339 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=jCilMQoAAAAJ&engine=google_scholar_author&hl=en", 340 | "author_id": "jCilMQoAAAAJ" 341 | } 342 | ] 343 | }, 344 | "resources": [ 345 | { 346 | "title": "researchgate.net", 347 | "file_format": "PDF", 348 | "link": "https://www.researchgate.net/profile/Steve-Nebel/publication/301232882_Mining_Learning_and_Crafting_Scientific_Experiments_A_Literature_Review_on_the_Use_of_Minecraft_in_Education_and_Research/links/570e709008aed4bec6fddad4/Mining-Learning-and-Crafting-Scientific-Experiments-A-Literature-Review-on-the-Use-of-Minecraft-in-Education-and-Research.pdf" 349 | } 350 | ], 351 | "inline_links": { 352 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=61OUs-3P374J", 353 | "cited_by": { 354 | "total": 358, 355 | "link": "https://scholar.google.com/scholar?cites=13753940406839825387&as_sdt=2005&sciodt=0,5&hl=en", 356 | "cites_id": "13753940406839825387", 357 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=13753940406839825387&engine=google_scholar&hl=en" 358 | }, 359 | "related_pages_link": "https://scholar.google.com/scholar?q=related:61OUs-3P374J:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 360 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A61OUs-3P374J%3Ascholar.google.com%2F", 361 | "versions": { 362 | "total": 10, 363 | "link": "https://scholar.google.com/scholar?cluster=13753940406839825387&hl=en&as_sdt=0,5", 364 | "cluster_id": "13753940406839825387", 365 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=13753940406839825387&engine=google_scholar&hl=en" 366 | } 367 | } 368 | }, 369 | { 370 | "position": 1, 371 | "title": "Minecraft, beyond construction and survival", 372 | "result_id": "_Lo9erywZPUJ", 373 | "type": "Pdf", 374 | "link": "https://stacks.stanford.edu/file/druid:qq694ht6771/WellPlayed-v1n1-11.pdf#page=9", 375 | "snippet": "\" We\u2019ll keep releasing expansions and keep the game alive, but there needs to be some kind of final version that you can 
point at and say,\u2018I did this!\u2019... I\u2019m not sure why I feel a need to \u2026", 376 | "publication_info": { 377 | "summary": "SC Duncan - 2011 - stacks.stanford.edu", 378 | "authors": [ 379 | { 380 | "name": "SC Duncan", 381 | "link": "https://scholar.google.com/citations?user=Ypqv_IEAAAAJ&hl=en&oi=sra", 382 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=Ypqv_IEAAAAJ&engine=google_scholar_author&hl=en", 383 | "author_id": "Ypqv_IEAAAAJ" 384 | } 385 | ] 386 | }, 387 | "resources": [ 388 | { 389 | "title": "stanford.edu", 390 | "file_format": "PDF", 391 | "link": "https://stacks.stanford.edu/file/druid:qq694ht6771/WellPlayed-v1n1-11.pdf#page=9" 392 | } 393 | ], 394 | "inline_links": { 395 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=_Lo9erywZPUJ", 396 | "cited_by": { 397 | "total": 288, 398 | "link": "https://scholar.google.com/scholar?cites=17682452360514616060&as_sdt=2005&sciodt=0,5&hl=en", 399 | "cites_id": "17682452360514616060", 400 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=17682452360514616060&engine=google_scholar&hl=en" 401 | }, 402 | "related_pages_link": "https://scholar.google.com/scholar?q=related:_Lo9erywZPUJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 403 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A_Lo9erywZPUJ%3Ascholar.google.com%2F", 404 | "versions": { 405 | "total": 6, 406 | "link": "https://scholar.google.com/scholar?cluster=17682452360514616060&hl=en&as_sdt=0,5", 407 | "cluster_id": "17682452360514616060", 408 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=17682452360514616060&engine=google_scholar&hl=en" 409 | }, 410 | "cached_page_link": "https://scholar.googleusercontent.com/scholar?q=cache:_Lo9erywZPUJ:scholar.google.com/+minecraft&hl=en&as_sdt=0,5" 411 | } 412 | }, 413 | { 414 | "position": 2, 415 | "title": "Minecraft as a creative tool: A case study", 416 | "result_id": "wOTRJ8q0KIsJ", 417 | "link": "https://www.igi-global.com/article/minecraft-as-a-creative-tool/116516", 418 | "snippet": "\u2026 environment, Minecraft. 
In the following case study, the authors explored the use of Minecraft in \u2026 The authors demonstrate that Minecraft offers a unique opportunity for students to display \u2026", 419 | "publication_info": { 420 | "summary": "M Cipollone, CC Schifter, RA Moffat - International Journal of Game \u2026, 2014 - igi-global.com" 421 | }, 422 | "resources": [ 423 | { 424 | "title": "minecraft.school.nz", 425 | "file_format": "PDF", 426 | "link": "https://www.minecraft.school.nz/uploads/2/9/6/3/2963069/minecraft-as-a-creative-tool_-a-case-study_cipollone2014.pdf" 427 | } 428 | ], 429 | "inline_links": { 430 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=wOTRJ8q0KIsJ", 431 | "cited_by": { 432 | "total": 102, 433 | "link": "https://scholar.google.com/scholar?cites=10027463350684869824&as_sdt=2005&sciodt=0,5&hl=en", 434 | "cites_id": "10027463350684869824", 435 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=10027463350684869824&engine=google_scholar&hl=en" 436 | }, 437 | "related_pages_link": "https://scholar.google.com/scholar?q=related:wOTRJ8q0KIsJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 438 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AwOTRJ8q0KIsJ%3Ascholar.google.com%2F", 439 | "versions": { 440 | "total": 9, 441 | "link": "https://scholar.google.com/scholar?cluster=10027463350684869824&hl=en&as_sdt=0,5", 442 | "cluster_id": "10027463350684869824", 443 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=10027463350684869824&engine=google_scholar&hl=en" 444 | } 445 | } 446 | }, 447 | { 448 | "position": 3, 449 | "title": "Learning mathematics through Minecraft", 450 | "result_id": "Hh4p5NaYNu0J", 451 | "link": "https://pubs.nctm.org/abstract/journals/tcm/21/1/article-p56.xml", 452 | "snippet": "\u2026 Minecraft to explore area and perimeter. First, the teacher reviewed the definition of perimeter and area. 
Using a class set of iPods with Minecraft \u2026 Minecraft forms a medium to explore \u2026", 453 | "publication_info": { 454 | "summary": "B Bos, L Wilder, M Cook, R O'Donnell - Teaching Children \u2026, 2014 - pubs.nctm.org", 455 | "authors": [ 456 | { 457 | "name": "B Bos", 458 | "link": "https://scholar.google.com/citations?user=DfdRg-8AAAAJ&hl=en&oi=sra", 459 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=DfdRg-8AAAAJ&engine=google_scholar_author&hl=en", 460 | "author_id": "DfdRg-8AAAAJ" 461 | } 462 | ] 463 | }, 464 | "resources": [ 465 | { 466 | "title": "researchgate.net", 467 | "file_format": "PDF", 468 | "link": "https://www.researchgate.net/profile/Beth-Bos/publication/267507986_Learning_mathematics_through_Minecraft_Authors/links/545103b80cf249aa53dc8eb2/Learning-mathematics-through-Minecraft-Authors.pdf" 469 | } 470 | ], 471 | "inline_links": { 472 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=Hh4p5NaYNu0J", 473 | "cited_by": { 474 | "total": 120, 475 | "link": "https://scholar.google.com/scholar?cites=17093017484449619486&as_sdt=2005&sciodt=0,5&hl=en", 476 | "cites_id": "17093017484449619486", 477 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=17093017484449619486&engine=google_scholar&hl=en" 478 | }, 479 | "related_pages_link": "https://scholar.google.com/scholar?q=related:Hh4p5NaYNu0J:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 480 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AHh4p5NaYNu0J%3Ascholar.google.com%2F", 481 | "versions": { 482 | "total": 8, 483 | "link": "https://scholar.google.com/scholar?cluster=17093017484449619486&hl=en&as_sdt=0,5", 484 | "cluster_id": "17093017484449619486", 485 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=17093017484449619486&engine=google_scholar&hl=en" 486 | } 487 | } 488 | }, 489 | { 490 | "position": 4, 491 | "title": "A deep hierarchical approach to lifelong learning in minecraft", 492 | "result_id": "a_Er9i3hDtUJ", 493 | "link": "https://ojs.aaai.org/index.php/AAAI/article/view/10744", 494 | "snippet": "We propose a lifelong learning system that has the ability to reuse and transfer knowledge from one task to another while efficiently retaining the previously learned knowledge-base. 
\u2026", 495 | "publication_info": { 496 | "summary": "C Tessler, S Givony, T Zahavy, D Mankowitz\u2026 - Proceedings of the \u2026, 2017 - ojs.aaai.org", 497 | "authors": [ 498 | { 499 | "name": "C Tessler", 500 | "link": "https://scholar.google.com/citations?user=7eLKa3IAAAAJ&hl=en&oi=sra", 501 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=7eLKa3IAAAAJ&engine=google_scholar_author&hl=en", 502 | "author_id": "7eLKa3IAAAAJ" 503 | }, 504 | { 505 | "name": "S Givony", 506 | "link": "https://scholar.google.com/citations?user=nlVsO4YAAAAJ&hl=en&oi=sra", 507 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=nlVsO4YAAAAJ&engine=google_scholar_author&hl=en", 508 | "author_id": "nlVsO4YAAAAJ" 509 | }, 510 | { 511 | "name": "T Zahavy", 512 | "link": "https://scholar.google.com/citations?user=9dXN6cMAAAAJ&hl=en&oi=sra", 513 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=9dXN6cMAAAAJ&engine=google_scholar_author&hl=en", 514 | "author_id": "9dXN6cMAAAAJ" 515 | }, 516 | { 517 | "name": "D Mankowitz", 518 | "link": "https://scholar.google.com/citations?user=v84tWxsAAAAJ&hl=en&oi=sra", 519 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=v84tWxsAAAAJ&engine=google_scholar_author&hl=en", 520 | "author_id": "v84tWxsAAAAJ" 521 | } 522 | ] 523 | }, 524 | "resources": [ 525 | { 526 | "title": "aaai.org", 527 | "file_format": "PDF", 528 | "link": "https://ojs.aaai.org/index.php/AAAI/article/view/10744/10603" 529 | } 530 | ], 531 | "inline_links": { 532 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=a_Er9i3hDtUJ", 533 | "cited_by": { 534 | "total": 364, 535 | "link": "https://scholar.google.com/scholar?cites=15352455767272452459&as_sdt=2005&sciodt=0,5&hl=en", 536 | "cites_id": "15352455767272452459", 537 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=15352455767272452459&engine=google_scholar&hl=en" 538 | }, 539 | "related_pages_link": "https://scholar.google.com/scholar?q=related:a_Er9i3hDtUJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 540 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3Aa_Er9i3hDtUJ%3Ascholar.google.com%2F", 541 | "versions": { 542 | "total": 13, 543 | "link": "https://scholar.google.com/scholar?cluster=15352455767272452459&hl=en&as_sdt=0,5", 544 | "cluster_id": "15352455767272452459", 545 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=15352455767272452459&engine=google_scholar&hl=en" 546 | }, 547 | "cached_page_link": "https://scholar.googleusercontent.com/scholar?q=cache:a_Er9i3hDtUJ:scholar.google.com/+minecraft&hl=en&as_sdt=0,5" 548 | } 549 | }, 550 | { 551 | "position": 5, 552 | "title": "Teaching scientific concepts using a virtual world: Minecraft.", 553 | "result_id": "Oh88DuoTaLYJ", 554 | "link": "https://search.informit.org/doi/abs/10.3316/aeipt.195598", 555 | "snippet": "Minecraft is a multiplayer sandbox video game based in a virtual world modelled on the real \u2026 of Minecraft lends itself to the teaching of various academic subjects. 
Minecraft also has a \u2026", 556 | "publication_info": { 557 | "summary": "D Short - Teaching science, 2012 - search.informit.org", 558 | "authors": [ 559 | { 560 | "name": "D Short", 561 | "link": "https://scholar.google.com/citations?user=ec_1ZmMAAAAJ&hl=en&oi=sra", 562 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=ec_1ZmMAAAAJ&engine=google_scholar_author&hl=en", 563 | "author_id": "ec_1ZmMAAAAJ" 564 | } 565 | ] 566 | }, 567 | "resources": [ 568 | { 569 | "title": "academia.edu", 570 | "file_format": "PDF", 571 | "link": "https://www.academia.edu/download/31153502/Short-2012-MC-Color-Version.pdf" 572 | } 573 | ], 574 | "inline_links": { 575 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=Oh88DuoTaLYJ", 576 | "cited_by": { 577 | "total": 274, 578 | "link": "https://scholar.google.com/scholar?cites=13143777408462888762&as_sdt=2005&sciodt=0,5&hl=en", 579 | "cites_id": "13143777408462888762", 580 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=13143777408462888762&engine=google_scholar&hl=en" 581 | }, 582 | "related_pages_link": "https://scholar.google.com/scholar?q=related:Oh88DuoTaLYJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 583 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AOh88DuoTaLYJ%3Ascholar.google.com%2F", 584 | "versions": { 585 | "total": 8, 586 | "link": "https://scholar.google.com/scholar?cluster=13143777408462888762&hl=en&as_sdt=0,5", 587 | "cluster_id": "13143777408462888762", 588 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=13143777408462888762&engine=google_scholar&hl=en" 589 | } 590 | } 591 | }, 592 | { 593 | "position": 6, 594 | "title": "Investigating the role of Minecraft in educational learning environments", 595 | "result_id": "6RcOZdlG3CcJ", 596 | "link": "https://www.tandfonline.com/doi/abs/10.1080/09523987.2016.1254877", 597 | "snippet": "\u2026 This research paper identifies the way in which Minecraft Edu can be used to contribute to the teaching 598 | and learning of secondary students via a multiple case research study. 
Minecraft \u2026", 599 | "publication_info": { 600 | "summary": "N Callaghan - Educational Media International, 2016 - Taylor & Francis" 601 | }, 602 | "inline_links": { 603 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=6RcOZdlG3CcJ", 604 | "cited_by": { 605 | "total": 95, 606 | "link": "https://scholar.google.com/scholar?cites=2872248561872803817&as_sdt=2005&sciodt=0,5&hl=en", 607 | "cites_id": "2872248561872803817", 608 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=2872248561872803817&engine=google_scholar&hl=en" 609 | }, 610 | "related_pages_link": "https://scholar.google.com/scholar?q=related:6RcOZdlG3CcJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 611 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A6RcOZdlG3CcJ%3Ascholar.google.com%2F", 612 | "versions": { 613 | "total": 3, 614 | "link": "https://scholar.google.com/scholar?cluster=2872248561872803817&hl=en&as_sdt=0,5", 615 | "cluster_id": "2872248561872803817", 616 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=2872248561872803817&engine=google_scholar&hl=en" 617 | } 618 | } 619 | }, 620 | { 621 | "position": 7, 622 | "title": "Maker culture and Minecraft: implications for the future of learning", 623 | "result_id": "h27IfZ5va2YJ", 624 | "link": "https://www.tandfonline.com/doi/abs/10.1080/09523987.2015.1075103", 625 | "snippet": "\u2026 be best to subscribe to for gathering information on Minecraft maker culture. From there, we \u2026 the 626 | Minecraft videos that we are studying \u201ccreators\u201d due to the culture of the Minecraft video \u2026", 627 | "publication_info": { 628 | "summary": "DJ Niemeyer, HR Gerber - Educational Media International, 2015 - Taylor & Francis", 629 | "authors": [ 630 | { 631 | "name": "DJ Niemeyer", 632 | "link": "https://scholar.google.com/citations?user=iEZOnzQAAAAJ&hl=en&oi=sra", 633 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=iEZOnzQAAAAJ&engine=google_scholar_author&hl=en", 634 | "author_id": "iEZOnzQAAAAJ" 635 | }, 636 | { 637 | "name": "HR Gerber", 638 | "link": "https://scholar.google.com/citations?user=DwyCTMUAAAAJ&hl=en&oi=sra", 639 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=DwyCTMUAAAAJ&engine=google_scholar_author&hl=en", 640 | "author_id": "DwyCTMUAAAAJ" 641 | } 642 | ] 643 | }, 644 | "resources": [ 645 | { 646 | "title": "publicservicesalliance.org", 647 | "file_format": "PDF", 648 | "link": "http://publicservicesalliance.org/wp-content/uploads/2016/06/Maker_culture_and_Minecraft_implications.pdf" 649 | } 650 | ], 651 | "inline_links": { 652 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=h27IfZ5va2YJ", 653 | "cited_by": { 654 | "total": 114, 655 | "link": "https://scholar.google.com/scholar?cites=7380115140882493063&as_sdt=2005&sciodt=0,5&hl=en", 656 | "cites_id": "7380115140882493063", 657 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=7380115140882493063&engine=google_scholar&hl=en" 658 | }, 659 | "related_pages_link": "https://scholar.google.com/scholar?q=related:h27IfZ5va2YJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 660 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3Ah27IfZ5va2YJ%3Ascholar.google.com%2F", 661 | "versions": { 662 | "total": 8, 663 | "link": 
"https://scholar.google.com/scholar?cluster=7380115140882493063&hl=en&as_sdt=0,5", 664 | "cluster_id": "7380115140882493063", 665 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=7380115140882493063&engine=google_scholar&hl=en" 666 | } 667 | } 668 | }, 669 | { 670 | "position": 8, 671 | "title": "Control of memory, active perception, and action in minecraft", 672 | "result_id": "-5uM8qRUviwJ", 673 | "link": "http://proceedings.mlr.press/v48/oh16.html", 674 | "snippet": "In this paper, we introduce a new set of reinforcement learning (RL) tasks in Minecraft (a flexible 3D world). 675 | We then use these tasks to systematically compare and contrast existing \u2026", 676 | "publication_info": { 677 | "summary": "J Oh, V Chockalingam, H Lee - \u2026 conference on machine \u2026, 2016 - proceedings.mlr.press", 678 | "authors": [ 679 | { 680 | "name": "J Oh", 681 | "link": "https://scholar.google.com/citations?user=LNUeOu4AAAAJ&hl=en&oi=sra", 682 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=LNUeOu4AAAAJ&engine=google_scholar_author&hl=en", 683 | "author_id": "LNUeOu4AAAAJ" 684 | }, 685 | { 686 | "name": "V Chockalingam", 687 | "link": "https://scholar.google.com/citations?user=CM2UkioAAAAJ&hl=en&oi=sra", 688 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=CM2UkioAAAAJ&engine=google_scholar_author&hl=en", 689 | "author_id": "CM2UkioAAAAJ" 690 | }, 691 | { 692 | "name": "H Lee", 693 | "link": "https://scholar.google.com/citations?user=fmSHtE8AAAAJ&hl=en&oi=sra", 694 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=fmSHtE8AAAAJ&engine=google_scholar_author&hl=en", 695 | "author_id": "fmSHtE8AAAAJ" 696 | } 697 | ] 698 | }, 699 | "resources": [ 700 | { 701 | "title": "mlr.press", 702 | "file_format": "PDF", 703 | "link": "http://proceedings.mlr.press/v48/oh16.pdf" 704 | } 705 | ], 706 | "inline_links": { 707 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=-5uM8qRUviwJ", 708 | "cited_by": { 709 | "total": 317, 710 | "link": "https://scholar.google.com/scholar?cites=3224107450664524795&as_sdt=2005&sciodt=0,5&hl=en", 711 | "cites_id": "3224107450664524795", 712 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=3224107450664524795&engine=google_scholar&hl=en" 713 | }, 714 | "related_pages_link": "https://scholar.google.com/scholar?q=related:-5uM8qRUviwJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 715 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A-5uM8qRUviwJ%3Ascholar.google.com%2F", 716 | "versions": { 717 | "total": 7, 718 | "link": "https://scholar.google.com/scholar?cluster=3224107450664524795&hl=en&as_sdt=0,5", 719 | "cluster_id": "3224107450664524795", 720 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=3224107450664524795&engine=google_scholar&hl=en" 721 | }, 722 | "cached_page_link": "http://scholar.googleusercontent.com/scholar?q=cache:-5uM8qRUviwJ:scholar.google.com/+minecraft&hl=en&as_sdt=0,5" 723 | } 724 | }, 725 | { 726 | "position": 9, 727 | "title": "Minecraft as a teaching tool: One case study", 728 | "result_id": "yItxbN8DVXYJ", 729 | "link": "https://www.learntechlib.org/p/48540/", 730 | "snippet": "We know games help students gain skills and insights in many ways, and that games are engaging. 
With new online MMOPRPG games, like Minecraft, what we do not know is what \u2026", 731 | "publication_info": { 732 | "summary": "C Schifter, M Cipollone - Society for Information Technology & \u2026, 2013 - learntechlib.org" 733 | }, 734 | "inline_links": { 735 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=yItxbN8DVXYJ", 736 | "cited_by": { 737 | "total": 55, 738 | "link": "https://scholar.google.com/scholar?cites=8526725727627873224&as_sdt=2005&sciodt=0,5&hl=en", 739 | "cites_id": "8526725727627873224", 740 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=8526725727627873224&engine=google_scholar&hl=en" 741 | }, 742 | "related_pages_link": "https://scholar.google.com/scholar?q=related:yItxbN8DVXYJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5", 743 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AyItxbN8DVXYJ%3Ascholar.google.com%2F", 744 | "versions": { 745 | "total": 2, 746 | "link": "https://scholar.google.com/scholar?cluster=8526725727627873224&hl=en&as_sdt=0,5", 747 | "cluster_id": "8526725727627873224", 748 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=8526725727627873224&engine=google_scholar&hl=en" 749 | } 750 | } 751 | } 752 | ] 753 | ``` 754 | 755 |
756 | 757 | ## ✍Contributing 758 | 759 | Feel free to open an issue about: 760 | - a bug you found. 761 | - something that isn't working. 762 | - a feature to add. 763 | - anything else related to Google Scholar. 764 | 765 | If you feel comfortable opening a PR, feel free to do so. Guidelines are simple: conventional commits + code as simple as possible without unnecessary complexity. 766 | 767 | There's a `.gitpod.yaml` config if you're using [Gitpod](https://www.gitpod.io/). 768 | 769 | ## 📜Licence 770 | 771 | The `scrape-google-scholar` repository is licensed under the MIT license. 772 | -------------------------------------------------------------------------------- /example_usage.py: -------------------------------------------------------------------------------- 1 | # each function has documentation with an example "usage" script after the function arguments 2 | from google_scholar_py import CustomGoogleScholarOrganic 3 | from google_scholar_py import SerpApiGoogleScholarOrganic 4 | from google_scholar_py import CustomGoogleScholarTopPublicationArticle 5 | 6 | import json 7 | 8 | # TODO: add more examples 9 | custom_parser_get_organic_results = CustomGoogleScholarOrganic().scrape_google_scholar_organic_results( 10 | query='blizzard', 11 | pagination=False, 12 | save_to_csv=False, 13 | save_to_json=False 14 | ) 15 | 16 | top_publication_citation = CustomGoogleScholarTopPublicationArticle().scrape_google_scholar_top_publication_articles( 17 | journal_publications_link='https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=TdhLrHqKTh8J.2022', 18 | pagination=True, 19 | save_to_csv=False, 20 | save_to_json=False 21 | ) 22 | 23 | serpapi_parser_get_organic_results = SerpApiGoogleScholarOrganic().scrape_google_scholar_organic_results( 24 | query='blizzard', 25 | api_key='your-serpapi-api-key', # https://serpapi.com/manage-api-key 26 | lang='en', 27 | pagination=False, 28 | ) 29 | 30 | 31 | print(json.dumps(custom_parser_get_organic_results, indent=2, ensure_ascii=False)) 32 | print(json.dumps(serpapi_parser_get_organic_results, indent=2, ensure_ascii=False)) 33 | print(json.dumps(top_publication_citation, indent=2, ensure_ascii=False)) -------------------------------------------------------------------------------- /google_scholar_py/__init__.py: -------------------------------------------------------------------------------- 1 | from .custom_backend.organic_search import CustomGoogleScholarOrganic 2 | from .custom_backend.profiles_results import CustomGoogleScholarProfiles 3 | from .custom_backend.author_info_all_articles import CustomGoogleScholarAuthor 4 | from .custom_backend.top_mandates_metrics import CustomGoogleScholarTopMandates 5 | from .custom_backend.top_publications_metrics import CustomGoogleScholarTopPublications 6 | from .custom_backend.top_publications_article import CustomGoogleScholarTopPublicationArticle 7 | from .custom_backend.top_publications_article_citation import CustomGoogleScholarTopPublicationArticleCitation 8 | 9 | # serpapi backend 10 | from .serpapi_backend.organic_results import SerpApiGoogleScholarOrganic 11 | from .serpapi_backend.profile_results import SerpApiGoogleScholarProfiles 12 | from .serpapi_backend.organic_cite_results import SerpApiGoogleScholarOrganicCite 13 | from .serpapi_backend.author_results import SerpApiGoogleScholarAuthor -------------------------------------------------------------------------------- /google_scholar_py/custom_backend/author_info_all_articles.py:
-------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium_stealth import stealth 3 | from selenium.webdriver.chrome.service import Service 4 | from webdriver_manager.chrome import ChromeDriverManager 5 | from selectolax.lexbor import LexborHTMLParser 6 | from typing import List, Union, Dict 7 | from pathlib import Path 8 | 9 | 10 | class CustomGoogleScholarAuthor: 11 | def __init__(self) -> None: 12 | pass 13 | 14 | 15 | def scrape_google_scholar_author_data( 16 | self, 17 | user_id: str, 18 | parse_articles: bool = False, 19 | article_pagination: bool = False 20 | ) -> Dict[str, List[Union[str, int, None]]]: 21 | ''' 22 | Extracts data from Google Scholar Author profile page: 23 | - Info about the author itself 24 | - Co-authors: name, link, affiliation 25 | - Author: title, link, authors, publication, cited by, year. 26 | - Articles: first 100 if pagination is False, or all if pagination is True. 27 | 28 | Arguments: 29 | - user_id: str. User ID from Google Scholar profile located in the URL. 30 | - parse_articles: True of False. If True, extracts first 100 articles. Default False. 31 | - article_pagination: True of False. If True, extracts beyond first 100 articles. 32 | 33 | Usage: 34 | 35 | from google_scholar_py import CustomGoogleScholarAuthor 36 | 37 | parser = CustomGoogleScholarAuthor() 38 | data = parser.scrape_google_scholar_author_data( 39 | user_id='nHhtvqkAAAAJ', 40 | parse_articles=True, 41 | article_pagination=True 42 | ) 43 | print(json.dumps(data, indent=2)) 44 | 45 | print(data['info']) # author info 46 | print(data['co-authors']) 47 | 48 | for article in data['articles']: 49 | print(article['title']) 50 | print(article['cited_by_count']) 51 | ... 52 | ''' 53 | 54 | # selenium stealth 55 | options = webdriver.ChromeOptions() 56 | options.add_argument('--headless') 57 | options.add_argument('--no-sandbox') 58 | options.add_argument('--disable-dev-shm-usage') 59 | 60 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) 61 | options.add_experimental_option('useAutomationExtension', False) 62 | 63 | service = Service(ChromeDriverManager().install()) 64 | driver = webdriver.Chrome(service=service, options=options) 65 | 66 | stealth(driver, 67 | languages=['en-US', 'en'], 68 | vendor='Google Inc.', 69 | platform='Win32', 70 | webgl_vendor='Intel Inc.', 71 | renderer='Intel Iris OpenGL Engine', 72 | fix_hairline=True, 73 | ) 74 | 75 | driver.get(f'https://scholar.google.com/citations?user={user_id}&hl=en&gl=us&pagesize=100') 76 | parser = LexborHTMLParser(driver.page_source) 77 | 78 | profile_info = { 79 | 'info': {}, 80 | 'co-authors': [], 81 | 'articles': [][:-1] # [:-1] to not to return the last None element. Weird approach, I know. Revisit in the future. 
82 | } 83 | 84 | profile_info['info']['name'] = parser.css_first('#gsc_prf_in').text() 85 | profile_info['info']['affiliations'] = parser.css_first('.gsc_prf_ila').text() 86 | profile_info['info']['email'] = parser.css_first('#gsc_prf_ivh').text() 87 | profile_info['info']['interests'] = [interest.text() for interest in parser.css('#gsc_prf_int .gs_ibl')] 88 | 89 | for co_author in parser.css('.gsc_rsb_aa'): 90 | profile_info['co-authors'].append({ 91 | 'name': co_author.css_first('.gsc_rsb_a_desc a').text(), 92 | 'profile_link': f"https://scholar.google.com{co_author.css_first('.gsc_rsb_a_desc a').attrs['href']}", 93 | 'affiliation': co_author.css_first('.gsc_rsb_a_ext').text(), 94 | }) 95 | 96 | # extracts only first 100 articles, WITHOUT paginaiton 97 | if parse_articles: 98 | # TODO: make a separate function to extract articles 99 | for index, article in enumerate(parser.css('.gsc_a_tr'), start=1): 100 | try: 101 | article_title = article.css_first('.gsc_a_at').text() 102 | except: article_title = None 103 | 104 | try: 105 | article_link = f"https://scholar.google.com{article.css_first('.gsc_a_at').attrs['href']}" 106 | except: article_link = None 107 | 108 | try: 109 | if ',' in article.css_first('.gsc_a_at+ .gs_gray').text(): 110 | article_authors: List[str] = article.css_first('.gsc_a_at+ .gs_gray').text().split(', ') # list of authors 111 | else: article_authors = article.css_first('.gsc_a_at+ .gs_gray').text() # single authour 112 | except: article_authors = None 113 | 114 | try: 115 | article_publication = article.css_first('.gs_gray+ .gs_gray').text() 116 | except: article_publication = None 117 | 118 | try: 119 | cited_by_count = article.css_first('.gsc_a_ac').text() 120 | except: cited_by_count = None 121 | 122 | try: 123 | publication_year = article.css_first('.gsc_a_hc').text() 124 | except: publication_year = None 125 | 126 | profile_info['articles'].append({ 127 | 'title': article_title, 128 | 'link': article_link, 129 | 'authors': article_authors, 130 | 'publication': article_publication if article_publication else None, 131 | 'publication_year': int(publication_year) if publication_year else publication_year or None, # int value or None or empty str 132 | 'cited_by_count': int(cited_by_count) if cited_by_count else cited_by_count or None # int value or None or empty str 133 | }) 134 | elif parse_articles is False: 135 | profile_info.pop('articles') 136 | 137 | page_num = 0 138 | 139 | # extracts all articles 140 | if parse_articles and article_pagination: 141 | while True: 142 | driver.get(f'https://scholar.google.com/citations?user={user_id}&hl=en&gl=us&cstart={page_num}&pagesize=100') 143 | parser = LexborHTMLParser(driver.page_source) 144 | 145 | for article in parser.css('.gsc_a_tr'): 146 | try: 147 | article_title = article.css_first('.gsc_a_at').text() 148 | except: article_title = None 149 | 150 | try: 151 | article_link = f"https://scholar.google.com{article.css_first('.gsc_a_at').attrs['href']}" 152 | except: article_link = None 153 | 154 | try: 155 | if ',' in article.css_first('.gsc_a_at+ .gs_gray').text(): 156 | article_authors: List[str] = article.css_first('.gsc_a_at+ .gs_gray').text().split(', ') # list of authors 157 | else: article_authors = article.css_first('.gsc_a_at+ .gs_gray').text() # single authour 158 | except: article_authors = None 159 | 160 | try: 161 | article_publication = article.css_first('.gs_gray+ .gs_gray').text() 162 | except: article_publication = None 163 | 164 | try: 165 | cited_by_count = article.css_first('.gsc_a_ac').text() 166 | 
except: cited_by_count = None 167 | 168 | try: 169 | publication_year = article.css_first('.gsc_a_hc').text() 170 | except: publication_year = None 171 | 172 | profile_info['articles'].append({ 173 | 'title': article_title, 174 | 'link': article_link, 175 | 'authors': article_authors, 176 | 'publication': article_publication if article_publication else None, 177 | 'publication_year': int(publication_year) if publication_year else publication_year or None, # int value or None or empty str 178 | 'cited_by_count': int(cited_by_count) if cited_by_count else cited_by_count or None # int value or None or empty str 179 | }) 180 | 181 | if parser.css_first('.gsc_a_e'): 182 | break 183 | else: 184 | page_num += 100 # paginate to the next page 185 | 186 | # remove articles key if user don't want to extract it 187 | elif article_pagination and parse_articles is False: 188 | profile_info.pop('articles') 189 | 190 | return profile_info 191 | -------------------------------------------------------------------------------- /google_scholar_py/custom_backend/cite_results.py: -------------------------------------------------------------------------------- 1 | #TODO: support/refactor CITE extraction. This is not yet implemented. 2 | 3 | from parsel import Selector 4 | import requests 5 | 6 | params = { 7 | 'q': 'blizzard', # search query 8 | 'hl': 'en' # language of the search 9 | } 10 | 11 | 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 14 | 'accept-language': 'en-US,en', 15 | 'referer': f"https://scholar.google.com/scholar?hl={params['hl']}&q={params['q']}" 16 | } 17 | 18 | 19 | def parsel_get_cite_ids(): 20 | html = requests.get('https://scholar.google.com/scholar', params=params, headers=headers) 21 | soup = Selector(text=html.text) 22 | 23 | # returns a list of publication ID's -> U8bh6Ca9uwQJ 24 | return soup.css('.gs_r.gs_or.gs_scl::attr(data-cid)').getall() 25 | 26 | def parsel_scrape_cite_results(): 27 | citations = [] 28 | 29 | for cite_id in parsel_get_cite_ids(): 30 | html = requests.get(f'https://scholar.google.com/scholar?output=cite&q=info:{cite_id}:scholar.google.com', headers=headers) 31 | selector = Selector(text=html.text) 32 | 33 | # might be issues in the future with extracting data from the table 34 | if selector.css('#gs_citt').get(): 35 | for result in selector.css('tr'): 36 | institution = result.xpath('th/text()').get() 37 | citation = result.xpath('td div/text()').get() 38 | 39 | citations.append({'institution': institution, 'citations': citation}) 40 | 41 | return citations 42 | -------------------------------------------------------------------------------- /google_scholar_py/custom_backend/google_scholar_cited_by_public_access_author.py: -------------------------------------------------------------------------------- 1 | from parsel import Selector 2 | import requests, json 3 | 4 | #TODO: add cited by graph extraction to author script 5 | 6 | def parsel_scrape_author_cited_by_graph(): 7 | params = { 8 | 'user': '_xwYD2sAAAAJ', # user-id 9 | 'hl': 'en' # language 10 | } 11 | 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36' 14 | } 15 | 16 | data = { 17 | 'cited_by': [], 18 | 'graph': [] 19 | } 20 | 21 | html = requests.get('https://scholar.google.com/citations', params=params, headers=headers, timeout=30) 22 | selector = Selector(text=html.text) 23 | 24 | since_year = 
selector.css('.gsc_rsb_sth~ .gsc_rsb_sth+ .gsc_rsb_sth::text').get().lower().replace(' ', '_') 25 | 26 | for cited_by_public_access in selector.css('.gsc_rsb'): 27 | data['cited_by'].append({ 28 | 'citations_all': cited_by_public_access.css('tr:nth-child(1) .gsc_rsb_sc1+ .gsc_rsb_std::text').get(), 29 | f'citations_since_{since_year}': cited_by_public_access.css('tr:nth-child(1) .gsc_rsb_std+ .gsc_rsb_std::text').get(), 30 | 'h_index_all': cited_by_public_access.css('tr:nth-child(2) .gsc_rsb_sc1+ .gsc_rsb_std::text').get(), 31 | f'h_index_since_{since_year}': cited_by_public_access.css('tr:nth-child(2) .gsc_rsb_std+ .gsc_rsb_std::text').get(), 32 | 'i10_index_all': cited_by_public_access.css('tr~ tr+ tr .gsc_rsb_sc1+ .gsc_rsb_std::text').get(), 33 | f'i10_index_since_{since_year}': cited_by_public_access.css('tr~ tr+ tr .gsc_rsb_std+ .gsc_rsb_std::text').get(), 34 | 'articles': { 35 | 'available': int(cited_by_public_access.css('.gsc_rsb_m_a:nth-child(1) span::text').get().split(' ')[0]), # to get only digit value 36 | 'not_available': int(cited_by_public_access.css('.gsc_rsb_m_na div::text').get().split(' ')[0]), # to get only digit value 37 | }, 38 | 'articles_link': f"https://scholar.google.com{cited_by_public_access.css('#gsc_lwp_mndt_lnk::attr(href)').get()}" 39 | }) 40 | 41 | for graph_year, graph_year_value in zip(selector.css('.gsc_g_t::text'), selector.css('.gsc_g_al::text')): 42 | data['graph'].append({ 43 | 'year': graph_year.get(), 44 | 'value': int(graph_year_value.get()) 45 | }) 46 | 47 |     return data 48 | 49 | if __name__ == '__main__': 50 | print(json.dumps(parsel_scrape_author_cited_by_graph(), indent=2, ensure_ascii=False)) 51 | -------------------------------------------------------------------------------- /google_scholar_py/custom_backend/organic_search.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium_stealth import stealth 3 | from selenium.webdriver.chrome.service import Service 4 | from webdriver_manager.chrome import ChromeDriverManager 5 | from selectolax.lexbor import LexborHTMLParser 6 | from typing import List, Dict, Callable 7 | import time, random, re 8 | import pandas as pd 9 | from pathlib import Path 10 | 11 | 12 | class CustomGoogleScholarOrganic: 13 | def __init__(self) -> None: 14 | pass 15 | 16 | 17 | def parse(self, parser: Callable, organic_results_data: Callable): 18 | ''' 19 | Arguments: 20 | - parser: Lexbor parser from scrape_google_scholar_organic_results() function. 21 | - organic_results_data: List to append data to. List origin location is scrape_google_scholar_organic_results() function. Line 104. 22 | 23 | This function parses data from Google Scholar Organic results and appends data to a List. 24 | 25 | It's used by scrape_google_scholar_organic_results(). 26 | 27 | It returns nothing as it appends data to `organic_results_data`, 28 | which is the list defined in the scrape_google_scholar_organic_results() function.
29 | ''' 30 | 31 | for result in parser.css('.gs_r.gs_or.gs_scl'): 32 | try: 33 | title: str = result.css_first('.gs_rt').text() 34 | except: title = None 35 | 36 | try: 37 | title_link: str = result.css_first('.gs_rt a').attrs['href'] 38 | except: title_link = None 39 | 40 | try: 41 | publication_info: str = result.css_first('.gs_a').text() 42 | except: publication_info = None 43 | 44 | try: 45 | snippet: str = result.css_first('.gs_rs').text() 46 | except: snippet = None 47 | 48 | try: 49 | # if Cited by is present in inline links, it will be extracted 50 | cited_by_link = ''.join([link.attrs['href'] for link in result.css('.gs_ri .gs_fl a') if 'Cited by' in link.text()]) 51 | except: cited_by_link = None 52 | 53 | try: 54 | # if Cited by is present in inline links, it will be extracted and type cast it to integer 55 | cited_by_count = int(''.join([re.search(r'\d+', link.text()).group() for link in result.css('.gs_ri .gs_fl a') if 'Cited by' in link.text()])) 56 | except: cited_by_count = None 57 | 58 | try: 59 | pdf_file: str = result.css_first('.gs_or_ggsm a').attrs['href'] 60 | except: pdf_file = None 61 | 62 | organic_results_data.append({ 63 | 'title': title, 64 | 'title_link': title_link, 65 | 'publication_info': publication_info, 66 | 'snippet': snippet if snippet else None, 67 | 'cited_by_link': f'https://scholar.google.com{cited_by_link}' if cited_by_link else None, 68 | 'cited_by_count': cited_by_count if cited_by_count else None, 69 | 'pdf_file': pdf_file 70 | }) 71 | 72 | #TODO: add lang support. https://serpapi.com/google-languages 73 | def scrape_google_scholar_organic_results( 74 | self, 75 | query: str, 76 | pagination: bool = False, 77 | save_to_csv: bool = False, 78 | save_to_json: bool = False 79 | ) -> List[Dict[str, str]]: 80 | ''' 81 | Extracts data from Google Scholar Organic resutls page: 82 | - title: str 83 | - title_link: str 84 | - publication_info: str 85 | - snippet: str 86 | - cited_by_link: str 87 | - cited_by_count: int 88 | - pdf_file: str 89 | 90 | Arguments: 91 | - query: str. Search query. 92 | - pagination: bool. Enables or disables pagination. Default is False. 93 | - save_to_csv: bool. True of False. Default is False. 94 | - save_to_json: bool. True of False. Default is False. 
95 | 96 | Usage: 97 | 98 | from google_scholar_py import CustomGoogleScholarOrganic 99 | 100 | parser = CustomGoogleScholarOrganic() 101 | data = parser.scrape_google_scholar_organic_results( 102 | query='blizzard', 103 | pagination=False, 104 | save_to_csv=True 105 | ) 106 | 107 | for organic_result in data: 108 | print(organic_result['title']) 109 | print(organic_result['pdf_file']) 110 | ''' 111 | 112 | # selenium stealth 113 | options = webdriver.ChromeOptions() 114 | options.add_argument('--headless') 115 | options.add_argument('--no-sandbox') 116 | options.add_argument('--disable-dev-shm-usage') 117 | 118 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) 119 | options.add_experimental_option('useAutomationExtension', False) 120 | 121 | service = Service(ChromeDriverManager().install()) 122 | driver = webdriver.Chrome(service=service, options=options) 123 | 124 | stealth(driver, 125 | languages=['en-US', 'en'], 126 | vendor='Google Inc.', 127 | platform='Win32', 128 | webgl_vendor='Intel Inc.', 129 | renderer='Intel Iris OpenGL Engine', 130 | fix_hairline=True, 131 | ) 132 | 133 | page_num = 0 134 | organic_results_data = [] 135 | 136 | # parse all pages 137 | if pagination: 138 | while True: 139 | # parse all pages 140 | driver.get(f'https://scholar.google.com/scholar?q={query}&hl=en&gl=us&start={page_num}') 141 | parser = LexborHTMLParser(driver.page_source) 142 | 143 | self.parse(parser=parser, organic_results_data=organic_results_data) 144 | 145 | # pagination 146 | if parser.css_first('.gs_ico_nav_next'): # checks for the "Next" page button 147 | page_num += 10 # paginate to the next page 148 | time.sleep(random.randint(1, 3)) # sleep between paginations 149 | else: 150 | break 151 | else: 152 | # parse first page only 153 | driver.get(f'https://scholar.google.com/scholar?q={query}&hl=en&gl=us&start={page_num}') 154 | parser = LexborHTMLParser(driver.page_source) 155 | 156 | self.parse(parser=parser, organic_results_data=organic_results_data) 157 | 158 | if save_to_csv: 159 | pd.DataFrame(data=organic_results_data).to_csv('google_scholar_organic_results_data.csv', 160 | index=False, encoding='utf-8') 161 | if save_to_json: 162 | pd.DataFrame(data=organic_results_data).to_json('google_scholar_organic_results_data.json', 163 | orient='records') 164 | driver.quit() 165 | 166 | return organic_results_data 167 | -------------------------------------------------------------------------------- /google_scholar_py/custom_backend/profiles_results.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium_stealth import stealth 3 | from selenium.webdriver.chrome.service import Service 4 | from webdriver_manager.chrome import ChromeDriverManager 5 | from selectolax.lexbor import LexborHTMLParser 6 | from parsel import Selector 7 | from typing import List, Dict, Callable 8 | import time, random, re 9 | import pandas as pd 10 | from pathlib import Path 11 | 12 | class CustomGoogleScholarProfiles: 13 | def __init__(self) -> None: 14 | pass 15 | 16 | 17 | def parse(self, parser: Callable, profile_results_data: Callable): 18 | ''' 19 | Arugments: 20 | - parser: Callable. Lexbor parser from scrape_google_scholar_profiles() function. 21 | - profile_results_data: Callable. List to append data to. List origin location is scrape_google_scholar_profiles() function. Line 100. 22 | 23 | This function parses data from Google Scholar Organic results and appends data to a List. 
24 | 25 | It's used by scrape_google_scholar_profiles(). 26 | 27 | It returns nothing as it appends data to `profile_results_data`, 28 | which appends it to `profile_results_data` List in the scrape_google_scholar_profiles() function. 29 | ''' 30 | 31 | for profile in parser.css('.gs_ai_chpr'): 32 | try: 33 | name: str = profile.css_first('.gs_ai_name a').text() 34 | except: name = None 35 | 36 | try: 37 | link: str = f'https://scholar.google.com{profile.css_first(".gs_ai_name a").attrs["href"]}' 38 | except: link = None 39 | 40 | try: 41 | affiliations: str = profile.css_first('.gs_ai_aff').text() 42 | except: affiliations = None 43 | 44 | try: 45 | interests: list = [interest.text() for interest in profile.css('.gs_ai_one_int')] 46 | except: interests = None 47 | 48 | try: 49 | email: str = profile.css_first('.gs_ai_eml').text() 50 | except: email = None 51 | 52 | try: 53 | cited_by: int = re.search(r'\d+', profile.css_first('.gs_ai_cby').text()).group() # Cited by 17143 -> 17143 54 | except: cited_by = None 55 | 56 | profile_results_data.append({ 57 | 'name': name, 58 | 'link': link, 59 | 'affiliations': affiliations, 60 | 'interests': interests if interests else None, 61 | 'email': email if email else None, 62 | 'cited_by_count': int(cited_by) if cited_by else None 63 | }) 64 | 65 | 66 | def scrape_google_scholar_profiles( 67 | self, 68 | query: str, 69 | pagination: bool = False, 70 | save_to_csv: bool = False, 71 | save_to_json: bool = False 72 | ) -> List[Dict[str, str]]: 73 | ''' 74 | Extracts data from Google Scholar Organic Profile resutls page: 75 | - name: str 76 | - link: str 77 | - affiliations: str 78 | - email: str 79 | - cited_by_count: int 80 | 81 | Arguments: 82 | - query: str. Search query. 83 | - pagination: bool. Enables or disables pagination. Default is False. 84 | - save_to_csv: bool. True of False. Default is False. 85 | - save_to_json: bool. True of False. Default is False. 
86 | 87 | Usage: 88 | 89 | from google_scholar_py import CustomGoogleScholarProfiles 90 | 91 | parser = CustomGoogleScholarProfiles() 92 | data = parser.scrape_google_scholar_profiles( 93 | query='blizzard', 94 | pagination=False, 95 | save_to_csv=True 96 | ) 97 | print(json.dumps(data, indent=2)) 98 | 99 | for profile_results in data: 100 | print(profile_results['name']) 101 | print(profile_results['email']) 102 | ''' 103 | 104 | # selenium stealth 105 | options = webdriver.ChromeOptions() 106 | options.add_argument('--headless') 107 | options.add_argument('--no-sandbox') 108 | options.add_argument('--disable-dev-shm-usage') 109 | 110 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) 111 | options.add_experimental_option('useAutomationExtension', False) 112 | 113 | service = Service(ChromeDriverManager().install()) 114 | driver = webdriver.Chrome(service=service, options=options) 115 | 116 | stealth(driver, 117 | languages=['en-US', 'en'], 118 | vendor='Google Inc.', 119 | platform='Win32', 120 | webgl_vendor='Intel Inc.', 121 | renderer='Intel Iris OpenGL Engine', 122 | fix_hairline=True 123 | ) 124 | 125 | params = {} # stores next page token to add to URL later 126 | page_num = 0 127 | profile_results_data = [] 128 | 129 | if pagination: 130 | while True: 131 | # if next page token appears, add to to URL as URL parameter 132 | # otherwise, do a search without next page token parameter (Line: 101) 133 | if params.get('after_author') is None: 134 | driver.get(f'https://scholar.google.com/citations?view_op=search_authors&mauthors={query}&hl=en&astart={page_num}') 135 | parser = LexborHTMLParser(driver.page_source) 136 | 137 | #TODO: replace parsel with selectolax completely 138 | selector = Selector(text=driver.page_source) # to check next page token 139 | 140 | self.parse(parser=parser, profile_results_data=profile_results_data) 141 | 142 | # check if the next arrow button is active by checking 'onclick' attribute 143 | if selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get(): 144 | # extracting next page token and passing to 'after_author' query URL parameter 145 | params['after_author'] = re.search(r'after_author\\x3d(.*)\\x26', str(selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get())).group(1) # -> XB0HAMS9__8J 146 | page_num += 10 # paginate to the next page 147 | time.sleep(random.randint(1, 3)) # sleep between paginations 148 | else: 149 | break 150 | else: 151 | driver.get(f'https://scholar.google.com/citations?view_op=search_authors&mauthors={query}&hl=en&astart={page_num}&after_author={params["after_author"]}') 152 | parser = LexborHTMLParser(driver.page_source) 153 | 154 | #TODO: replace parsel with selectolax completely 155 | selector = Selector(text=driver.page_source) # to check next page token 156 | 157 | self.parse(parser=parser, profile_results_data=profile_results_data) 158 | 159 | if selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get(): 160 | # extracting next page token and passing to 'after_author' query URL parameter 161 | params['after_author'] = re.search(r'after_author\\x3d(.*)\\x26', str(selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get())).group(1) # -> XB0HAMS9__8J 162 | page_num += 10 # paginate to the next page 163 | time.sleep(random.randint(1, 3)) # sleep between paginations 164 | else: 165 | break 166 | else: 167 | # parse single, first page 168 | driver.get(f'https://scholar.google.com/citations?view_op=search_authors&mauthors={query}&hl=en&astart={page_num}') 169 | parser = 
LexborHTMLParser(driver.page_source) 170 | 171 | self.parse(parser=parser, profile_results_data=profile_results_data) 172 | 173 | driver.quit() 174 | 175 | if save_to_csv: 176 | pd.DataFrame(data=profile_results_data).to_csv('google_scholar_profile_results_data.csv', 177 | index=False, encoding='utf-8') 178 | if save_to_json: 179 | pd.DataFrame(data=profile_results_data).to_json('google_scholar_profile_results_data.json', 180 | orient='records') 181 | 182 | return profile_results_data 183 | -------------------------------------------------------------------------------- /google_scholar_py/custom_backend/top_mandates_metrics.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium_stealth import stealth 3 | from selenium.webdriver.chrome.service import Service 4 | from webdriver_manager.chrome import ChromeDriverManager 5 | from selectolax.lexbor import LexborHTMLParser 6 | from typing import List, Dict, Callable 7 | import pandas as pd 8 | import re 9 | 10 | 11 | class CustomGoogleScholarTopMandates: 12 | def __init__(self) -> None: 13 | pass 14 | 15 | 16 | def parse(self, parser: Callable, top_mandates_data: Callable): 17 | ''' 18 | Arguments: 19 | - parser: Callable. Lexbor parser from google_scholar_top_mandates_metrics() function. 20 | - top_mandates_data: Callable. List to append data to. List origin location is google_scholar_top_mandates_metrics() function. Line 100. 21 | 22 | This function parses data from the Google Scholar public access mandates leaderboard and appends data to a List. 23 | 24 | It's used by google_scholar_top_mandates_metrics(). 25 | 26 | It returns nothing; it appends the parsed data to the `top_mandates_data` 27 | list that lives in the google_scholar_top_mandates_metrics() function. 28 | ''' 29 | 30 | for table in parser.css('tr'): 31 | try: 32 | # removes "... 
- cached" 33 | # https://regex101.com/r/EfljZp/1 34 | funder: str = re.sub(r'(\s\s-.*)', '', table.css_first('td.gsc_mlt_t').text()) 35 | except: funder = None 36 | 37 | try: 38 | link: str = table.css_first('.gsc_mlt_t a').attrs['href'] 39 | except: link = None 40 | 41 | try: 42 | two_eighteen: int = table.css_first('td:nth-child(4)').text() 43 | if '-' in two_eighteen: 44 | two_eighteen = None 45 | except: two_eighteen = None 46 | 47 | try: 48 | twenty_twenty: str = table.css_first('td:nth-child(5)').text() 49 | if '-' in twenty_twenty: 50 | twenty_twenty = None 51 | except: twenty_twenty = None 52 | 53 | try: 54 | twenty_one: str = table.css_first('td:nth-child(6)').text() 55 | if '-' in twenty_one: # missing % in the table 56 | twenty_one = None 57 | except: twenty_one = None 58 | 59 | #TODO: fix selector to extract "overall" data 60 | # `td:nth-child(6)` is not working also 61 | # try: 62 | # overall: str = table.css('.gsc_mlt_n.gsc_mlt_bd').text() 63 | # except: overall = None 64 | 65 | top_mandates_data.append({ 66 | 'funder': funder, 67 | 'link': link, 68 | '2019': two_eighteen, 69 | '2020': twenty_twenty, 70 | '2021': twenty_one, 71 | # 'overall': overall 72 | }) 73 | 74 | 75 | def scrape_top_mandates_metrics( 76 | self, 77 | save_to_csv: bool = False, 78 | save_to_json: bool = False, 79 | lang: str = 'en' 80 | ) -> List[Dict[str, str]]: 81 | #TODO add argument to support other languages https://serpapi.com/google-languages 82 | 83 | ''' 84 | Results comes from: https://scholar.google.com/citations?view_op=mandates_leaderboard 85 | 86 | Returns: 87 | - funder: str 88 | - link: str 89 | - 2019: str 90 | - 2020: str 91 | - 2021: str 92 | - overall: str (not extracted at the moment, selector needs to be fixed) 93 | 94 | Arguments: 95 | - save_to_csv: True of False. Saves data to CSV file. Default is False. 96 | - save_to_json: True of False. Saves data to JSON file. Default is False. 97 | - lang: str. Language. Defaults to English ('en'). For now, need to be checked yourself. Other languages: https://serpapi.com/google-languages 98 | 99 | Usage: 100 | 101 | from google_scholar_py import CustomGoogleScholarTopMandates 102 | 103 | parser = CustomGoogleScholarTopMandates() 104 | data = parser.scrape_top_mandates_metrics( 105 | save_to_csv=True, 106 | save_to_json=False 107 | ) 108 | print(json.dumps(data, indent=2)) 109 | 110 | for result in data: 111 | print(result['funder']) 112 | ... 
113 | ''' 114 | 115 | # selenium stealth 116 | options = webdriver.ChromeOptions() 117 | options.add_argument('--headless') 118 | options.add_argument('--no-sandbox') 119 | options.add_argument('--disable-dev-shm-usage') 120 | 121 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) 122 | options.add_experimental_option('useAutomationExtension', False) 123 | 124 | service = Service(ChromeDriverManager().install()) 125 | driver = webdriver.Chrome(service=service, options=options) 126 | 127 | stealth(driver, 128 | languages=['en-US', 'en'], 129 | vendor='Google Inc.', 130 | platform='Win32', 131 | webgl_vendor='Intel Inc.', 132 | renderer='Intel Iris OpenGL Engine', 133 | fix_hairline=True 134 | ) 135 | 136 | top_mandates_data: list = [] 137 | 138 | driver.get(f'https://scholar.google.com/citations?view_op=mandates_leaderboard&hl={lang}') 139 | parser = LexborHTMLParser(driver.page_source) 140 | self.parse(parser=parser, top_mandates_data=top_mandates_data) 141 | 142 | if save_to_csv: 143 | pd.DataFrame(data=top_mandates_data).to_csv('google_scholar_top_mandates_data.csv', 144 | index=False, encoding='utf-8') 145 | 146 | if save_to_json: 147 | pd.DataFrame(data=top_mandates_data).to_json('google_scholar_top_mandates_data.json', 148 | orient='records') 149 | 150 | driver.quit() 151 | return top_mandates_data 152 | -------------------------------------------------------------------------------- /google_scholar_py/custom_backend/top_publications_article.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium_stealth import stealth 3 | from selenium.webdriver.chrome.service import Service 4 | from webdriver_manager.chrome import ChromeDriverManager 5 | from selectolax.lexbor import LexborHTMLParser 6 | from typing import List, Dict, Callable, Union 7 | import pandas as pd 8 | import time, random 9 | 10 | class CustomGoogleScholarTopPublicationArticle: 11 | def __init__(self) -> None: 12 | pass 13 | 14 | 15 | def parse(self, parser: Callable, publication_citation_data: Callable): 16 | ''' 17 | Arugments: 18 | - parser: Lexbor parser from scrape_google_scholar_top_publication_articles() function. 19 | - publication_citation_data: List to append data to. List origin location is scrape_google_scholar_top_publication_articles() function. Line 104. 20 | 21 | This function parses data from Google Scholar Organic results and appends data to a List. 22 | 23 | It's used by scrape_google_scholar_top_publication_articles(). 
24 | ''' 25 | 26 | # selects the whole table without the first row (header row) 27 | for result in parser.css('tr:not(:first-child)'): 28 | try: 29 | title: str = result.css_first('.gsc_mp_anchor_lrge').text() 30 | except: title = None 31 | 32 | try: 33 | title_link: str = f"https://scholar.google.com{result.css_first('a.gsc_mp_anchor_lrge').attrs['href']}" 34 | except: title_link = None 35 | 36 | try: 37 | authors: list = result.css_first('.gsc_mpat_ttl+ .gs_gray').text().split(', ') 38 | except: authors = None 39 | 40 | try: 41 | published_at: str = result.css_first('.gs_gray+ .gs_gray').text() 42 | except: published_at = None 43 | 44 | try: 45 | cited_by_count: int = int(result.css_first('.gsc_mpat_c .gsc_mp_anchor').text()) 46 | except: cited_by_count = None 47 | 48 | try: 49 | cited_by_link: str = f"https://scholar.google.com{result.css_first('.gsc_mpat_c a.gsc_mp_anchor').attrs['href']}" 50 | except: cited_by_link = None 51 | 52 | try: 53 | year: int = int(result.css_first('.gsc_mp_anchor.gs_nph').text()) 54 | except: year = None 55 | 56 | 57 | publication_citation_data.append({ 58 | 'title': title, 59 | 'title_link': title_link, 60 | 'authors': authors, 61 | 'cited_by_link': cited_by_link, 62 | 'cited_by_count': cited_by_count, 63 | 'year': year, 64 | 'published_at': published_at 65 | }) 66 | 67 | #TODO: add lang support. https://serpapi.com/google-languages 68 | def scrape_google_scholar_top_publication_articles( 69 | self, 70 | journal_publications_link: str, 71 | pagination: bool = False, 72 | save_to_csv: bool = False, 73 | save_to_json: bool = False 74 | ) -> List[Dict[str, Union[str, List[str], int]]]: 75 | ''' 76 | Results comes from (for example): https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=9oNLl9DgMnQJ.2022 77 | 78 | Extracts data from Google Scholar Top Publication Metrics Citation page: 79 | - title: str 80 | - title_link: str 81 | - authors: list 82 | - cited_by_count: int 83 | - cited_by_link: str 84 | - year: int 85 | - published_at: str 86 | 87 | Arguments: 88 | - journal_publications_link: str. Search query. 89 | - pagination: bool. Enables or disables pagination. Default is False. 90 | - save_to_csv: bool. True of False. Default is False. 91 | - save_to_json: bool. True of False. Default is False. 
92 | 93 | Usage: 94 | 95 | from google_scholar_py import CustomGoogleScholarTopPublicationArticle 96 | 97 | parser = CustomGoogleScholarTopPublicationArticle() 98 | data = parser.scrape_google_scholar_top_publication_articles( 99 | journal_publications_link='https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=9oNLl9DgMnQJ.2022', # or link variable that stores the link 100 | pagination=False, 101 | save_to_csv=True 102 | ) 103 | 104 | for citations in data: 105 | print(citations['title'], citations['year'], citations['published_at'], sep='\\n') 106 | ''' 107 | 108 | # selenium stealth 109 | options = webdriver.ChromeOptions() 110 | options.add_argument('--headless') 111 | options.add_argument('--no-sandbox') 112 | options.add_argument('--disable-dev-shm-usage') 113 | 114 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) 115 | options.add_experimental_option('useAutomationExtension', False) 116 | 117 | service = Service(ChromeDriverManager().install()) 118 | driver = webdriver.Chrome(service=service, options=options) 119 | 120 | stealth(driver, 121 | languages=['en-US', 'en'], 122 | vendor='Google Inc.', 123 | platform='Win32', 124 | webgl_vendor='Intel Inc.', 125 | renderer='Intel Iris OpenGL Engine', 126 | fix_hairline=True, 127 | ) 128 | 129 | page_num = 0 130 | publication_citation_data = [] 131 | 132 | # parse all pages 133 | if pagination: 134 | while True: 135 | driver.get(journal_publications_link + f'&cstart={page_num}') # 'cstart' paramter is for pagination 136 | parser = LexborHTMLParser(driver.page_source) 137 | 138 | self.parse(parser=parser, publication_citation_data=publication_citation_data) 139 | 140 | # pagination 141 | if parser.css_first('.gsc_pgn_pnx:not([disabled])'): # checks if the "Next" page button selector is not disabled 142 | page_num += 20 # paginate to the next page 143 | time.sleep(random.randint(1, 3)) # sleep between paginations 144 | else: 145 | break 146 | else: 147 | # parse first page only 148 | driver.get(journal_publications_link) 149 | parser = LexborHTMLParser(driver.page_source) 150 | 151 | self.parse(parser=parser, publication_citation_data=publication_citation_data) 152 | 153 | if save_to_csv: 154 | pd.DataFrame(data=publication_citation_data).to_csv('google_scholar_top_publication_citations.csv', 155 | index=False, encoding='utf-8') 156 | if save_to_json: 157 | pd.DataFrame(data=publication_citation_data).to_json('google_scholar_top_publication_citations.json', 158 | orient='records') 159 | driver.quit() 160 | 161 | return publication_citation_data 162 | -------------------------------------------------------------------------------- /google_scholar_py/custom_backend/top_publications_article_citation.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium_stealth import stealth 3 | from selenium.webdriver.chrome.service import Service 4 | from webdriver_manager.chrome import ChromeDriverManager 5 | from selectolax.lexbor import LexborHTMLParser 6 | from typing import List, Dict, Callable, Union 7 | import pandas as pd 8 | import time, random 9 | 10 | class CustomGoogleScholarTopPublicationArticleCitation: 11 | def __init__(self) -> None: 12 | pass 13 | 14 | 15 | def parse(self, parser: Callable, publication_citation_data: Callable): 16 | ''' 17 | Arugments: 18 | - parser: Lexbor parser from scrape_google_scholar_top_publication_article_citations() function. 
19 | - publication_citation_data: List to append data to. List origin location is scrape_google_scholar_top_publication_article_citations() function. Line 104. 20 | 21 | This function parses data from Google Scholar Organic results and appends data to a List. 22 | 23 | It's used by scrape_google_scholar_top_publication_article_citations(). 24 | ''' 25 | 26 | # selects the whole table without the first row (header row) 27 | for result in parser.css('tr:not(:first-child)'): 28 | try: 29 | title: str = result.css_first('.gsc_mp_anchor_lrge').text() 30 | except: title = None 31 | 32 | try: 33 | title_link: str = f"https://scholar.google.com{result.css_first('a.gsc_mp_anchor_lrge').attrs['href']}" 34 | except: title_link = None 35 | 36 | try: 37 | authors: list = result.css_first('.gsc_mpat_ttl+ .gs_gray').text().split(', ') 38 | except: authors = None 39 | 40 | try: 41 | published_at: str = result.css_first('.gs_gray+ .gs_gray').text() 42 | except: published_at = None 43 | 44 | try: 45 | year: int = int(result.css_first('.gsc_mp_anchor.gs_nph').text()) 46 | except: year = None 47 | 48 | 49 | publication_citation_data.append({ 50 | 'title': title, 51 | 'title_link': title_link, 52 | 'authors': authors, 53 | 'year': year, 54 | 'published_at': published_at 55 | }) 56 | 57 | #TODO: add lang support. https://serpapi.com/google-languages 58 | def scrape_google_scholar_top_publication_article_citations( 59 | self, 60 | journal_publications_link: str, 61 | pagination: bool = False, 62 | save_to_csv: bool = False, 63 | save_to_json: bool = False 64 | ) -> List[Dict[str, Union[str, List[str], int]]]: 65 | ''' 66 | Results comes from (for example): https://scholar.google.com/citations?hl=en&venue=k6hd2dUel5kJ.2022&vq=en&view_op=hcore_citedby&hcore_pos=18 67 | 68 | Extracts data from Google Scholar Top Publication Metrics Citation page: 69 | - title: str 70 | - title_link: str 71 | - authors: list 72 | - published_at: str 73 | - year: int 74 | 75 | Arguments: 76 | - journal_publications_link: str. Search query. 77 | - pagination: bool. Enables or disables pagination. Default is False. 78 | - save_to_csv: bool. True of False. Default is False. 79 | - save_to_json: bool. True of False. Default is False. 
80 | 81 | Usage: 82 | 83 | from google_scholar_py import CustomGoogleScholarTopPublicationArticleCitation 84 | import json 85 | 86 | parser = CustomGoogleScholarTopPublicationArticleCitation() 87 | data = parser.scrape_google_scholar_top_publication_article_citations( 88 | journal_publications_link='https://scholar.google.com/citations?hl=en&venue=k6hd2dUel5kJ.2022&vq=en&view_op=hcore_citedby&hcore_pos=18', # or link variable that stores the link 89 | pagination=False, 90 | save_to_csv=True 91 | ) 92 | print(json.dumps(data, indent=2)) 93 | 94 | for citations in data: 95 | print(citations['title'], citations['year'], citations['published_at'], sep='\\n') 96 | ''' 97 | 98 | # selenium stealth 99 | options = webdriver.ChromeOptions() 100 | options.add_argument('--headless') 101 | options.add_argument('--no-sandbox') 102 | options.add_argument('--disable-dev-shm-usage') 103 | 104 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) 105 | options.add_experimental_option('useAutomationExtension', False) 106 | 107 | service = Service(ChromeDriverManager().install()) 108 | driver = webdriver.Chrome(service=service, options=options) 109 | 110 | stealth(driver, 111 | languages=['en-US', 'en'], 112 | vendor='Google Inc.', 113 | platform='Win32', 114 | webgl_vendor='Intel Inc.', 115 | renderer='Intel Iris OpenGL Engine', 116 | fix_hairline=True, 117 | ) 118 | 119 | page_num = 0 120 | publication_citation_data = [] 121 | 122 | # parse all pages 123 | if pagination: 124 | while True: 125 | driver.get(journal_publications_link + f'&cstart={page_num}') # 'cstart' paramter is for pagination 126 | parser = LexborHTMLParser(driver.page_source) 127 | 128 | self.parse(parser=parser, publication_citation_data=publication_citation_data) 129 | 130 | # pagination 131 | if parser.css_first('.gsc_pgn_pnx:not([disabled])'): # checks if the "Next" page button selector is not disabled 132 | page_num += 20 # paginate to the next page 133 | time.sleep(random.randint(1, 3)) # sleep between paginations 134 | else: 135 | break 136 | else: 137 | # parse first page only 138 | driver.get(journal_publications_link) 139 | parser = LexborHTMLParser(driver.page_source) 140 | 141 | self.parse(parser=parser, publication_citation_data=publication_citation_data) 142 | 143 | if save_to_csv: 144 | pd.DataFrame(data=publication_citation_data).to_csv('google_scholar_top_publication_citations.csv', 145 | index=False, encoding='utf-8') 146 | if save_to_json: 147 | pd.DataFrame(data=publication_citation_data).to_json('google_scholar_top_publication_citations.json', 148 | orient='records') 149 | driver.quit() 150 | 151 | return publication_citation_data 152 | -------------------------------------------------------------------------------- /google_scholar_py/custom_backend/top_publications_metrics.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium_stealth import stealth 3 | from selenium.webdriver.chrome.service import Service 4 | from webdriver_manager.chrome import ChromeDriverManager 5 | from selectolax.lexbor import LexborHTMLParser 6 | from typing import List, Dict, Callable, Union 7 | import pandas as pd 8 | 9 | class CustomGoogleScholarTopPublications: 10 | def __init__(self) -> None: 11 | pass 12 | 13 | 14 | def parse(self, parser: Callable, top_publications_data: Callable): 15 | ''' 16 | Arugments: 17 | - parser: Callable. Lexbor parser from google_scholar_top_publication_metrics() function. 
18 | - top_publications_data: Callable. List to append data to. List origin location is google_scholar_top_publication_metrics() function. Line 100. 19 | 20 | This function parses data from Google Scholar Organic results and appends data to a List. 21 | 22 | It's used by google_scholar_top_publication_metrics(). 23 | 24 | It returns nothing as it appends data to `top_publications_data`, 25 | which appends it to `top_publications_data` List in the google_scholar_top_publication_metrics() function. 26 | ''' 27 | 28 | # selectors skips table header row 29 | for table in parser.css('tr:not(:first-child)'): 30 | try: 31 | title: str = table.css_first('td.gsc_mvt_t').text() 32 | except: title = None 33 | 34 | try: 35 | h5_index: int = table.css_first('a.gs_ibl').text() 36 | except: h5_index = None 37 | 38 | try: 39 | h5_index_link: str = f"https://scholar.google.com{table.css_first('a.gs_ibl').attrs['href']}" 40 | except: h5_index_link = None 41 | 42 | try: 43 | h5_median: int = table.css_first('span.gs_ibl').text() 44 | except: h5_median = None 45 | 46 | top_publications_data.append({ 47 | 'title': title, 48 | 'h5_index': int(h5_index) if h5_index else h5_index, 49 | 'h5_index_link': h5_index_link, 50 | 'h5_median': int(h5_median) if h5_median else h5_median 51 | }) 52 | 53 | 54 | def scrape_top_publication_metrics( 55 | self, 56 | category: str = '', 57 | lang: str = 'en', 58 | save_to_csv: bool = False, 59 | save_to_json: bool = False, 60 | ) -> List[Dict[str, Union[str, int]]]: 61 | #TODO add subcategories to subcategory arg 62 | #TODO: support other languages: lang='spanish' -> 'sp'. https://serpapi.com/google-languages 63 | 64 | 65 | ''' 66 | Results comes from: https://scholar.google.com/citations?view_op=top_venues 67 | 68 | Returns: 69 | - title: str 70 | - h5_index: int 71 | - h5_index_link: str 72 | - h5_median: int 73 | 74 | Arguments: 75 | - save_to_csv: True of False. Default is False. Saves data to CSV file. 76 | - save_to_json: True of False. Default is False. Saves data to JSON file. 77 | - lang: str. Language. Defaults to English ('en'). For now, need to be checked yourself. Other languages: https://serpapi.com/google-languages 78 | - category: str. Available categories showed in the function documentation below. 79 | Available categories: 80 | - "bus": Business, Economics & Management 81 | - "chm": Chemical & Material Sciences 82 | - "eng": Engineering & Computer Science 83 | - "med": Health & Medical Sciences 84 | - "hum": Humanities, Literature & Arts 85 | - "bio": Life Sciences & Earth Sciences 86 | - "phy": Physics & Mathematics 87 | - "soc": Social Sciences 88 | 89 | Usage: 90 | 91 | from google_scholar_py import CustomGoogleScholarTopPublications 92 | 93 | data = CustomGoogleScholarTopPublications().scrape_top_publication_metrics(category='eng', lang='en') # sv = swedish 94 | 95 | for result in data: 96 | print(result['title']) 97 | ... 
98 | ''' 99 | 100 | # selenium stealth 101 | options = webdriver.ChromeOptions() 102 | options.add_argument('--headless') 103 | options.add_argument('--no-sandbox') 104 | options.add_argument('--disable-dev-shm-usage') 105 | 106 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) 107 | options.add_experimental_option('useAutomationExtension', False) 108 | 109 | service = Service(ChromeDriverManager().install()) 110 | driver = webdriver.Chrome(service=service, options=options) 111 | 112 | stealth(driver, 113 | languages=['en-US', 'en'], 114 | vendor='Google Inc.', 115 | platform='Win32', 116 | webgl_vendor='Intel Inc.', 117 | renderer='Intel Iris OpenGL Engine', 118 | fix_hairline=True 119 | ) 120 | 121 | top_publications_data = [] 122 | 123 | if category: 124 | driver.get(f'https://scholar.google.com/citations?view_op=top_venues&hl={lang}&vq={category}') 125 | parser = LexborHTMLParser(driver.page_source) 126 | self.parse(parser=parser, top_publications_data=top_publications_data) 127 | else: 128 | # no vq={category} URL parameter 129 | driver.get(f'https://scholar.google.com/citations?view_op=top_venues&hl={lang}&vq={category}') # vq='' which will redirect to the page with no applied category 130 | parser = LexborHTMLParser(driver.page_source) 131 | self.parse(parser=parser, top_publications_data=top_publications_data) 132 | 133 | if save_to_csv: 134 | pd.DataFrame(data=top_publications_data).to_csv('google_scholar_top_publications_data.csv', 135 | index=False, encoding='utf-8') 136 | if save_to_json: 137 | pd.DataFrame(data=top_publications_data).to_json('google_scholar_top_publications_data.json', 138 | orient='records') 139 | 140 | driver.quit() 141 | return top_publications_data 142 | -------------------------------------------------------------------------------- /google_scholar_py/serpapi_backend/author_results.py: -------------------------------------------------------------------------------- 1 | from serpapi import GoogleScholarSearch 2 | from urllib.parse import urlsplit, parse_qsl 3 | import itertools 4 | 5 | #TODO: support pagination using `async` parameter 6 | 7 | class SerpApiGoogleScholarAuthor: 8 | def __init__(self) -> None: 9 | pass 10 | 11 | def scrape_google_scholar_author_results( 12 | self, 13 | author_id: str, 14 | api_key: str = None, 15 | lang: str = 'en', 16 | parse_articles: bool = False, 17 | article_pagination: bool = False, 18 | ): 19 | 20 | ''' 21 | Extracts all author data: author info, cited by (table, graph), co-authors, all articles. 22 | 23 | Arguments: 24 | - author_id: author id. 25 | - api_key: SerpApi api key, https://serpapi.com/manage-api-key 26 | - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages 27 | - parse_articles: parses first page of authour articles. Defalul 'False'. 28 | - article_pagination: True of False. Enables to parse all articles. Default 'False'. 29 | 30 | Usage: 31 | 32 | from google_scholar_py.serpapi_backend.author_results import SerpApiGoogleScholarAuthor 33 | 34 | parser = SerpApiGoogleScholarAuthor() 35 | data = parser.scrape_google_scholar_author_results( 36 | author_id='nHhtvqkAAAAJ', 37 | api_key='serpapi_api_key', 38 | parse_articles=True, 39 | article_pagination=True, 40 | ) 41 | 42 | print(data.keys()) # show available keys 43 | 44 | for article in data['articles']: 45 | print(article['title']) 46 | ''' 47 | 48 | if api_key is None: 49 | raise Exception('Please enter a SerpApi API key to a `api_key` argument. 
https://serpapi.com/manage-api-key') 50 | 51 | if author_id is None: 52 | raise Exception('Please enter a author id.') 53 | 54 | if api_key and author_id is None: 55 | raise Exception('Please enter a SerpApi API key to a `api_key`, and a author id to `author_id` arguments.') 56 | 57 | params = { 58 | 'api_key': api_key, # serpapi api key 59 | 'engine': 'google_scholar_author', # serpapi parsing engine 60 | 'author_id': author_id, # search by author id 61 | 'hl': lang # language 62 | } 63 | 64 | search = GoogleScholarSearch(params) # where data extracts on the backend 65 | 66 | # parsing ALL articles along with author info 67 | if parse_articles and article_pagination: 68 | params['start'] = 0 # page number: 0 is first page, 1 is second, etc. 69 | params['pagesize'] = 100 # number of articles per page 70 | 71 | author_all_articles = [] 72 | 73 | while True: 74 | results = search.get_dict() 75 | 76 | if 'error' in results: 77 | print(results['error']) 78 | break 79 | 80 | author_all_articles.append(results['articles']) 81 | 82 | # check for the `next` page 83 | if 'next' in results.get('serpapi_pagination', {}): 84 | search.params_dict.update(dict(parse_qsl(urlsplit(results['serpapi_pagination']['next']).query))) 85 | else: 86 | break 87 | 88 | # remove articles key that creates a nested lists 89 | results.pop('articles') 90 | 91 | # flatten list of all articles 92 | author_all_articles_flatten = list(itertools.chain(*author_all_articles)) 93 | results['articles'] = author_all_articles_flatten 94 | 95 | keys_to_delete = ['search_metadata', 'search_parameters'] 96 | for key_to_delete in keys_to_delete: 97 | results.pop(key_to_delete) 98 | 99 | return results 100 | 101 | # parsing ONLY FIRST PAGE of articles along with author info 102 | if parse_articles: 103 | search = GoogleScholarSearch(params) 104 | results = search.get_dict() # JSON -> Python dict 105 | 106 | if 'error' in results: 107 | raise Exception(results['error']) 108 | 109 | keys_to_delete = ['search_metadata', 'search_parameters', 'serpapi_pagination'] 110 | 111 | for key_to_delete in keys_to_delete: 112 | results.pop(key_to_delete) 113 | 114 | return results 115 | 116 | # if don't need to parse any articles -> remove them from the JSON 117 | elif article_pagination or parse_articles is False: 118 | search = GoogleScholarSearch(params) 119 | results = search.get_dict() 120 | 121 | if 'error' in results: 122 | raise Exception(results['error']) 123 | 124 | keys_to_delete = ['search_metadata', 'search_parameters', 'articles', 'serpapi_pagination'] 125 | 126 | for key_to_delete in keys_to_delete: 127 | results.pop(key_to_delete) 128 | 129 | return results 130 | 131 | -------------------------------------------------------------------------------- /google_scholar_py/serpapi_backend/organic_cite_results.py: -------------------------------------------------------------------------------- 1 | from .organic_results import SerpApiGoogleScholarOrganic 2 | from serpapi import GoogleScholarSearch 3 | 4 | #TODO: support extracting actual Cite data, for example Bibtex: shorturl.at/vGNU5 5 | 6 | class SerpApiGoogleScholarOrganicCite: 7 | def __init__(self) -> None: 8 | pass 9 | 10 | 11 | def scrape_google_scholar_cite_results( 12 | self, 13 | query: str, 14 | api_key: str = None, 15 | lang: str = 'en', 16 | pagination: bool = False 17 | ): 18 | 19 | ''' 20 | This function extract citations as well as BibTeX, EndNote, RefMan, RefWorks links. 
21 | 22 | To extract citations, 2 requests has to be made: first for organic results, second for citation data. 23 | So if you need to get citations from 1000 articles, 2000 requests would be made accordingly. 24 | 25 | Arguments: 26 | - query: search query 27 | - api_key: SerpApi api key, https://serpapi.com/manage-api-key 28 | - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages 29 | - pagination: True of False. Enables pagination from all pages. Default 'False'. 30 | 31 | Usage: 32 | 33 | from google_scholar_py.serpapi_backend.organic_cite_results import SerpApiGoogleScholarOrganicCite 34 | 35 | parser = SerpApiGoogleScholarOrganicCite() 36 | data = parser.scrape_google_scholar_cite_results( 37 | query='minecraft', 38 | api_key='serpapi_api_key', 39 | pagination=True 40 | ) 41 | 42 | # extracting bottom links 43 | for result in data: 44 | for citations in result['links']: 45 | print(citations['name']) # or ['link'] 46 | 47 | # extracting citations 48 | for result in data: 49 | for citations in result['citations']: 50 | print(citations['title']) # or ['snippet'] 51 | ''' 52 | 53 | if api_key is None: 54 | raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key') 55 | 56 | #TODO: could be removed as function by itself throw an error if query is missing 57 | if api_key and query is None: 58 | raise Exception('Please enter a SerpApi API key to a `api_key`, and a search query to `query` arguments.') 59 | 60 | # extract organic results from where citation data will be extracted 61 | organic_results = SerpApiGoogleScholarOrganic().scrape_google_scholar_organic_results( 62 | query=query, 63 | api_key=api_key, 64 | lang=lang, 65 | pagination=pagination 66 | ) 67 | 68 | cite_results_data = [] 69 | 70 | for citation in organic_results: 71 | params = { 72 | 'api_key': api_key, # serpapi api key: https://serpapi.com/manage-api-key 73 | 'engine': 'google_scholar_cite', # serpapi parsing engine 74 | 'q': citation['result_id'] # search query 75 | } 76 | 77 | search = GoogleScholarSearch(params) # where data extracts on the backend 78 | results = search.get_dict() 79 | 80 | # removes 2 keys from the JSON response 81 | for key_to_delete in ['search_metadata', 'search_parameters']: 82 | results.pop(key_to_delete) 83 | 84 | if 'error' in results: 85 | raise Exception(results['error']) 86 | 87 | cite_results_data.append(results) 88 | 89 | return cite_results_data 90 | -------------------------------------------------------------------------------- /google_scholar_py/serpapi_backend/organic_results.py: -------------------------------------------------------------------------------- 1 | from serpapi import GoogleScholarSearch 2 | from urllib.parse import urlsplit, parse_qsl 3 | import itertools 4 | 5 | #TODO: support pagination using `async` parameter 6 | 7 | class SerpApiGoogleScholarOrganic: 8 | def __init__(self) -> None: 9 | pass 10 | 11 | 12 | #TODO: add test API key so users can test out before passing their own? 13 | def scrape_google_scholar_organic_results( 14 | self, 15 | query: str, 16 | api_key: str = None, 17 | lang: str = 'en', 18 | pagination: bool = False, 19 | ): 20 | 21 | ''' 22 | This function extracts all possible data from Google Scholar organic results. With or without pagination. 23 | 24 | Arguments: 25 | - query: search query 26 | - api_key: SerpApi api key, https://serpapi.com/manage-api-key 27 | - lang: language for the search. Default 'en'. 
More: https://serpapi.com/google-languages 28 | - pagination: True of False. Enables pagination from all pages. Default 'False'. 29 | 30 | Usage: 31 | 32 | from google_scholar_py.serpapi_backend.organic_results import SerpApiGoogleScholarOrganic 33 | 34 | parser = SerpApiGoogleScholarOrganic() 35 | data = parser.scrape_google_scholar_organic_results( 36 | query='minecraft', 37 | api_key='serpapi_api_key', 38 | pagination=True 39 | ) 40 | 41 | print(data[0].keys()) # show available keys 42 | 43 | for result in data: 44 | print(result['title']) # and other data 45 | ''' 46 | 47 | if api_key is None: 48 | raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key') 49 | 50 | if api_key and query is None: 51 | raise Exception('Please enter a SerpApi API key to a `api_key`, and a search query to `query` arguments.') 52 | 53 | params = { 54 | 'api_key': api_key, # serpapi api key: https://serpapi.com/manage-api-key 55 | 'engine': 'google_scholar', # serpapi parsing engine 56 | 'q': query, # search query 57 | 'hl': lang, # language 58 | 'start': 0 # first page. Used for pagination: https://serpapi.com/google-scholar-api#api-parameters-pagination-start 59 | } 60 | 61 | search = GoogleScholarSearch(params) # where data extracts on the backend 62 | 63 | if pagination: 64 | organic_results_data = [] 65 | 66 | while True: 67 | results = search.get_dict() # JSON -> Python dict 68 | 69 | if 'error' in results: 70 | print(results['error']) 71 | break 72 | 73 | organic_results_data.append(results['organic_results']) 74 | 75 | # check for `serpapi_pagination` and then for `next` page 76 | if 'next' in results.get('serpapi_pagination', {}): 77 | search.params_dict.update(dict(parse_qsl(urlsplit(results['serpapi_pagination']['next']).query))) 78 | else: 79 | break 80 | 81 | # flatten list 82 | return list(itertools.chain(*organic_results_data)) 83 | else: 84 | # remove page number key from the request parameters 85 | # parse first page only 86 | params.pop('start') 87 | 88 | search = GoogleScholarSearch(params) 89 | results = search.get_dict() 90 | 91 | if 'error' in results: 92 | raise Exception(results['error']) 93 | 94 | return results['organic_results'] 95 | 96 | 97 | -------------------------------------------------------------------------------- /google_scholar_py/serpapi_backend/profile_results.py: -------------------------------------------------------------------------------- 1 | from serpapi import GoogleScholarSearch 2 | from urllib.parse import parse_qsl, urlsplit 3 | import itertools 4 | 5 | 6 | #TODO: support pagination using `async` parameter 7 | 8 | class SerpApiGoogleScholarProfiles: 9 | def __init__(self) -> None: 10 | pass 11 | 12 | def scrape_google_scholar_profile_results( 13 | self, 14 | query: str, 15 | api_key: str = None, 16 | lang: str = 'en', 17 | pagination: bool = False, 18 | ): 19 | 20 | ''' 21 | This function extracts profile results. With or without pagination. 22 | 23 | Arguments: 24 | - query: search query 25 | - api_key: SerpApi api key, https://serpapi.com/manage-api-key 26 | - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages 27 | - pagination: True of False. Enables pagination from all pages. Default 'False'. 
28 | 29 | Usage: 30 | 31 | from google_scholar_py.serpapi_backend.profile_results import SerpApiGoogleScholarProfiles 32 | 33 | parser = SerpApiGoogleScholarProfiles() 34 | data = parser.scrape_google_scholar_profile_results( 35 | query='minecraft', 36 | api_key='serpapi_api_key', 37 | pagination=True, 38 | ) 39 | 40 | print(data[0].keys()) # show available keys 41 | 42 | for result in data: 43 | print(result['title']) 44 | # get other data 45 | ''' 46 | 47 | if api_key is None: 48 | raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key') 49 | 50 | if api_key and query is None: 51 | raise Exception('Please enter a SerpApi API key to a `api_key`, and a search query to `query` arguments.') 52 | 53 | params = { 54 | 'api_key': api_key, # serpapi api key: https://serpapi.com/manage-api-key 55 | 'engine': 'google_scholar_profiles', # serpapi parsing engine 56 | 'mauthors': query, # search query 57 | 'hl': lang # language 58 | } 59 | 60 | search = GoogleScholarSearch(params) # where data extracts on the backend 61 | 62 | if pagination: 63 | profile_results_data = [] 64 | 65 | while True: 66 | results = search.get_dict() # JSON -> Python dict 67 | 68 | if 'error' in results: 69 | print(results['error']) 70 | break 71 | 72 | profile_results_data.append(results['profiles']) 73 | 74 | # check for 'next' page 75 | if 'next' in results.get('pagination', {}): 76 | search.params_dict.update(dict(parse_qsl(urlsplit(results['pagination']['next']).query))) 77 | else: 78 | break 79 | 80 | # flatten list 81 | return list(itertools.chain(*profile_results_data)) 82 | else: 83 | search = GoogleScholarSearch(params) 84 | results = search.get_dict() 85 | 86 | if 'error' in results: 87 | raise Exception(results['error']) 88 | 89 | return results['profiles'] 90 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.ruff] 6 | line-length = 125 7 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==7.3.1 2 | pytest-cov==4.0.0 3 | pytest-xdist==3.3.0 4 | coverage==7.2.5 5 | ruff==0.0.243 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | async-generator==1.10 2 | attrs==22.2.0 3 | bleach==6.0.0 4 | CacheControl==0.12.11 5 | certifi==2022.12.7 6 | cffi==1.15.1 7 | charset-normalizer==3.0.1 8 | cleo==2.0.1 9 | colorama==0.4.6 10 | crashtest==0.4.1 11 | cryptography==39.0.1 12 | cssselect==1.2.0 13 | Cython==0.29.33 14 | distlib==0.3.6 15 | docutils==0.19 16 | dulwich==0.20.50 17 | exceptiongroup==1.1.0 18 | execnet==1.9.0 19 | filelock==3.9.0 20 | google-search-results==2.4.2 21 | h11==0.14.0 22 | html5lib==1.1 23 | idna==3.4 24 | importlib-metadata==4.13.0 25 | importlib-resources==5.10.2 26 | iniconfig==2.0.0 27 | jaraco.classes==3.2.3 28 | jeepney==0.8.0 29 | jsonschema==4.17.3 30 | keyring==23.13.1 31 | lockfile==0.12.2 32 | lxml==4.9.2 33 | markdown-it-py==2.1.0 34 | mdurl==0.1.2 35 | more-itertools==9.0.0 36 | msgpack==1.0.4 37 | numpy==1.24.2 38 | outcome==1.2.0 39 | packaging==23.0 40 | pandas==1.5.3 41 | parsel==1.7.0 42 | pexpect==4.8.0 43 | 
pkginfo==1.9.6 44 | pkgutil_resolve_name==1.3.10 45 | platformdirs==2.6.2 46 | pluggy==1.0.0 47 | poetry==1.3.2 48 | poetry-core==1.4.0 49 | poetry-plugin-export==1.3.0 50 | ptyprocess==0.7.0 51 | pycparser==2.21 52 | Pygments==2.14.0 53 | pyrsistent==0.19.3 54 | PySocks==1.7.1 55 | python-dateutil==2.8.2 56 | python-dotenv==1.0.0 57 | pytz==2022.7.1 58 | pywin32-ctypes==0.2.0 59 | rapidfuzz==2.13.7 60 | readme-renderer==37.3 61 | requests==2.28.2 62 | requests-toolbelt==0.10.1 63 | rfc3986==2.0.0 64 | rich==13.3.1 65 | SecretStorage==3.3.3 66 | selectolax==0.3.12 67 | selenium==4.8.0 68 | selenium-stealth==1.0.6 69 | shellingham==1.5.0.post1 70 | six==1.16.0 71 | sniffio==1.3.0 72 | sortedcontainers==2.4.0 73 | tomli==2.0.1 74 | tomlkit==0.11.6 75 | tqdm==4.65.0 76 | trio==0.22.0 77 | trio-websocket==0.9.2 78 | trove-classifiers==2023.1.20 79 | typing_extensions==4.4.0 80 | urllib3==1.26.14 81 | virtualenv==20.19.0 82 | w3lib==2.1.1 83 | webdriver-manager==3.8.5 84 | webencodings==0.5.1 85 | wsproto==1.2.0 86 | zipp==3.12.1 87 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | README = '' 4 | with open('README.md', 'r', encoding='utf-8') as readme_file: 5 | README = readme_file.read() 6 | 7 | setup( 8 | name='scrape-google-scholar-py', 9 | description = 'Extract data from all Google Scholar pages in Python. Sponsored by SerpApi.', 10 | url='https://github.com/dimitryzub/scrape-google-scholar', 11 | version='0.3.4', 12 | license='MIT', 13 | author='Dmitiry Zub', 14 | author_email='dimitryzub@gmail.com', 15 | maintainer='Dmitiry Zub', 16 | maintainer_email='dimitryzub@gmail.com', 17 | long_description_content_type='text/markdown', 18 | long_description=README, 19 | include_package_data=True, 20 | python_requires='>=3.10', 21 | classifiers = [ 22 | 'Development Status :: 3 - Alpha', 23 | 'Intended Audience :: Developers', 24 | 'Operating System :: Microsoft :: Windows', 25 | 'Operating System :: MacOS', 26 | 'Operating System :: POSIX :: Linux', 27 | 'Topic :: Internet', 28 | 'Natural Language :: English', 29 | 'Topic :: Utilities', 30 | 'Programming Language :: Python :: 3.10', 31 | 'Programming Language :: Python :: 3.11', 32 | ], 33 | keywords=[ 34 | 'google scholar', 35 | 'serpapi', 36 | 'scraper', 37 | 'python', 38 | 'python google scholar', 39 | 'python google scholar api', 40 | 'web scraping', 41 | 'python web scraping', 42 | 'research', 43 | 'lexbor', 44 | 'selectolax', 45 | 'selenium', 46 | 'selenium-stealth', 47 | 'pandas', 48 | ], 49 | install_requires=[ 50 | 'google-search-results>=2.4.2', 51 | 'selectolax>=0.3.12', 52 | 'parsel>=1.7.0', 53 | 'selenium-stealth>=1.0.6', 54 | 'pandas>=1.5.3', 55 | 'webdriver-manager>=3.8.5' 56 | ], 57 | project_urls={ 58 | 'Documentation': 'https://github.com/dimitryzub/scrape-google-scholar#example-usage-custom-backend', 59 | 'Source': 'https://github.com/dimitryzub/scrape-google-scholar', 60 | 'Tracker': 'https://github.com/dimitryzub/scrape-google-scholar/issues', 61 | }, 62 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimitryzub/scrape-google-scholar-py/2a11840c7d19d23faca0c544c61cc5fd1aa4dadd/tests/__init__.py -------------------------------------------------------------------------------- 
/tests/test_custom_profile.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | from pathlib import Path 4 | import os 5 | from google_scholar_py.custom_backend.profiles_results import CustomGoogleScholarProfiles 6 | 7 | 8 | # # Tests for CustomGoogleScholarProfiles class 9 | # @pytest.fixture(scope='session') 10 | # def google_scholar_parser(): 11 | # return CustomGoogleScholarProfiles() 12 | 13 | @pytest.fixture(scope='session') 14 | def search_query(): 15 | return 'blizzard' 16 | 17 | def test_custom_google_scholar_profiles_scrape_without_pagination(search_query): 18 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=False) 19 | assert len(results) > 0 20 | 21 | def test_custom_google_scholar_profiles_scrape_with_pagination(search_query): 22 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=True) 23 | assert len(results) > 0 24 | 25 | def test_custom_google_scholar_profiles_save_to_csv(search_query): 26 | CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=False, save_to_csv=True) 27 | 28 | # ../ as file saves in root, might save to a special "results" folder 29 | assert Path().cwd().joinpath('tests', '../google_scholar_profile_results_data.csv').exists() 30 | 31 | def test_custom_google_scholar_profiles_save_to_json(search_query): 32 | CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=False, save_to_json=True) 33 | 34 | # ../ as file saves in root, might save to a special "results" folder 35 | assert Path().cwd().joinpath('tests', '../google_scholar_profile_results_data.json').exists() 36 | 37 | # @pytest.fixture(scope='session') 38 | # def remove_test_files(): 39 | # csv_file = Path().cwd().parent / 'google_scholar_profile_results_data.csv' 40 | # json_file = Path().cwd().parent / 'google_scholar_profile_results_data.json' 41 | # os.remove(csv_file) 42 | # os.remove(json_file) 43 | 44 | 45 | # Tests for scrape_google_scholar_profiles function 46 | class TestScrapeGoogleScholarProfiles(unittest.TestCase): 47 | 48 | def test_scrape_google_scholar_profiles_returns_list(self): 49 | query = "machine learning" 50 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query) 51 | self.assertIsInstance(results, list) 52 | 53 | def test_scrape_google_scholar_profiles_returns_correct_data_types(self): 54 | query = "machine learning" 55 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query) 56 | 57 | for profile_data in results: 58 | self.assertIsInstance(profile_data, dict) 59 | self.assertIsInstance(profile_data['name'], str) 60 | self.assertIsInstance(profile_data['link'], str) 61 | self.assertIsInstance(profile_data['affiliations'], str) 62 | self.assertIsInstance(profile_data['email'], str) 63 | self.assertIsInstance(profile_data['cited_by_count'], (int, type(None))) # int or None; `int or None` would always evaluate to just `int` 64 | self.assertIsInstance(profile_data['interests'], (list, type(None))) # list or None 65 | for interest in profile_data['interests'] or []: # guard against None before iterating 66 | self.assertIsInstance(interest, str) 67 | 68 | def test_scrape_google_scholar_profiles_returns_valid_data(self): 69 | query = "machine learning" 70 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=query) 71 | 72 | for profile_data in results: 73 | self.assertIsNotNone(profile_data['name']) 74 | self.assertIsNotNone(profile_data['link']) 75 | 
self.assertIsNotNone(profile_data['affiliations']) 76 | self.assertIsNotNone(profile_data['email']) 77 | self.assertIsNotNone(profile_data['cited_by_count']) 78 | self.assertGreater(len(profile_data['interests']), 0) 79 | 80 | 81 | 82 | if __name__ == '__main__': 83 | unittest.main() --------------------------------------------------------------------------------
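Example usage (illustrative sketch, not a repository file): the snippet below combines the usage examples from the docstrings above into one minimal script. The query 'blizzard' and the 'your_serpapi_api_key' value are placeholder assumptions; the custom backend drives a local headless Chrome through webdriver-manager, while the SerpApi backend needs a valid API key.

from google_scholar_py import CustomGoogleScholarOrganic
from google_scholar_py.serpapi_backend.organic_results import SerpApiGoogleScholarOrganic

# custom backend: headless Chrome + selenium-stealth, no API key required
organic_results = CustomGoogleScholarOrganic().scrape_google_scholar_organic_results(
    query='blizzard',      # example query (assumption)
    pagination=False,
    save_to_csv=True       # writes google_scholar_organic_results_data.csv
)
for result in organic_results:
    print(result['title'], result['cited_by_count'])

# SerpApi backend: the same kind of data via the SerpApi service
serpapi_results = SerpApiGoogleScholarOrganic().scrape_google_scholar_organic_results(
    query='blizzard',
    api_key='your_serpapi_api_key',  # placeholder: https://serpapi.com/manage-api-key
    pagination=False
)
print(serpapi_results[0].keys())     # inspect available fields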