├── .github
    ├── octo-reminder.yml
    └── workflows
    │   ├── codecov-workflow.yml
    │   └── potential-duplicates.yml
├── .gitignore
├── .gitpod.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── example_usage.py
├── google_scholar_py
    ├── __init__.py
    ├── custom_backend
    │   ├── author_info_all_articles.py
    │   ├── cite_results.py
    │   ├── google_scholar_cited_by_public_access_author.py
    │   ├── organic_search.py
    │   ├── profiles_results.py
    │   ├── top_mandates_metrics.py
    │   ├── top_publications_article.py
    │   ├── top_publications_article_citation.py
    │   └── top_publications_metrics.py
    └── serpapi_backend
    │   ├── author_results.py
    │   ├── organic_cite_results.py
    │   ├── organic_results.py
    │   └── profile_results.py
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    └── test_custom_profile.py


/.github/octo-reminder.yml:
--------------------------------------------------------------------------------
 1 | # Octo Reminder Configuration
 2 | 
 3 | ## Command Prefix
 4 | ### Define the prefix of your custom command.
 5 | ### Type: '/' | '!'
 6 | ### Default: '@' (works only in combination with command_name 'set-reminder')
 7 | command_prefix: '@'
 8 | 
 9 | ## Command Name
10 | ### Define the name of your custom command.
11 | ### Type: String
12 | ### Default: 'set-reminder'
13 | command_name: 'set-reminder'
14 | 
15 | ## Language
16 | ### Define the language.
17 | ### Type: 'en' | 'fr' | 'de' | 'pt' | 'nl' | 'ja'
18 | ### Default: 'en'
19 | language: 'en'
20 | 
21 | ## Timezone
22 | ### Define the timezone.
23 | ### Type: String (see also https://github.com/moment/moment-timezone/blob/develop/data/packed/latest.json)
24 | ### Default: 'Europe/London'
25 | timezone: 'Europe/London'
26 | 
27 | ## Default Hour
28 | ### Define the hour that will be used, when no time is specified.
29 | ### Type: Number
30 | default_hour: 0
31 | 
32 | ## Default Minute
33 | ### Define the minute that will be used, when no time is specified.
34 | ### Type: Number
35 | default_minute: 0
36 | 


--------------------------------------------------------------------------------
/.github/workflows/codecov-workflow.yml:
--------------------------------------------------------------------------------
 1 | name: Test API coverage
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |     paths-ignore:
 7 |       - 'README.md'
 8 |       - 'MANIFEST.in'
 9 |       - 'LICENSE'
10 |       - '.gitignore'
11 |       - '.gitpod.yaml'
12 |   pull_request:
13 |     branches: [ main ]
14 |     paths-ignore:
15 |       - 'README.md'
16 |       - 'MANIFEST.in'
17 |       - 'LICENSE'
18 |       - '.gitignore'
19 |       - '.gitpod.yaml'
20 | 
21 | jobs:
22 |   build:
23 |     runs-on: ubuntu-latest
24 |     name: Set up Python 3.11
25 |     steps:
26 |     - uses: actions/checkout@v3
27 |     - uses: actions/setup-python@v2
28 |       with:
29 |         python-version: '3.11'
30 | 
31 |     - name: Install requirements
32 |       run: pip install -r requirements.txt && pip install -r requirements-dev.txt
33 |       
34 |     - name: Run tests and collect coverage
35 |       run: pytest --cov=./ --cov-report=xml:coverage.xml
36 | 
37 |     - name: Upload coverage reports to Codecov with GitHub Action
38 |       uses: codecov/codecov-action@v3
39 |       with:
40 |           token: ${{ secrets.CODECOV_TOKEN }}


--------------------------------------------------------------------------------
/.github/workflows/potential-duplicates.yml:
--------------------------------------------------------------------------------
 1 | name: Potential Duplicates
 2 | on:
 3 |   issues:
 4 |     types: [opened, edited]
 5 | jobs:
 6 |   run:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - uses: wow-actions/potential-duplicates@v1
10 |         with:
11 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
12 |           # Issue title filter work with anymatch https://www.npmjs.com/package/anymatch.
13 |           # Any matched issue will stop detection immediately.
14 |           # You can specify multi filters in each line.
15 |           filter: ''
16 |           # Exclude keywords in title before detecting.
17 |           exclude: ''
18 |           # Label to set, when potential duplicates are detected.
19 |           label: potential-duplicate
20 |           # Get issues with state to compare. Supported state: 'all', 'closed', 'open'.
21 |           state: all
22 |           # If similarity is higher than this threshold([0,1]), issue will be marked as duplicate.
23 |           threshold: 0.6
24 |           # Reactions to be add to comment when potential duplicates are detected.
25 |           # Available reactions: "-1", "+1", "confused", "laugh", "heart", "hooray", "rocket", "eyes"
26 |           reactions: 'eyes'
27 |           # Comment to post when potential duplicates are detected.
28 |           comment: >
29 |             Potential duplicates: {{#issues}}
30 |               - [#{{ number }}] {{ title }} ({{ accuracy }}%)
31 |             {{/issues}}
32 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist/
2 | env/
3 | *.egg-info/
4 | docs/
5 | __pycache__/


--------------------------------------------------------------------------------
/.gitpod.yaml:
--------------------------------------------------------------------------------
 1 | tasks:
 2 |   - name: Update Linux and Install other Chrome Dependencies
 3 |     init: |
 4 |       sudo apt-get update -y && sudo apt-get upgrade -y && sudo apt-get install -y libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1
 5 |   - name: Create VirtualEnv, Install Python Dependencies
 6 |     init: |
 7 |       python -m venv env
 8 |       source env/bin/activate
 9 |       pip install -r requirements.txt
10 | github:
11 |   prebuilds:
12 |     addBadge: true
13 | vscode:
14 |   extensions:
15 |     - usernamehw.errorlens
16 |     - vscode-icons-team.vscode-icons
17 |     - bierner.markdown-preview-github-styles
18 |     - ms-python.python
19 |     - ms-toolsai.jupyter
20 |     - KevinRose.vsc-python-indent
21 |     - eamodio.gitlens
22 |     - Gruntfuggly.todo-tree


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Dmitiry Zub☀️
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE requirements.txt


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | <p>Special thanks to:</p>
  3 | <div>
  4 |    <img src="https://user-images.githubusercontent.com/78694043/231375638-5bbf2989-fc7b-482a-b6fe-603d1d6d613f.svg" width="60" alt="SerpApi">
  5 | </div>
  6 | <a href="https://serpapi.com">
  7 | 	<b>API to get search engine results with ease.</b>
  8 | </a>
  9 | </div>
 10 | 
 11 | ____
 12 | 
 13 | <h3 align="center">
 14 |   Scrape data from all Google Scholar pages from a single Python module.
 15 | </h3>
 16 | 
 17 | <div align="center">
 18 |    <img src="https://user-images.githubusercontent.com/78694043/231677340-8980c44d-389a-497d-b021-b1d4ab846a77.svg" width="600" alt="scrape-google-scholar-py-logo">
 19 | </div>
 20 | 
 21 | 
 22 | <div align="center">
 23 | 
 24 |   <a href="https://pepy.tech/project/scrape-google-scholar-py">![Downloads](https://static.pepy.tech/badge/scrape-google-scholar-py/month)</a>
 25 |   <a href="">![licence](https://img.shields.io/github/license/dimitryzub/scrape-google-scholar-py?color=blue)</a>
 26 |   [![codecov](https://codecov.io/github/dimitryzub/scrape-google-scholar-py/branch/main/graph/badge.svg?token=OIQKN0O3B9)](https://codecov.io/github/dimitryzub/scrape-google-scholar-py)
 27 |   
 28 | </div>
 29 | 
 30 | > NOTE: As for now (2025), I no longer maintaining this repo. This could be changed later. To fix common issues, Chrome driver/CSS selectors might need an update.
 31 | 
 32 | <details>
 33 | <summary>🧐 Why two backends?</summary>
 34 | 
 35 | 1. If you don't want to pay for API. However, I'm not 100% sure if [`selenium-stealth`](https://pypi.org/project/selenium-stealth/) could handle all CAPTCHAs (although it handles CAPTCHA by Cloudflare) and similar blocks.
 36 | 2. If you know about SerpApi but don't want to figure out pagination.
 37 | 
 38 | SerpApi backend is more reliable because of:
 39 | - dedicated team of maintainers
 40 | - pool of proxies
 41 | - CAPTCHA solvers
 42 | - legal part of scraping and more.
 43 | 
 44 | </details>
 45 | 
 46 | 
 47 | <details>
 48 | <summary>🧩 Custom backend supports</summary>
 49 | 
 50 | 1. [Organic results](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=blizzard&btnG=&oq=blizz) (with pagination).
 51 | 2. [Profile results](https://scholar.google.com/citations?view_op=search_authors&mauthors=blizzard&hl=en&oi=drw) (with pagination).
 52 | 3. [Author + author articles](https://scholar.google.com/citations?user=6IQ8pQwAAAAJ&hl=en&oi=sra) (with pagination), everything except "cited by" graph.
 53 | 4. [Public access mandates metrics](https://scholar.google.com/citations?view_op=mandates_leaderboard&hl=en). Yes, you can download CSV with one click, however, it doesn't contain a funder link. Script here has it and saves to CSV/JSON.
 54 | 5. [Top publications metrics](https://scholar.google.com/citations?view_op=top_venues&hl=en). Categories is also supported (as function argument). Saves to CSV/JSON. Sub-categories are not yet supported.
 55 | 6. [Journal articles](https://github.com/dimitryzub/scrape-google-scholar/issues/2) (with pagination).
 56 | 
 57 | You can use [`scholary`](https://github.com/scholarly-python-package/scholarly) to parse the data instead. However, it only extracts first 3 points above (organic, profile, author results).  
 58 | 
 59 |   <details>
 60 |   <summary>Things custom backend doesn't support yet</summary>
 61 | 
 62 |   1. Organic results filters (case law, sorting, period ranges). You can add those URL parameters yourself ([if installing from source](https://github.com/dimitryzub/scrape-google-scholar-py#installing)) easily to the `google_scholar_py/custom_backend/organic_search.py` file (line [`147`](https://github.com/dimitryzub/scrape-google-scholar-py/blob/a6b3b39042eabdc84851e3c1ca3c246e55bf19d1/google_scholar_py/custom_backend/organic_search.py#L147) or [`136`](https://github.com/dimitryzub/scrape-google-scholar-py/blob/a6b3b39042eabdc84851e3c1ca3c246e55bf19d1/google_scholar_py/custom_backend/organic_search.py#L160)), where `driver.get()` is being called.
 63 |   2. Author page -> cited by graph.
 64 |   3. Extracting [journal articles page](https://scholar.google.com/citations?hl=uk&vq=en&view_op=list_hcore&venue=9oNLl9DgMnQJ.2022). The [issue to add this page is open](https://github.com/dimitryzub/scrape-google-scholar/issues/2).
 65 |   4. [Top publications metrics page](https://scholar.google.com/citations?view_op=top_venues&hl=en). Subcategories are not yet supported, it's in a TODO list. 
 66 |   5. Update [cite results](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=blizzard+effects+xanax&oq=blizzard+effects+x#d=gs_cit&t=1674718593252&u=%2Fscholar%3Fq%3Dinfo%3Alm-jhjzd72UJ%3Ascholar.google.com%2F%26output%3Dcite%26scirp%3D7%26hl%3Den) page extraction.
 67 |   </details>
 68 | </details>
 69 | 
 70 | <details>
 71 | <summary>🔮 SerpApi backend supports</summary>
 72 | 
 73 | - [Google Scholar Organic](https://serpapi.com/google-scholar-organic-results)
 74 | - [Google Scholar Profiles](https://serpapi.com/google-scholar-profilesapi)
 75 | - [Google Scholar Author](https://serpapi.com/google-scholar-author-api)
 76 | - [Google Scholar Cite](https://serpapi.com/google-scholar-cite-api)
 77 | </details>
 78 | 
 79 | <details>
 80 | <summary>🏗 Custom backend depends on</summary>
 81 | 
 82 | - [`selenium-stealth`](https://github.com/diprajpatra/selenium-stealth) - to bypass CAPTCHAs and render some HTML (like cite results from organic result).
 83 | - [`selectolax`](https://github.com/rushter/selectolax) - to parse HTML fast. Its the fastest Python parser wrapped around [`lexbor`](https://github.com/lexbor/lexbor) (parser in pure C).
 84 | - [`pandas`](https://pandas.pydata.org/) - to save extracted data to CSV or JSON, or if you want to analyze the data right away. Save options is used in organic results and top publications, public access mandates pages for now.
 85 | 
 86 | All scripts are using headless [`selenium-stealth`](https://github.com/diprajpatra/selenium-stealth) to bypass CAPTCHA that appears on Google Scholar, so you need to have a `chromedriver`. If you're on Linux you may need to do additional troubleshooting if `chromedriver` won't run properly.
 87 | </details>
 88 | 
 89 | ## 📥Installing
 90 | 
 91 | Install via `pip`:
 92 | 
 93 | ```bash
 94 | $ pip install scrape-google-scholar-py
 95 | ```
 96 | 
 97 | Install from source (single piped command):
 98 | 
 99 | ```bash
100 | git clone https://github.com/dimitryzub/scrape-google-scholar-py.git \
101 | && cd scrape-google-scholar-py \
102 | && python -m venv env && source env/Scripts/activate \
103 | && pip install -r requirements.txt
104 | ```
105 | 
106 | ### Possible errors that you might encounter
107 | 
108 | <details>
109 | <summary>LINUX USERS: If it throws "Web-driver exits unexpectedly" error</summary>
110 | 
111 |   Try installing extra dependencies to run `chromedriver`:	
112 |   ```bash
113 |   $ apt-get install -y libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1
114 |   ```
115 | 
116 |   See resolved issue: [[Linux] Web-driver exits unexpectedly using CustomGoogleScholarOrganic() #7](https://github.com/dimitryzub/scrape-google-scholar-py/issues/7)	
117 | </details>
118 | 
119 | 
120 | <details>
121 | <summary>For MAC users, possible issues and fixes</summary>
122 | 	
123 |   - ✅ [(resolved question): Wheels failed to build while pip installing](https://github.com/dimitryzub/scrape-google-scholar-py/issues/12#issuecomment-1554266222)
124 | </details>	
125 | 	
126 | 	
127 | <details>
128 | <summary>If it throws an error with `selenium-stealth`</summary>
129 | 
130 |   ```bash
131 |   error: The 'selenium' distribution was not found and is required by selenium-stealth
132 |   ```
133 | 
134 |   Use:
135 | 
136 |   ```bash
137 |   $ pip install selenium-stealth
138 |   ```
139 | </details>
140 | 
141 | ## 📝Example usage custom backend
142 | 
143 | ```python
144 | from google_scholar_py import CustomGoogleScholarProfiles
145 | import json
146 | 
147 | parser = CustomGoogleScholarProfiles()
148 | data = parser.scrape_google_scholar_profiles(
149 |     query='blizzard',
150 |     pagination=False,
151 |     save_to_csv=False,
152 |     save_to_json=False
153 | )
154 | print(json.dumps(data, indent=2))
155 | ```
156 | 
157 | <details>
158 | <summary>Google Scholar search operators could also be used</summary>
159 | 
160 | ```lang-none
161 | label:computer_vision "Michigan State University"|"U.Michigan"
162 | ```
163 | 
164 | This query will search all profiles from 2 universities based on "computer vision" query.
165 | </details>
166 | 
167 | 
168 | <details>
169 | <summary>JSON output</summary>
170 | 
171 | ```json
172 | [
173 |   {
174 |     "name": "Adam Lobel",
175 |     "link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ",
176 |     "affiliations": "Blizzard Entertainment",
177 |     "interests": [
178 |       "Gaming",
179 |       "Emotion regulation"
180 |     ],
181 |     "email": "Verified email at AdamLobel.com",
182 |     "cited_by_count": 3593
183 |   },
184 |   {
185 |     "name": "Daniel Blizzard",
186 |     "link": "https://scholar.google.com/citations?hl=en&user=dk4LWEgAAAAJ",
187 |     "affiliations": "",
188 |     "interests": null,
189 |     "email": null,
190 |     "cited_by_count": 1041
191 |   },
192 |   {
193 |     "name": "Shuo Chen",
194 |     "link": "https://scholar.google.com/citations?hl=en&user=OBf4YnkAAAAJ",
195 |     "affiliations": "Senior Data Scientist, Blizzard Entertainment",
196 |     "interests": [
197 |       "Machine Learning",
198 |       "Data Mining",
199 |       "Artificial Intelligence"
200 |     ],
201 |     "email": "Verified email at cs.cornell.edu",
202 |     "cited_by_count": 725
203 |   },
204 |   {
205 |     "name": "Ian Livingston",
206 |     "link": "https://scholar.google.com/citations?hl=en&user=xBHVqNIAAAAJ",
207 |     "affiliations": "Blizzard Entertainment",
208 |     "interests": [
209 |       "Human-computer interaction",
210 |       "User Experience",
211 |       "Player Experience",
212 |       "User Research",
213 |       "Games"
214 |     ],
215 |     "email": "Verified email at usask.ca",
216 |     "cited_by_count": 652
217 |   },
218 |   {
219 |     "name": "Minli Xu",
220 |     "link": "https://scholar.google.com/citations?hl=en&user=QST5iogAAAAJ",
221 |     "affiliations": "Blizzard Entertainment",
222 |     "interests": [
223 |       "Game",
224 |       "Machine Learning",
225 |       "Data Science",
226 |       "Bioinformatics"
227 |     ],
228 |     "email": "Verified email at blizzard.com",
229 |     "cited_by_count": 541
230 |   },
231 |   {
232 |     "name": "Je Seok Lee",
233 |     "link": "https://scholar.google.com/citations?hl=en&user=vuvtlzQAAAAJ",
234 |     "affiliations": "Blizzard Entertainment",
235 |     "interests": [
236 |       "HCI",
237 |       "Player Experience",
238 |       "Games",
239 |       "Esports"
240 |     ],
241 |     "email": "Verified email at uci.edu",
242 |     "cited_by_count": 386
243 |   },
244 |   {
245 |     "name": "Alisha Ness",
246 |     "link": "https://scholar.google.com/citations?hl=en&user=xQuwVfkAAAAJ",
247 |     "affiliations": "Activision Blizzard",
248 |     "interests": null,
249 |     "email": null,
250 |     "cited_by_count": 324
251 |   },
252 |   {
253 |     "name": "Xingyu (Alfred) Liu",
254 |     "link": "https://scholar.google.com/citations?hl=en&user=VW9ukOwAAAAJ",
255 |     "affiliations": "Blizzard Entertainment",
256 |     "interests": [
257 |       "Machine Learning in Game Development"
258 |     ],
259 |     "email": null,
260 |     "cited_by_count": 256
261 |   },
262 |   {
263 |     "name": "Amanda LL Cullen",
264 |     "link": "https://scholar.google.com/citations?hl=en&user=oqna6OgAAAAJ",
265 |     "affiliations": "Blizzard Entertainment",
266 |     "interests": [
267 |       "Games Studies",
268 |       "Fan Studies",
269 |       "Live Streaming"
270 |     ],
271 |     "email": null,
272 |     "cited_by_count": 247
273 |   },
274 |   {
275 |     "name": "Nicole \"Nikki\" Crenshaw",
276 |     "link": "https://scholar.google.com/citations?hl=en&user=zmRH6E0AAAAJ",
277 |     "affiliations": "Blizzard Entertainment",
278 |     "interests": [
279 |       "MMOs",
280 |       "Neoliberalism",
281 |       "Social Affordances",
282 |       "Identity",
283 |       "Accessibility"
284 |     ],
285 |     "email": "Verified email at uci.edu",
286 |     "cited_by_count": 202
287 |   }
288 | ]
289 | ```
290 | 
291 | </details>
292 | 
293 | 
294 | ## 📝Example usage SerpApi backend
295 | 
296 | ```python
297 | from google_scholar_py import SerpApiGoogleScholarOrganic
298 | import json
299 | 
300 | profile_parser = SerpApiGoogleScholarProfiles()
301 | data = profile_parser.scrape_google_scholar_profile_results(
302 |     query='blizzard',
303 |     api_key='your-serpapi-api-key', # https://serpapi.com/manage-api-key
304 |     pagination=False,
305 |     # other params
306 | )
307 | print(json.dumps(data, indent=2))
308 | ```
309 | 
310 | <details>
311 | <summary>JSON output</summary>
312 | 
313 | ```json
314 | [
315 |   {
316 |     "position": 0,
317 |     "title": "Mining learning and crafting scientific experiments: a literature review on the use of minecraft in education and research",
318 |     "result_id": "61OUs-3P374J",
319 |     "link": "https://www.jstor.org/stable/pdf/jeductechsoci.19.2.355.pdf?&seq=1",
320 |     "snippet": "\u2026 Minecraft have aroused the attention of teachers and researchers alike. To gain insights into the applicability of Minecraft, \u2026 our own considerable experience with Minecraft in courses on \u2026",
321 |     "publication_info": {
322 |       "summary": "S Nebel, S Schneider, GD Rey - Journal of Educational Technology & \u2026, 2016 - JSTOR",
323 |       "authors": [
324 |         {
325 |           "name": "S Nebel",
326 |           "link": "https://scholar.google.com/citations?user=_WTrwUwAAAAJ&hl=en&oi=sra",
327 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=_WTrwUwAAAAJ&engine=google_scholar_author&hl=en", 
328 |           "author_id": "_WTrwUwAAAAJ"
329 |         },
330 |         {
331 |           "name": "S Schneider",
332 |           "link": "https://scholar.google.com/citations?user=6Lh4FBMAAAAJ&hl=en&oi=sra",
333 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=6Lh4FBMAAAAJ&engine=google_scholar_author&hl=en", 
334 |           "author_id": "6Lh4FBMAAAAJ"
335 |         },
336 |         {
337 |           "name": "GD Rey",
338 |           "link": "https://scholar.google.com/citations?user=jCilMQoAAAAJ&hl=en&oi=sra",
339 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=jCilMQoAAAAJ&engine=google_scholar_author&hl=en", 
340 |           "author_id": "jCilMQoAAAAJ"
341 |         }
342 |       ]
343 |     },
344 |     "resources": [
345 |       {
346 |         "title": "researchgate.net",
347 |         "file_format": "PDF",
348 |         "link": "https://www.researchgate.net/profile/Steve-Nebel/publication/301232882_Mining_Learning_and_Crafting_Scientific_Experiments_A_Literature_Review_on_the_Use_of_Minecraft_in_Education_and_Research/links/570e709008aed4bec6fddad4/Mining-Learning-and-Crafting-Scientific-Experiments-A-Literature-Review-on-the-Use-of-Minecraft-in-Education-and-Research.pdf"
349 |       }
350 |     ],
351 |     "inline_links": {
352 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=61OUs-3P374J",
353 |       "cited_by": {
354 |         "total": 358,
355 |         "link": "https://scholar.google.com/scholar?cites=13753940406839825387&as_sdt=2005&sciodt=0,5&hl=en",
356 |         "cites_id": "13753940406839825387",
357 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=13753940406839825387&engine=google_scholar&hl=en"
358 |       },
359 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:61OUs-3P374J:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
360 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A61OUs-3P374J%3Ascholar.google.com%2F",
361 |       "versions": {
362 |         "total": 10,
363 |         "link": "https://scholar.google.com/scholar?cluster=13753940406839825387&hl=en&as_sdt=0,5",
364 |         "cluster_id": "13753940406839825387",
365 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=13753940406839825387&engine=google_scholar&hl=en"
366 |       }
367 |     }
368 |   },
369 |   {
370 |     "position": 1,
371 |     "title": "Minecraft, beyond construction and survival",
372 |     "result_id": "_Lo9erywZPUJ",
373 |     "type": "Pdf",
374 |     "link": "https://stacks.stanford.edu/file/druid:qq694ht6771/WellPlayed-v1n1-11.pdf#page=9",
375 |     "snippet": "\" We\u2019ll keep releasing expansions and keep the game alive, but there needs to be some kind of final version that you can point at and say,\u2018I did this!\u2019... I\u2019m not sure why I feel a need to \u2026",
376 |     "publication_info": {
377 |       "summary": "SC Duncan - 2011 - stacks.stanford.edu",
378 |       "authors": [
379 |         {
380 |           "name": "SC Duncan",
381 |           "link": "https://scholar.google.com/citations?user=Ypqv_IEAAAAJ&hl=en&oi=sra",
382 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=Ypqv_IEAAAAJ&engine=google_scholar_author&hl=en", 
383 |           "author_id": "Ypqv_IEAAAAJ"
384 |         }
385 |       ]
386 |     },
387 |     "resources": [
388 |       {
389 |         "title": "stanford.edu",
390 |         "file_format": "PDF",
391 |         "link": "https://stacks.stanford.edu/file/druid:qq694ht6771/WellPlayed-v1n1-11.pdf#page=9"
392 |       }
393 |     ],
394 |     "inline_links": {
395 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=_Lo9erywZPUJ",
396 |       "cited_by": {
397 |         "total": 288,
398 |         "link": "https://scholar.google.com/scholar?cites=17682452360514616060&as_sdt=2005&sciodt=0,5&hl=en",
399 |         "cites_id": "17682452360514616060",
400 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=17682452360514616060&engine=google_scholar&hl=en"
401 |       },
402 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:_Lo9erywZPUJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
403 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A_Lo9erywZPUJ%3Ascholar.google.com%2F",
404 |       "versions": {
405 |         "total": 6,
406 |         "link": "https://scholar.google.com/scholar?cluster=17682452360514616060&hl=en&as_sdt=0,5",
407 |         "cluster_id": "17682452360514616060",
408 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=17682452360514616060&engine=google_scholar&hl=en"
409 |       },
410 |       "cached_page_link": "https://scholar.googleusercontent.com/scholar?q=cache:_Lo9erywZPUJ:scholar.google.com/+minecraft&hl=en&as_sdt=0,5"
411 |     }
412 |   },
413 |   {
414 |     "position": 2,
415 |     "title": "Minecraft as a creative tool: A case study",
416 |     "result_id": "wOTRJ8q0KIsJ",
417 |     "link": "https://www.igi-global.com/article/minecraft-as-a-creative-tool/116516",
418 |     "snippet": "\u2026 environment, Minecraft. In the following case study, the authors explored the use of Minecraft in \u2026 The authors demonstrate that Minecraft offers a unique opportunity for students to display \u2026",
419 |     "publication_info": {
420 |       "summary": "M Cipollone, CC Schifter, RA Moffat - International Journal of Game \u2026, 2014 - igi-global.com"
421 |     },
422 |     "resources": [
423 |       {
424 |         "title": "minecraft.school.nz",
425 |         "file_format": "PDF",
426 |         "link": "https://www.minecraft.school.nz/uploads/2/9/6/3/2963069/minecraft-as-a-creative-tool_-a-case-study_cipollone2014.pdf"
427 |       }
428 |     ],
429 |     "inline_links": {
430 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=wOTRJ8q0KIsJ",
431 |       "cited_by": {
432 |         "total": 102,
433 |         "link": "https://scholar.google.com/scholar?cites=10027463350684869824&as_sdt=2005&sciodt=0,5&hl=en",
434 |         "cites_id": "10027463350684869824",
435 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=10027463350684869824&engine=google_scholar&hl=en"
436 |       },
437 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:wOTRJ8q0KIsJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
438 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AwOTRJ8q0KIsJ%3Ascholar.google.com%2F",
439 |       "versions": {
440 |         "total": 9,
441 |         "link": "https://scholar.google.com/scholar?cluster=10027463350684869824&hl=en&as_sdt=0,5",
442 |         "cluster_id": "10027463350684869824",
443 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=10027463350684869824&engine=google_scholar&hl=en"
444 |       }
445 |     }
446 |   },
447 |   {
448 |     "position": 3,
449 |     "title": "Learning mathematics through Minecraft",
450 |     "result_id": "Hh4p5NaYNu0J",
451 |     "link": "https://pubs.nctm.org/abstract/journals/tcm/21/1/article-p56.xml",
452 |     "snippet": "\u2026 Minecraft to explore area and perimeter. First, the teacher reviewed the definition of perimeter and area. Using a class set of iPods with Minecraft \u2026 Minecraft forms a medium to explore \u2026",
453 |     "publication_info": {
454 |       "summary": "B Bos, L Wilder, M Cook, R O'Donnell - Teaching Children \u2026, 2014 - pubs.nctm.org",
455 |       "authors": [
456 |         {
457 |           "name": "B Bos",
458 |           "link": "https://scholar.google.com/citations?user=DfdRg-8AAAAJ&hl=en&oi=sra",
459 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=DfdRg-8AAAAJ&engine=google_scholar_author&hl=en", 
460 |           "author_id": "DfdRg-8AAAAJ"
461 |         }
462 |       ]
463 |     },
464 |     "resources": [
465 |       {
466 |         "title": "researchgate.net",
467 |         "file_format": "PDF",
468 |         "link": "https://www.researchgate.net/profile/Beth-Bos/publication/267507986_Learning_mathematics_through_Minecraft_Authors/links/545103b80cf249aa53dc8eb2/Learning-mathematics-through-Minecraft-Authors.pdf"
469 |       }
470 |     ],
471 |     "inline_links": {
472 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=Hh4p5NaYNu0J",
473 |       "cited_by": {
474 |         "total": 120,
475 |         "link": "https://scholar.google.com/scholar?cites=17093017484449619486&as_sdt=2005&sciodt=0,5&hl=en",
476 |         "cites_id": "17093017484449619486",
477 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=17093017484449619486&engine=google_scholar&hl=en"
478 |       },
479 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:Hh4p5NaYNu0J:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
480 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AHh4p5NaYNu0J%3Ascholar.google.com%2F",
481 |       "versions": {
482 |         "total": 8,
483 |         "link": "https://scholar.google.com/scholar?cluster=17093017484449619486&hl=en&as_sdt=0,5",
484 |         "cluster_id": "17093017484449619486",
485 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=17093017484449619486&engine=google_scholar&hl=en"
486 |       }
487 |     }
488 |   },
489 |   {
490 |     "position": 4,
491 |     "title": "A deep hierarchical approach to lifelong learning in minecraft",
492 |     "result_id": "a_Er9i3hDtUJ",
493 |     "link": "https://ojs.aaai.org/index.php/AAAI/article/view/10744",
494 |     "snippet": "We propose a lifelong learning system that has the ability to reuse and transfer knowledge from one task to another while efficiently retaining the previously learned knowledge-base. \u2026",
495 |     "publication_info": {
496 |       "summary": "C Tessler, S Givony, T Zahavy, D Mankowitz\u2026 - Proceedings of the \u2026, 2017 - ojs.aaai.org",
497 |       "authors": [
498 |         {
499 |           "name": "C Tessler",
500 |           "link": "https://scholar.google.com/citations?user=7eLKa3IAAAAJ&hl=en&oi=sra",
501 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=7eLKa3IAAAAJ&engine=google_scholar_author&hl=en", 
502 |           "author_id": "7eLKa3IAAAAJ"
503 |         },
504 |         {
505 |           "name": "S Givony",
506 |           "link": "https://scholar.google.com/citations?user=nlVsO4YAAAAJ&hl=en&oi=sra",
507 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=nlVsO4YAAAAJ&engine=google_scholar_author&hl=en", 
508 |           "author_id": "nlVsO4YAAAAJ"
509 |         },
510 |         {
511 |           "name": "T Zahavy",
512 |           "link": "https://scholar.google.com/citations?user=9dXN6cMAAAAJ&hl=en&oi=sra",
513 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=9dXN6cMAAAAJ&engine=google_scholar_author&hl=en", 
514 |           "author_id": "9dXN6cMAAAAJ"
515 |         },
516 |         {
517 |           "name": "D Mankowitz",
518 |           "link": "https://scholar.google.com/citations?user=v84tWxsAAAAJ&hl=en&oi=sra",
519 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=v84tWxsAAAAJ&engine=google_scholar_author&hl=en", 
520 |           "author_id": "v84tWxsAAAAJ"
521 |         }
522 |       ]
523 |     },
524 |     "resources": [
525 |       {
526 |         "title": "aaai.org",
527 |         "file_format": "PDF",
528 |         "link": "https://ojs.aaai.org/index.php/AAAI/article/view/10744/10603"
529 |       }
530 |     ],
531 |     "inline_links": {
532 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=a_Er9i3hDtUJ",
533 |       "cited_by": {
534 |         "total": 364,
535 |         "link": "https://scholar.google.com/scholar?cites=15352455767272452459&as_sdt=2005&sciodt=0,5&hl=en",
536 |         "cites_id": "15352455767272452459",
537 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=15352455767272452459&engine=google_scholar&hl=en"
538 |       },
539 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:a_Er9i3hDtUJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
540 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3Aa_Er9i3hDtUJ%3Ascholar.google.com%2F",
541 |       "versions": {
542 |         "total": 13,
543 |         "link": "https://scholar.google.com/scholar?cluster=15352455767272452459&hl=en&as_sdt=0,5",
544 |         "cluster_id": "15352455767272452459",
545 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=15352455767272452459&engine=google_scholar&hl=en"
546 |       },
547 |       "cached_page_link": "https://scholar.googleusercontent.com/scholar?q=cache:a_Er9i3hDtUJ:scholar.google.com/+minecraft&hl=en&as_sdt=0,5"
548 |     }
549 |   },
550 |   {
551 |     "position": 5,
552 |     "title": "Teaching scientific concepts using a virtual world: Minecraft.",
553 |     "result_id": "Oh88DuoTaLYJ",
554 |     "link": "https://search.informit.org/doi/abs/10.3316/aeipt.195598",
555 |     "snippet": "Minecraft is a multiplayer sandbox video game based in a virtual world modelled on the real \u2026 of Minecraft lends itself to the teaching of various academic subjects. Minecraft also has a \u2026",
556 |     "publication_info": {
557 |       "summary": "D Short - Teaching science, 2012 - search.informit.org",
558 |       "authors": [
559 |         {
560 |           "name": "D Short",
561 |           "link": "https://scholar.google.com/citations?user=ec_1ZmMAAAAJ&hl=en&oi=sra",
562 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=ec_1ZmMAAAAJ&engine=google_scholar_author&hl=en", 
563 |           "author_id": "ec_1ZmMAAAAJ"
564 |         }
565 |       ]
566 |     },
567 |     "resources": [
568 |       {
569 |         "title": "academia.edu",
570 |         "file_format": "PDF",
571 |         "link": "https://www.academia.edu/download/31153502/Short-2012-MC-Color-Version.pdf"
572 |       }
573 |     ],
574 |     "inline_links": {
575 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=Oh88DuoTaLYJ",
576 |       "cited_by": {
577 |         "total": 274,
578 |         "link": "https://scholar.google.com/scholar?cites=13143777408462888762&as_sdt=2005&sciodt=0,5&hl=en",
579 |         "cites_id": "13143777408462888762",
580 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=13143777408462888762&engine=google_scholar&hl=en"
581 |       },
582 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:Oh88DuoTaLYJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
583 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AOh88DuoTaLYJ%3Ascholar.google.com%2F",
584 |       "versions": {
585 |         "total": 8,
586 |         "link": "https://scholar.google.com/scholar?cluster=13143777408462888762&hl=en&as_sdt=0,5",
587 |         "cluster_id": "13143777408462888762",
588 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=13143777408462888762&engine=google_scholar&hl=en"
589 |       }
590 |     }
591 |   },
592 |   {
593 |     "position": 6,
594 |     "title": "Investigating the role of Minecraft in educational learning environments",
595 |     "result_id": "6RcOZdlG3CcJ",
596 |     "link": "https://www.tandfonline.com/doi/abs/10.1080/09523987.2016.1254877",
597 |     "snippet": "\u2026 This research paper identifies the way in which Minecraft Edu can be used to contribute to the teaching 
598 | and learning of secondary students via a multiple case research study. Minecraft \u2026",
599 |     "publication_info": {
600 |       "summary": "N Callaghan - Educational Media International, 2016 - Taylor & Francis"
601 |     },
602 |     "inline_links": {
603 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=6RcOZdlG3CcJ",
604 |       "cited_by": {
605 |         "total": 95,
606 |         "link": "https://scholar.google.com/scholar?cites=2872248561872803817&as_sdt=2005&sciodt=0,5&hl=en",
607 |         "cites_id": "2872248561872803817",
608 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=2872248561872803817&engine=google_scholar&hl=en"
609 |       },
610 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:6RcOZdlG3CcJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
611 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A6RcOZdlG3CcJ%3Ascholar.google.com%2F",
612 |       "versions": {
613 |         "total": 3,
614 |         "link": "https://scholar.google.com/scholar?cluster=2872248561872803817&hl=en&as_sdt=0,5",
615 |         "cluster_id": "2872248561872803817",
616 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=2872248561872803817&engine=google_scholar&hl=en"
617 |       }
618 |     }
619 |   },
620 |   {
621 |     "position": 7,
622 |     "title": "Maker culture and Minecraft: implications for the future of learning",
623 |     "result_id": "h27IfZ5va2YJ",
624 |     "link": "https://www.tandfonline.com/doi/abs/10.1080/09523987.2015.1075103",
625 |     "snippet": "\u2026 be best to subscribe to for gathering information on Minecraft maker culture. From there, we \u2026 the 
626 | Minecraft videos that we are studying \u201ccreators\u201d due to the culture of the Minecraft video \u2026",
627 |     "publication_info": {
628 |       "summary": "DJ Niemeyer, HR Gerber - Educational Media International, 2015 - Taylor & Francis",
629 |       "authors": [
630 |         {
631 |           "name": "DJ Niemeyer",
632 |           "link": "https://scholar.google.com/citations?user=iEZOnzQAAAAJ&hl=en&oi=sra",
633 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=iEZOnzQAAAAJ&engine=google_scholar_author&hl=en", 
634 |           "author_id": "iEZOnzQAAAAJ"
635 |         },
636 |         {
637 |           "name": "HR Gerber",
638 |           "link": "https://scholar.google.com/citations?user=DwyCTMUAAAAJ&hl=en&oi=sra",
639 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=DwyCTMUAAAAJ&engine=google_scholar_author&hl=en", 
640 |           "author_id": "DwyCTMUAAAAJ"
641 |         }
642 |       ]
643 |     },
644 |     "resources": [
645 |       {
646 |         "title": "publicservicesalliance.org",
647 |         "file_format": "PDF",
648 |         "link": "http://publicservicesalliance.org/wp-content/uploads/2016/06/Maker_culture_and_Minecraft_implications.pdf"    
649 |       }
650 |     ],
651 |     "inline_links": {
652 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=h27IfZ5va2YJ",
653 |       "cited_by": {
654 |         "total": 114,
655 |         "link": "https://scholar.google.com/scholar?cites=7380115140882493063&as_sdt=2005&sciodt=0,5&hl=en",
656 |         "cites_id": "7380115140882493063",
657 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=7380115140882493063&engine=google_scholar&hl=en"
658 |       },
659 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:h27IfZ5va2YJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
660 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3Ah27IfZ5va2YJ%3Ascholar.google.com%2F",
661 |       "versions": {
662 |         "total": 8,
663 |         "link": "https://scholar.google.com/scholar?cluster=7380115140882493063&hl=en&as_sdt=0,5",
664 |         "cluster_id": "7380115140882493063",
665 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=7380115140882493063&engine=google_scholar&hl=en"
666 |       }
667 |     }
668 |   },
669 |   {
670 |     "position": 8,
671 |     "title": "Control of memory, active perception, and action in minecraft",
672 |     "result_id": "-5uM8qRUviwJ",
673 |     "link": "http://proceedings.mlr.press/v48/oh16.html",
674 |     "snippet": "In this paper, we introduce a new set of reinforcement learning (RL) tasks in Minecraft (a flexible 3D world). 
675 | We then use these tasks to systematically compare and contrast existing \u2026",
676 |     "publication_info": {
677 |       "summary": "J Oh, V Chockalingam, H Lee - \u2026 conference on machine \u2026, 2016 - proceedings.mlr.press",
678 |       "authors": [
679 |         {
680 |           "name": "J Oh",
681 |           "link": "https://scholar.google.com/citations?user=LNUeOu4AAAAJ&hl=en&oi=sra",
682 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=LNUeOu4AAAAJ&engine=google_scholar_author&hl=en", 
683 |           "author_id": "LNUeOu4AAAAJ"
684 |         },
685 |         {
686 |           "name": "V Chockalingam",
687 |           "link": "https://scholar.google.com/citations?user=CM2UkioAAAAJ&hl=en&oi=sra",
688 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=CM2UkioAAAAJ&engine=google_scholar_author&hl=en", 
689 |           "author_id": "CM2UkioAAAAJ"
690 |         },
691 |         {
692 |           "name": "H Lee",
693 |           "link": "https://scholar.google.com/citations?user=fmSHtE8AAAAJ&hl=en&oi=sra",
694 |           "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=fmSHtE8AAAAJ&engine=google_scholar_author&hl=en", 
695 |           "author_id": "fmSHtE8AAAAJ"
696 |         }
697 |       ]
698 |     },
699 |     "resources": [
700 |       {
701 |         "title": "mlr.press",
702 |         "file_format": "PDF",
703 |         "link": "http://proceedings.mlr.press/v48/oh16.pdf"
704 |       }
705 |     ],
706 |     "inline_links": {
707 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=-5uM8qRUviwJ",
708 |       "cited_by": {
709 |         "total": 317,
710 |         "link": "https://scholar.google.com/scholar?cites=3224107450664524795&as_sdt=2005&sciodt=0,5&hl=en",
711 |         "cites_id": "3224107450664524795",
712 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=3224107450664524795&engine=google_scholar&hl=en"
713 |       },
714 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:-5uM8qRUviwJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
715 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A-5uM8qRUviwJ%3Ascholar.google.com%2F",
716 |       "versions": {
717 |         "total": 7,
718 |         "link": "https://scholar.google.com/scholar?cluster=3224107450664524795&hl=en&as_sdt=0,5",
719 |         "cluster_id": "3224107450664524795",
720 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=3224107450664524795&engine=google_scholar&hl=en"
721 |       },
722 |       "cached_page_link": "http://scholar.googleusercontent.com/scholar?q=cache:-5uM8qRUviwJ:scholar.google.com/+minecraft&hl=en&as_sdt=0,5"
723 |     }
724 |   },
725 |   {
726 |     "position": 9,
727 |     "title": "Minecraft as a teaching tool: One case study",
728 |     "result_id": "yItxbN8DVXYJ",
729 |     "link": "https://www.learntechlib.org/p/48540/",
730 |     "snippet": "We know games help students gain skills and insights in many ways, and that games are engaging. With new online MMOPRPG games, like Minecraft, what we do not know is what \u2026",
731 |     "publication_info": {
732 |       "summary": "C Schifter, M Cipollone - Society for Information Technology & \u2026, 2013 - learntechlib.org"
733 |     },
734 |     "inline_links": {
735 |       "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=yItxbN8DVXYJ",
736 |       "cited_by": {
737 |         "total": 55,
738 |         "link": "https://scholar.google.com/scholar?cites=8526725727627873224&as_sdt=2005&sciodt=0,5&hl=en",
739 |         "cites_id": "8526725727627873224",
740 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=8526725727627873224&engine=google_scholar&hl=en"
741 |       },
742 |       "related_pages_link": "https://scholar.google.com/scholar?q=related:yItxbN8DVXYJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
743 |       "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AyItxbN8DVXYJ%3Ascholar.google.com%2F",
744 |       "versions": {
745 |         "total": 2,
746 |         "link": "https://scholar.google.com/scholar?cluster=8526725727627873224&hl=en&as_sdt=0,5",
747 |         "cluster_id": "8526725727627873224",
748 |         "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=8526725727627873224&engine=google_scholar&hl=en"
749 |       }
750 |     }
751 |   }
752 | ]
753 | ```
754 | 
755 | </details>
756 | 
757 | ## ✍Contributing
758 | 
759 | Feel free to open an issue:
760 | - what bug you found.
761 | - something isn't working.
762 | - what feature to add.
763 | - anything else related to Google Scholar.
764 | 
765 | If you find comfortable to open a PR, feel free to do so. Guidelines are simple: conventional commits + code as simple as possible without unnecessary complexity.
766 | 
767 | There's exists a `.gitpod.yaml` config if you're using [Gitpod](https://www.gitpod.io/). 
768 | 
769 | ## 📜Licence
770 | 
771 | `scrape-google-scholar` repository is licensed under MIT license.
772 | 


--------------------------------------------------------------------------------
/example_usage.py:
--------------------------------------------------------------------------------
 1 | # each function have documentation with an example "usage" script, after function arugments
 2 | from google_scholar_py import CustomGoogleScholarOrganic
 3 | from google_scholar_py import SerpApiGoogleScholarOrganic
 4 | from google_scholar_py import CustomGoogleScholarTopPublicationArticle
 5 | 
 6 | import json
 7 | 
 8 | # TODO: add more examples
 9 | custom_parser_get_organic_results = CustomGoogleScholarOrganic().scrape_google_scholar_organic_results(
10 |     query='blizzard', 
11 |     pagination=False, 
12 |     save_to_csv=False,
13 |     save_to_json=False
14 | )
15 | 
16 | top_publication_citation = CustomGoogleScholarTopPublicationArticle().scrape_google_scholar_top_publication_articles(
17 |     journal_publications_link='https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=TdhLrHqKTh8J.2022',
18 |     pagination=True,
19 |     save_to_csv=False,
20 |     save_to_json=False
21 | )
22 | 
23 | serpapi_parser_get_organic_results = SerpApiGoogleScholarOrganic().scrape_google_scholar_organic_results(
24 |     query='blizzard',
25 |     api_key='your-serpapi-api-key', # https://serpapi.com/manage-api-key
26 |     lang='en',
27 |     pagination=False,
28 | )
29 | 
30 | 
31 | print(json.dumps(custom_parser_get_organic_results, indent=2, ensure_ascii=False))
32 | print(json.dumps(serpapi_parser_get_organic_results, indent=2, ensure_ascii=False))
33 | print(json.dumps(top_publication_citation, indent=2, ensure_ascii=False))


--------------------------------------------------------------------------------
/google_scholar_py/__init__.py:
--------------------------------------------------------------------------------
 1 | from .custom_backend.organic_search import CustomGoogleScholarOrganic
 2 | from .custom_backend.profiles_results import CustomGoogleScholarProfiles
 3 | from .custom_backend.author_info_all_articles import CustomGoogleScholarAuthor
 4 | from .custom_backend.top_mandates_metrics import CustomGoogleScholarTopMandates
 5 | from .custom_backend.top_publications_metrics import CustomGoogleScholarTopPublications
 6 | from .custom_backend.top_publications_article import CustomGoogleScholarTopPublicationArticle
 7 | from .custom_backend.top_publications_article_citation import CustomGoogleScholarTopPublicationArticleCitation
 8 | 
 9 | # serpapi backend
10 | from .serpapi_backend.organic_results import SerpApiGoogleScholarOrganic
11 | from .serpapi_backend.profile_results import SerpApiGoogleScholarProfiles
12 | from .serpapi_backend.organic_cite_results import SerpApiGoogleScholarOrganicCite
13 | from .serpapi_backend.author_results import SerpApiGoogleScholarAuthor


--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/author_info_all_articles.py:
--------------------------------------------------------------------------------
  1 | from selenium import webdriver
  2 | from selenium_stealth import stealth
  3 | from selenium.webdriver.chrome.service import Service
  4 | from webdriver_manager.chrome import ChromeDriverManager
  5 | from selectolax.lexbor import LexborHTMLParser
  6 | from typing import List, Union, Dict
  7 | from pathlib import Path
  8 | 
  9 | 
 10 | class CustomGoogleScholarAuthor:
 11 |     def __init__(self) -> None:
 12 |         pass
 13 | 
 14 | 
 15 |     def scrape_google_scholar_author_data(
 16 |             self,
 17 |             user_id: str, 
 18 |             parse_articles: bool = False,
 19 |             article_pagination: bool = False 
 20 |         ) -> Dict[str, List[Union[str, int, None]]]:
 21 |         '''
 22 |         Extracts data from Google Scholar Author profile page:
 23 |         - Info about the author itself
 24 |         - Co-authors: name, link, affiliation
 25 |         - Author: title, link, authors, publication, cited by, year.
 26 |         - Articles: first 100 if pagination is False, or all if pagination is True. 
 27 | 
 28 |         Arguments:
 29 |         - user_id: str. User ID from Google Scholar profile located in the URL.
 30 |         - parse_articles: True of False. If True, extracts first 100 articles. Default False.
 31 |         - article_pagination: True of False. If True, extracts beyond first 100 articles. 
 32 | 
 33 |         Usage:
 34 |         
 35 |         from google_scholar_py import CustomGoogleScholarAuthor
 36 |         
 37 |         parser = CustomGoogleScholarAuthor()
 38 |         data = parser.scrape_google_scholar_author_data(
 39 |             user_id='nHhtvqkAAAAJ',
 40 |             parse_articles=True,
 41 |             article_pagination=True
 42 |         )
 43 |         print(json.dumps(data, indent=2))
 44 |         
 45 |         print(data['info']) # author info
 46 |         print(data['co-authors'])
 47 | 
 48 |         for article in data['articles']:
 49 |             print(article['title'])
 50 |             print(article['cited_by_count'])
 51 |             ...
 52 |         '''  
 53 | 
 54 |         # selenium stealth
 55 |         options = webdriver.ChromeOptions()
 56 |         options.add_argument('--headless')
 57 |         options.add_argument('--no-sandbox')
 58 |         options.add_argument('--disable-dev-shm-usage')
 59 |         
 60 |         options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
 61 |         options.add_experimental_option('useAutomationExtension', False)
 62 |         
 63 |         service = Service(ChromeDriverManager().install())
 64 |         driver = webdriver.Chrome(service=service, options=options)
 65 |         
 66 |         stealth(driver,
 67 |             languages=['en-US', 'en'],
 68 |             vendor='Google Inc.',
 69 |             platform='Win32',
 70 |             webgl_vendor='Intel Inc.',
 71 |             renderer='Intel Iris OpenGL Engine',
 72 |             fix_hairline=True,
 73 |         )
 74 |         
 75 |         driver.get(f'https://scholar.google.com/citations?user={user_id}&hl=en&gl=us&pagesize=100')
 76 |         parser = LexborHTMLParser(driver.page_source)
 77 |         
 78 |         profile_info = {
 79 |             'info': {},
 80 |             'co-authors': [], 
 81 |             'articles': [][:-1] # [:-1] to not to return the last None element. Weird approach, I know. Revisit in the future.
 82 |         }
 83 |         
 84 |         profile_info['info']['name'] = parser.css_first('#gsc_prf_in').text()
 85 |         profile_info['info']['affiliations'] = parser.css_first('.gsc_prf_ila').text()
 86 |         profile_info['info']['email'] = parser.css_first('#gsc_prf_ivh').text()
 87 |         profile_info['info']['interests'] = [interest.text() for interest in parser.css('#gsc_prf_int .gs_ibl')]
 88 |         
 89 |         for co_author in parser.css('.gsc_rsb_aa'):
 90 |             profile_info['co-authors'].append({
 91 |                 'name': co_author.css_first('.gsc_rsb_a_desc a').text(),
 92 |                 'profile_link': f"https://scholar.google.com{co_author.css_first('.gsc_rsb_a_desc a').attrs['href']}",
 93 |                 'affiliation': co_author.css_first('.gsc_rsb_a_ext').text(),
 94 |             })
 95 |             
 96 |         # extracts only first 100 articles, WITHOUT paginaiton
 97 |         if parse_articles:
 98 |             # TODO: make a separate function to extract articles
 99 |             for index, article in enumerate(parser.css('.gsc_a_tr'), start=1):
100 |                 try:
101 |                     article_title = article.css_first('.gsc_a_at').text()
102 |                 except: article_title = None
103 |                 
104 |                 try: 
105 |                     article_link = f"https://scholar.google.com{article.css_first('.gsc_a_at').attrs['href']}"
106 |                 except: article_link = None
107 |                 
108 |                 try:
109 |                     if ',' in article.css_first('.gsc_a_at+ .gs_gray').text():
110 |                         article_authors: List[str] = article.css_first('.gsc_a_at+ .gs_gray').text().split(', ') # list of authors
111 |                     else: article_authors = article.css_first('.gsc_a_at+ .gs_gray').text()           # single authour
112 |                 except: article_authors = None
113 |                 
114 |                 try:
115 |                     article_publication = article.css_first('.gs_gray+ .gs_gray').text()
116 |                 except: article_publication = None
117 | 
118 |                 try:
119 |                     cited_by_count = article.css_first('.gsc_a_ac').text() 
120 |                 except: cited_by_count = None
121 |                 
122 |                 try: 
123 |                     publication_year = article.css_first('.gsc_a_hc').text()
124 |                 except: publication_year = None
125 | 
126 |                 profile_info['articles'].append({
127 |                     'title': article_title,
128 |                     'link': article_link,
129 |                     'authors': article_authors,
130 |                     'publication': article_publication if article_publication else None,
131 |                     'publication_year': int(publication_year) if publication_year else publication_year or None, # int value or None or empty str
132 |                     'cited_by_count': int(cited_by_count) if cited_by_count else cited_by_count or None # int value or None or empty str
133 |                 }) 
134 |         elif parse_articles is False:
135 |             profile_info.pop('articles')
136 | 
137 |         page_num = 0
138 | 
139 |         # extracts all articles
140 |         if parse_articles and article_pagination:
141 |             while True:
142 |                 driver.get(f'https://scholar.google.com/citations?user={user_id}&hl=en&gl=us&cstart={page_num}&pagesize=100')
143 |                 parser = LexborHTMLParser(driver.page_source)
144 |                 
145 |                 for article in parser.css('.gsc_a_tr'):
146 |                     try:
147 |                         article_title = article.css_first('.gsc_a_at').text()
148 |                     except: article_title = None
149 |                     
150 |                     try: 
151 |                         article_link = f"https://scholar.google.com{article.css_first('.gsc_a_at').attrs['href']}"
152 |                     except: article_link = None
153 |                     
154 |                     try:
155 |                         if ',' in article.css_first('.gsc_a_at+ .gs_gray').text():
156 |                             article_authors: List[str] = article.css_first('.gsc_a_at+ .gs_gray').text().split(', ') # list of authors
157 |                         else: article_authors = article.css_first('.gsc_a_at+ .gs_gray').text()           # single authour
158 |                     except: article_authors = None
159 |                     
160 |                     try:
161 |                         article_publication = article.css_first('.gs_gray+ .gs_gray').text()
162 |                     except: article_publication = None
163 | 
164 |                     try:
165 |                         cited_by_count = article.css_first('.gsc_a_ac').text() 
166 |                     except: cited_by_count = None
167 |                     
168 |                     try: 
169 |                         publication_year = article.css_first('.gsc_a_hc').text()
170 |                     except: publication_year = None
171 | 
172 |                     profile_info['articles'].append({
173 |                         'title': article_title,
174 |                         'link': article_link,
175 |                         'authors': article_authors,
176 |                         'publication': article_publication if article_publication else None,
177 |                         'publication_year': int(publication_year) if publication_year else publication_year or None, # int value or None or empty str
178 |                         'cited_by_count': int(cited_by_count) if cited_by_count else cited_by_count or None # int value or None or empty str
179 |                     })
180 | 
181 |                 if parser.css_first('.gsc_a_e'):
182 |                     break
183 |                 else:
184 |                     page_num += 100  # paginate to the next page
185 |                     
186 |         # remove articles key if user don't want to extract it
187 |         elif article_pagination and parse_articles is False: 
188 |             profile_info.pop('articles')
189 |             
190 |         return profile_info
191 | 


--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/cite_results.py:
--------------------------------------------------------------------------------
 1 | #TODO: support/refactor CITE extraction. This is not yet implemented.
 2 | 
 3 | from parsel import Selector
 4 | import requests
 5 | 
 6 | params = {
 7 |     'q': 'blizzard', # search query
 8 |     'hl': 'en'       # language of the search   
 9 | }
10 | 
11 | 
12 | headers = {
13 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
14 |     'accept-language': 'en-US,en',
15 |     'referer': f"https://scholar.google.com/scholar?hl={params['hl']}&q={params['q']}"
16 | }
17 | 
18 | 
19 | def parsel_get_cite_ids():
20 |     html = requests.get('https://scholar.google.com/scholar', params=params, headers=headers)
21 |     soup = Selector(text=html.text)
22 | 
23 |     # returns a list of publication ID's -> U8bh6Ca9uwQJ
24 |     return soup.css('.gs_r.gs_or.gs_scl::attr(data-cid)').getall()
25 | 
26 | def parsel_scrape_cite_results():
27 |     citations = []
28 | 
29 |     for cite_id in parsel_get_cite_ids():
30 |         html = requests.get(f'https://scholar.google.com/scholar?output=cite&q=info:{cite_id}:scholar.google.com', headers=headers)
31 |         selector = Selector(text=html.text)
32 |          
33 |         # might be issues in the future with extracting data from the table
34 |         if selector.css('#gs_citt').get():
35 |             for result in selector.css('tr'):
36 |                 institution = result.xpath('th/text()').get()
37 |                 citation = result.xpath('td div/text()').get()
38 | 
39 |                 citations.append({'institution': institution, 'citations': citation})
40 | 
41 |     return citations
42 | 


--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/google_scholar_cited_by_public_access_author.py:
--------------------------------------------------------------------------------
 1 | from parsel import Selector
 2 | import requests, json
 3 | 
 4 | #TODO: add cited by graph extraction to author script
 5 | 
 6 | def parsel_scrape_author_cited_by_graph():
 7 |     params = {
 8 |         'user': '_xwYD2sAAAAJ',       # user-id
 9 |         'hl': 'en'                    # language
10 |     }
11 | 
12 |     headers = {
13 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
14 |     }
15 | 
16 |     data = {
17 |         'cited_by': [],
18 |         'graph': []
19 |     }
20 | 
21 |     html = requests.get('https://scholar.google.com/citations', params=params, headers=headers, timeout=30)
22 |     selector = Selector(text=html.text)
23 | 
24 |     since_year = selector.css('.gsc_rsb_sth~ .gsc_rsb_sth+ .gsc_rsb_sth::text').get().lower().replace(' ', '_')
25 | 
26 |     for cited_by_public_access in selector.css('.gsc_rsb'):
27 |         data['cited_by'].append({
28 |             'citations_all': cited_by_public_access.css('tr:nth-child(1) .gsc_rsb_sc1+ .gsc_rsb_std::text').get(),
29 |             f'citations_since_{since_year}': cited_by_public_access.css('tr:nth-child(1) .gsc_rsb_std+ .gsc_rsb_std::text').get(),
30 |             'h_index_all': cited_by_public_access.css('tr:nth-child(2) .gsc_rsb_sc1+ .gsc_rsb_std::text').get(),
31 |             f'h_index_since_{since_year}': cited_by_public_access.css('tr:nth-child(2) .gsc_rsb_std+ .gsc_rsb_std::text').get(),
32 |             'i10_index_all': cited_by_public_access.css('tr~ tr+ tr .gsc_rsb_sc1+ .gsc_rsb_std::text').get(),
33 |             f'i10_index_since_{since_year}': cited_by_public_access.css('tr~ tr+ tr .gsc_rsb_std+ .gsc_rsb_std::text').get(),
34 |             'articles': {
35 |                     'available': int(cited_by_public_access.css('.gsc_rsb_m_a:nth-child(1) span::text').get().split(' ')[0]), # to get only digit value
36 |                     'not_available': int(cited_by_public_access.css('.gsc_rsb_m_na div::text').get().split(' ')[0]),          # to get only digit value
37 |                 },
38 |             'articles_link': f"https://scholar.google.com{cited_by_public_access.css('#gsc_lwp_mndt_lnk::attr(href)').get()}"
39 |         })
40 |     
41 |     for graph_year, graph_yaer_value in zip(selector.css('.gsc_g_t::text'), selector.css('.gsc_g_al::text')):
42 |         data['graph'].append({
43 |             'year': graph_year.get(),
44 |             'value': int(graph_yaer_value.get())
45 |         })
46 | 
47 | if __name__ == '__main__':
48 |     print(json.dumps(parsel_scrape_author_cited_by_graph(), indent=2, ensure_ascii=False))
49 | 


--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/organic_search.py:
--------------------------------------------------------------------------------
  1 | from selenium import webdriver
  2 | from selenium_stealth import stealth
  3 | from selenium.webdriver.chrome.service import Service
  4 | from webdriver_manager.chrome import ChromeDriverManager
  5 | from selectolax.lexbor import LexborHTMLParser
  6 | from typing import List, Dict, Callable
  7 | import time, random, re
  8 | import pandas as pd
  9 | from pathlib import Path
 10 | 
 11 | 
 12 | class CustomGoogleScholarOrganic:
 13 |     def __init__(self) -> None:
 14 |         pass
 15 |     
 16 | 
 17 |     def parse(self, parser: Callable, organic_results_data: Callable):
 18 |         '''
 19 |         Arugments:
 20 |         - parser:  Lexbor parser from scrape_google_scholar_organic_results() function.
 21 |         - organic_results_data: List to append data to. List origin location is scrape_google_scholar_organic_results() function. Line 104.
 22 |         
 23 |         This function parses data from Google Scholar Organic results and appends data to a List.
 24 |         
 25 |         It's used by scrape_google_scholar_organic_results().
 26 |         
 27 |         It returns nothing as it appends data to `organic_results_data`, 
 28 |         which appends it to `organic_results_data` List in the scrape_google_scholar_organic_results() function.
 29 |         '''
 30 |         
 31 |         for result in parser.css('.gs_r.gs_or.gs_scl'):
 32 |             try:
 33 |                 title: str = result.css_first('.gs_rt').text()
 34 |             except: title = None
 35 | 
 36 |             try:
 37 |                 title_link: str = result.css_first('.gs_rt a').attrs['href']
 38 |             except: title_link = None
 39 | 
 40 |             try:
 41 |                 publication_info: str = result.css_first('.gs_a').text()
 42 |             except: publication_info = None
 43 | 
 44 |             try:
 45 |                 snippet: str = result.css_first('.gs_rs').text()
 46 |             except: snippet = None
 47 | 
 48 |             try:
 49 |                 # if Cited by is present in inline links, it will be extracted
 50 |                 cited_by_link = ''.join([link.attrs['href'] for link in result.css('.gs_ri .gs_fl a') if 'Cited by' in link.text()])
 51 |             except: cited_by_link = None
 52 |             
 53 |             try:
 54 |                 # if Cited by is present in inline links, it will be extracted and type cast it to integer
 55 |                 cited_by_count = int(''.join([re.search(r'\d+', link.text()).group() for link in result.css('.gs_ri .gs_fl a') if 'Cited by' in link.text()]))
 56 |             except: cited_by_count = None
 57 |             
 58 |             try:
 59 |                 pdf_file: str = result.css_first('.gs_or_ggsm a').attrs['href']
 60 |             except: pdf_file = None
 61 | 
 62 |             organic_results_data.append({
 63 |                 'title': title,
 64 |                 'title_link': title_link,
 65 |                 'publication_info': publication_info,
 66 |                 'snippet': snippet if snippet else None,
 67 |                 'cited_by_link': f'https://scholar.google.com{cited_by_link}' if cited_by_link else None,
 68 |                 'cited_by_count': cited_by_count if cited_by_count else None,
 69 |                 'pdf_file': pdf_file
 70 |             })
 71 | 
 72 |     #TODO: add lang support. https://serpapi.com/google-languages
 73 |     def scrape_google_scholar_organic_results(
 74 |             self,
 75 |             query: str,
 76 |             pagination: bool = False,
 77 |             save_to_csv: bool = False, 
 78 |             save_to_json: bool = False
 79 |         ) -> List[Dict[str, str]]:
 80 |         '''
 81 |         Extracts data from Google Scholar Organic resutls page:
 82 |         - title: str
 83 |         - title_link: str
 84 |         - publication_info: str 
 85 |         - snippet: str
 86 |         - cited_by_link: str 
 87 |         - cited_by_count: int
 88 |         - pdf_file: str
 89 |         
 90 |         Arguments:
 91 |         - query: str. Search query. 
 92 |         - pagination: bool. Enables or disables pagination. Default is False.
 93 |         - save_to_csv: bool. True of False. Default is False.
 94 |         - save_to_json: bool. True of False. Default is False.
 95 |         
 96 |         Usage:
 97 |         
 98 |         from google_scholar_py import CustomGoogleScholarOrganic
 99 | 
100 |         parser = CustomGoogleScholarOrganic()
101 |         data = parser.scrape_google_scholar_organic_results(
102 |             query='blizzard',
103 |             pagination=False,
104 |             save_to_csv=True
105 |         )
106 |         
107 |         for organic_result in data:
108 |             print(organic_result['title'])
109 |             print(organic_result['pdf_file'])
110 |         '''
111 |         
112 |         # selenium stealth
113 |         options = webdriver.ChromeOptions()
114 |         options.add_argument('--headless')
115 |         options.add_argument('--no-sandbox')
116 |         options.add_argument('--disable-dev-shm-usage')
117 |         
118 |         options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
119 |         options.add_experimental_option('useAutomationExtension', False) 
120 |         
121 |         service = Service(ChromeDriverManager().install())
122 |         driver = webdriver.Chrome(service=service, options=options)
123 |         
124 |         stealth(driver,
125 |             languages=['en-US', 'en'],
126 |             vendor='Google Inc.',
127 |             platform='Win32',
128 |             webgl_vendor='Intel Inc.',
129 |             renderer='Intel Iris OpenGL Engine',
130 |             fix_hairline=True,
131 |         )
132 |         
133 |         page_num = 0
134 |         organic_results_data = []
135 |     
136 |         # parse all pages
137 |         if pagination:
138 |             while True:
139 |                 # parse all pages
140 |                 driver.get(f'https://scholar.google.com/scholar?q={query}&hl=en&gl=us&start={page_num}')
141 |                 parser = LexborHTMLParser(driver.page_source)
142 |                 
143 |                 self.parse(parser=parser, organic_results_data=organic_results_data)
144 |                 
145 |                 # pagination
146 |                 if parser.css_first('.gs_ico_nav_next'):  # checks for the "Next" page button
147 |                     page_num += 10                        # paginate to the next page
148 |                     time.sleep(random.randint(1, 3))      # sleep between paginations
149 |                 else:
150 |                     break
151 |         else:
152 |             # parse first page only
153 |             driver.get(f'https://scholar.google.com/scholar?q={query}&hl=en&gl=us&start={page_num}')
154 |             parser = LexborHTMLParser(driver.page_source)
155 |         
156 |             self.parse(parser=parser, organic_results_data=organic_results_data)
157 |             
158 |         if save_to_csv:
159 |             pd.DataFrame(data=organic_results_data).to_csv('google_scholar_organic_results_data.csv', 
160 |                                                             index=False, encoding='utf-8')
161 |         if save_to_json:
162 |             pd.DataFrame(data=organic_results_data).to_json('google_scholar_organic_results_data.json', 
163 |                                                             orient='records')
164 |         driver.quit()
165 |         
166 |         return organic_results_data
167 | 


--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/profiles_results.py:
--------------------------------------------------------------------------------
  1 | from selenium import webdriver
  2 | from selenium_stealth import stealth
  3 | from selenium.webdriver.chrome.service import Service
  4 | from webdriver_manager.chrome import ChromeDriverManager
  5 | from selectolax.lexbor import LexborHTMLParser
  6 | from parsel import Selector
  7 | from typing import List, Dict, Callable
  8 | import time, random, re
  9 | import pandas as pd
 10 | from pathlib import Path
 11 | 
 12 | class CustomGoogleScholarProfiles:
 13 |     def __init__(self) -> None:
 14 |         pass
 15 |     
 16 | 
 17 |     def parse(self, parser: Callable, profile_results_data: Callable):
 18 |         '''
 19 |         Arugments:
 20 |         - parser: Callable. Lexbor parser from scrape_google_scholar_profiles() function.
 21 |         - profile_results_data: Callable. List to append data to. List origin location is scrape_google_scholar_profiles() function. Line 100.
 22 |         
 23 |         This function parses data from Google Scholar Organic results and appends data to a List.
 24 |         
 25 |         It's used by scrape_google_scholar_profiles().
 26 |         
 27 |         It returns nothing as it appends data to `profile_results_data`, 
 28 |         which appends it to `profile_results_data` List in the scrape_google_scholar_profiles() function.
 29 |         '''
 30 |         
 31 |         for profile in parser.css('.gs_ai_chpr'):
 32 |             try:
 33 |                 name: str = profile.css_first('.gs_ai_name a').text()
 34 |             except: name = None
 35 |             
 36 |             try:
 37 |                 link: str = f'https://scholar.google.com{profile.css_first(".gs_ai_name a").attrs["href"]}'
 38 |             except: link = None
 39 |             
 40 |             try:
 41 |                 affiliations: str = profile.css_first('.gs_ai_aff').text()
 42 |             except: affiliations = None
 43 |             
 44 |             try:
 45 |                 interests: list = [interest.text() for interest in profile.css('.gs_ai_one_int')]
 46 |             except: interests = None
 47 |             
 48 |             try:
 49 |                 email: str = profile.css_first('.gs_ai_eml').text()
 50 |             except: email = None
 51 |             
 52 |             try:
 53 |                 cited_by: int = re.search(r'\d+', profile.css_first('.gs_ai_cby').text()).group() # Cited by 17143 -> 17143
 54 |             except: cited_by = None
 55 | 
 56 |             profile_results_data.append({
 57 |                 'name': name,
 58 |                 'link': link,
 59 |                 'affiliations': affiliations,
 60 |                 'interests': interests if interests else None,
 61 |                 'email': email if email else None,
 62 |                 'cited_by_count': int(cited_by) if cited_by else None
 63 |             })
 64 | 
 65 | 
 66 |     def scrape_google_scholar_profiles(
 67 |             self, 
 68 |             query: str, 
 69 |             pagination: bool = False, 
 70 |             save_to_csv: bool = False, 
 71 |             save_to_json: bool = False
 72 |         ) -> List[Dict[str, str]]:
 73 |         '''
 74 |         Extracts data from Google Scholar Organic Profile resutls page:
 75 |         - name: str
 76 |         - link: str
 77 |         - affiliations: str 
 78 |         - email: str
 79 |         - cited_by_count: int
 80 |         
 81 |         Arguments:
 82 |         - query: str. Search query. 
 83 |         - pagination: bool. Enables or disables pagination. Default is False.
 84 |         - save_to_csv: bool. True of False. Default is False.
 85 |         - save_to_json: bool. True of False. Default is False.
 86 |         
 87 |         Usage:
 88 |         
 89 |         from google_scholar_py import CustomGoogleScholarProfiles
 90 |         
 91 |         parser = CustomGoogleScholarProfiles()
 92 |         data = parser.scrape_google_scholar_profiles(
 93 |             query='blizzard',
 94 |             pagination=False,
 95 |             save_to_csv=True
 96 |         )
 97 |         print(json.dumps(data, indent=2))
 98 |         
 99 |         for profile_results in data:
100 |             print(profile_results['name'])
101 |             print(profile_results['email'])
102 |         '''
103 |         
104 |         # selenium stealth
105 |         options = webdriver.ChromeOptions()
106 |         options.add_argument('--headless')
107 |         options.add_argument('--no-sandbox')
108 |         options.add_argument('--disable-dev-shm-usage')
109 |         
110 |         options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
111 |         options.add_experimental_option('useAutomationExtension', False)
112 |         
113 |         service = Service(ChromeDriverManager().install())
114 |         driver = webdriver.Chrome(service=service, options=options)
115 |         
116 |         stealth(driver,
117 |             languages=['en-US', 'en'],
118 |             vendor='Google Inc.',
119 |             platform='Win32',
120 |             webgl_vendor='Intel Inc.',
121 |             renderer='Intel Iris OpenGL Engine',
122 |             fix_hairline=True
123 |         )
124 |         
125 |         params = {} # stores next page token to add to URL later
126 |         page_num = 0
127 |         profile_results_data = []
128 | 
129 |         if pagination:
130 |             while True:
131 |                 # if next page token appears, add to to URL as URL parameter
132 |                 # otherwise, do a search without next page token parameter (Line: 101)
133 |                 if params.get('after_author') is None:
134 |                     driver.get(f'https://scholar.google.com/citations?view_op=search_authors&mauthors={query}&hl=en&astart={page_num}')
135 |                     parser = LexborHTMLParser(driver.page_source)
136 |                     
137 |                     #TODO: replace parsel with selectolax completely
138 |                     selector = Selector(text=driver.page_source)  # to check next page token
139 |                 
140 |                     self.parse(parser=parser, profile_results_data=profile_results_data)
141 |                     
142 |                     # check if the next arrow button is active by checking 'onclick' attribute
143 |                     if selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get():
144 |                         # extracting next page token and passing to 'after_author' query URL parameter
145 |                         params['after_author'] = re.search(r'after_author\\x3d(.*)\\x26', str(selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get())).group(1) # -> XB0HAMS9__8J
146 |                         page_num += 10                        # paginate to the next page
147 |                         time.sleep(random.randint(1, 3))      # sleep between paginations
148 |                     else:
149 |                         break
150 |                 else:
151 |                     driver.get(f'https://scholar.google.com/citations?view_op=search_authors&mauthors={query}&hl=en&astart={page_num}&after_author={params["after_author"]}')
152 |                     parser = LexborHTMLParser(driver.page_source)
153 |                     
154 |                     #TODO: replace parsel with selectolax completely
155 |                     selector = Selector(text=driver.page_source) # to check next page token
156 |                 
157 |                     self.parse(parser=parser, profile_results_data=profile_results_data)
158 |                     
159 |                     if selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get():
160 |                         # extracting next page token and passing to 'after_author' query URL parameter
161 |                         params['after_author'] = re.search(r'after_author\\x3d(.*)\\x26', str(selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get())).group(1) # -> XB0HAMS9__8J
162 |                         page_num += 10                        # paginate to the next page
163 |                         time.sleep(random.randint(1, 3))      # sleep between paginations
164 |                     else:
165 |                         break
166 |         else:
167 |             # parse single, first page
168 |             driver.get(f'https://scholar.google.com/citations?view_op=search_authors&mauthors={query}&hl=en&astart={page_num}')
169 |             parser = LexborHTMLParser(driver.page_source)
170 |         
171 |             self.parse(parser=parser, profile_results_data=profile_results_data)
172 | 
173 |         driver.quit()
174 |         
175 |         if save_to_csv:
176 |             pd.DataFrame(data=profile_results_data).to_csv('google_scholar_profile_results_data.csv', 
177 |                                                             index=False, encoding='utf-8')
178 |         if save_to_json:
179 |             pd.DataFrame(data=profile_results_data).to_json('google_scholar_profile_results_data.json', 
180 |                                                             orient='records')
181 |         
182 |         return profile_results_data       
183 | 


--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/top_mandates_metrics.py:
--------------------------------------------------------------------------------
  1 | from selenium import webdriver
  2 | from selenium_stealth import stealth
  3 | from selenium.webdriver.chrome.service import Service
  4 | from webdriver_manager.chrome import ChromeDriverManager
  5 | from selectolax.lexbor import LexborHTMLParser
  6 | from typing import List, Dict, Callable
  7 | import pandas as pd
  8 | import re
  9 | 
 10 | 
 11 | class CustomGoogleScholarTopMandates:
 12 |     def __init__(self) -> None:
 13 |         pass
 14 |     
 15 | 
 16 |     def parse(self, parser: Callable, top_mandates_data: Callable):
 17 |         '''
 18 |         Arugments:
 19 |         - parser: Callable. Lexbor parser from google_scholar_top_mandates_metrics() function.
 20 |         - top_mandates_data: Callable. List to append data to. List origin location is google_scholar_top_mandates_metrics() function. Line 100.
 21 |         
 22 |         This function parses data from Google Scholar Organic results and appends data to a List.
 23 |         
 24 |         It's used by google_scholar_top_mandates_metrics().
 25 |         
 26 |         It returns nothing as it appends data to `top_mandates_data`, 
 27 |         which appends it to `top_mandates_data` List in the google_scholar_top_mandates_metrics() function.
 28 |         '''
 29 |         
 30 |         for table in parser.css('tr'):
 31 |             try:
 32 |                 # removes "... - cached"
 33 |                 # https://regex101.com/r/EfljZp/1
 34 |                 funder: str = re.sub(r'(\s\s-.*)', '', table.css_first('td.gsc_mlt_t').text()) 
 35 |             except: funder = None
 36 |             
 37 |             try:
 38 |                 link: str = table.css_first('.gsc_mlt_t a').attrs['href']
 39 |             except: link = None
 40 |             
 41 |             try: 
 42 |                 two_eighteen: int = table.css_first('td:nth-child(4)').text()
 43 |                 if '-' in two_eighteen:
 44 |                     two_eighteen = None
 45 |             except: two_eighteen = None
 46 |             
 47 |             try: 
 48 |                 twenty_twenty: str = table.css_first('td:nth-child(5)').text()
 49 |                 if '-' in twenty_twenty:
 50 |                     twenty_twenty = None
 51 |             except: twenty_twenty = None
 52 |             
 53 |             try: 
 54 |                 twenty_one: str = table.css_first('td:nth-child(6)').text()
 55 |                 if '-' in twenty_one: # missing % in the table
 56 |                     twenty_one = None
 57 |             except: twenty_one = None
 58 |             
 59 |             #TODO: fix selector to extract "overall" data
 60 |             # `td:nth-child(6)` is not working also
 61 |             # try:
 62 |             #     overall: str = table.css('.gsc_mlt_n.gsc_mlt_bd').text()
 63 |             # except: overall = None
 64 |         
 65 |             top_mandates_data.append({
 66 |                 'funder': funder,
 67 |                 'link': link,
 68 |                 '2019': two_eighteen,
 69 |                 '2020': twenty_twenty,
 70 |                 '2021': twenty_one,
 71 |                 # 'overall': overall
 72 |             })
 73 | 
 74 | 
 75 |     def scrape_top_mandates_metrics(
 76 |             self,
 77 |             save_to_csv: bool = False, 
 78 |             save_to_json: bool = False,
 79 |             lang: str = 'en'
 80 |         ) -> List[Dict[str, str]]:
 81 |         #TODO add argument to support other languages https://serpapi.com/google-languages
 82 | 
 83 |         '''
 84 |         Results comes from: https://scholar.google.com/citations?view_op=mandates_leaderboard
 85 |         
 86 |         Returns:
 87 |         - funder: str
 88 |         - link: str
 89 |         - 2019: str
 90 |         - 2020: str
 91 |         - 2021: str
 92 |         - overall: str (not extracted at the moment, selector needs to be fixed)
 93 |         
 94 |         Arguments: 
 95 |         - save_to_csv: True of False. Saves data to CSV file. Default is False. 
 96 |         - save_to_json: True of False. Saves data to JSON file. Default is False.
 97 |         - lang: str. Language. Defaults to English ('en'). For now, need to be checked yourself. Other languages: https://serpapi.com/google-languages
 98 |         
 99 |         Usage:
100 |         
101 |         from google_scholar_py import CustomGoogleScholarTopMandates
102 |         
103 |         parser = CustomGoogleScholarTopMandates()
104 |         data = parser.scrape_top_mandates_metrics(
105 |             save_to_csv=True,
106 |             save_to_json=False
107 |         )
108 |         print(json.dumps(data, indent=2))
109 | 
110 |         for result in data:
111 |             print(result['funder'])
112 |             ...
113 |         '''
114 |         
115 |         # selenium stealth
116 |         options = webdriver.ChromeOptions()
117 |         options.add_argument('--headless')
118 |         options.add_argument('--no-sandbox')
119 |         options.add_argument('--disable-dev-shm-usage')
120 |         
121 |         options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
122 |         options.add_experimental_option('useAutomationExtension', False)
123 |         
124 |         service = Service(ChromeDriverManager().install())
125 |         driver = webdriver.Chrome(service=service, options=options)
126 |             
127 |         stealth(driver,
128 |             languages=['en-US', 'en'],
129 |             vendor='Google Inc.',
130 |             platform='Win32',
131 |             webgl_vendor='Intel Inc.',
132 |             renderer='Intel Iris OpenGL Engine',
133 |             fix_hairline=True
134 |         )
135 |         
136 |         top_mandates_data: list = []
137 | 
138 |         driver.get(f'https://scholar.google.com/citations?view_op=mandates_leaderboard&hl={lang}')
139 |         parser = LexborHTMLParser(driver.page_source)
140 |         self.parse(parser=parser, top_mandates_data=top_mandates_data)
141 |         
142 |         if save_to_csv:
143 |             pd.DataFrame(data=top_mandates_data).to_csv('google_scholar_top_mandates_data.csv', 
144 |                                                         index=False, encoding='utf-8')
145 |             
146 |         if save_to_json:
147 |             pd.DataFrame(data=top_mandates_data).to_json('google_scholar_top_mandates_data.json', 
148 |                                                         orient='records')
149 |             
150 |         driver.quit()
151 |         return top_mandates_data
152 | 


--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/top_publications_article.py:
--------------------------------------------------------------------------------
  1 | from selenium import webdriver
  2 | from selenium_stealth import stealth
  3 | from selenium.webdriver.chrome.service import Service
  4 | from webdriver_manager.chrome import ChromeDriverManager
  5 | from selectolax.lexbor import LexborHTMLParser
  6 | from typing import List, Dict, Callable, Union
  7 | import pandas as pd
  8 | import time, random
  9 | 
 10 | class CustomGoogleScholarTopPublicationArticle:
 11 |     def __init__(self) -> None:
 12 |         pass
 13 |     
 14 | 
 15 |     def parse(self, parser: Callable, publication_citation_data: Callable):
 16 |         '''
 17 |         Arugments:
 18 |         - parser:  Lexbor parser from scrape_google_scholar_top_publication_articles() function.
 19 |         - publication_citation_data: List to append data to. List origin location is scrape_google_scholar_top_publication_articles() function. Line 104.
 20 |         
 21 |         This function parses data from Google Scholar Organic results and appends data to a List.
 22 |         
 23 |         It's used by scrape_google_scholar_top_publication_articles().
 24 |         '''
 25 |         
 26 |         # selects the whole table without the first row (header row) 
 27 |         for result in parser.css('tr:not(:first-child)'):
 28 |             try:
 29 |                 title: str = result.css_first('.gsc_mp_anchor_lrge').text()
 30 |             except: title = None
 31 | 
 32 |             try:
 33 |                 title_link: str = f"https://scholar.google.com{result.css_first('a.gsc_mp_anchor_lrge').attrs['href']}"
 34 |             except: title_link = None
 35 | 
 36 |             try:
 37 |                 authors: list = result.css_first('.gsc_mpat_ttl+ .gs_gray').text().split(', ')
 38 |             except: authors = None
 39 |             
 40 |             try:
 41 |                 published_at: str = result.css_first('.gs_gray+ .gs_gray').text()
 42 |             except: published_at = None
 43 |             
 44 |             try:
 45 |                 cited_by_count: int = int(result.css_first('.gsc_mpat_c .gsc_mp_anchor').text())
 46 |             except: cited_by_count = None
 47 | 
 48 |             try:
 49 |                 cited_by_link: str = f"https://scholar.google.com{result.css_first('.gsc_mpat_c a.gsc_mp_anchor').attrs['href']}"
 50 |             except: cited_by_link = None
 51 |             
 52 |             try:
 53 |                 year: int = int(result.css_first('.gsc_mp_anchor.gs_nph').text())
 54 |             except: year = None
 55 |             
 56 |             
 57 |             publication_citation_data.append({
 58 |                 'title': title,
 59 |                 'title_link': title_link,
 60 |                 'authors': authors,
 61 |                 'cited_by_link': cited_by_link,
 62 |                 'cited_by_count': cited_by_count,
 63 |                 'year': year,
 64 |                 'published_at': published_at
 65 |             })
 66 | 
 67 |     #TODO: add lang support. https://serpapi.com/google-languages
 68 |     def scrape_google_scholar_top_publication_articles(
 69 |             self,
 70 |             journal_publications_link: str,
 71 |             pagination: bool = False,
 72 |             save_to_csv: bool = False, 
 73 |             save_to_json: bool = False
 74 |         ) -> List[Dict[str, Union[str, List[str], int]]]:
 75 |         '''
 76 |         Results comes from (for example): https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=9oNLl9DgMnQJ.2022
 77 |         
 78 |         Extracts data from Google Scholar Top Publication Metrics Citation page:
 79 |         - title: str
 80 |         - title_link: str
 81 |         - authors: list 
 82 |         - cited_by_count: int
 83 |         - cited_by_link: str 
 84 |         - year: int
 85 |         - published_at: str
 86 |     
 87 |         Arguments:
 88 |         - journal_publications_link: str. Search query. 
 89 |         - pagination: bool. Enables or disables pagination. Default is False.
 90 |         - save_to_csv: bool. True of False. Default is False.
 91 |         - save_to_json: bool. True of False. Default is False.
 92 |         
 93 |         Usage:
 94 |         
 95 |         from google_scholar_py import CustomGoogleScholarTopPublicationArticle
 96 | 
 97 |         parser = CustomGoogleScholarTopPublicationArticle()
 98 |         data = parser.scrape_google_scholar_top_publication_articles(
 99 |             journal_publications_link='https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=9oNLl9DgMnQJ.2022', # or link variable that stores the link
100 |             pagination=False,
101 |             save_to_csv=True
102 |         )
103 |         
104 |         for citations in data:
105 |             print(citations['title'], citations['year'], citations['published_at'], sep='\\n')
106 |         '''
107 |         
108 |         # selenium stealth
109 |         options = webdriver.ChromeOptions()
110 |         options.add_argument('--headless')
111 |         options.add_argument('--no-sandbox')
112 |         options.add_argument('--disable-dev-shm-usage')
113 |         
114 |         options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
115 |         options.add_experimental_option('useAutomationExtension', False) 
116 |         
117 |         service = Service(ChromeDriverManager().install())
118 |         driver = webdriver.Chrome(service=service, options=options)
119 |         
120 |         stealth(driver,
121 |             languages=['en-US', 'en'],
122 |             vendor='Google Inc.',
123 |             platform='Win32',
124 |             webgl_vendor='Intel Inc.',
125 |             renderer='Intel Iris OpenGL Engine',
126 |             fix_hairline=True,
127 |         )
128 |         
129 |         page_num = 0
130 |         publication_citation_data = []
131 |     
132 |         # parse all pages
133 |         if pagination:
134 |             while True:
135 |                 driver.get(journal_publications_link + f'&cstart={page_num}') # 'cstart' paramter is for pagination
136 |                 parser = LexborHTMLParser(driver.page_source)
137 |                 
138 |                 self.parse(parser=parser, publication_citation_data=publication_citation_data)
139 |                 
140 |                 # pagination
141 |                 if parser.css_first('.gsc_pgn_pnx:not([disabled])'):  # checks if the "Next" page button selector is not disabled
142 |                     page_num += 20                                    # paginate to the next page
143 |                     time.sleep(random.randint(1, 3))                  # sleep between paginations
144 |                 else:
145 |                     break
146 |         else:
147 |             # parse first page only
148 |             driver.get(journal_publications_link)
149 |             parser = LexborHTMLParser(driver.page_source)
150 |         
151 |             self.parse(parser=parser, publication_citation_data=publication_citation_data)
152 |             
153 |         if save_to_csv:
154 |             pd.DataFrame(data=publication_citation_data).to_csv('google_scholar_top_publication_citations.csv', 
155 |                                                             index=False, encoding='utf-8')
156 |         if save_to_json:
157 |             pd.DataFrame(data=publication_citation_data).to_json('google_scholar_top_publication_citations.json', 
158 |                                                             orient='records')
159 |         driver.quit()
160 |         
161 |         return publication_citation_data
162 | 


--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/top_publications_article_citation.py:
--------------------------------------------------------------------------------
  1 | from selenium import webdriver
  2 | from selenium_stealth import stealth
  3 | from selenium.webdriver.chrome.service import Service
  4 | from webdriver_manager.chrome import ChromeDriverManager
  5 | from selectolax.lexbor import LexborHTMLParser
  6 | from typing import List, Dict, Callable, Union
  7 | import pandas as pd
  8 | import time, random
  9 | 
 10 | class CustomGoogleScholarTopPublicationArticleCitation:
 11 |     def __init__(self) -> None:
 12 |         pass
 13 |     
 14 | 
 15 |     def parse(self, parser: Callable, publication_citation_data: Callable):
 16 |         '''
 17 |         Arugments:
 18 |         - parser:  Lexbor parser from scrape_google_scholar_top_publication_article_citations() function.
 19 |         - publication_citation_data: List to append data to. List origin location is scrape_google_scholar_top_publication_article_citations() function. Line 104.
 20 |         
 21 |         This function parses data from Google Scholar Organic results and appends data to a List.
 22 |         
 23 |         It's used by scrape_google_scholar_top_publication_article_citations().
 24 |         '''
 25 |         
 26 |         # selects the whole table without the first row (header row) 
 27 |         for result in parser.css('tr:not(:first-child)'):
 28 |             try:
 29 |                 title: str = result.css_first('.gsc_mp_anchor_lrge').text()
 30 |             except: title = None
 31 | 
 32 |             try:
 33 |                 title_link: str = f"https://scholar.google.com{result.css_first('a.gsc_mp_anchor_lrge').attrs['href']}"
 34 |             except: title_link = None
 35 | 
 36 |             try:
 37 |                 authors: list = result.css_first('.gsc_mpat_ttl+ .gs_gray').text().split(', ')
 38 |             except: authors = None
 39 |             
 40 |             try:
 41 |                 published_at: str = result.css_first('.gs_gray+ .gs_gray').text()
 42 |             except: published_at = None
 43 |             
 44 |             try:
 45 |                 year: int = int(result.css_first('.gsc_mp_anchor.gs_nph').text())
 46 |             except: year = None
 47 |             
 48 |             
 49 |             publication_citation_data.append({
 50 |                 'title': title,
 51 |                 'title_link': title_link,
 52 |                 'authors': authors,
 53 |                 'year': year,   
 54 |                 'published_at': published_at
 55 |             })
 56 | 
 57 |     #TODO: add lang support. https://serpapi.com/google-languages
 58 |     def scrape_google_scholar_top_publication_article_citations(
 59 |             self,
 60 |             journal_publications_link: str,
 61 |             pagination: bool = False,
 62 |             save_to_csv: bool = False, 
 63 |             save_to_json: bool = False
 64 |         ) -> List[Dict[str, Union[str, List[str], int]]]:
 65 |         '''
 66 |         Results comes from (for example): https://scholar.google.com/citations?hl=en&venue=k6hd2dUel5kJ.2022&vq=en&view_op=hcore_citedby&hcore_pos=18
 67 |         
 68 |         Extracts data from Google Scholar Top Publication Metrics Citation page:
 69 |         - title: str
 70 |         - title_link: str
 71 |         - authors: list 
 72 |         - published_at: str
 73 |         - year: int
 74 |     
 75 |         Arguments:
 76 |         - journal_publications_link: str. Search query. 
 77 |         - pagination: bool. Enables or disables pagination. Default is False.
 78 |         - save_to_csv: bool. True of False. Default is False.
 79 |         - save_to_json: bool. True of False. Default is False.
 80 |         
 81 |         Usage:
 82 |         
 83 |         from google_scholar_py import CustomGoogleScholarTopPublicationArticleCitation
 84 |         import json 
 85 |         
 86 |         parser = CustomGoogleScholarTopPublicationArticleCitation()
 87 |         data = parser.scrape_google_scholar_top_publication_article_citations(
 88 |             journal_publications_link='https://scholar.google.com/citations?hl=en&venue=k6hd2dUel5kJ.2022&vq=en&view_op=hcore_citedby&hcore_pos=18', # or link variable that stores the link
 89 |             pagination=False,
 90 |             save_to_csv=True
 91 |         )
 92 |         print(json.dumps(data, indent=2))
 93 |         
 94 |         for citations in data:
 95 |             print(citations['title'], citations['year'], citations['published_at'], sep='\\n')
 96 |         '''
 97 |         
 98 |         # selenium stealth
 99 |         options = webdriver.ChromeOptions()
100 |         options.add_argument('--headless')
101 |         options.add_argument('--no-sandbox')
102 |         options.add_argument('--disable-dev-shm-usage')
103 |         
104 |         options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
105 |         options.add_experimental_option('useAutomationExtension', False) 
106 |         
107 |         service = Service(ChromeDriverManager().install())
108 |         driver = webdriver.Chrome(service=service, options=options)
109 |         
110 |         stealth(driver,
111 |             languages=['en-US', 'en'],
112 |             vendor='Google Inc.',
113 |             platform='Win32',
114 |             webgl_vendor='Intel Inc.',
115 |             renderer='Intel Iris OpenGL Engine',
116 |             fix_hairline=True,
117 |         )
118 |         
119 |         page_num = 0
120 |         publication_citation_data = []
121 |     
122 |         # parse all pages
123 |         if pagination:
124 |             while True:
125 |                 driver.get(journal_publications_link + f'&cstart={page_num}') # 'cstart' paramter is for pagination
126 |                 parser = LexborHTMLParser(driver.page_source)
127 |                 
128 |                 self.parse(parser=parser, publication_citation_data=publication_citation_data)
129 |                 
130 |                 # pagination
131 |                 if parser.css_first('.gsc_pgn_pnx:not([disabled])'):  # checks if the "Next" page button selector is not disabled
132 |                     page_num += 20                                    # paginate to the next page
133 |                     time.sleep(random.randint(1, 3))                  # sleep between paginations
134 |                 else:
135 |                     break
136 |         else:
137 |             # parse first page only
138 |             driver.get(journal_publications_link)
139 |             parser = LexborHTMLParser(driver.page_source)
140 |         
141 |             self.parse(parser=parser, publication_citation_data=publication_citation_data)
142 |             
143 |         if save_to_csv:
144 |             pd.DataFrame(data=publication_citation_data).to_csv('google_scholar_top_publication_citations.csv', 
145 |                                                             index=False, encoding='utf-8')
146 |         if save_to_json:
147 |             pd.DataFrame(data=publication_citation_data).to_json('google_scholar_top_publication_citations.json', 
148 |                                                             orient='records')
149 |         driver.quit()
150 |         
151 |         return publication_citation_data
152 | 


--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/top_publications_metrics.py:
--------------------------------------------------------------------------------
  1 | from selenium import webdriver
  2 | from selenium_stealth import stealth
  3 | from selenium.webdriver.chrome.service import Service
  4 | from webdriver_manager.chrome import ChromeDriverManager
  5 | from selectolax.lexbor import LexborHTMLParser
  6 | from typing import List, Dict, Callable, Union
  7 | import pandas as pd
  8 | 
  9 | class CustomGoogleScholarTopPublications:
 10 |     def __init__(self) -> None:
 11 |         pass
 12 | 
 13 | 
 14 |     def parse(self, parser: Callable, top_publications_data: Callable):
 15 |         '''
 16 |         Arugments:
 17 |         - parser: Callable. Lexbor parser from google_scholar_top_publication_metrics() function.
 18 |         - top_publications_data: Callable. List to append data to. List origin location is google_scholar_top_publication_metrics() function. Line 100.
 19 |         
 20 |         This function parses data from Google Scholar Organic results and appends data to a List.
 21 |         
 22 |         It's used by google_scholar_top_publication_metrics().
 23 |         
 24 |         It returns nothing as it appends data to `top_publications_data`, 
 25 |         which appends it to `top_publications_data` List in the google_scholar_top_publication_metrics() function.
 26 |         '''
 27 | 
 28 |         # selectors skips table header row
 29 |         for table in parser.css('tr:not(:first-child)'):
 30 |             try:
 31 |                 title: str = table.css_first('td.gsc_mvt_t').text()
 32 |             except: title = None
 33 |             
 34 |             try: 
 35 |                 h5_index: int = table.css_first('a.gs_ibl').text()
 36 |             except: h5_index = None
 37 |             
 38 |             try: 
 39 |                 h5_index_link: str = f"https://scholar.google.com{table.css_first('a.gs_ibl').attrs['href']}"
 40 |             except: h5_index_link = None
 41 |             
 42 |             try: 
 43 |                 h5_median: int = table.css_first('span.gs_ibl').text()
 44 |             except: h5_median = None
 45 |         
 46 |             top_publications_data.append({
 47 |                 'title': title,
 48 |                 'h5_index': int(h5_index) if h5_index else h5_index,
 49 |                 'h5_index_link': h5_index_link,
 50 |                 'h5_median': int(h5_median) if h5_median else h5_median
 51 |             })
 52 | 
 53 | 
 54 |     def scrape_top_publication_metrics(
 55 |             self,
 56 |             category: str = '', 
 57 |             lang: str = 'en',
 58 |             save_to_csv: bool = False, 
 59 |             save_to_json: bool = False,
 60 |         ) -> List[Dict[str, Union[str, int]]]:
 61 |         #TODO add subcategories to subcategory arg
 62 |         #TODO: support other languages: lang='spanish' -> 'sp'. https://serpapi.com/google-languages
 63 | 
 64 | 
 65 |         '''
 66 |         Results comes from: https://scholar.google.com/citations?view_op=top_venues
 67 |         
 68 |         Returns:
 69 |         - title: str
 70 |         - h5_index: int
 71 |         - h5_index_link: str
 72 |         - h5_median: int
 73 |         
 74 |         Arguments: 
 75 |         - save_to_csv: True of False. Default is False. Saves data to CSV file.
 76 |         - save_to_json: True of False. Default is False. Saves data to JSON file.
 77 |         - lang: str. Language. Defaults to English ('en'). For now, need to be checked yourself. Other languages: https://serpapi.com/google-languages
 78 |         - category: str. Available categories showed in the function documentation below.
 79 |             Available categories:
 80 |             - "bus": Business, Economics & Management
 81 |             - "chm": Chemical & Material Sciences
 82 |             - "eng": Engineering & Computer Science
 83 |             - "med": Health & Medical Sciences
 84 |             - "hum": Humanities, Literature & Arts
 85 |             - "bio": Life Sciences & Earth Sciences
 86 |             - "phy": Physics & Mathematics
 87 |             - "soc": Social Sciences
 88 |             
 89 |         Usage:
 90 |         
 91 |         from google_scholar_py import CustomGoogleScholarTopPublications
 92 |         
 93 |         data = CustomGoogleScholarTopPublications().scrape_top_publication_metrics(category='eng', lang='en') # sv = swedish
 94 |         
 95 |         for result in data:
 96 |             print(result['title'])
 97 |             ...
 98 |         '''
 99 |         
100 |         # selenium stealth
101 |         options = webdriver.ChromeOptions()
102 |         options.add_argument('--headless')
103 |         options.add_argument('--no-sandbox')
104 |         options.add_argument('--disable-dev-shm-usage')
105 |         
106 |         options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
107 |         options.add_experimental_option('useAutomationExtension', False)
108 |         
109 |         service = Service(ChromeDriverManager().install())
110 |         driver = webdriver.Chrome(service=service, options=options)
111 |             
112 |         stealth(driver,
113 |             languages=['en-US', 'en'],
114 |             vendor='Google Inc.',
115 |             platform='Win32',
116 |             webgl_vendor='Intel Inc.',
117 |             renderer='Intel Iris OpenGL Engine',
118 |             fix_hairline=True
119 |         )
120 |         
121 |         top_publications_data = []
122 | 
123 |         if category:
124 |             driver.get(f'https://scholar.google.com/citations?view_op=top_venues&hl={lang}&vq={category}')
125 |             parser = LexborHTMLParser(driver.page_source)
126 |             self.parse(parser=parser, top_publications_data=top_publications_data)
127 |         else: 
128 |             # no vq={category} URL parameter
129 |             driver.get(f'https://scholar.google.com/citations?view_op=top_venues&hl={lang}&vq={category}') # vq='' which will redirect to the page with no applied category
130 |             parser = LexborHTMLParser(driver.page_source)
131 |             self.parse(parser=parser, top_publications_data=top_publications_data)
132 |             
133 |         if save_to_csv:
134 |             pd.DataFrame(data=top_publications_data).to_csv('google_scholar_top_publications_data.csv', 
135 |                                                             index=False, encoding='utf-8')
136 |         if save_to_json:
137 |             pd.DataFrame(data=top_publications_data).to_json('google_scholar_top_publications_data.json', 
138 |                                                             orient='records')
139 |             
140 |         driver.quit()
141 |         return top_publications_data
142 | 


--------------------------------------------------------------------------------
/google_scholar_py/serpapi_backend/author_results.py:
--------------------------------------------------------------------------------
  1 | from serpapi import GoogleScholarSearch
  2 | from urllib.parse import urlsplit, parse_qsl
  3 | import itertools
  4 | 
  5 | #TODO: support pagination using `async` parameter
  6 | 
  7 | class SerpApiGoogleScholarAuthor:
  8 |     def __init__(self) -> None:
  9 |         pass
 10 | 
 11 |     def scrape_google_scholar_author_results(
 12 |             self,
 13 |             author_id: str,
 14 |             api_key: str = None,
 15 |             lang: str = 'en',
 16 |             parse_articles: bool = False,
 17 |             article_pagination: bool = False,
 18 |         ):
 19 |         
 20 |         '''
 21 |         Extracts all author data: author info, cited by (table, graph), co-authors, all articles.
 22 |         
 23 |         Arguments:
 24 |         - author_id: author id.
 25 |         - api_key: SerpApi api key, https://serpapi.com/manage-api-key
 26 |         - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages
 27 |         - parse_articles: parses first page of authour articles. Defalul 'False'.
 28 |         - article_pagination: True of False. Enables to parse all articles. Default 'False'.
 29 |         
 30 |         Usage:
 31 |         
 32 |         from google_scholar_py.serpapi_backend.author_results import SerpApiGoogleScholarAuthor
 33 | 
 34 |         parser = SerpApiGoogleScholarAuthor()
 35 |         data = parser.scrape_google_scholar_author_results(
 36 |             author_id='nHhtvqkAAAAJ',
 37 |             api_key='serpapi_api_key',
 38 |             parse_articles=True,
 39 |             article_pagination=True,
 40 |         )
 41 |         
 42 |         print(data.keys()) # show available keys
 43 | 
 44 |         for article in data['articles']:
 45 |             print(article['title'])
 46 |         '''
 47 |         
 48 |         if api_key is None:
 49 |             raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key')
 50 |         
 51 |         if author_id is None:
 52 |             raise Exception('Please enter a author id.')
 53 |         
 54 |         if api_key and author_id is None:
 55 |             raise Exception('Please enter a SerpApi API key to a `api_key`, and a author id to `author_id` arguments.')
 56 |         
 57 |         params = {
 58 |             'api_key': api_key,                  # serpapi api key
 59 |             'engine': 'google_scholar_author',   # serpapi parsing engine
 60 |             'author_id': author_id,              # search by author id
 61 |             'hl': lang                           # language
 62 |         }
 63 |         
 64 |         search = GoogleScholarSearch(params)     # where data extracts on the backend
 65 |         
 66 |         # parsing ALL articles along with author info
 67 |         if parse_articles and article_pagination:
 68 |             params['start'] = 0          # page number: 0 is first page, 1 is second, etc.
 69 |             params['pagesize'] = 100     # number of articles per page
 70 |             
 71 |             author_all_articles = []
 72 |             
 73 |             while True:
 74 |                 results = search.get_dict()
 75 |                 
 76 |                 if 'error' in results:
 77 |                     print(results['error'])
 78 |                     break
 79 |                 
 80 |                 author_all_articles.append(results['articles'])
 81 |                 
 82 |                 # check for the `next` page
 83 |                 if 'next' in results.get('serpapi_pagination', {}):
 84 |                     search.params_dict.update(dict(parse_qsl(urlsplit(results['serpapi_pagination']['next']).query)))
 85 |                 else:
 86 |                     break
 87 |             
 88 |             # remove articles key that creates a nested lists
 89 |             results.pop('articles')
 90 |             
 91 |             # flatten list of all articles
 92 |             author_all_articles_flatten = list(itertools.chain(*author_all_articles))
 93 |             results['articles'] = author_all_articles_flatten
 94 |             
 95 |             keys_to_delete = ['search_metadata', 'search_parameters']
 96 |             for key_to_delete in keys_to_delete:
 97 |                 results.pop(key_to_delete)
 98 |             
 99 |             return results
100 |         
101 |         # parsing ONLY FIRST PAGE of articles along with author info
102 |         if parse_articles:
103 |             search = GoogleScholarSearch(params)
104 |             results = search.get_dict()             # JSON -> Python dict
105 |             
106 |             if 'error' in results:
107 |                 raise Exception(results['error'])
108 |             
109 |             keys_to_delete = ['search_metadata', 'search_parameters', 'serpapi_pagination']
110 | 
111 |             for key_to_delete in keys_to_delete:
112 |                 results.pop(key_to_delete)
113 | 
114 |             return results
115 |         
116 |         # if don't need to parse any articles -> remove them from the JSON
117 |         elif article_pagination or parse_articles is False: 
118 |             search = GoogleScholarSearch(params)
119 |             results = search.get_dict()
120 |             
121 |             if 'error' in results:
122 |                 raise Exception(results['error'])
123 |             
124 |             keys_to_delete = ['search_metadata', 'search_parameters', 'articles', 'serpapi_pagination']
125 | 
126 |             for key_to_delete in keys_to_delete:
127 |                 results.pop(key_to_delete)
128 |                 
129 |             return results
130 |         
131 | 


--------------------------------------------------------------------------------
/google_scholar_py/serpapi_backend/organic_cite_results.py:
--------------------------------------------------------------------------------
 1 | from .organic_results import SerpApiGoogleScholarOrganic
 2 | from serpapi import GoogleScholarSearch
 3 | 
 4 | #TODO: support extracting actual Cite data, for example Bibtex: shorturl.at/vGNU5
 5 | 
 6 | class SerpApiGoogleScholarOrganicCite:
 7 |     def __init__(self) -> None:
 8 |         pass
 9 | 
10 | 
11 |     def scrape_google_scholar_cite_results(
12 |             self,
13 |             query: str,
14 |             api_key: str = None,
15 |             lang: str = 'en',
16 |             pagination: bool = False
17 |         ):
18 |         
19 |         '''
20 |         This function extract citations as well as BibTeX, EndNote, RefMan, RefWorks links.
21 |         
22 |         To extract citations, 2 requests has to be made: first for organic results, second for citation data. 
23 |         So if you need to get citations from 1000 articles, 2000 requests would be made accordingly.
24 |         
25 |         Arguments:
26 |         - query: search query
27 |         - api_key: SerpApi api key, https://serpapi.com/manage-api-key
28 |         - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages
29 |         - pagination: True of False. Enables pagination from all pages. Default 'False'.
30 |         
31 |         Usage:
32 |         
33 |         from google_scholar_py.serpapi_backend.organic_cite_results import SerpApiGoogleScholarOrganicCite
34 |         
35 |         parser = SerpApiGoogleScholarOrganicCite()
36 |         data = parser.scrape_google_scholar_cite_results(
37 |             query='minecraft', 
38 |             api_key='serpapi_api_key', 
39 |             pagination=True
40 |         )
41 |         
42 |         # extracting bottom links
43 |         for result in data:
44 |             for citations in result['links']: 
45 |                 print(citations['name']) # or ['link']
46 |         
47 |         # extracting citations
48 |         for result in data:
49 |             for citations in result['citations']: 
50 |                 print(citations['title']) # or ['snippet'] 
51 |         '''
52 |         
53 |         if api_key is None:
54 |             raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key')
55 |         
56 |         #TODO: could be removed as function by itself throw an error if query is missing
57 |         if api_key and query is None:
58 |             raise Exception('Please enter a SerpApi API key to a `api_key`, and a search query to `query` arguments.')
59 |         
60 |         # extract organic results from where citation data will be extracted
61 |         organic_results = SerpApiGoogleScholarOrganic().scrape_google_scholar_organic_results(
62 |             query=query,
63 |             api_key=api_key,
64 |             lang=lang,
65 |             pagination=pagination
66 |         )
67 |         
68 |         cite_results_data = []
69 | 
70 |         for citation in organic_results:
71 |             params = {
72 |                 'api_key': api_key,              # serpapi api key: https://serpapi.com/manage-api-key
73 |                 'engine': 'google_scholar_cite', # serpapi parsing engine
74 |                 'q': citation['result_id']       # search query
75 |             }
76 |             
77 |             search = GoogleScholarSearch(params) # where data extracts on the backend
78 |             results = search.get_dict()
79 |             
80 |             # removes 2 keys from the JSON response 
81 |             for key_to_delete in ['search_metadata', 'search_parameters']:
82 |                 results.pop(key_to_delete)
83 |                 
84 |             if 'error' in results:
85 |                 raise Exception(results['error'])
86 |                 
87 |             cite_results_data.append(results)
88 | 
89 |         return cite_results_data
90 | 


--------------------------------------------------------------------------------
/google_scholar_py/serpapi_backend/organic_results.py:
--------------------------------------------------------------------------------
 1 | from serpapi import GoogleScholarSearch
 2 | from urllib.parse import urlsplit, parse_qsl
 3 | import itertools
 4 | 
 5 | #TODO: support pagination using `async` parameter
 6 | 
 7 | class SerpApiGoogleScholarOrganic:
 8 |     def __init__(self) -> None:
 9 |         pass
10 | 
11 | 
12 |     #TODO: add test API key so users can test out before passing their own?
13 |     def scrape_google_scholar_organic_results(
14 |             self,
15 |             query: str,
16 |             api_key: str = None,
17 |             lang: str = 'en',
18 |             pagination: bool = False,
19 |         ):
20 |         
21 |         '''
22 |         This function extracts all possible data from Google Scholar organic results. With or without pagination.
23 |         
24 |         Arguments:
25 |         - query: search query
26 |         - api_key: SerpApi api key, https://serpapi.com/manage-api-key
27 |         - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages
28 |         - pagination: True of False. Enables pagination from all pages. Default 'False'.
29 |         
30 |         Usage:
31 |         
32 |         from google_scholar_py.serpapi_backend.organic_results import SerpApiGoogleScholarOrganic
33 | 
34 |         parser = SerpApiGoogleScholarOrganic()
35 |         data = parser.scrape_google_scholar_organic_results(
36 |             query='minecraft', 
37 |             api_key='serpapi_api_key', 
38 |             pagination=True
39 |         )
40 |         
41 |         print(data[0].keys()) # show available keys
42 |         
43 |         for result in data:
44 |             print(result['title']) # and other data
45 |         '''
46 |         
47 |         if api_key is None:
48 |             raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key')
49 |         
50 |         if api_key and query is None:
51 |             raise Exception('Please enter a SerpApi API key to a `api_key`, and a search query to `query` arguments.')
52 |         
53 |         params = {
54 |             'api_key': api_key,              # serpapi api key: https://serpapi.com/manage-api-key
55 |             'engine': 'google_scholar',      # serpapi parsing engine
56 |             'q': query,                      # search query
57 |             'hl': lang,                      # language
58 |             'start': 0                       # first page. Used for pagination: https://serpapi.com/google-scholar-api#api-parameters-pagination-start
59 |         }
60 |         
61 |         search = GoogleScholarSearch(params) # where data extracts on the backend
62 |         
63 |         if pagination:
64 |             organic_results_data = []
65 |             
66 |             while True:
67 |                 results = search.get_dict()  # JSON -> Python dict
68 |                 
69 |                 if 'error' in results:
70 |                     print(results['error'])
71 |                     break
72 |                 
73 |                 organic_results_data.append(results['organic_results'])
74 |                 
75 |                 # check for `serpapi_pagination` and then for `next` page
76 |                 if 'next' in results.get('serpapi_pagination', {}):
77 |                     search.params_dict.update(dict(parse_qsl(urlsplit(results['serpapi_pagination']['next']).query)))
78 |                 else:
79 |                     break
80 |             
81 |             # flatten list
82 |             return list(itertools.chain(*organic_results_data))
83 |         else:
84 |             # remove page number key from the request parameters
85 |             # parse first page only
86 |             params.pop('start')
87 |             
88 |             search = GoogleScholarSearch(params)
89 |             results = search.get_dict()
90 |             
91 |             if 'error' in results:
92 |                 raise Exception(results['error'])
93 | 
94 |             return results['organic_results']
95 |         
96 |         
97 | 


--------------------------------------------------------------------------------
/google_scholar_py/serpapi_backend/profile_results.py:
--------------------------------------------------------------------------------
 1 | from serpapi import GoogleScholarSearch
 2 | from urllib.parse import parse_qsl, urlsplit
 3 | import itertools
 4 | 
 5 | 
 6 | #TODO: support pagination using `async` parameter
 7 | 
 8 | class SerpApiGoogleScholarProfiles:
 9 |     def __init__(self) -> None:
10 |         pass
11 | 
12 |     def scrape_google_scholar_profile_results(
13 |             self,
14 |             query: str,
15 |             api_key: str = None,
16 |             lang: str = 'en',
17 |             pagination: bool = False,
18 |         ):
19 |         
20 |         '''
21 |         This function extracts profile results. With or without pagination.
22 |         
23 |         Arguments:
24 |         - query: search query
25 |         - api_key: SerpApi api key, https://serpapi.com/manage-api-key
26 |         - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages
27 |         - pagination: True of False. Enables pagination from all pages. Default 'False'.
28 |         
29 |         Usage:
30 |         
31 |         from google_scholar_py.serpapi_backend.profile_results import SerpApiGoogleScholarProfiles
32 | 
33 |         parser = SerpApiGoogleScholarProfiles()
34 |         data = parser.scrape_google_scholar_profile_results(
35 |             query='minecraft', 
36 |             api_key='serpapi_api_key', 
37 |             pagination=True,
38 |         )
39 |         
40 |         print(data[0].keys()) # show available keys
41 |         
42 |         for result in data:
43 |             print(result['title'])
44 |             # get other data
45 |         '''
46 |         
47 |         if api_key is None:
48 |             raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key')
49 |         
50 |         if api_key and query is None:
51 |             raise Exception('Please enter a SerpApi API key to a `api_key`, and a search query to `query` arguments.')
52 |         
53 |         params = {
54 |             'api_key': api_key,                       # serpapi api key: https://serpapi.com/manage-api-key
55 |             'engine': 'google_scholar_profiles',      # serpapi parsing engine
56 |             'mauthors': query,                        # search query
57 |             'hl': lang                                # language
58 |         }
59 |         
60 |         search = GoogleScholarSearch(params) # where data extracts on the backend
61 |         
62 |         if pagination:
63 |             profile_results_data = []
64 |             
65 |             while True:
66 |                 results = search.get_dict()  # JSON -> Python dict
67 |                 
68 |                 if 'error' in results:
69 |                     print(results['error'])
70 |                     break
71 |                 
72 |                 profile_results_data.append(results['profiles'])
73 |                 
74 |                 # check for 'next' page
75 |                 if 'next' in results.get('pagination', {}):
76 |                     search.params_dict.update(dict(parse_qsl(urlsplit(results['pagination']['next']).query)))
77 |                 else:
78 |                     break
79 |                 
80 |             # flatten list
81 |             return list(itertools.chain(*profile_results_data))
82 |         else:
83 |             search = GoogleScholarSearch(params)
84 |             results = search.get_dict()
85 |             
86 |             if 'error' in results:
87 |                 raise Exception(results['error'])
88 | 
89 |             return results['profiles']
90 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | [tool.ruff]
6 | line-length = 125
7 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest==7.3.1
2 | pytest-cov==4.0.0
3 | pytest-xdist==3.3.0
4 | coverage==7.2.5
5 | ruff==0.0.243


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | async-generator==1.10
 2 | attrs==22.2.0
 3 | bleach==6.0.0
 4 | CacheControl==0.12.11
 5 | certifi==2022.12.7
 6 | cffi==1.15.1
 7 | charset-normalizer==3.0.1
 8 | cleo==2.0.1
 9 | colorama==0.4.6
10 | crashtest==0.4.1
11 | cryptography==39.0.1
12 | cssselect==1.2.0
13 | Cython==0.29.33
14 | distlib==0.3.6
15 | docutils==0.19
16 | dulwich==0.20.50
17 | exceptiongroup==1.1.0
18 | execnet==1.9.0
19 | filelock==3.9.0
20 | google-search-results==2.4.2
21 | h11==0.14.0
22 | html5lib==1.1
23 | idna==3.4
24 | importlib-metadata==4.13.0
25 | importlib-resources==5.10.2
26 | iniconfig==2.0.0
27 | jaraco.classes==3.2.3
28 | jeepney==0.8.0
29 | jsonschema==4.17.3
30 | keyring==23.13.1
31 | lockfile==0.12.2
32 | lxml==4.9.2
33 | markdown-it-py==2.1.0
34 | mdurl==0.1.2
35 | more-itertools==9.0.0
36 | msgpack==1.0.4
37 | numpy==1.24.2
38 | outcome==1.2.0
39 | packaging==23.0
40 | pandas==1.5.3
41 | parsel==1.7.0
42 | pexpect==4.8.0
43 | pkginfo==1.9.6
44 | pkgutil_resolve_name==1.3.10
45 | platformdirs==2.6.2
46 | pluggy==1.0.0
47 | poetry==1.3.2
48 | poetry-core==1.4.0
49 | poetry-plugin-export==1.3.0
50 | ptyprocess==0.7.0
51 | pycparser==2.21
52 | Pygments==2.14.0
53 | pyrsistent==0.19.3
54 | PySocks==1.7.1
55 | python-dateutil==2.8.2
56 | python-dotenv==1.0.0
57 | pytz==2022.7.1
58 | pywin32-ctypes==0.2.0
59 | rapidfuzz==2.13.7
60 | readme-renderer==37.3
61 | requests==2.28.2
62 | requests-toolbelt==0.10.1
63 | rfc3986==2.0.0
64 | rich==13.3.1
65 | SecretStorage==3.3.3
66 | selectolax==0.3.12
67 | selenium==4.8.0
68 | selenium-stealth==1.0.6
69 | shellingham==1.5.0.post1
70 | six==1.16.0
71 | sniffio==1.3.0
72 | sortedcontainers==2.4.0
73 | tomli==2.0.1
74 | tomlkit==0.11.6
75 | tqdm==4.65.0
76 | trio==0.22.0
77 | trio-websocket==0.9.2
78 | trove-classifiers==2023.1.20
79 | typing_extensions==4.4.0
80 | urllib3==1.26.14
81 | virtualenv==20.19.0
82 | w3lib==2.1.1
83 | webdriver-manager==3.8.5
84 | webencodings==0.5.1
85 | wsproto==1.2.0
86 | zipp==3.12.1
87 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | README = ''
 4 | with open('README.md', 'r', encoding='utf-8') as readme_file:
 5 |     README = readme_file.read()
 6 | 
 7 | setup(
 8 |     name='scrape-google-scholar-py',
 9 |     description = 'Extract data from all Google Scholar pages in Python. Sponsored by SerpApi.',
10 |     url='https://github.com/dimitryzub/scrape-google-scholar',
11 |     version='0.3.4',
12 |     license='MIT',
13 |     author='Dmitiry Zub',
14 |     author_email='dimitryzub@gmail.com',
15 |     maintainer='Dmitiry Zub',
16 |     maintainer_email='dimitryzub@gmail.com',
17 |     long_description_content_type='text/markdown',
18 |     long_description=README,
19 |     include_package_data=True,
20 |     python_requires='>=3.10',
21 |     classifiers = [
22 |         'Development Status :: 3 - Alpha',
23 |         'Intended Audience :: Developers',
24 |         'Operating System :: Microsoft :: Windows',
25 |         'Operating System :: MacOS',
26 |         'Operating System :: POSIX :: Linux',
27 |         'Topic :: Internet',
28 |         'Natural Language :: English',
29 |         'Topic :: Utilities',
30 |         'Programming Language :: Python :: 3.10',
31 |         'Programming Language :: Python :: 3.11',
32 |     ],
33 |     keywords=[
34 |             'google scholar',
35 |             'serpapi',
36 |             'scraper',
37 |             'python',
38 |             'python google scholar',
39 |             'python google scholar api',
40 |             'web scraping',
41 |             'python web scraping',
42 |             'research',
43 |             'lexbor',
44 |             'selectolax',
45 |             'selenium',
46 |             'selenium-stealth',
47 |             'pandas',
48 |         ],
49 |     install_requires=[
50 |           'google-search-results>=2.4.2',
51 |           'selectolax>=0.3.12',
52 |           'parsel>=1.7.0',
53 |           'selenium-stealth>=1.0.6',
54 |           'pandas>=1.5.3',
55 |           'webdriver-manager>=3.8.5' 
56 |     ],
57 |     project_urls={
58 |         'Documentation': 'https://github.com/dimitryzub/scrape-google-scholar#example-usage-custom-backend',
59 |         'Source': 'https://github.com/dimitryzub/scrape-google-scholar',
60 |         'Tracker': 'https://github.com/dimitryzub/scrape-google-scholar/issues',
61 |     },
62 | )


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dimitryzub/scrape-google-scholar-py/2a11840c7d19d23faca0c544c61cc5fd1aa4dadd/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_custom_profile.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import unittest
 3 | from pathlib import Path
 4 | import os
 5 | from google_scholar_py.custom_backend.profiles_results import CustomGoogleScholarProfiles
 6 | 
 7 | 
 8 | # # Tests for CustomGoogleScholarProfiles class
 9 | # @pytest.fixture(scope='session')
10 | # def google_scholar_parser():
11 | #     return CustomGoogleScholarProfiles()
12 | 
13 | @pytest.fixture(scope='session')
14 | def search_query():
15 |     return 'blizzard'
16 | 
17 | def test_custom_google_scholar_profiles_scrape_without_pagination(search_query):
18 |     results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=False)
19 |     assert len(results) > 0
20 | 
21 | def test_custom_google_scholar_profiles_scrape_with_pagination(search_query):
22 |     results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=True)
23 |     assert len(results) > 0
24 | 
25 | def test_custom_google_scholar_profiles_save_to_csv(search_query):
26 |     CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=False, save_to_csv=True)
27 |     
28 |     # ../ as file saves in root, might save to a special "results" folder
29 |     assert Path().cwd().joinpath('tests', '../google_scholar_profile_results_data.csv').exists()
30 | 
31 | def test_custom_google_scholar_profiles_save_to_json(search_query):
32 |     CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=False, save_to_json=True)
33 |     
34 |     # # ../ as file saves in root, might save to a special "results" folder
35 |     assert Path().cwd().joinpath('tests', '../google_scholar_profile_results_data.json').exists()
36 |     
37 | # @pytest.fixture(scope='session')
38 | # def remove_test_files():    
39 | #     csv_file = Path().cwd().parent / 'google_scholar_profile_results_data.csv'
40 | #     json_file = Path().cwd().parent / 'google_scholar_profile_results_data.json'
41 | #     os.remove(csv_file)
42 | #     os.remove(json_file)
43 |     
44 | 
45 | # Tests for scrape_google_scholar_profiles function
46 | class TestScrapeGoogleScholarProfiles(unittest.TestCase):
47 | 
48 |     def test_scrape_google_scholar_profiles_returns_list(self):
49 |         query = "machine learning"
50 |         results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query)
51 |         self.assertIsInstance(results, list)
52 | 
53 |     def test_scrape_google_scholar_profiles_returns_correct_data_types(self):
54 |         query = "machine learning"
55 |         results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query)
56 |         
57 |         for profile_data in results:
58 |             self.assertIsInstance(profile_data, dict)
59 |             self.assertIsInstance(profile_data['name'], str)
60 |             self.assertIsInstance(profile_data['link'], str)
61 |             self.assertIsInstance(profile_data['affiliations'], str)
62 |             self.assertIsInstance(profile_data['email'], str)
63 |             self.assertIsInstance(profile_data['cited_by_count'], int or None)
64 |             self.assertIsInstance(profile_data['interests'], list or None)
65 |             for interest in profile_data['interests']:
66 |                 self.assertIsInstance(interest, str)
67 | 
68 |     def test_scrape_google_scholar_profiles_returns_valid_data(self):
69 |         query = "machine learning"
70 |         results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=query)
71 |         
72 |         for profile_data in results:
73 |             self.assertIsNotNone(profile_data['name'])
74 |             self.assertIsNotNone(profile_data['link'])
75 |             self.assertIsNotNone(profile_data['affiliations'])
76 |             self.assertIsNotNone(profile_data['email'])
77 |             self.assertIsNotNone(profile_data['cited_by_count'])
78 |             self.assertGreater(len(profile_data['interests']), 0)
79 |             
80 |     
81 | 
82 | if __name__ == '__main__':
83 |     unittest.main()


--------------------------------------------------------------------------------