├── .github
│   ├── octo-reminder.yml
│   └── workflows
│       ├── codecov-workflow.yml
│       └── potential-duplicates.yml
├── .gitignore
├── .gitpod.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── example_usage.py
├── google_scholar_py
│   ├── __init__.py
│   ├── custom_backend
│   │   ├── author_info_all_articles.py
│   │   ├── cite_results.py
│   │   ├── google_scholar_cited_by_public_access_author.py
│   │   ├── organic_search.py
│   │   ├── profiles_results.py
│   │   ├── top_mandates_metrics.py
│   │   ├── top_publications_article.py
│   │   ├── top_publications_article_citation.py
│   │   └── top_publications_metrics.py
│   └── serpapi_backend
│       ├── author_results.py
│       ├── organic_cite_results.py
│       ├── organic_results.py
│       └── profile_results.py
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    └── test_custom_profile.py
/.github/octo-reminder.yml:
--------------------------------------------------------------------------------
1 | # Octo Reminder Configuration
2 |
3 | ## Command Prefix
4 | ### Define the prefix of your custom command.
5 | ### Type: '/' | '!'
6 | ### Default: '@' (works only in combination with command_name 'set-reminder')
7 | command_prefix: '@'
8 |
9 | ## Command Name
10 | ### Define the name of your custom command.
11 | ### Type: String
12 | ### Default: 'set-reminder'
13 | command_name: 'set-reminder'
14 |
15 | ## Language
16 | ### Define the language.
17 | ### Type: 'en' | 'fr' | 'de' | 'pt' | 'nl' | 'ja'
18 | ### Default: 'en'
19 | language: 'en'
20 |
21 | ## Timezone
22 | ### Define the timezone.
23 | ### Type: String (see also https://github.com/moment/moment-timezone/blob/develop/data/packed/latest.json)
24 | ### Default: 'Europe/London'
25 | timezone: 'Europe/London'
26 |
27 | ## Default Hour
28 | ### Define the hour that will be used, when no time is specified.
29 | ### Type: Number
30 | default_hour: 0
31 |
32 | ## Default Minute
33 | ### Define the minute that will be used, when no time is specified.
34 | ### Type: Number
35 | default_minute: 0
36 |
--------------------------------------------------------------------------------
/.github/workflows/codecov-workflow.yml:
--------------------------------------------------------------------------------
1 | name: Test API coverage
2 |
3 | on:
4 | push:
5 | branches: [ main ]
6 | paths-ignore:
7 | - 'README.md'
8 | - 'MANIFEST.in'
9 | - 'LICENSE'
10 | - '.gitignore'
11 | - '.gitpod.yaml'
12 | pull_request:
13 | branches: [ main ]
14 | paths-ignore:
15 | - 'README.md'
16 | - 'MANIFEST.in'
17 | - 'LICENSE'
18 | - '.gitignore'
19 | - '.gitpod.yaml'
20 |
21 | jobs:
22 | build:
23 | runs-on: ubuntu-latest
24 | name: Set up Python 3.11
25 | steps:
26 | - uses: actions/checkout@v3
27 | - uses: actions/setup-python@v2
28 | with:
29 | python-version: '3.11'
30 |
31 | - name: Install requirements
32 | run: pip install -r requirements.txt && pip install -r requirements-dev.txt
33 |
34 | - name: Run tests and collect coverage
35 | run: pytest --cov=./ --cov-report=xml:coverage.xml
36 |
37 | - name: Upload coverage reports to Codecov with GitHub Action
38 | uses: codecov/codecov-action@v3
39 | with:
40 | token: ${{ secrets.CODECOV_TOKEN }}
--------------------------------------------------------------------------------
/.github/workflows/potential-duplicates.yml:
--------------------------------------------------------------------------------
1 | name: Potential Duplicates
2 | on:
3 | issues:
4 | types: [opened, edited]
5 | jobs:
6 | run:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: wow-actions/potential-duplicates@v1
10 | with:
11 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
12 |           # The issue title filter works with anymatch https://www.npmjs.com/package/anymatch.
13 |           # Any matched issue will stop detection immediately.
14 |           # You can specify multiple filters, one per line.
15 | filter: ''
16 | # Exclude keywords in title before detecting.
17 | exclude: ''
18 | # Label to set, when potential duplicates are detected.
19 | label: potential-duplicate
20 | # Get issues with state to compare. Supported state: 'all', 'closed', 'open'.
21 | state: all
22 | # If similarity is higher than this threshold([0,1]), issue will be marked as duplicate.
23 | threshold: 0.6
24 | # Reactions to be add to comment when potential duplicates are detected.
25 | # Available reactions: "-1", "+1", "confused", "laugh", "heart", "hooray", "rocket", "eyes"
26 | reactions: 'eyes'
27 | # Comment to post when potential duplicates are detected.
28 | comment: >
29 | Potential duplicates: {{#issues}}
30 | - [#{{ number }}] {{ title }} ({{ accuracy }}%)
31 | {{/issues}}
32 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist/
2 | env/
3 | *.egg-info/
4 | docs/
5 | __pycache__/
--------------------------------------------------------------------------------
/.gitpod.yaml:
--------------------------------------------------------------------------------
1 | tasks:
2 | - name: Update Linux and Install other Chrome Dependencies
3 | init: |
4 | sudo apt-get update -y && sudo apt-get upgrade -y && sudo apt-get install -y libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1
5 | - name: Create VirtualEnv, Install Python Dependencies
6 | init: |
7 | python -m venv env
8 | source env/bin/activate
9 | pip install -r requirements.txt
10 | github:
11 | prebuilds:
12 | addBadge: true
13 | vscode:
14 | extensions:
15 | - usernamehw.errorlens
16 | - vscode-icons-team.vscode-icons
17 | - bierner.markdown-preview-github-styles
18 | - ms-python.python
19 | - ms-toolsai.jupyter
20 | - KevinRose.vsc-python-indent
21 | - eamodio.gitlens
22 | - Gruntfuggly.todo-tree
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Dmitiry Zub☀️
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE requirements.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
14 | Scrape data from all Google Scholar pages from a single Python module.
30 | > NOTE: As of now (2025), I'm no longer maintaining this repo. This may change later. To fix common issues, the Chrome driver/CSS selectors might need an update.
31 |
32 |
33 | 🧐 Why two backends?
34 |
35 | 1. If you don't want to pay for an API. However, I'm not 100% sure that [`selenium-stealth`](https://pypi.org/project/selenium-stealth/) can handle all CAPTCHAs (although it does handle Cloudflare's CAPTCHA) and similar blocks.
36 | 2. If you know about SerpApi but don't want to figure out pagination.
37 |
38 | The SerpApi backend is more reliable because of:
39 | - dedicated team of maintainers
40 | - pool of proxies
41 | - CAPTCHA solvers
42 | - the legal side of scraping, and more.
43 |
44 |
45 |
46 |
47 |
48 | 🧩 Custom backend supports
49 |
50 | 1. [Organic results](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=blizzard&btnG=&oq=blizz) (with pagination).
51 | 2. [Profile results](https://scholar.google.com/citations?view_op=search_authors&mauthors=blizzard&hl=en&oi=drw) (with pagination).
52 | 3. [Author + author articles](https://scholar.google.com/citations?user=6IQ8pQwAAAAJ&hl=en&oi=sra) (with pagination), everything except "cited by" graph.
53 | 4. [Public access mandates metrics](https://scholar.google.com/citations?view_op=mandates_leaderboard&hl=en). Yes, you can download a CSV with one click; however, it doesn't contain a funder link. The script here includes it and saves to CSV/JSON.
54 | 5. [Top publications metrics](https://scholar.google.com/citations?view_op=top_venues&hl=en). Categories are also supported (as a function argument). Saves to CSV/JSON. Sub-categories are not yet supported.
55 | 6. [Journal articles](https://github.com/dimitryzub/scrape-google-scholar/issues/2) (with pagination).
56 |
57 | You can use [`scholarly`](https://github.com/scholarly-python-package/scholarly) to parse the data instead. However, it only extracts the first 3 points above (organic, profile, and author results).
58 |
59 |
60 | Things custom backend doesn't support yet
61 |
62 | 1. Organic results filters (case law, sorting, period ranges). You can easily add those URL parameters yourself ([if installing from source](https://github.com/dimitryzub/scrape-google-scholar-py#installing)) to the `google_scholar_py/custom_backend/organic_search.py` file (line [`147`](https://github.com/dimitryzub/scrape-google-scholar-py/blob/a6b3b39042eabdc84851e3c1ca3c246e55bf19d1/google_scholar_py/custom_backend/organic_search.py#L147) or [`136`](https://github.com/dimitryzub/scrape-google-scholar-py/blob/a6b3b39042eabdc84851e3c1ca3c246e55bf19d1/google_scholar_py/custom_backend/organic_search.py#L160)), where `driver.get()` is called; see the sketch after this list.
63 | 2. Author page -> cited by graph.
64 | 3. Extracting [journal articles page](https://scholar.google.com/citations?hl=uk&vq=en&view_op=list_hcore&venue=9oNLl9DgMnQJ.2022). The [issue to add this page is open](https://github.com/dimitryzub/scrape-google-scholar/issues/2).
65 | 4. [Top publications metrics page](https://scholar.google.com/citations?view_op=top_venues&hl=en). Subcategories are not yet supported; it's on the TODO list.
66 | 5. Update [cite results](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=blizzard+effects+xanax&oq=blizzard+effects+x#d=gs_cit&t=1674718593252&u=%2Fscholar%3Fq%3Dinfo%3Alm-jhjzd72UJ%3Ascholar.google.com%2F%26output%3Dcite%26scirp%3D7%26hl%3Den) page extraction.
67 |
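For illustration, a minimal sketch of what adding a period range and date sorting could look like. This is not the package's actual code: the exact base URL built inside `organic_search.py` may differ slightly, and the extra parameters shown here are standard Google Scholar URL parameters appended to the URL that is passed to `driver.get()`.

```python
# hypothetical illustration: extra Google Scholar URL parameters
query = 'blizzard'
page_num = 0

url = (
    f'https://scholar.google.com/scholar?q={query}&hl=en&start={page_num}'
    '&as_ylo=2018&as_yhi=2023'  # period range filter: 2018-2023
    '&scisbd=1'                 # sort results by date
)
print(url)
```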
68 |
69 |
70 |
71 | 🔮 SerpApi backend supports
72 |
73 | - [Google Scholar Organic](https://serpapi.com/google-scholar-organic-results)
74 | - [Google Scholar Profiles](https://serpapi.com/google-scholar-profiles-api)
75 | - [Google Scholar Author](https://serpapi.com/google-scholar-author-api)
76 | - [Google Scholar Cite](https://serpapi.com/google-scholar-cite-api)
77 |
78 |
79 |
80 | 🏗 Custom backend depends on
81 |
82 | - [`selenium-stealth`](https://github.com/diprajpatra/selenium-stealth) - to bypass CAPTCHAs and render some HTML (like cite results from organic results).
83 | - [`selectolax`](https://github.com/rushter/selectolax) - to parse HTML fast. It's the fastest Python parser, wrapped around [`lexbor`](https://github.com/lexbor/lexbor) (a parser written in pure C).
84 | - [`pandas`](https://pandas.pydata.org/) - to save extracted data to CSV or JSON, or if you want to analyze the data right away. The save option is currently used in the organic results, top publications, and public access mandates pages.
85 |
86 | All scripts use headless [`selenium-stealth`](https://github.com/diprajpatra/selenium-stealth) to bypass the CAPTCHA that appears on Google Scholar, so you need to have a `chromedriver`. If you're on Linux, you may need to do additional troubleshooting if `chromedriver` won't run properly. The driver setup shared by the modules looks roughly like the sketch below.
87 |
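A condensed sketch of the setup pattern used across the custom backend modules (for example, `author_info_all_articles.py`):

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# webdriver-manager downloads a matching chromedriver automatically
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# selenium-stealth patches the browser fingerprint to reduce CAPTCHA triggers
stealth(driver,
        languages=['en-US', 'en'],
        vendor='Google Inc.',
        platform='Win32',
        webgl_vendor='Intel Inc.',
        renderer='Intel Iris OpenGL Engine',
        fix_hairline=True)
```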
88 |
89 | ## 📥Installing
90 |
91 | Install via `pip`:
92 |
93 | ```bash
94 | $ pip install scrape-google-scholar-py
95 | ```
96 |
97 | Install from source (single piped command; on Linux/macOS, activate the virtualenv with `source env/bin/activate` instead of `env/Scripts/activate`):
98 |
99 | ```bash
100 | git clone https://github.com/dimitryzub/scrape-google-scholar-py.git \
101 | && cd scrape-google-scholar-py \
102 | && python -m venv env && source env/Scripts/activate \
103 | && pip install -r requirements.txt
104 | ```
105 |
106 | ### Possible errors that you might encounter
107 |
108 |
109 | LINUX USERS: If it throws "Web-driver exits unexpectedly" error
110 |
111 | Try installing extra dependencies to run `chromedriver`:
112 | ```bash
113 | $ apt-get install -y libglib2.0-0 libnss3 libgconf-2-4 libfontconfig1
114 | ```
115 |
116 | See resolved issue: [[Linux] Web-driver exits unexpectedly using CustomGoogleScholarOrganic() #7](https://github.com/dimitryzub/scrape-google-scholar-py/issues/7)
117 |
118 |
119 |
120 |
121 | For MAC users, possible issues and fixes
122 |
123 | - ✅ [(resolved question): Wheels failed to build while pip installing](https://github.com/dimitryzub/scrape-google-scholar-py/issues/12#issuecomment-1554266222)
124 |
125 |
126 |
127 |
128 | If it throws an error with `selenium-stealth`
129 |
130 | ```bash
131 | error: The 'selenium' distribution was not found and is required by selenium-stealth
132 | ```
133 |
134 | Use:
135 |
136 | ```bash
137 | $ pip install selenium-stealth
138 | ```
139 |
140 |
141 | ## 📝Example usage custom backend
142 |
143 | ```python
144 | from google_scholar_py import CustomGoogleScholarProfiles
145 | import json
146 |
147 | parser = CustomGoogleScholarProfiles()
148 | data = parser.scrape_google_scholar_profiles(
149 | query='blizzard',
150 | pagination=False,
151 | save_to_csv=False,
152 | save_to_json=False
153 | )
154 | print(json.dumps(data, indent=2))
155 | ```
156 |
157 |
158 | Google Scholar search operators could also be used
159 |
160 | ```lang-none
161 | label:computer_vision "Michigan State University"|"U.Michigan"
162 | ```
163 |
164 | This query searches for profiles from the two universities that have the "computer_vision" label.
165 |
166 |
167 |
168 |
169 | JSON output
170 |
171 | ```json
172 | [
173 | {
174 | "name": "Adam Lobel",
175 | "link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ",
176 | "affiliations": "Blizzard Entertainment",
177 | "interests": [
178 | "Gaming",
179 | "Emotion regulation"
180 | ],
181 | "email": "Verified email at AdamLobel.com",
182 | "cited_by_count": 3593
183 | },
184 | {
185 | "name": "Daniel Blizzard",
186 | "link": "https://scholar.google.com/citations?hl=en&user=dk4LWEgAAAAJ",
187 | "affiliations": "",
188 | "interests": null,
189 | "email": null,
190 | "cited_by_count": 1041
191 | },
192 | {
193 | "name": "Shuo Chen",
194 | "link": "https://scholar.google.com/citations?hl=en&user=OBf4YnkAAAAJ",
195 | "affiliations": "Senior Data Scientist, Blizzard Entertainment",
196 | "interests": [
197 | "Machine Learning",
198 | "Data Mining",
199 | "Artificial Intelligence"
200 | ],
201 | "email": "Verified email at cs.cornell.edu",
202 | "cited_by_count": 725
203 | },
204 | {
205 | "name": "Ian Livingston",
206 | "link": "https://scholar.google.com/citations?hl=en&user=xBHVqNIAAAAJ",
207 | "affiliations": "Blizzard Entertainment",
208 | "interests": [
209 | "Human-computer interaction",
210 | "User Experience",
211 | "Player Experience",
212 | "User Research",
213 | "Games"
214 | ],
215 | "email": "Verified email at usask.ca",
216 | "cited_by_count": 652
217 | },
218 | {
219 | "name": "Minli Xu",
220 | "link": "https://scholar.google.com/citations?hl=en&user=QST5iogAAAAJ",
221 | "affiliations": "Blizzard Entertainment",
222 | "interests": [
223 | "Game",
224 | "Machine Learning",
225 | "Data Science",
226 | "Bioinformatics"
227 | ],
228 | "email": "Verified email at blizzard.com",
229 | "cited_by_count": 541
230 | },
231 | {
232 | "name": "Je Seok Lee",
233 | "link": "https://scholar.google.com/citations?hl=en&user=vuvtlzQAAAAJ",
234 | "affiliations": "Blizzard Entertainment",
235 | "interests": [
236 | "HCI",
237 | "Player Experience",
238 | "Games",
239 | "Esports"
240 | ],
241 | "email": "Verified email at uci.edu",
242 | "cited_by_count": 386
243 | },
244 | {
245 | "name": "Alisha Ness",
246 | "link": "https://scholar.google.com/citations?hl=en&user=xQuwVfkAAAAJ",
247 | "affiliations": "Activision Blizzard",
248 | "interests": null,
249 | "email": null,
250 | "cited_by_count": 324
251 | },
252 | {
253 | "name": "Xingyu (Alfred) Liu",
254 | "link": "https://scholar.google.com/citations?hl=en&user=VW9ukOwAAAAJ",
255 | "affiliations": "Blizzard Entertainment",
256 | "interests": [
257 | "Machine Learning in Game Development"
258 | ],
259 | "email": null,
260 | "cited_by_count": 256
261 | },
262 | {
263 | "name": "Amanda LL Cullen",
264 | "link": "https://scholar.google.com/citations?hl=en&user=oqna6OgAAAAJ",
265 | "affiliations": "Blizzard Entertainment",
266 | "interests": [
267 | "Games Studies",
268 | "Fan Studies",
269 | "Live Streaming"
270 | ],
271 | "email": null,
272 | "cited_by_count": 247
273 | },
274 | {
275 | "name": "Nicole \"Nikki\" Crenshaw",
276 | "link": "https://scholar.google.com/citations?hl=en&user=zmRH6E0AAAAJ",
277 | "affiliations": "Blizzard Entertainment",
278 | "interests": [
279 | "MMOs",
280 | "Neoliberalism",
281 | "Social Affordances",
282 | "Identity",
283 | "Accessibility"
284 | ],
285 | "email": "Verified email at uci.edu",
286 | "cited_by_count": 202
287 | }
288 | ]
289 | ```
290 |
291 |
292 |
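The author page works the same way. A minimal sketch, taken from the usage example in the `author_info_all_articles.py` docstring:

```python
from google_scholar_py import CustomGoogleScholarAuthor
import json

parser = CustomGoogleScholarAuthor()
data = parser.scrape_google_scholar_author_data(
    user_id='nHhtvqkAAAAJ',   # ID from the Google Scholar profile URL
    parse_articles=True,
    article_pagination=True
)
print(json.dumps(data, indent=2))
```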
293 |
294 | ## 📝Example usage SerpApi backend
295 |
296 | ```python
297 | from google_scholar_py import SerpApiGoogleScholarOrganic
298 | import json
299 |
300 | organic_parser = SerpApiGoogleScholarOrganic()
301 | data = organic_parser.scrape_google_scholar_organic_results(
302 |     query='minecraft',
303 | api_key='your-serpapi-api-key', # https://serpapi.com/manage-api-key
304 | pagination=False,
305 | # other params
306 | )
307 | print(json.dumps(data, indent=2))
308 | ```
309 |
310 |
311 | JSON output
312 |
313 | ```json
314 | [
315 | {
316 | "position": 0,
317 | "title": "Mining learning and crafting scientific experiments: a literature review on the use of minecraft in education and research",
318 | "result_id": "61OUs-3P374J",
319 | "link": "https://www.jstor.org/stable/pdf/jeductechsoci.19.2.355.pdf?&seq=1",
320 | "snippet": "\u2026 Minecraft have aroused the attention of teachers and researchers alike. To gain insights into the applicability of Minecraft, \u2026 our own considerable experience with Minecraft in courses on \u2026",
321 | "publication_info": {
322 | "summary": "S Nebel, S Schneider, GD Rey - Journal of Educational Technology & \u2026, 2016 - JSTOR",
323 | "authors": [
324 | {
325 | "name": "S Nebel",
326 | "link": "https://scholar.google.com/citations?user=_WTrwUwAAAAJ&hl=en&oi=sra",
327 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=_WTrwUwAAAAJ&engine=google_scholar_author&hl=en",
328 | "author_id": "_WTrwUwAAAAJ"
329 | },
330 | {
331 | "name": "S Schneider",
332 | "link": "https://scholar.google.com/citations?user=6Lh4FBMAAAAJ&hl=en&oi=sra",
333 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=6Lh4FBMAAAAJ&engine=google_scholar_author&hl=en",
334 | "author_id": "6Lh4FBMAAAAJ"
335 | },
336 | {
337 | "name": "GD Rey",
338 | "link": "https://scholar.google.com/citations?user=jCilMQoAAAAJ&hl=en&oi=sra",
339 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=jCilMQoAAAAJ&engine=google_scholar_author&hl=en",
340 | "author_id": "jCilMQoAAAAJ"
341 | }
342 | ]
343 | },
344 | "resources": [
345 | {
346 | "title": "researchgate.net",
347 | "file_format": "PDF",
348 | "link": "https://www.researchgate.net/profile/Steve-Nebel/publication/301232882_Mining_Learning_and_Crafting_Scientific_Experiments_A_Literature_Review_on_the_Use_of_Minecraft_in_Education_and_Research/links/570e709008aed4bec6fddad4/Mining-Learning-and-Crafting-Scientific-Experiments-A-Literature-Review-on-the-Use-of-Minecraft-in-Education-and-Research.pdf"
349 | }
350 | ],
351 | "inline_links": {
352 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=61OUs-3P374J",
353 | "cited_by": {
354 | "total": 358,
355 | "link": "https://scholar.google.com/scholar?cites=13753940406839825387&as_sdt=2005&sciodt=0,5&hl=en",
356 | "cites_id": "13753940406839825387",
357 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=13753940406839825387&engine=google_scholar&hl=en"
358 | },
359 | "related_pages_link": "https://scholar.google.com/scholar?q=related:61OUs-3P374J:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
360 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A61OUs-3P374J%3Ascholar.google.com%2F",
361 | "versions": {
362 | "total": 10,
363 | "link": "https://scholar.google.com/scholar?cluster=13753940406839825387&hl=en&as_sdt=0,5",
364 | "cluster_id": "13753940406839825387",
365 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=13753940406839825387&engine=google_scholar&hl=en"
366 | }
367 | }
368 | },
369 | {
370 | "position": 1,
371 | "title": "Minecraft, beyond construction and survival",
372 | "result_id": "_Lo9erywZPUJ",
373 | "type": "Pdf",
374 | "link": "https://stacks.stanford.edu/file/druid:qq694ht6771/WellPlayed-v1n1-11.pdf#page=9",
375 | "snippet": "\" We\u2019ll keep releasing expansions and keep the game alive, but there needs to be some kind of final version that you can point at and say,\u2018I did this!\u2019... I\u2019m not sure why I feel a need to \u2026",
376 | "publication_info": {
377 | "summary": "SC Duncan - 2011 - stacks.stanford.edu",
378 | "authors": [
379 | {
380 | "name": "SC Duncan",
381 | "link": "https://scholar.google.com/citations?user=Ypqv_IEAAAAJ&hl=en&oi=sra",
382 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=Ypqv_IEAAAAJ&engine=google_scholar_author&hl=en",
383 | "author_id": "Ypqv_IEAAAAJ"
384 | }
385 | ]
386 | },
387 | "resources": [
388 | {
389 | "title": "stanford.edu",
390 | "file_format": "PDF",
391 | "link": "https://stacks.stanford.edu/file/druid:qq694ht6771/WellPlayed-v1n1-11.pdf#page=9"
392 | }
393 | ],
394 | "inline_links": {
395 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=_Lo9erywZPUJ",
396 | "cited_by": {
397 | "total": 288,
398 | "link": "https://scholar.google.com/scholar?cites=17682452360514616060&as_sdt=2005&sciodt=0,5&hl=en",
399 | "cites_id": "17682452360514616060",
400 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=17682452360514616060&engine=google_scholar&hl=en"
401 | },
402 | "related_pages_link": "https://scholar.google.com/scholar?q=related:_Lo9erywZPUJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
403 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A_Lo9erywZPUJ%3Ascholar.google.com%2F",
404 | "versions": {
405 | "total": 6,
406 | "link": "https://scholar.google.com/scholar?cluster=17682452360514616060&hl=en&as_sdt=0,5",
407 | "cluster_id": "17682452360514616060",
408 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=17682452360514616060&engine=google_scholar&hl=en"
409 | },
410 | "cached_page_link": "https://scholar.googleusercontent.com/scholar?q=cache:_Lo9erywZPUJ:scholar.google.com/+minecraft&hl=en&as_sdt=0,5"
411 | }
412 | },
413 | {
414 | "position": 2,
415 | "title": "Minecraft as a creative tool: A case study",
416 | "result_id": "wOTRJ8q0KIsJ",
417 | "link": "https://www.igi-global.com/article/minecraft-as-a-creative-tool/116516",
418 | "snippet": "\u2026 environment, Minecraft. In the following case study, the authors explored the use of Minecraft in \u2026 The authors demonstrate that Minecraft offers a unique opportunity for students to display \u2026",
419 | "publication_info": {
420 | "summary": "M Cipollone, CC Schifter, RA Moffat - International Journal of Game \u2026, 2014 - igi-global.com"
421 | },
422 | "resources": [
423 | {
424 | "title": "minecraft.school.nz",
425 | "file_format": "PDF",
426 | "link": "https://www.minecraft.school.nz/uploads/2/9/6/3/2963069/minecraft-as-a-creative-tool_-a-case-study_cipollone2014.pdf"
427 | }
428 | ],
429 | "inline_links": {
430 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=wOTRJ8q0KIsJ",
431 | "cited_by": {
432 | "total": 102,
433 | "link": "https://scholar.google.com/scholar?cites=10027463350684869824&as_sdt=2005&sciodt=0,5&hl=en",
434 | "cites_id": "10027463350684869824",
435 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=10027463350684869824&engine=google_scholar&hl=en"
436 | },
437 | "related_pages_link": "https://scholar.google.com/scholar?q=related:wOTRJ8q0KIsJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
438 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AwOTRJ8q0KIsJ%3Ascholar.google.com%2F",
439 | "versions": {
440 | "total": 9,
441 | "link": "https://scholar.google.com/scholar?cluster=10027463350684869824&hl=en&as_sdt=0,5",
442 | "cluster_id": "10027463350684869824",
443 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=10027463350684869824&engine=google_scholar&hl=en"
444 | }
445 | }
446 | },
447 | {
448 | "position": 3,
449 | "title": "Learning mathematics through Minecraft",
450 | "result_id": "Hh4p5NaYNu0J",
451 | "link": "https://pubs.nctm.org/abstract/journals/tcm/21/1/article-p56.xml",
452 | "snippet": "\u2026 Minecraft to explore area and perimeter. First, the teacher reviewed the definition of perimeter and area. Using a class set of iPods with Minecraft \u2026 Minecraft forms a medium to explore \u2026",
453 | "publication_info": {
454 | "summary": "B Bos, L Wilder, M Cook, R O'Donnell - Teaching Children \u2026, 2014 - pubs.nctm.org",
455 | "authors": [
456 | {
457 | "name": "B Bos",
458 | "link": "https://scholar.google.com/citations?user=DfdRg-8AAAAJ&hl=en&oi=sra",
459 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=DfdRg-8AAAAJ&engine=google_scholar_author&hl=en",
460 | "author_id": "DfdRg-8AAAAJ"
461 | }
462 | ]
463 | },
464 | "resources": [
465 | {
466 | "title": "researchgate.net",
467 | "file_format": "PDF",
468 | "link": "https://www.researchgate.net/profile/Beth-Bos/publication/267507986_Learning_mathematics_through_Minecraft_Authors/links/545103b80cf249aa53dc8eb2/Learning-mathematics-through-Minecraft-Authors.pdf"
469 | }
470 | ],
471 | "inline_links": {
472 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=Hh4p5NaYNu0J",
473 | "cited_by": {
474 | "total": 120,
475 | "link": "https://scholar.google.com/scholar?cites=17093017484449619486&as_sdt=2005&sciodt=0,5&hl=en",
476 | "cites_id": "17093017484449619486",
477 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=17093017484449619486&engine=google_scholar&hl=en"
478 | },
479 | "related_pages_link": "https://scholar.google.com/scholar?q=related:Hh4p5NaYNu0J:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
480 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AHh4p5NaYNu0J%3Ascholar.google.com%2F",
481 | "versions": {
482 | "total": 8,
483 | "link": "https://scholar.google.com/scholar?cluster=17093017484449619486&hl=en&as_sdt=0,5",
484 | "cluster_id": "17093017484449619486",
485 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=17093017484449619486&engine=google_scholar&hl=en"
486 | }
487 | }
488 | },
489 | {
490 | "position": 4,
491 | "title": "A deep hierarchical approach to lifelong learning in minecraft",
492 | "result_id": "a_Er9i3hDtUJ",
493 | "link": "https://ojs.aaai.org/index.php/AAAI/article/view/10744",
494 | "snippet": "We propose a lifelong learning system that has the ability to reuse and transfer knowledge from one task to another while efficiently retaining the previously learned knowledge-base. \u2026",
495 | "publication_info": {
496 | "summary": "C Tessler, S Givony, T Zahavy, D Mankowitz\u2026 - Proceedings of the \u2026, 2017 - ojs.aaai.org",
497 | "authors": [
498 | {
499 | "name": "C Tessler",
500 | "link": "https://scholar.google.com/citations?user=7eLKa3IAAAAJ&hl=en&oi=sra",
501 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=7eLKa3IAAAAJ&engine=google_scholar_author&hl=en",
502 | "author_id": "7eLKa3IAAAAJ"
503 | },
504 | {
505 | "name": "S Givony",
506 | "link": "https://scholar.google.com/citations?user=nlVsO4YAAAAJ&hl=en&oi=sra",
507 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=nlVsO4YAAAAJ&engine=google_scholar_author&hl=en",
508 | "author_id": "nlVsO4YAAAAJ"
509 | },
510 | {
511 | "name": "T Zahavy",
512 | "link": "https://scholar.google.com/citations?user=9dXN6cMAAAAJ&hl=en&oi=sra",
513 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=9dXN6cMAAAAJ&engine=google_scholar_author&hl=en",
514 | "author_id": "9dXN6cMAAAAJ"
515 | },
516 | {
517 | "name": "D Mankowitz",
518 | "link": "https://scholar.google.com/citations?user=v84tWxsAAAAJ&hl=en&oi=sra",
519 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=v84tWxsAAAAJ&engine=google_scholar_author&hl=en",
520 | "author_id": "v84tWxsAAAAJ"
521 | }
522 | ]
523 | },
524 | "resources": [
525 | {
526 | "title": "aaai.org",
527 | "file_format": "PDF",
528 | "link": "https://ojs.aaai.org/index.php/AAAI/article/view/10744/10603"
529 | }
530 | ],
531 | "inline_links": {
532 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=a_Er9i3hDtUJ",
533 | "cited_by": {
534 | "total": 364,
535 | "link": "https://scholar.google.com/scholar?cites=15352455767272452459&as_sdt=2005&sciodt=0,5&hl=en",
536 | "cites_id": "15352455767272452459",
537 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=15352455767272452459&engine=google_scholar&hl=en"
538 | },
539 | "related_pages_link": "https://scholar.google.com/scholar?q=related:a_Er9i3hDtUJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
540 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3Aa_Er9i3hDtUJ%3Ascholar.google.com%2F",
541 | "versions": {
542 | "total": 13,
543 | "link": "https://scholar.google.com/scholar?cluster=15352455767272452459&hl=en&as_sdt=0,5",
544 | "cluster_id": "15352455767272452459",
545 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=15352455767272452459&engine=google_scholar&hl=en"
546 | },
547 | "cached_page_link": "https://scholar.googleusercontent.com/scholar?q=cache:a_Er9i3hDtUJ:scholar.google.com/+minecraft&hl=en&as_sdt=0,5"
548 | }
549 | },
550 | {
551 | "position": 5,
552 | "title": "Teaching scientific concepts using a virtual world: Minecraft.",
553 | "result_id": "Oh88DuoTaLYJ",
554 | "link": "https://search.informit.org/doi/abs/10.3316/aeipt.195598",
555 | "snippet": "Minecraft is a multiplayer sandbox video game based in a virtual world modelled on the real \u2026 of Minecraft lends itself to the teaching of various academic subjects. Minecraft also has a \u2026",
556 | "publication_info": {
557 | "summary": "D Short - Teaching science, 2012 - search.informit.org",
558 | "authors": [
559 | {
560 | "name": "D Short",
561 | "link": "https://scholar.google.com/citations?user=ec_1ZmMAAAAJ&hl=en&oi=sra",
562 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=ec_1ZmMAAAAJ&engine=google_scholar_author&hl=en",
563 | "author_id": "ec_1ZmMAAAAJ"
564 | }
565 | ]
566 | },
567 | "resources": [
568 | {
569 | "title": "academia.edu",
570 | "file_format": "PDF",
571 | "link": "https://www.academia.edu/download/31153502/Short-2012-MC-Color-Version.pdf"
572 | }
573 | ],
574 | "inline_links": {
575 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=Oh88DuoTaLYJ",
576 | "cited_by": {
577 | "total": 274,
578 | "link": "https://scholar.google.com/scholar?cites=13143777408462888762&as_sdt=2005&sciodt=0,5&hl=en",
579 | "cites_id": "13143777408462888762",
580 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=13143777408462888762&engine=google_scholar&hl=en"
581 | },
582 | "related_pages_link": "https://scholar.google.com/scholar?q=related:Oh88DuoTaLYJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
583 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AOh88DuoTaLYJ%3Ascholar.google.com%2F",
584 | "versions": {
585 | "total": 8,
586 | "link": "https://scholar.google.com/scholar?cluster=13143777408462888762&hl=en&as_sdt=0,5",
587 | "cluster_id": "13143777408462888762",
588 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=13143777408462888762&engine=google_scholar&hl=en"
589 | }
590 | }
591 | },
592 | {
593 | "position": 6,
594 | "title": "Investigating the role of Minecraft in educational learning environments",
595 | "result_id": "6RcOZdlG3CcJ",
596 | "link": "https://www.tandfonline.com/doi/abs/10.1080/09523987.2016.1254877",
597 | "snippet": "\u2026 This research paper identifies the way in which Minecraft Edu can be used to contribute to the teaching
598 | and learning of secondary students via a multiple case research study. Minecraft \u2026",
599 | "publication_info": {
600 | "summary": "N Callaghan - Educational Media International, 2016 - Taylor & Francis"
601 | },
602 | "inline_links": {
603 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=6RcOZdlG3CcJ",
604 | "cited_by": {
605 | "total": 95,
606 | "link": "https://scholar.google.com/scholar?cites=2872248561872803817&as_sdt=2005&sciodt=0,5&hl=en",
607 | "cites_id": "2872248561872803817",
608 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=2872248561872803817&engine=google_scholar&hl=en"
609 | },
610 | "related_pages_link": "https://scholar.google.com/scholar?q=related:6RcOZdlG3CcJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
611 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A6RcOZdlG3CcJ%3Ascholar.google.com%2F",
612 | "versions": {
613 | "total": 3,
614 | "link": "https://scholar.google.com/scholar?cluster=2872248561872803817&hl=en&as_sdt=0,5",
615 | "cluster_id": "2872248561872803817",
616 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=2872248561872803817&engine=google_scholar&hl=en"
617 | }
618 | }
619 | },
620 | {
621 | "position": 7,
622 | "title": "Maker culture and Minecraft: implications for the future of learning",
623 | "result_id": "h27IfZ5va2YJ",
624 | "link": "https://www.tandfonline.com/doi/abs/10.1080/09523987.2015.1075103",
625 | "snippet": "\u2026 be best to subscribe to for gathering information on Minecraft maker culture. From there, we \u2026 the
626 | Minecraft videos that we are studying \u201ccreators\u201d due to the culture of the Minecraft video \u2026",
627 | "publication_info": {
628 | "summary": "DJ Niemeyer, HR Gerber - Educational Media International, 2015 - Taylor & Francis",
629 | "authors": [
630 | {
631 | "name": "DJ Niemeyer",
632 | "link": "https://scholar.google.com/citations?user=iEZOnzQAAAAJ&hl=en&oi=sra",
633 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=iEZOnzQAAAAJ&engine=google_scholar_author&hl=en",
634 | "author_id": "iEZOnzQAAAAJ"
635 | },
636 | {
637 | "name": "HR Gerber",
638 | "link": "https://scholar.google.com/citations?user=DwyCTMUAAAAJ&hl=en&oi=sra",
639 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=DwyCTMUAAAAJ&engine=google_scholar_author&hl=en",
640 | "author_id": "DwyCTMUAAAAJ"
641 | }
642 | ]
643 | },
644 | "resources": [
645 | {
646 | "title": "publicservicesalliance.org",
647 | "file_format": "PDF",
648 | "link": "http://publicservicesalliance.org/wp-content/uploads/2016/06/Maker_culture_and_Minecraft_implications.pdf"
649 | }
650 | ],
651 | "inline_links": {
652 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=h27IfZ5va2YJ",
653 | "cited_by": {
654 | "total": 114,
655 | "link": "https://scholar.google.com/scholar?cites=7380115140882493063&as_sdt=2005&sciodt=0,5&hl=en",
656 | "cites_id": "7380115140882493063",
657 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=7380115140882493063&engine=google_scholar&hl=en"
658 | },
659 | "related_pages_link": "https://scholar.google.com/scholar?q=related:h27IfZ5va2YJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
660 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3Ah27IfZ5va2YJ%3Ascholar.google.com%2F",
661 | "versions": {
662 | "total": 8,
663 | "link": "https://scholar.google.com/scholar?cluster=7380115140882493063&hl=en&as_sdt=0,5",
664 | "cluster_id": "7380115140882493063",
665 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=7380115140882493063&engine=google_scholar&hl=en"
666 | }
667 | }
668 | },
669 | {
670 | "position": 8,
671 | "title": "Control of memory, active perception, and action in minecraft",
672 | "result_id": "-5uM8qRUviwJ",
673 | "link": "http://proceedings.mlr.press/v48/oh16.html",
674 | "snippet": "In this paper, we introduce a new set of reinforcement learning (RL) tasks in Minecraft (a flexible 3D world).
675 | We then use these tasks to systematically compare and contrast existing \u2026",
676 | "publication_info": {
677 | "summary": "J Oh, V Chockalingam, H Lee - \u2026 conference on machine \u2026, 2016 - proceedings.mlr.press",
678 | "authors": [
679 | {
680 | "name": "J Oh",
681 | "link": "https://scholar.google.com/citations?user=LNUeOu4AAAAJ&hl=en&oi=sra",
682 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=LNUeOu4AAAAJ&engine=google_scholar_author&hl=en",
683 | "author_id": "LNUeOu4AAAAJ"
684 | },
685 | {
686 | "name": "V Chockalingam",
687 | "link": "https://scholar.google.com/citations?user=CM2UkioAAAAJ&hl=en&oi=sra",
688 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=CM2UkioAAAAJ&engine=google_scholar_author&hl=en",
689 | "author_id": "CM2UkioAAAAJ"
690 | },
691 | {
692 | "name": "H Lee",
693 | "link": "https://scholar.google.com/citations?user=fmSHtE8AAAAJ&hl=en&oi=sra",
694 | "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=fmSHtE8AAAAJ&engine=google_scholar_author&hl=en",
695 | "author_id": "fmSHtE8AAAAJ"
696 | }
697 | ]
698 | },
699 | "resources": [
700 | {
701 | "title": "mlr.press",
702 | "file_format": "PDF",
703 | "link": "http://proceedings.mlr.press/v48/oh16.pdf"
704 | }
705 | ],
706 | "inline_links": {
707 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=-5uM8qRUviwJ",
708 | "cited_by": {
709 | "total": 317,
710 | "link": "https://scholar.google.com/scholar?cites=3224107450664524795&as_sdt=2005&sciodt=0,5&hl=en",
711 | "cites_id": "3224107450664524795",
712 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=3224107450664524795&engine=google_scholar&hl=en"
713 | },
714 | "related_pages_link": "https://scholar.google.com/scholar?q=related:-5uM8qRUviwJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
715 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3A-5uM8qRUviwJ%3Ascholar.google.com%2F",
716 | "versions": {
717 | "total": 7,
718 | "link": "https://scholar.google.com/scholar?cluster=3224107450664524795&hl=en&as_sdt=0,5",
719 | "cluster_id": "3224107450664524795",
720 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=3224107450664524795&engine=google_scholar&hl=en"
721 | },
722 | "cached_page_link": "http://scholar.googleusercontent.com/scholar?q=cache:-5uM8qRUviwJ:scholar.google.com/+minecraft&hl=en&as_sdt=0,5"
723 | }
724 | },
725 | {
726 | "position": 9,
727 | "title": "Minecraft as a teaching tool: One case study",
728 | "result_id": "yItxbN8DVXYJ",
729 | "link": "https://www.learntechlib.org/p/48540/",
730 | "snippet": "We know games help students gain skills and insights in many ways, and that games are engaging. With new online MMOPRPG games, like Minecraft, what we do not know is what \u2026",
731 | "publication_info": {
732 | "summary": "C Schifter, M Cipollone - Society for Information Technology & \u2026, 2013 - learntechlib.org"
733 | },
734 | "inline_links": {
735 | "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=yItxbN8DVXYJ",
736 | "cited_by": {
737 | "total": 55,
738 | "link": "https://scholar.google.com/scholar?cites=8526725727627873224&as_sdt=2005&sciodt=0,5&hl=en",
739 | "cites_id": "8526725727627873224",
740 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=8526725727627873224&engine=google_scholar&hl=en"
741 | },
742 | "related_pages_link": "https://scholar.google.com/scholar?q=related:yItxbN8DVXYJ:scholar.google.com/&scioq=minecraft&hl=en&as_sdt=0,5",
743 | "serpapi_related_pages_link": "https://serpapi.com/search.json?as_sdt=0%2C5&engine=google_scholar&hl=en&q=related%3AyItxbN8DVXYJ%3Ascholar.google.com%2F",
744 | "versions": {
745 | "total": 2,
746 | "link": "https://scholar.google.com/scholar?cluster=8526725727627873224&hl=en&as_sdt=0,5",
747 | "cluster_id": "8526725727627873224",
748 | "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=8526725727627873224&engine=google_scholar&hl=en"
749 | }
750 | }
751 | }
752 | ]
753 | ```
754 |
755 |
756 |
757 | ## ✍Contributing
758 |
759 | Feel free to open an issue for:
760 | - a bug you found.
761 | - something that isn't working.
762 | - a feature you'd like added.
763 | - anything else related to Google Scholar.
764 |
765 | If you feel comfortable opening a PR, feel free to do so. The guidelines are simple: conventional commits + code kept as simple as possible, without unnecessary complexity.
766 |
767 | There's a `.gitpod.yaml` config if you're using [Gitpod](https://www.gitpod.io/).
768 |
769 | ## 📜Licence
770 |
771 | The `scrape-google-scholar-py` repository is licensed under the MIT license.
772 |
--------------------------------------------------------------------------------
/example_usage.py:
--------------------------------------------------------------------------------
1 | # each function has documentation with an example "usage" script after the function arguments
2 | from google_scholar_py import CustomGoogleScholarOrganic
3 | from google_scholar_py import SerpApiGoogleScholarOrganic
4 | from google_scholar_py import CustomGoogleScholarTopPublicationArticle
5 |
6 | import json
7 |
8 | # TODO: add more examples
9 | custom_parser_get_organic_results = CustomGoogleScholarOrganic().scrape_google_scholar_organic_results(
10 | query='blizzard',
11 | pagination=False,
12 | save_to_csv=False,
13 | save_to_json=False
14 | )
15 |
16 | top_publication_citation = CustomGoogleScholarTopPublicationArticle().scrape_google_scholar_top_publication_articles(
17 | journal_publications_link='https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=TdhLrHqKTh8J.2022',
18 | pagination=True,
19 | save_to_csv=False,
20 | save_to_json=False
21 | )
22 |
23 | serpapi_parser_get_organic_results = SerpApiGoogleScholarOrganic().scrape_google_scholar_organic_results(
24 | query='blizzard',
25 | api_key='your-serpapi-api-key', # https://serpapi.com/manage-api-key
26 | lang='en',
27 | pagination=False,
28 | )
29 |
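# one more example for the profiles page: a sketch assuming the same
# CustomGoogleScholarProfiles API shown in the README
from google_scholar_py import CustomGoogleScholarProfiles

custom_parser_get_profile_results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(
    query='blizzard',
    pagination=False,
    save_to_csv=False,
    save_to_json=False
)
print(json.dumps(custom_parser_get_profile_results, indent=2, ensure_ascii=False))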
30 |
31 | print(json.dumps(custom_parser_get_organic_results, indent=2, ensure_ascii=False))
32 | print(json.dumps(serpapi_parser_get_organic_results, indent=2, ensure_ascii=False))
33 | print(json.dumps(top_publication_citation, indent=2, ensure_ascii=False))
--------------------------------------------------------------------------------
/google_scholar_py/__init__.py:
--------------------------------------------------------------------------------
1 | from .custom_backend.organic_search import CustomGoogleScholarOrganic
2 | from .custom_backend.profiles_results import CustomGoogleScholarProfiles
3 | from .custom_backend.author_info_all_articles import CustomGoogleScholarAuthor
4 | from .custom_backend.top_mandates_metrics import CustomGoogleScholarTopMandates
5 | from .custom_backend.top_publications_metrics import CustomGoogleScholarTopPublications
6 | from .custom_backend.top_publications_article import CustomGoogleScholarTopPublicationArticle
7 | from .custom_backend.top_publications_article_citation import CustomGoogleScholarTopPublicationArticleCitation
8 |
9 | # serpapi backend
10 | from .serpapi_backend.organic_results import SerpApiGoogleScholarOrganic
11 | from .serpapi_backend.profile_results import SerpApiGoogleScholarProfiles
12 | from .serpapi_backend.organic_cite_results import SerpApiGoogleScholarOrganicCite
13 | from .serpapi_backend.author_results import SerpApiGoogleScholarAuthor
--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/author_info_all_articles.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium_stealth import stealth
3 | from selenium.webdriver.chrome.service import Service
4 | from webdriver_manager.chrome import ChromeDriverManager
5 | from selectolax.lexbor import LexborHTMLParser
6 | from typing import List, Union, Dict
7 | from pathlib import Path
8 |
9 |
10 | class CustomGoogleScholarAuthor:
11 | def __init__(self) -> None:
12 | pass
13 |
14 |
15 | def scrape_google_scholar_author_data(
16 | self,
17 | user_id: str,
18 | parse_articles: bool = False,
19 | article_pagination: bool = False
20 | ) -> Dict[str, List[Union[str, int, None]]]:
21 | '''
22 | Extracts data from Google Scholar Author profile page:
23 | - Info about the author itself
24 | - Co-authors: name, link, affiliation
25 |         - Articles: title, link, authors, publication, cited by count, year.
26 |         - Article count: the first 100 if pagination is False, or all articles if pagination is True.
27 |
28 | Arguments:
29 | - user_id: str. User ID from Google Scholar profile located in the URL.
30 |         - parse_articles: True or False. If True, extracts the first 100 articles. Default False.
31 |         - article_pagination: True or False. If True, extracts beyond the first 100 articles.
32 |
33 | Usage:
34 |
35 | from google_scholar_py import CustomGoogleScholarAuthor
36 |
37 | parser = CustomGoogleScholarAuthor()
38 | data = parser.scrape_google_scholar_author_data(
39 | user_id='nHhtvqkAAAAJ',
40 | parse_articles=True,
41 | article_pagination=True
42 | )
43 | print(json.dumps(data, indent=2))
44 |
45 | print(data['info']) # author info
46 | print(data['co-authors'])
47 |
48 | for article in data['articles']:
49 | print(article['title'])
50 | print(article['cited_by_count'])
51 | ...
52 | '''
53 |
54 | # selenium stealth
55 | options = webdriver.ChromeOptions()
56 | options.add_argument('--headless')
57 | options.add_argument('--no-sandbox')
58 | options.add_argument('--disable-dev-shm-usage')
59 |
60 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
61 | options.add_experimental_option('useAutomationExtension', False)
62 |
63 | service = Service(ChromeDriverManager().install())
64 | driver = webdriver.Chrome(service=service, options=options)
65 |
66 | stealth(driver,
67 | languages=['en-US', 'en'],
68 | vendor='Google Inc.',
69 | platform='Win32',
70 | webgl_vendor='Intel Inc.',
71 | renderer='Intel Iris OpenGL Engine',
72 | fix_hairline=True,
73 | )
74 |
75 | driver.get(f'https://scholar.google.com/citations?user={user_id}&hl=en&gl=us&pagesize=100')
76 | parser = LexborHTMLParser(driver.page_source)
77 |
78 | profile_info = {
79 | 'info': {},
80 | 'co-authors': [],
81 |             'articles': [] # articles are appended below when parse_articles is True
82 | }
83 |
84 | profile_info['info']['name'] = parser.css_first('#gsc_prf_in').text()
85 | profile_info['info']['affiliations'] = parser.css_first('.gsc_prf_ila').text()
86 | profile_info['info']['email'] = parser.css_first('#gsc_prf_ivh').text()
87 | profile_info['info']['interests'] = [interest.text() for interest in parser.css('#gsc_prf_int .gs_ibl')]
88 |
89 | for co_author in parser.css('.gsc_rsb_aa'):
90 | profile_info['co-authors'].append({
91 | 'name': co_author.css_first('.gsc_rsb_a_desc a').text(),
92 | 'profile_link': f"https://scholar.google.com{co_author.css_first('.gsc_rsb_a_desc a').attrs['href']}",
93 | 'affiliation': co_author.css_first('.gsc_rsb_a_ext').text(),
94 | })
95 |
96 |         # extracts only the first 100 articles, WITHOUT pagination
97 | if parse_articles:
98 | # TODO: make a separate function to extract articles
99 | for index, article in enumerate(parser.css('.gsc_a_tr'), start=1):
100 | try:
101 | article_title = article.css_first('.gsc_a_at').text()
102 | except: article_title = None
103 |
104 | try:
105 | article_link = f"https://scholar.google.com{article.css_first('.gsc_a_at').attrs['href']}"
106 | except: article_link = None
107 |
108 | try:
109 | if ',' in article.css_first('.gsc_a_at+ .gs_gray').text():
110 | article_authors: List[str] = article.css_first('.gsc_a_at+ .gs_gray').text().split(', ') # list of authors
111 |                     else: article_authors = article.css_first('.gsc_a_at+ .gs_gray').text() # single author
112 | except: article_authors = None
113 |
114 | try:
115 | article_publication = article.css_first('.gs_gray+ .gs_gray').text()
116 | except: article_publication = None
117 |
118 | try:
119 | cited_by_count = article.css_first('.gsc_a_ac').text()
120 | except: cited_by_count = None
121 |
122 | try:
123 | publication_year = article.css_first('.gsc_a_hc').text()
124 | except: publication_year = None
125 |
126 | profile_info['articles'].append({
127 | 'title': article_title,
128 | 'link': article_link,
129 | 'authors': article_authors,
130 | 'publication': article_publication if article_publication else None,
131 | 'publication_year': int(publication_year) if publication_year else publication_year or None, # int value or None or empty str
132 | 'cited_by_count': int(cited_by_count) if cited_by_count else cited_by_count or None # int value or None or empty str
133 | })
134 | elif parse_articles is False:
135 | profile_info.pop('articles')
136 |
137 | page_num = 0
138 |
139 | # extracts all articles
140 | if parse_articles and article_pagination:
141 | while True:
142 | driver.get(f'https://scholar.google.com/citations?user={user_id}&hl=en&gl=us&cstart={page_num}&pagesize=100')
143 | parser = LexborHTMLParser(driver.page_source)
144 |
145 | for article in parser.css('.gsc_a_tr'):
146 | try:
147 | article_title = article.css_first('.gsc_a_at').text()
148 | except: article_title = None
149 |
150 | try:
151 | article_link = f"https://scholar.google.com{article.css_first('.gsc_a_at').attrs['href']}"
152 | except: article_link = None
153 |
154 | try:
155 | if ',' in article.css_first('.gsc_a_at+ .gs_gray').text():
156 | article_authors: List[str] = article.css_first('.gsc_a_at+ .gs_gray').text().split(', ') # list of authors
157 |                         else: article_authors = article.css_first('.gsc_a_at+ .gs_gray').text() # single author
158 | except: article_authors = None
159 |
160 | try:
161 | article_publication = article.css_first('.gs_gray+ .gs_gray').text()
162 | except: article_publication = None
163 |
164 | try:
165 | cited_by_count = article.css_first('.gsc_a_ac').text()
166 | except: cited_by_count = None
167 |
168 | try:
169 | publication_year = article.css_first('.gsc_a_hc').text()
170 | except: publication_year = None
171 |
172 | profile_info['articles'].append({
173 | 'title': article_title,
174 | 'link': article_link,
175 | 'authors': article_authors,
176 | 'publication': article_publication if article_publication else None,
177 | 'publication_year': int(publication_year) if publication_year else publication_year or None, # int value or None or empty str
178 | 'cited_by_count': int(cited_by_count) if cited_by_count else cited_by_count or None # int value or None or empty str
179 | })
180 |
181 | if parser.css_first('.gsc_a_e'):
182 | break
183 | else:
184 | page_num += 100 # paginate to the next page
185 |
186 |         # remove the articles key if the user doesn't want to extract it
187 |         elif article_pagination and parse_articles is False:
188 |             profile_info.pop('articles', None) # may already have been removed above; avoid a KeyError
189 |
190 | return profile_info
191 |
--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/cite_results.py:
--------------------------------------------------------------------------------
1 | #TODO: support/refactor CITE extraction. This is not yet implemented.
2 |
3 | from parsel import Selector
4 | import requests
5 |
6 | params = {
7 | 'q': 'blizzard', # search query
8 | 'hl': 'en' # language of the search
9 | }
10 |
11 |
12 | headers = {
13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
14 | 'accept-language': 'en-US,en',
15 | 'referer': f"https://scholar.google.com/scholar?hl={params['hl']}&q={params['q']}"
16 | }
17 |
18 |
19 | def parsel_get_cite_ids():
20 | html = requests.get('https://scholar.google.com/scholar', params=params, headers=headers)
21 | soup = Selector(text=html.text)
22 |
23 | # returns a list of publication ID's -> U8bh6Ca9uwQJ
24 | return soup.css('.gs_r.gs_or.gs_scl::attr(data-cid)').getall()
25 |
26 | def parsel_scrape_cite_results():
27 | citations = []
28 |
29 | for cite_id in parsel_get_cite_ids():
30 | html = requests.get(f'https://scholar.google.com/scholar?output=cite&q=info:{cite_id}:scholar.google.com', headers=headers)
31 | selector = Selector(text=html.text)
32 |
33 | # might be issues in the future with extracting data from the table
34 | if selector.css('#gs_citt').get():
35 | for result in selector.css('tr'):
36 | institution = result.xpath('th/text()').get()
37 | citation = result.xpath('td div/text()').get()
38 |
39 | citations.append({'institution': institution, 'citations': citation})
40 |
41 | return citations
42 |
--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/google_scholar_cited_by_public_access_author.py:
--------------------------------------------------------------------------------
1 | from parsel import Selector
2 | import requests, json
3 |
4 | #TODO: add cited by graph extraction to author script
5 |
6 | def parsel_scrape_author_cited_by_graph():
7 | params = {
8 | 'user': '_xwYD2sAAAAJ', # user-id
9 | 'hl': 'en' # language
10 | }
11 |
12 | headers = {
13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
14 | }
15 |
16 | data = {
17 | 'cited_by': [],
18 | 'graph': []
19 | }
20 |
21 | html = requests.get('https://scholar.google.com/citations', params=params, headers=headers, timeout=30)
22 | selector = Selector(text=html.text)
23 |
24 | since_year = selector.css('.gsc_rsb_sth~ .gsc_rsb_sth+ .gsc_rsb_sth::text').get().lower().replace(' ', '_')
25 |
26 | for cited_by_public_access in selector.css('.gsc_rsb'):
27 | data['cited_by'].append({
28 | 'citations_all': cited_by_public_access.css('tr:nth-child(1) .gsc_rsb_sc1+ .gsc_rsb_std::text').get(),
29 | f'citations_since_{since_year}': cited_by_public_access.css('tr:nth-child(1) .gsc_rsb_std+ .gsc_rsb_std::text').get(),
30 | 'h_index_all': cited_by_public_access.css('tr:nth-child(2) .gsc_rsb_sc1+ .gsc_rsb_std::text').get(),
31 | f'h_index_since_{since_year}': cited_by_public_access.css('tr:nth-child(2) .gsc_rsb_std+ .gsc_rsb_std::text').get(),
32 | 'i10_index_all': cited_by_public_access.css('tr~ tr+ tr .gsc_rsb_sc1+ .gsc_rsb_std::text').get(),
33 | f'i10_index_since_{since_year}': cited_by_public_access.css('tr~ tr+ tr .gsc_rsb_std+ .gsc_rsb_std::text').get(),
34 | 'articles': {
35 | 'available': int(cited_by_public_access.css('.gsc_rsb_m_a:nth-child(1) span::text').get().split(' ')[0]), # to get only digit value
36 | 'not_available': int(cited_by_public_access.css('.gsc_rsb_m_na div::text').get().split(' ')[0]), # to get only digit value
37 | },
38 | 'articles_link': f"https://scholar.google.com{cited_by_public_access.css('#gsc_lwp_mndt_lnk::attr(href)').get()}"
39 | })
40 |
41 |     for graph_year, graph_year_value in zip(selector.css('.gsc_g_t::text'), selector.css('.gsc_g_al::text')):
42 |         data['graph'].append({
43 |             'year': graph_year.get(),
44 |             'value': int(graph_year_value.get())
45 |         })
46 |     return data
47 | if __name__ == '__main__':
48 | print(json.dumps(parsel_scrape_author_cited_by_graph(), indent=2, ensure_ascii=False))
49 |
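
A small follow-up sketch, assuming pandas is available, that turns the 'graph' part of the returned dictionary (citations per year) into a DataFrame; the import path simply mirrors this file's location:

    import pandas as pd
    from google_scholar_py.custom_backend.google_scholar_cited_by_public_access_author import parsel_scrape_author_cited_by_graph

    data = parsel_scrape_author_cited_by_graph()

    # 'graph' is a list of {'year': ..., 'value': ...} dicts
    citations_per_year = pd.DataFrame(data['graph'])
    print(citations_per_year)
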
--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/organic_search.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium_stealth import stealth
3 | from selenium.webdriver.chrome.service import Service
4 | from webdriver_manager.chrome import ChromeDriverManager
5 | from selectolax.lexbor import LexborHTMLParser
6 | from typing import List, Dict, Callable
7 | import time, random, re
8 | import pandas as pd
9 | from pathlib import Path
10 |
11 |
12 | class CustomGoogleScholarOrganic:
13 | def __init__(self) -> None:
14 | pass
15 |
16 |
17 | def parse(self, parser: Callable, organic_results_data: Callable):
18 |         '''
19 |         Arguments:
20 |         - parser: Lexbor parser from the scrape_google_scholar_organic_results() function.
21 |         - organic_results_data: list to append data to. The list originates in the scrape_google_scholar_organic_results() function.
22 | 
23 |         This function parses data from the Google Scholar organic results page and appends it to a list.
24 | 
25 |         It's used by scrape_google_scholar_organic_results().
26 | 
27 |         It returns nothing; it only appends parsed results to the `organic_results_data` list
28 |         that is defined in the scrape_google_scholar_organic_results() function.
29 |         '''
30 |
31 | for result in parser.css('.gs_r.gs_or.gs_scl'):
32 | try:
33 | title: str = result.css_first('.gs_rt').text()
34 | except: title = None
35 |
36 | try:
37 | title_link: str = result.css_first('.gs_rt a').attrs['href']
38 | except: title_link = None
39 |
40 | try:
41 | publication_info: str = result.css_first('.gs_a').text()
42 | except: publication_info = None
43 |
44 | try:
45 | snippet: str = result.css_first('.gs_rs').text()
46 | except: snippet = None
47 |
48 | try:
49 | # if Cited by is present in inline links, it will be extracted
50 | cited_by_link = ''.join([link.attrs['href'] for link in result.css('.gs_ri .gs_fl a') if 'Cited by' in link.text()])
51 | except: cited_by_link = None
52 |
53 | try:
54 | # if Cited by is present in inline links, it will be extracted and type cast it to integer
55 | cited_by_count = int(''.join([re.search(r'\d+', link.text()).group() for link in result.css('.gs_ri .gs_fl a') if 'Cited by' in link.text()]))
56 | except: cited_by_count = None
57 |
58 | try:
59 | pdf_file: str = result.css_first('.gs_or_ggsm a').attrs['href']
60 | except: pdf_file = None
61 |
62 | organic_results_data.append({
63 | 'title': title,
64 | 'title_link': title_link,
65 | 'publication_info': publication_info,
66 | 'snippet': snippet if snippet else None,
67 | 'cited_by_link': f'https://scholar.google.com{cited_by_link}' if cited_by_link else None,
68 | 'cited_by_count': cited_by_count if cited_by_count else None,
69 | 'pdf_file': pdf_file
70 | })
71 |
72 | #TODO: add lang support. https://serpapi.com/google-languages
73 | def scrape_google_scholar_organic_results(
74 | self,
75 | query: str,
76 | pagination: bool = False,
77 | save_to_csv: bool = False,
78 | save_to_json: bool = False
79 | ) -> List[Dict[str, str]]:
80 | '''
81 |         Extracts data from Google Scholar organic results page:
82 | - title: str
83 | - title_link: str
84 | - publication_info: str
85 | - snippet: str
86 | - cited_by_link: str
87 | - cited_by_count: int
88 | - pdf_file: str
89 |
90 | Arguments:
91 | - query: str. Search query.
92 | - pagination: bool. Enables or disables pagination. Default is False.
93 |         - save_to_csv: bool. True or False. Default is False.
94 |         - save_to_json: bool. True or False. Default is False.
95 |
96 | Usage:
97 |
98 | from google_scholar_py import CustomGoogleScholarOrganic
99 |
100 | parser = CustomGoogleScholarOrganic()
101 | data = parser.scrape_google_scholar_organic_results(
102 | query='blizzard',
103 | pagination=False,
104 | save_to_csv=True
105 | )
106 |
107 | for organic_result in data:
108 | print(organic_result['title'])
109 | print(organic_result['pdf_file'])
110 | '''
111 |
112 | # selenium stealth
113 | options = webdriver.ChromeOptions()
114 | options.add_argument('--headless')
115 | options.add_argument('--no-sandbox')
116 | options.add_argument('--disable-dev-shm-usage')
117 |
118 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
119 | options.add_experimental_option('useAutomationExtension', False)
120 |
121 | service = Service(ChromeDriverManager().install())
122 | driver = webdriver.Chrome(service=service, options=options)
123 |
124 | stealth(driver,
125 | languages=['en-US', 'en'],
126 | vendor='Google Inc.',
127 | platform='Win32',
128 | webgl_vendor='Intel Inc.',
129 | renderer='Intel Iris OpenGL Engine',
130 | fix_hairline=True,
131 | )
132 |
133 | page_num = 0
134 | organic_results_data = []
135 |
136 | # parse all pages
137 | if pagination:
138 | while True:
139 | # parse all pages
140 | driver.get(f'https://scholar.google.com/scholar?q={query}&hl=en&gl=us&start={page_num}')
141 | parser = LexborHTMLParser(driver.page_source)
142 |
143 | self.parse(parser=parser, organic_results_data=organic_results_data)
144 |
145 | # pagination
146 | if parser.css_first('.gs_ico_nav_next'): # checks for the "Next" page button
147 | page_num += 10 # paginate to the next page
148 | time.sleep(random.randint(1, 3)) # sleep between paginations
149 | else:
150 | break
151 | else:
152 | # parse first page only
153 | driver.get(f'https://scholar.google.com/scholar?q={query}&hl=en&gl=us&start={page_num}')
154 | parser = LexborHTMLParser(driver.page_source)
155 |
156 | self.parse(parser=parser, organic_results_data=organic_results_data)
157 |
158 | if save_to_csv:
159 | pd.DataFrame(data=organic_results_data).to_csv('google_scholar_organic_results_data.csv',
160 | index=False, encoding='utf-8')
161 | if save_to_json:
162 | pd.DataFrame(data=organic_results_data).to_json('google_scholar_organic_results_data.json',
163 | orient='records')
164 | driver.quit()
165 |
166 | return organic_results_data
167 |
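
A short sketch of post-filtering the returned results, based on the keys appended in parse() above; the query value is illustrative:

    from google_scholar_py import CustomGoogleScholarOrganic

    data = CustomGoogleScholarOrganic().scrape_google_scholar_organic_results(query='blizzard', pagination=False)

    # keep only results that expose a direct PDF link
    results_with_pdf = [result for result in data if result['pdf_file']]

    for result in results_with_pdf:
        print(result['title'], result['pdf_file'], sep='\n')
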
--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/profiles_results.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium_stealth import stealth
3 | from selenium.webdriver.chrome.service import Service
4 | from webdriver_manager.chrome import ChromeDriverManager
5 | from selectolax.lexbor import LexborHTMLParser
6 | from parsel import Selector
7 | from typing import List, Dict, Callable
8 | import time, random, re
9 | import pandas as pd
10 | from pathlib import Path
11 |
12 | class CustomGoogleScholarProfiles:
13 | def __init__(self) -> None:
14 | pass
15 |
16 |
17 | def parse(self, parser: Callable, profile_results_data: Callable):
18 |         '''
19 |         Arguments:
20 |         - parser: Callable. Lexbor parser from the scrape_google_scholar_profiles() function.
21 |         - profile_results_data: list to append data to. The list originates in the scrape_google_scholar_profiles() function.
22 | 
23 |         This function parses data from the Google Scholar profile results page and appends it to a list.
24 | 
25 |         It's used by scrape_google_scholar_profiles().
26 | 
27 |         It returns nothing; it only appends parsed results to the `profile_results_data` list
28 |         that is defined in the scrape_google_scholar_profiles() function.
29 |         '''
30 |
31 | for profile in parser.css('.gs_ai_chpr'):
32 | try:
33 | name: str = profile.css_first('.gs_ai_name a').text()
34 | except: name = None
35 |
36 | try:
37 | link: str = f'https://scholar.google.com{profile.css_first(".gs_ai_name a").attrs["href"]}'
38 | except: link = None
39 |
40 | try:
41 | affiliations: str = profile.css_first('.gs_ai_aff').text()
42 | except: affiliations = None
43 |
44 | try:
45 | interests: list = [interest.text() for interest in profile.css('.gs_ai_one_int')]
46 | except: interests = None
47 |
48 | try:
49 | email: str = profile.css_first('.gs_ai_eml').text()
50 | except: email = None
51 |
52 | try:
53 | cited_by: int = re.search(r'\d+', profile.css_first('.gs_ai_cby').text()).group() # Cited by 17143 -> 17143
54 | except: cited_by = None
55 |
56 | profile_results_data.append({
57 | 'name': name,
58 | 'link': link,
59 | 'affiliations': affiliations,
60 | 'interests': interests if interests else None,
61 | 'email': email if email else None,
62 | 'cited_by_count': int(cited_by) if cited_by else None
63 | })
64 |
65 |
66 | def scrape_google_scholar_profiles(
67 | self,
68 | query: str,
69 | pagination: bool = False,
70 | save_to_csv: bool = False,
71 | save_to_json: bool = False
72 | ) -> List[Dict[str, str]]:
73 | '''
74 |         Extracts data from Google Scholar profile results page:
75 |         - name: str
76 |         - link: str
77 |         - affiliations: str
78 |         - interests: list
79 |         - email: str
80 |         - cited_by_count: int
81 | Arguments:
82 | - query: str. Search query.
83 | - pagination: bool. Enables or disables pagination. Default is False.
84 |         - save_to_csv: bool. True or False. Default is False.
85 |         - save_to_json: bool. True or False. Default is False.
86 |
87 | Usage:
88 |
89 |             from google_scholar_py import CustomGoogleScholarProfiles
90 |             import json
91 | parser = CustomGoogleScholarProfiles()
92 | data = parser.scrape_google_scholar_profiles(
93 | query='blizzard',
94 | pagination=False,
95 | save_to_csv=True
96 | )
97 | print(json.dumps(data, indent=2))
98 |
99 | for profile_results in data:
100 | print(profile_results['name'])
101 | print(profile_results['email'])
102 | '''
103 |
104 | # selenium stealth
105 | options = webdriver.ChromeOptions()
106 | options.add_argument('--headless')
107 | options.add_argument('--no-sandbox')
108 | options.add_argument('--disable-dev-shm-usage')
109 |
110 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
111 | options.add_experimental_option('useAutomationExtension', False)
112 |
113 | service = Service(ChromeDriverManager().install())
114 | driver = webdriver.Chrome(service=service, options=options)
115 |
116 | stealth(driver,
117 | languages=['en-US', 'en'],
118 | vendor='Google Inc.',
119 | platform='Win32',
120 | webgl_vendor='Intel Inc.',
121 | renderer='Intel Iris OpenGL Engine',
122 | fix_hairline=True
123 | )
124 |
125 | params = {} # stores next page token to add to URL later
126 | page_num = 0
127 | profile_results_data = []
128 |
129 | if pagination:
130 | while True:
131 |                 # if the next page token appears, add it to the URL as a query parameter
132 |                 # otherwise, do a search without the next page token parameter
133 | if params.get('after_author') is None:
134 | driver.get(f'https://scholar.google.com/citations?view_op=search_authors&mauthors={query}&hl=en&astart={page_num}')
135 | parser = LexborHTMLParser(driver.page_source)
136 |
137 | #TODO: replace parsel with selectolax completely
138 | selector = Selector(text=driver.page_source) # to check next page token
139 |
140 | self.parse(parser=parser, profile_results_data=profile_results_data)
141 |
142 | # check if the next arrow button is active by checking 'onclick' attribute
143 | if selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get():
144 | # extracting next page token and passing to 'after_author' query URL parameter
145 | params['after_author'] = re.search(r'after_author\\x3d(.*)\\x26', str(selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get())).group(1) # -> XB0HAMS9__8J
146 | page_num += 10 # paginate to the next page
147 | time.sleep(random.randint(1, 3)) # sleep between paginations
148 | else:
149 | break
150 | else:
151 | driver.get(f'https://scholar.google.com/citations?view_op=search_authors&mauthors={query}&hl=en&astart={page_num}&after_author={params["after_author"]}')
152 | parser = LexborHTMLParser(driver.page_source)
153 |
154 | #TODO: replace parsel with selectolax completely
155 | selector = Selector(text=driver.page_source) # to check next page token
156 |
157 | self.parse(parser=parser, profile_results_data=profile_results_data)
158 |
159 | if selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get():
160 | # extracting next page token and passing to 'after_author' query URL parameter
161 | params['after_author'] = re.search(r'after_author\\x3d(.*)\\x26', str(selector.css('.gsc_pgn button.gs_btnPR::attr(onclick)').get())).group(1) # -> XB0HAMS9__8J
162 | page_num += 10 # paginate to the next page
163 | time.sleep(random.randint(1, 3)) # sleep between paginations
164 | else:
165 | break
166 | else:
167 | # parse single, first page
168 | driver.get(f'https://scholar.google.com/citations?view_op=search_authors&mauthors={query}&hl=en&astart={page_num}')
169 | parser = LexborHTMLParser(driver.page_source)
170 |
171 | self.parse(parser=parser, profile_results_data=profile_results_data)
172 |
173 | driver.quit()
174 |
175 | if save_to_csv:
176 | pd.DataFrame(data=profile_results_data).to_csv('google_scholar_profile_results_data.csv',
177 | index=False, encoding='utf-8')
178 | if save_to_json:
179 | pd.DataFrame(data=profile_results_data).to_json('google_scholar_profile_results_data.json',
180 | orient='records')
181 |
182 | return profile_results_data
183 |
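
The pagination above hinges on pulling the `after_author` token out of the "Next" button's onclick attribute. A standalone sketch of that extraction with an illustrative onclick value (the real attribute is read from the rendered page source):

    import re

    # illustrative onclick value containing an escaped next-page token
    onclick = "window.location='/citations?view_op=search_authors\\x26after_author\\x3dXB0HAMS9__8J\\x26astart=10'"

    next_page_token = re.search(r'after_author\\x3d(.*)\\x26', onclick).group(1)
    print(next_page_token)  # -> XB0HAMS9__8J
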
--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/top_mandates_metrics.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium_stealth import stealth
3 | from selenium.webdriver.chrome.service import Service
4 | from webdriver_manager.chrome import ChromeDriverManager
5 | from selectolax.lexbor import LexborHTMLParser
6 | from typing import List, Dict, Callable
7 | import pandas as pd
8 | import re
9 |
10 |
11 | class CustomGoogleScholarTopMandates:
12 | def __init__(self) -> None:
13 | pass
14 |
15 |
16 | def parse(self, parser: Callable, top_mandates_data: Callable):
17 |         '''
18 |         Arguments:
19 |         - parser: Callable. Lexbor parser from the scrape_top_mandates_metrics() function.
20 |         - top_mandates_data: list to append data to. The list originates in the scrape_top_mandates_metrics() function.
21 | 
22 |         This function parses data from the Google Scholar top mandates page and appends it to a list.
23 | 
24 |         It's used by scrape_top_mandates_metrics().
25 | 
26 |         It returns nothing; it only appends parsed results to the `top_mandates_data` list
27 |         that is defined in the scrape_top_mandates_metrics() function.
28 |         '''
29 |
30 | for table in parser.css('tr'):
31 | try:
32 | # removes "... - cached"
33 | # https://regex101.com/r/EfljZp/1
34 | funder: str = re.sub(r'(\s\s-.*)', '', table.css_first('td.gsc_mlt_t').text())
35 | except: funder = None
36 |
37 | try:
38 | link: str = table.css_first('.gsc_mlt_t a').attrs['href']
39 | except: link = None
40 |
41 |             try:
42 |                 twenty_nineteen: str = table.css_first('td:nth-child(4)').text()
43 |                 if '-' in twenty_nineteen: # missing % in the table
44 |                     twenty_nineteen = None
45 |             except: twenty_nineteen = None
46 |
47 | try:
48 | twenty_twenty: str = table.css_first('td:nth-child(5)').text()
49 | if '-' in twenty_twenty:
50 | twenty_twenty = None
51 | except: twenty_twenty = None
52 |
53 | try:
54 | twenty_one: str = table.css_first('td:nth-child(6)').text()
55 | if '-' in twenty_one: # missing % in the table
56 | twenty_one = None
57 | except: twenty_one = None
58 |
59 | #TODO: fix selector to extract "overall" data
60 | # `td:nth-child(6)` is not working also
61 | # try:
62 | # overall: str = table.css('.gsc_mlt_n.gsc_mlt_bd').text()
63 | # except: overall = None
64 |
65 | top_mandates_data.append({
66 | 'funder': funder,
67 | 'link': link,
68 |                 '2019': twenty_nineteen,
69 | '2020': twenty_twenty,
70 | '2021': twenty_one,
71 | # 'overall': overall
72 | })
73 |
74 |
75 | def scrape_top_mandates_metrics(
76 | self,
77 | save_to_csv: bool = False,
78 | save_to_json: bool = False,
79 | lang: str = 'en'
80 | ) -> List[Dict[str, str]]:
81 | #TODO add argument to support other languages https://serpapi.com/google-languages
82 |
83 | '''
84 |         Results come from: https://scholar.google.com/citations?view_op=mandates_leaderboard
85 |
86 | Returns:
87 | - funder: str
88 | - link: str
89 | - 2019: str
90 | - 2020: str
91 | - 2021: str
92 | - overall: str (not extracted at the moment, selector needs to be fixed)
93 |
94 | Arguments:
95 |         - save_to_csv: True or False. Saves data to a CSV file. Default is False.
96 |         - save_to_json: True or False. Saves data to a JSON file. Default is False.
97 |         - lang: str. Language. Defaults to English ('en'). For now, other language codes need to be checked manually. Other languages: https://serpapi.com/google-languages
98 |
99 | Usage:
100 |
101 | from google_scholar_py import CustomGoogleScholarTopMandates
102 |
103 | parser = CustomGoogleScholarTopMandates()
104 | data = parser.scrape_top_mandates_metrics(
105 | save_to_csv=True,
106 | save_to_json=False
107 | )
108 | print(json.dumps(data, indent=2))
109 |
110 | for result in data:
111 | print(result['funder'])
112 | ...
113 | '''
114 |
115 | # selenium stealth
116 | options = webdriver.ChromeOptions()
117 | options.add_argument('--headless')
118 | options.add_argument('--no-sandbox')
119 | options.add_argument('--disable-dev-shm-usage')
120 |
121 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
122 | options.add_experimental_option('useAutomationExtension', False)
123 |
124 | service = Service(ChromeDriverManager().install())
125 | driver = webdriver.Chrome(service=service, options=options)
126 |
127 | stealth(driver,
128 | languages=['en-US', 'en'],
129 | vendor='Google Inc.',
130 | platform='Win32',
131 | webgl_vendor='Intel Inc.',
132 | renderer='Intel Iris OpenGL Engine',
133 | fix_hairline=True
134 | )
135 |
136 | top_mandates_data: list = []
137 |
138 | driver.get(f'https://scholar.google.com/citations?view_op=mandates_leaderboard&hl={lang}')
139 | parser = LexborHTMLParser(driver.page_source)
140 | self.parse(parser=parser, top_mandates_data=top_mandates_data)
141 |
142 | if save_to_csv:
143 | pd.DataFrame(data=top_mandates_data).to_csv('google_scholar_top_mandates_data.csv',
144 | index=False, encoding='utf-8')
145 |
146 | if save_to_json:
147 | pd.DataFrame(data=top_mandates_data).to_json('google_scholar_top_mandates_data.json',
148 | orient='records')
149 |
150 | driver.quit()
151 | return top_mandates_data
152 |
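
A small sketch of post-processing the yearly columns, under the assumption that they hold percentage strings such as '88%' (rows with missing values are already set to None in parse() above):

    from google_scholar_py import CustomGoogleScholarTopMandates

    data = CustomGoogleScholarTopMandates().scrape_top_mandates_metrics()

    for row in data:
        if row['2021']:
            # assumption: '88%' -> 88.0
            print(row['funder'], float(row['2021'].rstrip('%')), sep=': ')
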
--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/top_publications_article.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium_stealth import stealth
3 | from selenium.webdriver.chrome.service import Service
4 | from webdriver_manager.chrome import ChromeDriverManager
5 | from selectolax.lexbor import LexborHTMLParser
6 | from typing import List, Dict, Callable, Union
7 | import pandas as pd
8 | import time, random
9 |
10 | class CustomGoogleScholarTopPublicationArticle:
11 | def __init__(self) -> None:
12 | pass
13 |
14 |
15 | def parse(self, parser: Callable, publication_citation_data: Callable):
16 | '''
17 |         Arguments:
18 |         - parser: Lexbor parser from the scrape_google_scholar_top_publication_articles() function.
19 |         - publication_citation_data: list to append data to. The list originates in the scrape_google_scholar_top_publication_articles() function.
20 | 
21 |         This function parses data from the Google Scholar top publication articles page and appends it to a list.
22 | 
23 |         It's used by scrape_google_scholar_top_publication_articles().
24 |         '''
25 |
26 | # selects the whole table without the first row (header row)
27 | for result in parser.css('tr:not(:first-child)'):
28 | try:
29 | title: str = result.css_first('.gsc_mp_anchor_lrge').text()
30 | except: title = None
31 |
32 | try:
33 | title_link: str = f"https://scholar.google.com{result.css_first('a.gsc_mp_anchor_lrge').attrs['href']}"
34 | except: title_link = None
35 |
36 | try:
37 | authors: list = result.css_first('.gsc_mpat_ttl+ .gs_gray').text().split(', ')
38 | except: authors = None
39 |
40 | try:
41 | published_at: str = result.css_first('.gs_gray+ .gs_gray').text()
42 | except: published_at = None
43 |
44 | try:
45 | cited_by_count: int = int(result.css_first('.gsc_mpat_c .gsc_mp_anchor').text())
46 | except: cited_by_count = None
47 |
48 | try:
49 | cited_by_link: str = f"https://scholar.google.com{result.css_first('.gsc_mpat_c a.gsc_mp_anchor').attrs['href']}"
50 | except: cited_by_link = None
51 |
52 | try:
53 | year: int = int(result.css_first('.gsc_mp_anchor.gs_nph').text())
54 | except: year = None
55 |
56 |
57 | publication_citation_data.append({
58 | 'title': title,
59 | 'title_link': title_link,
60 | 'authors': authors,
61 | 'cited_by_link': cited_by_link,
62 | 'cited_by_count': cited_by_count,
63 | 'year': year,
64 | 'published_at': published_at
65 | })
66 |
67 | #TODO: add lang support. https://serpapi.com/google-languages
68 | def scrape_google_scholar_top_publication_articles(
69 | self,
70 | journal_publications_link: str,
71 | pagination: bool = False,
72 | save_to_csv: bool = False,
73 | save_to_json: bool = False
74 | ) -> List[Dict[str, Union[str, List[str], int]]]:
75 | '''
76 |         Results come from (for example): https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=9oNLl9DgMnQJ.2022
77 |
78 | Extracts data from Google Scholar Top Publication Metrics Citation page:
79 | - title: str
80 | - title_link: str
81 | - authors: list
82 | - cited_by_count: int
83 | - cited_by_link: str
84 | - year: int
85 | - published_at: str
86 |
87 | Arguments:
88 |         - journal_publications_link: str. Link to a journal's publications page (see the example URL above).
89 |         - pagination: bool. Enables or disables pagination. Default is False.
90 |         - save_to_csv: bool. True or False. Default is False.
91 |         - save_to_json: bool. True or False. Default is False.
92 |
93 | Usage:
94 |
95 | from google_scholar_py import CustomGoogleScholarTopPublicationArticle
96 |
97 | parser = CustomGoogleScholarTopPublicationArticle()
98 | data = parser.scrape_google_scholar_top_publication_articles(
99 | journal_publications_link='https://scholar.google.com/citations?hl=en&vq=en&view_op=list_hcore&venue=9oNLl9DgMnQJ.2022', # or link variable that stores the link
100 | pagination=False,
101 | save_to_csv=True
102 | )
103 |
104 | for citations in data:
105 | print(citations['title'], citations['year'], citations['published_at'], sep='\\n')
106 | '''
107 |
108 | # selenium stealth
109 | options = webdriver.ChromeOptions()
110 | options.add_argument('--headless')
111 | options.add_argument('--no-sandbox')
112 | options.add_argument('--disable-dev-shm-usage')
113 |
114 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
115 | options.add_experimental_option('useAutomationExtension', False)
116 |
117 | service = Service(ChromeDriverManager().install())
118 | driver = webdriver.Chrome(service=service, options=options)
119 |
120 | stealth(driver,
121 | languages=['en-US', 'en'],
122 | vendor='Google Inc.',
123 | platform='Win32',
124 | webgl_vendor='Intel Inc.',
125 | renderer='Intel Iris OpenGL Engine',
126 | fix_hairline=True,
127 | )
128 |
129 | page_num = 0
130 | publication_citation_data = []
131 |
132 | # parse all pages
133 | if pagination:
134 | while True:
135 |                 driver.get(journal_publications_link + f'&cstart={page_num}') # 'cstart' parameter is for pagination
136 | parser = LexborHTMLParser(driver.page_source)
137 |
138 | self.parse(parser=parser, publication_citation_data=publication_citation_data)
139 |
140 | # pagination
141 | if parser.css_first('.gsc_pgn_pnx:not([disabled])'): # checks if the "Next" page button selector is not disabled
142 | page_num += 20 # paginate to the next page
143 | time.sleep(random.randint(1, 3)) # sleep between paginations
144 | else:
145 | break
146 | else:
147 | # parse first page only
148 | driver.get(journal_publications_link)
149 | parser = LexborHTMLParser(driver.page_source)
150 |
151 | self.parse(parser=parser, publication_citation_data=publication_citation_data)
152 |
153 | if save_to_csv:
154 | pd.DataFrame(data=publication_citation_data).to_csv('google_scholar_top_publication_citations.csv',
155 | index=False, encoding='utf-8')
156 | if save_to_json:
157 | pd.DataFrame(data=publication_citation_data).to_json('google_scholar_top_publication_citations.json',
158 | orient='records')
159 | driver.quit()
160 |
161 | return publication_citation_data
162 |
--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/top_publications_article_citation.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium_stealth import stealth
3 | from selenium.webdriver.chrome.service import Service
4 | from webdriver_manager.chrome import ChromeDriverManager
5 | from selectolax.lexbor import LexborHTMLParser
6 | from typing import List, Dict, Callable, Union
7 | import pandas as pd
8 | import time, random
9 |
10 | class CustomGoogleScholarTopPublicationArticleCitation:
11 | def __init__(self) -> None:
12 | pass
13 |
14 |
15 | def parse(self, parser: Callable, publication_citation_data: Callable):
16 | '''
17 |         Arguments:
18 |         - parser: Lexbor parser from the scrape_google_scholar_top_publication_article_citations() function.
19 |         - publication_citation_data: list to append data to. The list originates in the scrape_google_scholar_top_publication_article_citations() function.
20 | 
21 |         This function parses data from the Google Scholar top publication article citations page and appends it to a list.
22 | 
23 |         It's used by scrape_google_scholar_top_publication_article_citations().
24 |         '''
25 |
26 | # selects the whole table without the first row (header row)
27 | for result in parser.css('tr:not(:first-child)'):
28 | try:
29 | title: str = result.css_first('.gsc_mp_anchor_lrge').text()
30 | except: title = None
31 |
32 | try:
33 | title_link: str = f"https://scholar.google.com{result.css_first('a.gsc_mp_anchor_lrge').attrs['href']}"
34 | except: title_link = None
35 |
36 | try:
37 | authors: list = result.css_first('.gsc_mpat_ttl+ .gs_gray').text().split(', ')
38 | except: authors = None
39 |
40 | try:
41 | published_at: str = result.css_first('.gs_gray+ .gs_gray').text()
42 | except: published_at = None
43 |
44 | try:
45 | year: int = int(result.css_first('.gsc_mp_anchor.gs_nph').text())
46 | except: year = None
47 |
48 |
49 | publication_citation_data.append({
50 | 'title': title,
51 | 'title_link': title_link,
52 | 'authors': authors,
53 | 'year': year,
54 | 'published_at': published_at
55 | })
56 |
57 | #TODO: add lang support. https://serpapi.com/google-languages
58 | def scrape_google_scholar_top_publication_article_citations(
59 | self,
60 | journal_publications_link: str,
61 | pagination: bool = False,
62 | save_to_csv: bool = False,
63 | save_to_json: bool = False
64 | ) -> List[Dict[str, Union[str, List[str], int]]]:
65 | '''
66 |         Results come from (for example): https://scholar.google.com/citations?hl=en&venue=k6hd2dUel5kJ.2022&vq=en&view_op=hcore_citedby&hcore_pos=18
67 |
68 | Extracts data from Google Scholar Top Publication Metrics Citation page:
69 | - title: str
70 | - title_link: str
71 | - authors: list
72 | - published_at: str
73 | - year: int
74 |
75 | Arguments:
76 |         - journal_publications_link: str. Link to a journal's publications page (see the example URL above).
77 |         - pagination: bool. Enables or disables pagination. Default is False.
78 |         - save_to_csv: bool. True or False. Default is False.
79 |         - save_to_json: bool. True or False. Default is False.
80 |
81 | Usage:
82 |
83 | from google_scholar_py import CustomGoogleScholarTopPublicationArticleCitation
84 | import json
85 |
86 | parser = CustomGoogleScholarTopPublicationArticleCitation()
87 | data = parser.scrape_google_scholar_top_publication_article_citations(
88 | journal_publications_link='https://scholar.google.com/citations?hl=en&venue=k6hd2dUel5kJ.2022&vq=en&view_op=hcore_citedby&hcore_pos=18', # or link variable that stores the link
89 | pagination=False,
90 | save_to_csv=True
91 | )
92 | print(json.dumps(data, indent=2))
93 |
94 | for citations in data:
95 | print(citations['title'], citations['year'], citations['published_at'], sep='\\n')
96 | '''
97 |
98 | # selenium stealth
99 | options = webdriver.ChromeOptions()
100 | options.add_argument('--headless')
101 | options.add_argument('--no-sandbox')
102 | options.add_argument('--disable-dev-shm-usage')
103 |
104 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
105 | options.add_experimental_option('useAutomationExtension', False)
106 |
107 | service = Service(ChromeDriverManager().install())
108 | driver = webdriver.Chrome(service=service, options=options)
109 |
110 | stealth(driver,
111 | languages=['en-US', 'en'],
112 | vendor='Google Inc.',
113 | platform='Win32',
114 | webgl_vendor='Intel Inc.',
115 | renderer='Intel Iris OpenGL Engine',
116 | fix_hairline=True,
117 | )
118 |
119 | page_num = 0
120 | publication_citation_data = []
121 |
122 | # parse all pages
123 | if pagination:
124 | while True:
125 |                 driver.get(journal_publications_link + f'&cstart={page_num}') # 'cstart' parameter is for pagination
126 | parser = LexborHTMLParser(driver.page_source)
127 |
128 | self.parse(parser=parser, publication_citation_data=publication_citation_data)
129 |
130 | # pagination
131 | if parser.css_first('.gsc_pgn_pnx:not([disabled])'): # checks if the "Next" page button selector is not disabled
132 | page_num += 20 # paginate to the next page
133 | time.sleep(random.randint(1, 3)) # sleep between paginations
134 | else:
135 | break
136 | else:
137 | # parse first page only
138 | driver.get(journal_publications_link)
139 | parser = LexborHTMLParser(driver.page_source)
140 |
141 | self.parse(parser=parser, publication_citation_data=publication_citation_data)
142 |
143 | if save_to_csv:
144 | pd.DataFrame(data=publication_citation_data).to_csv('google_scholar_top_publication_citations.csv',
145 | index=False, encoding='utf-8')
146 | if save_to_json:
147 | pd.DataFrame(data=publication_citation_data).to_json('google_scholar_top_publication_citations.json',
148 | orient='records')
149 | driver.quit()
150 |
151 | return publication_citation_data
152 |
--------------------------------------------------------------------------------
/google_scholar_py/custom_backend/top_publications_metrics.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium_stealth import stealth
3 | from selenium.webdriver.chrome.service import Service
4 | from webdriver_manager.chrome import ChromeDriverManager
5 | from selectolax.lexbor import LexborHTMLParser
6 | from typing import List, Dict, Callable, Union
7 | import pandas as pd
8 |
9 | class CustomGoogleScholarTopPublications:
10 | def __init__(self) -> None:
11 | pass
12 |
13 |
14 | def parse(self, parser: Callable, top_publications_data: Callable):
15 |         '''
16 |         Arguments:
17 |         - parser: Callable. Lexbor parser from the scrape_top_publication_metrics() function.
18 |         - top_publications_data: list to append data to. The list originates in the scrape_top_publication_metrics() function.
19 | 
20 |         This function parses data from the Google Scholar top publications page and appends it to a list.
21 | 
22 |         It's used by scrape_top_publication_metrics().
23 | 
24 |         It returns nothing; it only appends parsed results to the `top_publications_data` list
25 |         that is defined in the scrape_top_publication_metrics() function.
26 |         '''
27 |
28 |         # selector skips the table header row
29 | for table in parser.css('tr:not(:first-child)'):
30 | try:
31 | title: str = table.css_first('td.gsc_mvt_t').text()
32 | except: title = None
33 |
34 | try:
35 | h5_index: int = table.css_first('a.gs_ibl').text()
36 | except: h5_index = None
37 |
38 | try:
39 | h5_index_link: str = f"https://scholar.google.com{table.css_first('a.gs_ibl').attrs['href']}"
40 | except: h5_index_link = None
41 |
42 | try:
43 | h5_median: int = table.css_first('span.gs_ibl').text()
44 | except: h5_median = None
45 |
46 | top_publications_data.append({
47 | 'title': title,
48 | 'h5_index': int(h5_index) if h5_index else h5_index,
49 | 'h5_index_link': h5_index_link,
50 | 'h5_median': int(h5_median) if h5_median else h5_median
51 | })
52 |
53 |
54 | def scrape_top_publication_metrics(
55 | self,
56 | category: str = '',
57 | lang: str = 'en',
58 | save_to_csv: bool = False,
59 | save_to_json: bool = False,
60 | ) -> List[Dict[str, Union[str, int]]]:
61 | #TODO add subcategories to subcategory arg
62 | #TODO: support other languages: lang='spanish' -> 'sp'. https://serpapi.com/google-languages
63 |
64 |
65 | '''
66 |         Results come from: https://scholar.google.com/citations?view_op=top_venues
67 |
68 | Returns:
69 | - title: str
70 | - h5_index: int
71 | - h5_index_link: str
72 | - h5_median: int
73 |
74 | Arguments:
75 |         - save_to_csv: True or False. Default is False. Saves data to a CSV file.
76 |         - save_to_json: True or False. Default is False. Saves data to a JSON file.
77 |         - lang: str. Language. Defaults to English ('en'). For now, other language codes need to be checked manually. Other languages: https://serpapi.com/google-languages
78 |         - category: str. One of the category codes listed below.
79 |             Available categories:
80 | - "bus": Business, Economics & Management
81 | - "chm": Chemical & Material Sciences
82 | - "eng": Engineering & Computer Science
83 | - "med": Health & Medical Sciences
84 | - "hum": Humanities, Literature & Arts
85 | - "bio": Life Sciences & Earth Sciences
86 | - "phy": Physics & Mathematics
87 | - "soc": Social Sciences
88 |
89 | Usage:
90 |
91 | from google_scholar_py import CustomGoogleScholarTopPublications
92 |
93 |             data = CustomGoogleScholarTopPublications().scrape_top_publication_metrics(category='eng', lang='en') # lang='sv' for Swedish, for example
94 |
95 | for result in data:
96 | print(result['title'])
97 | ...
98 | '''
99 |
100 | # selenium stealth
101 | options = webdriver.ChromeOptions()
102 | options.add_argument('--headless')
103 | options.add_argument('--no-sandbox')
104 | options.add_argument('--disable-dev-shm-usage')
105 |
106 | options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
107 | options.add_experimental_option('useAutomationExtension', False)
108 |
109 | service = Service(ChromeDriverManager().install())
110 | driver = webdriver.Chrome(service=service, options=options)
111 |
112 | stealth(driver,
113 | languages=['en-US', 'en'],
114 | vendor='Google Inc.',
115 | platform='Win32',
116 | webgl_vendor='Intel Inc.',
117 | renderer='Intel Iris OpenGL Engine',
118 | fix_hairline=True
119 | )
120 |
121 | top_publications_data = []
122 |
123 | if category:
124 | driver.get(f'https://scholar.google.com/citations?view_op=top_venues&hl={lang}&vq={category}')
125 | parser = LexborHTMLParser(driver.page_source)
126 | self.parse(parser=parser, top_publications_data=top_publications_data)
127 |         else:
128 |             # category is empty, so vq='' is passed, which redirects to the page with no category applied
129 |             driver.get(f'https://scholar.google.com/citations?view_op=top_venues&hl={lang}&vq={category}')
130 | parser = LexborHTMLParser(driver.page_source)
131 | self.parse(parser=parser, top_publications_data=top_publications_data)
132 |
133 | if save_to_csv:
134 | pd.DataFrame(data=top_publications_data).to_csv('google_scholar_top_publications_data.csv',
135 | index=False, encoding='utf-8')
136 | if save_to_json:
137 | pd.DataFrame(data=top_publications_data).to_json('google_scholar_top_publications_data.json',
138 | orient='records')
139 |
140 | driver.quit()
141 | return top_publications_data
142 |
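
A sketch that loops over the category codes listed in the docstring above; note that every call creates its own Chrome driver, so this is slow by design:

    from google_scholar_py import CustomGoogleScholarTopPublications

    # category codes taken from the scrape_top_publication_metrics() docstring
    categories = ['bus', 'chm', 'eng', 'med', 'hum', 'bio', 'phy', 'soc']

    all_top_publications = {}
    for category in categories:
        all_top_publications[category] = CustomGoogleScholarTopPublications().scrape_top_publication_metrics(category=category)

    for row in all_top_publications['eng'][:3]:
        print(row['title'])
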
--------------------------------------------------------------------------------
/google_scholar_py/serpapi_backend/author_results.py:
--------------------------------------------------------------------------------
1 | from serpapi import GoogleScholarSearch
2 | from urllib.parse import urlsplit, parse_qsl
3 | import itertools
4 |
5 | #TODO: support pagination using `async` parameter
6 |
7 | class SerpApiGoogleScholarAuthor:
8 | def __init__(self) -> None:
9 | pass
10 |
11 | def scrape_google_scholar_author_results(
12 | self,
13 | author_id: str,
14 | api_key: str = None,
15 | lang: str = 'en',
16 | parse_articles: bool = False,
17 | article_pagination: bool = False,
18 | ):
19 |
20 | '''
21 | Extracts all author data: author info, cited by (table, graph), co-authors, all articles.
22 |
23 | Arguments:
24 | - author_id: author id.
25 | - api_key: SerpApi api key, https://serpapi.com/manage-api-key
26 | - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages
27 |         - parse_articles: parses the first page of author articles. Default 'False'.
28 |         - article_pagination: True or False. Enables parsing of all articles. Default 'False'.
29 |
30 | Usage:
31 |
32 | from google_scholar_py.serpapi_backend.author_results import SerpApiGoogleScholarAuthor
33 |
34 | parser = SerpApiGoogleScholarAuthor()
35 | data = parser.scrape_google_scholar_author_results(
36 | author_id='nHhtvqkAAAAJ',
37 | api_key='serpapi_api_key',
38 | parse_articles=True,
39 | article_pagination=True,
40 | )
41 |
42 | print(data.keys()) # show available keys
43 |
44 | for article in data['articles']:
45 | print(article['title'])
46 | '''
47 |
48 | if api_key is None:
49 | raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key')
50 |
51 |         if author_id is None:
52 |             raise Exception('Please enter an author ID to the `author_id` argument.')
53 | 
54 |         # the two checks above already guarantee that both `api_key` and `author_id` are present,
55 |         # so no separate combined check is needed
56 |
57 | params = {
58 | 'api_key': api_key, # serpapi api key
59 | 'engine': 'google_scholar_author', # serpapi parsing engine
60 | 'author_id': author_id, # search by author id
61 | 'hl': lang # language
62 | }
63 |
64 | search = GoogleScholarSearch(params) # where data extracts on the backend
65 |
66 | # parsing ALL articles along with author info
67 | if parse_articles and article_pagination:
68 | params['start'] = 0 # page number: 0 is first page, 1 is second, etc.
69 | params['pagesize'] = 100 # number of articles per page
70 |
71 | author_all_articles = []
72 |
73 | while True:
74 | results = search.get_dict()
75 |
76 | if 'error' in results:
77 | print(results['error'])
78 | break
79 |
80 | author_all_articles.append(results['articles'])
81 |
82 | # check for the `next` page
83 | if 'next' in results.get('serpapi_pagination', {}):
84 | search.params_dict.update(dict(parse_qsl(urlsplit(results['serpapi_pagination']['next']).query)))
85 | else:
86 | break
87 |
88 | # remove articles key that creates a nested lists
89 | results.pop('articles')
90 |
91 | # flatten list of all articles
92 | author_all_articles_flatten = list(itertools.chain(*author_all_articles))
93 | results['articles'] = author_all_articles_flatten
94 |
95 | keys_to_delete = ['search_metadata', 'search_parameters']
96 | for key_to_delete in keys_to_delete:
97 | results.pop(key_to_delete)
98 |
99 | return results
100 |
101 | # parsing ONLY FIRST PAGE of articles along with author info
102 | if parse_articles:
103 | search = GoogleScholarSearch(params)
104 | results = search.get_dict() # JSON -> Python dict
105 |
106 | if 'error' in results:
107 | raise Exception(results['error'])
108 |
109 | keys_to_delete = ['search_metadata', 'search_parameters', 'serpapi_pagination']
110 |
111 | for key_to_delete in keys_to_delete:
112 | results.pop(key_to_delete)
113 |
114 | return results
115 |
116 |         # articles are not needed -> remove them from the JSON
117 |         else:
118 | search = GoogleScholarSearch(params)
119 | results = search.get_dict()
120 |
121 | if 'error' in results:
122 | raise Exception(results['error'])
123 |
124 | keys_to_delete = ['search_metadata', 'search_parameters', 'articles', 'serpapi_pagination']
125 |
126 | for key_to_delete in keys_to_delete:
127 | results.pop(key_to_delete)
128 |
129 | return results
130 |
131 |
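
A sketch of exporting the author's articles with pandas, assuming a valid SerpApi key; the author id and output filename are illustrative:

    import pandas as pd
    from google_scholar_py.serpapi_backend.author_results import SerpApiGoogleScholarAuthor

    data = SerpApiGoogleScholarAuthor().scrape_google_scholar_author_results(
        author_id='nHhtvqkAAAAJ',
        api_key='serpapi_api_key',  # placeholder, pass a real key
        parse_articles=True
    )

    # 'articles' is a list of dicts, so it maps directly onto a DataFrame
    pd.DataFrame(data['articles']).to_csv('author_articles.csv', index=False)
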
--------------------------------------------------------------------------------
/google_scholar_py/serpapi_backend/organic_cite_results.py:
--------------------------------------------------------------------------------
1 | from .organic_results import SerpApiGoogleScholarOrganic
2 | from serpapi import GoogleScholarSearch
3 |
4 | #TODO: support extracting actual Cite data, for example Bibtex: shorturl.at/vGNU5
5 |
6 | class SerpApiGoogleScholarOrganicCite:
7 | def __init__(self) -> None:
8 | pass
9 |
10 |
11 | def scrape_google_scholar_cite_results(
12 | self,
13 | query: str,
14 | api_key: str = None,
15 | lang: str = 'en',
16 | pagination: bool = False
17 | ):
18 |
19 | '''
20 |         This function extracts citations as well as BibTeX, EndNote, RefMan, RefWorks links.
21 | 
22 |         To extract citations, 2 requests have to be made: the first for organic results, the second for citation data.
23 |         So if you need to get citations from 1000 articles, 2000 requests would be made accordingly.
24 |
25 | Arguments:
26 | - query: search query
27 | - api_key: SerpApi api key, https://serpapi.com/manage-api-key
28 | - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages
29 |         - pagination: True or False. Enables pagination from all pages. Default 'False'.
30 |
31 | Usage:
32 |
33 | from google_scholar_py.serpapi_backend.organic_cite_results import SerpApiGoogleScholarOrganicCite
34 |
35 | parser = SerpApiGoogleScholarOrganicCite()
36 | data = parser.scrape_google_scholar_cite_results(
37 | query='minecraft',
38 | api_key='serpapi_api_key',
39 | pagination=True
40 | )
41 |
42 | # extracting bottom links
43 | for result in data:
44 | for citations in result['links']:
45 | print(citations['name']) # or ['link']
46 |
47 | # extracting citations
48 | for result in data:
49 | for citations in result['citations']:
50 | print(citations['title']) # or ['snippet']
51 | '''
52 |
53 | if api_key is None:
54 | raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key')
55 |
56 |         #TODO: could be removed, as the function by itself throws an error if the query is missing
57 | if api_key and query is None:
58 | raise Exception('Please enter a SerpApi API key to a `api_key`, and a search query to `query` arguments.')
59 |
60 | # extract organic results from where citation data will be extracted
61 | organic_results = SerpApiGoogleScholarOrganic().scrape_google_scholar_organic_results(
62 | query=query,
63 | api_key=api_key,
64 | lang=lang,
65 | pagination=pagination
66 | )
67 |
68 | cite_results_data = []
69 |
70 | for citation in organic_results:
71 | params = {
72 | 'api_key': api_key, # serpapi api key: https://serpapi.com/manage-api-key
73 | 'engine': 'google_scholar_cite', # serpapi parsing engine
74 | 'q': citation['result_id'] # search query
75 | }
76 |
77 | search = GoogleScholarSearch(params) # where data extracts on the backend
78 | results = search.get_dict()
79 |
80 | # removes 2 keys from the JSON response
81 | for key_to_delete in ['search_metadata', 'search_parameters']:
82 | results.pop(key_to_delete)
83 |
84 | if 'error' in results:
85 | raise Exception(results['error'])
86 |
87 | cite_results_data.append(results)
88 |
89 | return cite_results_data
90 |
--------------------------------------------------------------------------------
/google_scholar_py/serpapi_backend/organic_results.py:
--------------------------------------------------------------------------------
1 | from serpapi import GoogleScholarSearch
2 | from urllib.parse import urlsplit, parse_qsl
3 | import itertools
4 |
5 | #TODO: support pagination using `async` parameter
6 |
7 | class SerpApiGoogleScholarOrganic:
8 | def __init__(self) -> None:
9 | pass
10 |
11 |
12 | #TODO: add test API key so users can test out before passing their own?
13 | def scrape_google_scholar_organic_results(
14 | self,
15 | query: str,
16 | api_key: str = None,
17 | lang: str = 'en',
18 | pagination: bool = False,
19 | ):
20 |
21 | '''
22 | This function extracts all possible data from Google Scholar organic results. With or without pagination.
23 |
24 | Arguments:
25 | - query: search query
26 | - api_key: SerpApi api key, https://serpapi.com/manage-api-key
27 | - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages
28 |         - pagination: True or False. Enables pagination from all pages. Default 'False'.
29 |
30 | Usage:
31 |
32 | from google_scholar_py.serpapi_backend.organic_results import SerpApiGoogleScholarOrganic
33 |
34 | parser = SerpApiGoogleScholarOrganic()
35 | data = parser.scrape_google_scholar_organic_results(
36 | query='minecraft',
37 | api_key='serpapi_api_key',
38 | pagination=True
39 | )
40 |
41 | print(data[0].keys()) # show available keys
42 |
43 | for result in data:
44 | print(result['title']) # and other data
45 | '''
46 |
47 | if api_key is None:
48 | raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key')
49 |
50 | if api_key and query is None:
51 | raise Exception('Please enter a SerpApi API key to a `api_key`, and a search query to `query` arguments.')
52 |
53 | params = {
54 | 'api_key': api_key, # serpapi api key: https://serpapi.com/manage-api-key
55 | 'engine': 'google_scholar', # serpapi parsing engine
56 | 'q': query, # search query
57 | 'hl': lang, # language
58 | 'start': 0 # first page. Used for pagination: https://serpapi.com/google-scholar-api#api-parameters-pagination-start
59 | }
60 |
61 | search = GoogleScholarSearch(params) # where data extracts on the backend
62 |
63 | if pagination:
64 | organic_results_data = []
65 |
66 | while True:
67 | results = search.get_dict() # JSON -> Python dict
68 |
69 | if 'error' in results:
70 | print(results['error'])
71 | break
72 |
73 | organic_results_data.append(results['organic_results'])
74 |
75 | # check for `serpapi_pagination` and then for `next` page
76 | if 'next' in results.get('serpapi_pagination', {}):
77 | search.params_dict.update(dict(parse_qsl(urlsplit(results['serpapi_pagination']['next']).query)))
78 | else:
79 | break
80 |
81 | # flatten list
82 | return list(itertools.chain(*organic_results_data))
83 | else:
84 | # remove page number key from the request parameters
85 | # parse first page only
86 | params.pop('start')
87 |
88 | search = GoogleScholarSearch(params)
89 | results = search.get_dict()
90 |
91 | if 'error' in results:
92 | raise Exception(results['error'])
93 |
94 | return results['organic_results']
95 |
96 |
97 |
--------------------------------------------------------------------------------
/google_scholar_py/serpapi_backend/profile_results.py:
--------------------------------------------------------------------------------
1 | from serpapi import GoogleScholarSearch
2 | from urllib.parse import parse_qsl, urlsplit
3 | import itertools
4 |
5 |
6 | #TODO: support pagination using `async` parameter
7 |
8 | class SerpApiGoogleScholarProfiles:
9 | def __init__(self) -> None:
10 | pass
11 |
12 | def scrape_google_scholar_profile_results(
13 | self,
14 | query: str,
15 | api_key: str = None,
16 | lang: str = 'en',
17 | pagination: bool = False,
18 | ):
19 |
20 | '''
21 | This function extracts profile results. With or without pagination.
22 |
23 | Arguments:
24 | - query: search query
25 | - api_key: SerpApi api key, https://serpapi.com/manage-api-key
26 | - lang: language for the search. Default 'en'. More: https://serpapi.com/google-languages
27 |         - pagination: True or False. Enables pagination from all pages. Default 'False'.
28 |
29 | Usage:
30 |
31 | from google_scholar_py.serpapi_backend.profile_results import SerpApiGoogleScholarProfiles
32 |
33 | parser = SerpApiGoogleScholarProfiles()
34 | data = parser.scrape_google_scholar_profile_results(
35 | query='minecraft',
36 | api_key='serpapi_api_key',
37 | pagination=True,
38 | )
39 |
40 | print(data[0].keys()) # show available keys
41 |
42 | for result in data:
43 |             print(result['name'])
44 | # get other data
45 | '''
46 |
47 | if api_key is None:
48 | raise Exception('Please enter a SerpApi API key to a `api_key` argument. https://serpapi.com/manage-api-key')
49 |
50 | if api_key and query is None:
51 | raise Exception('Please enter a SerpApi API key to a `api_key`, and a search query to `query` arguments.')
52 |
53 | params = {
54 | 'api_key': api_key, # serpapi api key: https://serpapi.com/manage-api-key
55 | 'engine': 'google_scholar_profiles', # serpapi parsing engine
56 | 'mauthors': query, # search query
57 | 'hl': lang # language
58 | }
59 |
60 | search = GoogleScholarSearch(params) # where data extracts on the backend
61 |
62 | if pagination:
63 | profile_results_data = []
64 |
65 | while True:
66 | results = search.get_dict() # JSON -> Python dict
67 |
68 | if 'error' in results:
69 | print(results['error'])
70 | break
71 |
72 | profile_results_data.append(results['profiles'])
73 |
74 | # check for 'next' page
75 | if 'next' in results.get('pagination', {}):
76 | search.params_dict.update(dict(parse_qsl(urlsplit(results['pagination']['next']).query)))
77 | else:
78 | break
79 |
80 | # flatten list
81 | return list(itertools.chain(*profile_results_data))
82 | else:
83 | search = GoogleScholarSearch(params)
84 | results = search.get_dict()
85 |
86 | if 'error' in results:
87 | raise Exception(results['error'])
88 |
89 | return results['profiles']
90 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.ruff]
6 | line-length = 125
7 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest==7.3.1
2 | pytest-cov==4.0.0
3 | pytest-xdist==3.3.0
4 | coverage==7.2.5
5 | ruff==0.0.243
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | async-generator==1.10
2 | attrs==22.2.0
3 | bleach==6.0.0
4 | CacheControl==0.12.11
5 | certifi==2022.12.7
6 | cffi==1.15.1
7 | charset-normalizer==3.0.1
8 | cleo==2.0.1
9 | colorama==0.4.6
10 | crashtest==0.4.1
11 | cryptography==39.0.1
12 | cssselect==1.2.0
13 | Cython==0.29.33
14 | distlib==0.3.6
15 | docutils==0.19
16 | dulwich==0.20.50
17 | exceptiongroup==1.1.0
18 | execnet==1.9.0
19 | filelock==3.9.0
20 | google-search-results==2.4.2
21 | h11==0.14.0
22 | html5lib==1.1
23 | idna==3.4
24 | importlib-metadata==4.13.0
25 | importlib-resources==5.10.2
26 | iniconfig==2.0.0
27 | jaraco.classes==3.2.3
28 | jeepney==0.8.0
29 | jsonschema==4.17.3
30 | keyring==23.13.1
31 | lockfile==0.12.2
32 | lxml==4.9.2
33 | markdown-it-py==2.1.0
34 | mdurl==0.1.2
35 | more-itertools==9.0.0
36 | msgpack==1.0.4
37 | numpy==1.24.2
38 | outcome==1.2.0
39 | packaging==23.0
40 | pandas==1.5.3
41 | parsel==1.7.0
42 | pexpect==4.8.0
43 | pkginfo==1.9.6
44 | pkgutil_resolve_name==1.3.10
45 | platformdirs==2.6.2
46 | pluggy==1.0.0
47 | poetry==1.3.2
48 | poetry-core==1.4.0
49 | poetry-plugin-export==1.3.0
50 | ptyprocess==0.7.0
51 | pycparser==2.21
52 | Pygments==2.14.0
53 | pyrsistent==0.19.3
54 | PySocks==1.7.1
55 | python-dateutil==2.8.2
56 | python-dotenv==1.0.0
57 | pytz==2022.7.1
58 | pywin32-ctypes==0.2.0
59 | rapidfuzz==2.13.7
60 | readme-renderer==37.3
61 | requests==2.28.2
62 | requests-toolbelt==0.10.1
63 | rfc3986==2.0.0
64 | rich==13.3.1
65 | SecretStorage==3.3.3
66 | selectolax==0.3.12
67 | selenium==4.8.0
68 | selenium-stealth==1.0.6
69 | shellingham==1.5.0.post1
70 | six==1.16.0
71 | sniffio==1.3.0
72 | sortedcontainers==2.4.0
73 | tomli==2.0.1
74 | tomlkit==0.11.6
75 | tqdm==4.65.0
76 | trio==0.22.0
77 | trio-websocket==0.9.2
78 | trove-classifiers==2023.1.20
79 | typing_extensions==4.4.0
80 | urllib3==1.26.14
81 | virtualenv==20.19.0
82 | w3lib==2.1.1
83 | webdriver-manager==3.8.5
84 | webencodings==0.5.1
85 | wsproto==1.2.0
86 | zipp==3.12.1
87 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | README = ''
4 | with open('README.md', 'r', encoding='utf-8') as readme_file:
5 | README = readme_file.read()
6 |
7 | setup(
8 | name='scrape-google-scholar-py',
9 | description = 'Extract data from all Google Scholar pages in Python. Sponsored by SerpApi.',
10 | url='https://github.com/dimitryzub/scrape-google-scholar',
11 | version='0.3.4',
12 | license='MIT',
13 | author='Dmitiry Zub',
14 | author_email='dimitryzub@gmail.com',
15 | maintainer='Dmitiry Zub',
16 | maintainer_email='dimitryzub@gmail.com',
17 | long_description_content_type='text/markdown',
18 | long_description=README,
19 | include_package_data=True,
20 | python_requires='>=3.10',
21 | classifiers = [
22 | 'Development Status :: 3 - Alpha',
23 | 'Intended Audience :: Developers',
24 | 'Operating System :: Microsoft :: Windows',
25 | 'Operating System :: MacOS',
26 | 'Operating System :: POSIX :: Linux',
27 | 'Topic :: Internet',
28 | 'Natural Language :: English',
29 | 'Topic :: Utilities',
30 | 'Programming Language :: Python :: 3.10',
31 | 'Programming Language :: Python :: 3.11',
32 | ],
33 | keywords=[
34 | 'google scholar',
35 | 'serpapi',
36 | 'scraper',
37 | 'python',
38 | 'python google scholar',
39 | 'python google scholar api',
40 | 'web scraping',
41 | 'python web scraping',
42 | 'research',
43 | 'lexbor',
44 | 'selectolax',
45 | 'selenium',
46 | 'selenium-stealth',
47 | 'pandas',
48 | ],
49 | install_requires=[
50 | 'google-search-results>=2.4.2',
51 | 'selectolax>=0.3.12',
52 | 'parsel>=1.7.0',
53 | 'selenium-stealth>=1.0.6',
54 | 'pandas>=1.5.3',
55 | 'webdriver-manager>=3.8.5'
56 | ],
57 | project_urls={
58 | 'Documentation': 'https://github.com/dimitryzub/scrape-google-scholar#example-usage-custom-backend',
59 | 'Source': 'https://github.com/dimitryzub/scrape-google-scholar',
60 | 'Tracker': 'https://github.com/dimitryzub/scrape-google-scholar/issues',
61 | },
62 | )
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dimitryzub/scrape-google-scholar-py/2a11840c7d19d23faca0c544c61cc5fd1aa4dadd/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_custom_profile.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import unittest
3 | from pathlib import Path
4 | import os
5 | from google_scholar_py.custom_backend.profiles_results import CustomGoogleScholarProfiles
6 |
7 |
8 | # # Tests for CustomGoogleScholarProfiles class
9 | # @pytest.fixture(scope='session')
10 | # def google_scholar_parser():
11 | # return CustomGoogleScholarProfiles()
12 |
13 | @pytest.fixture(scope='session')
14 | def search_query():
15 | return 'blizzard'
16 |
17 | def test_custom_google_scholar_profiles_scrape_without_pagination(search_query):
18 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=False)
19 | assert len(results) > 0
20 |
21 | def test_custom_google_scholar_profiles_scrape_with_pagination(search_query):
22 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=True)
23 | assert len(results) > 0
24 |
25 | def test_custom_google_scholar_profiles_save_to_csv(search_query):
26 | CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=False, save_to_csv=True)
27 |
29 |     # '../' because the file is saved in the repo root; it could be saved to a dedicated "results" folder instead
29 | assert Path().cwd().joinpath('tests', '../google_scholar_profile_results_data.csv').exists()
30 |
31 | def test_custom_google_scholar_profiles_save_to_json(search_query):
32 | CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=search_query, pagination=False, save_to_json=True)
33 |
34 |     # '../' because the file is saved in the repo root; it could be saved to a dedicated "results" folder instead
35 | assert Path().cwd().joinpath('tests', '../google_scholar_profile_results_data.json').exists()
36 |
37 | # @pytest.fixture(scope='session')
38 | # def remove_test_files():
39 | # csv_file = Path().cwd().parent / 'google_scholar_profile_results_data.csv'
40 | # json_file = Path().cwd().parent / 'google_scholar_profile_results_data.json'
41 | # os.remove(csv_file)
42 | # os.remove(json_file)
43 |
44 |
45 | # Tests for scrape_google_scholar_profiles function
46 | class TestScrapeGoogleScholarProfiles(unittest.TestCase):
47 |
48 | def test_scrape_google_scholar_profiles_returns_list(self):
49 | query = "machine learning"
50 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query)
51 | self.assertIsInstance(results, list)
52 |
53 | def test_scrape_google_scholar_profiles_returns_correct_data_types(self):
54 | query = "machine learning"
55 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query)
56 |
57 | for profile_data in results:
58 | self.assertIsInstance(profile_data, dict)
59 | self.assertIsInstance(profile_data['name'], str)
60 | self.assertIsInstance(profile_data['link'], str)
61 | self.assertIsInstance(profile_data['affiliations'], str)
62 | self.assertIsInstance(profile_data['email'], str)
63 |             self.assertIsInstance(profile_data['cited_by_count'], (int, type(None)))
64 |             self.assertIsInstance(profile_data['interests'], (list, type(None)))
65 | for interest in profile_data['interests']:
66 | self.assertIsInstance(interest, str)
67 |
68 | def test_scrape_google_scholar_profiles_returns_valid_data(self):
69 | query = "machine learning"
70 | results = CustomGoogleScholarProfiles().scrape_google_scholar_profiles(query=query)
71 |
72 | for profile_data in results:
73 | self.assertIsNotNone(profile_data['name'])
74 | self.assertIsNotNone(profile_data['link'])
75 | self.assertIsNotNone(profile_data['affiliations'])
76 | self.assertIsNotNone(profile_data['email'])
77 | self.assertIsNotNone(profile_data['cited_by_count'])
78 | self.assertGreater(len(profile_data['interests']), 0)
79 |
80 |
81 |
82 | if __name__ == '__main__':
83 | unittest.main()
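
The commented-out remove_test_files fixture above never runs. A sketch of a session-scoped, autouse variant that cleans up after the whole test session, assuming the CSV/JSON files end up in the current working directory (it relies on the pytest, Path and os imports already at the top of this file; the fixture name is illustrative):

    @pytest.fixture(scope='session', autouse=True)
    def cleanup_test_files():
        yield  # let every test run first, then clean up
        for file_name in ('google_scholar_profile_results_data.csv', 'google_scholar_profile_results_data.json'):
            file_path = Path.cwd() / file_name
            if file_path.exists():
                os.remove(file_path)
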
--------------------------------------------------------------------------------