├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── Pipfile ├── README.md ├── docs ├── Makefile ├── about.rst ├── conf.py ├── databases.rst ├── examples.rst ├── html │ ├── .buildinfo │ ├── .doctrees │ │ ├── about.doctree │ │ ├── databases.doctree │ │ ├── environment.pickle │ │ ├── examples.doctree │ │ ├── index.doctree │ │ ├── installation.doctree │ │ ├── methods.doctree │ │ ├── network.doctree │ │ └── utils.doctree │ ├── _sources │ │ ├── about.rst.txt │ │ ├── databases.rst.txt │ │ ├── examples.rst.txt │ │ ├── index.rst.txt │ │ ├── installation.rst.txt │ │ ├── methods.rst.txt │ │ ├── network.rst.txt │ │ └── utils.rst.txt │ ├── _static │ │ ├── _sphinx_javascript_frameworks_compat.js │ │ ├── alabaster.css │ │ ├── basic.css │ │ ├── custom.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── jquery-3.6.0.js │ │ ├── jquery.js │ │ ├── language_data.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── sphinx_highlight.js │ │ ├── underscore-1.13.1.js │ │ └── underscore.js │ ├── about.html │ ├── databases.html │ ├── examples.html │ ├── genindex.html │ ├── index.html │ ├── installation.html │ ├── methods.html │ ├── network.html │ ├── objects.inv │ ├── py-modindex.html │ ├── search.html │ ├── searchindex.js │ └── utils.html ├── index.rst ├── installation.rst ├── make.bat ├── methods.rst ├── network.rst ├── references.bib ├── requirements.txt └── utils.rst ├── examples ├── Getting_Started │ ├── DatabaseGrowth.pdf │ ├── Getting Started with APS.ipynb │ ├── Getting Started with DBLP.ipynb │ ├── Getting Started with MAG.ipynb │ ├── Getting Started with OpenAlex.ipynb │ ├── Getting Started with PubMed.ipynb │ ├── Getting Started with WOS.ipynb │ ├── Getting Started with a Custom DB.ipynb │ ├── Getting Started with a Dask Example.ipynb │ ├── The Growth of Science.ipynb │ └── Working from a publication list.ipynb ├── GlobalCitationNetwork │ ├── GlobalCitationNetwork_analysis.ipynb │ ├── GlobalCitationNetwork_dataprep.ipynb │ ├── data │ │ ├── Chengetal_idealist.csv.gz │ │ ├── diffusion_panel_data_1990_2017.csv.gz │ │ ├── link_prediction_panel_data_1990_2017.csv.gz │ │ ├── oa_country_productivity.csv │ │ └── oa_countrycites_nosameorg_auc.csv.gz │ └── vizutils.py ├── Method_Examples │ ├── Comparing pandas, spark and dask-- number of citations.ipynb │ ├── Example Career Analysis.ipynb │ ├── Example Career Topic Switching.ipynb │ ├── Example Novelty.ipynb │ ├── Example Publication Citations.ipynb │ ├── Example Reference Strength.ipynb │ ├── Example of Credit Allocation.ipynb │ └── Example of Interdisciplinarity.ipynb ├── NLP_Examples │ ├── CoWordMention.html │ ├── Example of Coword Mention Network.ipynb │ └── Example_Node2vec (umap,sem_axis).ipynb ├── Network_Examples │ ├── DeSollaPriceCarrerCitations.pdf │ ├── DiversityCocitiation.pdf │ ├── Example of Cocitation Network.ipynb │ ├── Example of Diffusion of Scientific Credit.ipynb │ ├── NetworkNormalizedCitationAPS.ipynb │ ├── StirlingCocitiation.pdf │ └── Three Databases and Career Topic Switching.ipynb ├── ScienceOfScienceTextbook │ ├── Chapter 0 Preparing PySciSci.ipynb │ ├── Chapter 01 Productivity of a Scientist.ipynb │ ├── Chapter 02 The h-index.ipynb │ ├── Chapter 05 Random Impact Rule.ipynb │ ├── Chapter 06 The Q-Factor.ipynb │ ├── Chapter 08 The Increasing Dominance of Teams in Science.ipynb │ ├── Chapter 10 Coauthorship Networks.ipynb │ └── Chapter 14 Credit Allocation.ipynb ├── example_data │ ├── DeSollaPriceCareer.csv │ ├── 
fenn_paa.csv │ ├── fenn_pub2ref.csv │ └── focus_publications_example.csv └── example_interactive_html │ ├── sem_axis.html │ └── umap_fig.html ├── pyscisci ├── __init__.py ├── all.py ├── database.py ├── datasource │ ├── APS.py │ ├── CustomDB.py │ ├── DBLP.py │ ├── MAG.py │ ├── OpenAlex.py │ ├── PubMed.py │ ├── WOS.py │ ├── __init__.py │ └── readwrite.py ├── embedding.py ├── filter.py ├── methods │ ├── __init__.py │ ├── author.py │ ├── careertopics.py │ ├── cindex.py │ ├── creditshare.py │ ├── diffusionscientificcredit.py │ ├── disruption.py │ ├── diversity.py │ ├── hindex.py │ ├── hotstreak.py │ ├── journal.py │ ├── longtermimpact.py │ ├── netnormcite.py │ ├── novelty.py │ ├── pivotscore.py │ ├── productivitytrajectory.py │ ├── publication.py │ ├── qfactor.py │ ├── raostirling.py │ ├── referencestrength.py │ ├── sleepingbeauty.py │ └── topicsimilarity.py ├── network.py ├── nlp.py ├── sparsenetworkutils.py ├── tests │ ├── __init__.py │ └── test_utils.py ├── utils.py └── visualization.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # DS_Store 7 | .DS_Store 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: 14 | - pdf 15 | 16 | # Optionally set the version of Python and requirements required to build your docs 17 | python: 18 | install: 19 | - requirements: docs/requirements.txt 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - 3.7 5 | - 3.8 6 | 7 | install: 8 | - pip install pytest 9 | - pip install . 10 | 11 | script: 12 | - py.test 13 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | 6 | Please note we have a code of conduct, please follow it in all your interactions with the project. 7 | 8 | ## Pull Request Process 9 | 10 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a 11 | build. 12 | 2. Update the README.md with details of changes to the interface, this includes new environment 13 | variables, exposed ports, useful file locations and container parameters. 14 | 3. Increase the version numbers in any examples files and the README.md to the new version that this 15 | Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/). 16 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you 17 | do not have permission to do that, you may request the second reviewer to merge it for you. 18 | 19 | ## Code of Conduct 20 | 21 | ### Our Pledge 22 | 23 | In the interest of fostering an open and welcoming environment, we as 24 | contributors and maintainers pledge to making participation in our project and 25 | our community a harassment-free experience for everyone, regardless of age, body 26 | size, disability, ethnicity, gender identity and expression, level of experience, 27 | nationality, personal appearance, race, religion, or sexual identity and 28 | orientation. 
29 | 30 | ### Our Standards 31 | 32 | Examples of behavior that contributes to creating a positive environment 33 | include: 34 | 35 | * Using welcoming and inclusive language 36 | * Being respectful of differing viewpoints and experiences 37 | * Gracefully accepting constructive criticism 38 | * Focusing on what is best for the community 39 | * Showing empathy towards other community members 40 | 41 | Examples of unacceptable behavior by participants include: 42 | 43 | * The use of sexualized language or imagery and unwelcome sexual attention or 44 | advances 45 | * Trolling, insulting/derogatory comments, and personal or political attacks 46 | * Public or private harassment 47 | * Publishing others' private information, such as a physical or electronic 48 | address, without explicit permission 49 | * Other conduct which could reasonably be considered inappropriate in a 50 | professional setting 51 | 52 | ### Our Responsibilities 53 | 54 | Project maintainers are responsible for clarifying the standards of acceptable 55 | behavior and are expected to take appropriate and fair corrective action in 56 | response to any instances of unacceptable behavior. 57 | 58 | Project maintainers have the right and responsibility to remove, edit, or 59 | reject comments, commits, code, wiki edits, issues, and other contributions 60 | that are not aligned to this Code of Conduct, or to ban temporarily or 61 | permanently any contributor for other behaviors that they deem inappropriate, 62 | threatening, offensive, or harmful. 63 | 64 | ### Scope 65 | 66 | This Code of Conduct applies both within project spaces and in public spaces 67 | when an individual is representing the project or its community. Examples of 68 | representing a project or community include using an official project e-mail 69 | address, posting via an official social media account, or acting as an appointed 70 | representative at an online or offline event. Representation of a project may be 71 | further defined and clarified by project maintainers. 72 | 73 | ### Enforcement 74 | 75 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 76 | reported by contacting the project team at [INSERT EMAIL ADDRESS]. All 77 | complaints will be reviewed and investigated and will result in a response that 78 | is deemed necessary and appropriate to the circumstances. The project team is 79 | obligated to maintain confidentiality with regard to the reporter of an incident. 80 | Further details of specific enforcement policies may be posted separately. 81 | 82 | Project maintainers who do not follow or enforce the Code of Conduct in good 83 | faith may face temporary or permanent repercussions as determined by other 84 | members of the project's leadership. 85 | 86 | ### Attribution 87 | 88 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 89 | available at [http://contributor-covenant.org/version/1/4][version] 90 | 91 | [homepage]: http://contributor-covenant.org 92 | [version]: http://contributor-covenant.org/version/1/4/ -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to ``pySciSci`` contributing guide 2 | 3 | Thank you for investing your time in contributing to our project! 4 | 5 | Read our [Code of Conduct](./CODE_OF_CONDUCT.md) to keep our community approachable and respectable. 
6 | 7 | In this guide you will get an overview of the contribution workflow from opening an issue, creating a PR, reviewing, and merging the PR. 8 | 9 | ## New contributor guide 10 | 11 | To get an overview of the project, read the [README](README.md). Here are some resources to help you get started with open source contributions: 12 | 13 | - [Finding ways to contribute to open source on GitHub](https://docs.github.com/en/get-started/exploring-projects-on-github/finding-ways-to-contribute-to-open-source-on-github) 14 | - [Set up Git](https://docs.github.com/en/get-started/quickstart/set-up-git) 15 | - [GitHub flow](https://docs.github.com/en/get-started/quickstart/github-flow) 16 | - [Collaborating with pull requests](https://docs.github.com/en/github/collaborating-with-pull-requests) 17 | 18 | 19 | ## Getting started 20 | 21 | ### Requests 22 | 23 | The field of Science of Science is constantly evolving. If you see a feature that should be supported by the package, open an issue and label it as an enhancement. 24 | 25 | ### Issues 26 | 27 | #### Create a new issue 28 | 29 | If you spot a problem with the package, [search if an issue already exists](https://docs.github.com/en/github/searching-for-information-on-github/searching-on-github/searching-issues-and-pull-requests#search-by-the-title-body-or-comments). 30 | 31 | #### Solve an issue 32 | 33 | Scan through our [existing issues](https://github.com/SciSciCollective/pyscisci/issues) to find one that interests you. You can narrow down the search using `labels` as filters. As a general rule, we don’t assign issues to anyone. If you find an issue to work on, you are welcome to open a PR with a fix. 34 | 35 | ### Make Changes 36 | 37 | 38 | #### Make changes in a codespace 39 | 40 | For more information about using a codespace for working on GitHub documentation, see "[Working in a codespace](https://github.com/github/docs/blob/main/contributing/codespace.md)." 41 | 42 | #### Make changes locally 43 | 44 | 1. Fork the repository. 45 | - Using GitHub Desktop: 46 | - [Getting started with GitHub Desktop](https://docs.github.com/en/desktop/installing-and-configuring-github-desktop/getting-started-with-github-desktop) will guide you through setting up Desktop. 47 | - Once Desktop is set up, you can use it to [fork the repo](https://docs.github.com/en/desktop/contributing-and-collaborating-using-github-desktop/cloning-and-forking-repositories-from-github-desktop)! 48 | 49 | - Using the command line: 50 | - [Fork the repo](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo#fork-an-example-repository) so that you can make your changes without affecting the original project until you're ready to merge them. 51 | 52 | 2. Create a working branch and start with your changes! 53 | 54 | ### Commit your update 55 | 56 | Commit the changes once you are happy with them. 57 | 58 | ### Pull Request 59 | 60 | When you're finished with the changes, create a pull request, also known as a PR. 61 | - Fill the "Ready for review" template so that we can review your PR. This template helps reviewers understand your changes as well as the purpose of your pull request. 62 | - Don't forget to [link PR to issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) if you are solving one.
63 | - Enable the checkbox to [allow maintainer edits](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/allowing-changes-to-a-pull-request-branch-created-from-a-fork) so the branch can be updated for a merge. 64 | Once you submit your PR, a ``pySciSci`` team member will review your proposal. We may ask questions or request additional information. 65 | - We may ask for changes to be made before a PR can be merged, either using [suggested changes](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/incorporating-feedback-in-your-pull-request) or pull request comments. You can apply suggested changes directly through the UI. You can make any other changes in your fork, then commit them to your branch. 66 | - As you update your PR and apply changes, mark each conversation as [resolved](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/commenting-on-a-pull-request#resolving-conversations). 67 | - If you run into any merge issues, check out this [git tutorial](https://github.com/skills/resolve-merge-conflicts) to help you resolve merge conflicts and other issues. 68 | 69 | ### Your PR is merged! 70 | 71 | Congratulations :tada::tada: The ``pySciSci`` team thanks you :sparkles:. 72 | 73 | Now that you are part of the ``pySciSci`` community, add yourself to the ``pySciSci`` readme. 74 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Alexander Gates 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | 8 | [dev-packages] 9 | 10 | [requires] 11 | python_version = "3.8" 12 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = .
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | About 2 | =================== 3 | The Science of Science (SciSci) is a growing field at the boundary of sociology, network science, and computational social science :cite:p:`Fortunato2018scisci`. It encompasses diverse interdisciplinary research programs that study the processes underlying science :cite:`Wang2021scisci`. The field has benefited greatly from access to massive digital databases containing the products of scientific discourse---including publications, journals, patents, books, conference proceedings, and grants. The subsequent proliferation of mathematical models and computational techniques for quantifying the dynamics of innovation and success in science has made it difficult to disentangle universal scientific processes from those dependent on specific databases, data-processing decisions, field practices, etc. 4 | 5 | 6 | Here we present *pySciSci* for the analysis of large-scale bibliometric data. The package standardizes access to many of the most common datasets in SciSci and provides efficient implementations of common and advanced analytical techniques. The *pySciSci* package is intended for researchers of SciSci or those who wish to integrate large-scale bibliometric data into other existing projects. 7 | 8 | By creating a standardized and adaptable programmatic base for the study of bibliometric data, we intend to help democratize SciSci, support diverse research efforts based on bibliometric datasets, and address calls for open access and reproducibility in the SciSci literature and community. We also encourage the SciSci community to contribute their own implementations, data, and use cases. 9 | 10 | 11 | Funding 12 | --------- 13 | *pySciSci* acknowledges support from the following grants: 14 | 15 | - Air Force Office of Scientific Research Award FA9550-19-1-0354 16 | - Templeton Foundation Contract 61066 -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'pySciSci' 21 | copyright = '2021, Alexander Gates' 22 | author = 'Alexander Gates' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '0.6' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon', 'sphinxcontrib.bibtex' 34 | ] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # List of patterns, relative to source directory, that match files and 40 | # directories to ignore when looking for source files. 41 | # This pattern also affects html_static_path and html_extra_path. 42 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 43 | 44 | 45 | # -- Options for HTML output ------------------------------------------------- 46 | 47 | # The theme to use for HTML and HTML Help pages. See the documentation for 48 | # a list of builtin themes. 49 | # 50 | html_theme = 'alabaster' 51 | 52 | # Add any paths that contain custom static files (such as style sheets) here, 53 | # relative to this directory. They are copied after the builtin static files, 54 | # so a file named "default.css" will overwrite the builtin "default.css". 55 | html_static_path = ['_static'] 56 | 57 | # bibtex references 58 | bibtex_bibfiles = ['references.bib'] 59 | 60 | # bibtex style 61 | bibtex_default_style = 'unsrt' -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | =================== 3 | 4 | Getting Started Examples 5 | ------------------------- 6 | - `Getting Started With OpenAlex `_ 7 | - `Getting Started With MAG `_ 8 | - `Getting Started With WoS `_ 9 | - `Getting Started With DBLP `_ 10 | - `Getting Started With APS `_ 11 | - `Getting Started With PubMed `_ 12 | - `Getting Started With Custom DB `_ 13 | - `The Growth of Science `_ 14 | 15 | Method Examples 16 | ------------------------- 17 | - `The Diffusion of Scientific Credit `_ 18 | - `Credit Allocation `_ 19 | - `Interdisciplinarity `_ 20 | - `Novelty and Conventionality `_ 21 | - `Cocitation Network `_ 22 | - `Methods for Publication Impact `_ 23 | - `Career Analysis `_ 24 | - `Career Topic Switching `_ 25 | - `Field Reference Strength `_ 26 | - `Node2Vec Coauthorship `_ 27 | - `CoWord Network `_ 28 | 29 | Advanced Functionality 30 | ------------------------- 31 | - `Using Dask `_ 32 | - `Working from a publication List `_ 33 | 34 | Science of Science Textbook 35 | ---------------------------- 36 | Examples taken from different chapters of the Science of Science Textbook :cite:`Wang2021scisci`.
37 | 38 | - `Chapter 0 Data Prep `_ 39 | - `Chapter 1 Productivity of a Scientist `_ 40 | - `Chapter 2 The HIndex `_ 41 | - `Chapter 5 The Random Impact Rule `_ 42 | - `Chapter 6 The Qfactor `_ 43 | - `Chapter 14 Credit Allocation `_ 44 | -------------------------------------------------------------------------------- /docs/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: cb6ac21b1379d0a62ccad8688800854b 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/html/.doctrees/about.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/about.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/databases.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/databases.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/html/.doctrees/examples.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/examples.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/index.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/installation.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/installation.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/methods.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/methods.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/network.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/network.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/utils.doctree: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/utils.doctree -------------------------------------------------------------------------------- /docs/html/_sources/about.rst.txt: -------------------------------------------------------------------------------- 1 | About 2 | =================== 3 | The Science of Science (SciSci) is a growing field at the boundary of sociology, network science, and computational social science :cite:p:`Fortunato2018scisci`. It encompasses diverse interdisciplinary research programs that study the processes underlying science :cite:`Wang2021scisci`. The field has benefited greatly from access to massive digital databases containing the products of scientific discourse---including publications, journals, patents, books, conference proceedings, and grants. The subsequent proliferation of mathematical models and computational techniques for quantifying the dynamics of innovation and success in science has made it difficult to disentangle universal scientific processes from those dependent on specific databases, data-processing decisions, field practices, etc. 4 | 5 | 6 | Here we present *pySciSci* for the analysis of large-scale bibliometric data. The package standardizes access to many of the most common datasets in SciSci and provides efficient implementations of common and advanced analytical techniques. The *pySciSci* package is intended for researchers of SciSci or those who wish to integrate large-scale bibliometric data into other existing projects. 7 | 8 | By creating a standardized and adaptable programmatic base for the study of bibliometric data, we intend to help democratize SciSci, support diverse research efforts based on bibliometric datasets, and address calls for open access and reproducibility in the SciSci literature and community. We also encourage the SciSci community to contribute their own implementations, data, and use cases.
9 | 10 | 11 | Funding 12 | --------- 13 | *pySciSci* acknowledges support from the following grants: 14 | 15 | - Air Force Office of Scientific Research Award FA9550-19-1-0354 16 | - Templeton Foundation Contract 61066 -------------------------------------------------------------------------------- /docs/html/_sources/examples.rst.txt: -------------------------------------------------------------------------------- 1 | Examples 2 | =================== 3 | 4 | Getting Started Examples 5 | ------------------------- 6 | - `Getting Started With OpenAlex `_ 7 | - `Getting Started With MAG `_ 8 | - `Getting Started With WoS `_ 9 | - `Getting Started With DBLP `_ 10 | - `Getting Started With APS `_ 11 | - `Getting Started With PubMed `_ 12 | - `Getting Started With Custom DB `_ 13 | - `The Growth of Science `_ 14 | 15 | Method Examples 16 | ------------------------- 17 | - `The Diffusion of Scientific Credit `_ 18 | - `Credit Allocation `_ 19 | - `Interdisciplinarity `_ 20 | - `Novelty and Conventionality `_ 21 | - `Cocitation Network `_ 22 | - `Methods for Publication Impact `_ 23 | - `Career Analysis `_ 24 | - `Career Topic Switching `_ 25 | - `Field Reference Strength `_ 26 | - `Node2Vec Coauthorship `_ 27 | - `CoWord Network `_ 28 | 29 | Advanced Functionality 30 | ------------------------- 31 | - `Using Dask `_ 32 | - `Working from a publication List `_ 33 | 34 | Science of Science Textbook 35 | ---------------------------- 36 | Examples taken from different chapters of the Science of Science Textbook :cite:`Wang2021scisci`. 37 | - `Chapter 0 Data Prep `_ 38 | - `Chapter 1 Productivity of a Scientist `_ 39 | - `Chapter 2 The HIndex `_ 40 | - `Chapter 5 The Random Impact Rule `_ 41 | - `Chapter 6 The Qfactor `_ 42 | - `Chapter 14 Credit Allocation `_ 43 | -------------------------------------------------------------------------------- /docs/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. pySciSci documentation master file, created by 2 | sphinx-quickstart on Mon Jun 21 10:54:47 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pySciSci's documentation! 7 | ************************************** 8 | 9 | Table of Contents 10 | =================== 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | about 15 | installation 16 | examples 17 | databases 18 | methods 19 | network 20 | utils 21 | 22 | 23 | References 24 | =================== 25 | .. bibliography:: -------------------------------------------------------------------------------- /docs/html/_sources/installation.rst.txt: -------------------------------------------------------------------------------- 1 | Installation 2 | =================== 3 | This package is available on PyPI. Run the following command in a terminal to install it. 4 | 5 | >>> pip install pyscisci 6 | 7 | You can also get the source code directly from the GitHub `project page `_. -------------------------------------------------------------------------------- /docs/html/_sources/methods.rst.txt: -------------------------------------------------------------------------------- 1 | Bibliometrics 2 | ====================== 3 | *pySciSci* facilitates the analysis of publications, authors, and citations, as well as citation time series, fixed time window citation analysis, and citation count normalization by year and field.
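To make the simplest of the measures above concrete, the short pandas sketch below derives raw citation counts from a publication-to-reference table and computes an h-index (the author-level measure discussed further below) from those counts. The toy table and its column names (``CitingPublicationId``, ``CitedPublicationId``) are illustrative assumptions only; pySciSci's database interfaces and the functions documented in the following sections provide the full, database-specific implementations.

.. code-block:: python

    import pandas as pd

    # Toy publication-to-reference table; the column names are hypothetical
    # stand-ins for the database-specific tables that pySciSci loads.
    pub2ref = pd.DataFrame({
        'CitingPublicationId': [2, 3, 4, 4, 5],
        'CitedPublicationId':  [1, 1, 1, 2, 2],
    })

    # Citation count: the number of times each publication is referenced.
    citation_counts = pub2ref.groupby('CitedPublicationId').size()

    def hindex(counts):
        # The largest h such that h publications have at least h citations each.
        counts = sorted(counts, reverse=True)
        return sum(c >= rank for rank, c in enumerate(counts, start=1))

    print(citation_counts.to_dict())   # {1: 3, 2: 2}
    print(hindex(citation_counts))     # 2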
4 | 5 | 6 | Publications and Citations 7 | --------------------------- 8 | The *pySciSci* package facilitates the analysis of interrelationships between publications as captured by references and citations. 9 | 10 | For example, the most common measure of scientific impact is the citation count, or the number of times a publication has been referenced by other publications. Variations also include citation time series, fixed time window citation analysis, citation count normalization by year and field, and citation ranks. More advanced methods fit models to citation time series, such as the prediction of a publication's long-term citation count :cite:`wang2013longterm` or the assignment of the sleeping beauty score :cite:`ke2015sleepingbeauty`. The package also supports the removal of self-citations occurring between publications by the same author. 11 | 12 | More advanced metrics capture the diversity in the citation interrelationships between publications. These measures include the Rao-Stirling reference interdisciplinarity :cite:`stirling2007diversity`, novelty & conventionality :cite:`uzzi2013atypical`, and the disruption index :cite:`funk2017dynamic`, :cite:`wu2019largeteams`. 13 | 14 | .. automodule:: pyscisci.methods.publication 15 | :members: 16 | 17 | 18 | 19 | Author-centric Methods 20 | ---------------------- 21 | 22 | The sociology of science has analyzed scientific careers in terms of individual incentives, productivity, competition, collaboration, and success. The *pySciSci* package facilitates author career analysis through both aggregate career statistics and temporal career trajectories. Highlights include the H-index :cite:`hirsch2005index`, Q-factor :cite:`sinatra2016quantifying`, yearly productivity trajectories :cite:`way2017misleading`, collective credit assignment :cite:`shen2014collective`, and hot-hand effect :cite:`liu2018hot`. 23 | 24 | 25 | .. automodule:: pyscisci.methods.author 26 | :members: 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/html/_sources/network.rst.txt: -------------------------------------------------------------------------------- 1 | Biblio-Networks 2 | ====================== 3 | Tools for constructing and analyzing bibliometric networks, such as coauthorship, citation, and cocitation networks. 4 | 5 | .. automodule:: pyscisci.network 6 | :members: -------------------------------------------------------------------------------- /docs/html/_sources/utils.rst.txt: -------------------------------------------------------------------------------- 1 | General Functions 2 | ====================== 3 | 4 | .. automodule:: pyscisci.utils 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/html/_static/_sphinx_javascript_frameworks_compat.js: -------------------------------------------------------------------------------- 1 | /* 2 | * _sphinx_javascript_frameworks_compat.js 3 | * ~~~~~~~~~~ 4 | * 5 | * Compatibility shim for jQuery and underscore.js.
6 | * 7 | * WILL BE REMOVED IN Sphinx 6.0 8 | * xref RemovedInSphinx60Warning 9 | * 10 | */ 11 | 12 | /** 13 | * select a different prefix for underscore 14 | */ 15 | $u = _.noConflict(); 16 | 17 | 18 | /** 19 | * small helper function to urldecode strings 20 | * 21 | * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL 22 | */ 23 | jQuery.urldecode = function(x) { 24 | if (!x) { 25 | return x 26 | } 27 | return decodeURIComponent(x.replace(/\+/g, ' ')); 28 | }; 29 | 30 | /** 31 | * small helper function to urlencode strings 32 | */ 33 | jQuery.urlencode = encodeURIComponent; 34 | 35 | /** 36 | * This function returns the parsed url parameters of the 37 | * current request. Multiple values per key are supported, 38 | * it will always return arrays of strings for the value parts. 39 | */ 40 | jQuery.getQueryParameters = function(s) { 41 | if (typeof s === 'undefined') 42 | s = document.location.search; 43 | var parts = s.substr(s.indexOf('?') + 1).split('&'); 44 | var result = {}; 45 | for (var i = 0; i < parts.length; i++) { 46 | var tmp = parts[i].split('=', 2); 47 | var key = jQuery.urldecode(tmp[0]); 48 | var value = jQuery.urldecode(tmp[1]); 49 | if (key in result) 50 | result[key].push(value); 51 | else 52 | result[key] = [value]; 53 | } 54 | return result; 55 | }; 56 | 57 | /** 58 | * highlight a given string on a jquery object by wrapping it in 59 | * span elements with the given class name. 60 | */ 61 | jQuery.fn.highlightText = function(text, className) { 62 | function highlight(node, addItems) { 63 | if (node.nodeType === 3) { 64 | var val = node.nodeValue; 65 | var pos = val.toLowerCase().indexOf(text); 66 | if (pos >= 0 && 67 | !jQuery(node.parentNode).hasClass(className) && 68 | !jQuery(node.parentNode).hasClass("nohighlight")) { 69 | var span; 70 | var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); 71 | if (isInSVG) { 72 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 73 | } else { 74 | span = document.createElement("span"); 75 | span.className = className; 76 | } 77 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 78 | node.parentNode.insertBefore(span, node.parentNode.insertBefore( 79 | document.createTextNode(val.substr(pos + text.length)), 80 | node.nextSibling)); 81 | node.nodeValue = val.substr(0, pos); 82 | if (isInSVG) { 83 | var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); 84 | var bbox = node.parentElement.getBBox(); 85 | rect.x.baseVal.value = bbox.x; 86 | rect.y.baseVal.value = bbox.y; 87 | rect.width.baseVal.value = bbox.width; 88 | rect.height.baseVal.value = bbox.height; 89 | rect.setAttribute('class', className); 90 | addItems.push({ 91 | "parent": node.parentNode, 92 | "target": rect}); 93 | } 94 | } 95 | } 96 | else if (!jQuery(node).is("button, select, textarea")) { 97 | jQuery.each(node.childNodes, function() { 98 | highlight(this, addItems); 99 | }); 100 | } 101 | } 102 | var addItems = []; 103 | var result = this.each(function() { 104 | highlight(this, addItems); 105 | }); 106 | for (var i = 0; i < addItems.length; ++i) { 107 | jQuery(addItems[i].parent).before(addItems[i].target); 108 | } 109 | return result; 110 | }; 111 | 112 | /* 113 | * backward compatibility for jQuery.browser 114 | * This will be supported until firefox bug is fixed. 
115 | */ 116 | if (!jQuery.browser) { 117 | jQuery.uaMatch = function(ua) { 118 | ua = ua.toLowerCase(); 119 | 120 | var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || 121 | /(webkit)[ \/]([\w.]+)/.exec(ua) || 122 | /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || 123 | /(msie) ([\w.]+)/.exec(ua) || 124 | ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || 125 | []; 126 | 127 | return { 128 | browser: match[ 1 ] || "", 129 | version: match[ 2 ] || "0" 130 | }; 131 | }; 132 | jQuery.browser = {}; 133 | jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; 134 | } 135 | -------------------------------------------------------------------------------- /docs/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally left blank. */ 2 | -------------------------------------------------------------------------------- /docs/html/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * doctools.js 3 | * ~~~~~~~~~~~ 4 | * 5 | * Base JavaScript utilities for all Sphinx HTML documentation. 6 | * 7 | * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | "use strict"; 12 | 13 | const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ 14 | "TEXTAREA", 15 | "INPUT", 16 | "SELECT", 17 | "BUTTON", 18 | ]); 19 | 20 | const _ready = (callback) => { 21 | if (document.readyState !== "loading") { 22 | callback(); 23 | } else { 24 | document.addEventListener("DOMContentLoaded", callback); 25 | } 26 | }; 27 | 28 | /** 29 | * Small JavaScript module for the documentation. 30 | */ 31 | const Documentation = { 32 | init: () => { 33 | Documentation.initDomainIndexTable(); 34 | Documentation.initOnKeyListeners(); 35 | }, 36 | 37 | /** 38 | * i18n support 39 | */ 40 | TRANSLATIONS: {}, 41 | PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), 42 | LOCALE: "unknown", 43 | 44 | // gettext and ngettext don't access this so that the functions 45 | // can safely bound to a different name (_ = Documentation.gettext) 46 | gettext: (string) => { 47 | const translated = Documentation.TRANSLATIONS[string]; 48 | switch (typeof translated) { 49 | case "undefined": 50 | return string; // no translation 51 | case "string": 52 | return translated; // translation exists 53 | default: 54 | return translated[0]; // (singular, plural) translation tuple exists 55 | } 56 | }, 57 | 58 | ngettext: (singular, plural, n) => { 59 | const translated = Documentation.TRANSLATIONS[singular]; 60 | if (typeof translated !== "undefined") 61 | return translated[Documentation.PLURAL_EXPR(n)]; 62 | return n === 1 ? 
singular : plural; 63 | }, 64 | 65 | addTranslations: (catalog) => { 66 | Object.assign(Documentation.TRANSLATIONS, catalog.messages); 67 | Documentation.PLURAL_EXPR = new Function( 68 | "n", 69 | `return (${catalog.plural_expr})` 70 | ); 71 | Documentation.LOCALE = catalog.locale; 72 | }, 73 | 74 | /** 75 | * helper function to focus on search bar 76 | */ 77 | focusSearchBar: () => { 78 | document.querySelectorAll("input[name=q]")[0]?.focus(); 79 | }, 80 | 81 | /** 82 | * Initialise the domain index toggle buttons 83 | */ 84 | initDomainIndexTable: () => { 85 | const toggler = (el) => { 86 | const idNumber = el.id.substr(7); 87 | const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); 88 | if (el.src.substr(-9) === "minus.png") { 89 | el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; 90 | toggledRows.forEach((el) => (el.style.display = "none")); 91 | } else { 92 | el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; 93 | toggledRows.forEach((el) => (el.style.display = "")); 94 | } 95 | }; 96 | 97 | const togglerElements = document.querySelectorAll("img.toggler"); 98 | togglerElements.forEach((el) => 99 | el.addEventListener("click", (event) => toggler(event.currentTarget)) 100 | ); 101 | togglerElements.forEach((el) => (el.style.display = "")); 102 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); 103 | }, 104 | 105 | initOnKeyListeners: () => { 106 | // only install a listener if it is really needed 107 | if ( 108 | !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && 109 | !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS 110 | ) 111 | return; 112 | 113 | document.addEventListener("keydown", (event) => { 114 | // bail for input elements 115 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 116 | // bail with special keys 117 | if (event.altKey || event.ctrlKey || event.metaKey) return; 118 | 119 | if (!event.shiftKey) { 120 | switch (event.key) { 121 | case "ArrowLeft": 122 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 123 | 124 | const prevLink = document.querySelector('link[rel="prev"]'); 125 | if (prevLink && prevLink.href) { 126 | window.location.href = prevLink.href; 127 | event.preventDefault(); 128 | } 129 | break; 130 | case "ArrowRight": 131 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 132 | 133 | const nextLink = document.querySelector('link[rel="next"]'); 134 | if (nextLink && nextLink.href) { 135 | window.location.href = nextLink.href; 136 | event.preventDefault(); 137 | } 138 | break; 139 | } 140 | } 141 | 142 | // some keyboard layouts may need Shift to get / 143 | switch (event.key) { 144 | case "/": 145 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; 146 | Documentation.focusSearchBar(); 147 | event.preventDefault(); 148 | } 149 | }); 150 | }, 151 | }; 152 | 153 | // quick alias for translations 154 | const _ = Documentation.gettext; 155 | 156 | _ready(Documentation.init); 157 | -------------------------------------------------------------------------------- /docs/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '0.6', 4 | LANGUAGE: 'en', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '.txt', 11 | NAVIGATION_WITH_KEYS: false, 12 | 
SHOW_SEARCH_SUMMARY: true, 13 | ENABLE_SEARCH_SHORTCUTS: true, 14 | }; -------------------------------------------------------------------------------- /docs/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/_static/file.png -------------------------------------------------------------------------------- /docs/html/_static/language_data.js: -------------------------------------------------------------------------------- 1 | /* 2 | * language_data.js 3 | * ~~~~~~~~~~~~~~~~ 4 | * 5 | * This script contains the language-specific data used by searchtools.js, 6 | * namely the list of stopwords, stemmer, scorer and splitter. 7 | * 8 | * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. 9 | * :license: BSD, see LICENSE for details. 10 | * 11 | */ 12 | 13 | var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; 14 | 15 | 16 | /* Non-minified version is copied as a separate JS file, is available */ 17 | 18 | /** 19 | * Porter Stemmer 20 | */ 21 | var Stemmer = function() { 22 | 23 | var step2list = { 24 | ational: 'ate', 25 | tional: 'tion', 26 | enci: 'ence', 27 | anci: 'ance', 28 | izer: 'ize', 29 | bli: 'ble', 30 | alli: 'al', 31 | entli: 'ent', 32 | eli: 'e', 33 | ousli: 'ous', 34 | ization: 'ize', 35 | ation: 'ate', 36 | ator: 'ate', 37 | alism: 'al', 38 | iveness: 'ive', 39 | fulness: 'ful', 40 | ousness: 'ous', 41 | aliti: 'al', 42 | iviti: 'ive', 43 | biliti: 'ble', 44 | logi: 'log' 45 | }; 46 | 47 | var step3list = { 48 | icate: 'ic', 49 | ative: '', 50 | alize: 'al', 51 | iciti: 'ic', 52 | ical: 'ic', 53 | ful: '', 54 | ness: '' 55 | }; 56 | 57 | var c = "[^aeiou]"; // consonant 58 | var v = "[aeiouy]"; // vowel 59 | var C = c + "[^aeiouy]*"; // consonant sequence 60 | var V = v + "[aeiou]*"; // vowel sequence 61 | 62 | var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 63 | var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 64 | var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 65 | var s_v = "^(" + C + ")?" 
+ v; // vowel in stem 66 | 67 | this.stemWord = function (w) { 68 | var stem; 69 | var suffix; 70 | var firstch; 71 | var origword = w; 72 | 73 | if (w.length < 3) 74 | return w; 75 | 76 | var re; 77 | var re2; 78 | var re3; 79 | var re4; 80 | 81 | firstch = w.substr(0,1); 82 | if (firstch == "y") 83 | w = firstch.toUpperCase() + w.substr(1); 84 | 85 | // Step 1a 86 | re = /^(.+?)(ss|i)es$/; 87 | re2 = /^(.+?)([^s])s$/; 88 | 89 | if (re.test(w)) 90 | w = w.replace(re,"$1$2"); 91 | else if (re2.test(w)) 92 | w = w.replace(re2,"$1$2"); 93 | 94 | // Step 1b 95 | re = /^(.+?)eed$/; 96 | re2 = /^(.+?)(ed|ing)$/; 97 | if (re.test(w)) { 98 | var fp = re.exec(w); 99 | re = new RegExp(mgr0); 100 | if (re.test(fp[1])) { 101 | re = /.$/; 102 | w = w.replace(re,""); 103 | } 104 | } 105 | else if (re2.test(w)) { 106 | var fp = re2.exec(w); 107 | stem = fp[1]; 108 | re2 = new RegExp(s_v); 109 | if (re2.test(stem)) { 110 | w = stem; 111 | re2 = /(at|bl|iz)$/; 112 | re3 = new RegExp("([^aeiouylsz])\\1$"); 113 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 114 | if (re2.test(w)) 115 | w = w + "e"; 116 | else if (re3.test(w)) { 117 | re = /.$/; 118 | w = w.replace(re,""); 119 | } 120 | else if (re4.test(w)) 121 | w = w + "e"; 122 | } 123 | } 124 | 125 | // Step 1c 126 | re = /^(.+?)y$/; 127 | if (re.test(w)) { 128 | var fp = re.exec(w); 129 | stem = fp[1]; 130 | re = new RegExp(s_v); 131 | if (re.test(stem)) 132 | w = stem + "i"; 133 | } 134 | 135 | // Step 2 136 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 137 | if (re.test(w)) { 138 | var fp = re.exec(w); 139 | stem = fp[1]; 140 | suffix = fp[2]; 141 | re = new RegExp(mgr0); 142 | if (re.test(stem)) 143 | w = stem + step2list[suffix]; 144 | } 145 | 146 | // Step 3 147 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 148 | if (re.test(w)) { 149 | var fp = re.exec(w); 150 | stem = fp[1]; 151 | suffix = fp[2]; 152 | re = new RegExp(mgr0); 153 | if (re.test(stem)) 154 | w = stem + step3list[suffix]; 155 | } 156 | 157 | // Step 4 158 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 159 | re2 = /^(.+?)(s|t)(ion)$/; 160 | if (re.test(w)) { 161 | var fp = re.exec(w); 162 | stem = fp[1]; 163 | re = new RegExp(mgr1); 164 | if (re.test(stem)) 165 | w = stem; 166 | } 167 | else if (re2.test(w)) { 168 | var fp = re2.exec(w); 169 | stem = fp[1] + fp[2]; 170 | re2 = new RegExp(mgr1); 171 | if (re2.test(stem)) 172 | w = stem; 173 | } 174 | 175 | // Step 5 176 | re = /^(.+?)e$/; 177 | if (re.test(w)) { 178 | var fp = re.exec(w); 179 | stem = fp[1]; 180 | re = new RegExp(mgr1); 181 | re2 = new RegExp(meq1); 182 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 183 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) 184 | w = stem; 185 | } 186 | re = /ll$/; 187 | re2 = new RegExp(mgr1); 188 | if (re.test(w) && re2.test(w)) { 189 | re = /.$/; 190 | w = w.replace(re,""); 191 | } 192 | 193 | // and turn initial Y back to y 194 | if (firstch == "y") 195 | w = firstch.toLowerCase() + w.substr(1); 196 | return w; 197 | } 198 | } 199 | 200 | -------------------------------------------------------------------------------- /docs/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/_static/minus.png 
-------------------------------------------------------------------------------- /docs/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/_static/plus.png -------------------------------------------------------------------------------- /docs/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | pre { line-height: 125%; } 2 | td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 4 | td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 5 | span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 6 | .highlight .hll { background-color: #ffffcc } 7 | .highlight { background: #f8f8f8; } 8 | .highlight .c { color: #8f5902; font-style: italic } /* Comment */ 9 | .highlight .err { color: #a40000; border: 1px solid #ef2929 } /* Error */ 10 | .highlight .g { color: #000000 } /* Generic */ 11 | .highlight .k { color: #004461; font-weight: bold } /* Keyword */ 12 | .highlight .l { color: #000000 } /* Literal */ 13 | .highlight .n { color: #000000 } /* Name */ 14 | .highlight .o { color: #582800 } /* Operator */ 15 | .highlight .x { color: #000000 } /* Other */ 16 | .highlight .p { color: #000000; font-weight: bold } /* Punctuation */ 17 | .highlight .ch { color: #8f5902; font-style: italic } /* Comment.Hashbang */ 18 | .highlight .cm { color: #8f5902; font-style: italic } /* Comment.Multiline */ 19 | .highlight .cp { color: #8f5902 } /* Comment.Preproc */ 20 | .highlight .cpf { color: #8f5902; font-style: italic } /* Comment.PreprocFile */ 21 | .highlight .c1 { color: #8f5902; font-style: italic } /* Comment.Single */ 22 | .highlight .cs { color: #8f5902; font-style: italic } /* Comment.Special */ 23 | .highlight .gd { color: #a40000 } /* Generic.Deleted */ 24 | .highlight .ge { color: #000000; font-style: italic } /* Generic.Emph */ 25 | .highlight .gr { color: #ef2929 } /* Generic.Error */ 26 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 27 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 28 | .highlight .go { color: #888888 } /* Generic.Output */ 29 | .highlight .gp { color: #745334 } /* Generic.Prompt */ 30 | .highlight .gs { color: #000000; font-weight: bold } /* Generic.Strong */ 31 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 32 | .highlight .gt { color: #a40000; font-weight: bold } /* Generic.Traceback */ 33 | .highlight .kc { color: #004461; font-weight: bold } /* Keyword.Constant */ 34 | .highlight .kd { color: #004461; font-weight: bold } /* Keyword.Declaration */ 35 | .highlight .kn { color: #004461; font-weight: bold } /* Keyword.Namespace */ 36 | .highlight .kp { color: #004461; font-weight: bold } /* Keyword.Pseudo */ 37 | .highlight .kr { color: #004461; font-weight: bold } /* Keyword.Reserved */ 38 | .highlight .kt { color: #004461; font-weight: bold } /* Keyword.Type */ 39 | .highlight .ld { color: #000000 } /* Literal.Date */ 40 | .highlight .m { color: #990000 } /* Literal.Number */ 41 | .highlight .s { color: #4e9a06 } /* Literal.String */ 42 | .highlight .na { color: #c4a000 } /* Name.Attribute */ 43 | .highlight .nb { color: 
#004461 } /* Name.Builtin */ 44 | .highlight .nc { color: #000000 } /* Name.Class */ 45 | .highlight .no { color: #000000 } /* Name.Constant */ 46 | .highlight .nd { color: #888888 } /* Name.Decorator */ 47 | .highlight .ni { color: #ce5c00 } /* Name.Entity */ 48 | .highlight .ne { color: #cc0000; font-weight: bold } /* Name.Exception */ 49 | .highlight .nf { color: #000000 } /* Name.Function */ 50 | .highlight .nl { color: #f57900 } /* Name.Label */ 51 | .highlight .nn { color: #000000 } /* Name.Namespace */ 52 | .highlight .nx { color: #000000 } /* Name.Other */ 53 | .highlight .py { color: #000000 } /* Name.Property */ 54 | .highlight .nt { color: #004461; font-weight: bold } /* Name.Tag */ 55 | .highlight .nv { color: #000000 } /* Name.Variable */ 56 | .highlight .ow { color: #004461; font-weight: bold } /* Operator.Word */ 57 | .highlight .pm { color: #000000; font-weight: bold } /* Punctuation.Marker */ 58 | .highlight .w { color: #f8f8f8; text-decoration: underline } /* Text.Whitespace */ 59 | .highlight .mb { color: #990000 } /* Literal.Number.Bin */ 60 | .highlight .mf { color: #990000 } /* Literal.Number.Float */ 61 | .highlight .mh { color: #990000 } /* Literal.Number.Hex */ 62 | .highlight .mi { color: #990000 } /* Literal.Number.Integer */ 63 | .highlight .mo { color: #990000 } /* Literal.Number.Oct */ 64 | .highlight .sa { color: #4e9a06 } /* Literal.String.Affix */ 65 | .highlight .sb { color: #4e9a06 } /* Literal.String.Backtick */ 66 | .highlight .sc { color: #4e9a06 } /* Literal.String.Char */ 67 | .highlight .dl { color: #4e9a06 } /* Literal.String.Delimiter */ 68 | .highlight .sd { color: #8f5902; font-style: italic } /* Literal.String.Doc */ 69 | .highlight .s2 { color: #4e9a06 } /* Literal.String.Double */ 70 | .highlight .se { color: #4e9a06 } /* Literal.String.Escape */ 71 | .highlight .sh { color: #4e9a06 } /* Literal.String.Heredoc */ 72 | .highlight .si { color: #4e9a06 } /* Literal.String.Interpol */ 73 | .highlight .sx { color: #4e9a06 } /* Literal.String.Other */ 74 | .highlight .sr { color: #4e9a06 } /* Literal.String.Regex */ 75 | .highlight .s1 { color: #4e9a06 } /* Literal.String.Single */ 76 | .highlight .ss { color: #4e9a06 } /* Literal.String.Symbol */ 77 | .highlight .bp { color: #3465a4 } /* Name.Builtin.Pseudo */ 78 | .highlight .fm { color: #000000 } /* Name.Function.Magic */ 79 | .highlight .vc { color: #000000 } /* Name.Variable.Class */ 80 | .highlight .vg { color: #000000 } /* Name.Variable.Global */ 81 | .highlight .vi { color: #000000 } /* Name.Variable.Instance */ 82 | .highlight .vm { color: #000000 } /* Name.Variable.Magic */ 83 | .highlight .il { color: #990000 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/html/_static/sphinx_highlight.js: -------------------------------------------------------------------------------- 1 | /* Highlighting utilities for Sphinx HTML documentation. */ 2 | "use strict"; 3 | 4 | const SPHINX_HIGHLIGHT_ENABLED = true 5 | 6 | /** 7 | * highlight a given string on a node by wrapping it in 8 | * span elements with the given class name. 
9 | */ 10 | const _highlight = (node, addItems, text, className) => { 11 | if (node.nodeType === Node.TEXT_NODE) { 12 | const val = node.nodeValue; 13 | const parent = node.parentNode; 14 | const pos = val.toLowerCase().indexOf(text); 15 | if ( 16 | pos >= 0 && 17 | !parent.classList.contains(className) && 18 | !parent.classList.contains("nohighlight") 19 | ) { 20 | let span; 21 | 22 | const closestNode = parent.closest("body, svg, foreignObject"); 23 | const isInSVG = closestNode && closestNode.matches("svg"); 24 | if (isInSVG) { 25 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 26 | } else { 27 | span = document.createElement("span"); 28 | span.classList.add(className); 29 | } 30 | 31 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 32 | parent.insertBefore( 33 | span, 34 | parent.insertBefore( 35 | document.createTextNode(val.substr(pos + text.length)), 36 | node.nextSibling 37 | ) 38 | ); 39 | node.nodeValue = val.substr(0, pos); 40 | 41 | if (isInSVG) { 42 | const rect = document.createElementNS( 43 | "http://www.w3.org/2000/svg", 44 | "rect" 45 | ); 46 | const bbox = parent.getBBox(); 47 | rect.x.baseVal.value = bbox.x; 48 | rect.y.baseVal.value = bbox.y; 49 | rect.width.baseVal.value = bbox.width; 50 | rect.height.baseVal.value = bbox.height; 51 | rect.setAttribute("class", className); 52 | addItems.push({ parent: parent, target: rect }); 53 | } 54 | } 55 | } else if (node.matches && !node.matches("button, select, textarea")) { 56 | node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); 57 | } 58 | }; 59 | const _highlightText = (thisNode, text, className) => { 60 | let addItems = []; 61 | _highlight(thisNode, addItems, text, className); 62 | addItems.forEach((obj) => 63 | obj.parent.insertAdjacentElement("beforebegin", obj.target) 64 | ); 65 | }; 66 | 67 | /** 68 | * Small JavaScript module for the documentation. 69 | */ 70 | const SphinxHighlight = { 71 | 72 | /** 73 | * highlight the search words provided in localstorage in the text 74 | */ 75 | highlightSearchWords: () => { 76 | if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight 77 | 78 | // get and clear terms from localstorage 79 | const url = new URL(window.location); 80 | const highlight = 81 | localStorage.getItem("sphinx_highlight_terms") 82 | || url.searchParams.get("highlight") 83 | || ""; 84 | localStorage.removeItem("sphinx_highlight_terms") 85 | url.searchParams.delete("highlight"); 86 | window.history.replaceState({}, "", url); 87 | 88 | // get individual terms from highlight string 89 | const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); 90 | if (terms.length === 0) return; // nothing to do 91 | 92 | // There should never be more than one element matching "div.body" 93 | const divBody = document.querySelectorAll("div.body"); 94 | const body = divBody.length ? 
divBody[0] : document.querySelector("body"); 95 | window.setTimeout(() => { 96 | terms.forEach((term) => _highlightText(body, term, "highlighted")); 97 | }, 10); 98 | 99 | const searchBox = document.getElementById("searchbox"); 100 | if (searchBox === null) return; 101 | searchBox.appendChild( 102 | document 103 | .createRange() 104 | .createContextualFragment( 105 | '" 109 | ) 110 | ); 111 | }, 112 | 113 | /** 114 | * helper function to hide the search marks again 115 | */ 116 | hideSearchWords: () => { 117 | document 118 | .querySelectorAll("#searchbox .highlight-link") 119 | .forEach((el) => el.remove()); 120 | document 121 | .querySelectorAll("span.highlighted") 122 | .forEach((el) => el.classList.remove("highlighted")); 123 | localStorage.removeItem("sphinx_highlight_terms") 124 | }, 125 | 126 | initEscapeListener: () => { 127 | // only install a listener if it is really needed 128 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; 129 | 130 | document.addEventListener("keydown", (event) => { 131 | // bail for input elements 132 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 133 | // bail with special keys 134 | if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; 135 | if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { 136 | SphinxHighlight.hideSearchWords(); 137 | event.preventDefault(); 138 | } 139 | }); 140 | }, 141 | }; 142 | 143 | _ready(SphinxHighlight.highlightSearchWords); 144 | _ready(SphinxHighlight.initEscapeListener); 145 | -------------------------------------------------------------------------------- /docs/html/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | About — pySciSci 0.6 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 |
35 | 36 | 37 |
38 | 39 |
40 |

About

41 |

The Science of Science (SciSci) is a growing field at the boundary of sociology, network science, and computational social science [1]. It encompasses diverse interdisciplinary research programs that study the processes underlying science [2]. The field has benefited greatly from access to massive digital databases containing the products of scientific discourse—including publications, journals, patents, books, conference proceedings, and grants. The subsequent proliferation of mathematical models and computational techniques for quantifying the dynamics of innovation and success in science has made it difficult to disentangle universal scientific processes from those dependent on specific databases, data-processing decisions, field practices, and so on.

42 |

Here we present pySciSci for the analysis of large-scale bibliometric data. The package standardizes access to many of the most common datasets in SciSci and provides efficient implementations of common and advanced analytical techniques. The pySciSci package is intended for researchers of SciSci or those who wish to integrate large-scale bibliometric data into other existing projects.

43 |

By creating a standardized and adaptable programmatic base for the study of bibliometric data, we intend to help democratize SciSci, support diverse research efforts based on bibliometric datasets, and address calls for open access and reproducibility in the SciSci literature and community. We also encourage the SciSci community to contribute their own implementations, data, and use cases.

44 |
45 |

Funding

46 |

pySciSci acknowledges support from the following grants:

47 |
  • Air Force Office of Scientific Research Award FA9550-19-1-0354
  • Templeton Foundation Contract 61066
52 |
53 | 54 | 55 |
56 | 57 |
58 |
59 | 113 |
114 |
115 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /docs/html/installation.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Installation — pySciSci 0.6 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 |
35 | 36 | 37 |
38 | 39 |
40 |

Installation

41 |

This package is available on PyPI. Run the following command in a terminal to install it.

42 |
>>> pip install pyscisci
 43 | 
44 |
45 |

You can also get the source code directly from the GitHub project page.
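Once installed, you can verify the setup from a Python session using the same import convention as the example notebooks in this repository; if the import completes without error, the package and its core dependencies are in place (a minimal smoke test only; the Getting_Started notebooks walk through downloading and preprocessing a full database):

>>> import pyscisci.all as pyscisci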

46 |
47 | 48 | 49 |
50 | 51 |
52 |
53 | 104 |
105 |
106 | 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/objects.inv -------------------------------------------------------------------------------- /docs/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Search — pySciSci 0.6 documentation 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 |
36 |
37 |
38 | 39 | 40 |
41 | 42 |

Search

43 | 44 | 52 | 53 | 54 |

55 | Searching for multiple words only shows matches that contain 56 | all words. 57 |

58 | 59 | 60 |
61 | 62 | 63 | 64 |
65 | 66 | 67 | 68 |
69 | 70 |
71 | 72 | 73 |
74 | 75 |
76 |
77 | 116 |
117 |
118 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pySciSci documentation master file, created by 2 | sphinx-quickstart on Mon Jun 21 10:54:47 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pySciSci's documentation! 7 | ************************************** 8 | 9 | Table of Contents 10 | =================== 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | about 15 | installation 16 | examples 17 | databases 18 | methods 19 | network 20 | utils 21 | 22 | 23 | References 24 | =================== 25 | .. bibliography:: -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | =================== 3 | This package is available on PyPI. Run the following command in a terminal to install it. 4 | 5 | >>> pip install pyscisci 6 | 7 | You can also get the source code directly from the GitHub `project page `_ . -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/methods.rst: -------------------------------------------------------------------------------- 1 | Bibliometrics 2 | ====================== 3 | *pySciSci* facilitates the analysis of publications, authors, and citations, as well as citation time-series, fixed time window citation analysis, and citation count normalization by year and field. 4 | 5 | 6 | Publications and Citations 7 | --------------------------- 8 | The *pySciSci* package facilitates the analysis of interrelationships between publications as captured by references and citations. 9 | 10 | For example, the most common measure of scientific impact is the citation count, or the number of times a publication has been referenced by other publications. Variations also include citation time-series, fixed time window citation analysis, citation count normalization by year and field, and citation ranks. More advanced methods fit models to citation time-series, such as in the prediction of the long-term citation counts to a publication :cite:`wang2013longterm`, or in the assignment of the sleeping beauty score :cite:`ke2015sleepingbeauty`. 
The package also removes self-citations, i.e., citations occurring between publications by the same author. 11 | 12 | More advanced metrics capture the diversity in the citation interrelationships between publications. These measures include the Rao-Stirling reference interdisciplinarity :cite:`stirling2007diversity`, novelty & conventionality :cite:`uzzi2013atypical`, and the disruption index :cite:`funk2017dynamic`, :cite:`wu2019largeteams`. 13 | 14 | .. automodule:: pyscisci.methods.publication 15 | :members: 16 | 17 | 18 | 19 | Author-centric Methods 20 | ---------------------- 21 | 22 | The sociology of science has analyzed scientific careers in terms of individual incentives, productivity, competition, collaboration, and success. The *pySciSci* package facilitates author career analysis through both aggregate career statistics and temporal career trajectories. Highlights include the H-index :cite:`hirsch2005index`, Q-factor :cite:`sinatra2016quantifying`, yearly productivity trajectories :cite:`way2017misleading`, collective credit assignment :cite:`shen2014collective`, and the hot-hand effect :cite:`liu2018hot`. 23 | 24 | 25 | .. automodule:: pyscisci.methods.author 26 | :members: 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/network.rst: -------------------------------------------------------------------------------- 1 | Biblio-Networks 2 | ====================== 3 | Intro to networks 4 | 5 | .. automodule:: pyscisci.network 6 | :members: -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinxcontrib-bibtex -------------------------------------------------------------------------------- /docs/utils.rst: -------------------------------------------------------------------------------- 1 | General Functions 2 | ====================== 3 | 4 | .. 
automodule:: pyscisci.utils 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /examples/Getting_Started/DatabaseGrowth.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/Getting_Started/DatabaseGrowth.pdf -------------------------------------------------------------------------------- /examples/Getting_Started/Getting Started with OpenAlex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "874e645e", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "\n", 11 | "import pyscisci.all as pyscisci\n", 12 | "\n", 13 | "import os\n", 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "import matplotlib.pylab as plt\n", 17 | "\n", 18 | "%matplotlib inline" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "122e3e61", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# set this path to where the OpenAlex database will be stored\n", 29 | "path2openalex = '/home/ajgates/OpenAlex'\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "61b2080c", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "myoa = pyscisci.OpenAlex(path2openalex, database_extension='csv.gz', keep_in_memory=False) \n", 40 | "# set keep_in_memory=False if you want to load the database each time it's needed - good for when you \n", 41 | "# can't keep more than one dataframe in memory at a time\n", 42 | "# otherwise keep_in_memory=True will keep each database in memory after it's loaded" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "3c15092c", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "#df_features_to_keep = ['all'] # keep all data\n", 53 | "# OR\n", 54 | "df_features_to_keep = ['affiliations', 'authors', 'publications', 'references', \n", 55 | " 'publicationauthoraffiliation', 'fields'] # keep everything besides the abstracts\n", 56 | "myoa.download_from_source(rewrite_existing = False,\n", 57 | " dataframe_list=df_features_to_keep)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "6395510f", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# before we can start running our analysis, we have to preprocess the raw data into\n", 68 | "# DataFrames that are more convenient to work with\n", 69 | "\n", 70 | "# we only need to run this for the first time, but it will take a while\n", 71 | "myoa.preprocess(dataframe_list=['all'])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "20165e51", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3 (ipykernel)", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.10.14" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 5 104 | } 105 | -------------------------------------------------------------------------------- 
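Once preprocessing completes, the OpenAlex DataFrames support the same analyses as the other database classes. Below is a minimal sketch of a first sanity check, mirroring the PubMed and WOS notebooks that follow; it assumes the preprocessed publication table loads as myoa.pub with 'Year' and 'PublicationId' columns, as it does for those databases:

import matplotlib.pylab as plt
import pyscisci.all as pyscisci

# reload the database object created above (path assumed from this notebook)
myoa = pyscisci.OpenAlex('/home/ajgates/OpenAlex', database_extension='csv.gz', keep_in_memory=False)

# count the unique publications in each year
yearly_articles = pyscisci.groupby_count(df=myoa.pub, colgroupby='Year', colcountby='PublicationId',
                                         count_unique=True)
yearly_articles.sort_values(by='Year', inplace=True)

# plot the growth of the database on a log scale
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
ax.plot(yearly_articles['Year'], yearly_articles['PublicationIdCount'])
ax.set_xlabel('Year')
ax.set_ylabel('# of publications')
ax.set_yscale('log')
plt.show()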
/examples/Getting_Started/Getting Started with PubMed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\n", 10 | "import pyscisci.all as pyscisci\n", 11 | "\n", 12 | "import os\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import matplotlib.pylab as plt\n", 16 | "\n", 17 | "%matplotlib inline" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# set this path to where the PubMed database will be stored\n", 27 | "path2pubmed = '/home/ajgates/PubMed'\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "mypubmed = pyscisci.PubMed(path2pubmed, database_extension='csv.gz', keep_in_memory=False) \n", 37 | "\n", 38 | "# set keep_in_memory=False if you want to load the database each time it's needed - good for when you \n", 39 | "# can't keep more than one DataFrame in memory at a time\n", 40 | "\n", 41 | "# otherwise keep_in_memory=True will keep each DataFrame in memory after it's loaded" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# This function will download the latest baseline version of PubMed.\n", 51 | "# Depending on your internet connection, it could take several hours to complete the download.\n", 52 | "\n", 53 | "mypubmed.download_from_source(rewrite_existing=False)\n", 54 | "# if your connection breaks/download stops for any reason, set rewrite_existing = False and \n", 55 | "# rerun to continue downloading where you left off" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# before we can start running our analysis, we have to preprocess the raw data into\n", 65 | "# DataFrames that are more convenient to work with\n", 66 | "mypubmed.preprocess(show_progress=True)\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# PubMed creates four DataFrames:\n", 76 | "# pub - keeps all of the publication information\n", 77 | "# columns : ['PublicationId', 'Title', 'Year', 'Month', 'Day', 'Doi', 'JournalId', 'ISSN', 'Volume', 'Issue', 'Pages', 'TeamSize']\n", 78 | "\n", 79 | "# paa - links the publications to authors and affiliations \n", 80 | "# NOTE: PubMed does not disambiguate authors!!!\n", 81 | "# columns : ['PublicationId', 'FirstName', 'LastName', 'FullName', 'Affiliations', 'AuthorSequence']\n", 82 | "\n", 83 | "# pub2field - links the publications to fields (aka subjectAreas)\n", 84 | "# columns : ['PublicationId', 'FieldId']\n", 85 | "\n", 86 | "# pub2ref - keeps the citation information\n", 87 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# let's plot the number of publications each year\n", 97 | "yearly_articles = pyscisci.groupby_count(df=mypubmed.pub , colgroupby='Year', colcountby='PublicationId', count_unique=True,\n", 98 | " show_progress=True)\n", 99 | "yearly_articles.sort_values(by='Year', inplace=True)\n", 100 | "\n", 101 | "fig, ax = 
plt.subplots(1,1,figsize=(8,5))\n", 102 | "\n", 103 | "ax.plot(yearly_articles['Year'],yearly_articles['PublicationIdCount'])\n", 104 | "\n", 105 | "ax.set_xlabel('Year')\n", 106 | "ax.set_ylabel(\"# of publications\")\n", 107 | "ax.set_yscale('log')\n", 108 | "\n", 109 | "plt.show()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3 (ipykernel)", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.10.8" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 2 141 | } 142 | -------------------------------------------------------------------------------- /examples/Getting_Started/Getting Started with WOS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\n", 10 | "import pyscisci.all as pyscisci\n", 11 | "\n", 12 | "import os\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import matplotlib.pylab as plt\n", 16 | "\n", 17 | "%matplotlib inline" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# set this path to where the WOS database will be stored\n", 27 | "path2wos = '/home/ajgates/WOS'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "mywos = pyscisci.WOS(path2wos, database_extension='csv.gz', keep_in_memory=False, show_progress=True) \n", 37 | "# set keep_in_memory=False if you want to load the database each time it's needed - good for when you \n", 38 | "# can't keep more than one database in memory at a time\n", 39 | "# otherwise keep_in_memory=True will keep each database in memory after it's loaded" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# before we can start running our analysis, we have to preprocess the raw data into\n", 49 | "# DataFrames that are more convenient to work with\n", 50 | "\n", 51 | "# we only need to run this for the first time, but it will take a while\n", 52 | "mywos.preprocess(xml_directory = 'RawXML', name_space = None, show_progress=True)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# WOS contains the following dataframes:\n", 62 | "\n", 63 | "# pub - keeps all of the publication information\n", 64 | "# columns : ['PublicationId', 'Year', 'JournalId', 'FamilyId', 'Doi', 'Title', 'Date', 'Volume', 'Issue', 'DocType']\n", 65 | "\n", 66 | "# author - keeps all of the author information (only some versions of WoS ship with the AuthorId known as the AuthorDAIS)\n", 67 | "# columns : ['AuthorId', 'FullName', 'LastName', 'FirstName', 'MiddleName']\n", 68 | "\n", 69 | "# pub2ref - links publications to their references or citations\n", 70 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n", 71 | "\n", 72 | "# paa - links 
publications, authors, and affiliations (only some versions of WoS ship with the AuthorId known as the AuthorDAIS)\n", 73 | "# columns : ['PublicationId', 'AuthorId', 'AffiliationId', 'AuthorSequence', 'OrigAuthorName', 'OrigAffiliationName']\n", 74 | "\n", 75 | "# author2pub - links the authors to their publications\n", 76 | "# columns : ['PublicationId', 'AuthorId', 'AuthorOrder']\n", 77 | "\n", 78 | "# field - field information\n", 79 | "# columns : ['FieldId', 'FieldLevel', 'NumberPublications', 'FieldName']\n", 80 | "\n", 81 | "# pub2field - links publications to their fields\n", 82 | "# columns : ['PublicationId', 'FieldId']\n", 83 | "\n", 84 | "# affiliation - affiliation information\n", 85 | "# columns : ['AffiliationId', 'NumberPublications', 'NumberCitations', 'FullName', 'GridId', 'OfficialPage', 'WikiPage', 'Latitude', 'Longitude']\n", 86 | "\n", 87 | "# journal - journal information\n", 88 | "# columns : ['JournalId', 'FullName', 'Issn', 'Publisher', 'Webpage']\n", 89 | "\n", 90 | "\n", 91 | "# after additional processing, these DataFrames become available\n", 92 | "\n", 93 | "# pub2refnoself - links publications to their references or citations with self-citations removed\n", 94 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n", 95 | "\n", 96 | "# impact - precomputed citation counts, columns will depend on which counts are computed\n", 97 | "# columns : ['PublicationId', 'Year', ....]" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# let's plot the number of publications each year\n", 107 | "pub = mywos.pub \n", 108 | "\n", 109 | "yearly_articles = pyscisci.groupby_count(df=pub , colgroupby='Year', colcountby='PublicationId', count_unique=True)\n", 110 | "yearly_articles.sort_values(by='Year', inplace=True)\n", 111 | "yearly_articles = yearly_articles.loc[yearly_articles['Year'] > 0]\n", 112 | "\n", 113 | "fig, ax = plt.subplots(1,1,figsize=(8,5))\n", 114 | "\n", 115 | "ax.plot(yearly_articles['Year'],yearly_articles['PublicationIdCount'])\n", 116 | "\n", 117 | "ax.set_xlabel('Year')\n", 118 | "ax.set_ylabel(\"# of publications\")\n", 119 | "ax.set_yscale('log')\n", 120 | "\n", 121 | "plt.show()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# now we can see the distribution of author productivity \n", 131 | "pa = mywos.author2pub # publication author relationships\n", 132 | "\n", 133 | "author_prod = pyscisci.author_productivity(pa , colgroupby = 'AuthorId', colcountby = 'PublicationId')\n", 134 | "\n", 135 | "prodvalues, prodcounts = np.unique(author_prod['Productivity'].values, return_counts=True)\n", 136 | "\n", 137 | "fig, ax = plt.subplots(1,1,figsize=(8,5))\n", 138 | "\n", 139 | "ax.scatter(prodvalues, prodcounts)\n", 140 | "\n", 141 | "ax.set_xlabel('Productivity')\n", 142 | "ax.set_ylabel(\"# of authors\")\n", 143 | "ax.set_xscale('log')\n", 144 | "ax.set_yscale('log')\n", 145 | "\n", 146 | "plt.show()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "Python 3 (ipykernel)", 160 | "language": "python", 161 | "name": "python3" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | 
"mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.10.8" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 2 178 | } 179 | -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/data/Chengetal_idealist.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/GlobalCitationNetwork/data/Chengetal_idealist.csv.gz -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/data/diffusion_panel_data_1990_2017.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/GlobalCitationNetwork/data/diffusion_panel_data_1990_2017.csv.gz -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/data/link_prediction_panel_data_1990_2017.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/GlobalCitationNetwork/data/link_prediction_panel_data_1990_2017.csv.gz -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/data/oa_countrycites_nosameorg_auc.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/GlobalCitationNetwork/data/oa_countrycites_nosameorg_auc.csv.gz -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/vizutils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import matplotlib.gridspec as gridspec 5 | 6 | 7 | def pvalue2stars(pvalue): 8 | if pvalue < 0.001: 9 | return "^{***}" 10 | elif pvalue < 0.01: 11 | return "^{**}" 12 | elif pvalue < 0.05: 13 | return "^{*}" 14 | else: 15 | return "" 16 | 17 | 18 | def table_row(varname='const', offset=0, roun=2, modellist=[], namedict={},i=0,j=1): 19 | 20 | row_text = namedict[varname] + " & "*(offset+1) + " & ".join([ "$" + str(model.params[i][varname].round(roun)) + pvalue2stars(model.pvalues[i][varname]) + "$" for model in modellist[offset:]]) + "\\\\ \n" 21 | 22 | row_text += " & "*(offset+1) + " & ".join([ "$(" + str(model.conf_int().loc[str(j)].loc[varname]['lower'].round(roun)) + "," + str(model.conf_int().loc[str(j)].loc[varname]['upper'].round(roun)) + ")$" for model in modellist[offset:]]) + "\\\\ \n" 23 | row_text += " & "*(offset+1) + " & ".join([ "S.E. $" + str(model.bse[i][varname].round(roun)) + "$; p-v $" +str(model.pvalues[i][varname].round(4)) + "$" for model in modellist[offset:]]) + "\\\\ [0.8ex] \n" 24 | 25 | #row_text += " & "*(offset+1) + " & ".join([ "std. err. 
$" + str(model.std_errors[varname].round(roun)) + "$" for model in modellist[offset:]]) + "\\\\ [0.8ex] \n" 26 | return row_text 27 | 28 | def make_multinomial_latex_table(fit_models, exog_var_sets = [], dep_var = "", namedict = {}, caption_text=""): 29 | 30 | Nmodels = len(exog_var_sets) 31 | 32 | table_text= """{\\tiny 33 | \\begin{longtable}{p{0.2\\linewidth}"""+"p{0.12\\linewidth}"*Nmodels+"}"+"""\\caption{\\textbf{Fixed-effect multinomial logit regression.} Model coefficients labelled by $p$-value. Standard errors in parentheses.} 34 | \\label{table:multinomialfull} \\\\ 35 | \\hline \\hline \\\\ 36 | \\multicolumn{"""+str(Nmodels+1)+"""}{c}{\\textbf{Dependent variable: Citation preference}} \\\\ \\hline 37 | & \\multicolumn{"""+str(Nmodels)+"""}{c}{Model} \\\\""" 38 | 39 | table_text += "\cline{2-" + str(Nmodels + 1) + "}" 40 | 41 | for icol in range(Nmodels): 42 | table_text += "& (" + str(icol + 1) + ")" 43 | 44 | 45 | table_text += """\\\\[0.8ex] 46 | \\hline 47 | \\endfirsthead""" 48 | 49 | table_text += """\\multicolumn{2}{c}% 50 | {{\\tablename\\ \\thetable{} -- continued from previous page}} \\\\ 51 | \\hline \\\\""" 52 | 53 | for icol in range(Nmodels): 54 | table_text += "& (" + str(icol + 1) + ")" 55 | 56 | table_text += """\\\\ 57 | \\hline 58 | \\endhead """ 59 | 60 | table_text += "\\hline"+"&"*(Nmodels+1-2)+ """\\multicolumn{2}{r}{{Continued on next page}} \\\\ \\endfoot 61 | \\hline 62 | \\caption*{} \\\\ 63 | \\endlastfoot""" 64 | 65 | table_text += """$\\mathbf{Citation~Preference: Positive}$ & & & & & \\\\ [1.8ex]""" 66 | 67 | # add constant 68 | table_text += table_row(varname='const', offset=0, roun=2, modellist=fit_models, namedict=namedict,i=0,j=1) 69 | 70 | for offset, varlist in enumerate(exog_var_sets): 71 | for var in varlist: 72 | table_text += table_row(varname=var, offset=offset, roun=2, modellist=fit_models, namedict=namedict,i=0,j=1) 73 | 74 | table_text += """ \\hline \\\\ $\\mathbf{Citation~Preference: Negative}$ & & & & & \\\\ [1.8ex]""" 75 | 76 | table_text += table_row(varname='const', offset=0, roun=2, modellist=fit_models, namedict=namedict,i=1,j=2) 77 | 78 | for offset, varlist in enumerate(exog_var_sets): 79 | for var in varlist: 80 | table_text += table_row(varname=var, offset=offset, roun=2, modellist=fit_models, namedict=namedict,i=1,j=2) 81 | 82 | 83 | 84 | table_text += """\\hline 85 | \\hline \\\\[-1.8ex] 86 | \\textit{Note:} & \\multicolumn{2}{r}{$^{*}p<0.05$; $^{**}p<0.01$; $^{***}p<0.001$} \\\\ \n""" 87 | 88 | table_text += "Observations & " + " & ".join([str(model.nobs) for model in fit_models]) + " \\\\ \n" 89 | table_text += "Pseudo $R^2$ & " + " & ".join([str(np.round(model.prsquared, 4)) for model in fit_models]) + " \\\\ \n" 90 | table_text += "Log Likelihood & " + " & ".join([str(model.llf.round(2)) for model in fit_models]) + " \\\\ \n" 91 | #table_text += "F statistic & " + " & ".join(["$" + str(np.round(model.f_statistic.stat, 2)) + pvalue2stars(model.f_statistic.pval) + "$ (d.f.=" + str(model.f_statistic.df) + ")" for model in fit_models]) + " \\\\ \n" 92 | 93 | # m8.llr,m8.df_model,m8.llr_pvalue 94 | 95 | fstat_text = [] 96 | for model in fit_models: 97 | fstat_text.append("$" + str(np.round(model.llr, 2)) + pvalue2stars(model.llr_pvalue) + "$ (d.f.=" + str(model.df_model) + ")") 98 | 99 | table_text += "LLR $\\chi^2$ & " + " & ".join(fstat_text) + " \\\\ \n" 100 | table_text += "Year FE & " + " & ".join(['Yes' for model in fit_models]) + " \\\\ \n" 101 | 102 | table_text += "\\hline \n\\end{longtable} } }" 103 | 104 | 
print(table_text) -------------------------------------------------------------------------------- /examples/Method_Examples/Example of Credit Allocation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyscisci.all as pyscisci\n", 10 | "\n", 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pylab as plt\n", 14 | "try:\n", 15 | " import seaborn as sns\n", 16 | " sns.set_style('white')\n", 17 | "except:\n", 18 | " pass\n", 19 | "\n", 20 | "%matplotlib inline" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "[0.75 0.25]\n", 33 | "{1: 0, 2: 1}\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "# Fig 1A\n", 39 | "# move d to have ids: 11 - 15\n", 40 | "pub2ref = [[11, 0], [11, 1], [11, 2], [11, 3]] # d1 citations\n", 41 | "pub2ref += [[12, 0], [12, 2], [12, 3]] # d2 citations\n", 42 | "pub2ref += [[13, 0], [13, 2], [13, 4]] # d3 citations\n", 43 | "pub2ref += [[14, 0]] # d4 citations\n", 44 | "pub2ref += [[15, 0], [15, 2], [15, 3], [15, 4]] # d5 citations\n", 45 | "pub2ref = pd.DataFrame(pub2ref , columns = ['CitingPublicationId', 'CitedPublicationId'])\n", 46 | "\n", 47 | "# authors have ids 1 - 8, assume each grey author is different\n", 48 | "pub2authorA = [[0, 1], [0, 2], [1, 1], [2,3], [2,1], [2,4], [2,5], [3,1], [4,6], [4,7], [4,8]]\n", 49 | "pub2authorA = pd.DataFrame(pub2authorA , columns = ['PublicationId', 'AuthorId'])\n", 50 | "\n", 51 | "credit_share, author2int = pyscisci.credit_share(focus_pid=0, \n", 52 | " pub2ref =pub2ref , \n", 53 | " pub2author =pub2authorA , \n", 54 | " temporal=False, \n", 55 | " normed=True,\n", 56 | " show_progress=False)\n", 57 | "\n", 58 | "print(credit_share)\n", 59 | "print(author2int)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "[0.5 0.5]\n", 72 | "{1: 0, 2: 1}\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "# Fig 1B\n", 78 | "\n", 79 | "# notice the difference between 1A and 1B is the authorships of papers 1-4\n", 80 | "# authors have ids 1 - 8, assume each grey author is different\n", 81 | "pub2authorB = [[0,1], [0,2], [1,2], [1,1], [2,2], [2,3], [2,4], [2,1], [3,5], [3,1], [3,2], [4,6], [4,7]]\n", 82 | "pub2authorB = pd.DataFrame(pub2authorB , columns = ['PublicationId', 'AuthorId'])\n", 83 | "\n", 84 | "credit_share, author2int = pyscisci.credit_share(focus_pid=0, \n", 85 | " pub2ref =pub2ref , \n", 86 | " pub2author =pub2authorB , \n", 87 | " temporal=False, \n", 88 | " normed=True,\n", 89 | " show_progress=False)\n", 90 | "\n", 91 | "print(credit_share)\n", 92 | "print(author2int)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "[[0.84615385 0.77777778 0.74193548 0.75 ]\n", 105 | " [0.15384615 0.22222222 0.25806452 0.25 ]]\n", 106 | "{1: 0, 2: 1}\n" 107 | ] 108 | }, 109 | { 110 | "name": "stderr", 111 | "output_type": "stream", 112 | "text": [ 113 | "/Users/ajgates/.pyenv/versions/3.9.0/lib/python3.9/site-packages/tqdm/std.py:699: FutureWarning: The Panel class is removed from pandas. 
Accessing it from the top-level namespace will also be removed in the next version\n", 114 | " from pandas import Panel\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "# now lets make the pub2ref temporal\n", 120 | "pub2year = {11:2014, 12:2015, 13:2015, 14:2018, 15:2020}\n", 121 | "pub2ref ['CitingYear'] = [pub2year.get(pid) for pid in pub2ref ['CitingPublicationId']]\n", 122 | "\n", 123 | "credit_share, author2int, years = pyscisci.credit_share(focus_pid=0, \n", 124 | " pub2ref =pub2ref , \n", 125 | " pub2author =pub2authorA , \n", 126 | " temporal=True, \n", 127 | " normed=True,\n", 128 | " show_progress=False)\n", 129 | "\n", 130 | "print(credit_share)\n", 131 | "print(author2int)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3 (ipykernel)", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.8.12" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /examples/NLP_Examples/Example_Node2vec (umap,sem_axis).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import pandas as pd\n", 11 | "\n", 12 | "import pyscisci.all as pyscisci\n", 13 | "\n", 14 | "from pyscisci.embedding import Node2Vec\n", 15 | "\n", 16 | "path2dblp = '/u/yoonjis/ember_home/DBLP_new' #put yout own DBLP path here\n", 17 | "path2dblp = '/users/hgt6rn/Documents/DataSets/DBLP'\n", 18 | "mydblp = pyscisci.DBLP(path2database= path2dblp, keep_in_memory=False, show_progress=True)\n", 19 | "\n", 20 | "#a2p = mydblp.author2pub " 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# coauthorship network of Albert-Laszlo Barabasi and Mark E. J. Newman\n", 30 | "author = mydblp.author \n", 31 | "target_researcher = ['Albert-Laszlo Barabasi', 'Mark E. J. Newman']\n", 32 | "target_index = list(author[author.FullName.isin(target_researcher)].AuthorId)\n", 33 | "coauthornet, author2int = pyscisci.coauthorship_network(a2p , focus_author_ids = target_index, focus_constraint='ego', show_progress=True)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "# 1. 
Get Node2Vec Embedding" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "\n", 50 | "model = Node2Vec(coauthornet, author2int)\n", 51 | "emb = model.learn_embedding()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "ids = list(author2int.keys())\n", 61 | "id_to_name = author.set_index('AuthorId')['FullName'].to_dict()\n", 62 | "names = [id_to_name[k] for k in author2int.keys()]\n", 63 | "emb_array = [emb[k] for k in ids]" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# 2. 2-D projection of embeddings " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import umap\n", 80 | "fit = umap.UMAP(metric='cosine')\n", 81 | "u = fit.fit_transform(emb_array)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "import plotly.graph_objects as go\n", 91 | "import plotly as py\n", 92 | "import plotly.express as px\n", 93 | "\n", 94 | "df = pd.DataFrame({\n", 95 | " 'x': u[:,0],\n", 96 | " 'y': u[:,1],\n", 97 | " 'name': names\n", 98 | "})\n", 99 | "\n", 100 | "fig = px.scatter(df, x=\"x\", y=\"y\", hover_name=\"name\")\n", 101 | "fig.update_layout(\n", 102 | " autosize=False,\n", 103 | " width=1000,\n", 104 | " height=800,\n", 105 | " )\n", 106 | "fig.update_traces(marker=dict(size=3),\n", 107 | " selector=dict(mode='markers'))\n", 108 | "\n", 109 | "py.offline.plot(fig, filename=\"example_interactive_html/umap_fig.html\", auto_open=False)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "# 3. Sem_axis results" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "For the detail, please read Sem_aixs paper, https://arxiv.org/abs/1806.05521.
\n", 124 | "SemAxis is usually used in word embedding spaces to characterize word semantics along semantic axes, but it can also be applied to networks.
\n", 125 | "In this example, we define axis from two-person (Newman as a negative anchor and Barabasi as a positive anchor). \n", 126 | "Then, we can interpret persons with negative values as more Newman friendly researcher, and the person with positive values as more Barabasi friendly researcher." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "positive_entities = [245542] # Barabasi's vector\n", 136 | "negative_entities = [301349] # Newman's vector" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "sem_aixs_dict = pyscisci.sem_axis(emb, positive_entities , negative_entities)\n", 146 | "sem_axis_array = [sem_aixs_dict[id_] for id_ in ids]" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "df = pd.DataFrame({\n", 156 | " 'sem_axis_result': sem_axis_array,\n", 157 | " 'y': 0,\n", 158 | " 'name': names,\n", 159 | "})" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "fig = px.scatter(df, x=\"sem_axis_result\", y=\"y\", hover_name=\"name\")\n", 169 | "fig.update_layout(\n", 170 | " autosize=False,\n", 171 | " width=1200,\n", 172 | " height=300,\n", 173 | " yaxis={\n", 174 | " 'range': [-0.1, 0.1],\n", 175 | " 'showgrid': False, # thin lines in the background\n", 176 | " 'zeroline': False, # thick line at x=0\n", 177 | " 'visible': False, # numbers below\n", 178 | " },\n", 179 | " xaxis={\n", 180 | " 'showgrid': False, # thin lines in the background\n", 181 | " 'zeroline': False, # thick line at x=0\n", 182 | " }\n", 183 | " \n", 184 | ")\n", 185 | "fig.update_traces(marker=dict(size=3),\n", 186 | " selector=dict(mode='markers'))\n", 187 | "\n", 188 | "py.offline.plot(fig, filename=\"example_interactive_html/sem_axis.html\", auto_open=False)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3 (ipykernel)", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.10.12" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 4 234 | } 235 | -------------------------------------------------------------------------------- /examples/Network_Examples/DeSollaPriceCarrerCitations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/Network_Examples/DeSollaPriceCarrerCitations.pdf -------------------------------------------------------------------------------- /examples/Network_Examples/DiversityCocitiation.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/Network_Examples/DiversityCocitiation.pdf -------------------------------------------------------------------------------- /examples/Network_Examples/Example of Diffusion of Scientific Credit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib inline\n", 10 | "\n", 11 | "import os\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "import networkx as nx\n", 15 | "\n", 16 | "from collections import defaultdict\n", 17 | "\n", 18 | "import scipy.sparse as spsparse\n", 19 | "import matplotlib.pylab as plt\n", 20 | "\n", 21 | "\n", 22 | "import pyscisci.all as pyscisci\n", 23 | "\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "path2aps = '/home/ajgates/APS'\n", 33 | "path2aps = '/Volumes/GatesExpansionDrive/DataSets/APS/APS2019'\n", 34 | "\n", 35 | "myaps = pyscisci.APS(path2aps, keep_in_memory=False)\n", 36 | "\n", 37 | "# NOTE: APS does not contain disambiguated author or affiliation information by default, although researchers \n", 38 | "# have produced their own disambiguation to supplement the raw data\n", 39 | "\n", 40 | "# Here, we include the author disambiguation used in Sinatra et al. (2016)\n", 41 | "# if you didn't already download the file, uncomment the line below\n", 42 | "#myaps.download_from_source(files_to_download='paa_supplement')\n", 43 | "myaps.set_new_data_path(dataframe_name='paa ', new_path='publicationauthoraffiliation_supp2010')\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "application/vnd.jupyter.widget-view+json": { 54 | "model_id": "f07e0e2dafe448ddbe8df9a509b50c5a", 55 | "version_major": 2, 56 | "version_minor": 0 57 | }, 58 | "text/plain": [ 59 | "HBox(children=(HTML(value='Loading Publications'), FloatProgress(value=0.0, max=1.0), HTML(value='')))" 60 | ] 61 | }, 62 | "metadata": {}, 63 | "output_type": "display_data" 64 | }, 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "\n" 70 | ] 71 | }, 72 | { 73 | "data": { 74 | "application/vnd.jupyter.widget-view+json": { 75 | "model_id": "323ac728696c45b8812c45d0e8aac859", 76 | "version_major": 2, 77 | "version_minor": 0 78 | }, 79 | "text/plain": [ 80 | "HBox(children=(HTML(value='Loading pub2ref'), FloatProgress(value=0.0, max=1.0), HTML(value='')))" 81 | ] 82 | }, 83 | "metadata": {}, 84 | "output_type": "display_data" 85 | }, 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "\n" 91 | ] 92 | }, 93 | { 94 | "data": { 95 | "application/vnd.jupyter.widget-view+json": { 96 | "model_id": "a41b406cb9c840c4a517a16c36ef1206", 97 | "version_major": 2, 98 | "version_minor": 0 99 | }, 100 | "text/plain": [ 101 | "HBox(children=(HTML(value='Loading Publication Author Affiliation'), FloatProgress(value=0.0, max=1.0), HTML(v…" 102 | ] 103 | }, 104 | "metadata": {}, 105 | "output_type": "display_data" 106 | }, 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "pub = myaps.pub \n", 117 | "\n", 118 | "# 
limit the publications to those published on/before 1966 \n", 119 | "pub = pub .loc[pub ['Year'] <= 1966]\n", 120 | "\n", 121 | "# get their references\n", 122 | "pub2ref = myaps.load_references(filter_dict={'CitingPublicationId':np.sort(pub ['PublicationId'].unique())})\n", 123 | "\n", 124 | "# and get their authors\n", 125 | "pub2author = myaps.load_publicationauthoraffiliation(columns = ['PublicationId', 'AuthorId', 'FullName'],\n", 126 | " filter_dict={'PublicationId':np.sort(pub ['PublicationId'].unique())})\n", 127 | " \n", 128 | "aid2name = {aid:name for aid, name in pub2author [['AuthorId', 'FullName']].values}\n", 129 | "del pub2author ['FullName']" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 4, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "(22015,)\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "sc, author2int = pyscisci.diffusion_of_scientific_credit(pub2ref , pub2author , \n", 147 | " pub =pub , alpha = 0.9, max_iter = 100, tol = 1.0e-10)\n", 148 | "\n", 149 | "int2aid = {i:aid for aid, i in author2int.items()}\n", 150 | "\n", 151 | "print(sc.shape)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "hans.a..bethe 0.008774540389162876\n", 164 | "john.c..slater 0.008551603901008543\n", 165 | "g...breit 0.007495337492029187\n", 166 | "j..s..schwinger 0.006353349118286071\n", 167 | "eugene.p..wigner 0.005233086936687401\n", 168 | "robert.a..millikan 0.005045043029409964\n", 169 | "robert.s..mulliken 0.004200793335143117\n", 170 | "arthur.h..compton 0.004038834819293501\n", 171 | "irving..langmuir 0.004025411535935083\n", 172 | "john.h..van vleck 0.004013967086421246\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "# print the top k authors\n", 178 | "# Note: here we use an algorithmicly disambiguated author careers. The original paper just\n", 179 | "# disambiguated authors based on unique name. 
So we expect the rankings to differ.\n", 180 | "\n", 181 | "topk = 10\n", 182 | "\n", 183 | "topk_authors = np.argpartition(sc, -topk)[-topk:]\n", 184 | "topk_authors = topk_authors[np.argsort(sc[topk_authors])][::-1]\n", 185 | "\n", 186 | "for int_id in topk_authors:\n", 187 | " print(aid2name[int2aid[int_id]], sc[int_id])\n" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Python 3 (ipykernel)", 201 | "language": "python", 202 | "name": "python3" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.8.12" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 4 219 | } 220 | -------------------------------------------------------------------------------- /examples/Network_Examples/StirlingCocitiation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/Network_Examples/StirlingCocitiation.pdf -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 0 Preparing PySciSci.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The Science of Science\n", 8 | "by: Dashun Wang and Albert-Laszlo Barabasi\n", 9 | "\n", 10 | "[You can get the textbook here.](https://www.amazon.com/Science-Dashun-Wang/dp/1108716954/ref=asc_df_1108716954/?tag=hyprod-20&linkCode=df0&hvadid=459526655425&hvpos=&hvnetw=g&hvrand=10075848530578766295&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9002059&hvtargid=pla-967727027885&psc=1)\n", 11 | "\n", 12 | "The companion notebooks by Alex Gates." 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "To start, we need to download the data and preprocess it. \n", 20 | "These steps only need to be run once, when you first download the data. \n", 21 | "\n", 22 | "Note: We just learned Microsoft will discontinue their support for MAG as of Dec 2021. As other data become available, we will update this code." 
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# load the pyscisci package\n",
32 | "\n",
33 | "import pyscisci.all as pyscisci"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# you should download the MAG data from Microsoft's website:\n",
43 | "# https://www.microsoft.com/en-us/research/project/microsoft-academic-graph/\n",
44 | "\n",
45 | "# set this path to where the MAG database is locally stored\n",
46 | "path2mag = '/home/ajgates/MAG'\n"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# Create a MAG object\n",
56 | "\n",
57 | "mymag = pyscisci.MAG(path2mag, keep_in_memory=False) \n",
58 | "# set keep_in_memory=False if you want to load the database each time it's needed - good for when you \n",
59 | "# can't keep more than one database in memory at a time\n",
60 | "# otherwise keep_in_memory=True will keep each database in memory after it's loaded"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "# before we can start running our analysis, we have to preprocess the raw data into\n",
70 | "# DataFrames that are more convenient to work with\n",
71 | "\n",
72 | "# we only need to run this the first time, but it will take a while\n",
73 | "mymag.preprocess(verbose=True)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "# MAG contains the following dataframes:\n",
83 | "\n",
84 | "# pub_df - keeps all of the publication information\n",
85 | "# columns : ['PublicationId', 'Year', 'JournalId', 'FamilyId', 'Doi', 'Title', 'Date', 'Volume', 'Issue', 'DocType']\n",
86 | "\n",
87 | "# author_df - keeps all of the author information\n",
88 | "# columns : ['AuthorId', 'FullName', 'LastName', 'FirstName', 'MiddleName']\n",
89 | "\n",
90 | "# pub2ref_df - links publications to their references or citations\n",
91 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n",
92 | "\n",
93 | "# paa_df - links publications, authors, and affiliations\n",
94 | "# columns : ['PublicationId', 'AuthorId', 'AffiliationId', 'AuthorSequence', 'OrigAuthorName', 'OrigAffiliationName']\n",
95 | "\n",
96 | "# author2pub_df - links the authors to their publications\n",
97 | "# columns : ['PublicationId', 'AuthorId', 'AuthorOrder']\n",
98 | "\n",
99 | "# field_df - field information\n",
100 | "# columns : ['FieldId', 'FieldLevel', 'NumberPublications', 'FieldName']\n",
101 | "\n",
102 | "# pub2field_df - links publications to their fields\n",
103 | "# columns : ['PublicationId', 'FieldId']\n",
104 | "\n",
105 | "# affiliation_df - affiliation information\n",
106 | "# columns : ['AffiliationId', 'NumberPublications', 'NumberCitations', 'FullName', 'GridId', 'OfficialPage', 'WikiPage', 'Latitude', 'Longitude']\n",
107 | "\n",
108 | "# journal_df - journal information\n",
109 | "# columns : ['JournalId', 'FullName', 'Issn', 'Publisher', 'Webpage']\n",
110 | "\n",
111 | "\n",
112 | "# after additional processing, these DataFrames become available\n",
113 | "\n",
114 | "# pub2refnoself_df - links publications to their references or citations with self-citations removed\n",
115 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n",
116 | "\n", 
117 | "# impact_df - precomputed citation counts, columns will depend on which counts are computed\n", 118 | "# columns : ['PublicationId', 'Year', ....]" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.9.0" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 2 150 | } 151 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 05 Random Impact Rule.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 06 The Q-Factor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 08 The Increasing Dominance of Teams in Science.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 10 Coauthorship Networks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 14 Credit Allocation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Chapter 14: Credit Allocation\n", 8 | "\n", 9 | "Note: Here we use the APS dataset with the author disambiguation used in Sinatra et al (2016)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# headers\n", 19 | "import pyscisci.all as pyscisci\n", 20 | "\n", 21 | "import numpy as np\n", 22 | "import scipy.stats as spstats\n", 23 | "\n", 24 | "import matplotlib.pylab as plt\n", 25 | "%matplotlib inline\n", 26 | "\n", 27 | "\n", 28 | "\n", 29 | "# some useful functions and definitions\n", 30 | "red_color = '#f27c96'\n", 31 | "lightblue_color = '#7cd0ea'\n", 32 | "darkblue_color = '#154959'\n", 33 | "green_color = '#93d0aa'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Experimental observation of isolated large transverse energy electrons\n", 50 | "# with associated missing energy at $\\sqrt s$ = 540 GeV\n", 51 | "fig14_1_focus_publicationid = 1674199955\n", 52 | "\n", 53 | "# A novel stereospecific alkenyl-alkenyl cross-coupling by a palladium- or nickel-catalyzed \n", 54 | "# reaction of alkenylalanes with alkenyl halides \n", 55 | "fig14_2b_publicationid = 2047184383\n", 56 | "\n", 57 | "# Electric Field Effect in Atomically Thin Carbon Films \n", 58 | "fig14_2c_publicationid = 2058122340" 59 | ] 60 | } 61 | ], 62 | "metadata": { 63 | "kernelspec": { 64 | "display_name": "Python 3", 65 | "language": "python", 66 | "name": "python3" 67 | }, 68 | "language_info": { 69 | "codemirror_mode": { 70 | "name": "ipython", 71 | "version": 3 72 | }, 73 | "file_extension": ".py", 74 | "mimetype": "text/x-python", 75 | "name": "python", 76 | "nbconvert_exporter": "python", 77 | "pygments_lexer": "ipython3", 78 | "version": "3.9.0" 79 | } 80 | }, 81 | "nbformat": 4, 82 | "nbformat_minor": 4 83 | } 84 | -------------------------------------------------------------------------------- /examples/example_data/fenn_paa.csv: -------------------------------------------------------------------------------- 1 | PublicationId,AuthorId,AffiliationId 2 | 14060727,2168590209,184840846.0 3 | 24099381,2168590209,32971472.0 4 | 36085056,2168590209, 5 | 60025733,2168590209, 6 | 66859150,2168590209, 7 | 132191034,2168590209, 8 | 144824799,2168590209,32971472.0 9 | 205131131,2168590209,32971472.0 10 | 250833172,2168590209, 11 | 305406204,2168590209, 12 | 954622638,2168590209, 13 | 976761365,2168590209,184840846.0 14 | 1036084810,2168590209, 15 | 1484230275,2168590209, 16 | 1508438708,2168590209,32971472.0 17 | 1518033024,2168590209,32971472.0 18 | 1555842123,2168590209, 19 | 1670759301,2168590209, 20 | 1684005362,2168590209,1313525311.0 21 | 1831455963,2168590209, 22 | 1937746889,2168590209,32971472.0 23 | 1967173439,2168590209,184840846.0 24 | 1969621117,2168590209, 25 | 1978232283,2168590209, 26 | 1978249208,2168590209, 27 | 1978859882,2168590209, 28 | 1980069844,2168590209,32971472.0 29 | 1980512427,2168590209,184840846.0 30 | 1980679837,2168590209, 31 | 1983689926,2168590209,184840846.0 32 | 1986330843,2168590209, 33 | 1994617550,2168590209,32971472.0 34 | 2000983103,2168590209, 35 | 2001302835,2168590209, 36 | 2005761128,2168590209,32971472.0 37 | 2015660202,2168590209,184840846.0 38 | 2016131879,2168590209, 39 | 2016297089,2168590209,32971472.0 40 | 2016717705,2168590209,32971472.0 41 | 2017661443,2168590209, 42 | 2024220877,2168590209,32971472.0 43 | 2024397888,2168590209,184840846.0 44 | 2026682897,2168590209, 45 | 
2027129137,2168590209,32971472.0 46 | 2030043624,2168590209, 47 | 2036760271,2168590209, 48 | 2036940947,2168590209,32971472.0 49 | 2038636345,2168590209, 50 | 2044177896,2168590209,32971472.0 51 | 2046888612,2168590209,184840846.0 52 | 2050417652,2168590209,184840846.0 53 | 2056243786,2168590209,184840846.0 54 | 2056481117,2168590209,32971472.0 55 | 2058068992,2168590209,32971472.0 56 | 2062681226,2168590209,184840846.0 57 | 2065072453,2168590209, 58 | 2065077709,2168590209, 59 | 2065591377,2168590209, 60 | 2073292063,2168590209,32971472.0 61 | 2081767399,2168590209, 62 | 2089382829,2168590209,32971472.0 63 | 2092926413,2168590209, 64 | 2095344572,2168590209,184840846.0 65 | 2099545637,2168590209,184840846.0 66 | 2106493299,2168590209, 67 | 2119931004,2168590209,32971472.0 68 | 2141946863,2168590209,184840846.0 69 | 2143409410,2168590209,32971472.0 70 | 2153339430,2168590209,184840846.0 71 | 2154912073,2168590209,184840846.0 72 | 2162316463,2168590209, 73 | 2188402146,2168590209, 74 | 2242689142,2168590209, 75 | 2577637059,2168590209, 76 | 2601451314,2168590209, 77 | 2625037326,2168590209, 78 | 2802810717,2168590209,1313525311.0 79 | 2884933832,2168590209,32971472.0 80 | 2952384988,2168590209,184840846.0 81 | 2952841889,2168590209,32971472.0 82 | 2960356156,2168590209,184840846.0 83 | 2999970405,2168590209, 84 | 3053325085,2168590209, 85 | -------------------------------------------------------------------------------- /pyscisci/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'pyscisci' 2 | __title__ = 'pyscisci: A python package for the science of science' 3 | __description__ = 'Lets study science!' 4 | 5 | __copyright__ = '2021, Gates, A.J.' 6 | 7 | __author__ = """\n""".join([ 8 | 'Alexander J Gates ' 9 | ]) 10 | 11 | __version__ = '0.92' 12 | __release__ = '0.92' 13 | -------------------------------------------------------------------------------- /pyscisci/all.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. module:: all 4 | :synopsis: easy interface to all of pyscisci 5 | 6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | 
9 | from pyscisci.utils import *
10 | from pyscisci.methods.publication import *
11 | from pyscisci.methods.journal import *
12 | from pyscisci.methods.author import *
13 | from pyscisci.methods.referencestrength import *
14 | from pyscisci.datasource.readwrite import load_preprocessed_data, append_to_preprocessed
15 | from pyscisci.network import *
16 | from pyscisci.sparsenetworkutils import *
17 | from pyscisci.nlp import *
18 | from pyscisci.datasource.MAG import MAG
19 | from pyscisci.datasource.WOS import WOS
20 | from pyscisci.datasource.DBLP import DBLP
21 | from pyscisci.datasource.APS import APS
22 | from pyscisci.datasource.PubMed import PubMed
23 | from pyscisci.datasource.OpenAlex import OpenAlex
24 | from pyscisci.datasource.CustomDB import CustomDB
25 | from pyscisci.filter import *
26 | from pyscisci.visualization import *
27 | 
-------------------------------------------------------------------------------- /pyscisci/datasource/CustomDB.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import gzip
5 | import zipfile
6 | 
7 | import pandas as pd
8 | import numpy as np
9 | from nameparser import HumanName
10 | 
11 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
12 | if 'ipykernel' in sys.modules:
13 | from tqdm.notebook import tqdm
14 | else:
15 | from tqdm import tqdm
16 | 
17 | from pyscisci.datasource.readwrite import load_preprocessed_data, load_int, load_float, load_html_str
18 | from pyscisci.database import BibDataBase
19 | from pyscisci.utils import download_file_from_google_drive
20 | 
21 | # hide these annoying performance warnings
22 | import warnings
23 | warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
24 | 
25 | 
26 | class CustomDB(BibDataBase):
27 | """
28 | Base class for creating a CustomDB.
29 | 
30 | """
31 | 
32 | def __init__(self, path2database = '', database_extension='csv.gz', keep_in_memory = False, global_filter=None,
33 | enable_dask=False, show_progress=True):
34 | 
35 | self._default_init(path2database, database_extension, keep_in_memory, global_filter, enable_dask, show_progress)
36 | 
37 | self.PublicationIdType = int
38 | self.AffiliationIdType = int
39 | self.AuthorIdType = int
40 | self.JournalIdType = int
41 | 
42 | def set_new_data_paths(self, new_path_dict={}):
43 | """
44 | Override the paths to the dataframe collections based on a new custom hierarchy.
45 | 
46 | Parameters
47 | --------
48 | new_path_dict : dict
49 | A dictionary where each key is a dataframe name to override, e.g. 'author', 'pub', 'paa', 'pub2field', etc.,
50 | and each item is the new dataframe path.
51 | 
52 | """
53 | for dfname, new_path in new_path_dict.items():
54 | self.set_new_data_path(dfname, new_path)
55 | 
-------------------------------------------------------------------------------- /pyscisci/datasource/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/pyscisci/datasource/__init__.py -------------------------------------------------------------------------------- /pyscisci/filter.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: filter
4 | :synopsis: filters for restricting analyses to subsets of the database
5 | 
6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | 
9 | class RangeFilter():
10 | 
11 | def __init__(self, field, min_value=None, max_value=None):
12 | 
13 | self.field = field
14 | self.min = min_value
15 | self.max = max_value
16 | 
17 | self.check_value = lambda x: False
18 | 
19 | if not self.min is None and not self.max is None:
20 | self.check_value = self.fullrange
21 | 
22 | elif not self.min is None and self.max is None:
23 | self.check_value = self.lowerbound
24 | 
25 | elif self.min is None and not self.max is None:
26 | self.check_value = self.upperbound
27 | 
28 | else:
29 | raise ValueError("One of min_value or max_value must be set.")
30 | 
31 | 
32 | def fullrange(self, value):
33 | return (value >= self.min) and (value <= self.max)
34 | 
35 | def lowerbound(self, value):
36 | return (value >= self.min)
37 | 
38 | def upperbound(self, value):
39 | return (value <= self.max)
40 | 
41 | class SetFilter():
42 | 
43 | def __init__(self, field, value_set=None):
44 | 
45 | self.field = field
46 | self.value_set = set(value_set) if not value_set is None else set()
47 | 
48 | def check_value(self, value):
49 | return value in self.value_set
50 | 
51 | class YearFilter(RangeFilter):
52 | 
53 | def __init__(self, min_year=None, max_year=None):
54 | 
55 | self.field = 'Year'
56 | self.min = min_year
57 | self.max = max_year
58 | 
59 | self.check_value = lambda x: False
60 | 
61 | if not self.min is None and not self.max is None:
62 | self.check_value = self.fullrange
63 | 
64 | elif not self.min is None and self.max is None:
65 | self.check_value = self.lowerbound
66 | 
67 | elif self.min is None and not self.max is None:
68 | self.check_value = self.upperbound
69 | 
70 | class DocTypeFilter(SetFilter):
71 | 
72 | def __init__(self, doctypes=[]):
73 | 
74 | self.field = 'DocType'
75 | self.value_set=set(doctypes)
76 | 
77 | class FieldFilter(SetFilter):
78 | 
79 | def __init__(self, valid_fields=[]):
80 | 
81 | self.field = 'FieldId'
82 | self.value_set=set(valid_fields)
83 | 
84 | class JournalFilter(SetFilter):
85 | 
86 | def __init__(self, valid_journals=[]):
87 | 
88 | self.field = 'JournalId'
89 | self.value_set=set(valid_journals)
90 | 
91 | 
92 | 
-------------------------------------------------------------------------------- /pyscisci/methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/pyscisci/methods/__init__.py -------------------------------------------------------------------------------- /pyscisci/methods/careertopics.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: career topics
4 | :synopsis: Calculate the co-citing network and detect career communities.
5 | 
6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import sys
10 | 
11 | import pandas as pd
12 | import numpy as np
13 | import networkx as nx
14 | import scipy.sparse as spsparse
15 | 
16 | 
17 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
18 | if 'ipykernel' in sys.modules:
19 | from tqdm.notebook import tqdm
20 | else:
21 | from tqdm import tqdm
22 | 
23 | from pyscisci.utils import isin_sorted
24 | from pyscisci.network import cociting_network
25 | from pyscisci.sparsenetworkutils import largest_connected_component_vertices
26 | 
27 | ## Career Cociting Network
28 | def career_cociting_network_topics(paa, pub2ref, randomize=None, return_network=False, show_progress=False):
29 | """
30 | This function calculates the topics throughout a career based on the co-citing network (two publications are linked if they share a reference).
31 | See :cite:`zeng2019topicswitch` for details.
32 | 
33 | Parameters
34 | ----------
35 | paa : dataframe
36 | The publication author affiliation linkages for the focus author.
37 | 
38 | pub2ref : dataframe
39 | The citing-cited publication linkages which contains the citing articles from the focus author.
40 | 
41 | randomize : int, default None
42 | Seed for the random initialization of the community detection algorithm.
43 | 
44 | return_network : bool, default False
45 | Return the networkx object of the co-citing network.
46 | 
47 | show_progress : bool, default False
48 | Show calculation progress.
49 | 
50 | Returns
51 | ----------
52 | switching_career : DataFrame
53 | The paa with topic number included (topics are detected using the Louvain algorithm in the co-citing network).
54 | 
55 | cociting_net : networkx.Graph(), optional
56 | If 'return_network == True' then the cociting network is returned as a networkx graph object. 
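Examples
--------
A minimal usage sketch (``paa`` and ``pub2ref`` are assumed to already hold the focus author's publication-author linkages and citing references, loaded beforehand from one of the data sources; the variable names here are illustrative only):

>>> switching_career = career_cociting_network_topics(paa, pub2ref, randomize=0)
>>> switching_career.groupby('TopicCommunity')['PublicationId'].nunique()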
57 | 58 | """ 59 | try: 60 | from cdlib import algorithms 61 | except ImportError: 62 | raise ImportError("Optional package cdlib needed for this analysis: pip install cdlib") 63 | 64 | try: 65 | from clusim.clustering import Clustering 66 | except ImportError: 67 | raise ImportError("Optional package clusim needed for this analysis: pip install clusim") 68 | 69 | focus_pub_ids = np.sort(paa['PublicationId'].unique()) 70 | 71 | # find the co-citing network 72 | cociting_adjmat, cociting2int = cociting_network(pub2ref, focus_pub_ids=focus_pub_ids, 73 | focus_constraint='citing', 74 | cited_col_name = 'CitedPublicationId', 75 | citing_col_name = 'CitingPublicationId') 76 | 77 | # now take the largest connected component 78 | lcc_nodes = largest_connected_component_vertices(cociting_adjmat) 79 | 80 | remapnodes = {nid:i for i, nid in enumerate(lcc_nodes)} 81 | cociting2int = {pid:remapnodes[i] for pid, i in cociting2int.items() if not remapnodes.get(i, None) is None} 82 | 83 | lcc_cociting_adjmat = spsparse.csr_matrix(cociting_adjmat)[lcc_nodes][:,lcc_nodes] 84 | 85 | # remove self-loops and binarize 86 | lcc_cociting_adjmat.setdiag(0) 87 | lcc_cociting_adjmat.data[lcc_cociting_adjmat.data >1] = 1 88 | lcc_cociting_adjmat.eliminate_zeros() 89 | 90 | lcc_cociting_net = nx.Graph(lcc_cociting_adjmat) 91 | coms = algorithms.louvain(lcc_cociting_net, resolution=1., randomize=randomize) 92 | louvain_communities = Clustering().from_cluster_list(coms.communities) 93 | 94 | pub2topiccomm = {pid:list(louvain_communities.elm2clu_dict[pid])[0] for pid in lcc_cociting_net.nodes()} 95 | 96 | switching_career = paa[['PublicationId', 'AuthorId', 'Year']].copy() 97 | switching_career.drop_duplicates(subset=['PublicationId'], inplace=True) 98 | switching_career = switching_career.loc[isin_sorted(switching_career['PublicationId'].values, np.sort(list(cociting2int.keys())))] 99 | 100 | switching_career['TopicCommunity'] = [pub2topiccomm[cociting2int[pid]] for pid in switching_career['PublicationId'].values] 101 | 102 | switching_career['Degree'] = [lcc_cociting_net.degree()[cociting2int[pid]] for pid in switching_career['PublicationId'].values] 103 | 104 | switching_career.dropna(inplace=True) 105 | 106 | switching_career.sort_values('Year', inplace=True) 107 | 108 | if return_network: 109 | nx.set_node_attributes(lcc_cociting_net, pub2topiccomm, "TopicCommunity") 110 | nx.set_node_attributes(lcc_cociting_net, {i:pid for pid,i in cociting2int.items()}, "PublicationId") 111 | 112 | return switching_career, lcc_cociting_net 113 | else: 114 | return switching_career 115 | -------------------------------------------------------------------------------- /pyscisci/methods/cindex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. module:: cindex 4 | :synopsis: Calculate the cindex. 5 | 6 | .. moduleauthor:: Alex Gates 7 | """ 8 | 9 | import sys 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | # determine if we are loading from a jupyter notebook (to make pretty progress bars) 15 | if 'ipykernel' in sys.modules: 16 | from tqdm.notebook import tqdm 17 | else: 18 | from tqdm import tqdm 19 | 20 | from pyscisci.utils import zip2dict 21 | 22 | 23 | def compute_cindex(df, colgroupby, colcountby, show_progress=False): 24 | """ 25 | Calculate the cindex for each group in the DataFrame (the number of citations to the maximum cited publication). 26 | See :cite:`Waltman2008index` for detailed definition. 
27 | 
28 | Parameters
29 | ----------
30 | :param df : DataFrame
31 | A DataFrame with the citation information for each Author.
32 | 
33 | :param colgroupby : str
34 | The DataFrame column with Author Ids.
35 | 
36 | :param colcountby : str
37 | The DataFrame column with Citation counts for each publication.
38 | 
39 | Returns
40 | -------
41 | DataFrame
42 | DataFrame with 2 columns: colgroupby, 'Cindex'
43 | 
44 | """
45 | # register our pandas apply with tqdm for a progress bar
46 | tqdm.pandas(desc='cindex', disable= not show_progress)
47 | 
48 | newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Cindex']*2)
49 | return df.groupby(colgroupby, sort=False)[colcountby].max().to_frame().reset_index().rename(columns=newname_dict)
50 | 
51 | def compute_pindex(df, colgroupby, colcountby, show_progress=False):
52 | """
53 | Calculate the pindex for each group in the DataFrame (the number of publications with >0 citations).
54 | See :cite:`Waltman2008index` for detailed definition.
55 | 
56 | Parameters
57 | ----------
58 | :param df : DataFrame
59 | A DataFrame with the citation information for each Author.
60 | 
61 | :param colgroupby : str
62 | The DataFrame column with Author Ids.
63 | 
64 | :param colcountby : str
65 | The DataFrame column with Citation counts for each publication.
66 | 
67 | Returns
68 | -------
69 | DataFrame
70 | DataFrame with 2 columns: colgroupby, 'Pindex'
71 | 
72 | """
73 | # register our pandas apply with tqdm for a progress bar
74 | tqdm.pandas(desc='pindex', disable= not show_progress)
75 | 
76 | newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Pindex']*2)
77 | return df.groupby(colgroupby, sort=False)[colcountby].apply(lambda s: (s > 0).sum()).to_frame().reset_index().rename(columns=newname_dict)
78 | 
-------------------------------------------------------------------------------- /pyscisci/methods/creditshare.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: credit sharing
4 | :synopsis: Set of functions for calculating credit share amongst authors.
5 | 
6 | .. moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import pandas as pd
10 | import numpy as np
11 | 
12 | from pyscisci.utils import isin_sorted, groupby_count
13 | from pyscisci.network import cocitation_network
14 | 
15 | def credit_share(focus_pid, pub2ref, pub2author, temporal=False, normed=False, show_progress=False):
16 | """
17 | Calculate the credit share for each author of a publication based on :cite:`Shen2014credit`.
18 | 
19 | Parameters
20 | ----------
21 | :param focus_pid : int, str
22 | The focus publication id.
23 | 
24 | :param pub2ref : DataFrame
25 | A DataFrame with the citation information for each Publication.
26 | 
27 | :param pub2author : DataFrame
28 | A DataFrame with the author information for each Publication.
29 | 
30 | :param temporal : bool, default False
31 | If True, compute the adjacency matrix using only publications for each year.
32 | 
33 | :param normed : bool, default False
34 | Normalize the sum of credit share to 1.0
35 | 
36 | :param show_progress : bool, default False
37 | If True, show a progress bar tracking the calculation.
38 | 
39 | Returns
40 | -------
41 | credit_share, numpy array
42 | If temporal == False:
43 | The credit share vector, with one entry for each author of the focus publication.
44 | 
45 | If temporal == True:
46 | A matrix of credit shares, with a row for each author of the focus publication and a column for each
47 | citing year (credit accumulates over the years). 
48 | 49 | author2int, dict 50 | A mapping of the AuthorIds from the focus publication to the column of the credit share vector or matrix (see above). 51 | 52 | """ 53 | 54 | # the focus publication's authors 55 | focus_authors = np.sort(pub2author.loc[pub2author['PublicationId']==focus_pid]['AuthorId'].unique()) 56 | author2int = {aid:i for i, aid in enumerate(focus_authors)} 57 | 58 | if focus_authors.shape[0] > 1: 59 | # start by getting the co-citation network around the focus publication 60 | adj_mat, cited2int = cocitation_network(pub2ref, focus_pub_ids=np.sort([focus_pid]), focus_constraint='egocited', 61 | temporal=temporal, show_progress=show_progress) 62 | 63 | # get the authorships for the publications in the cocitation network 64 | cocited_pubs = np.sort(list(cited2int.keys())) 65 | pa = pub2author.loc[isin_sorted(pub2author['PublicationId'].values, cocited_pubs)] 66 | 67 | if cocited_pubs.shape[0] > 0: 68 | # the credit allocation matrix has a row for each focus author, and a column for each cocited publication (including the focus pub) 69 | credit_allocation_mat = np.zeros((focus_authors.shape[0], cocited_pubs.shape[0]), dtype = float) 70 | 71 | # for each cocited publication, we count the number of authors 72 | # and assign to each focus author, their fractional share of the credit (1 divided by the number of authors) 73 | for cocitedid, adf in pa.groupby('PublicationId'): 74 | author2row = [author2int[aid] for aid in adf['AuthorId'].unique() if not author2int.get(aid, None) is None] 75 | if len(author2row) > 0: 76 | credit_allocation_mat[author2row, cited2int[cocitedid]] = 1.0/adf['AuthorId'].nunique() 77 | 78 | if temporal: 79 | # temporal credit allocation - broken down by year 80 | 81 | # we need the temporal citations to the focus article 82 | focus_citations = groupby_count(pub2ref.loc[isin_sorted(pub2ref['CitedPublicationId'].values, np.sort([focus_pid]))], 83 | colgroupby='CitingYear', colcountby='CitingPublicationId', count_unique=True, show_progress=False) 84 | focus_citations={y:c for y,c in focus_citations[['CitingYear', 'CitingPublicationIdCount']].values} 85 | 86 | # when temporal is True, a temporal adj mat is returned where each key is the year 87 | years = np.sort(list(adj_mat.keys())) 88 | 89 | cocite_counts = np.zeros((years.shape[0], cocited_pubs.shape[0]), dtype=float) 90 | 91 | for iy, y in enumerate(years): 92 | cocite_counts[iy] = adj_mat[y].tocsr()[cited2int[focus_pid]].todense()#set the off-diagonal to be the total co-citations from that year 93 | cocite_counts[iy, cited2int[focus_pid]] = focus_citations[y] #set the diagonal to be the total citations from that year 94 | 95 | cocite_counts = cocite_counts.cumsum(axis=0) 96 | 97 | else: 98 | # just do credit allocation with the full cocitation matrix 99 | cocite_counts = adj_mat.tocsr()[cited2int[focus_pid]].todense() 100 | 101 | # the co-citation matrix misses the number of citations to the focus publication 102 | # so explicitly calculate the number of citations to the focus publication 103 | cocite_counts[0,cited2int[focus_pid]] = pub2ref.loc[isin_sorted(pub2ref['CitedPublicationId'].values, np.sort([focus_pid]))]['CitingPublicationId'].nunique() 104 | 105 | # credit share is the matrix product of the credit_allocation_mat with cocite_counts 106 | credit_share = np.squeeze(np.asarray(credit_allocation_mat.dot(cocite_counts.T))) 107 | 108 | # normalize the credit share vector to sum to 1 109 | if normed: 110 | credit_share = credit_share/credit_share.sum(axis=0) 111 | 112 | if temporal: 113 | return 
credit_share, author2int, years
114 | else:
115 | return credit_share, author2int
116 | else:
117 | if temporal:
118 | years = np.sort(pub2ref.loc[pub2ref['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
119 | return np.array([[None for y in years] for a in author2int]), author2int, years
120 | else:
121 | return np.array([None for a in author2int]), author2int
122 | 
123 | elif focus_authors.shape[0] == 1:
124 | if temporal:
125 | years = np.sort(pub2ref.loc[pub2ref['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
126 | return np.ones(shape=(1,years.shape[0])), author2int, years
127 | else:
128 | return np.array([1.0]), author2int
-------------------------------------------------------------------------------- /pyscisci/methods/diffusionscientificcredit.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: diffusionscientificcredit
4 | :synopsis: Rank authors based on the pagerank within their citation graph.
5 | 
6 | .. moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import pandas as pd
10 | import numpy as np
11 | 
12 | from pyscisci.utils import isin_sorted, groupby_count, groupby_total
13 | from pyscisci.network import cocitation_network
14 | from pyscisci.sparsenetworkutils import dataframe2bipartite, sparse_pagerank_scipy
15 | 
16 | def diffusion_of_scientific_credit(pub2ref, pub2author, pub=None, alpha = 0.9, max_iter = 100, tol = 1.0e-10):
17 | """
18 | Calculate the diffusion of scientific credits for each author based on :cite:`Radicchi2009authorpagerank`.
19 | 
20 | Parameters
21 | ----------
22 | 
23 | :param pub2ref : DataFrame
24 | A DataFrame with the citation information for each Publication.
25 | 
26 | :param pub2author : DataFrame
27 | A DataFrame with the author information for each Publication.
28 | 
29 | :param pub : DataFrame
30 | A DataFrame with the publication information for each Publication. If it contains a 'TeamSize' column, those counts are used directly.
31 | 
32 | :param alpha : float, default 0.9
33 | The PageRank reset probability
34 | 
35 | :param max_iter : int, default 100
36 | The maximum number of iterations when applying the power method.
37 | 
38 | :param tol : float, default 1.0e-10
39 | The error tolerance when applying the power method.
40 | 
41 | Returns
42 | -------
43 | sc, numpy array
44 | The diffusion of scientific credit (PageRank) score for each author.
45 | 
46 | author2int, dict
47 | A mapping of the AuthorIds onto the indices of the score vector sc.
48 | The top-ranked authors can be recovered by inverting the mapping,
49 | int2aid = {i:aid for aid, i in author2int.items()},
50 | and sorting sc.
51 | 
52 | 
53 | 
54 | """
55 | 
56 | """
57 | Diffusion of Scientific Credits and the Ranking of Scientists,
58 | Radicchi et al. (2009) Phys Rev E.
59 | 
60 | 
61 | The method builds an author-level citation network: each publication-to-publication citation
62 | is expanded into all of its citing-author/cited-author pairs, and each such edge is
63 | weighted by 1/(teamsize of the citing paper * teamsize of the cited paper).
64 | 
65 | PageRank, personalized by each author's fractional productivity, is then run on this
66 | weighted author network to obtain each author's share of scientific credit.
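Example (mirroring the APS ranking notebook earlier in this repository, where pub,
pub2ref and pub2author were loaded beforehand from an APS database object):

>>> sc, author2int = diffusion_of_scientific_credit(pub2ref, pub2author, pub=pub,
...     alpha=0.9, max_iter=100, tol=1.0e-10)
>>> int2aid = {i: aid for aid, i in author2int.items()}  # map indices back to AuthorIds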
67 | """ 68 | 69 | # relabel the authors to map to network nodes 70 | focus_authors = np.sort(pub2author['AuthorId'].unique()) 71 | author2int = {aid:i for i, aid in enumerate(focus_authors)} 72 | Nauthors = len(author2int) 73 | 74 | pub2author.drop_duplicates(subset=['PublicationId', 'AuthorId'], inplace=True) 75 | pub2author['AuthorId'] = [author2int.get(aid, None) for aid in pub2author['AuthorId'].values] 76 | 77 | # check if we are given the teamsize in publication information 78 | if (not pub is None) and 'TeamSize' in list(pub): 79 | teamsize = {pid:ts for pid, ts in pub[['PublicationId', 'TeamSize']].values} 80 | 81 | # otherwies we need to calculate teamsize based on the authorship information 82 | else: 83 | teamsize = pub2author.groupby('PublicationId')['AuthorId'].nunique() 84 | 85 | 86 | full_citation = pub2ref.merge(pub2author[['PublicationId', 'AuthorId']], left_on = 'CitingPublicationId', right_on = 'PublicationId') 87 | del full_citation['PublicationId'] 88 | full_citation.rename(columns={'AuthorId':'CitingAuthorId'}, inplace=True) 89 | 90 | full_citation = full_citation.merge(pub2author[['PublicationId', 'AuthorId']], left_on = 'CitedPublicationId', right_on = 'PublicationId') 91 | del full_citation['PublicationId'] 92 | full_citation.rename(columns={'AuthorId':'CitedAuthorId'}, inplace=True) 93 | 94 | full_citation.dropna(inplace=True) 95 | 96 | # now add in the teamsize information to make edge weights 97 | full_citation['edge_weight'] = [1.0/(teamsize.get(citing_pid, 1) * teamsize.get(cited_pid, 1)) for citing_pid, cited_pid in full_citation[['CitingPublicationId', 'CitedPublicationId']].values] 98 | 99 | 100 | adj_mat = dataframe2bipartite(full_citation, rowname='CitingAuthorId', colname='CitedAuthorId', 101 | shape = (Nauthors,Nauthors), weightname = 'edge_weight') 102 | 103 | 104 | # make the weighted productivity vector to intialize the pagerank 105 | pub2author['AuthorCredit'] = [1/teamsize.get(pid, 1) for pid in pub2author['PublicationId'].values] 106 | weighted_productivity = groupby_total(pub2author, colgroupby = 'AuthorId', colcountby = 'AuthorCredit').sort_values('AuthorId') 107 | # norm vector 108 | weighted_productivity['AuthorCreditTotal'] = weighted_productivity['AuthorCreditTotal'] / weighted_productivity['AuthorCreditTotal'].sum() 109 | 110 | # run the power method to solve the diffusion 111 | sc = sparse_pagerank_scipy(adj_mat, alpha= alpha, 112 | personalization=weighted_productivity['AuthorCreditTotal'].values, 113 | initialization=weighted_productivity['AuthorCreditTotal'].values, 114 | max_iter=max_iter, tol=tol, dangling=None) 115 | 116 | return sc, author2int 117 | -------------------------------------------------------------------------------- /pyscisci/methods/disruption.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. module:: distruption index 4 | :synopsis: Set of functions for finding the disruption index. 5 | 6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | import sys
9 | import pandas as pd
10 | import numpy as np
11 | 
12 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
13 | if 'ipykernel' in sys.modules:
14 | from tqdm.notebook import tqdm
15 | else:
16 | from tqdm import tqdm
17 | 
18 | ### Disruption
19 | def disruption_index(pub2ref, focus_pub_ids = None, cite_window = None, ref_window = None, show_progress=False):
20 | """
21 | Calculate the disruption index as first proposed in :cite:`Funk2017disrupt` and used in :cite:`Wu2019teamsdisrupt`.
22 | We also include the windowed disruption index used in :cite:`Park2023timedisrupt`.
23 | 
24 | Parameters
25 | ----------
26 | 
27 | :param pub2ref : DataFrame
28 | A DataFrame with the citation information for each Publication.
29 | 
30 | :param focus_pub_ids : numpy array
31 | A subset of publication ids to focus on for the disruption index.
32 | 
33 | :param cite_window : list of two ints, default None
34 | If None, no citation window is applied.
35 | If [l, u], where l and u are ints, then only citations whose year difference is greater than or equal to the lower bound l and
36 | less than or equal to the upper bound u are used, e.g. [0,5] uses citations within 5 years of publication (and not before publication).
37 | 
38 | :param ref_window : list of two ints, default None
39 | If None, no reference window is applied.
40 | If [l, u], where l and u are ints, then only references whose year difference is greater than or equal to the lower bound l and
41 | less than or equal to the upper bound u are used, e.g. [0,5] uses references within 5 years of publication (and not after).
42 | 
43 | show_progress : bool, default False
44 | Show calculation progress.
45 | 
46 | Returns
47 | -------
48 | disruption : DataFrame
49 | A DataFrame with the disruption index for all (cited) publications or publications from the focus_pub_ids list. 
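Notes
-----
Following the implementation below: for a focus publication, let n_i be the number of
citing publications that cite the focus publication but none of its references, n_j the
number that cite both the focus publication and at least one of its references, and n_k
the number that cite at least one of its references but not the focus publication itself.
The disruption index is then

    D = (n_i - n_j) / (n_i + n_j + n_k)

so that D approaches 1 for a maximally disruptive publication and -1 for a maximally
consolidating one.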
50 | 51 | 52 | """ 53 | if show_progress: 54 | print("Starting computation of disruption index.") 55 | 56 | if ref_window is None: 57 | reference_groups = pub2ref.groupby('CitingPublicationId', sort = False)['CitedPublicationId'] 58 | else: 59 | ref_sub = [ ((y1-y2) >= ref_window[0] and (y1-y2) <=ref_window[1]) for y1,y2 in pub2ref[['CitingYear', 'CitedYear']].values] 60 | reference_groups = pub2ref.loc[ref_sub].groupby('CitingPublicationId', sort = False)['CitedPublicationId'] 61 | 62 | if cite_window is None: 63 | citation_groups = pub2ref.groupby('CitedPublicationId', sort = False)['CitingPublicationId'] 64 | else: 65 | cite_sub = [ ((y1-y2) >= cite_window[0] and (y1-y2) <=cite_window[1]) for y1,y2 in pub2ref[['CitingYear', 'CitedYear']].values] 66 | citation_groups = pub2ref.loc[cite_sub].groupby('CitedPublicationId', sort = False)['CitingPublicationId'] 67 | 68 | if focus_pub_ids is None: 69 | if cite_window is None: 70 | focus_pub_ids = pub2ref['CitedPublicationId'].unique() 71 | else: 72 | focus_pub_ids = pub2ref.loc[cite_sub]['CitedPublicationId'].unique() 73 | 74 | def get_citation_groups(pid): 75 | try: 76 | return citation_groups.get_group(pid).values 77 | except KeyError: 78 | return np.array([]) 79 | 80 | def _disruption_index(focusid): 81 | 82 | # if the focus publication has no references or citations, then it has a disruption of None 83 | try: 84 | focusref = reference_groups.get_group(focusid) 85 | except KeyError: 86 | return None 87 | 88 | try: 89 | citing_focus = citation_groups.get_group(focusid) 90 | except KeyError: 91 | return None 92 | 93 | 94 | # implementation 1: keep it numpy 95 | #cite2ref = reduce(np.union1d, [get_citation_groups(refid) for refid in focusref]) 96 | #nj = np.intersect1d(cite2ref, citing_focus.values).shape[0] 97 | #nk = cite2ref.shape[0] - nj 98 | 99 | # implementation 2: but dicts are faster... 100 | cite2ref = {citeid:1 for refid in focusref for citeid in get_citation_groups(refid)} 101 | nj = sum(cite2ref.get(pid, 0) for pid in citing_focus.values ) 102 | nk = len(cite2ref) - nj 103 | 104 | ni = citing_focus.shape[0] - nj 105 | 106 | return float(ni - nj)/(ni + nj + nk) 107 | 108 | disrupt = [[focusciting, _disruption_index(focusciting)] for focusciting 109 | in tqdm(focus_pub_ids, leave=True, desc='Disruption Index', disable= not show_progress) if get_citation_groups(focusciting).shape[0] > 0] 110 | 111 | return pd.DataFrame(disrupt, columns = ['PublicationId', 'DisruptionIndex']) 112 | 113 | -------------------------------------------------------------------------------- /pyscisci/methods/diversity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. module:: interdisciplinary 4 | :synopsis: Set of functions for typical interdisciplinary analysis 5 | 6 | .. moduleauthor:: Alex Gates 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | from ..utils import isin_sorted, zip2dict, check4columns, simpson, simpson_finite, shannon_entropy 13 | 14 | 15 | 16 | def simpson_interdisciplinarity(pub2ref, pub2field, focus_pub_ids=None, 17 | citation_direction='references', finite_correction=False, show_progress=False): 18 | """ 19 | Calculate the Simpson index as a measure of a publication's interdisciplinarity. 20 | See :cite:`stirling20` for the definition. 21 | 22 | Parameters 23 | ---------- 24 | :param pub2ref : DataFrame 25 | A DataFrame with the citation information for each Publication. 
26 | 
27 | :param pub2field : DataFrame
28 | A DataFrame with the field information for each Publication.
29 | 
30 | :param focus_pub_ids : numpy array or list, default None
31 | A list of the PublicationIds to calculate interdisciplinarity.
32 | 
33 | :param finite_correction : bool, default False
34 | Whether to apply the correction for a finite sample.
35 | 
36 | :param show_progress : bool, default False
37 | If True, show a progress bar tracking the calculation.
38 | 
39 | Returns
40 | -------
41 | DataFrame
42 | DataFrame with 2 columns: 'PublicationId', 'SimpsonInterdisciplinarity'
43 | 
44 | """
45 | 
46 | # now we map citing and cited to the source and target depending on which direction was specified by 'citation_direction'
47 | if citation_direction == 'references':
48 | pub2ref_rename_dict = {'CitedPublicationId':'TargetId', 'CitingPublicationId':'SourceId'}
49 | year_col = 'CitingYear'
50 | elif citation_direction == 'citations':
51 | pub2ref_rename_dict = {'CitedPublicationId':'SourceId', 'CitingPublicationId':'TargetId'}
52 | year_col = 'CitedYear'
53 | 
54 | required_columns = ['CitedPublicationId', 'CitingPublicationId']
55 | check4columns(pub2ref, required_columns)
56 | pub2ref = pub2ref[required_columns].rename(columns=pub2ref_rename_dict)
57 | 
58 | check4columns(pub2field, ['PublicationId', 'FieldId'])
59 | 
60 | # merge the references to the fields for the target fields
61 | pub2ref = pub2ref.merge(pub2field, how='left', left_on='TargetId',
62 | right_on='PublicationId').rename(columns={'FieldId':'TargetFieldId'})
63 | del pub2ref['PublicationId']
64 | 
65 | pub2ref = pub2ref.dropna()
66 | 
67 | if finite_correction:
68 | simpdf = 1-pub2ref.groupby('SourceId')['TargetFieldId'].apply(simpson_finite)
69 | else:
70 | simpdf = 1-pub2ref.groupby('SourceId')['TargetFieldId'].apply(simpson)
71 | 
72 | simpdf = simpdf.to_frame().reset_index().rename(
73 | columns={'TargetFieldId':'SimpsonInterdisciplinarity', 'SourceId':'PublicationId'})
74 | 
75 | return simpdf
76 | 
77 | 
78 | def shannon_interdisciplinarity(pub2ref, pub2field, focus_pub_ids=None,
79 | citation_direction='references', normalized=False, K=None, show_progress=False):
80 | """
81 | Calculate the Shannon entropy as a measure of a publication's interdisciplinarity.
82 | See :cite:`stirling20` for the definition.
83 | 
84 | Parameters
85 | ----------
86 | :param pub2ref : DataFrame
87 | A DataFrame with the citation information for each Publication.
88 | 
89 | :param pub2field : DataFrame
90 | A DataFrame with the field information for each Publication.
91 | 
92 | :param focus_pub_ids : numpy array or list, default None
93 | A list of the PublicationIds to calculate interdisciplinarity.
94 | 
95 | :param citation_direction : str, default 'references'
96 | Compute interdisciplinarity over the 'references' (outgoing links) or the 'citations' (incoming links).
97 | 
98 | :param normalized : bool, default False
99 | If True, use the normalized entropy bounded by the number of observed fields
100 | or K if not None.
101 | 
102 | :param K : int, default None
103 | The maximum number of fields to consider.
104 | 
105 | :param show_progress : bool, default False
106 | If True, show a progress bar tracking the calculation. 
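Notes
-----
For each publication, let p_k denote the fraction of its linked publications (references
or citations, depending on citation_direction) that belong to field k. The value computed
below is the Shannon entropy H = -sum_k p_k ln(p_k); when normalized is True it is divided
by ln(K), so the result lies in [0, 1].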
107 | 
108 | Returns
109 | -------
110 | DataFrame
111 | DataFrame with 2 columns: 'PublicationId', 'ShannonInterdisciplinarity'
112 | 
113 | """
114 | 
115 | # now we map citing and cited to the source and target depending on which direction was specified by 'citation_direction'
116 | if citation_direction == 'references':
117 | pub2ref_rename_dict = {'CitedPublicationId':'TargetId', 'CitingPublicationId':'SourceId'}
118 | year_col = 'CitingYear'
119 | elif citation_direction == 'citations':
120 | pub2ref_rename_dict = {'CitedPublicationId':'SourceId', 'CitingPublicationId':'TargetId'}
121 | year_col = 'CitedYear'
122 | 
123 | required_columns = ['CitedPublicationId', 'CitingPublicationId']
124 | check4columns(pub2ref, required_columns)
125 | pub2ref = pub2ref[required_columns].rename(columns=pub2ref_rename_dict)
126 | 
127 | check4columns(pub2field, ['PublicationId', 'FieldId'])
128 | 
129 | if K is None:
130 | K = pub2field['FieldId'].nunique()
131 | 
132 | # merge the references to the fields for the target fields
133 | pub2ref = pub2ref.merge(pub2field, how='left', left_on='TargetId',
134 | right_on='PublicationId').rename(columns={'FieldId':'TargetFieldId'})
135 | del pub2ref['PublicationId']
136 | 
137 | pub2ref = pub2ref.dropna()
138 | 
139 | shan_inter = pub2ref.groupby('SourceId')['TargetFieldId'].apply(shannon_entropy)
140 | shan_inter = shan_inter.to_frame().reset_index().rename(
141 | columns={'TargetFieldId':'ShannonInterdisciplinarity', 'SourceId':'PublicationId'})
142 | 
143 | if normalized:
144 | shan_inter['ShannonInterdisciplinarity'] = shan_inter['ShannonInterdisciplinarity']/np.log(K)
145 | 
146 | return shan_inter
147 | 
148 | 
-------------------------------------------------------------------------------- /pyscisci/methods/hindex.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: hindex
4 | :synopsis: Calculate the hindex.
5 | 
6 | .. moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import sys
10 | 
11 | import pandas as pd
12 | import numpy as np
13 | 
14 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
15 | if 'ipykernel' in sys.modules:
16 | from tqdm.notebook import tqdm
17 | else:
18 | from tqdm import tqdm
19 | 
20 | from pyscisci.utils import zip2dict
21 | 
22 | 
23 | ### H index
24 | 
25 | def hindex(a):
26 | """
27 | Calculate the h index for the array of citation values. See :cite:`hirsch2005index` for the definition.
28 | 
29 | Parameters
30 | ----------
31 | :param a : numpy array
32 | An array of citation counts for each publication by the Author.
33 | 
34 | Returns
35 | -------
36 | int
37 | The Hindex
38 | 
39 | """
40 | d = np.sort(a)[::-1] - np.arange(a.shape[0])
41 | return (d>0).sum()
42 | 
43 | def compute_hindex(df, colgroupby, colcountby, show_progress=False):
44 | """
45 | Calculate the h index for each group in the DataFrame. See :cite:`hirsch2005index` for the definition.
46 | 
47 | The algorithmic implementation for each author can be found in :py:func:`citationanalysis.author_hindex`.
48 | 
49 | Parameters
50 | ----------
51 | :param df : DataFrame
52 | A DataFrame with the citation information for each Author.
53 | 
54 | :param colgroupby : str
55 | The DataFrame column with Author Ids.
56 | 
57 | :param colcountby : str
58 | The DataFrame column with Citation counts for each publication. 
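Notes
-----
A quick worked example of the h-index computation above (with made-up citation counts):
for an author whose publications have citations [10, 8, 5, 4, 2], sorting in descending
order and subtracting the ranks 0..4 gives [10, 7, 3, 1, -2], which has four positive
entries, so the h-index is 4 (four publications each with at least 4 citations).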
59 | 
60 | Returns
61 | -------
62 | DataFrame
63 | DataFrame with 2 columns: colgroupby, 'Hindex'
64 | 
65 | """
66 | # register our pandas apply with tqdm for a progress bar
67 | tqdm.pandas(desc='Hindex', disable= not show_progress)
68 | 
69 | newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Hindex']*2)
70 | return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(hindex).to_frame().reset_index().rename(columns=newname_dict)
71 | 
72 | def gindex(a):
73 | """
74 | Calculate the g index for the array of citation values. See :cite:`Waltman2008index` for detailed definition.
75 | 
76 | Parameters
77 | ----------
78 | :param a : numpy array
79 | An array of citation counts for each publication by the Author.
80 | 
81 | Returns
82 | -------
83 | int
84 | The Gindex
85 | 
86 | """
87 | d = np.cumsum(np.sort(a)[::-1]) - (np.arange(a.shape[0]) + 1)**2
88 | return (d>=0).sum()
89 | 
90 | def compute_gindex(df, colgroupby, colcountby, show_progress=False):
91 | """
92 | Calculate the g index for each group in the DataFrame. See :cite:`Waltman2008index` for detailed definition.
93 | 
94 | The algorithmic implementation for each author can be found in :py:func:`citationanalysis.author_gindex`.
95 | 
96 | Parameters
97 | ----------
98 | :param df : DataFrame
99 | A DataFrame with the citation information for each Author.
100 | 
101 | :param colgroupby : str
102 | The DataFrame column with Author Ids.
103 | 
104 | :param colcountby : str
105 | The DataFrame column with Citation counts for each publication.
106 | 
107 | Returns
108 | -------
109 | DataFrame
110 | DataFrame with 2 columns: colgroupby, 'Gindex'
111 | 
112 | """
113 | # register our pandas apply with tqdm for a progress bar
114 | tqdm.pandas(desc='Gindex', disable= not show_progress)
115 | 
116 | newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Gindex']*2)
117 | return df.groupby(colgroupby, sort=False, as_index=False)[colcountby].progress_apply(gindex).rename(columns=newname_dict)
118 | 
-------------------------------------------------------------------------------- /pyscisci/methods/hotstreak.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: hotstreak
4 | :synopsis: Calculate the career hotstreak.
5 | 
6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | import sys
9 | import pandas as pd
10 | import numpy as np
11 | 
12 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
13 | if 'ipykernel' in sys.modules:
14 | from tqdm.notebook import tqdm
15 | else:
16 | from tqdm import tqdm
17 | 
18 | from pyscisci.utils import hard_rolling_window
19 | 
20 | 
21 | def piecewise_step(x, a,b,c,d):
22 | return np.piecewise(x, [x<a, np.logical_and(x>=a, x<=b), x>b], [c,d,c])
23 | 
24 | def piecewise_step_err(y, loc1, loc2):
25 | y1 = y[loc1:(loc2+1)]
26 | y0 = np.hstack([y[:loc1], y[(loc2+1):]])
27 | return np.sqrt(np.sum([np.sum((a - a.mean())**2) for a in [y0,y1]]))
28 | 
29 | def brut_fit_piecewise_step(ys):
30 | locs = np.vstack(np.triu_indices(ys.shape[0]-1, k=1)).T + 1
31 | errs = [piecewise_step_err(ys, i, j) for i,j in locs]
32 | 
33 | return np.min(errs), locs[np.argmin(errs)]
34 | 
35 | 
36 | def piecewise_step_err2(y, loc1, loc2, loc3, loc4):
37 | y1 = y[loc1:(loc2+1)]
38 | y2 = y[loc3:(loc4+1)]
39 | y0 = np.hstack([y[:loc1], y[(loc2+1):loc3], y[(loc4+1):]])
40 | return np.sqrt(np.sum([np.sum((a - a.mean())**2) for a in [y0,y1,y2]]))
41 | 
42 | def brut_fit_piecewise_step2(ys):
43 | locs1 = np.vstack(np.triu_indices(ys.shape[0]-3, k=1)).T + 1
44 | locs2 = np.array([[i,j,j+k,j+m] for i,j in locs1 for k,m in np.vstack(np.triu_indices(ys.shape[0]-j-1, k=1)).T])
45 | errs = [piecewise_step_err2(ys, i, j,k,m) for i,j,k,m in locs2]
46 | 
47 | return np.min(errs), locs2[np.argmin(errs)]
48 | 
49 | def career_hotstreak(author_career_df, citecol='c10', maxk=1, l1_lambda = 1.):
50 | """
51 | Identify hot streaks in author careers :cite:`liu2018hotstreak`.
52 | 
53 | TODO: this is an integer programming problem. Reimplement using an integer solver.
54 | Right now we just use a brute-force search (very inefficient)!
55 | 
56 | Parameters
57 | ----------
58 | author_career_df : DataFrame
59 | The author publication history.
60 | 
61 | citecol : str, default 'c10'
62 | The column with publication citation information.
63 | 
64 | maxk : int, default 1
65 | The maximum number of hot streaks to search for in a career. Should be 1 or 2.
66 | 
67 | l1_lambda : float, default 1.0
68 | The l1 regularization for the number of streaks.
69 | Note, the authors never define the value they used for this in the SI.
70 | 
71 | Returns
72 | ----------
73 | solution : DataFrame
74 | One row per detected hot streak, with columns 'Baseline' (the baseline impact level),
75 | 'StreakGamma' (the elevated impact level during the streak), and
76 | 'StreakStart' and 'StreakEnd' (the index locations of the streak boundaries).
77 | 
78 | """
79 | if maxk == 0 or maxk > 2:
80 | raise NotImplementedError("the career hotstreak is not implemented for this number of streaks. 
set maxk = 1 or maxk=2 ")
81 | 
82 | Delta_N = max(5, int(0.1*author_career_df.shape[0]))
83 | gamma_N = hard_rolling_window(np.log10(author_career_df[citecol].values), window=Delta_N, step_size = 1).mean(axis=1)
84 | gamma_N = gamma_N[int(Delta_N/2):-int(Delta_N/2)]
85 | 
86 | nostreak_err = np.sqrt(np.sum((gamma_N - gamma_N.mean())**2)) + l1_lambda # no step functions uses 1 model coefficient
87 | streak_gammas = [gamma_N.mean(), None, None]
88 | 
89 | streak_err, streak_loc1 = brut_fit_piecewise_step(gamma_N)
90 | streak_err += 3*l1_lambda # 1 step function = 3 model coefficients
91 | 
92 | streak_loc = [None]*4
93 | 
94 | if (nostreak_err <= streak_err):
95 | streak_err = nostreak_err
96 | nstreak = 0
97 | else:
98 | streak_loc[:2] = streak_loc1 + int(Delta_N/2)
99 | streak_gammas[0] = np.hstack([gamma_N[:streak_loc1[0]], gamma_N[(streak_loc1[1]+1):]]).mean()
100 | streak_gammas[1] = gamma_N[streak_loc1[0]:(streak_loc1[1]+1)].mean()
101 | nstreak = 1
102 | 
103 | if maxk == 2:
104 | streak_err2, streak_loc2 = brut_fit_piecewise_step2(gamma_N)
105 | streak_err2 += 6*l1_lambda # 2 step functions = 6 model coefficients
106 | if (streak_err > streak_err2):
107 | streak_err, streak_loc = streak_err2, list(streak_loc2 + int(Delta_N/2))
108 | streak_gammas[0] = np.hstack([gamma_N[:streak_loc2[0]], gamma_N[(streak_loc2[1]+1):streak_loc2[2]], gamma_N[(streak_loc2[3]+1):]]).mean()
109 | streak_gammas[1] = gamma_N[streak_loc2[0]:(streak_loc2[1]+1)].mean()
110 | streak_gammas[2] = gamma_N[streak_loc2[2]:(streak_loc2[3]+1)].mean()
111 | nstreak = 2
112 | 
113 | solution_df = [[streak_gammas[0], streak_gammas[1], streak_loc[0], streak_loc[1]]]
114 | if nstreak == 2:
115 | solution_df += [[streak_gammas[0], streak_gammas[2], streak_loc[2], streak_loc[3]]]
116 | return pd.DataFrame(solution_df, columns = ['Baseline', 'StreakGamma', 'StreakStart', 'StreakEnd'])
117 | 
-------------------------------------------------------------------------------- /pyscisci/methods/journal.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: journalcitation
4 | :synopsis: Set of functions for typical journal bibliometric citation analysis
5 | 
6 | .. moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import sys
10 | 
11 | import pandas as pd
12 | import numpy as np
13 | 
14 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
15 | if 'ipykernel' in sys.modules:
16 | from tqdm.notebook import tqdm
17 | else:
18 | from tqdm import tqdm
19 | 
20 | from pyscisci.utils import zip2dict, groupby_count
21 | from pyscisci.methods.hindex import compute_hindex
22 | 
23 | def journal_productivity(pub2journal, colgroupby = 'JournalId', colcountby = 'PublicationId', show_progress=False):
24 | """
25 | Calculate the total number of publications for each journal.
26 | 
27 | Parameters
28 | ----------
29 | :param pub2journal : DataFrame
30 | A DataFrame with the publication and journal information.
31 | 
32 | :param colgroupby : str, default 'JournalId', Optional
33 | The DataFrame column with Journal Ids. If None then the database 'JournalId' is used.
34 | 
35 | :param colcountby : str, default 'PublicationId', Optional
36 | The DataFrame column with Publication Ids. If None then the database 'PublicationId' is used. 
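Notes
-----
A minimal usage sketch (``pub`` is assumed to be a publication DataFrame that carries a
'JournalId' column, e.g. as loaded by one of the database objects above):

>>> jprod = journal_productivity(pub, colgroupby='JournalId', colcountby='PublicationId')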
-------------------------------------------------------------------------------- /pyscisci/methods/journal.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: journalcitation
    :synopsis: Set of functions for typical journal bibliometric citation analysis

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import zip2dict, groupby_count
from pyscisci.methods.hindex import compute_hindex

def journal_productivity(pub2journal, colgroupby='JournalId', colcountby='PublicationId', show_progress=False):
    """
    Calculate the total number of publications for each journal.

    Parameters
    ----------
    :param pub2journal : DataFrame
        A DataFrame with the publication-to-journal information.

    :param colgroupby : str, default 'JournalId', Optional
        The DataFrame column with Journal Ids.  If None then the database 'JournalId' is used.

    :param colcountby : str, default 'PublicationId', Optional
        The DataFrame column with Publication Ids.  If None then the database 'PublicationId' is used.

    Returns
    -------
    DataFrame
        Productivity DataFrame with 2 columns: 'JournalId', 'Productivity'

    """

    # we can use show_progress to pass a label for the progress bar
    if show_progress:
        show_progress = 'Journal Productivity'

    newname_dict = zip2dict([str(colcountby)+'Count', '0'], ['Productivity']*2)
    return groupby_count(pub2journal, colgroupby, colcountby, count_unique=True, show_progress=show_progress).rename(columns=newname_dict)

def journal_yearly_productivity(pub2journal, colgroupby='JournalId', datecol='Year', colcountby='PublicationId', show_progress=False):
    """
    Calculate the number of publications for each journal in each year.

    Parameters
    ----------
    :param pub2journal : DataFrame
        A DataFrame with the publication-to-journal information.

    :param colgroupby : str, default 'JournalId', Optional
        The DataFrame column with Journal Ids.  If None then the database 'JournalId' is used.

    :param datecol : str, default 'Year', Optional
        The DataFrame column with Year information.  If None then the database 'Year' is used.

    :param colcountby : str, default 'PublicationId', Optional
        The DataFrame column with Publication Ids.  If None then the database 'PublicationId' is used.

    Returns
    -------
    DataFrame
        Productivity DataFrame with 3 columns: 'JournalId', 'Year', 'YearlyProductivity'

    """

    # we can use show_progress to pass a label for the progress bar
    if show_progress:
        show_progress = 'Journal Yearly Productivity'

    newname_dict = zip2dict([str(colcountby)+'Count', '0'], ['YearlyProductivity']*2)
    return groupby_count(pub2journal, [colgroupby, datecol], colcountby, count_unique=True, show_progress=show_progress).rename(columns=newname_dict)

def journal_hindex(pub2journal, impact=None, colgroupby='JournalId', colcountby='Ctotal', show_progress=False):
    """
    Calculate the h-index for each journal.  See :cite:`hirsch2005index` for the original definition.

    The algorithmic implementation can be found in :py:func:`hindex.compute_hindex`.

    Parameters
    ----------
    :param pub2journal : DataFrame
        A DataFrame with the publication and journal information.

    :param impact : DataFrame, default None, Optional
        A DataFrame with the publication citation counts precalculated.  If None, then it is assumed that the citation counts are already in pub2journal.

    :param colgroupby : str, default 'JournalId', Optional
        The DataFrame column with Journal Ids.  If None then the database 'JournalId' is used.

    :param colcountby : str, default 'Ctotal', Optional
        The DataFrame column with Citation counts for each publication.  If None then the database 'Ctotal' is used.

    :param show_progress : bool, default False
        Show progress of the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'JournalId', 'Hindex'

    """
    if not impact is None:
        pub2journal = pub2journal.merge(impact[['PublicationId', colcountby]], on='PublicationId', how='left')

    if show_progress: print("Computing Journal H-index.")
    return compute_hindex(pub2journal, colgroupby=colgroupby, colcountby=colcountby, show_progress=show_progress)

def journal_impactfactor(pub, pub2ref, pub2year=None, citation_window=5, colgroupby='JournalId', show_progress=False):
    """
    Calculate the impact factor for a journal.

    Parameters
    ----------
    :param pub : DataFrame
        A DataFrame with the publication, journal, and year information.

    :param pub2ref : DataFrame
        A DataFrame with the citing-cited publication linkages.

    :param pub2year : dict, default None, Optional
        A dictionary mapping 'PublicationIds' to the publication year.  If None then the 'CitingYear' is assumed to be a column of pub2ref.

    :param citation_window : int, default 5, Optional
        The number of preceding years over which publications are counted towards the impact factor.

    :param colgroupby : str, default 'JournalId', Optional
        The DataFrame column with Journal Ids.  If None then the database 'JournalId' is used.

    :param show_progress : bool, default False
        Show progress of the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'JournalId', 'ImpactFactor{y}' where y is the citation_window size

    """
    # TODO: the impact factor calculation has not been implemented yet
    raise NotImplementedError("journal_impactfactor is not yet implemented.")
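
# The function above is only a stub.  A minimal sketch of the standard impact factor
# calculation (an editorial illustration, not the library's implementation; it assumes
# pub has columns ['PublicationId', 'JournalId', 'Year'] and pub2ref has columns
# ['CitingPublicationId', 'CitedPublicationId', 'CitingYear']):
def _impactfactor_sketch(pub, pub2ref, focus_year, citation_window=5):
    # publications that appeared in the preceding citation_window years
    recent = pub[pub['Year'].isin(range(focus_year - citation_window, focus_year))]
    # citations received in the focus year by those recent publications
    cites = pub2ref[pub2ref['CitingYear'] == focus_year].merge(
        recent[['PublicationId', 'JournalId']],
        left_on='CitedPublicationId', right_on='PublicationId', how='inner')
    ncites = cites.groupby('JournalId')['CitingPublicationId'].nunique()
    npubs = recent.groupby('JournalId')['PublicationId'].nunique()
    return (ncites.reindex(npubs.index, fill_value=0) / npubs).rename('ImpactFactor{}'.format(citation_window))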
-------------------------------------------------------------------------------- /pyscisci/methods/longtermimpact.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: longterm impact
    :synopsis: Set of functions for typical bibliometric citation analysis

.. moduleauthor:: Alex Gates
"""

import pandas as pd
import numpy as np

import scipy.optimize as spopt
import scipy.stats as spstats

from pyscisci.utils import zip2dict


def _fit_predicted_citations(publication_citations):

    recenter_time = np.sort(publication_citations['DateDelta'].values)

    def fit_f(x):
        return np.arange(1, len(recenter_time) + 1) - np.array([predicted_c(t, x[0], x[1], x[2]) for t in recenter_time])

    s, _ = spopt.leastsq(fit_f, x0=np.ones(3))
    return pd.Series(s)

def predicted_c(t, lam, mu, sig, m=30.):
    lognormt = (np.log(t) - mu) / sig
    return m * (np.exp(lam * spstats.norm.cdf(lognormt)) - 1.0)

def longterm_impact(pub2ref, colgroupby='CitedPublicationId', coldate='CitingYear', show_progress=True):
    """
    This function calculates the long-term scientific impact as introduced in :cite:`Wang2013longterm`.

    Following equation (3) from :cite:`Wang2013longterm`, the predicted cumulative citations are:
        c(t) = m * (e^{lam * Phi((ln t - mu) / sig)} - 1)
    where Phi is the cumulative distribution function of the standard normal distribution.
    """
    pub2ref = pub2ref.copy()

    if 'Year' in coldate:
        pub2ref['DateDelta'] = pub2ref.groupby(colgroupby, sort=False)[coldate].transform(lambda x: x-x.min())
    elif 'Date' in coldate:
        pub2ref['DateDelta'] = pub2ref.groupby(colgroupby, sort=False)[coldate].transform(lambda x: x-x.min()) / np.timedelta64(1,'D')
    else:
        raise ValueError("The coldate column must contain either 'Year' or 'Date'.")

    pub2ref = pub2ref.loc[pub2ref['DateDelta'] > 0]
    pub2ref.sort_values(by=['DateDelta'], inplace=True)

    newname_dict = zip2dict(list(range(4)), ['lam', 'mu', 'sig', 'm'])
    return pub2ref.groupby(colgroupby, sort=False).apply(_fit_predicted_citations).reset_index().rename(columns=newname_dict)
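
# Usage sketch (hypothetical citation table, one row per citation, giving the cited
# publication and the citing year; illustrative only):
if __name__ == '__main__':
    toy_pub2ref = pd.DataFrame({'CitedPublicationId': [1]*6,
                                'CitingYear': [2000, 2001, 2001, 2002, 2004, 2008]})
    params = longterm_impact(toy_pub2ref, colgroupby='CitedPublicationId', coldate='CitingYear')
    # -> one row per cited publication with the fitted ['lam', 'mu', 'sig'] parameters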
-------------------------------------------------------------------------------- /pyscisci/methods/netnormcite.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: network normalized citation index
    :synopsis: Set of functions for finding the network normalized citation index.

.. moduleauthor:: Alex Gates
"""
import sys
import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import isin_sorted

### Network Normalized Citation
def netnormcite_index(pub2ref, pub2year=None, focus_pub_ids=None, T=5, show_progress=False):
    """
    Calculate the network normalized citation index as first proposed in :cite:`Ke2023netnorm`.

    Parameters
    ----------

    :param pub2ref : DataFrame
        A DataFrame with the citation information for each Publication.

    :param pub2year : DataFrame, optional
        A DataFrame with the year of publication for each Publication if this is not already included in pub2ref.

    :param focus_pub_ids : numpy array
        A subset of publication ids to focus on for the citation index.
        Note, the full pub2ref is still required because we need to find the co-citation neighborhoods.

    :param T : int, default 5
        Number of years for the citation window, must be 1 or greater.

    show_progress : bool, default False
        Show calculation progress.

    Returns
    -------
    netnorm : DataFrame
        A DataFrame with the network normalized citation index for all (cited) publications or publications from the focus_pub_ids list.

    """
    if show_progress:
        print("Starting computation of network normalized index.")

    # attach each missing year column independently from pub2year
    if not 'CitedYear' in list(pub2ref):
        pub2ref = pub2ref.merge(pub2year, how='left', left_on='CitedPublicationId', right_on='PublicationId').rename(columns={'Year':'CitedYear'})
        del pub2ref['PublicationId']
    if not 'CitingYear' in list(pub2ref):
        pub2ref = pub2ref.merge(pub2year, how='left', left_on='CitingPublicationId', right_on='PublicationId').rename(columns={'Year':'CitingYear'})
        del pub2ref['PublicationId']

    if focus_pub_ids is None:
        yfocus_pubs = pub2ref[['CitedPublicationId', 'CitedYear']].drop_duplicates(keep='first')
    else:
        yfocus_pubs = pub2ref[isin_sorted(pub2ref['CitedPublicationId'].values, np.sort(focus_pub_ids))][['CitedPublicationId', 'CitedYear']].drop_duplicates(keep='first')


    reference_groups = pub2ref.groupby(['CitingPublicationId'], sort=False)['CitedPublicationId']
    def get_reference_groups(pid):
        try:
            return reference_groups.get_group(pid).values
        except KeyError:
            return np.array([])

    citation_groups = pub2ref.groupby(['CitingYear', 'CitedPublicationId'], sort=False)['CitingPublicationId']
    def get_citation_groups(pid, y):
        try:
            return citation_groups.get_group((y, pid)).values
        except KeyError:
            return np.array([])

    yearly_citation_counts = citation_groups.nunique()
    def get_yearly_ncites(y, pid):
        try:
            return yearly_citation_counts[(y, pid)]
        except KeyError:
            return 0


    def _netnorm_index(focusid, y):

        cnormt = 0
        for t in range(0, T+1):

            paper2y_cocite = {refid: get_yearly_ncites(y+t, refid) for citeid in get_citation_groups(focusid, y+t) for refid in get_reference_groups(citeid)}

            # the co-citation neighborhood doesn't include the focus publication
            cnorm_denom = sum(ncites for refid, ncites in paper2y_cocite.items() if refid != focusid)

            if cnorm_denom > 0 and len(paper2y_cocite) > 1:
                cnorm_denom = cnorm_denom / (len(paper2y_cocite) - 1)
                cnormt += get_yearly_ncites(y+t, focusid) / cnorm_denom

        return cnormt

    netnorm = [[focus_pub, yfocus, _netnorm_index(focus_pub, yfocus)] for focus_pub, yfocus
               in tqdm(yfocus_pubs.values, leave=True, desc='Network-normalized Citation', disable= not show_progress)]

    return pd.DataFrame(netnorm, columns=['PublicationId', 'CitedYear', 'Cnorm{}'.format(T)])
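
# Usage sketch (hypothetical citing-cited pairs; in practice pub2ref comes from one of
# the pyscisci database interfaces and pub2year supplies any missing years):
if __name__ == '__main__':
    toy_pub2ref = pd.DataFrame({'CitingPublicationId': [3, 3, 4, 4, 5],
                                'CitedPublicationId':  [1, 2, 1, 2, 1],
                                'CitingYear': [2001, 2001, 2002, 2002, 2003]})
    toy_pub2ref['CitedYear'] = 2000
    cnorm = netnormcite_index(toy_pub2ref, T=5)
    # -> columns ['PublicationId', 'CitedYear', 'Cnorm5']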
-------------------------------------------------------------------------------- /pyscisci/methods/pivotscore.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: pivotscore
    :synopsis: Set of functions for typical bibliometric citation analysis

.. moduleauthor:: Alex Gates
"""
import sys
import pandas as pd
import numpy as np

from pyscisci.utils import groupby_count, changepoint, pandas_cosine_similarity

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm


### Pivot Score
def pivot_score(pub2author, pub2ref, previous_k=None, year_window=None, show_progress=False):
    """
    Calculate the pivot score index as proposed in :cite:`Hill2022pivotpenalty`.

    None is returned for the first publication in an author's career (because there is no previous history), or when
    no other publications have been published within the year_window.

    Parameters
    ----------
    pub2author : dataframe
        The publication author linkages with Year.

    pub2ref : dataframe
        The citing-cited publication linkages which contains the CitedJournalId for the cited articles.

    previous_k : int, default None
        Only compare against the previous k publications.

    year_window : int, default None
        Only compare against publications from the last year_window years.

    show_progress : bool, default False
        Show calculation progress.

    Returns
    ----------
    pivotscore : DataFrame
        The pivotscore for each Author Publication.

    """
    if show_progress:
        print("Starting computation of pivot score.")

    pub2refjournalcounts = groupby_count(pub2ref, ['CitingPublicationId', 'CitedJournalId'],
                                         'CitedPublicationId', count_unique=True)
    pub2refjournalcounts.rename(columns={'CitedPublicationIdCount':'CitedJournalCount'}, inplace=True)

    pa_refs = pub2author.merge(pub2refjournalcounts, how='left', left_on='PublicationId', right_on='CitingPublicationId')
    del pa_refs['CitingPublicationId']

    pa_refs.dropna(inplace=True)
    pa_refs['CitedJournalId'] = pa_refs['CitedJournalId'].astype(int)
    pa_refs.sort_values(by=['AuthorId', 'Year', 'PublicationId', 'CitedJournalId'], inplace=True)
    pa_refs.reset_index(drop=True, inplace=True)

    pscore = pa_refs.groupby('AuthorId').apply(author_pivot,
                                               previous_k=previous_k, year_window=year_window).reset_index()
    del pscore['level_1']

    return pscore


def author_pivot(authordf, previous_k=None, year_window=None):

    pubgroups = authordf.groupby('PublicationId', sort=False)

    allpubidx = None
    if not previous_k is None:
        allpubidx = changepoint(authordf['PublicationId'].values)


    pivotresults = []

    def publication_pivot(pubgroup):
        pubidx = pubgroup.index[0]
        pid = pubgroup.name
        if pubidx == 0: pivotresults.append([pid, None])
        else:
            i = len(pivotresults)
            if not previous_k is None and i > previous_k:
                history = authordf.iloc[allpubidx[i-previous_k]:pubidx]
            else:
                history = authordf.iloc[:pubidx]

            if not year_window is None:
                history = history[history['Year'] >= pubgroup['Year'].values[0] - year_window]

            if history.shape[0] > 0:
                history = history.groupby('CitedJournalId', sort=False, as_index=False)['CitedJournalCount'].sum()

                cosine = pandas_cosine_similarity(history, pubgroup, col_key='CitedJournalId', col_values='CitedJournalCount')

                pivotresults.append([pid, cosine])
            else:
                pivotresults.append([pid, None])

    pubgroups.apply(publication_pivot)

    return pd.DataFrame(pivotresults, columns=['PublicationId', 'PivotScore'])
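
# Usage sketch (hypothetical toy tables; CitedJournalId must already be attached to
# the cited side of pub2ref, as the docstring above requires):
if __name__ == '__main__':
    toy_pub2author = pd.DataFrame({'AuthorId': [1, 1], 'PublicationId': [10, 11], 'Year': [2000, 2001]})
    toy_pub2ref = pd.DataFrame({'CitingPublicationId': [10, 10, 11],
                                'CitedPublicationId': [90, 91, 92],
                                'CitedJournalId': [7, 8, 7]})
    print(pivot_score(toy_pub2author, toy_pub2ref))
    # -> PivotScore is None for publication 10 (no history); for publication 11 it is the
    #    cosine similarity between the cited-journal profiles of publications 10 and 11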
-------------------------------------------------------------------------------- /pyscisci/methods/productivitytrajectory.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: productivitytrajectory
    :synopsis: Calculate the productivity trajectory.

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

import scipy.optimize as spopt

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import zip2dict


### Productivity Trajectory

def piecewise_linear(x, x_break, b, m1, m2):
    """
    A piecewise linear function:
        x <= x_break: y = b + m1*(x - x_break)
        x  > x_break: y = b + m2*(x - x_break)
    """
    return np.piecewise(x, [x <= x_break], [lambda x: m1*x + b-m1*x_break, lambda x: m2*x + b-m2*x_break])

def fit_piecewise_linear(xvalues, yvalues):
    m0 = (yvalues.max()-yvalues.min())/(xvalues.max() - xvalues.min())
    p0 = [np.median(xvalues), yvalues.mean(), m0, m0]
    p, e = spopt.curve_fit(piecewise_linear, xvalues, yvalues, p0=p0)
    return pd.Series(p)


def _fit_piecewise_lineardf(author, args):
    return fit_piecewise_linear(author[args[0]].values, author[args[1]].values)

def yearly_productivity_traj(df, colgroupby='AuthorId', colx='Year', coly='YearlyProductivity'):
    """
    Calculate the piecewise linear yearly productivity trajectory originally studied in :cite:`way2017misleading`.

    """

    newname_dict = zip2dict(list(range(4)), ['t_break', 'b', 'm1', 'm2'])
    return df.groupby(colgroupby, sort=False).apply(_fit_piecewise_lineardf, args=(colx, coly)).reset_index().rename(columns=newname_dict)
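
# Usage sketch: recover a known two-slope trajectory from synthetic yearly counts
# (purely illustrative data):
if __name__ == '__main__':
    years = np.arange(1990, 2010, dtype=float)
    prod = piecewise_linear(years, 2000., 10., 1., -0.5) + np.random.normal(0, 0.1, years.shape[0])
    print(fit_piecewise_linear(years, prod))
    # -> approximately [2000, 10, 1, -0.5] = [t_break, b, m1, m2]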
-------------------------------------------------------------------------------- /pyscisci/methods/publication.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: publicationmetrics
    :synopsis: Set of functions for the bibliometric analysis of publications

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import rank_array, check4columns, groupby_count, zip2dict

from pyscisci.methods.raostirling import *
from pyscisci.methods.diversity import *
from pyscisci.methods.creditshare import *
from pyscisci.methods.disruption import *
from pyscisci.methods.longtermimpact import *
from pyscisci.methods.sleepingbeauty import *
from pyscisci.methods.pivotscore import *
from pyscisci.methods.novelty import *
from pyscisci.methods.netnormcite import *

def citation_rank(df, colgroupby='Year', colrankby='C10', ascending=True, normed=False, show_progress=False):
    """
    Rank publications by the number of citations, from 0 (smallest) to N-1 (largest).

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the citation information for each Publication.

    colgroupby : str, list
        The DataFrame column(s) to subset by.

    colrankby : str
        The DataFrame column to rank by.

    ascending : bool, default True
        Sort ascending vs. descending.

    normed : bool, default False
        - False : rank is from 0 to N-1
        - True : rank is from 0 to 1

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        The original dataframe with a new column for rank: colrankby+"Rank"

    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Citation Rank', disable= not show_progress)

    df[str(colrankby)+"Rank"] = df.groupby(colgroupby)[colrankby].progress_transform(lambda x: rank_array(x, ascending, normed))
    return df

def publication_beauty(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', show_progress=False):
    """
    Calculate the sleeping beauty coefficient and awakening time for each cited publication.  See :cite:`ke2015beauty` for the derivation.

    The algorithmic implementation can be found in :py:func:`sleepingbeauty.beauty_coefficient`.

    Parameters
    ----------
    pub2ref : DataFrame, default None, Optional
        A DataFrame with the temporal citing information.

    colgroupby : str, default 'CitedPublicationId', Optional
        The DataFrame column with cited Publication Ids.  If None then the database 'CitedPublicationId' is used.

    colcountby : str, default 'CitingPublicationId', Optional
        The DataFrame column with citing Publication Ids.  If None then the database 'CitingPublicationId' is used.

    Returns
    -------
    DataFrame
        DataFrame with 3 columns: 'CitedPublicationId', 'BeautyCoefficient', 'Awakening'

    """

    check4columns(pub2ref, ['CitedPublicationId', 'CitingPublicationId', 'CitingYear'])

    tqdm.pandas(desc='Beauty', disable= not show_progress)

    df = groupby_count(pub2ref, colgroupby=['CitedPublicationId', 'CitingYear'], colcountby='CitingPublicationId', count_unique=True)

    # delegate to the yearly-citation implementation in the sleepingbeauty module
    return compute_sleepingbeauty(df, colgroupby=colgroupby, colcountby=str(colcountby)+'Count', coldate='CitingYear', show_progress=show_progress)
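
# Usage sketch for citation_rank (hypothetical citation counts):
if __name__ == '__main__':
    toy = pd.DataFrame({'Year': [2000, 2000, 2000, 2001, 2001],
                        'C10': [1, 5, 3, 2, 8]})
    ranked = citation_rank(toy, colgroupby='Year', colrankby='C10')
    # -> adds a 'C10Rank' column with the within-year rank of each publication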
-------------------------------------------------------------------------------- /pyscisci/methods/qfactor.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: qfactor
    :synopsis: Calculate the qfactor.

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import zip2dict


## Q-factor
def qfactor(a):
    """
    This function calculates the Q-factor for an author.  See :cite:`Sinatra2016individual` for details.

    """

    return np.exp(np.mean(np.log(a[a>0])))

def compute_qfactor(df, colgroupby, colcountby, show_progress=False):
    """
    Calculate the Q-factor for each group in the DataFrame.  See :cite:`Sinatra2016individual` for the definition.

    The algorithmic implementation for each author can be found in :py:func:`qfactor.qfactor`.

    Parameters
    ----------
    :param df : DataFrame
        A DataFrame with the citation information for each Author.

    :param colgroupby : str
        The DataFrame column with Author Ids.

    :param colcountby : str
        The DataFrame column with Citation counts for each publication.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: colgroupby, colgroupby+'Qfactor'

    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Qfactor', disable= not show_progress)

    newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Qfactor']*2)
    return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(qfactor).to_frame().reset_index().rename(columns=newname_dict)
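
# Worked example: as implemented above, the Q-factor is the geometric mean of an
# author's positive citation counts (toy numbers):
if __name__ == '__main__':
    toy = pd.DataFrame({'AuthorId': [1, 1, 1, 2, 2],
                        'C10': np.array([10., 1., 100., 5., 5.])})
    print(compute_qfactor(toy, colgroupby='AuthorId', colcountby='C10'))
    # author 1: exp(mean(log([10, 1, 100]))) = 10; author 2: 5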
-------------------------------------------------------------------------------- /pyscisci/methods/sleepingbeauty.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: sleepingbeauty
    :synopsis: Calculate the sleeping beauty coefficient.

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import zip2dict


## Beauty-coefficient
def beauty_coefficient(c):
    """
    This function calculates the sleeping beauty coefficient and awakening time for a publication.  See :cite:`ke2015beauty` for details.

    Parameters
    ----------
    c : numpy array
        The yearly citation counts for the publication.

    Returns
    ----------
    B : float
        Sleeping Beauty Coefficient

    t_a : int
        The awakening time

    """
    c = c.values
    t_m = np.argmax(c)
    B_denom = c.copy()  # copy so that zeroing the denominator does not modify the citation counts themselves
    B_denom[c == 0] = 1

    # :cite:`ke2015beauty` eq 1/2
    l_t = ((c[t_m] - c[0])/t_m * np.arange(c.shape[0]) + c[0] - c)/B_denom

    # :cite:`ke2015beauty` eq 2
    B = l_t[:(t_m+1)].sum()

    d_denom = np.sqrt((c[t_m] - c[0])**2 + t_m**2)
    d_t = np.abs((c[t_m] - c[0]) * np.arange(c.shape[0]) + t_m * (c[0] - c)) / d_denom

    # :cite:`ke2015beauty` eq 3
    t_a = np.argmax(d_t[:(t_m+1)])

    return pd.Series([B, t_a], index=['BeautyCoefficient', 'Awakening'])

def compute_sleepingbeauty(df, colgroupby, colcountby, coldate='Year', show_progress=False):
    """
    Calculate the sleeping beauty coefficient and awakening time for each group in the DataFrame.  See :cite:`ke2015beauty` for details.

    The algorithmic implementation for each publication can be found in :py:func:`sleepingbeauty.beauty_coefficient`.

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the citation information for each publication in each year.

    colgroupby : str
        The DataFrame column with Publication Ids.

    colcountby : str
        The DataFrame column with Citation counts for each publication.

    coldate : str
        The DataFrame column with Year information.

    Returns
    -------
    DataFrame
        DataFrame with 3 columns: colgroupby, 'BeautyCoefficient' and 'Awakening'

    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Beauty', disable= not show_progress)

    def fill_missing_dates(subdf):
        # keep the group id on the filled rows and fill missing citation counts with 0
        subdf = subdf.set_index(coldate).reindex(np.arange(subdf[coldate].min(), subdf[coldate].max()+1)).fillna({colgroupby: subdf[colgroupby].iloc[0], colcountby: 0}).reset_index()
        return subdf

    # first fill in missing dates
    df = df.groupby(colgroupby, sort=False, group_keys=False).apply(fill_missing_dates)

    # get start year
    syear = df.groupby(colgroupby, sort=False)[coldate].min()

    # now find the beauty coefficient and awakening year
    beauty = df.groupby(colgroupby, sort=False)[colcountby].progress_apply(beauty_coefficient).unstack(1).reset_index()

    # translate the awakening from index to year
    beauty['Awakening'] = [a+syear[pid] for pid, a in beauty[[colgroupby, 'Awakening']].values]

    return beauty
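
# Worked example: a publication that sleeps for a decade and then wakes up
# (fabricated yearly citation counts):
if __name__ == '__main__':
    toy = pd.DataFrame({'PublicationId': [1]*12,
                        'Year': list(range(2000, 2012)),
                        'YearlyCitations': [1, 0, 1, 0, 0, 1, 0, 0, 1, 5, 20, 50]})
    print(compute_sleepingbeauty(toy, colgroupby='PublicationId', colcountby='YearlyCitations', coldate='Year'))
    # -> a large BeautyCoefficient and an Awakening year near 2009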
-------------------------------------------------------------------------------- /pyscisci/sparsenetworkutils.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: sparsenetworkutils
    :synopsis: The main Network class

.. moduleauthor:: Alex Gates
"""
import sys
import numpy as np
import pandas as pd

import scipy.sparse as spsparse

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

def threshold_network(adj_mat, threshold=0):
    """
    Remove all edges with weight less than or equal to the threshold from the adjacency matrix.
    """
    if adj_mat.getformat() != 'coo':
        adj_mat = spsparse.coo_matrix(adj_mat)

    adj_mat.data[adj_mat.data <= threshold] = 0
    adj_mat.eliminate_zeros()

    return adj_mat

def largest_connected_component_vertices(adj_mat):
    """
    Return the vertex indices belonging to the largest connected component.
    """
    n_components, labels = spsparse.csgraph.connected_components(adj_mat)
    comidx, compsizes = np.unique(labels, return_counts=True)

    return np.arange(adj_mat.shape[0])[labels == np.argmax(compsizes)]


def dataframe2bipartite(df, rowname, colname, shape=None, weightname=None):

    if shape is None:
        shape = (int(df[rowname].max()+1), int(df[colname].max()+1))

    if weightname is None:
        weights = np.ones(df.shape[0], dtype=int)
    else:
        weights = df[weightname].values

    # create a bipartite adj matrix connecting authors to their publications
    bipartite_adj = spsparse.coo_matrix((weights,
                                         (df[rowname].values, df[colname].values)),
                                        shape=shape, dtype=weights.dtype)

    bipartite_adj.sum_duplicates()

    return bipartite_adj

def project_bipartite_mat(bipartite_adj, project_to='row'):

    if project_to == 'row':
        adj_mat = bipartite_adj.dot(bipartite_adj.T).tocoo()
    elif project_to == 'col':
        adj_mat = bipartite_adj.T.dot(bipartite_adj).tocoo()

    return adj_mat

def extract_multiscale_backbone(Xs, alpha):
    """
    A sparse matrix implementation of the multiscale backbone :cite:`Serrano2009backbone`.

    Parameters
    ----------
    :param Xs : numpy.array or sp.sparse matrix
        The adjacency matrix for the network.

    :param alpha : float
        The significance value.

    Returns
    -------
    coo_matrix
        The directed, weighted multiscale backbone

    """

    X = spsparse.coo_matrix(Xs)
    X.eliminate_zeros()

    # normalize
    row_sums = X.sum(axis=1)
    degrees = X.getnnz(axis=1)


    pijs = np.multiply(X.data, 1.0/np.array(row_sums[X.row]).squeeze())
    powers = degrees[X.row.squeeze()] - 1

    # eq. 2 of :cite:`Serrano2009backbone`: alpha_ij = 1 - (k-1) * int_0^{p_ij} (1-x)^(k-2) dx = (1 - p_ij)^(k-1) for k > 1
    significance = np.logical_and(pijs < 1, np.power(1.0 - pijs, powers) < alpha)

    keep_graph = spsparse.coo_matrix((X.data[significance], (X.row[significance], X.col[significance])), shape=X.shape)
    keep_graph.eliminate_zeros()

    return keep_graph

def sparse_pagerank_scipy(adjmat, alpha=0.85, personalization=None, initialization=None,
                          max_iter=100, tol=1.0e-6, dangling=None):

    """
    Pagerank for sparse matrices using the power method
    """

    N, _ = adjmat.shape
    assert(adjmat.shape == (N, N))

    if N == 0:
        return np.array([])

    out_strength = np.array(adjmat.sum(axis=1)).flatten()
    out_strength[out_strength != 0] = 1.0 / out_strength[out_strength != 0]

    Q = spsparse.spdiags(out_strength.T, 0, *adjmat.shape, format='csr')
    adjmat = Q * adjmat

    # initial vector
    if initialization is None:
        x = np.repeat(1.0 / N, N)

    else:
        x = initialization / initialization.sum()

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = personalization / personalization.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        dangling_weights = dangling / dangling.sum()

    is_dangling = np.where(out_strength == 0)[0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = alpha * (x * adjmat + sum(x[is_dangling]) * dangling_weights) + \
            (1 - alpha) * p
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return x

    print('power iteration failed to converge in %d iterations.' % max_iter)
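
# Usage sketch for sparse_pagerank_scipy (a toy directed 3-cycle; illustrative only):
#
#     A = spsparse.coo_matrix(np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=float))
#     pr = sparse_pagerank_scipy(A, alpha=0.85)
#     # -> array of 3 PageRank scores; by symmetry each is approximately 1/3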


def sparse_eigenvector_centrality_scipy(adjmat, max_iter=100, tol=1.0e-6, initialization=None):

    adjmat = spsparse.csr_matrix(adjmat)

    N, _ = adjmat.shape
    assert(adjmat.shape == (N, N))

    if N == 0:
        return np.array([])

    # initial vector
    if initialization is None:
        x = np.repeat(1.0 / N, N)

    else:
        x = initialization / initialization.sum()


    # make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        # do the multiplication y^T = x^T A (left eigenvector)
        x = xlast*adjmat

        norm = np.sqrt(np.square(x).sum()) or 1
        x = x/norm

        # Check for convergence (in the L_1 norm).
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return x
    print('power iteration failed to converge in %d iterations.' % max_iter)
-------------------------------------------------------------------------------- /pyscisci/tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/pyscisci/tests/__init__.py
-------------------------------------------------------------------------------- /pyscisci/tests/test_utils.py: --------------------------------------------------------------------------------
from pyscisci.utils import load_int


def test_load_int():
    assert load_int(1) == 1
    assert load_int("") is None
    assert load_int("x") is None
-------------------------------------------------------------------------------- /pyscisci/visualization.py: --------------------------------------------------------------------------------

import datetime
import numpy as np

import matplotlib.pylab as plt
import matplotlib.colors as colors

from pyscisci.methods.productivitytrajectory import piecewise_linear

def career_impacttimeline(impact, datecol='Date', impactcol='Ctotal', fill_color='orange', hot_streak_info=None,
                          edge_color='k', streak_color='firebrick', ax=None):

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))

    if isinstance(datecol, str):
        dates = impact[datecol].values
    elif isinstance(datecol, np.ndarray):
        dates = datecol

    if isinstance(impactcol, str):
        cites = impact[impactcol].values
    elif isinstance(impactcol, np.ndarray):
        cites = impactcol

    for d, c in zip(dates, cites):
        if isinstance(d, str):
            if 'T' in d:
                d = datetime.datetime.strptime(d.split('T')[0], '%Y-%m-%d')
            else:
                d = datetime.datetime.strptime(d.split(' ')[0], '%Y-%m-%d')

        ax.plot([d]*2, [0, c], c='k', lw=0.5)
        ax.scatter(d, c, color=fill_color, edgecolor=edge_color, linewidth=0.5, zorder=100)

    if not hot_streak_info is None:
        nstreaks = 2 - hot_streak_info[:3].isnull().sum()

        if nstreaks == 1:
            gamma0, gamma1 = 10**(hot_streak_info[:2])
            streak_start, streak_end = hot_streak_info[3:5].astype(int)
            ax.plot(datecol[:streak_start], [gamma0]*streak_start, c=streak_color)
            ax.plot(datecol[streak_start:(streak_end+1)], [gamma1]*(streak_end-streak_start+1), c=streak_color)
            ax.plot(datecol[(streak_end+1):], [gamma0]*(datecol.shape[0]-streak_end-1), c=streak_color)

    return ax


def career_productivitytimeline(yearlyprod, productivity_trajectory=None, datecol='Year', fill_color='blue', ax=None):

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))

    ax.bar(yearlyprod[datecol].values, yearlyprod['YearlyProductivity'].values, color=fill_color)

    if not productivity_trajectory is None:
        t_break, b, m1, m2 = productivity_trajectory[['t_break', 'b', 'm1', 'm2']].values[0]

        ts = np.arange(yearlyprod[datecol].min(), yearlyprod[datecol].max()+1)
        ax.plot(ts, piecewise_linear(ts, t_break, b, m1, m2), color='black')

    return ax

def hex2rgb(value):
    value = value.lstrip('#')
    lv = len(value)
    return tuple(int(value[i:i + lv // 3], 16)/255. for i in range(0, lv, lv // 3))

def hex2rgba(value, alpha=1):
    return hex2rgb(value) + (alpha,)

class MidpointNormalize(colors.Normalize):
    """
    Normalize the colorbar so that diverging bars work their way either side from a prescribed midpoint value.

    e.g. im=ax1.imshow(array, norm=MidpointNormalize(midpoint=0., vmin=-100, vmax=100))
    """
    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        colors.Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # I'm ignoring masked values and all kinds of edge cases to make a
        # simple example...
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y), np.isnan(value))
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8


from setuptools import setup, find_packages
from pyscisci import __package__, __description__, __version__


setup(name=__package__,
      version=__version__,
      description='Science of Science',
      long_description=__description__,
      classifiers=[
          'Development Status :: 4 - Beta',
          'License :: OSI Approved :: MIT License',
          'Programming Language :: Python :: 3.10',
          'Intended Audience :: Science/Research',
          'Topic :: Scientific/Engineering :: Information Analysis',
      ],
      keywords=["science of science", "citation network", 'bibliometric'],
      url="https://github.com/ajgates42/pyscisci",
      author = 'Alex Gates ',
      license="MIT",
      packages = find_packages(),
      install_requires=[
          'pandas',
          'numpy',
          'scipy',
          'scikit-learn',
          'nameparser',
          'lxml',
          'requests',
          'unidecode',
          'tqdm',
          'dask',
          'numba'
      ],
      extras_require = {
          'nlp': ["sparse_dot_topn", "python-Levenshtein"],
          'hdf': ['tables']},
      include_package_data=True,
      zip_safe=False
      )
--------------------------------------------------------------------------------
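
# Installation sketch (assuming the published PyPI name matches the repository name):
#     pip install pyscisci            # core dependencies listed in install_requires above
#     pip install pyscisci[nlp]       # adds sparse_dot_topn and python-Levenshtein
#     pip install pyscisci[hdf]       # adds PyTables for HDF5 storage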