├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── Pipfile ├── README.md ├── docs ├── Makefile ├── about.rst ├── conf.py ├── databases.rst ├── examples.rst ├── html │ ├── .buildinfo │ ├── .doctrees │ │ ├── about.doctree │ │ ├── databases.doctree │ │ ├── environment.pickle │ │ ├── examples.doctree │ │ ├── index.doctree │ │ ├── installation.doctree │ │ ├── methods.doctree │ │ ├── network.doctree │ │ └── utils.doctree │ ├── _sources │ │ ├── about.rst.txt │ │ ├── databases.rst.txt │ │ ├── examples.rst.txt │ │ ├── index.rst.txt │ │ ├── installation.rst.txt │ │ ├── methods.rst.txt │ │ ├── network.rst.txt │ │ └── utils.rst.txt │ ├── _static │ │ ├── _sphinx_javascript_frameworks_compat.js │ │ ├── alabaster.css │ │ ├── basic.css │ │ ├── custom.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── jquery-3.6.0.js │ │ ├── jquery.js │ │ ├── language_data.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── sphinx_highlight.js │ │ ├── underscore-1.13.1.js │ │ └── underscore.js │ ├── about.html │ ├── databases.html │ ├── examples.html │ ├── genindex.html │ ├── index.html │ ├── installation.html │ ├── methods.html │ ├── network.html │ ├── objects.inv │ ├── py-modindex.html │ ├── search.html │ ├── searchindex.js │ └── utils.html ├── index.rst ├── installation.rst ├── make.bat ├── methods.rst ├── network.rst ├── references.bib ├── requirements.txt └── utils.rst ├── examples ├── Getting_Started │ ├── DatabaseGrowth.pdf │ ├── Getting Started with APS.ipynb │ ├── Getting Started with DBLP.ipynb │ ├── Getting Started with MAG.ipynb │ ├── Getting Started with OpenAlex.ipynb │ ├── Getting Started with PubMed.ipynb │ ├── Getting Started with WOS.ipynb │ ├── Getting Started with a Custom DB.ipynb │ ├── Getting Started with a Dask Example.ipynb │ ├── The Growth of Science.ipynb │ └── Working from a publication list.ipynb ├── GlobalCitationNetwork │ ├── GlobalCitationNetwork_analysis.ipynb │ ├── GlobalCitationNetwork_dataprep.ipynb │ ├── data │ │ ├── Chengetal_idealist.csv.gz │ │ ├── diffusion_panel_data_1990_2017.csv.gz │ │ ├── link_prediction_panel_data_1990_2017.csv.gz │ │ ├── oa_country_productivity.csv │ │ └── oa_countrycites_nosameorg_auc.csv.gz │ └── vizutils.py ├── Method_Examples │ ├── Comparing pandas, spark and dask-- number of citations.ipynb │ ├── Example Career Analysis.ipynb │ ├── Example Career Topic Switching.ipynb │ ├── Example Novelty.ipynb │ ├── Example Publication Citations.ipynb │ ├── Example Reference Strength.ipynb │ ├── Example of Credit Allocation.ipynb │ └── Example of Interdisciplinarity.ipynb ├── NLP_Examples │ ├── CoWordMention.html │ ├── Example of Coword Mention Network.ipynb │ └── Example_Node2vec (umap,sem_axis).ipynb ├── Network_Examples │ ├── DeSollaPriceCarrerCitations.pdf │ ├── DiversityCocitiation.pdf │ ├── Example of Cocitation Network.ipynb │ ├── Example of Diffusion of Scientific Credit.ipynb │ ├── NetworkNormalizedCitationAPS.ipynb │ ├── StirlingCocitiation.pdf │ └── Three Databases and Career Topic Switching.ipynb ├── ScienceOfScienceTextbook │ ├── Chapter 0 Preparing PySciSci.ipynb │ ├── Chapter 01 Productivity of a Scientist.ipynb │ ├── Chapter 02 The h-index.ipynb │ ├── Chapter 05 Random Impact Rule.ipynb │ ├── Chapter 06 The Q-Factor.ipynb │ ├── Chapter 08 The Increasing Dominance of Teams in Science.ipynb │ ├── Chapter 10 Coauthorship Networks.ipynb │ └── Chapter 14 Credit Allocation.ipynb ├── example_data │ ├── DeSollaPriceCareer.csv │ ├── 
fenn_paa.csv │ ├── fenn_pub2ref.csv │ └── focus_publications_example.csv └── example_interactive_html │ ├── sem_axis.html │ └── umap_fig.html ├── pyscisci ├── __init__.py ├── all.py ├── database.py ├── datasource │ ├── APS.py │ ├── CustomDB.py │ ├── DBLP.py │ ├── MAG.py │ ├── OpenAlex.py │ ├── PubMed.py │ ├── WOS.py │ ├── __init__.py │ └── readwrite.py ├── embedding.py ├── filter.py ├── methods │ ├── __init__.py │ ├── author.py │ ├── careertopics.py │ ├── cindex.py │ ├── creditshare.py │ ├── diffusionscientificcredit.py │ ├── disruption.py │ ├── diversity.py │ ├── hindex.py │ ├── hotstreak.py │ ├── journal.py │ ├── longtermimpact.py │ ├── netnormcite.py │ ├── novelty.py │ ├── pivotscore.py │ ├── productivitytrajectory.py │ ├── publication.py │ ├── qfactor.py │ ├── raostirling.py │ ├── referencestrength.py │ ├── sleepingbeauty.py │ └── topicsimilarity.py ├── network.py ├── nlp.py ├── sparsenetworkutils.py ├── tests │ ├── __init__.py │ └── test_utils.py ├── utils.py └── visualization.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # DS_Store 7 | .DS_Store 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: 14 | - pdf 15 | 16 | # Optionally set the version of Python and requirements required to build your docs 17 | python: 18 | install: 19 | - requirements: docs/requirements.txt 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - 3.7 5 | - 3.8 6 | 7 | install: 8 | - pip install pytest 9 | - pip install . 10 | 11 | script: 12 | - py.test 13 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | 6 | Please note we have a code of conduct, please follow it in all your interactions with the project. 7 | 8 | ## Pull Request Process 9 | 10 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a 11 | build. 12 | 2. Update the README.md with details of changes to the interface, this includes new environment 13 | variables, exposed ports, useful file locations and container parameters. 14 | 3. Increase the version numbers in any examples files and the README.md to the new version that this 15 | Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/). 16 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you 17 | do not have permission to do that, you may request the second reviewer to merge it for you. 18 | 19 | ## Code of Conduct 20 | 21 | ### Our Pledge 22 | 23 | In the interest of fostering an open and welcoming environment, we as 24 | contributors and maintainers pledge to making participation in our project and 25 | our community a harassment-free experience for everyone, regardless of age, body 26 | size, disability, ethnicity, gender identity and expression, level of experience, 27 | nationality, personal appearance, race, religion, or sexual identity and 28 | orientation. 
29 | 30 | ### Our Standards 31 | 32 | Examples of behavior that contributes to creating a positive environment 33 | include: 34 | 35 | * Using welcoming and inclusive language 36 | * Being respectful of differing viewpoints and experiences 37 | * Gracefully accepting constructive criticism 38 | * Focusing on what is best for the community 39 | * Showing empathy towards other community members 40 | 41 | Examples of unacceptable behavior by participants include: 42 | 43 | * The use of sexualized language or imagery and unwelcome sexual attention or 44 | advances 45 | * Trolling, insulting/derogatory comments, and personal or political attacks 46 | * Public or private harassment 47 | * Publishing others' private information, such as a physical or electronic 48 | address, without explicit permission 49 | * Other conduct which could reasonably be considered inappropriate in a 50 | professional setting 51 | 52 | ### Our Responsibilities 53 | 54 | Project maintainers are responsible for clarifying the standards of acceptable 55 | behavior and are expected to take appropriate and fair corrective action in 56 | response to any instances of unacceptable behavior. 57 | 58 | Project maintainers have the right and responsibility to remove, edit, or 59 | reject comments, commits, code, wiki edits, issues, and other contributions 60 | that are not aligned to this Code of Conduct, or to ban temporarily or 61 | permanently any contributor for other behaviors that they deem inappropriate, 62 | threatening, offensive, or harmful. 63 | 64 | ### Scope 65 | 66 | This Code of Conduct applies both within project spaces and in public spaces 67 | when an individual is representing the project or its community. Examples of 68 | representing a project or community include using an official project e-mail 69 | address, posting via an official social media account, or acting as an appointed 70 | representative at an online or offline event. Representation of a project may be 71 | further defined and clarified by project maintainers. 72 | 73 | ### Enforcement 74 | 75 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 76 | reported by contacting the project team at [INSERT EMAIL ADDRESS]. All 77 | complaints will be reviewed and investigated and will result in a response that 78 | is deemed necessary and appropriate to the circumstances. The project team is 79 | obligated to maintain confidentiality with regard to the reporter of an incident. 80 | Further details of specific enforcement policies may be posted separately. 81 | 82 | Project maintainers who do not follow or enforce the Code of Conduct in good 83 | faith may face temporary or permanent repercussions as determined by other 84 | members of the project's leadership. 85 | 86 | ### Attribution 87 | 88 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 89 | available at [http://contributor-covenant.org/version/1/4][version] 90 | 91 | [homepage]: http://contributor-covenant.org 92 | [version]: http://contributor-covenant.org/version/1/4/ -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to ``pySciSci`` contributing guide 2 | 3 | Thank you for investing your time in contributing to our project! 4 | 5 | Read our [Code of Conduct](./CODE_OF_CONDUCT.md) to keep our community approachable and respectable. 
6 | 7 | In this guide you will get an overview of the contribution workflow from opening an issue, creating a PR, reviewing, and merging the PR. 8 | 9 | ## New contributor guide 10 | 11 | To get an overview of the project, read the [README](README.md). Here are some resources to help you get started with open source contributions: 12 | 13 | - [Finding ways to contribute to open source on GitHub](https://docs.github.com/en/get-started/exploring-projects-on-github/finding-ways-to-contribute-to-open-source-on-github) 14 | - [Set up Git](https://docs.github.com/en/get-started/quickstart/set-up-git) 15 | - [GitHub flow](https://docs.github.com/en/get-started/quickstart/github-flow) 16 | - [Collaborating with pull requests](https://docs.github.com/en/github/collaborating-with-pull-requests) 17 | 18 | 19 | ## Getting started 20 | 21 | ### Requests 22 | 23 | The field of Science of Science is constantly evolving. If you see a feature that should be supported by the package, open an issue and label it as an enhancement. 24 | 25 | ### Issues 26 | 27 | #### Create a new issue 28 | 29 | If you spot a problem with the package, [search if an issue already exists](https://docs.github.com/en/github/searching-for-information-on-github/searching-on-github/searching-issues-and-pull-requests#search-by-the-title-body-or-comments). 30 | 31 | #### Solve an issue 32 | 33 | Scan through our [existing issues](https://github.com/SciSciCollective/pyscisci/issues) to find one that interests you. You can narrow down the search using `labels` as filters. As a general rule, we don’t assign issues to anyone. If you find an issue to work on, you are welcome to open a PR with a fix. 34 | 35 | ### Make Changes 36 | 37 | 38 | #### Make changes in a codespace 39 | 40 | For more information about using a codespace for working on GitHub documentation, see "[Working in a codespace](https://github.com/github/docs/blob/main/contributing/codespace.md)." 41 | 42 | #### Make changes locally 43 | 44 | 1. Fork the repository. 45 | - Using GitHub Desktop: 46 | - [Getting started with GitHub Desktop](https://docs.github.com/en/desktop/installing-and-configuring-github-desktop/getting-started-with-github-desktop) will guide you through setting up Desktop. 47 | - Once Desktop is set up, you can use it to [fork the repo](https://docs.github.com/en/desktop/contributing-and-collaborating-using-github-desktop/cloning-and-forking-repositories-from-github-desktop)! 48 | 49 | - Using the command line: 50 | - [Fork the repo](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo#fork-an-example-repository) so that you can make your changes without affecting the original project until you're ready to merge them. 51 | 52 | 2. Create a working branch and start with your changes! 53 | 54 | ### Commit your update 55 | 56 | Commit the changes once you are happy with them. 57 | 58 | ### Pull Request 59 | 60 | When you're finished with the changes, create a pull request, also known as a PR. 61 | - Fill the "Ready for review" template so that we can review your PR. This template helps reviewers understand your changes as well as the purpose of your pull request. 62 | - Don't forget to [link PR to issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) if you are solving one.
63 | - Enable the checkbox to [allow maintainer edits](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/allowing-changes-to-a-pull-request-branch-created-from-a-fork) so the branch can be updated for a merge. 64 | Once you submit your PR, a ``pySciSci`` team member will review your proposal. We may ask questions or request additional information. 65 | - We may ask for changes to be made before a PR can be merged, either using [suggested changes](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/incorporating-feedback-in-your-pull-request) or pull request comments. You can apply suggested changes directly through the UI. You can make any other changes in your fork, then commit them to your branch. 66 | - As you update your PR and apply changes, mark each conversation as [resolved](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/commenting-on-a-pull-request#resolving-conversations). 67 | - If you run into any merge issues, check out this [git tutorial](https://github.com/skills/resolve-merge-conflicts) to help you resolve merge conflicts and other issues. 68 | 69 | ### Your PR is merged! 70 | 71 | Congratulations :tada::tada: The ``pySciSci`` team thanks you :sparkles:. 72 | 73 | Now that you are part of the ``pySciSci`` community, add yourself to the ``pySciSci`` readme. 74 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Alexander Gates 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | 8 | [dev-packages] 9 | 10 | [requires] 11 | python_version = "3.8" 12 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = .
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | About 2 | =================== 3 | The Science of Science (SciSci) is a growing field at the boundary of sociology, network science, and computational social science :cite:p:`Fortunato2018scisci`. It encompasses diverse interdisciplinary research programs that study the processes underlying science :cite:`Wang2021scisci`. The field has benefited greatly from access to massive digital databases containing the products of scientific discourse---including publications, journals, patents, books, conference proceedings, and grants. The subsequent proliferation of mathematical models and computational techniques for quantifying the dynamics of innovation and success in science has made it difficult to disentangle universal scientific processes from those dependent on specific databases, data-processing decisions, field practices, etc. 4 | 5 | 6 | Here we present *pySciSci* for the analysis of large-scale bibliometric data. The package standardizes access to many of the most common datasets in SciSci and provides efficient implementations of common and advanced analytical techniques. The *pySciSci* package is intended for researchers of SciSci or those who wish to integrate large-scale bibliometric data into other existing projects. 7 | 8 | By creating a standardized and adaptable programmatic base for the study of bibliometric data, we intend to help democratize SciSci, support diverse research efforts based on bibliometric datasets, and address calls for open access and reproducibility in the SciSci literature and community. We also encourage the SciSci community to contribute their own implementations, data, and use cases. 9 | 10 | 11 | Funding 12 | --------- 13 | *pySciSci* acknowledges support from the following grants: 14 | 15 | - Air Force Office of Scientific Research Award FA9550-19-1-0354 16 | - Templeton Foundation Contract 61066 -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'pySciSci' 21 | copyright = '2021, Alexander Gates' 22 | author = 'Alexander Gates' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '0.6' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon', 'sphinxcontrib.bibtex' 34 | ] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # List of patterns, relative to source directory, that match files and 40 | # directories to ignore when looking for source files. 41 | # This pattern also affects html_static_path and html_extra_path. 42 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 43 | 44 | 45 | # -- Options for HTML output ------------------------------------------------- 46 | 47 | # The theme to use for HTML and HTML Help pages. See the documentation for 48 | # a list of builtin themes. 49 | # 50 | html_theme = 'alabaster' 51 | 52 | # Add any paths that contain custom static files (such as style sheets) here, 53 | # relative to this directory. They are copied after the builtin static files, 54 | # so a file named "default.css" will overwrite the builtin "default.css". 55 | html_static_path = ['_static'] 56 | 57 | # bibtex references 58 | bibtex_bibfiles = ['references.bib'] 59 | 60 | # bibtex style 61 | bibtex_default_style = 'unsrt' -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | =================== 3 | 4 | Getting Started Examples 5 | ------------------------- 6 | - `Getting Started With OpenAlex `_ 7 | - `Getting Started With MAG `_ 8 | - `Getting Started With WoS `_ 9 | - `Getting Started With DBLP `_ 10 | - `Getting Started With APS `_ 11 | - `Getting Started With PubMed `_ 12 | - `Getting Started With Custom DB `_ 13 | - `The Growth of Science `_ 14 | 15 | Method Examples 16 | ------------------------- 17 | - `The Diffusion of Scientific Credit `_ 18 | - `Credit Allocation `_ 19 | - `Interdisciplinarity `_ 20 | - `Novelty and Conventionality `_ 21 | - `Cocitation Network `_ 22 | - `Methods for Publication Impact `_ 23 | - `Career Analysis `_ 24 | - `Career Topic Switching `_ 25 | - `Field Reference Strength `_ 26 | - `Node2Vec Coauthorship `_ 27 | - `CoWord Network `_ 28 | 29 | Advanced Functionality 30 | ------------------------- 31 | - `Using Dask `_ 32 | - `Working from a publication List `_ 33 | 34 | Science of Science Textbook 35 | ---------------------------- 36 | Examples taken from different chapters of the Science of Science Textbook :cite:`Wang2021scisci`.
37 | 38 | - `Chapter 0 Data Prep `_ 39 | - `Chapter 1 Productivity of a Scientist `_ 40 | - `Chapter 2 The HIndex `_ 41 | - `Chapter 5 The Random Impact Rule `_ 42 | - `Chapter 6 The Qfactor `_ 43 | - `Chapter 14 Credit Allocation `_ 44 | -------------------------------------------------------------------------------- /docs/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: cb6ac21b1379d0a62ccad8688800854b 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/html/.doctrees/about.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/about.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/databases.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/databases.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/html/.doctrees/examples.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/examples.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/index.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/installation.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/installation.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/methods.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/methods.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/network.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/network.doctree -------------------------------------------------------------------------------- /docs/html/.doctrees/utils.doctree: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/.doctrees/utils.doctree -------------------------------------------------------------------------------- /docs/html/_sources/about.rst.txt: -------------------------------------------------------------------------------- 1 | About 2 | =================== 3 | The Science of Science (SciSci) is a growing field at the boundary of sociology, network science, and computational social science :cite:p:`Fortunato2018scisci`. It encompasses diverse interdisciplinary research programs that study the processes underlying science :cite:`Wang2021scisci`. The field has benefited greatly from access to massive digital databases containing the products of scientific discourse---including publications, journals, patents, books, conference proceedings, and grants. The subsequent proliferation of mathematical models and computational techniques for quantifying the dynamics of innovation and success in science has made it difficult to disentangle universal scientific processes from those dependent on specific databases, data-processing decisions, field practices, etc. 4 | 5 | 6 | Here we present *pySciSci* for the analysis of large-scale bibliometric data. The package standardizes access to many of the most common datasets in SciSci and provides efficient implementations of common and advanced analytical techniques. The *pySciSci* package is intended for researchers of SciSci or those who wish to integrate large-scale bibliometric data into other existing projects. 7 | 8 | By creating a standardized and adaptable programmatic base for the study of bibliometric data, we intend to help democratize SciSci, support diverse research efforts based on bibliometric datasets, and address calls for open access and reproducibility in the SciSci literature and community. We also encourage the SciSci community to contribute their own implementations, data, and use cases.
9 | 10 | 11 | Funding 12 | --------- 13 | *pySciSci* acknowledges support from the following grants: 14 | 15 | - Air Force Office of Scientific Research Award FA9550-19-1-0354 16 | - Templeton Foundation Contract 61066 -------------------------------------------------------------------------------- /docs/html/_sources/examples.rst.txt: -------------------------------------------------------------------------------- 1 | Examples 2 | =================== 3 | 4 | Getting Started Examples 5 | ------------------------- 6 | - `Getting Started With OpenAlex `_ 7 | - `Getting Started With MAG `_ 8 | - `Getting Started With WoS `_ 9 | - `Getting Started With DBLP `_ 10 | - `Getting Started With APS `_ 11 | - `Getting Started With PubMed `_ 12 | - `Getting Started With Custom DB `_ 13 | - `The Growth of Science `_ 14 | 15 | Method Examples 16 | ------------------------- 17 | - `The Diffusion of Scientific Credit `_ 18 | - `Credit Allocation `_ 19 | - `Interdisciplinarity `_ 20 | - `Novelty and Conventionality `_ 21 | - `Cocitation Network `_ 22 | - `Methods for Publication Impact `_ 23 | - `Career Analysis `_ 24 | - `Career Topic Switching `_ 25 | - `Field Reference Strength `_ 26 | - `Node2Vec Coauthorship `_ 27 | - `CoWord Network `_ 28 | 29 | Advanced Functionality 30 | ------------------------- 31 | - `Using Dask `_ 32 | - `Working from a publication List `_ 33 | 34 | Science of Science Textbook 35 | ---------------------------- 36 | Examples taken from different chapters of the Science of Science Textbook :cite:`Wang2021scisci`. 37 | - `Chapter 0 Data Prep `_ 38 | - `Chapter 1 Productivity of a Scientist `_ 39 | - `Chapter 2 The HIndex `_ 40 | - `Chapter 5 The Random Impact Rule `_ 41 | - `Chapter 6 The Qfactor `_ 42 | - `Chapter 14 Credit Allocation `_ 43 | -------------------------------------------------------------------------------- /docs/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. pySciSci documentation master file, created by 2 | sphinx-quickstart on Mon Jun 21 10:54:47 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pySciSci's documentation! 7 | ************************************** 8 | 9 | Table of Contents 10 | =================== 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | about 15 | installation 16 | examples 17 | databases 18 | methods 19 | network 20 | utils 21 | 22 | 23 | References 24 | =================== 25 | .. bibliography:: -------------------------------------------------------------------------------- /docs/html/_sources/installation.rst.txt: -------------------------------------------------------------------------------- 1 | Installation 2 | =================== 3 | This package is available on PyPI. Run the following command in a terminal to install it. 4 | 5 | >>> pip install pyscisci 6 | 7 | You can also get the source code directly from the GitHub `project page `_. -------------------------------------------------------------------------------- /docs/html/_sources/methods.rst.txt: -------------------------------------------------------------------------------- 1 | Bibliometrics 2 | ====================== 3 | *pySciSci* facilitates the analysis of publications, authors, and citations, as well as citation time series, fixed time window citation analysis, and citation count normalization by year and field.
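To make the simplest of the measures above concrete, the short pandas sketch below derives raw citation counts from a publication-to-reference table and computes an h-index (the author-level measure discussed further below) from those counts. The toy table and its column names (``CitingPublicationId``, ``CitedPublicationId``) are illustrative assumptions only; pySciSci's database interfaces and the functions documented in the following sections provide the full, database-specific implementations.

.. code-block:: python

    import pandas as pd

    # Toy publication-to-reference table; the column names are hypothetical
    # stand-ins for the database-specific tables that pySciSci loads.
    pub2ref = pd.DataFrame({
        'CitingPublicationId': [2, 3, 4, 4, 5],
        'CitedPublicationId':  [1, 1, 1, 2, 2],
    })

    # Citation count: the number of times each publication is referenced.
    citation_counts = pub2ref.groupby('CitedPublicationId').size()

    def hindex(counts):
        # The largest h such that h publications have at least h citations each.
        counts = sorted(counts, reverse=True)
        return sum(c >= rank for rank, c in enumerate(counts, start=1))

    print(citation_counts.to_dict())   # {1: 3, 2: 2}
    print(hindex(citation_counts))     # 2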
4 | 5 | 6 | Publications and Citations 7 | --------------------------- 8 | The *pySciSci* package facilitates the analysis of interrelationships between publications as captured by references and citations. 9 | 10 | For example, the most common measure of scientific impact is the citation count, or the number of times a publication has been referenced by other publications. Variations also include citation time series, fixed time window citation analysis, citation count normalization by year and field, and citation ranks. More advanced methods fit models to citation time series, such as the prediction of a publication's long-term citation count :cite:`wang2013longterm` or the assignment of the sleeping beauty score :cite:`ke2015sleepingbeauty`. The package also supports the removal of self-citations occurring between publications by the same author. 11 | 12 | More advanced metrics capture the diversity in the citation interrelationships between publications. These measures include the Rao-Stirling reference interdisciplinarity :cite:`stirling2007diversity`, novelty & conventionality :cite:`uzzi2013atypical`, and the disruption index :cite:`funk2017dynamic`, :cite:`wu2019largeteams`. 13 | 14 | .. automodule:: pyscisci.methods.publication 15 | :members: 16 | 17 | 18 | 19 | Author-centric Methods 20 | ---------------------- 21 | 22 | The sociology of science has analyzed scientific careers in terms of individual incentives, productivity, competition, collaboration, and success. The *pySciSci* package facilitates author career analysis through both aggregate career statistics and temporal career trajectories. Highlights include the H-index :cite:`hirsch2005index`, Q-factor :cite:`sinatra2016quantifying`, yearly productivity trajectories :cite:`way2017misleading`, collective credit assignment :cite:`shen2014collective`, and hot-hand effect :cite:`liu2018hot`. 23 | 24 | 25 | .. automodule:: pyscisci.methods.author 26 | :members: 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/html/_sources/network.rst.txt: -------------------------------------------------------------------------------- 1 | Biblio-Networks 2 | ====================== 3 | Tools for constructing and analyzing bibliometric networks, such as coauthorship, citation, and cocitation networks. 4 | 5 | .. automodule:: pyscisci.network 6 | :members: -------------------------------------------------------------------------------- /docs/html/_sources/utils.rst.txt: -------------------------------------------------------------------------------- 1 | General Functions 2 | ====================== 3 | 4 | .. automodule:: pyscisci.utils 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/html/_static/_sphinx_javascript_frameworks_compat.js: -------------------------------------------------------------------------------- 1 | /* 2 | * _sphinx_javascript_frameworks_compat.js 3 | * ~~~~~~~~~~ 4 | * 5 | * Compatibility shim for jQuery and underscore.js.
6 | * 7 | * WILL BE REMOVED IN Sphinx 6.0 8 | * xref RemovedInSphinx60Warning 9 | * 10 | */ 11 | 12 | /** 13 | * select a different prefix for underscore 14 | */ 15 | $u = _.noConflict(); 16 | 17 | 18 | /** 19 | * small helper function to urldecode strings 20 | * 21 | * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL 22 | */ 23 | jQuery.urldecode = function(x) { 24 | if (!x) { 25 | return x 26 | } 27 | return decodeURIComponent(x.replace(/\+/g, ' ')); 28 | }; 29 | 30 | /** 31 | * small helper function to urlencode strings 32 | */ 33 | jQuery.urlencode = encodeURIComponent; 34 | 35 | /** 36 | * This function returns the parsed url parameters of the 37 | * current request. Multiple values per key are supported, 38 | * it will always return arrays of strings for the value parts. 39 | */ 40 | jQuery.getQueryParameters = function(s) { 41 | if (typeof s === 'undefined') 42 | s = document.location.search; 43 | var parts = s.substr(s.indexOf('?') + 1).split('&'); 44 | var result = {}; 45 | for (var i = 0; i < parts.length; i++) { 46 | var tmp = parts[i].split('=', 2); 47 | var key = jQuery.urldecode(tmp[0]); 48 | var value = jQuery.urldecode(tmp[1]); 49 | if (key in result) 50 | result[key].push(value); 51 | else 52 | result[key] = [value]; 53 | } 54 | return result; 55 | }; 56 | 57 | /** 58 | * highlight a given string on a jquery object by wrapping it in 59 | * span elements with the given class name. 60 | */ 61 | jQuery.fn.highlightText = function(text, className) { 62 | function highlight(node, addItems) { 63 | if (node.nodeType === 3) { 64 | var val = node.nodeValue; 65 | var pos = val.toLowerCase().indexOf(text); 66 | if (pos >= 0 && 67 | !jQuery(node.parentNode).hasClass(className) && 68 | !jQuery(node.parentNode).hasClass("nohighlight")) { 69 | var span; 70 | var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); 71 | if (isInSVG) { 72 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 73 | } else { 74 | span = document.createElement("span"); 75 | span.className = className; 76 | } 77 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 78 | node.parentNode.insertBefore(span, node.parentNode.insertBefore( 79 | document.createTextNode(val.substr(pos + text.length)), 80 | node.nextSibling)); 81 | node.nodeValue = val.substr(0, pos); 82 | if (isInSVG) { 83 | var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); 84 | var bbox = node.parentElement.getBBox(); 85 | rect.x.baseVal.value = bbox.x; 86 | rect.y.baseVal.value = bbox.y; 87 | rect.width.baseVal.value = bbox.width; 88 | rect.height.baseVal.value = bbox.height; 89 | rect.setAttribute('class', className); 90 | addItems.push({ 91 | "parent": node.parentNode, 92 | "target": rect}); 93 | } 94 | } 95 | } 96 | else if (!jQuery(node).is("button, select, textarea")) { 97 | jQuery.each(node.childNodes, function() { 98 | highlight(this, addItems); 99 | }); 100 | } 101 | } 102 | var addItems = []; 103 | var result = this.each(function() { 104 | highlight(this, addItems); 105 | }); 106 | for (var i = 0; i < addItems.length; ++i) { 107 | jQuery(addItems[i].parent).before(addItems[i].target); 108 | } 109 | return result; 110 | }; 111 | 112 | /* 113 | * backward compatibility for jQuery.browser 114 | * This will be supported until firefox bug is fixed. 
115 | */ 116 | if (!jQuery.browser) { 117 | jQuery.uaMatch = function(ua) { 118 | ua = ua.toLowerCase(); 119 | 120 | var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || 121 | /(webkit)[ \/]([\w.]+)/.exec(ua) || 122 | /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || 123 | /(msie) ([\w.]+)/.exec(ua) || 124 | ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || 125 | []; 126 | 127 | return { 128 | browser: match[ 1 ] || "", 129 | version: match[ 2 ] || "0" 130 | }; 131 | }; 132 | jQuery.browser = {}; 133 | jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; 134 | } 135 | -------------------------------------------------------------------------------- /docs/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally left blank. */ 2 | -------------------------------------------------------------------------------- /docs/html/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * doctools.js 3 | * ~~~~~~~~~~~ 4 | * 5 | * Base JavaScript utilities for all Sphinx HTML documentation. 6 | * 7 | * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | "use strict"; 12 | 13 | const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ 14 | "TEXTAREA", 15 | "INPUT", 16 | "SELECT", 17 | "BUTTON", 18 | ]); 19 | 20 | const _ready = (callback) => { 21 | if (document.readyState !== "loading") { 22 | callback(); 23 | } else { 24 | document.addEventListener("DOMContentLoaded", callback); 25 | } 26 | }; 27 | 28 | /** 29 | * Small JavaScript module for the documentation. 30 | */ 31 | const Documentation = { 32 | init: () => { 33 | Documentation.initDomainIndexTable(); 34 | Documentation.initOnKeyListeners(); 35 | }, 36 | 37 | /** 38 | * i18n support 39 | */ 40 | TRANSLATIONS: {}, 41 | PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), 42 | LOCALE: "unknown", 43 | 44 | // gettext and ngettext don't access this so that the functions 45 | // can safely bound to a different name (_ = Documentation.gettext) 46 | gettext: (string) => { 47 | const translated = Documentation.TRANSLATIONS[string]; 48 | switch (typeof translated) { 49 | case "undefined": 50 | return string; // no translation 51 | case "string": 52 | return translated; // translation exists 53 | default: 54 | return translated[0]; // (singular, plural) translation tuple exists 55 | } 56 | }, 57 | 58 | ngettext: (singular, plural, n) => { 59 | const translated = Documentation.TRANSLATIONS[singular]; 60 | if (typeof translated !== "undefined") 61 | return translated[Documentation.PLURAL_EXPR(n)]; 62 | return n === 1 ? 
singular : plural; 63 | }, 64 | 65 | addTranslations: (catalog) => { 66 | Object.assign(Documentation.TRANSLATIONS, catalog.messages); 67 | Documentation.PLURAL_EXPR = new Function( 68 | "n", 69 | `return (${catalog.plural_expr})` 70 | ); 71 | Documentation.LOCALE = catalog.locale; 72 | }, 73 | 74 | /** 75 | * helper function to focus on search bar 76 | */ 77 | focusSearchBar: () => { 78 | document.querySelectorAll("input[name=q]")[0]?.focus(); 79 | }, 80 | 81 | /** 82 | * Initialise the domain index toggle buttons 83 | */ 84 | initDomainIndexTable: () => { 85 | const toggler = (el) => { 86 | const idNumber = el.id.substr(7); 87 | const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); 88 | if (el.src.substr(-9) === "minus.png") { 89 | el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; 90 | toggledRows.forEach((el) => (el.style.display = "none")); 91 | } else { 92 | el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; 93 | toggledRows.forEach((el) => (el.style.display = "")); 94 | } 95 | }; 96 | 97 | const togglerElements = document.querySelectorAll("img.toggler"); 98 | togglerElements.forEach((el) => 99 | el.addEventListener("click", (event) => toggler(event.currentTarget)) 100 | ); 101 | togglerElements.forEach((el) => (el.style.display = "")); 102 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); 103 | }, 104 | 105 | initOnKeyListeners: () => { 106 | // only install a listener if it is really needed 107 | if ( 108 | !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && 109 | !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS 110 | ) 111 | return; 112 | 113 | document.addEventListener("keydown", (event) => { 114 | // bail for input elements 115 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 116 | // bail with special keys 117 | if (event.altKey || event.ctrlKey || event.metaKey) return; 118 | 119 | if (!event.shiftKey) { 120 | switch (event.key) { 121 | case "ArrowLeft": 122 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 123 | 124 | const prevLink = document.querySelector('link[rel="prev"]'); 125 | if (prevLink && prevLink.href) { 126 | window.location.href = prevLink.href; 127 | event.preventDefault(); 128 | } 129 | break; 130 | case "ArrowRight": 131 | if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; 132 | 133 | const nextLink = document.querySelector('link[rel="next"]'); 134 | if (nextLink && nextLink.href) { 135 | window.location.href = nextLink.href; 136 | event.preventDefault(); 137 | } 138 | break; 139 | } 140 | } 141 | 142 | // some keyboard layouts may need Shift to get / 143 | switch (event.key) { 144 | case "/": 145 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; 146 | Documentation.focusSearchBar(); 147 | event.preventDefault(); 148 | } 149 | }); 150 | }, 151 | }; 152 | 153 | // quick alias for translations 154 | const _ = Documentation.gettext; 155 | 156 | _ready(Documentation.init); 157 | -------------------------------------------------------------------------------- /docs/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '0.6', 4 | LANGUAGE: 'en', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '.txt', 11 | NAVIGATION_WITH_KEYS: false, 12 | 
SHOW_SEARCH_SUMMARY: true, 13 | ENABLE_SEARCH_SHORTCUTS: true, 14 | }; -------------------------------------------------------------------------------- /docs/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/_static/file.png -------------------------------------------------------------------------------- /docs/html/_static/language_data.js: -------------------------------------------------------------------------------- 1 | /* 2 | * language_data.js 3 | * ~~~~~~~~~~~~~~~~ 4 | * 5 | * This script contains the language-specific data used by searchtools.js, 6 | * namely the list of stopwords, stemmer, scorer and splitter. 7 | * 8 | * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. 9 | * :license: BSD, see LICENSE for details. 10 | * 11 | */ 12 | 13 | var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; 14 | 15 | 16 | /* Non-minified version is copied as a separate JS file, is available */ 17 | 18 | /** 19 | * Porter Stemmer 20 | */ 21 | var Stemmer = function() { 22 | 23 | var step2list = { 24 | ational: 'ate', 25 | tional: 'tion', 26 | enci: 'ence', 27 | anci: 'ance', 28 | izer: 'ize', 29 | bli: 'ble', 30 | alli: 'al', 31 | entli: 'ent', 32 | eli: 'e', 33 | ousli: 'ous', 34 | ization: 'ize', 35 | ation: 'ate', 36 | ator: 'ate', 37 | alism: 'al', 38 | iveness: 'ive', 39 | fulness: 'ful', 40 | ousness: 'ous', 41 | aliti: 'al', 42 | iviti: 'ive', 43 | biliti: 'ble', 44 | logi: 'log' 45 | }; 46 | 47 | var step3list = { 48 | icate: 'ic', 49 | ative: '', 50 | alize: 'al', 51 | iciti: 'ic', 52 | ical: 'ic', 53 | ful: '', 54 | ness: '' 55 | }; 56 | 57 | var c = "[^aeiou]"; // consonant 58 | var v = "[aeiouy]"; // vowel 59 | var C = c + "[^aeiouy]*"; // consonant sequence 60 | var V = v + "[aeiou]*"; // vowel sequence 61 | 62 | var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 63 | var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 64 | var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 65 | var s_v = "^(" + C + ")?" 
+ v; // vowel in stem 66 | 67 | this.stemWord = function (w) { 68 | var stem; 69 | var suffix; 70 | var firstch; 71 | var origword = w; 72 | 73 | if (w.length < 3) 74 | return w; 75 | 76 | var re; 77 | var re2; 78 | var re3; 79 | var re4; 80 | 81 | firstch = w.substr(0,1); 82 | if (firstch == "y") 83 | w = firstch.toUpperCase() + w.substr(1); 84 | 85 | // Step 1a 86 | re = /^(.+?)(ss|i)es$/; 87 | re2 = /^(.+?)([^s])s$/; 88 | 89 | if (re.test(w)) 90 | w = w.replace(re,"$1$2"); 91 | else if (re2.test(w)) 92 | w = w.replace(re2,"$1$2"); 93 | 94 | // Step 1b 95 | re = /^(.+?)eed$/; 96 | re2 = /^(.+?)(ed|ing)$/; 97 | if (re.test(w)) { 98 | var fp = re.exec(w); 99 | re = new RegExp(mgr0); 100 | if (re.test(fp[1])) { 101 | re = /.$/; 102 | w = w.replace(re,""); 103 | } 104 | } 105 | else if (re2.test(w)) { 106 | var fp = re2.exec(w); 107 | stem = fp[1]; 108 | re2 = new RegExp(s_v); 109 | if (re2.test(stem)) { 110 | w = stem; 111 | re2 = /(at|bl|iz)$/; 112 | re3 = new RegExp("([^aeiouylsz])\\1$"); 113 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 114 | if (re2.test(w)) 115 | w = w + "e"; 116 | else if (re3.test(w)) { 117 | re = /.$/; 118 | w = w.replace(re,""); 119 | } 120 | else if (re4.test(w)) 121 | w = w + "e"; 122 | } 123 | } 124 | 125 | // Step 1c 126 | re = /^(.+?)y$/; 127 | if (re.test(w)) { 128 | var fp = re.exec(w); 129 | stem = fp[1]; 130 | re = new RegExp(s_v); 131 | if (re.test(stem)) 132 | w = stem + "i"; 133 | } 134 | 135 | // Step 2 136 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 137 | if (re.test(w)) { 138 | var fp = re.exec(w); 139 | stem = fp[1]; 140 | suffix = fp[2]; 141 | re = new RegExp(mgr0); 142 | if (re.test(stem)) 143 | w = stem + step2list[suffix]; 144 | } 145 | 146 | // Step 3 147 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 148 | if (re.test(w)) { 149 | var fp = re.exec(w); 150 | stem = fp[1]; 151 | suffix = fp[2]; 152 | re = new RegExp(mgr0); 153 | if (re.test(stem)) 154 | w = stem + step3list[suffix]; 155 | } 156 | 157 | // Step 4 158 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 159 | re2 = /^(.+?)(s|t)(ion)$/; 160 | if (re.test(w)) { 161 | var fp = re.exec(w); 162 | stem = fp[1]; 163 | re = new RegExp(mgr1); 164 | if (re.test(stem)) 165 | w = stem; 166 | } 167 | else if (re2.test(w)) { 168 | var fp = re2.exec(w); 169 | stem = fp[1] + fp[2]; 170 | re2 = new RegExp(mgr1); 171 | if (re2.test(stem)) 172 | w = stem; 173 | } 174 | 175 | // Step 5 176 | re = /^(.+?)e$/; 177 | if (re.test(w)) { 178 | var fp = re.exec(w); 179 | stem = fp[1]; 180 | re = new RegExp(mgr1); 181 | re2 = new RegExp(meq1); 182 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 183 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) 184 | w = stem; 185 | } 186 | re = /ll$/; 187 | re2 = new RegExp(mgr1); 188 | if (re.test(w) && re2.test(w)) { 189 | re = /.$/; 190 | w = w.replace(re,""); 191 | } 192 | 193 | // and turn initial Y back to y 194 | if (firstch == "y") 195 | w = firstch.toLowerCase() + w.substr(1); 196 | return w; 197 | } 198 | } 199 | 200 | -------------------------------------------------------------------------------- /docs/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/_static/minus.png 
-------------------------------------------------------------------------------- /docs/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/_static/plus.png -------------------------------------------------------------------------------- /docs/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | pre { line-height: 125%; } 2 | td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } 4 | td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 5 | span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } 6 | .highlight .hll { background-color: #ffffcc } 7 | .highlight { background: #f8f8f8; } 8 | .highlight .c { color: #8f5902; font-style: italic } /* Comment */ 9 | .highlight .err { color: #a40000; border: 1px solid #ef2929 } /* Error */ 10 | .highlight .g { color: #000000 } /* Generic */ 11 | .highlight .k { color: #004461; font-weight: bold } /* Keyword */ 12 | .highlight .l { color: #000000 } /* Literal */ 13 | .highlight .n { color: #000000 } /* Name */ 14 | .highlight .o { color: #582800 } /* Operator */ 15 | .highlight .x { color: #000000 } /* Other */ 16 | .highlight .p { color: #000000; font-weight: bold } /* Punctuation */ 17 | .highlight .ch { color: #8f5902; font-style: italic } /* Comment.Hashbang */ 18 | .highlight .cm { color: #8f5902; font-style: italic } /* Comment.Multiline */ 19 | .highlight .cp { color: #8f5902 } /* Comment.Preproc */ 20 | .highlight .cpf { color: #8f5902; font-style: italic } /* Comment.PreprocFile */ 21 | .highlight .c1 { color: #8f5902; font-style: italic } /* Comment.Single */ 22 | .highlight .cs { color: #8f5902; font-style: italic } /* Comment.Special */ 23 | .highlight .gd { color: #a40000 } /* Generic.Deleted */ 24 | .highlight .ge { color: #000000; font-style: italic } /* Generic.Emph */ 25 | .highlight .gr { color: #ef2929 } /* Generic.Error */ 26 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 27 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 28 | .highlight .go { color: #888888 } /* Generic.Output */ 29 | .highlight .gp { color: #745334 } /* Generic.Prompt */ 30 | .highlight .gs { color: #000000; font-weight: bold } /* Generic.Strong */ 31 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 32 | .highlight .gt { color: #a40000; font-weight: bold } /* Generic.Traceback */ 33 | .highlight .kc { color: #004461; font-weight: bold } /* Keyword.Constant */ 34 | .highlight .kd { color: #004461; font-weight: bold } /* Keyword.Declaration */ 35 | .highlight .kn { color: #004461; font-weight: bold } /* Keyword.Namespace */ 36 | .highlight .kp { color: #004461; font-weight: bold } /* Keyword.Pseudo */ 37 | .highlight .kr { color: #004461; font-weight: bold } /* Keyword.Reserved */ 38 | .highlight .kt { color: #004461; font-weight: bold } /* Keyword.Type */ 39 | .highlight .ld { color: #000000 } /* Literal.Date */ 40 | .highlight .m { color: #990000 } /* Literal.Number */ 41 | .highlight .s { color: #4e9a06 } /* Literal.String */ 42 | .highlight .na { color: #c4a000 } /* Name.Attribute */ 43 | .highlight .nb { color: 
#004461 } /* Name.Builtin */ 44 | .highlight .nc { color: #000000 } /* Name.Class */ 45 | .highlight .no { color: #000000 } /* Name.Constant */ 46 | .highlight .nd { color: #888888 } /* Name.Decorator */ 47 | .highlight .ni { color: #ce5c00 } /* Name.Entity */ 48 | .highlight .ne { color: #cc0000; font-weight: bold } /* Name.Exception */ 49 | .highlight .nf { color: #000000 } /* Name.Function */ 50 | .highlight .nl { color: #f57900 } /* Name.Label */ 51 | .highlight .nn { color: #000000 } /* Name.Namespace */ 52 | .highlight .nx { color: #000000 } /* Name.Other */ 53 | .highlight .py { color: #000000 } /* Name.Property */ 54 | .highlight .nt { color: #004461; font-weight: bold } /* Name.Tag */ 55 | .highlight .nv { color: #000000 } /* Name.Variable */ 56 | .highlight .ow { color: #004461; font-weight: bold } /* Operator.Word */ 57 | .highlight .pm { color: #000000; font-weight: bold } /* Punctuation.Marker */ 58 | .highlight .w { color: #f8f8f8; text-decoration: underline } /* Text.Whitespace */ 59 | .highlight .mb { color: #990000 } /* Literal.Number.Bin */ 60 | .highlight .mf { color: #990000 } /* Literal.Number.Float */ 61 | .highlight .mh { color: #990000 } /* Literal.Number.Hex */ 62 | .highlight .mi { color: #990000 } /* Literal.Number.Integer */ 63 | .highlight .mo { color: #990000 } /* Literal.Number.Oct */ 64 | .highlight .sa { color: #4e9a06 } /* Literal.String.Affix */ 65 | .highlight .sb { color: #4e9a06 } /* Literal.String.Backtick */ 66 | .highlight .sc { color: #4e9a06 } /* Literal.String.Char */ 67 | .highlight .dl { color: #4e9a06 } /* Literal.String.Delimiter */ 68 | .highlight .sd { color: #8f5902; font-style: italic } /* Literal.String.Doc */ 69 | .highlight .s2 { color: #4e9a06 } /* Literal.String.Double */ 70 | .highlight .se { color: #4e9a06 } /* Literal.String.Escape */ 71 | .highlight .sh { color: #4e9a06 } /* Literal.String.Heredoc */ 72 | .highlight .si { color: #4e9a06 } /* Literal.String.Interpol */ 73 | .highlight .sx { color: #4e9a06 } /* Literal.String.Other */ 74 | .highlight .sr { color: #4e9a06 } /* Literal.String.Regex */ 75 | .highlight .s1 { color: #4e9a06 } /* Literal.String.Single */ 76 | .highlight .ss { color: #4e9a06 } /* Literal.String.Symbol */ 77 | .highlight .bp { color: #3465a4 } /* Name.Builtin.Pseudo */ 78 | .highlight .fm { color: #000000 } /* Name.Function.Magic */ 79 | .highlight .vc { color: #000000 } /* Name.Variable.Class */ 80 | .highlight .vg { color: #000000 } /* Name.Variable.Global */ 81 | .highlight .vi { color: #000000 } /* Name.Variable.Instance */ 82 | .highlight .vm { color: #000000 } /* Name.Variable.Magic */ 83 | .highlight .il { color: #990000 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/html/_static/sphinx_highlight.js: -------------------------------------------------------------------------------- 1 | /* Highlighting utilities for Sphinx HTML documentation. */ 2 | "use strict"; 3 | 4 | const SPHINX_HIGHLIGHT_ENABLED = true 5 | 6 | /** 7 | * highlight a given string on a node by wrapping it in 8 | * span elements with the given class name. 
9 | */ 10 | const _highlight = (node, addItems, text, className) => { 11 | if (node.nodeType === Node.TEXT_NODE) { 12 | const val = node.nodeValue; 13 | const parent = node.parentNode; 14 | const pos = val.toLowerCase().indexOf(text); 15 | if ( 16 | pos >= 0 && 17 | !parent.classList.contains(className) && 18 | !parent.classList.contains("nohighlight") 19 | ) { 20 | let span; 21 | 22 | const closestNode = parent.closest("body, svg, foreignObject"); 23 | const isInSVG = closestNode && closestNode.matches("svg"); 24 | if (isInSVG) { 25 | span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); 26 | } else { 27 | span = document.createElement("span"); 28 | span.classList.add(className); 29 | } 30 | 31 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 32 | parent.insertBefore( 33 | span, 34 | parent.insertBefore( 35 | document.createTextNode(val.substr(pos + text.length)), 36 | node.nextSibling 37 | ) 38 | ); 39 | node.nodeValue = val.substr(0, pos); 40 | 41 | if (isInSVG) { 42 | const rect = document.createElementNS( 43 | "http://www.w3.org/2000/svg", 44 | "rect" 45 | ); 46 | const bbox = parent.getBBox(); 47 | rect.x.baseVal.value = bbox.x; 48 | rect.y.baseVal.value = bbox.y; 49 | rect.width.baseVal.value = bbox.width; 50 | rect.height.baseVal.value = bbox.height; 51 | rect.setAttribute("class", className); 52 | addItems.push({ parent: parent, target: rect }); 53 | } 54 | } 55 | } else if (node.matches && !node.matches("button, select, textarea")) { 56 | node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); 57 | } 58 | }; 59 | const _highlightText = (thisNode, text, className) => { 60 | let addItems = []; 61 | _highlight(thisNode, addItems, text, className); 62 | addItems.forEach((obj) => 63 | obj.parent.insertAdjacentElement("beforebegin", obj.target) 64 | ); 65 | }; 66 | 67 | /** 68 | * Small JavaScript module for the documentation. 69 | */ 70 | const SphinxHighlight = { 71 | 72 | /** 73 | * highlight the search words provided in localstorage in the text 74 | */ 75 | highlightSearchWords: () => { 76 | if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight 77 | 78 | // get and clear terms from localstorage 79 | const url = new URL(window.location); 80 | const highlight = 81 | localStorage.getItem("sphinx_highlight_terms") 82 | || url.searchParams.get("highlight") 83 | || ""; 84 | localStorage.removeItem("sphinx_highlight_terms") 85 | url.searchParams.delete("highlight"); 86 | window.history.replaceState({}, "", url); 87 | 88 | // get individual terms from highlight string 89 | const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); 90 | if (terms.length === 0) return; // nothing to do 91 | 92 | // There should never be more than one element matching "div.body" 93 | const divBody = document.querySelectorAll("div.body"); 94 | const body = divBody.length ? 
divBody[0] : document.querySelector("body"); 95 | window.setTimeout(() => { 96 | terms.forEach((term) => _highlightText(body, term, "highlighted")); 97 | }, 10); 98 | 99 | const searchBox = document.getElementById("searchbox"); 100 | if (searchBox === null) return; 101 | searchBox.appendChild( 102 | document 103 | .createRange() 104 | .createContextualFragment( 105 | '" 109 | ) 110 | ); 111 | }, 112 | 113 | /** 114 | * helper function to hide the search marks again 115 | */ 116 | hideSearchWords: () => { 117 | document 118 | .querySelectorAll("#searchbox .highlight-link") 119 | .forEach((el) => el.remove()); 120 | document 121 | .querySelectorAll("span.highlighted") 122 | .forEach((el) => el.classList.remove("highlighted")); 123 | localStorage.removeItem("sphinx_highlight_terms") 124 | }, 125 | 126 | initEscapeListener: () => { 127 | // only install a listener if it is really needed 128 | if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; 129 | 130 | document.addEventListener("keydown", (event) => { 131 | // bail for input elements 132 | if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; 133 | // bail with special keys 134 | if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; 135 | if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { 136 | SphinxHighlight.hideSearchWords(); 137 | event.preventDefault(); 138 | } 139 | }); 140 | }, 141 | }; 142 | 143 | _ready(SphinxHighlight.highlightSearchWords); 144 | _ready(SphinxHighlight.initEscapeListener); 145 | -------------------------------------------------------------------------------- /docs/html/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | About — pySciSci 0.6 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 |
35 | 36 | 37 |
38 | 39 |
40 |

About

41 |

The Science of Science (SciSci) is a growing field at the boundary of sociology, network science, and computational social science [1]. It encompasses diverse interdisciplinary research programs that study the processes underlying science [2]. The field has benefited greatly from access to massive digital databases containing the products of scientific discourse—including publications, journals, patents, books, conference proceedings, and grants. The subsequent proliferation of mathematical models and computational techniques for quantifying the dynamics of innovation and success in science has made it difficult to disentangle universal scientific processes from those dependent on specific databases, data-processing decisions, field practices, and so on.

42 |

Here we present pySciSci for the analysis of large-scale bibliometric data. The package standardizes access to many of the most common datasets in SciSci and provides efficient implementations of common and advanced analytical techniques. The pySciSci package is intended for researchers of SciSci or those who wish to integrate large-scale bibliometric data into other existing projects.

43 |

By creating a standardized and adaptable programmatic base for the study of bibliometric data, we intend to help democratize SciSci, support diverse research efforts based on bibliometric datasets, and address calls for open access and reproducibility in the SciSci literature and community. We also encourage the SciSci community to contribute their own implementations, data, and use cases.

44 |
45 |

Funding

46 |

pySciSci acknowledges support from the following grants:

47 |
  • Air Force Office of Scientific Research Award FA9550-19-1-0354
  • Templeton Foundation Contract 61066
52 |
53 | 54 | 55 |
56 | 57 |
58 |
59 | 113 |
114 |
115 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /docs/html/installation.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Installation — pySciSci 0.6 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 |
35 | 36 | 37 |
38 | 39 |
40 |

Installation

41 |

This package is available on PyPI. Run the following command in a terminal to install it.

42 |
>>> pip install pyscisci
 43 | 
44 |
45 |

You can also get the source code directly from the GitHub project page.
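Once installed, you can verify the setup from a Python session using the same import convention as the example notebooks in this repository; if the import completes without error, the package and its core dependencies are in place (a minimal smoke test only; the Getting_Started notebooks walk through downloading and preprocessing a full database):

>>> import pyscisci.all as pyscisci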

46 |
47 | 48 | 49 |
50 | 51 |
52 |
53 | 104 |
105 |
106 | 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/docs/html/objects.inv -------------------------------------------------------------------------------- /docs/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Search — pySciSci 0.6 documentation 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 |
36 |
37 |
38 | 39 | 40 |
41 | 42 |

Search

43 | 44 | 52 | 53 | 54 |

55 | Searching for multiple words only shows matches that contain 56 | all words. 57 |

58 | 59 | 60 |
61 | 62 | 63 | 64 |
65 | 66 | 67 | 68 |
69 | 70 |
71 | 72 | 73 |
74 | 75 |
76 |
77 | 116 |
117 |
118 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pySciSci documentation master file, created by 2 | sphinx-quickstart on Mon Jun 21 10:54:47 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pySciSci's documentation! 7 | ************************************** 8 | 9 | Table of Contents 10 | =================== 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | about 15 | installation 16 | examples 17 | databases 18 | methods 19 | network 20 | utils 21 | 22 | 23 | References 24 | =================== 25 | .. bibliography:: -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | =================== 3 | This package is available on PyPI. Run the following command in a terminal to install it. 4 | 5 | >>> pip install pyscisci 6 | 7 | You can also get the source code directly from the GitHub `project page `_ . -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/methods.rst: -------------------------------------------------------------------------------- 1 | Bibliometrics 2 | ====================== 3 | *pySciSci* facilitates the analysis of publications, authors, and citations, as well as citation time-series, fixed time window citation analysis, and citation count normalization by year and field. 4 | 5 | 6 | Publications and Citations 7 | --------------------------- 8 | The *pySciSci* package facilitates the analysis of interrelationships between publications as captured by references and citations. 9 | 10 | For example, the most common measure of scientific impact is the citation count, or the number of times a publication has been referenced by other publications. Variations also include citation time-series, fixed time window citation analysis, citation count normalization by year and field, and citation ranks. More advanced methods fit models to citation time-series, such as in the prediction of the long-term citation counts to a publication :cite:`wang2013longterm`, or in the assignment of the sleeping beauty score :cite:`ke2015sleepingbeauty`. 
The package also removes self-citations, i.e., citations occurring between publications by the same author. 11 | 12 | More advanced metrics capture the diversity in the citation interrelationships between publications. These measures include the Rao-Stirling reference interdisciplinarity :cite:`stirling2007diversity`, novelty & conventionality :cite:`uzzi2013atypical`, and the disruption index :cite:`funk2017dynamic`, :cite:`wu2019largeteams`. 13 | 14 | .. automodule:: pyscisci.methods.publication 15 | :members: 16 | 17 | 18 | 19 | Author-centric Methods 20 | ---------------------- 21 | 22 | The sociology of science has analyzed scientific careers in terms of individual incentives, productivity, competition, collaboration, and success. The *pySciSci* package facilitates author career analysis through both aggregate career statistics and temporal career trajectories. Highlights include the H-index :cite:`hirsch2005index`, Q-factor :cite:`sinatra2016quantifying`, yearly productivity trajectories :cite:`way2017misleading`, collective credit assignment :cite:`shen2014collective`, and the hot-hand effect :cite:`liu2018hot`. 23 | 24 | 25 | .. automodule:: pyscisci.methods.author 26 | :members: 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/network.rst: -------------------------------------------------------------------------------- 1 | Biblio-Networks 2 | ====================== 3 | Intro to networks 4 | 5 | .. automodule:: pyscisci.network 6 | :members: -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinxcontrib-bibtex -------------------------------------------------------------------------------- /docs/utils.rst: -------------------------------------------------------------------------------- 1 | General Functions 2 | ====================== 3 | 4 | .. 
automodule:: pyscisci.utils 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /examples/Getting_Started/DatabaseGrowth.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/Getting_Started/DatabaseGrowth.pdf -------------------------------------------------------------------------------- /examples/Getting_Started/Getting Started with OpenAlex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "874e645e", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "\n", 11 | "import pyscisci.all as pyscisci\n", 12 | "\n", 13 | "import os\n", 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "import matplotlib.pylab as plt\n", 17 | "\n", 18 | "%matplotlib inline" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "122e3e61", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# set this path to where the OpenAlex database will be stored\n", 29 | "path2openalex = '/home/ajgates/OpenAlex'\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "61b2080c", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "myoa = pyscisci.OpenAlex(path2openalex, database_extension='csv.gz', keep_in_memory=False) \n", 40 | "# set keep_in_memory=False if you want to load the database each time it's needed - good for when you \n", 41 | "# can't keep more than one dataframe in memory at a time\n", 42 | "# otherwise keep_in_memory=True will keep each database in memory after it's loaded" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "3c15092c", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "#df_features_to_keep = ['all'] # keep all data\n", 53 | "# OR\n", 54 | "df_features_to_keep = ['affiliations', 'authors', 'publications', 'references', \n", 55 | " 'publicationauthoraffiliation', 'fields'] # keep everything besides the abstracts\n", 56 | "myoa.download_from_source(rewrite_existing = False,\n", 57 | " dataframe_list=df_features_to_keep)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "6395510f", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# before we can start running our analysis, we have to preprocess the raw data into\n", 68 | "# DataFrames that are more convenient to work with\n", 69 | "\n", 70 | "# we only need to run this for the first time, but it will take a while\n", 71 | "myoa.preprocess(dataframe_list=['all'])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "20165e51", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3 (ipykernel)", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.10.14" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 5 104 | } 105 | -------------------------------------------------------------------------------- 
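Once preprocessing completes, the OpenAlex DataFrames support the same analyses as the other database classes. Below is a minimal sketch of a first sanity check, mirroring the PubMed and WOS notebooks that follow; it assumes the preprocessed publication table loads as myoa.pub with 'Year' and 'PublicationId' columns, as it does for those databases:

import matplotlib.pylab as plt
import pyscisci.all as pyscisci

# reload the database object created above (path assumed from this notebook)
myoa = pyscisci.OpenAlex('/home/ajgates/OpenAlex', database_extension='csv.gz', keep_in_memory=False)

# count the unique publications in each year
yearly_articles = pyscisci.groupby_count(df=myoa.pub, colgroupby='Year', colcountby='PublicationId',
                                         count_unique=True)
yearly_articles.sort_values(by='Year', inplace=True)

# plot the growth of the database on a log scale
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
ax.plot(yearly_articles['Year'], yearly_articles['PublicationIdCount'])
ax.set_xlabel('Year')
ax.set_ylabel('# of publications')
ax.set_yscale('log')
plt.show()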
/examples/Getting_Started/Getting Started with PubMed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\n", 10 | "import pyscisci.all as pyscisci\n", 11 | "\n", 12 | "import os\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import matplotlib.pylab as plt\n", 16 | "\n", 17 | "%matplotlib inline" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# set this path to where the PubMed database will be stored\n", 27 | "path2pubmed = '/home/ajgates/PubMed'\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "mypubmed = pyscisci.PubMed(path2pubmed, database_extension='csv.gz', keep_in_memory=False) \n", 37 | "\n", 38 | "# set keep_in_memory=False if you want to load the database each time it's needed - good for when you \n", 39 | "# can't keep more than one DataFrame in memory at a time\n", 40 | "\n", 41 | "# otherwise keep_in_memory=True will keep each DataFrame in memory after it's loaded" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# This function will download the latest baseline version of PubMed.\n", 51 | "# Depending on your internet connection, it could take several hours to complete the download.\n", 52 | "\n", 53 | "mypubmed.download_from_source(rewrite_existing=False)\n", 54 | "# if your connection breaks/download stops for any reason, set rewrite_existing = False and \n", 55 | "# rerun to continue downloading where you left off" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# before we can start running our analysis, we have to preprocess the raw data into\n", 65 | "# DataFrames that are more convenient to work with\n", 66 | "mypubmed.preprocess(show_progress=True)\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# PubMed creates four DataFrames:\n", 76 | "# pub - keeps all of the publication information\n", 77 | "# columns : ['PublicationId', 'Title', 'Year', 'Month', 'Day', 'Doi', 'JournalId', 'ISSN', 'Volume', 'Issue', 'Pages', 'TeamSize']\n", 78 | "\n", 79 | "# paa - links the publications to authors and affiliations \n", 80 | "# NOTE: PubMed does not disambiguate authors!!!\n", 81 | "# columns : ['PublicationId', 'FirstName', 'LastName', 'FullName', 'Affiliations', 'AuthorSequence']\n", 82 | "\n", 83 | "# pub2field - links the publications to fields (aka subjectAreas)\n", 84 | "# columns : ['PublicationId', 'FieldId']\n", 85 | "\n", 86 | "# pub2ref - keeps the citation information\n", 87 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# let's plot the number of publications each year\n", 97 | "yearly_articles = pyscisci.groupby_count(df=mypubmed.pub , colgroupby='Year', colcountby='PublicationId', count_unique=True,\n", 98 | " show_progress=True)\n", 99 | "yearly_articles.sort_values(by='Year', inplace=True)\n", 100 | "\n", 101 | "fig, ax = 
plt.subplots(1,1,figsize=(8,5))\n", 102 | "\n", 103 | "ax.plot(yearly_articles['Year'],yearly_articles['PublicationIdCount'])\n", 104 | "\n", 105 | "ax.set_xlabel('Year')\n", 106 | "ax.set_ylabel(\"# of publications\")\n", 107 | "ax.set_yscale('log')\n", 108 | "\n", 109 | "plt.show()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3 (ipykernel)", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.10.8" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 2 141 | } 142 | -------------------------------------------------------------------------------- /examples/Getting_Started/Getting Started with WOS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\n", 10 | "import pyscisci.all as pyscisci\n", 11 | "\n", 12 | "import os\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import matplotlib.pylab as plt\n", 16 | "\n", 17 | "%matplotlib inline" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# set this path to where the WOS database will be stored\n", 27 | "path2wos = '/home/ajgates/WOS'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "mywos = pyscisci.WOS(path2wos, database_extension='csv.gz', keep_in_memory=False, show_progress=True) \n", 37 | "# set keep_in_memory=False if you want to load the database each time it's needed - good for when you \n", 38 | "# can't keep more than one database in memory at a time\n", 39 | "# otherwise keep_in_memory=True will keep each database in memory after it's loaded" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# before we can start running our analysis, we have to preprocess the raw data into\n", 49 | "# DataFrames that are more convenient to work with\n", 50 | "\n", 51 | "# we only need to run this for the first time, but it will take a while\n", 52 | "mywos.preprocess(xml_directory = 'RawXML', name_space = None, show_progress=True)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# WOS contains the following dataframes:\n", 62 | "\n", 63 | "# pub - keeps all of the publication information\n", 64 | "# columns : ['PublicationId', 'Year', 'JournalId', 'FamilyId', 'Doi', 'Title', 'Date', 'Volume', 'Issue', 'DocType']\n", 65 | "\n", 66 | "# author - keeps all of the author information (only some versions of WoS ship with the AuthorId known as the AuthorDAIS)\n", 67 | "# columns : ['AuthorId', 'FullName', 'LastName', 'FirstName', 'MiddleName']\n", 68 | "\n", 69 | "# pub2ref - links publications to their references or citations\n", 70 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n", 71 | "\n", 72 | "# paa - links 
publications, authors, and affiliations (only some versions of WoS ship with the AuthorId known as the AuthorDAIS)\n", 73 | "# columns : ['PublicationId', 'AuthorId', 'AffiliationId', 'AuthorSequence', 'OrigAuthorName', 'OrigAffiliationName']\n", 74 | "\n", 75 | "# author2pub - links the authors to their publications\n", 76 | "# columns : ['PublicationId', 'AuthorId', 'AuthorOrder']\n", 77 | "\n", 78 | "# field - field information\n", 79 | "# columns : ['FieldId', 'FieldLevel', 'NumberPublications', 'FieldName']\n", 80 | "\n", 81 | "# pub2field - links publications to their fields\n", 82 | "# columns : ['PublicationId', 'FieldId']\n", 83 | "\n", 84 | "# affiliation - affiliation information\n", 85 | "# columns : ['AffiliationId', 'NumberPublications', 'NumberCitations', 'FullName', 'GridId', 'OfficialPage', 'WikiPage', 'Latitude', 'Longitude']\n", 86 | "\n", 87 | "# journal - journal information\n", 88 | "# columns : ['JournalId', 'FullName', 'Issn', 'Publisher', 'Webpage']\n", 89 | "\n", 90 | "\n", 91 | "# after additional processing, these DataFrames become available\n", 92 | "\n", 93 | "# pub2refnoself - links publications to their references or citations with self-citations removed\n", 94 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n", 95 | "\n", 96 | "# impact - precomputed citation counts, columns will depend on which counts are computed\n", 97 | "# columns : ['PublicationId', 'Year', ....]" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# let's plot the number of publications each year\n", 107 | "pub = mywos.pub \n", 108 | "\n", 109 | "yearly_articles = pyscisci.groupby_count(df=pub , colgroupby='Year', colcountby='PublicationId', count_unique=True)\n", 110 | "yearly_articles.sort_values(by='Year', inplace=True)\n", 111 | "yearly_articles = yearly_articles.loc[yearly_articles['Year'] > 0]\n", 112 | "\n", 113 | "fig, ax = plt.subplots(1,1,figsize=(8,5))\n", 114 | "\n", 115 | "ax.plot(yearly_articles['Year'],yearly_articles['PublicationIdCount'])\n", 116 | "\n", 117 | "ax.set_xlabel('Year')\n", 118 | "ax.set_ylabel(\"# of publications\")\n", 119 | "ax.set_yscale('log')\n", 120 | "\n", 121 | "plt.show()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# now we can see the distribution of author productivity \n", 131 | "pa = mywos.author2pub # publication author relationships\n", 132 | "\n", 133 | "author_prod = pyscisci.author_productivity(pa , colgroupby = 'AuthorId', colcountby = 'PublicationId')\n", 134 | "\n", 135 | "prodvalues, prodcounts = np.unique(author_prod['Productivity'].values, return_counts=True)\n", 136 | "\n", 137 | "fig, ax = plt.subplots(1,1,figsize=(8,5))\n", 138 | "\n", 139 | "ax.scatter(prodvalues, prodcounts)\n", 140 | "\n", 141 | "ax.set_xlabel('Productivity')\n", 142 | "ax.set_ylabel(\"# of authors\")\n", 143 | "ax.set_xscale('log')\n", 144 | "ax.set_yscale('log')\n", 145 | "\n", 146 | "plt.show()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "Python 3 (ipykernel)", 160 | "language": "python", 161 | "name": "python3" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | 
"mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.10.8" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 2 178 | } 179 | -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/data/Chengetal_idealist.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/GlobalCitationNetwork/data/Chengetal_idealist.csv.gz -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/data/diffusion_panel_data_1990_2017.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/GlobalCitationNetwork/data/diffusion_panel_data_1990_2017.csv.gz -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/data/link_prediction_panel_data_1990_2017.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/GlobalCitationNetwork/data/link_prediction_panel_data_1990_2017.csv.gz -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/data/oa_countrycites_nosameorg_auc.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/GlobalCitationNetwork/data/oa_countrycites_nosameorg_auc.csv.gz -------------------------------------------------------------------------------- /examples/GlobalCitationNetwork/vizutils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import matplotlib.gridspec as gridspec 5 | 6 | 7 | def pvalue2stars(pvalue): 8 | if pvalue < 0.001: 9 | return "^{***}" 10 | elif pvalue < 0.01: 11 | return "^{**}" 12 | elif pvalue < 0.05: 13 | return "^{*}" 14 | else: 15 | return "" 16 | 17 | 18 | def table_row(varname='const', offset=0, roun=2, modellist=[], namedict={},i=0,j=1): 19 | 20 | row_text = namedict[varname] + " & "*(offset+1) + " & ".join([ "$" + str(model.params[i][varname].round(roun)) + pvalue2stars(model.pvalues[i][varname]) + "$" for model in modellist[offset:]]) + "\\\\ \n" 21 | 22 | row_text += " & "*(offset+1) + " & ".join([ "$(" + str(model.conf_int().loc[str(j)].loc[varname]['lower'].round(roun)) + "," + str(model.conf_int().loc[str(j)].loc[varname]['upper'].round(roun)) + ")$" for model in modellist[offset:]]) + "\\\\ \n" 23 | row_text += " & "*(offset+1) + " & ".join([ "S.E. $" + str(model.bse[i][varname].round(roun)) + "$; p-v $" +str(model.pvalues[i][varname].round(4)) + "$" for model in modellist[offset:]]) + "\\\\ [0.8ex] \n" 24 | 25 | #row_text += " & "*(offset+1) + " & ".join([ "std. err. 
$" + str(model.std_errors[varname].round(roun)) + "$" for model in modellist[offset:]]) + "\\\\ [0.8ex] \n" 26 | return row_text 27 | 28 | def make_multinomial_latex_table(fit_models, exog_var_sets = [], dep_var = "", namedict = {}, caption_text=""): 29 | 30 | Nmodels = len(exog_var_sets) 31 | 32 | table_text= """{\\tiny 33 | \\begin{longtable}{p{0.2\\linewidth}"""+"p{0.12\\linewidth}"*Nmodels+"}"+"""\\caption{\\textbf{Fixed-effect multinomial logit regression.} Model coefficients labelled by $p$-value. Standard errors in parentheses.} 34 | \\label{table:multinomialfull} \\\\ 35 | \\hline \\hline \\\\ 36 | \\multicolumn{"""+str(Nmodels+1)+"""}{c}{\\textbf{Dependent variable: Citation preference}} \\\\ \\hline 37 | & \\multicolumn{"""+str(Nmodels)+"""}{c}{Model} \\\\""" 38 | 39 | table_text += "\cline{2-" + str(Nmodels + 1) + "}" 40 | 41 | for icol in range(Nmodels): 42 | table_text += "& (" + str(icol + 1) + ")" 43 | 44 | 45 | table_text += """\\\\[0.8ex] 46 | \\hline 47 | \\endfirsthead""" 48 | 49 | table_text += """\\multicolumn{2}{c}% 50 | {{\\tablename\\ \\thetable{} -- continued from previous page}} \\\\ 51 | \\hline \\\\""" 52 | 53 | for icol in range(Nmodels): 54 | table_text += "& (" + str(icol + 1) + ")" 55 | 56 | table_text += """\\\\ 57 | \\hline 58 | \\endhead """ 59 | 60 | table_text += "\\hline"+"&"*(Nmodels+1-2)+ """\\multicolumn{2}{r}{{Continued on next page}} \\\\ \\endfoot 61 | \\hline 62 | \\caption*{} \\\\ 63 | \\endlastfoot""" 64 | 65 | table_text += """$\\mathbf{Citation~Preference: Positive}$ & & & & & \\\\ [1.8ex]""" 66 | 67 | # add constant 68 | table_text += table_row(varname='const', offset=0, roun=2, modellist=fit_models, namedict=namedict,i=0,j=1) 69 | 70 | for offset, varlist in enumerate(exog_var_sets): 71 | for var in varlist: 72 | table_text += table_row(varname=var, offset=offset, roun=2, modellist=fit_models, namedict=namedict,i=0,j=1) 73 | 74 | table_text += """ \\hline \\\\ $\\mathbf{Citation~Preference: Negative}$ & & & & & \\\\ [1.8ex]""" 75 | 76 | table_text += table_row(varname='const', offset=0, roun=2, modellist=fit_models, namedict=namedict,i=1,j=2) 77 | 78 | for offset, varlist in enumerate(exog_var_sets): 79 | for var in varlist: 80 | table_text += table_row(varname=var, offset=offset, roun=2, modellist=fit_models, namedict=namedict,i=1,j=2) 81 | 82 | 83 | 84 | table_text += """\\hline 85 | \\hline \\\\[-1.8ex] 86 | \\textit{Note:} & \\multicolumn{2}{r}{$^{*}p<0.05$; $^{**}p<0.01$; $^{***}p<0.001$} \\\\ \n""" 87 | 88 | table_text += "Observations & " + " & ".join([str(model.nobs) for model in fit_models]) + " \\\\ \n" 89 | table_text += "Pseudo $R^2$ & " + " & ".join([str(np.round(model.prsquared, 4)) for model in fit_models]) + " \\\\ \n" 90 | table_text += "Log Likelihood & " + " & ".join([str(model.llf.round(2)) for model in fit_models]) + " \\\\ \n" 91 | #table_text += "F statistic & " + " & ".join(["$" + str(np.round(model.f_statistic.stat, 2)) + pvalue2stars(model.f_statistic.pval) + "$ (d.f.=" + str(model.f_statistic.df) + ")" for model in fit_models]) + " \\\\ \n" 92 | 93 | # m8.llr,m8.df_model,m8.llr_pvalue 94 | 95 | fstat_text = [] 96 | for model in fit_models: 97 | fstat_text.append("$" + str(np.round(model.llr, 2)) + pvalue2stars(model.llr_pvalue) + "$ (d.f.=" + str(model.df_model) + ")") 98 | 99 | table_text += "LLR $\\chi^2$ & " + " & ".join(fstat_text) + " \\\\ \n" 100 | table_text += "Year FE & " + " & ".join(['Yes' for model in fit_models]) + " \\\\ \n" 101 | 102 | table_text += "\\hline \n\\end{longtable} } }" 103 | 104 | 
print(table_text) -------------------------------------------------------------------------------- /examples/Method_Examples/Example of Credit Allocation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyscisci.all as pyscisci\n", 10 | "\n", 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pylab as plt\n", 14 | "try:\n", 15 | " import seaborn as sns\n", 16 | " sns.set_style('white')\n", 17 | "except:\n", 18 | " pass\n", 19 | "\n", 20 | "%matplotlib inline" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "[0.75 0.25]\n", 33 | "{1: 0, 2: 1}\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "# Fig 1A\n", 39 | "# move d to have ids: 11 - 15\n", 40 | "pub2ref = [[11, 0], [11, 1], [11, 2], [11, 3]] # d1 citations\n", 41 | "pub2ref += [[12, 0], [12, 2], [12, 3]] # d2 citations\n", 42 | "pub2ref += [[13, 0], [13, 2], [13, 4]] # d3 citations\n", 43 | "pub2ref += [[14, 0]] # d4 citations\n", 44 | "pub2ref += [[15, 0], [15, 2], [15, 3], [15, 4]] # d5 citations\n", 45 | "pub2ref = pd.DataFrame(pub2ref , columns = ['CitingPublicationId', 'CitedPublicationId'])\n", 46 | "\n", 47 | "# authors have ids 1 - 8, assume each grey author is different\n", 48 | "pub2authorA = [[0, 1], [0, 2], [1, 1], [2,3], [2,1], [2,4], [2,5], [3,1], [4,6], [4,7], [4,8]]\n", 49 | "pub2authorA = pd.DataFrame(pub2authorA , columns = ['PublicationId', 'AuthorId'])\n", 50 | "\n", 51 | "credit_share, author2int = pyscisci.credit_share(focus_pid=0, \n", 52 | " pub2ref =pub2ref , \n", 53 | " pub2author =pub2authorA , \n", 54 | " temporal=False, \n", 55 | " normed=True,\n", 56 | " show_progress=False)\n", 57 | "\n", 58 | "print(credit_share)\n", 59 | "print(author2int)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "[0.5 0.5]\n", 72 | "{1: 0, 2: 1}\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "# Fig 1B\n", 78 | "\n", 79 | "# notice the difference between 1A and 1B is the authorships of papers 1-4\n", 80 | "# authors have ids 1 - 8, assume each grey author is different\n", 81 | "pub2authorB = [[0,1], [0,2], [1,2], [1,1], [2,2], [2,3], [2,4], [2,1], [3,5], [3,1], [3,2], [4,6], [4,7]]\n", 82 | "pub2authorB = pd.DataFrame(pub2authorB , columns = ['PublicationId', 'AuthorId'])\n", 83 | "\n", 84 | "credit_share, author2int = pyscisci.credit_share(focus_pid=0, \n", 85 | " pub2ref =pub2ref , \n", 86 | " pub2author =pub2authorB , \n", 87 | " temporal=False, \n", 88 | " normed=True,\n", 89 | " show_progress=False)\n", 90 | "\n", 91 | "print(credit_share)\n", 92 | "print(author2int)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "[[0.84615385 0.77777778 0.74193548 0.75 ]\n", 105 | " [0.15384615 0.22222222 0.25806452 0.25 ]]\n", 106 | "{1: 0, 2: 1}\n" 107 | ] 108 | }, 109 | { 110 | "name": "stderr", 111 | "output_type": "stream", 112 | "text": [ 113 | "/Users/ajgates/.pyenv/versions/3.9.0/lib/python3.9/site-packages/tqdm/std.py:699: FutureWarning: The Panel class is removed from pandas. 
Accessing it from the top-level namespace will also be removed in the next version\n", 114 | " from pandas import Panel\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "# now lets make the pub2ref temporal\n", 120 | "pub2year = {11:2014, 12:2015, 13:2015, 14:2018, 15:2020}\n", 121 | "pub2ref ['CitingYear'] = [pub2year.get(pid) for pid in pub2ref ['CitingPublicationId']]\n", 122 | "\n", 123 | "credit_share, author2int, years = pyscisci.credit_share(focus_pid=0, \n", 124 | " pub2ref =pub2ref , \n", 125 | " pub2author =pub2authorA , \n", 126 | " temporal=True, \n", 127 | " normed=True,\n", 128 | " show_progress=False)\n", 129 | "\n", 130 | "print(credit_share)\n", 131 | "print(author2int)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3 (ipykernel)", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.8.12" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /examples/NLP_Examples/Example_Node2vec (umap,sem_axis).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import pandas as pd\n", 11 | "\n", 12 | "import pyscisci.all as pyscisci\n", 13 | "\n", 14 | "from pyscisci.embedding import Node2Vec\n", 15 | "\n", 16 | "path2dblp = '/u/yoonjis/ember_home/DBLP_new' #put yout own DBLP path here\n", 17 | "path2dblp = '/users/hgt6rn/Documents/DataSets/DBLP'\n", 18 | "mydblp = pyscisci.DBLP(path2database= path2dblp, keep_in_memory=False, show_progress=True)\n", 19 | "\n", 20 | "#a2p = mydblp.author2pub " 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# coauthorship network of Albert-Laszlo Barabasi and Mark E. J. Newman\n", 30 | "author = mydblp.author \n", 31 | "target_researcher = ['Albert-Laszlo Barabasi', 'Mark E. J. Newman']\n", 32 | "target_index = list(author[author.FullName.isin(target_researcher)].AuthorId)\n", 33 | "coauthornet, author2int = pyscisci.coauthorship_network(a2p , focus_author_ids = target_index, focus_constraint='ego', show_progress=True)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "# 1. 
Get Node2Vec Embedding" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "\n", 50 | "model = Node2Vec(coauthornet, author2int)\n", 51 | "emb = model.learn_embedding()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "ids = list(author2int.keys())\n", 61 | "id_to_name = author.set_index('AuthorId')['FullName'].to_dict()\n", 62 | "names = [id_to_name[k] for k in author2int.keys()]\n", 63 | "emb_array = [emb[k] for k in ids]" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# 2. 2-D projection of embeddings " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import umap\n", 80 | "fit = umap.UMAP(metric='cosine')\n", 81 | "u = fit.fit_transform(emb_array)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "import plotly.graph_objects as go\n", 91 | "import plotly as py\n", 92 | "import plotly.express as px\n", 93 | "\n", 94 | "df = pd.DataFrame({\n", 95 | " 'x': u[:,0],\n", 96 | " 'y': u[:,1],\n", 97 | " 'name': names\n", 98 | "})\n", 99 | "\n", 100 | "fig = px.scatter(df, x=\"x\", y=\"y\", hover_name=\"name\")\n", 101 | "fig.update_layout(\n", 102 | " autosize=False,\n", 103 | " width=1000,\n", 104 | " height=800,\n", 105 | " )\n", 106 | "fig.update_traces(marker=dict(size=3),\n", 107 | " selector=dict(mode='markers'))\n", 108 | "\n", 109 | "py.offline.plot(fig, filename=\"example_interactive_html/umap_fig.html\", auto_open=False)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "# 3. Sem_axis results" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "For the detail, please read Sem_aixs paper, https://arxiv.org/abs/1806.05521.
\n", 124 | "SemAxis is usually used in word embedding spaces to characterize word semantics along semantic axes, but it can also be applied to networks.
\n", 125 | "In this example, we define axis from two-person (Newman as a negative anchor and Barabasi as a positive anchor). \n", 126 | "Then, we can interpret persons with negative values as more Newman friendly researcher, and the person with positive values as more Barabasi friendly researcher." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "positive_entities = [245542] # Barabasi's vector\n", 136 | "negative_entities = [301349] # Newman's vector" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "sem_aixs_dict = pyscisci.sem_axis(emb, positive_entities , negative_entities)\n", 146 | "sem_axis_array = [sem_aixs_dict[id_] for id_ in ids]" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "df = pd.DataFrame({\n", 156 | " 'sem_axis_result': sem_axis_array,\n", 157 | " 'y': 0,\n", 158 | " 'name': names,\n", 159 | "})" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "fig = px.scatter(df, x=\"sem_axis_result\", y=\"y\", hover_name=\"name\")\n", 169 | "fig.update_layout(\n", 170 | " autosize=False,\n", 171 | " width=1200,\n", 172 | " height=300,\n", 173 | " yaxis={\n", 174 | " 'range': [-0.1, 0.1],\n", 175 | " 'showgrid': False, # thin lines in the background\n", 176 | " 'zeroline': False, # thick line at x=0\n", 177 | " 'visible': False, # numbers below\n", 178 | " },\n", 179 | " xaxis={\n", 180 | " 'showgrid': False, # thin lines in the background\n", 181 | " 'zeroline': False, # thick line at x=0\n", 182 | " }\n", 183 | " \n", 184 | ")\n", 185 | "fig.update_traces(marker=dict(size=3),\n", 186 | " selector=dict(mode='markers'))\n", 187 | "\n", 188 | "py.offline.plot(fig, filename=\"example_interactive_html/sem_axis.html\", auto_open=False)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3 (ipykernel)", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.10.12" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 4 234 | } 235 | -------------------------------------------------------------------------------- /examples/Network_Examples/DeSollaPriceCarrerCitations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/Network_Examples/DeSollaPriceCarrerCitations.pdf -------------------------------------------------------------------------------- /examples/Network_Examples/DiversityCocitiation.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/Network_Examples/DiversityCocitiation.pdf -------------------------------------------------------------------------------- /examples/Network_Examples/Example of Diffusion of Scientific Credit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib inline\n", 10 | "\n", 11 | "import os\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "import networkx as nx\n", 15 | "\n", 16 | "from collections import defaultdict\n", 17 | "\n", 18 | "import scipy.sparse as spsparse\n", 19 | "import matplotlib.pylab as plt\n", 20 | "\n", 21 | "\n", 22 | "import pyscisci.all as pyscisci\n", 23 | "\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "path2aps = '/home/ajgates/APS'\n", 33 | "path2aps = '/Volumes/GatesExpansionDrive/DataSets/APS/APS2019'\n", 34 | "\n", 35 | "myaps = pyscisci.APS(path2aps, keep_in_memory=False)\n", 36 | "\n", 37 | "# NOTE: APS does not contain disambiguated author or affiliation information by default, although researchers \n", 38 | "# have produced their own disambiguation to supplement the raw data\n", 39 | "\n", 40 | "# Here, we include the author disambiguation used in Sinatra et al. (2016)\n", 41 | "# if you didn't already download the file, uncomment the line below\n", 42 | "#myaps.download_from_source(files_to_download='paa_supplement')\n", 43 | "myaps.set_new_data_path(dataframe_name='paa ', new_path='publicationauthoraffiliation_supp2010')\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "application/vnd.jupyter.widget-view+json": { 54 | "model_id": "f07e0e2dafe448ddbe8df9a509b50c5a", 55 | "version_major": 2, 56 | "version_minor": 0 57 | }, 58 | "text/plain": [ 59 | "HBox(children=(HTML(value='Loading Publications'), FloatProgress(value=0.0, max=1.0), HTML(value='')))" 60 | ] 61 | }, 62 | "metadata": {}, 63 | "output_type": "display_data" 64 | }, 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "\n" 70 | ] 71 | }, 72 | { 73 | "data": { 74 | "application/vnd.jupyter.widget-view+json": { 75 | "model_id": "323ac728696c45b8812c45d0e8aac859", 76 | "version_major": 2, 77 | "version_minor": 0 78 | }, 79 | "text/plain": [ 80 | "HBox(children=(HTML(value='Loading pub2ref'), FloatProgress(value=0.0, max=1.0), HTML(value='')))" 81 | ] 82 | }, 83 | "metadata": {}, 84 | "output_type": "display_data" 85 | }, 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "\n" 91 | ] 92 | }, 93 | { 94 | "data": { 95 | "application/vnd.jupyter.widget-view+json": { 96 | "model_id": "a41b406cb9c840c4a517a16c36ef1206", 97 | "version_major": 2, 98 | "version_minor": 0 99 | }, 100 | "text/plain": [ 101 | "HBox(children=(HTML(value='Loading Publication Author Affiliation'), FloatProgress(value=0.0, max=1.0), HTML(v…" 102 | ] 103 | }, 104 | "metadata": {}, 105 | "output_type": "display_data" 106 | }, 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "pub = myaps.pub \n", 117 | "\n", 118 | "# 
limit the publications to those published on/before 1966 \n", 119 | "pub = pub .loc[pub ['Year'] <= 1966]\n", 120 | "\n", 121 | "# get their references\n", 122 | "pub2ref = myaps.load_references(filter_dict={'CitingPublicationId':np.sort(pub ['PublicationId'].unique())})\n", 123 | "\n", 124 | "# and get their authors\n", 125 | "pub2author = myaps.load_publicationauthoraffiliation(columns = ['PublicationId', 'AuthorId', 'FullName'],\n", 126 | " filter_dict={'PublicationId':np.sort(pub ['PublicationId'].unique())})\n", 127 | " \n", 128 | "aid2name = {aid:name for aid, name in pub2author [['AuthorId', 'FullName']].values}\n", 129 | "del pub2author ['FullName']" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 4, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "(22015,)\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "sc, author2int = pyscisci.diffusion_of_scientific_credit(pub2ref , pub2author , \n", 147 | " pub =pub , alpha = 0.9, max_iter = 100, tol = 1.0e-10)\n", 148 | "\n", 149 | "int2aid = {i:aid for aid, i in author2int.items()}\n", 150 | "\n", 151 | "print(sc.shape)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "hans.a..bethe 0.008774540389162876\n", 164 | "john.c..slater 0.008551603901008543\n", 165 | "g...breit 0.007495337492029187\n", 166 | "j..s..schwinger 0.006353349118286071\n", 167 | "eugene.p..wigner 0.005233086936687401\n", 168 | "robert.a..millikan 0.005045043029409964\n", 169 | "robert.s..mulliken 0.004200793335143117\n", 170 | "arthur.h..compton 0.004038834819293501\n", 171 | "irving..langmuir 0.004025411535935083\n", 172 | "john.h..van vleck 0.004013967086421246\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "# print the top k authors\n", 178 | "# Note: here we use an algorithmicly disambiguated author careers. The original paper just\n", 179 | "# disambiguated authors based on unique name. 
So we expect the rankings to differ.\n", 180 | "\n", 181 | "topk = 10\n", 182 | "\n", 183 | "topk_authors = np.argpartition(sc, -topk)[-topk:]\n", 184 | "topk_authors = topk_authors[np.argsort(sc[topk_authors])][::-1]\n", 185 | "\n", 186 | "for int_id in topk_authors:\n", 187 | " print(aid2name[int2aid[int_id]], sc[int_id])\n" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Python 3 (ipykernel)", 201 | "language": "python", 202 | "name": "python3" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.8.12" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 4 219 | } 220 | -------------------------------------------------------------------------------- /examples/Network_Examples/StirlingCocitiation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/examples/Network_Examples/StirlingCocitiation.pdf -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 0 Preparing PySciSci.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The Science of Science\n", 8 | "by: Dashun Wang and Albert-Laszlo Barabasi\n", 9 | "\n", 10 | "[You can get the textbook here.](https://www.amazon.com/Science-Dashun-Wang/dp/1108716954/ref=asc_df_1108716954/?tag=hyprod-20&linkCode=df0&hvadid=459526655425&hvpos=&hvnetw=g&hvrand=10075848530578766295&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9002059&hvtargid=pla-967727027885&psc=1)\n", 11 | "\n", 12 | "The companion notebooks by Alex Gates." 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "To start, we need to download the data and preprocess it. \n", 20 | "These steps only need to be run once, when you first download the data. \n", 21 | "\n", 22 | "Note: We just learned Microsoft will discontinue their support for MAG as of Dec 2021. As other data become available, we will update this code." 
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# load the pyscisci package\n",
32 | "\n",
33 | "import pyscisci.all as pyscisci"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# you should download the MAG data from Microsoft's website:\n",
43 | "# https://www.microsoft.com/en-us/research/project/microsoft-academic-graph/\n",
44 | "\n",
45 | "# set this path to where the MAG database is locally stored\n",
46 | "path2mag = '/home/ajgates/MAG'\n"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# Create a MAG object\n",
56 | "\n",
57 | "mymag = pyscisci.MAG(path2mag, keep_in_memory=False) \n",
58 | "# set keep_in_memory=False if you want to load the database each time it's needed - good for when you \n",
59 | "# can't keep more than one database in memory at a time\n",
60 | "# otherwise keep_in_memory=True will keep each database in memory after it's loaded"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "# before we can start running our analysis, we have to preprocess the raw data into\n",
70 | "# DataFrames that are more convenient to work with\n",
71 | "\n",
72 | "# we only need to run this the first time, but it will take a while\n",
73 | "mymag.preprocess(verbose=True)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "# MAG contains the following dataframes:\n",
83 | "\n",
84 | "# pub_df - keeps all of the publication information\n",
85 | "# columns : ['PublicationId', 'Year', 'JournalId', 'FamilyId', 'Doi', 'Title', 'Date', 'Volume', 'Issue', 'DocType']\n",
86 | "\n",
87 | "# author_df - keeps all of the author information\n",
88 | "# columns : ['AuthorId', 'FullName', 'LastName', 'FirstName', 'MiddleName']\n",
89 | "\n",
90 | "# pub2ref_df - links publications to their references or citations\n",
91 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n",
92 | "\n",
93 | "# paa_df - links publications, authors, and affiliations\n",
94 | "# columns : ['PublicationId', 'AuthorId', 'AffiliationId', 'AuthorSequence', 'OrigAuthorName', 'OrigAffiliationName']\n",
95 | "\n",
96 | "# author2pub_df - links the authors to their publications\n",
97 | "# columns : ['PublicationId', 'AuthorId', 'AuthorOrder']\n",
98 | "\n",
99 | "# field_df - field information\n",
100 | "# columns : ['FieldId', 'FieldLevel', 'NumberPublications', 'FieldName']\n",
101 | "\n",
102 | "# pub2field_df - links publications to their fields\n",
103 | "# columns : ['PublicationId', 'FieldId']\n",
104 | "\n",
105 | "# affiliation_df - affiliation information\n",
106 | "# columns : ['AffiliationId', 'NumberPublications', 'NumberCitations', 'FullName', 'GridId', 'OfficialPage', 'WikiPage', 'Latitude', 'Longitude']\n",
107 | "\n",
108 | "# journal_df - journal information\n",
109 | "# columns : ['JournalId', 'FullName', 'Issn', 'Publisher', 'Webpage']\n",
110 | "\n",
111 | "\n",
112 | "# after additional processing, these DataFrames become available\n",
113 | "\n",
114 | "# pub2refnoself_df - links publications to their references or citations with self-citations removed\n",
115 | "# columns : ['CitingPublicationId', 'CitedPublicationId']\n",
116 | "\n", 
117 | "# impact_df - precomputed citation counts, columns will depend on which counts are computed\n", 118 | "# columns : ['PublicationId', 'Year', ....]" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.9.0" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 2 150 | } 151 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 05 Random Impact Rule.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 06 The Q-Factor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 08 The Increasing Dominance of Teams in Science.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 10 Coauthorship Networks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /examples/ScienceOfScienceTextbook/Chapter 14 Credit Allocation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Chapter 14: Credit Allocation\n", 8 | "\n", 9 | "Note: Here we use the APS dataset with the author disambiguation used in Sinatra et al (2016)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# headers\n", 19 | "import pyscisci.all as pyscisci\n", 20 | "\n", 21 | "import numpy as np\n", 22 | "import scipy.stats as spstats\n", 23 | "\n", 24 | "import matplotlib.pylab as plt\n", 25 | "%matplotlib inline\n", 26 | "\n", 27 | "\n", 28 | "\n", 29 | "# some useful functions and definitions\n", 30 | "red_color = '#f27c96'\n", 31 | "lightblue_color = '#7cd0ea'\n", 32 | "darkblue_color = '#154959'\n", 33 | "green_color = '#93d0aa'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Experimental observation of isolated large transverse energy electrons\n", 50 | "# with associated missing energy at $\\sqrt s$ = 540 GeV\n", 51 | "fig14_1_focus_publicationid = 1674199955\n", 52 | "\n", 53 | "# A novel stereospecific alkenyl-alkenyl cross-coupling by a palladium- or nickel-catalyzed \n", 54 | "# reaction of alkenylalanes with alkenyl halides \n", 55 | "fig14_2b_publicationid = 2047184383\n", 56 | "\n", 57 | "# Electric Field Effect in Atomically Thin Carbon Films \n", 58 | "fig14_2c_publicationid = 2058122340" 59 | ] 60 | } 61 | ], 62 | "metadata": { 63 | "kernelspec": { 64 | "display_name": "Python 3", 65 | "language": "python", 66 | "name": "python3" 67 | }, 68 | "language_info": { 69 | "codemirror_mode": { 70 | "name": "ipython", 71 | "version": 3 72 | }, 73 | "file_extension": ".py", 74 | "mimetype": "text/x-python", 75 | "name": "python", 76 | "nbconvert_exporter": "python", 77 | "pygments_lexer": "ipython3", 78 | "version": "3.9.0" 79 | } 80 | }, 81 | "nbformat": 4, 82 | "nbformat_minor": 4 83 | } 84 | -------------------------------------------------------------------------------- /examples/example_data/fenn_paa.csv: -------------------------------------------------------------------------------- 1 | PublicationId,AuthorId,AffiliationId 2 | 14060727,2168590209,184840846.0 3 | 24099381,2168590209,32971472.0 4 | 36085056,2168590209, 5 | 60025733,2168590209, 6 | 66859150,2168590209, 7 | 132191034,2168590209, 8 | 144824799,2168590209,32971472.0 9 | 205131131,2168590209,32971472.0 10 | 250833172,2168590209, 11 | 305406204,2168590209, 12 | 954622638,2168590209, 13 | 976761365,2168590209,184840846.0 14 | 1036084810,2168590209, 15 | 1484230275,2168590209, 16 | 1508438708,2168590209,32971472.0 17 | 1518033024,2168590209,32971472.0 18 | 1555842123,2168590209, 19 | 1670759301,2168590209, 20 | 1684005362,2168590209,1313525311.0 21 | 1831455963,2168590209, 22 | 1937746889,2168590209,32971472.0 23 | 1967173439,2168590209,184840846.0 24 | 1969621117,2168590209, 25 | 1978232283,2168590209, 26 | 1978249208,2168590209, 27 | 1978859882,2168590209, 28 | 1980069844,2168590209,32971472.0 29 | 1980512427,2168590209,184840846.0 30 | 1980679837,2168590209, 31 | 1983689926,2168590209,184840846.0 32 | 1986330843,2168590209, 33 | 1994617550,2168590209,32971472.0 34 | 2000983103,2168590209, 35 | 2001302835,2168590209, 36 | 2005761128,2168590209,32971472.0 37 | 2015660202,2168590209,184840846.0 38 | 2016131879,2168590209, 39 | 2016297089,2168590209,32971472.0 40 | 2016717705,2168590209,32971472.0 41 | 2017661443,2168590209, 42 | 2024220877,2168590209,32971472.0 43 | 2024397888,2168590209,184840846.0 44 | 2026682897,2168590209, 45 | 
2027129137,2168590209,32971472.0 46 | 2030043624,2168590209, 47 | 2036760271,2168590209, 48 | 2036940947,2168590209,32971472.0 49 | 2038636345,2168590209, 50 | 2044177896,2168590209,32971472.0 51 | 2046888612,2168590209,184840846.0 52 | 2050417652,2168590209,184840846.0 53 | 2056243786,2168590209,184840846.0 54 | 2056481117,2168590209,32971472.0 55 | 2058068992,2168590209,32971472.0 56 | 2062681226,2168590209,184840846.0 57 | 2065072453,2168590209, 58 | 2065077709,2168590209, 59 | 2065591377,2168590209, 60 | 2073292063,2168590209,32971472.0 61 | 2081767399,2168590209, 62 | 2089382829,2168590209,32971472.0 63 | 2092926413,2168590209, 64 | 2095344572,2168590209,184840846.0 65 | 2099545637,2168590209,184840846.0 66 | 2106493299,2168590209, 67 | 2119931004,2168590209,32971472.0 68 | 2141946863,2168590209,184840846.0 69 | 2143409410,2168590209,32971472.0 70 | 2153339430,2168590209,184840846.0 71 | 2154912073,2168590209,184840846.0 72 | 2162316463,2168590209, 73 | 2188402146,2168590209, 74 | 2242689142,2168590209, 75 | 2577637059,2168590209, 76 | 2601451314,2168590209, 77 | 2625037326,2168590209, 78 | 2802810717,2168590209,1313525311.0 79 | 2884933832,2168590209,32971472.0 80 | 2952384988,2168590209,184840846.0 81 | 2952841889,2168590209,32971472.0 82 | 2960356156,2168590209,184840846.0 83 | 2999970405,2168590209, 84 | 3053325085,2168590209, 85 | -------------------------------------------------------------------------------- /pyscisci/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'pyscisci' 2 | __title__ = 'pyscisci: A python package for the science of science' 3 | __description__ = 'Lets study science!' 4 | 5 | __copyright__ = '2021, Gates, A.J.' 6 | 7 | __author__ = """\n""".join([ 8 | 'Alexander J Gates ' 9 | ]) 10 | 11 | __version__ = '0.92' 12 | __release__ = '0.92' 13 | -------------------------------------------------------------------------------- /pyscisci/all.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. module:: all 4 | :synopsis: easy interface to all of pyscisci 5 | 6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | 
9 | from pyscisci.utils import *
10 | from pyscisci.methods.publication import *
11 | from pyscisci.methods.journal import *
12 | from pyscisci.methods.author import *
13 | from pyscisci.methods.referencestrength import *
14 | from pyscisci.datasource.readwrite import load_preprocessed_data, append_to_preprocessed
15 | from pyscisci.network import *
16 | from pyscisci.sparsenetworkutils import *
17 | from pyscisci.nlp import *
18 | from pyscisci.datasource.MAG import MAG
19 | from pyscisci.datasource.WOS import WOS
20 | from pyscisci.datasource.DBLP import DBLP
21 | from pyscisci.datasource.APS import APS
22 | from pyscisci.datasource.PubMed import PubMed
23 | from pyscisci.datasource.OpenAlex import OpenAlex
24 | from pyscisci.datasource.CustomDB import CustomDB
25 | from pyscisci.filter import *
26 | from pyscisci.visualization import *
27 | 
-------------------------------------------------------------------------------- /pyscisci/datasource/CustomDB.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import gzip
5 | import zipfile
6 | 
7 | import pandas as pd
8 | import numpy as np
9 | from nameparser import HumanName
10 | 
11 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
12 | if 'ipykernel' in sys.modules:
13 | from tqdm.notebook import tqdm
14 | else:
15 | from tqdm import tqdm
16 | 
17 | from pyscisci.datasource.readwrite import load_preprocessed_data, load_int, load_float, load_html_str
18 | from pyscisci.database import BibDataBase
19 | from pyscisci.utils import download_file_from_google_drive
20 | 
21 | # hide these annoying performance warnings
22 | import warnings
23 | warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
24 | 
25 | 
26 | class CustomDB(BibDataBase):
27 | """
28 | Base class for creating a CustomDB.
29 | 
30 | """
31 | 
32 | def __init__(self, path2database = '', database_extension='csv.gz', keep_in_memory = False, global_filter=None,
33 | enable_dask=False, show_progress=True):
34 | 
35 | self._default_init(path2database, database_extension, keep_in_memory, global_filter, enable_dask, show_progress)
36 | 
37 | self.PublicationIdType = int
38 | self.AffiliationIdType = int
39 | self.AuthorIdType = int
40 | self.JournalIdType = int
41 | 
42 | def set_new_data_paths(self, new_path_dict={}):
43 | """
44 | Override the paths to the dataframe collections based on a new custom hierarchy.
45 | 
46 | Parameters
47 | --------
48 | new_path_dict : dict
49 | A dictionary where each key is a dataframe name to override, e.g. 'author', 'pub', 'paa', 'pub2field', etc.,
50 | and each item is the new dataframe path.
51 | 
52 | """
53 | for dfname, new_path in new_path_dict.items():
54 | self.set_new_data_path(dfname, new_path)
55 | 
-------------------------------------------------------------------------------- /pyscisci/datasource/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/pyscisci/datasource/__init__.py -------------------------------------------------------------------------------- /pyscisci/filter.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: filter
4 | :synopsis: filters for restricting analyses to subsets of the database
5 | 
6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | 
9 | class RangeFilter():
10 | 
11 | def __init__(self, field, min_value=None, max_value=None):
12 | 
13 | self.field = field
14 | self.min = min_value
15 | self.max = max_value
16 | 
17 | self.check_value = lambda x: False
18 | 
19 | if not self.min is None and not self.max is None:
20 | self.check_value = self.fullrange
21 | 
22 | elif not self.min is None and self.max is None:
23 | self.check_value = self.lowerbound
24 | 
25 | elif self.min is None and not self.max is None:
26 | self.check_value = self.upperbound
27 | 
28 | else:
29 | raise ValueError("One of min_value or max_value must be set.")
30 | 
31 | 
32 | def fullrange(self, value):
33 | return (value >= self.min) and (value <= self.max)
34 | 
35 | def lowerbound(self, value):
36 | return (value >= self.min)
37 | 
38 | def upperbound(self, value):
39 | return (value <= self.max)
40 | 
41 | class SetFilter():
42 | 
43 | def __init__(self, field, value_set=None):
44 | 
45 | self.field = field
46 | self.value_set = set(value_set) if not value_set is None else set()
47 | 
48 | def check_value(self, value):
49 | return value in self.value_set
50 | 
51 | class YearFilter(RangeFilter):
52 | 
53 | def __init__(self, min_year=None, max_year=None):
54 | 
55 | self.field = 'Year'
56 | self.min = min_year
57 | self.max = max_year
58 | 
59 | self.check_value = lambda x: False
60 | 
61 | if not self.min is None and not self.max is None:
62 | self.check_value = self.fullrange
63 | 
64 | elif not self.min is None and self.max is None:
65 | self.check_value = self.lowerbound
66 | 
67 | elif self.min is None and not self.max is None:
68 | self.check_value = self.upperbound
69 | 
70 | class DocTypeFilter(SetFilter):
71 | 
72 | def __init__(self, doctypes=[]):
73 | 
74 | self.field = 'DocType'
75 | self.value_set=set(doctypes)
76 | 
77 | class FieldFilter(SetFilter):
78 | 
79 | def __init__(self, valid_fields=[]):
80 | 
81 | self.field = 'FieldId'
82 | self.value_set=set(valid_fields)
83 | 
84 | class JournalFilter(SetFilter):
85 | 
86 | def __init__(self, valid_journals=[]):
87 | 
88 | self.field = 'JournalId'
89 | self.value_set=set(valid_journals)
90 | 
91 | 
92 | 
-------------------------------------------------------------------------------- /pyscisci/methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/pyscisci/methods/__init__.py -------------------------------------------------------------------------------- /pyscisci/methods/careertopics.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: career topics
4 | :synopsis: Calculate the co-citing network and detect career communities.
5 | 
6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import sys
10 | 
11 | import pandas as pd
12 | import numpy as np
13 | import networkx as nx
14 | import scipy.sparse as spsparse
15 | 
16 | 
17 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
18 | if 'ipykernel' in sys.modules:
19 | from tqdm.notebook import tqdm
20 | else:
21 | from tqdm import tqdm
22 | 
23 | from pyscisci.utils import isin_sorted
24 | from pyscisci.network import cociting_network
25 | from pyscisci.sparsenetworkutils import largest_connected_component_vertices
26 | 
27 | ## Career Cociting Network
28 | def career_cociting_network_topics(paa, pub2ref, randomize=None, return_network=False, show_progress=False):
29 | """
30 | This function calculates the topics throughout a career based on the co-citing network (two publications are linked if they share a reference).
31 | See :cite:`zeng2019topicswitch` for details.
32 | 
33 | Parameters
34 | ----------
35 | paa : dataframe
36 | The publication author affiliation linkages for the focus author.
37 | 
38 | pub2ref : dataframe
39 | The citing-cited publication linkages which contains the citing articles from the focus author.
40 | 
41 | randomize : int, default None
42 | Seed for the random initialization of the community detection algorithm.
43 | 
44 | return_network : bool, default False
45 | Return the networkx object of the co-citing network.
46 | 
47 | show_progress : bool, default False
48 | Show calculation progress.
49 | 
50 | Returns
51 | ----------
52 | switching_career : DataFrame
53 | The paa with topic number included (topics are detected using the Louvain algorithm in the co-citing network).
54 | 
55 | cociting_net : networkx.Graph(), optional
56 | If 'return_network == True' then the cociting network is returned as a networkx graph object. 
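Examples
--------
A minimal usage sketch (``paa`` and ``pub2ref`` are assumed to already hold the focus author's publication-author linkages and citing references, loaded beforehand from one of the data sources; the variable names here are illustrative only):

>>> switching_career = career_cociting_network_topics(paa, pub2ref, randomize=0)
>>> switching_career.groupby('TopicCommunity')['PublicationId'].nunique()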
57 | 58 | """ 59 | try: 60 | from cdlib import algorithms 61 | except ImportError: 62 | raise ImportError("Optional package cdlib needed for this analysis: pip install cdlib") 63 | 64 | try: 65 | from clusim.clustering import Clustering 66 | except ImportError: 67 | raise ImportError("Optional package clusim needed for this analysis: pip install clusim") 68 | 69 | focus_pub_ids = np.sort(paa['PublicationId'].unique()) 70 | 71 | # find the co-citing network 72 | cociting_adjmat, cociting2int = cociting_network(pub2ref, focus_pub_ids=focus_pub_ids, 73 | focus_constraint='citing', 74 | cited_col_name = 'CitedPublicationId', 75 | citing_col_name = 'CitingPublicationId') 76 | 77 | # now take the largest connected component 78 | lcc_nodes = largest_connected_component_vertices(cociting_adjmat) 79 | 80 | remapnodes = {nid:i for i, nid in enumerate(lcc_nodes)} 81 | cociting2int = {pid:remapnodes[i] for pid, i in cociting2int.items() if not remapnodes.get(i, None) is None} 82 | 83 | lcc_cociting_adjmat = spsparse.csr_matrix(cociting_adjmat)[lcc_nodes][:,lcc_nodes] 84 | 85 | # remove self-loops and binarize 86 | lcc_cociting_adjmat.setdiag(0) 87 | lcc_cociting_adjmat.data[lcc_cociting_adjmat.data >1] = 1 88 | lcc_cociting_adjmat.eliminate_zeros() 89 | 90 | lcc_cociting_net = nx.Graph(lcc_cociting_adjmat) 91 | coms = algorithms.louvain(lcc_cociting_net, resolution=1., randomize=randomize) 92 | louvain_communities = Clustering().from_cluster_list(coms.communities) 93 | 94 | pub2topiccomm = {pid:list(louvain_communities.elm2clu_dict[pid])[0] for pid in lcc_cociting_net.nodes()} 95 | 96 | switching_career = paa[['PublicationId', 'AuthorId', 'Year']].copy() 97 | switching_career.drop_duplicates(subset=['PublicationId'], inplace=True) 98 | switching_career = switching_career.loc[isin_sorted(switching_career['PublicationId'].values, np.sort(list(cociting2int.keys())))] 99 | 100 | switching_career['TopicCommunity'] = [pub2topiccomm[cociting2int[pid]] for pid in switching_career['PublicationId'].values] 101 | 102 | switching_career['Degree'] = [lcc_cociting_net.degree()[cociting2int[pid]] for pid in switching_career['PublicationId'].values] 103 | 104 | switching_career.dropna(inplace=True) 105 | 106 | switching_career.sort_values('Year', inplace=True) 107 | 108 | if return_network: 109 | nx.set_node_attributes(lcc_cociting_net, pub2topiccomm, "TopicCommunity") 110 | nx.set_node_attributes(lcc_cociting_net, {i:pid for pid,i in cociting2int.items()}, "PublicationId") 111 | 112 | return switching_career, lcc_cociting_net 113 | else: 114 | return switching_career 115 | -------------------------------------------------------------------------------- /pyscisci/methods/cindex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. module:: cindex 4 | :synopsis: Calculate the cindex. 5 | 6 | .. moduleauthor:: Alex Gates 7 | """ 8 | 9 | import sys 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | # determine if we are loading from a jupyter notebook (to make pretty progress bars) 15 | if 'ipykernel' in sys.modules: 16 | from tqdm.notebook import tqdm 17 | else: 18 | from tqdm import tqdm 19 | 20 | from pyscisci.utils import zip2dict 21 | 22 | 23 | def compute_cindex(df, colgroupby, colcountby, show_progress=False): 24 | """ 25 | Calculate the cindex for each group in the DataFrame (the number of citations to the maximum cited publication). 26 | See :cite:`Waltman2008index` for detailed definition. 
27 | 
28 | Parameters
29 | ----------
30 | :param df : DataFrame
31 | A DataFrame with the citation information for each Author.
32 | 
33 | :param colgroupby : str
34 | The DataFrame column with Author Ids.
35 | 
36 | :param colcountby : str
37 | The DataFrame column with Citation counts for each publication.
38 | 
39 | Returns
40 | -------
41 | DataFrame
42 | DataFrame with 2 columns: colgroupby, 'Cindex'
43 | 
44 | """
45 | # register our pandas apply with tqdm for a progress bar
46 | tqdm.pandas(desc='cindex', disable= not show_progress)
47 | 
48 | newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Cindex']*2)
49 | return df.groupby(colgroupby, sort=False)[colcountby].max().to_frame().reset_index().rename(columns=newname_dict)
50 | 
51 | def compute_pindex(df, colgroupby, colcountby, show_progress=False):
52 | """
53 | Calculate the pindex for each group in the DataFrame (the number of publications with >0 citations).
54 | See :cite:`Waltman2008index` for detailed definition.
55 | 
56 | Parameters
57 | ----------
58 | :param df : DataFrame
59 | A DataFrame with the citation information for each Author.
60 | 
61 | :param colgroupby : str
62 | The DataFrame column with Author Ids.
63 | 
64 | :param colcountby : str
65 | The DataFrame column with Citation counts for each publication.
66 | 
67 | Returns
68 | -------
69 | DataFrame
70 | DataFrame with 2 columns: colgroupby, 'Pindex'
71 | 
72 | """
73 | # register our pandas apply with tqdm for a progress bar
74 | tqdm.pandas(desc='pindex', disable= not show_progress)
75 | 
76 | newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Pindex']*2)
77 | return df.groupby(colgroupby, sort=False)[colcountby].apply(lambda s: (s > 0).sum()).to_frame().reset_index().rename(columns=newname_dict)
78 | 
-------------------------------------------------------------------------------- /pyscisci/methods/creditshare.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: credit sharing
4 | :synopsis: Set of functions for calculating credit share amongst authors.
5 | 
6 | .. moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import pandas as pd
10 | import numpy as np
11 | 
12 | from pyscisci.utils import isin_sorted, groupby_count
13 | from pyscisci.network import cocitation_network
14 | 
15 | def credit_share(focus_pid, pub2ref, pub2author, temporal=False, normed=False, show_progress=False):
16 | """
17 | Calculate the credit share for each author of a publication based on :cite:`Shen2014credit`.
18 | 
19 | Parameters
20 | ----------
21 | :param focus_pid : int, str
22 | The focus publication id.
23 | 
24 | :param pub2ref : DataFrame
25 | A DataFrame with the citation information for each Publication.
26 | 
27 | :param pub2author : DataFrame
28 | A DataFrame with the author information for each Publication.
29 | 
30 | :param temporal : bool, default False
31 | If True, compute the adjacency matrix using only publications for each year.
32 | 
33 | :param normed : bool, default False
34 | Normalize the sum of credit share to 1.0
35 | 
36 | :param show_progress : bool, default False
37 | If True, show a progress bar tracking the calculation.
38 | 
39 | Returns
40 | -------
41 | credit_share, numpy array
42 | If temporal == False:
43 | The credit share vector, with one entry for each author of the focus publication.
44 | 
45 | If temporal == True:
46 | A matrix of credit shares, with a row for each author of the focus publication and a column for each
47 | citing year (credit accumulates over the years). 
48 | 49 | author2int, dict 50 | A mapping of the AuthorIds from the focus publication to the column of the credit share vector or matrix (see above). 51 | 52 | """ 53 | 54 | # the focus publication's authors 55 | focus_authors = np.sort(pub2author.loc[pub2author['PublicationId']==focus_pid]['AuthorId'].unique()) 56 | author2int = {aid:i for i, aid in enumerate(focus_authors)} 57 | 58 | if focus_authors.shape[0] > 1: 59 | # start by getting the co-citation network around the focus publication 60 | adj_mat, cited2int = cocitation_network(pub2ref, focus_pub_ids=np.sort([focus_pid]), focus_constraint='egocited', 61 | temporal=temporal, show_progress=show_progress) 62 | 63 | # get the authorships for the publications in the cocitation network 64 | cocited_pubs = np.sort(list(cited2int.keys())) 65 | pa = pub2author.loc[isin_sorted(pub2author['PublicationId'].values, cocited_pubs)] 66 | 67 | if cocited_pubs.shape[0] > 0: 68 | # the credit allocation matrix has a row for each focus author, and a column for each cocited publication (including the focus pub) 69 | credit_allocation_mat = np.zeros((focus_authors.shape[0], cocited_pubs.shape[0]), dtype = float) 70 | 71 | # for each cocited publication, we count the number of authors 72 | # and assign to each focus author, their fractional share of the credit (1 divided by the number of authors) 73 | for cocitedid, adf in pa.groupby('PublicationId'): 74 | author2row = [author2int[aid] for aid in adf['AuthorId'].unique() if not author2int.get(aid, None) is None] 75 | if len(author2row) > 0: 76 | credit_allocation_mat[author2row, cited2int[cocitedid]] = 1.0/adf['AuthorId'].nunique() 77 | 78 | if temporal: 79 | # temporal credit allocation - broken down by year 80 | 81 | # we need the temporal citations to the focus article 82 | focus_citations = groupby_count(pub2ref.loc[isin_sorted(pub2ref['CitedPublicationId'].values, np.sort([focus_pid]))], 83 | colgroupby='CitingYear', colcountby='CitingPublicationId', count_unique=True, show_progress=False) 84 | focus_citations={y:c for y,c in focus_citations[['CitingYear', 'CitingPublicationIdCount']].values} 85 | 86 | # when temporal is True, a temporal adj mat is returned where each key is the year 87 | years = np.sort(list(adj_mat.keys())) 88 | 89 | cocite_counts = np.zeros((years.shape[0], cocited_pubs.shape[0]), dtype=float) 90 | 91 | for iy, y in enumerate(years): 92 | cocite_counts[iy] = adj_mat[y].tocsr()[cited2int[focus_pid]].todense()#set the off-diagonal to be the total co-citations from that year 93 | cocite_counts[iy, cited2int[focus_pid]] = focus_citations[y] #set the diagonal to be the total citations from that year 94 | 95 | cocite_counts = cocite_counts.cumsum(axis=0) 96 | 97 | else: 98 | # just do credit allocation with the full cocitation matrix 99 | cocite_counts = adj_mat.tocsr()[cited2int[focus_pid]].todense() 100 | 101 | # the co-citation matrix misses the number of citations to the focus publication 102 | # so explicitly calculate the number of citations to the focus publication 103 | cocite_counts[0,cited2int[focus_pid]] = pub2ref.loc[isin_sorted(pub2ref['CitedPublicationId'].values, np.sort([focus_pid]))]['CitingPublicationId'].nunique() 104 | 105 | # credit share is the matrix product of the credit_allocation_mat with cocite_counts 106 | credit_share = np.squeeze(np.asarray(credit_allocation_mat.dot(cocite_counts.T))) 107 | 108 | # normalize the credit share vector to sum to 1 109 | if normed: 110 | credit_share = credit_share/credit_share.sum(axis=0) 111 | 112 | if temporal: 113 | return 
credit_share, author2int, years
114 | else:
115 | return credit_share, author2int
116 | else:
117 | if temporal:
118 | years = np.sort(pub2ref.loc[pub2ref['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
119 | return np.array([[None for y in years] for a in author2int]), author2int, years
120 | else:
121 | return np.array([None for a in author2int]), author2int
122 | 
123 | elif focus_authors.shape[0] == 1:
124 | if temporal:
125 | years = np.sort(pub2ref.loc[pub2ref['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
126 | return np.ones(shape=(1,years.shape[0])), author2int, years
127 | else:
128 | return np.array([1.0]), author2int
-------------------------------------------------------------------------------- /pyscisci/methods/diffusionscientificcredit.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: diffusionscientificcredit
4 | :synopsis: Rank authors based on the pagerank within their citation graph.
5 | 
6 | .. moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import pandas as pd
10 | import numpy as np
11 | 
12 | from pyscisci.utils import isin_sorted, groupby_count, groupby_total
13 | from pyscisci.network import cocitation_network
14 | from pyscisci.sparsenetworkutils import dataframe2bipartite, sparse_pagerank_scipy
15 | 
16 | def diffusion_of_scientific_credit(pub2ref, pub2author, pub=None, alpha = 0.9, max_iter = 100, tol = 1.0e-10):
17 | """
18 | Calculate the diffusion of scientific credits for each author based on :cite:`Radicchi2009authorpagerank`.
19 | 
20 | Parameters
21 | ----------
22 | 
23 | :param pub2ref : DataFrame
24 | A DataFrame with the citation information for each Publication.
25 | 
26 | :param pub2author : DataFrame
27 | A DataFrame with the author information for each Publication.
28 | 
29 | :param pub : DataFrame
30 | A DataFrame with the publication information for each Publication. If it contains a 'TeamSize' column, those counts are used directly.
31 | 
32 | :param alpha : float, default 0.9
33 | The PageRank reset probability
34 | 
35 | :param max_iter : int, default 100
36 | The maximum number of iterations when applying the power method.
37 | 
38 | :param tol : float, default 1.0e-10
39 | The error tolerance when applying the power method.
40 | 
41 | Returns
42 | -------
43 | sc, numpy array
44 | The diffusion of scientific credit (PageRank) score for each author.
45 | 
46 | author2int, dict
47 | A mapping of the AuthorIds onto the indices of the score vector sc.
48 | The top-ranked authors can be recovered by inverting the mapping,
49 | int2aid = {i:aid for aid, i in author2int.items()},
50 | and sorting sc.
51 | 
52 | 
53 | 
54 | """
55 | 
56 | """
57 | Diffusion of Scientific Credits and the Ranking of Scientists,
58 | Radicchi et al. (2009) Phys Rev E.
59 | 
60 | 
61 | The method builds an author-level citation network: each publication-to-publication citation
62 | is expanded into all of its citing-author/cited-author pairs, and each such edge is
63 | weighted by 1/(teamsize of the citing paper * teamsize of the cited paper).
64 | 
65 | PageRank, personalized by each author's fractional productivity, is then run on this
66 | weighted author network to obtain each author's share of scientific credit.
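Example (mirroring the APS ranking notebook earlier in this repository, where pub,
pub2ref and pub2author were loaded beforehand from an APS database object):

>>> sc, author2int = diffusion_of_scientific_credit(pub2ref, pub2author, pub=pub,
...     alpha=0.9, max_iter=100, tol=1.0e-10)
>>> int2aid = {i: aid for aid, i in author2int.items()}  # map indices back to AuthorIds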
67 | """ 68 | 69 | # relabel the authors to map to network nodes 70 | focus_authors = np.sort(pub2author['AuthorId'].unique()) 71 | author2int = {aid:i for i, aid in enumerate(focus_authors)} 72 | Nauthors = len(author2int) 73 | 74 | pub2author.drop_duplicates(subset=['PublicationId', 'AuthorId'], inplace=True) 75 | pub2author['AuthorId'] = [author2int.get(aid, None) for aid in pub2author['AuthorId'].values] 76 | 77 | # check if we are given the teamsize in publication information 78 | if (not pub is None) and 'TeamSize' in list(pub): 79 | teamsize = {pid:ts for pid, ts in pub[['PublicationId', 'TeamSize']].values} 80 | 81 | # otherwies we need to calculate teamsize based on the authorship information 82 | else: 83 | teamsize = pub2author.groupby('PublicationId')['AuthorId'].nunique() 84 | 85 | 86 | full_citation = pub2ref.merge(pub2author[['PublicationId', 'AuthorId']], left_on = 'CitingPublicationId', right_on = 'PublicationId') 87 | del full_citation['PublicationId'] 88 | full_citation.rename(columns={'AuthorId':'CitingAuthorId'}, inplace=True) 89 | 90 | full_citation = full_citation.merge(pub2author[['PublicationId', 'AuthorId']], left_on = 'CitedPublicationId', right_on = 'PublicationId') 91 | del full_citation['PublicationId'] 92 | full_citation.rename(columns={'AuthorId':'CitedAuthorId'}, inplace=True) 93 | 94 | full_citation.dropna(inplace=True) 95 | 96 | # now add in the teamsize information to make edge weights 97 | full_citation['edge_weight'] = [1.0/(teamsize.get(citing_pid, 1) * teamsize.get(cited_pid, 1)) for citing_pid, cited_pid in full_citation[['CitingPublicationId', 'CitedPublicationId']].values] 98 | 99 | 100 | adj_mat = dataframe2bipartite(full_citation, rowname='CitingAuthorId', colname='CitedAuthorId', 101 | shape = (Nauthors,Nauthors), weightname = 'edge_weight') 102 | 103 | 104 | # make the weighted productivity vector to intialize the pagerank 105 | pub2author['AuthorCredit'] = [1/teamsize.get(pid, 1) for pid in pub2author['PublicationId'].values] 106 | weighted_productivity = groupby_total(pub2author, colgroupby = 'AuthorId', colcountby = 'AuthorCredit').sort_values('AuthorId') 107 | # norm vector 108 | weighted_productivity['AuthorCreditTotal'] = weighted_productivity['AuthorCreditTotal'] / weighted_productivity['AuthorCreditTotal'].sum() 109 | 110 | # run the power method to solve the diffusion 111 | sc = sparse_pagerank_scipy(adj_mat, alpha= alpha, 112 | personalization=weighted_productivity['AuthorCreditTotal'].values, 113 | initialization=weighted_productivity['AuthorCreditTotal'].values, 114 | max_iter=max_iter, tol=tol, dangling=None) 115 | 116 | return sc, author2int 117 | -------------------------------------------------------------------------------- /pyscisci/methods/disruption.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. module:: distruption index 4 | :synopsis: Set of functions for finding the disruption index. 5 | 6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | import sys
9 | import pandas as pd
10 | import numpy as np
11 | 
12 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
13 | if 'ipykernel' in sys.modules:
14 | from tqdm.notebook import tqdm
15 | else:
16 | from tqdm import tqdm
17 | 
18 | ### Disruption
19 | def disruption_index(pub2ref, focus_pub_ids = None, cite_window = None, ref_window = None, show_progress=False):
20 | """
21 | Calculate the disruption index as first proposed in :cite:`Funk2017disrupt` and used in :cite:`Wu2019teamsdisrupt`.
22 | We also include the windowed disruption index used in :cite:`Park2023timedisrupt`.
23 | 
24 | Parameters
25 | ----------
26 | 
27 | :param pub2ref : DataFrame
28 | A DataFrame with the citation information for each Publication.
29 | 
30 | :param focus_pub_ids : numpy array
31 | A subset of publication ids to focus on for the disruption index.
32 | 
33 | :param cite_window : list of two ints, default None
34 | If None, no citation window is applied.
35 | If [l, u], where l and u are ints, then only citations whose year difference is greater than or equal to the lower bound l and
36 | less than or equal to the upper bound u are used, e.g. [0,5] uses citations within 5 years of publication (and not before publication).
37 | 
38 | :param ref_window : list of two ints, default None
39 | If None, no reference window is applied.
40 | If [l, u], where l and u are ints, then only references whose year difference is greater than or equal to the lower bound l and
41 | less than or equal to the upper bound u are used, e.g. [0,5] uses references within 5 years of publication (and not after).
42 | 
43 | show_progress : bool, default False
44 | Show calculation progress.
45 | 
46 | Returns
47 | -------
48 | disruption : DataFrame
49 | A DataFrame with the disruption index for all (cited) publications or publications from the focus_pub_ids list. 
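Notes
-----
Following the implementation below: for a focus publication, let n_i be the number of
citing publications that cite the focus publication but none of its references, n_j the
number that cite both the focus publication and at least one of its references, and n_k
the number that cite at least one of its references but not the focus publication itself.
The disruption index is then

    D = (n_i - n_j) / (n_i + n_j + n_k)

so that D approaches 1 for a maximally disruptive publication and -1 for a maximally
consolidating one.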
50 | 51 | 52 | """ 53 | if show_progress: 54 | print("Starting computation of disruption index.") 55 | 56 | if ref_window is None: 57 | reference_groups = pub2ref.groupby('CitingPublicationId', sort = False)['CitedPublicationId'] 58 | else: 59 | ref_sub = [ ((y1-y2) >= ref_window[0] and (y1-y2) <=ref_window[1]) for y1,y2 in pub2ref[['CitingYear', 'CitedYear']].values] 60 | reference_groups = pub2ref.loc[ref_sub].groupby('CitingPublicationId', sort = False)['CitedPublicationId'] 61 | 62 | if cite_window is None: 63 | citation_groups = pub2ref.groupby('CitedPublicationId', sort = False)['CitingPublicationId'] 64 | else: 65 | cite_sub = [ ((y1-y2) >= cite_window[0] and (y1-y2) <=cite_window[1]) for y1,y2 in pub2ref[['CitingYear', 'CitedYear']].values] 66 | citation_groups = pub2ref.loc[cite_sub].groupby('CitedPublicationId', sort = False)['CitingPublicationId'] 67 | 68 | if focus_pub_ids is None: 69 | if cite_window is None: 70 | focus_pub_ids = pub2ref['CitedPublicationId'].unique() 71 | else: 72 | focus_pub_ids = pub2ref.loc[cite_sub]['CitedPublicationId'].unique() 73 | 74 | def get_citation_groups(pid): 75 | try: 76 | return citation_groups.get_group(pid).values 77 | except KeyError: 78 | return np.array([]) 79 | 80 | def _disruption_index(focusid): 81 | 82 | # if the focus publication has no references or citations, then it has a disruption of None 83 | try: 84 | focusref = reference_groups.get_group(focusid) 85 | except KeyError: 86 | return None 87 | 88 | try: 89 | citing_focus = citation_groups.get_group(focusid) 90 | except KeyError: 91 | return None 92 | 93 | 94 | # implementation 1: keep it numpy 95 | #cite2ref = reduce(np.union1d, [get_citation_groups(refid) for refid in focusref]) 96 | #nj = np.intersect1d(cite2ref, citing_focus.values).shape[0] 97 | #nk = cite2ref.shape[0] - nj 98 | 99 | # implementation 2: but dicts are faster... 100 | cite2ref = {citeid:1 for refid in focusref for citeid in get_citation_groups(refid)} 101 | nj = sum(cite2ref.get(pid, 0) for pid in citing_focus.values ) 102 | nk = len(cite2ref) - nj 103 | 104 | ni = citing_focus.shape[0] - nj 105 | 106 | return float(ni - nj)/(ni + nj + nk) 107 | 108 | disrupt = [[focusciting, _disruption_index(focusciting)] for focusciting 109 | in tqdm(focus_pub_ids, leave=True, desc='Disruption Index', disable= not show_progress) if get_citation_groups(focusciting).shape[0] > 0] 110 | 111 | return pd.DataFrame(disrupt, columns = ['PublicationId', 'DisruptionIndex']) 112 | 113 | -------------------------------------------------------------------------------- /pyscisci/methods/diversity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. module:: interdisciplinary 4 | :synopsis: Set of functions for typical interdisciplinary analysis 5 | 6 | .. moduleauthor:: Alex Gates 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | from ..utils import isin_sorted, zip2dict, check4columns, simpson, simpson_finite, shannon_entropy 13 | 14 | 15 | 16 | def simpson_interdisciplinarity(pub2ref, pub2field, focus_pub_ids=None, 17 | citation_direction='references', finite_correction=False, show_progress=False): 18 | """ 19 | Calculate the Simpson index as a measure of a publication's interdisciplinarity. 20 | See :cite:`stirling20` for the definition. 21 | 22 | Parameters 23 | ---------- 24 | :param pub2ref : DataFrame 25 | A DataFrame with the citation information for each Publication. 
26 | 
27 | :param pub2field : DataFrame
28 | A DataFrame with the field information for each Publication.
29 | 
30 | :param focus_pub_ids : numpy array or list, default None
31 | A list of the PublicationIds to calculate interdisciplinarity.
32 | 
33 | :param finite_correction : bool, default False
34 | Whether to apply the correction for a finite sample.
35 | 
36 | :param show_progress : bool, default False
37 | If True, show a progress bar tracking the calculation.
38 | 
39 | Returns
40 | -------
41 | DataFrame
42 | DataFrame with 2 columns: 'PublicationId', 'SimpsonInterdisciplinarity'
43 | 
44 | """
45 | 
46 | # now we map citing and cited to the source and target depending on which direction was specified by 'citation_direction'
47 | if citation_direction == 'references':
48 | pub2ref_rename_dict = {'CitedPublicationId':'TargetId', 'CitingPublicationId':'SourceId'}
49 | year_col = 'CitingYear'
50 | elif citation_direction == 'citations':
51 | pub2ref_rename_dict = {'CitedPublicationId':'SourceId', 'CitingPublicationId':'TargetId'}
52 | year_col = 'CitedYear'
53 | 
54 | required_columns = ['CitedPublicationId', 'CitingPublicationId']
55 | check4columns(pub2ref, required_columns)
56 | pub2ref = pub2ref[required_columns].rename(columns=pub2ref_rename_dict)
57 | 
58 | check4columns(pub2field, ['PublicationId', 'FieldId'])
59 | 
60 | # merge the references to the fields for the target fields
61 | pub2ref = pub2ref.merge(pub2field, how='left', left_on='TargetId',
62 | right_on='PublicationId').rename(columns={'FieldId':'TargetFieldId'})
63 | del pub2ref['PublicationId']
64 | 
65 | pub2ref = pub2ref.dropna()
66 | 
67 | if finite_correction:
68 | simpdf = 1-pub2ref.groupby('SourceId')['TargetFieldId'].apply(simpson_finite)
69 | else:
70 | simpdf = 1-pub2ref.groupby('SourceId')['TargetFieldId'].apply(simpson)
71 | 
72 | simpdf = simpdf.to_frame().reset_index().rename(
73 | columns={'TargetFieldId':'SimpsonInterdisciplinarity', 'SourceId':'PublicationId'})
74 | 
75 | return simpdf
76 | 
77 | 
78 | def shannon_interdisciplinarity(pub2ref, pub2field, focus_pub_ids=None,
79 | citation_direction='references', normalized=False, K=None, show_progress=False):
80 | """
81 | Calculate the Shannon entropy as a measure of a publication's interdisciplinarity.
82 | See :cite:`stirling20` for the definition.
83 | 
84 | Parameters
85 | ----------
86 | :param pub2ref : DataFrame
87 | A DataFrame with the citation information for each Publication.
88 | 
89 | :param pub2field : DataFrame
90 | A DataFrame with the field information for each Publication.
91 | 
92 | :param focus_pub_ids : numpy array or list, default None
93 | A list of the PublicationIds to calculate interdisciplinarity.
94 | 
95 | :param citation_direction : str, default 'references'
96 | Compute interdisciplinarity over the 'references' (outgoing links) or the 'citations' (incoming links).
97 | 
98 | :param normalized : bool, default False
99 | If True, use the normalized entropy bounded by the number of observed fields
100 | or K if not None.
101 | 
102 | :param K : int, default None
103 | The maximum number of fields to consider.
104 | 
105 | :param show_progress : bool, default False
106 | If True, show a progress bar tracking the calculation. 
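Notes
-----
For each publication, let p_k denote the fraction of its linked publications (references
or citations, depending on citation_direction) that belong to field k. The value computed
below is the Shannon entropy H = -sum_k p_k ln(p_k); when normalized is True it is divided
by ln(K), so the result lies in [0, 1].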
107 | 
108 | Returns
109 | -------
110 | DataFrame
111 | DataFrame with 2 columns: 'PublicationId', 'ShannonInterdisciplinarity'
112 | 
113 | """
114 | 
115 | # now we map citing and cited to the source and target depending on which direction was specified by 'citation_direction'
116 | if citation_direction == 'references':
117 | pub2ref_rename_dict = {'CitedPublicationId':'TargetId', 'CitingPublicationId':'SourceId'}
118 | year_col = 'CitingYear'
119 | elif citation_direction == 'citations':
120 | pub2ref_rename_dict = {'CitedPublicationId':'SourceId', 'CitingPublicationId':'TargetId'}
121 | year_col = 'CitedYear'
122 | 
123 | required_columns = ['CitedPublicationId', 'CitingPublicationId']
124 | check4columns(pub2ref, required_columns)
125 | pub2ref = pub2ref[required_columns].rename(columns=pub2ref_rename_dict)
126 | 
127 | check4columns(pub2field, ['PublicationId', 'FieldId'])
128 | 
129 | if K is None:
130 | K = pub2field['FieldId'].nunique()
131 | 
132 | # merge the references to the fields for the target fields
133 | pub2ref = pub2ref.merge(pub2field, how='left', left_on='TargetId',
134 | right_on='PublicationId').rename(columns={'FieldId':'TargetFieldId'})
135 | del pub2ref['PublicationId']
136 | 
137 | pub2ref = pub2ref.dropna()
138 | 
139 | shan_inter = pub2ref.groupby('SourceId')['TargetFieldId'].apply(shannon_entropy)
140 | shan_inter = shan_inter.to_frame().reset_index().rename(
141 | columns={'TargetFieldId':'ShannonInterdisciplinarity', 'SourceId':'PublicationId'})
142 | 
143 | if normalized:
144 | shan_inter['ShannonInterdisciplinarity'] = shan_inter['ShannonInterdisciplinarity']/np.log(K)
145 | 
146 | return shan_inter
147 | 
148 | 
-------------------------------------------------------------------------------- /pyscisci/methods/hindex.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: hindex
4 | :synopsis: Calculate the hindex.
5 | 
6 | .. moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import sys
10 | 
11 | import pandas as pd
12 | import numpy as np
13 | 
14 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
15 | if 'ipykernel' in sys.modules:
16 | from tqdm.notebook import tqdm
17 | else:
18 | from tqdm import tqdm
19 | 
20 | from pyscisci.utils import zip2dict
21 | 
22 | 
23 | ### H index
24 | 
25 | def hindex(a):
26 | """
27 | Calculate the h index for the array of citation values. See :cite:`hirsch2005index` for the definition.
28 | 
29 | Parameters
30 | ----------
31 | :param a : numpy array
32 | An array of citation counts for each publication by the Author.
33 | 
34 | Returns
35 | -------
36 | int
37 | The Hindex
38 | 
39 | """
40 | d = np.sort(a)[::-1] - np.arange(a.shape[0])
41 | return (d>0).sum()
42 | 
43 | def compute_hindex(df, colgroupby, colcountby, show_progress=False):
44 | """
45 | Calculate the h index for each group in the DataFrame. See :cite:`hirsch2005index` for the definition.
46 | 
47 | The algorithmic implementation for each author can be found in :py:func:`citationanalysis.author_hindex`.
48 | 
49 | Parameters
50 | ----------
51 | :param df : DataFrame
52 | A DataFrame with the citation information for each Author.
53 | 
54 | :param colgroupby : str
55 | The DataFrame column with Author Ids.
56 | 
57 | :param colcountby : str
58 | The DataFrame column with Citation counts for each publication. 
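Notes
-----
A quick worked example of the h-index computation above (with made-up citation counts):
for an author whose publications have citations [10, 8, 5, 4, 2], sorting in descending
order and subtracting the ranks 0..4 gives [10, 7, 3, 1, -2], which has four positive
entries, so the h-index is 4 (four publications each with at least 4 citations).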
59 | 
60 | Returns
61 | -------
62 | DataFrame
63 | DataFrame with 2 columns: colgroupby, 'Hindex'
64 | 
65 | """
66 | # register our pandas apply with tqdm for a progress bar
67 | tqdm.pandas(desc='Hindex', disable= not show_progress)
68 | 
69 | newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Hindex']*2)
70 | return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(hindex).to_frame().reset_index().rename(columns=newname_dict)
71 | 
72 | def gindex(a):
73 | """
74 | Calculate the g index for the array of citation values. See :cite:`Waltman2008index` for detailed definition.
75 | 
76 | Parameters
77 | ----------
78 | :param a : numpy array
79 | An array of citation counts for each publication by the Author.
80 | 
81 | Returns
82 | -------
83 | int
84 | The Gindex
85 | 
86 | """
87 | d = np.cumsum(np.sort(a)[::-1]) - (np.arange(a.shape[0]) + 1)**2
88 | return (d>=0).sum()
89 | 
90 | def compute_gindex(df, colgroupby, colcountby, show_progress=False):
91 | """
92 | Calculate the g index for each group in the DataFrame. See :cite:`Waltman2008index` for detailed definition.
93 | 
94 | The algorithmic implementation for each author can be found in :py:func:`citationanalysis.author_gindex`.
95 | 
96 | Parameters
97 | ----------
98 | :param df : DataFrame
99 | A DataFrame with the citation information for each Author.
100 | 
101 | :param colgroupby : str
102 | The DataFrame column with Author Ids.
103 | 
104 | :param colcountby : str
105 | The DataFrame column with Citation counts for each publication.
106 | 
107 | Returns
108 | -------
109 | DataFrame
110 | DataFrame with 2 columns: colgroupby, 'Gindex'
111 | 
112 | """
113 | # register our pandas apply with tqdm for a progress bar
114 | tqdm.pandas(desc='Gindex', disable= not show_progress)
115 | 
116 | newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Gindex']*2)
117 | return df.groupby(colgroupby, sort=False, as_index=False)[colcountby].progress_apply(gindex).rename(columns=newname_dict)
118 | 
-------------------------------------------------------------------------------- /pyscisci/methods/hotstreak.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: hotstreak
4 | :synopsis: Calculate the career hotstreak.
5 | 
6 | .. 
moduleauthor:: Alex Gates 
7 | """
8 | import sys
9 | import pandas as pd
10 | import numpy as np
11 | 
12 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
13 | if 'ipykernel' in sys.modules:
14 | from tqdm.notebook import tqdm
15 | else:
16 | from tqdm import tqdm
17 | 
18 | from pyscisci.utils import hard_rolling_window
19 | 
20 | 
21 | def piecewise_step(x, a,b,c,d):
22 | return np.piecewise(x, [x<a, np.logical_and(x>=a, x<=b), x>b], [c,d,c])
23 | 
24 | def piecewise_step_err(y, loc1, loc2):
25 | y1 = y[loc1:(loc2+1)]
26 | y0 = np.hstack([y[:loc1], y[(loc2+1):]])
27 | return np.sqrt(np.sum([np.sum((a - a.mean())**2) for a in [y0,y1]]))
28 | 
29 | def brut_fit_piecewise_step(ys):
30 | locs = np.vstack(np.triu_indices(ys.shape[0]-1, k=1)).T + 1
31 | errs = [piecewise_step_err(ys, i, j) for i,j in locs]
32 | 
33 | return np.min(errs), locs[np.argmin(errs)]
34 | 
35 | 
36 | def piecewise_step_err2(y, loc1, loc2, loc3, loc4):
37 | y1 = y[loc1:(loc2+1)]
38 | y2 = y[loc3:(loc4+1)]
39 | y0 = np.hstack([y[:loc1], y[(loc2+1):loc3], y[(loc4+1):]])
40 | return np.sqrt(np.sum([np.sum((a - a.mean())**2) for a in [y0,y1,y2]]))
41 | 
42 | def brut_fit_piecewise_step2(ys):
43 | locs1 = np.vstack(np.triu_indices(ys.shape[0]-3, k=1)).T + 1
44 | locs2 = np.array([[i,j,j+k,j+m] for i,j in locs1 for k,m in np.vstack(np.triu_indices(ys.shape[0]-j-1, k=1)).T])
45 | errs = [piecewise_step_err2(ys, i, j,k,m) for i,j,k,m in locs2]
46 | 
47 | return np.min(errs), locs2[np.argmin(errs)]
48 | 
49 | def career_hotstreak(author_career_df, citecol='c10', maxk=1, l1_lambda = 1.):
50 | """
51 | Identify hot streaks in author careers :cite:`liu2018hotstreak`.
52 | 
53 | TODO: this is an integer programming problem. Reimplement using an integer solver.
54 | Right now we just use a brute-force search (very inefficient)!
55 | 
56 | Parameters
57 | ----------
58 | author_career_df : DataFrame
59 | The author publication history.
60 | 
61 | citecol : str, default 'c10'
62 | The column with publication citation information.
63 | 
64 | maxk : int, default 1
65 | The maximum number of hot streaks to search for in a career. Should be 1 or 2.
66 | 
67 | l1_lambda : float, default 1.0
68 | The l1 regularization for the number of streaks.
69 | Note, the authors never define the value they used for this in the SI.
70 | 
71 | Returns
72 | ----------
73 | solution : DataFrame
74 | One row per detected hot streak, with columns 'Baseline' (the baseline impact level),
75 | 'StreakGamma' (the elevated impact level during the streak), and
76 | 'StreakStart' and 'StreakEnd' (the index locations of the streak boundaries).
77 | 
78 | """
79 | if maxk == 0 or maxk > 2:
80 | raise NotImplementedError("the career hotstreak is not implemented for this number of streaks. 
set maxk = 1 or maxk=2 ")
81 | 
82 | Delta_N = max(5, int(0.1*author_career_df.shape[0]))
83 | gamma_N = hard_rolling_window(np.log10(author_career_df[citecol].values), window=Delta_N, step_size = 1).mean(axis=1)
84 | gamma_N = gamma_N[int(Delta_N/2):-int(Delta_N/2)]
85 | 
86 | nostreak_err = np.sqrt(np.sum((gamma_N - gamma_N.mean())**2)) + l1_lambda # no step functions uses 1 model coefficient
87 | streak_gammas = [gamma_N.mean(), None, None]
88 | 
89 | streak_err, streak_loc1 = brut_fit_piecewise_step(gamma_N)
90 | streak_err += 3*l1_lambda # 1 step function = 3 model coefficients
91 | 
92 | streak_loc = [None]*4
93 | 
94 | if (nostreak_err <= streak_err):
95 | streak_err = nostreak_err
96 | nstreak = 0
97 | else:
98 | streak_loc[:2] = streak_loc1 + int(Delta_N/2)
99 | streak_gammas[0] = np.hstack([gamma_N[:streak_loc1[0]], gamma_N[(streak_loc1[1]+1):]]).mean()
100 | streak_gammas[1] = gamma_N[streak_loc1[0]:(streak_loc1[1]+1)].mean()
101 | nstreak = 1
102 | 
103 | if maxk == 2:
104 | streak_err2, streak_loc2 = brut_fit_piecewise_step2(gamma_N)
105 | streak_err2 += 6*l1_lambda # 2 step functions = 6 model coefficients
106 | if (streak_err > streak_err2):
107 | streak_err, streak_loc = streak_err2, list(streak_loc2 + int(Delta_N/2))
108 | streak_gammas[0] = np.hstack([gamma_N[:streak_loc2[0]], gamma_N[(streak_loc2[1]+1):streak_loc2[2]], gamma_N[(streak_loc2[3]+1):]]).mean()
109 | streak_gammas[1] = gamma_N[streak_loc2[0]:(streak_loc2[1]+1)].mean()
110 | streak_gammas[2] = gamma_N[streak_loc2[2]:(streak_loc2[3]+1)].mean()
111 | nstreak = 2
112 | 
113 | solution_df = [[streak_gammas[0], streak_gammas[1], streak_loc[0], streak_loc[1]]]
114 | if nstreak == 2:
115 | solution_df += [[streak_gammas[0], streak_gammas[2], streak_loc[2], streak_loc[3]]]
116 | return pd.DataFrame(solution_df, columns = ['Baseline', 'StreakGamma', 'StreakStart', 'StreakEnd'])
117 | 
-------------------------------------------------------------------------------- /pyscisci/methods/journal.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | .. module:: journalcitation
4 | :synopsis: Set of functions for typical journal bibliometric citation analysis
5 | 
6 | .. moduleauthor:: Alex Gates 
7 | """
8 | 
9 | import sys
10 | 
11 | import pandas as pd
12 | import numpy as np
13 | 
14 | # determine if we are loading from a jupyter notebook (to make pretty progress bars)
15 | if 'ipykernel' in sys.modules:
16 | from tqdm.notebook import tqdm
17 | else:
18 | from tqdm import tqdm
19 | 
20 | from pyscisci.utils import zip2dict, groupby_count
21 | from pyscisci.methods.hindex import compute_hindex
22 | 
23 | def journal_productivity(pub2journal, colgroupby = 'JournalId', colcountby = 'PublicationId', show_progress=False):
24 | """
25 | Calculate the total number of publications for each journal.
26 | 
27 | Parameters
28 | ----------
29 | :param pub2journal : DataFrame
30 | A DataFrame with the publication and journal information.
31 | 
32 | :param colgroupby : str, default 'JournalId', Optional
33 | The DataFrame column with Journal Ids. If None then the database 'JournalId' is used.
34 | 
35 | :param colcountby : str, default 'PublicationId', Optional
36 | The DataFrame column with Publication Ids. If None then the database 'PublicationId' is used. 
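Notes
-----
A minimal usage sketch (``pub`` is assumed to be a publication DataFrame that carries a
'JournalId' column, e.g. as loaded by one of the database objects above):

>>> jprod = journal_productivity(pub, colgroupby='JournalId', colcountby='PublicationId')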
-------------------------------------------------------------------------------- /pyscisci/methods/journal.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: journalcitation
    :synopsis: Set of functions for typical journal bibliometric citation analysis

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import zip2dict, groupby_count
from pyscisci.methods.hindex import compute_hindex

def journal_productivity(pub2journal, colgroupby='JournalId', colcountby='PublicationId', show_progress=False):
    """
    Calculate the total number of publications for each journal.

    Parameters
    ----------
    :param pub2journal : DataFrame
        A DataFrame with the publication-to-journal information.

    :param colgroupby : str, default 'JournalId', Optional
        The DataFrame column with Journal Ids.  If None then the database 'JournalId' is used.

    :param colcountby : str, default 'PublicationId', Optional
        The DataFrame column with Publication Ids.  If None then the database 'PublicationId' is used.

    Returns
    -------
    DataFrame
        Productivity DataFrame with 2 columns: 'JournalId', 'Productivity'

    """

    # we can use show_progress to pass a label for the progress bar
    if show_progress:
        show_progress = 'Journal Productivity'

    newname_dict = zip2dict([str(colcountby)+'Count', '0'], ['Productivity']*2)
    return groupby_count(pub2journal, colgroupby, colcountby, count_unique=True, show_progress=show_progress).rename(columns=newname_dict)

def journal_yearly_productivity(pub2journal, colgroupby='JournalId', datecol='Year', colcountby='PublicationId', show_progress=False):
    """
    Calculate the number of publications for each journal in each year.

    Parameters
    ----------
    :param pub2journal : DataFrame
        A DataFrame with the publication-to-journal information.

    :param colgroupby : str, default 'JournalId', Optional
        The DataFrame column with Journal Ids.  If None then the database 'JournalId' is used.

    :param datecol : str, default 'Year', Optional
        The DataFrame column with Year information.  If None then the database 'Year' is used.

    :param colcountby : str, default 'PublicationId', Optional
        The DataFrame column with Publication Ids.  If None then the database 'PublicationId' is used.

    Returns
    -------
    DataFrame
        Productivity DataFrame with 3 columns: 'JournalId', 'Year', 'YearlyProductivity'

    """

    # we can use show_progress to pass a label for the progress bar
    if show_progress:
        show_progress = 'Journal Yearly Productivity'

    newname_dict = zip2dict([str(colcountby)+'Count', '0'], ['YearlyProductivity']*2)
    return groupby_count(pub2journal, [colgroupby, datecol], colcountby, count_unique=True, show_progress=show_progress).rename(columns=newname_dict)

def journal_hindex(pub2journal, impact=None, colgroupby='JournalId', colcountby='Ctotal', show_progress=False):
    """
    Calculate the h-index for each journal.  See :cite:`hirsch2005index` for the original definition.

    The algorithmic implementation can be found in :py:func:`hindex.compute_hindex`.

    Parameters
    ----------
    :param pub2journal : DataFrame
        A DataFrame with the publication and journal information.

    :param impact : DataFrame, default None, Optional
        A DataFrame with the publication citation counts precalculated.  If None, then it is assumed that the citation counts are already in pub2journal.

    :param colgroupby : str, default 'JournalId', Optional
        The DataFrame column with Journal Ids.  If None then the database 'JournalId' is used.

    :param colcountby : str, default 'Ctotal', Optional
        The DataFrame column with Citation counts for each publication.  If None then the database 'Ctotal' is used.

    :param show_progress : bool, default False
        Show progress of the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'JournalId', 'Hindex'

    """
    if not impact is None:
        pub2journal = pub2journal.merge(impact[['PublicationId', colcountby]], on='PublicationId', how='left')

    if show_progress: print("Computing Journal H-index.")
    return compute_hindex(pub2journal, colgroupby=colgroupby, colcountby=colcountby, show_progress=show_progress)

def journal_impactfactor(pub, pub2ref, pub2year=None, citation_window=5, colgroupby='JournalId', show_progress=False):
    """
    Calculate the impact factor for a journal.

    Parameters
    ----------
    :param pub : DataFrame
        A DataFrame with the publication, journal, and year information.

    :param pub2ref : DataFrame
        A DataFrame with the citing-cited publication linkages.

    :param pub2year : dict, default None, Optional
        A dictionary mapping 'PublicationIds' to the publication year.  If None then the 'CitingYear' is assumed to be a column of pub2ref.

    :param citation_window : int, default 5, Optional
        The number of preceding years over which publications are counted towards the impact factor.

    :param colgroupby : str, default 'JournalId', Optional
        The DataFrame column with Journal Ids.  If None then the database 'JournalId' is used.

    :param show_progress : bool, default False
        Show progress of the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'JournalId', 'ImpactFactor{y}' where y is the citation_window size

    """
    # TODO: the impact factor calculation has not been implemented yet
    raise NotImplementedError("journal_impactfactor is not yet implemented.")
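
# The function above is only a stub.  A minimal sketch of the standard impact factor
# calculation (an editorial illustration, not the library's implementation; it assumes
# pub has columns ['PublicationId', 'JournalId', 'Year'] and pub2ref has columns
# ['CitingPublicationId', 'CitedPublicationId', 'CitingYear']):
def _impactfactor_sketch(pub, pub2ref, focus_year, citation_window=5):
    # publications that appeared in the preceding citation_window years
    recent = pub[pub['Year'].isin(range(focus_year - citation_window, focus_year))]
    # citations received in the focus year by those recent publications
    cites = pub2ref[pub2ref['CitingYear'] == focus_year].merge(
        recent[['PublicationId', 'JournalId']],
        left_on='CitedPublicationId', right_on='PublicationId', how='inner')
    ncites = cites.groupby('JournalId')['CitingPublicationId'].nunique()
    npubs = recent.groupby('JournalId')['PublicationId'].nunique()
    return (ncites.reindex(npubs.index, fill_value=0) / npubs).rename('ImpactFactor{}'.format(citation_window))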
-------------------------------------------------------------------------------- /pyscisci/methods/longtermimpact.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: longterm impact
    :synopsis: Set of functions for typical bibliometric citation analysis

.. moduleauthor:: Alex Gates
"""

import pandas as pd
import numpy as np

import scipy.optimize as spopt
import scipy.stats as spstats

from pyscisci.utils import zip2dict


def _fit_predicted_citations(publication_citations):

    recenter_time = np.sort(publication_citations['DateDelta'].values)

    def fit_f(x):
        return np.arange(1, len(recenter_time) + 1) - np.array([predicted_c(t, x[0], x[1], x[2]) for t in recenter_time])

    s, _ = spopt.leastsq(fit_f, x0=np.ones(3))
    return pd.Series(s)

def predicted_c(t, lam, mu, sig, m=30.):
    lognormt = (np.log(t) - mu) / sig
    return m * (np.exp(lam * spstats.norm.cdf(lognormt)) - 1.0)

def longterm_impact(pub2ref, colgroupby='CitedPublicationId', coldate='CitingYear', show_progress=True):
    """
    This function calculates the long-term scientific impact as introduced in :cite:`Wang2013longterm`.

    Following equation (3) from :cite:`Wang2013longterm`, the predicted cumulative citations are:
        c(t) = m * (e^{lam * Phi((ln t - mu) / sig)} - 1)
    where Phi is the cumulative distribution function of the standard normal distribution.
    """
    pub2ref = pub2ref.copy()

    if 'Year' in coldate:
        pub2ref['DateDelta'] = pub2ref.groupby(colgroupby, sort=False)[coldate].transform(lambda x: x-x.min())
    elif 'Date' in coldate:
        pub2ref['DateDelta'] = pub2ref.groupby(colgroupby, sort=False)[coldate].transform(lambda x: x-x.min()) / np.timedelta64(1,'D')
    else:
        raise ValueError("The coldate column must contain either 'Year' or 'Date'.")

    pub2ref = pub2ref.loc[pub2ref['DateDelta'] > 0]
    pub2ref.sort_values(by=['DateDelta'], inplace=True)

    newname_dict = zip2dict(list(range(4)), ['lam', 'mu', 'sig', 'm'])
    return pub2ref.groupby(colgroupby, sort=False).apply(_fit_predicted_citations).reset_index().rename(columns=newname_dict)
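
# Usage sketch (hypothetical citation table, one row per citation, giving the cited
# publication and the citing year; illustrative only):
if __name__ == '__main__':
    toy_pub2ref = pd.DataFrame({'CitedPublicationId': [1]*6,
                                'CitingYear': [2000, 2001, 2001, 2002, 2004, 2008]})
    params = longterm_impact(toy_pub2ref, colgroupby='CitedPublicationId', coldate='CitingYear')
    # -> one row per cited publication with the fitted ['lam', 'mu', 'sig'] parameters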
-------------------------------------------------------------------------------- /pyscisci/methods/netnormcite.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: network normalized citation index
    :synopsis: Set of functions for finding the network normalized citation index.

.. moduleauthor:: Alex Gates
"""
import sys
import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import isin_sorted

### Network Normalized Citation
def netnormcite_index(pub2ref, pub2year=None, focus_pub_ids=None, T=5, show_progress=False):
    """
    Calculate the network normalized citation index as first proposed in :cite:`Ke2023netnorm`.

    Parameters
    ----------

    :param pub2ref : DataFrame
        A DataFrame with the citation information for each Publication.

    :param pub2year : DataFrame, optional
        A DataFrame with the year of publication for each Publication if this is not already included in pub2ref.

    :param focus_pub_ids : numpy array
        A subset of publication ids to focus on for the citation index.
        Note, the full pub2ref is still required because we need to find the co-citation neighborhoods.

    :param T : int, default 5
        Number of years for the citation window, must be 1 or greater.

    show_progress : bool, default False
        Show calculation progress.

    Returns
    -------
    netnorm : DataFrame
        A DataFrame with the network normalized citation index for all (cited) publications or publications from the focus_pub_ids list.

    """
    if show_progress:
        print("Starting computation of network normalized index.")

    # attach each missing year column independently from pub2year
    if not 'CitedYear' in list(pub2ref):
        pub2ref = pub2ref.merge(pub2year, how='left', left_on='CitedPublicationId', right_on='PublicationId').rename(columns={'Year':'CitedYear'})
        del pub2ref['PublicationId']
    if not 'CitingYear' in list(pub2ref):
        pub2ref = pub2ref.merge(pub2year, how='left', left_on='CitingPublicationId', right_on='PublicationId').rename(columns={'Year':'CitingYear'})
        del pub2ref['PublicationId']

    if focus_pub_ids is None:
        yfocus_pubs = pub2ref[['CitedPublicationId', 'CitedYear']].drop_duplicates(keep='first')
    else:
        yfocus_pubs = pub2ref[isin_sorted(pub2ref['CitedPublicationId'].values, np.sort(focus_pub_ids))][['CitedPublicationId', 'CitedYear']].drop_duplicates(keep='first')


    reference_groups = pub2ref.groupby(['CitingPublicationId'], sort=False)['CitedPublicationId']
    def get_reference_groups(pid):
        try:
            return reference_groups.get_group(pid).values
        except KeyError:
            return np.array([])

    citation_groups = pub2ref.groupby(['CitingYear', 'CitedPublicationId'], sort=False)['CitingPublicationId']
    def get_citation_groups(pid, y):
        try:
            return citation_groups.get_group((y, pid)).values
        except KeyError:
            return np.array([])

    yearly_citation_counts = citation_groups.nunique()
    def get_yearly_ncites(y, pid):
        try:
            return yearly_citation_counts[(y, pid)]
        except KeyError:
            return 0


    def _netnorm_index(focusid, y):

        cnormt = 0
        for t in range(0, T+1):

            paper2y_cocite = {refid: get_yearly_ncites(y+t, refid) for citeid in get_citation_groups(focusid, y+t) for refid in get_reference_groups(citeid)}

            # the co-citation neighborhood doesn't include the focus publication
            cnorm_denom = sum(ncites for refid, ncites in paper2y_cocite.items() if refid != focusid)

            if cnorm_denom > 0 and len(paper2y_cocite) > 1:
                cnorm_denom = cnorm_denom / (len(paper2y_cocite) - 1)
                cnormt += get_yearly_ncites(y+t, focusid) / cnorm_denom

        return cnormt

    netnorm = [[focus_pub, yfocus, _netnorm_index(focus_pub, yfocus)] for focus_pub, yfocus
               in tqdm(yfocus_pubs.values, leave=True, desc='Network-normalized Citation', disable= not show_progress)]

    return pd.DataFrame(netnorm, columns=['PublicationId', 'CitedYear', 'Cnorm{}'.format(T)])
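
# Usage sketch (hypothetical citing-cited pairs; in practice pub2ref comes from one of
# the pyscisci database interfaces and pub2year supplies any missing years):
if __name__ == '__main__':
    toy_pub2ref = pd.DataFrame({'CitingPublicationId': [3, 3, 4, 4, 5],
                                'CitedPublicationId':  [1, 2, 1, 2, 1],
                                'CitingYear': [2001, 2001, 2002, 2002, 2003]})
    toy_pub2ref['CitedYear'] = 2000
    cnorm = netnormcite_index(toy_pub2ref, T=5)
    # -> columns ['PublicationId', 'CitedYear', 'Cnorm5']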
-------------------------------------------------------------------------------- /pyscisci/methods/pivotscore.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: pivotscore
    :synopsis: Set of functions for typical bibliometric citation analysis

.. moduleauthor:: Alex Gates
"""
import sys
import pandas as pd
import numpy as np

from pyscisci.utils import groupby_count, changepoint, pandas_cosine_similarity

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm


### Pivot Score
def pivot_score(pub2author, pub2ref, previous_k=None, year_window=None, show_progress=False):
    """
    Calculate the pivot score index as proposed in :cite:`Hill2022pivotpenalty`.

    None is returned for the first publication in an author's career (because there is no previous history), or when
    no other publications have been published within the year_window.

    Parameters
    ----------
    pub2author : dataframe
        The publication author linkages with Year.

    pub2ref : dataframe
        The citing-cited publication linkages which contains the CitedJournalId for the cited articles.

    previous_k : int, default None
        Only compare against the previous k publications.

    year_window : int, default None
        Only compare against publications from the last year_window years.

    show_progress : bool, default False
        Show calculation progress.

    Returns
    ----------
    pivotscore : DataFrame
        The pivotscore for each Author Publication.

    """
    if show_progress:
        print("Starting computation of pivot score.")

    pub2refjournalcounts = groupby_count(pub2ref, ['CitingPublicationId', 'CitedJournalId'],
                                         'CitedPublicationId', count_unique=True)
    pub2refjournalcounts.rename(columns={'CitedPublicationIdCount':'CitedJournalCount'}, inplace=True)

    pa_refs = pub2author.merge(pub2refjournalcounts, how='left', left_on='PublicationId', right_on='CitingPublicationId')
    del pa_refs['CitingPublicationId']

    pa_refs.dropna(inplace=True)
    pa_refs['CitedJournalId'] = pa_refs['CitedJournalId'].astype(int)
    pa_refs.sort_values(by=['AuthorId', 'Year', 'PublicationId', 'CitedJournalId'], inplace=True)
    pa_refs.reset_index(drop=True, inplace=True)

    pscore = pa_refs.groupby('AuthorId').apply(author_pivot,
                                               previous_k=previous_k, year_window=year_window).reset_index()
    del pscore['level_1']

    return pscore


def author_pivot(authordf, previous_k=None, year_window=None):

    pubgroups = authordf.groupby('PublicationId', sort=False)

    allpubidx = None
    if not previous_k is None:
        allpubidx = changepoint(authordf['PublicationId'].values)


    pivotresults = []

    def publication_pivot(pubgroup):
        pubidx = pubgroup.index[0]
        pid = pubgroup.name
        if pubidx == 0: pivotresults.append([pid, None])
        else:
            i = len(pivotresults)
            if not previous_k is None and i > previous_k:
                history = authordf.iloc[allpubidx[i-previous_k]:pubidx]
            else:
                history = authordf.iloc[:pubidx]

            if not year_window is None:
                history = history[history['Year'] >= pubgroup['Year'].values[0] - year_window]

            if history.shape[0] > 0:
                history = history.groupby('CitedJournalId', sort=False, as_index=False)['CitedJournalCount'].sum()

                cosine = pandas_cosine_similarity(history, pubgroup, col_key='CitedJournalId', col_values='CitedJournalCount')

                pivotresults.append([pid, cosine])
            else:
                pivotresults.append([pid, None])

    pubgroups.apply(publication_pivot)

    return pd.DataFrame(pivotresults, columns=['PublicationId', 'PivotScore'])
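
# Usage sketch (hypothetical toy tables; CitedJournalId must already be attached to
# the cited side of pub2ref, as the docstring above requires):
if __name__ == '__main__':
    toy_pub2author = pd.DataFrame({'AuthorId': [1, 1], 'PublicationId': [10, 11], 'Year': [2000, 2001]})
    toy_pub2ref = pd.DataFrame({'CitingPublicationId': [10, 10, 11],
                                'CitedPublicationId': [90, 91, 92],
                                'CitedJournalId': [7, 8, 7]})
    print(pivot_score(toy_pub2author, toy_pub2ref))
    # -> PivotScore is None for publication 10 (no history); for publication 11 it is the
    #    cosine similarity between the cited-journal profiles of publications 10 and 11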
-------------------------------------------------------------------------------- /pyscisci/methods/productivitytrajectory.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: productivitytrajectory
    :synopsis: Calculate the productivity trajectory.

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

import scipy.optimize as spopt

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import zip2dict


### Productivity Trajectory

def piecewise_linear(x, x_break, b, m1, m2):
    """
    A piecewise linear function:
        x <= x_break: y = b + m1*(x - x_break)
        x  > x_break: y = b + m2*(x - x_break)
    """
    return np.piecewise(x, [x <= x_break], [lambda x: m1*x + b-m1*x_break, lambda x: m2*x + b-m2*x_break])

def fit_piecewise_linear(xvalues, yvalues):
    m0 = (yvalues.max()-yvalues.min())/(xvalues.max() - xvalues.min())
    p0 = [np.median(xvalues), yvalues.mean(), m0, m0]
    p, e = spopt.curve_fit(piecewise_linear, xvalues, yvalues, p0=p0)
    return pd.Series(p)


def _fit_piecewise_lineardf(author, args):
    return fit_piecewise_linear(author[args[0]].values, author[args[1]].values)

def yearly_productivity_traj(df, colgroupby='AuthorId', colx='Year', coly='YearlyProductivity'):
    """
    Calculate the piecewise linear yearly productivity trajectory originally studied in :cite:`way2017misleading`.

    """

    newname_dict = zip2dict(list(range(4)), ['t_break', 'b', 'm1', 'm2'])
    return df.groupby(colgroupby, sort=False).apply(_fit_piecewise_lineardf, args=(colx, coly)).reset_index().rename(columns=newname_dict)
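
# Usage sketch: recover a known two-slope trajectory from synthetic yearly counts
# (purely illustrative data):
if __name__ == '__main__':
    years = np.arange(1990, 2010, dtype=float)
    prod = piecewise_linear(years, 2000., 10., 1., -0.5) + np.random.normal(0, 0.1, years.shape[0])
    print(fit_piecewise_linear(years, prod))
    # -> approximately [2000, 10, 1, -0.5] = [t_break, b, m1, m2]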
-------------------------------------------------------------------------------- /pyscisci/methods/publication.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: publicationmetrics
    :synopsis: Set of functions for the bibliometric analysis of publications

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import rank_array, check4columns, groupby_count, zip2dict

from pyscisci.methods.raostirling import *
from pyscisci.methods.diversity import *
from pyscisci.methods.creditshare import *
from pyscisci.methods.disruption import *
from pyscisci.methods.longtermimpact import *
from pyscisci.methods.sleepingbeauty import *
from pyscisci.methods.pivotscore import *
from pyscisci.methods.novelty import *
from pyscisci.methods.netnormcite import *

def citation_rank(df, colgroupby='Year', colrankby='C10', ascending=True, normed=False, show_progress=False):
    """
    Rank publications by the number of citations, from 0 (smallest) to N-1 (largest).

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the citation information for each Publication.

    colgroupby : str, list
        The DataFrame column(s) to subset by.

    colrankby : str
        The DataFrame column to rank by.

    ascending : bool, default True
        Sort ascending vs. descending.

    normed : bool, default False
        - False : rank is from 0 to N-1
        - True : rank is from 0 to 1

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        The original dataframe with a new column for rank: colrankby+"Rank"

    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Citation Rank', disable= not show_progress)

    df[str(colrankby)+"Rank"] = df.groupby(colgroupby)[colrankby].progress_transform(lambda x: rank_array(x, ascending, normed))
    return df

def publication_beauty(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', show_progress=False):
    """
    Calculate the sleeping beauty coefficient and awakening time for each cited publication.  See :cite:`ke2015beauty` for the derivation.

    The algorithmic implementation can be found in :py:func:`sleepingbeauty.beauty_coefficient`.

    Parameters
    ----------
    pub2ref : DataFrame, default None, Optional
        A DataFrame with the temporal citing information.

    colgroupby : str, default 'CitedPublicationId', Optional
        The DataFrame column with cited Publication Ids.  If None then the database 'CitedPublicationId' is used.

    colcountby : str, default 'CitingPublicationId', Optional
        The DataFrame column with citing Publication Ids.  If None then the database 'CitingPublicationId' is used.

    Returns
    -------
    DataFrame
        DataFrame with 3 columns: 'CitedPublicationId', 'BeautyCoefficient', 'Awakening'

    """

    check4columns(pub2ref, ['CitedPublicationId', 'CitingPublicationId', 'CitingYear'])

    tqdm.pandas(desc='Beauty', disable= not show_progress)

    df = groupby_count(pub2ref, colgroupby=['CitedPublicationId', 'CitingYear'], colcountby='CitingPublicationId', count_unique=True)

    # delegate to the yearly-citation implementation in the sleepingbeauty module
    return compute_sleepingbeauty(df, colgroupby=colgroupby, colcountby=str(colcountby)+'Count', coldate='CitingYear', show_progress=show_progress)
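
# Usage sketch for citation_rank (hypothetical citation counts):
if __name__ == '__main__':
    toy = pd.DataFrame({'Year': [2000, 2000, 2000, 2001, 2001],
                        'C10': [1, 5, 3, 2, 8]})
    ranked = citation_rank(toy, colgroupby='Year', colrankby='C10')
    # -> adds a 'C10Rank' column with the within-year rank of each publication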
-------------------------------------------------------------------------------- /pyscisci/methods/qfactor.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: qfactor
    :synopsis: Calculate the qfactor.

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import zip2dict


## Q-factor
def qfactor(a):
    """
    This function calculates the Q-factor for an author.  See :cite:`Sinatra2016individual` for details.

    """

    return np.exp(np.mean(np.log(a[a>0])))

def compute_qfactor(df, colgroupby, colcountby, show_progress=False):
    """
    Calculate the Q-factor for each group in the DataFrame.  See :cite:`Sinatra2016individual` for the definition.

    The algorithmic implementation for each author can be found in :py:func:`qfactor.qfactor`.

    Parameters
    ----------
    :param df : DataFrame
        A DataFrame with the citation information for each Author.

    :param colgroupby : str
        The DataFrame column with Author Ids.

    :param colcountby : str
        The DataFrame column with Citation counts for each publication.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: colgroupby, colgroupby+'Qfactor'

    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Qfactor', disable= not show_progress)

    newname_dict = zip2dict([str(colcountby), '0'], [str(colgroupby)+'Qfactor']*2)
    return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(qfactor).to_frame().reset_index().rename(columns=newname_dict)
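
# Worked example: as implemented above, the Q-factor is the geometric mean of an
# author's positive citation counts (toy numbers):
if __name__ == '__main__':
    toy = pd.DataFrame({'AuthorId': [1, 1, 1, 2, 2],
                        'C10': np.array([10., 1., 100., 5., 5.])})
    print(compute_qfactor(toy, colgroupby='AuthorId', colcountby='C10'))
    # author 1: exp(mean(log([10, 1, 100]))) = 10; author 2: 5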
-------------------------------------------------------------------------------- /pyscisci/methods/sleepingbeauty.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: sleepingbeauty
    :synopsis: Calculate the sleeping beauty coefficient.

.. moduleauthor:: Alex Gates
"""

import sys

import pandas as pd
import numpy as np

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from pyscisci.utils import zip2dict


## Beauty-coefficient
def beauty_coefficient(c):
    """
    This function calculates the sleeping beauty coefficient and awakening time for a publication.  See :cite:`ke2015beauty` for details.

    Parameters
    ----------
    c : numpy array
        The yearly citation counts for the publication.

    Returns
    ----------
    B : float
        Sleeping Beauty Coefficient

    t_a : int
        The awakening time

    """
    c = c.values
    t_m = np.argmax(c)
    B_denom = c.copy()  # copy so that zeroing the denominator does not modify the citation counts themselves
    B_denom[c == 0] = 1

    # :cite:`ke2015beauty` eq 1/2
    l_t = ((c[t_m] - c[0])/t_m * np.arange(c.shape[0]) + c[0] - c)/B_denom

    # :cite:`ke2015beauty` eq 2
    B = l_t[:(t_m+1)].sum()

    d_denom = np.sqrt((c[t_m] - c[0])**2 + t_m**2)
    d_t = np.abs((c[t_m] - c[0]) * np.arange(c.shape[0]) + t_m * (c[0] - c)) / d_denom

    # :cite:`ke2015beauty` eq 3
    t_a = np.argmax(d_t[:(t_m+1)])

    return pd.Series([B, t_a], index=['BeautyCoefficient', 'Awakening'])

def compute_sleepingbeauty(df, colgroupby, colcountby, coldate='Year', show_progress=False):
    """
    Calculate the sleeping beauty coefficient and awakening time for each group in the DataFrame.  See :cite:`ke2015beauty` for details.

    The algorithmic implementation for each publication can be found in :py:func:`sleepingbeauty.beauty_coefficient`.

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the citation information for each publication in each year.

    colgroupby : str
        The DataFrame column with Publication Ids.

    colcountby : str
        The DataFrame column with Citation counts for each publication.

    coldate : str
        The DataFrame column with Year information.

    Returns
    -------
    DataFrame
        DataFrame with 3 columns: colgroupby, 'BeautyCoefficient' and 'Awakening'

    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Beauty', disable= not show_progress)

    def fill_missing_dates(subdf):
        # keep the group id on the filled rows and fill missing citation counts with 0
        subdf = subdf.set_index(coldate).reindex(np.arange(subdf[coldate].min(), subdf[coldate].max()+1)).fillna({colgroupby: subdf[colgroupby].iloc[0], colcountby: 0}).reset_index()
        return subdf

    # first fill in missing dates
    df = df.groupby(colgroupby, sort=False, group_keys=False).apply(fill_missing_dates)

    # get start year
    syear = df.groupby(colgroupby, sort=False)[coldate].min()

    # now find the beauty coefficient and awakening year
    beauty = df.groupby(colgroupby, sort=False)[colcountby].progress_apply(beauty_coefficient).unstack(1).reset_index()

    # translate the awakening from index to year
    beauty['Awakening'] = [a+syear[pid] for pid, a in beauty[[colgroupby, 'Awakening']].values]

    return beauty
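
# Worked example: a publication that sleeps for a decade and then wakes up
# (fabricated yearly citation counts):
if __name__ == '__main__':
    toy = pd.DataFrame({'PublicationId': [1]*12,
                        'Year': list(range(2000, 2012)),
                        'YearlyCitations': [1, 0, 1, 0, 0, 1, 0, 0, 1, 5, 20, 50]})
    print(compute_sleepingbeauty(toy, colgroupby='PublicationId', colcountby='YearlyCitations', coldate='Year'))
    # -> a large BeautyCoefficient and an Awakening year near 2009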
-------------------------------------------------------------------------------- /pyscisci/sparsenetworkutils.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
.. module:: sparsenetworkutils
    :synopsis: The main Network class

.. moduleauthor:: Alex Gates
"""
import sys
import numpy as np
import pandas as pd

import scipy.sparse as spsparse

# determine if we are loading from a jupyter notebook (to make pretty progress bars)
if 'ipykernel' in sys.modules:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

def threshold_network(adj_mat, threshold=0):
    """
    Remove all edges with weight less than or equal to the threshold from the adjacency matrix.
    """
    if adj_mat.getformat() != 'coo':
        adj_mat = spsparse.coo_matrix(adj_mat)

    adj_mat.data[adj_mat.data <= threshold] = 0
    adj_mat.eliminate_zeros()

    return adj_mat

def largest_connected_component_vertices(adj_mat):
    """
    Return the vertex indices belonging to the largest connected component.
    """
    n_components, labels = spsparse.csgraph.connected_components(adj_mat)
    comidx, compsizes = np.unique(labels, return_counts=True)

    return np.arange(adj_mat.shape[0])[labels == np.argmax(compsizes)]


def dataframe2bipartite(df, rowname, colname, shape=None, weightname=None):

    if shape is None:
        shape = (int(df[rowname].max()+1), int(df[colname].max()+1))

    if weightname is None:
        weights = np.ones(df.shape[0], dtype=int)
    else:
        weights = df[weightname].values

    # create a bipartite adj matrix connecting authors to their publications
    bipartite_adj = spsparse.coo_matrix((weights,
                                         (df[rowname].values, df[colname].values)),
                                        shape=shape, dtype=weights.dtype)

    bipartite_adj.sum_duplicates()

    return bipartite_adj

def project_bipartite_mat(bipartite_adj, project_to='row'):

    if project_to == 'row':
        adj_mat = bipartite_adj.dot(bipartite_adj.T).tocoo()
    elif project_to == 'col':
        adj_mat = bipartite_adj.T.dot(bipartite_adj).tocoo()

    return adj_mat

def extract_multiscale_backbone(Xs, alpha):
    """
    A sparse matrix implementation of the multiscale backbone :cite:`Serrano2009backbone`.

    Parameters
    ----------
    :param Xs : numpy.array or sp.sparse matrix
        The adjacency matrix for the network.

    :param alpha : float
        The significance value.

    Returns
    -------
    coo_matrix
        The directed, weighted multiscale backbone

    """

    X = spsparse.coo_matrix(Xs)
    X.eliminate_zeros()

    # normalize
    row_sums = X.sum(axis=1)
    degrees = X.getnnz(axis=1)


    pijs = np.multiply(X.data, 1.0/np.array(row_sums[X.row]).squeeze())
    powers = degrees[X.row.squeeze()] - 1

    # eq. 2 of :cite:`Serrano2009backbone`: alpha_ij = 1 - (k-1) * int_0^{p_ij} (1-x)^(k-2) dx = (1 - p_ij)^(k-1) for k > 1
    significance = np.logical_and(pijs < 1, np.power(1.0 - pijs, powers) < alpha)

    keep_graph = spsparse.coo_matrix((X.data[significance], (X.row[significance], X.col[significance])), shape=X.shape)
    keep_graph.eliminate_zeros()

    return keep_graph

def sparse_pagerank_scipy(adjmat, alpha=0.85, personalization=None, initialization=None,
                          max_iter=100, tol=1.0e-6, dangling=None):

    """
    Pagerank for sparse matrices using the power method
    """

    N, _ = adjmat.shape
    assert(adjmat.shape == (N, N))

    if N == 0:
        return np.array([])

    out_strength = np.array(adjmat.sum(axis=1)).flatten()
    out_strength[out_strength != 0] = 1.0 / out_strength[out_strength != 0]

    Q = spsparse.spdiags(out_strength.T, 0, *adjmat.shape, format='csr')
    adjmat = Q * adjmat

    # initial vector
    if initialization is None:
        x = np.repeat(1.0 / N, N)

    else:
        x = initialization / initialization.sum()

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = personalization / personalization.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        dangling_weights = dangling / dangling.sum()

    is_dangling = np.where(out_strength == 0)[0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = alpha * (x * adjmat + sum(x[is_dangling]) * dangling_weights) + \
            (1 - alpha) * p
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return x

    print('power iteration failed to converge in %d iterations.' % max_iter)
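
# Usage sketch for sparse_pagerank_scipy (a toy directed 3-cycle; illustrative only):
#
#     A = spsparse.coo_matrix(np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=float))
#     pr = sparse_pagerank_scipy(A, alpha=0.85)
#     # -> array of 3 PageRank scores; by symmetry each is approximately 1/3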


def sparse_eigenvector_centrality_scipy(adjmat, max_iter=100, tol=1.0e-6, initialization=None):

    adjmat = spsparse.csr_matrix(adjmat)

    N, _ = adjmat.shape
    assert(adjmat.shape == (N, N))

    if N == 0:
        return np.array([])

    # initial vector
    if initialization is None:
        x = np.repeat(1.0 / N, N)

    else:
        x = initialization / initialization.sum()


    # make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        # do the multiplication y^T = x^T A (left eigenvector)
        x = xlast*adjmat

        norm = np.sqrt(np.square(x).sum()) or 1
        x = x/norm

        # Check for convergence (in the L_1 norm).
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return x
    print('power iteration failed to converge in %d iterations.' % max_iter)
-------------------------------------------------------------------------------- /pyscisci/tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciSciCollective/pyscisci/dc061d55bdbf1f66dc0eb499de61c84ae20616e6/pyscisci/tests/__init__.py
-------------------------------------------------------------------------------- /pyscisci/tests/test_utils.py: --------------------------------------------------------------------------------
from pyscisci.utils import load_int


def test_load_int():
    assert load_int(1) == 1
    assert load_int("") is None
    assert load_int("x") is None
-------------------------------------------------------------------------------- /pyscisci/visualization.py: --------------------------------------------------------------------------------

import datetime
import numpy as np

import matplotlib.pylab as plt
import matplotlib.colors as colors

from pyscisci.methods.productivitytrajectory import piecewise_linear

def career_impacttimeline(impact, datecol='Date', impactcol='Ctotal', fill_color='orange', hot_streak_info=None,
                          edge_color='k', streak_color='firebrick', ax=None):

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))

    if isinstance(datecol, str):
        dates = impact[datecol].values
    elif isinstance(datecol, np.ndarray):
        dates = datecol

    if isinstance(impactcol, str):
        cites = impact[impactcol].values
    elif isinstance(impactcol, np.ndarray):
        cites = impactcol

    for d, c in zip(dates, cites):
        if isinstance(d, str):
            if 'T' in d:
                d = datetime.datetime.strptime(d.split('T')[0], '%Y-%m-%d')
            else:
                d = datetime.datetime.strptime(d.split(' ')[0], '%Y-%m-%d')

        ax.plot([d]*2, [0, c], c='k', lw=0.5)
        ax.scatter(d, c, color=fill_color, edgecolor=edge_color, linewidth=0.5, zorder=100)

    if not hot_streak_info is None:
        nstreaks = 2 - hot_streak_info[:3].isnull().sum()

        if nstreaks == 1:
            gamma0, gamma1 = 10**(hot_streak_info[:2])
            streak_start, streak_end = hot_streak_info[3:5].astype(int)
            ax.plot(datecol[:streak_start], [gamma0]*streak_start, c=streak_color)
            ax.plot(datecol[streak_start:(streak_end+1)], [gamma1]*(streak_end-streak_start+1), c=streak_color)
            ax.plot(datecol[(streak_end+1):], [gamma0]*(datecol.shape[0]-streak_end-1), c=streak_color)

    return ax


def career_productivitytimeline(yearlyprod, productivity_trajectory=None, datecol='Year', fill_color='blue', ax=None):

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))

    ax.bar(yearlyprod[datecol].values, yearlyprod['YearlyProductivity'].values, color=fill_color)

    if not productivity_trajectory is None:
        t_break, b, m1, m2 = productivity_trajectory[['t_break', 'b', 'm1', 'm2']].values[0]

        ts = np.arange(yearlyprod[datecol].min(), yearlyprod[datecol].max()+1)
        ax.plot(ts, piecewise_linear(ts, t_break, b, m1, m2), color='black')

    return ax

def hex2rgb(value):
    value = value.lstrip('#')
    lv = len(value)
    return tuple(int(value[i:i + lv // 3], 16)/255. for i in range(0, lv, lv // 3))

def hex2rgba(value, alpha=1):
    return hex2rgb(value) + (alpha,)

class MidpointNormalize(colors.Normalize):
    """
    Normalize the colorbar so that diverging bars work their way either side from a prescribed midpoint value.

    e.g. im=ax1.imshow(array, norm=MidpointNormalize(midpoint=0., vmin=-100, vmax=100))
    """
    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        colors.Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # I'm ignoring masked values and all kinds of edge cases to make a
        # simple example...
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y), np.isnan(value))
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8


from setuptools import setup, find_packages
from pyscisci import __package__, __description__, __version__


setup(name=__package__,
      version=__version__,
      description='Science of Science',
      long_description=__description__,
      classifiers=[
          'Development Status :: 4 - Beta',
          'License :: OSI Approved :: MIT License',
          'Programming Language :: Python :: 3.10',
          'Intended Audience :: Science/Research',
          'Topic :: Scientific/Engineering :: Information Analysis',
      ],
      keywords=["science of science", "citation network", 'bibliometric'],
      url="https://github.com/ajgates42/pyscisci",
      author = 'Alex Gates ',
      license="MIT",
      packages = find_packages(),
      install_requires=[
          'pandas',
          'numpy',
          'scipy',
          'scikit-learn',
          'nameparser',
          'lxml',
          'requests',
          'unidecode',
          'tqdm',
          'dask',
          'numba'
      ],
      extras_require = {
          'nlp': ["sparse_dot_topn", "python-Levenshtein"],
          'hdf': ['tables']},
      include_package_data=True,
      zip_safe=False
      )
--------------------------------------------------------------------------------
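
# Installation sketch (assuming the published PyPI name matches the repository name):
#     pip install pyscisci            # core dependencies listed in install_requires above
#     pip install pyscisci[nlp]       # adds sparse_dot_topn and python-Levenshtein
#     pip install pyscisci[hdf]       # adds PyTables for HDF5 storage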