├── .gitattributes ├── .github └── workflows │ ├── ci-workflow.yml │ └── pythonpublish.yml ├── .gitignore ├── .zenodo.json ├── LICENSE ├── README.md ├── Tutorials.md ├── asreviewcontrib └── datatools │ ├── __init__.py │ ├── compose.py │ ├── convert.py │ ├── dedup.py │ ├── describe.py │ ├── entrypoint.py │ ├── sample.py │ ├── snowball.py │ └── stack.py ├── dedup_similar.png ├── pyproject.toml └── tests ├── __init__.py ├── demo_data ├── dataset_1.ris ├── dataset_2.ris ├── duplicate_data_with_doi.csv ├── duplicate_data_without_doi.csv ├── sample_data.csv ├── snowballing_doi.csv └── snowballing_openalex.csv ├── test_compose.py ├── test_dedup.py ├── test_describe.py ├── test_sample.py ├── test_snowball.py └── test_stack.py /.gitattributes: -------------------------------------------------------------------------------- 1 | asreviewcontrib/datatools/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/ci-workflow.yml: -------------------------------------------------------------------------------- 1 | name: test-suite 2 | on: [push, pull_request] 3 | jobs: 4 | lint-python: 5 | name: lint-python 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | - uses: actions/setup-python@v5 10 | with: 11 | python-version: '3.11' 12 | architecture: 'x64' 13 | - name: Install ruff 14 | run: | 15 | pip install ruff 16 | - name: Lint python with ruff 17 | run: | 18 | ruff check . 19 | test-master: 20 | name: pytest 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v4 24 | - uses: actions/setup-python@v5 25 | - name: Install packages and run tests 26 | run: | 27 | python3 -m pip install pip -U 28 | pip install pytest 29 | pip install . 30 | pytest 31 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | id-token: write 10 | 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.x" 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install build 24 | - name: Build package 25 | run: python -m build 26 | - name: Publish package 27 | uses: pypa/gh-action-pypi-publish@release/v1 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | asreviewcontrib/datatools/_version.py -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "description":"ASReview Datatools is an extension to ASReview LAB that can be used for describing basic properties of a dataset (e.g., number of papers, number of inclusions, the amount of missing data and duplicates), converting file formats via the command line, and cleaning your (input) data by removing duplicate records.", 3 | "title":"ASReview Datatools", 4 | "creators":[ 5 | { 6 | "name":"ASReview LAB developers", 7 | "affiliation":"Utrecht University" 8 | } 9 | ], 10 | "keywords":[ 11 | "data", 12 | "systematic review", 13 | "active learning", 14 | "statistics", 15 | "machine learning", 16 | "text data", 17 | "natural language processing" 18 | ], 19 | "related_identifiers":[ 20 | { 21 | "scheme":"doi", 22 | "relation":"isSupplementTo", 23 | "identifier":"10.1038/s42256-020-00287-7" 24 | }, 25 | { 26 | "scheme": "doi", 27 | "identifier": "10.5281/zenodo.3345592", 28 | "relation": "isSupplementTo" 29 | } 30 | ], 31 | "license":"MIT", 32 | "upload_type":"software" 33 | } 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ASReview - Utrecht University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ASReview Datatools 2 | 3 | [![PyPI version](https://badge.fury.io/py/asreview-datatools.svg)](https://badge.fury.io/py/asreview-datatools) [![Downloads](https://pepy.tech/badge/asreview-datatools)](https://pepy.tech/project/asreview-datatools) [![DOI](https://zenodo.org/badge/239740436.svg)](https://zenodo.org/badge/latestdoi/239740436) 4 | 5 | ASReview Datatools is an extension to [ASReview 6 | LAB](https://github.com/asreview/asreview) that can be used to: 7 | - [**Describe**](#data-describe) basic properties of a dataset 8 | - [**Convert**](#data-convert) file formats 9 | - [**Deduplicate**](#data-dedup) data 10 | - [**Stack**](#data-vstack-experimental) multiple datasets 11 | - [**Compose**](#data-compose-experimental) a single (labeled, partly labeled, or unlabeled) dataset from multiple datasets 12 | - [**Snowball**](#snowball) a dataset to find incoming or outgoing citations 13 | - [**Sample**](#sample) old, random, and new papers in order to check if the terminology has changed over time. 14 | 15 | Several [tutorials](Tutorials.md) are available that show how 16 | `ASReview-Datatools` can be used in different scenarios. 17 | 18 | ASReview datatools is available for ASReview LAB version 1 or later. 19 | If you are using ASReview LAB version 0.x, use [ASReview-statistics](https://pypi.org/project/asreview-statistics/) instead of ASReview datatools. 20 | 21 | ## Installation 22 | ASReview Datatools requires Python 3.7+ and [ASReview LAB](https://github.com/asreview/asreview) version 1.1 or later. 23 | 24 | The easiest way to install the extension is to install it from PyPI: 25 | 26 | ``` bash 27 | pip install asreview-datatools 28 | ``` 29 | 30 | After installation of the datatools extension, `asreview` should automatically 31 | detect it. Test this with the following command: 32 | 33 | ```bash 34 | asreview --help 35 | ``` 36 | 37 | The extension is successfully installed if it lists `asreview data`. 38 | 39 | To make sure that you are working with the latest version of datatools you can use: 40 | 41 | ```bash 42 | pip install asreview-datatools --upgrade 43 | ``` 44 | 45 | ## Getting started 46 | 47 | ASReview Datatools is a command line tool that extends ASReview LAB. Each 48 | subsection below describes one of the tools. 
The structure is
49 | 
50 | ```bash
51 | asreview data NAME_OF_TOOL
52 | ```
53 | 
54 | where `NAME_OF_TOOL` is the name of one of the tools below (`describe`, `convert`, `dedup`, `vstack`, `compose`, `snowball`, or `sample`)
55 | followed by positional arguments and optional arguments.
56 | 
57 | Each tool has its own help description which is available with
58 | 
59 | ```bash
60 | asreview data NAME_OF_TOOL -h
61 | ```
62 | 
63 | ## Tools
64 | ### Data Describe
65 | 
66 | Describe the content of a dataset
67 | 
68 | ```bash
69 | asreview data describe MY_DATASET.csv
70 | ```
71 | 
72 | Export the results to a file (`output.json`)
73 | 
74 | ```bash
75 | asreview data describe MY_DATASET.csv -o output.json
76 | ```
77 | 
78 | Describe the `van_de_schoot_2018` dataset from the [benchmark
79 | platform](https://github.com/asreview/systematic-review-datasets).
80 | 
81 | ```bash
82 | asreview data describe synergy:van_de_schoot_2018 -o output.json
83 | ```
84 | ```
85 | {
86 |   "asreviewVersion": "1.1",
87 |   "apiVersion": "1.1.1",
88 |   "data": {
89 |     "items": [
90 |       {
91 |         "id": "n_records",
92 |         "title": "Number of records",
93 |         "description": "The number of records in the dataset.",
94 |         "value": 6189
95 |       },
96 |       {
97 |         "id": "n_relevant",
98 |         "title": "Number of relevant records",
99 |         "description": "The number of relevant records in the dataset.",
100 |         "value": 43
101 |       },
102 |       {
103 |         "id": "n_irrelevant",
104 |         "title": "Number of irrelevant records",
105 |         "description": "The number of irrelevant records in the dataset.",
106 |         "value": 6146
107 |       },
108 |       {
109 |         "id": "n_unlabeled",
110 |         "title": "Number of unlabeled records",
111 |         "description": "The number of unlabeled records in the dataset.",
112 |         "value": 0
113 |       },
114 |       {
115 |         "id": "n_missing_title",
116 |         "title": "Number of records with missing title",
117 |         "description": "The number of records in the dataset with missing title.",
118 |         "value": 5
119 |       },
120 |       {
121 |         "id": "n_missing_abstract",
122 |         "title": "Number of records with missing abstract",
123 |         "description": "The number of records in the dataset with missing abstract.",
124 |         "value": 764
125 |       },
126 |       {
127 |         "id": "n_duplicates",
128 |         "title": "Number of duplicate records (basic algorithm)",
129 |         "description": "The number of duplicate records in the dataset based on similar text.",
130 |         "value": 104
131 |       }
132 |     ]
133 |   }
134 | }
135 | ```
136 | 
137 | ### Data Convert
138 | 
139 | Convert the format of a dataset. For example, convert a RIS dataset into a
140 | CSV, Excel, or TAB dataset.
141 | 
142 | ```
143 | asreview data convert MY_DATASET.ris MY_OUTPUT.csv
144 | ```
145 | 
146 | ### Data Dedup
147 | 
148 | Remove duplicate records with a simple and straightforward deduplication
149 | [algorithm](https://asreview.readthedocs.io/en/latest/generated/asreview.ASReviewData.duplicated.html#asreview.ASReviewData.duplicated). The algorithm first removes all duplicates based on a persistent
150 | identifier (PID). It then concatenates the title and abstract, strips all
151 | non-alphanumeric characters, and removes the remaining duplicates (a conceptual sketch of this text-matching step is shown after the examples below).
152 | 
153 | ```
154 | asreview data dedup MY_DATASET.ris
155 | ```
156 | 
157 | Export the deduplicated dataset to a file (`output.csv`)
158 | 
159 | ```
160 | asreview data dedup MY_DATASET.ris -o output.csv
161 | ```
162 | 
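For intuition, here is the text-matching step as a simplified, conceptual sketch in pandas (the `title` and `abstract` column names match the ASReview data format; this is an illustration, not the actual implementation):

```python
import pandas as pd

# two records that differ only in punctuation and capitalization
df = pd.DataFrame(
    {
        "title": ["A study of X", "A Study of X!"],
        "abstract": ["Some results.", "Some results"],
    }
)

# concatenate title and abstract, strip non-alphanumeric characters, lowercase
text = (
    (df["title"].fillna("") + " " + df["abstract"].fillna(""))
    .str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
    .str.lower()
    .str.strip()
)

# keep only the first record of each duplicate pair
print(df[~text.duplicated()])
```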
163 | By default, the PID is set to 'doi'. The `dedup` function offers the option to
164 | use a different PID. Consider a dataset with PubMed identifiers (`PMID`); this
165 | identifier can then be used for deduplication.
166 | 
167 | ```
168 | asreview data dedup MY_DATASET.csv -o output.csv --pid PMID
169 | ```
170 | 
171 | Using the `van_de_schoot_2018` dataset from the [benchmark
172 | platform](https://github.com/asreview/systematic-review-datasets).
173 | 
174 | ```bash
175 | asreview data dedup synergy:van_de_schoot_2018 -o van_de_schoot_2018_dedup.csv
176 | ```
177 | ```
178 | Found 104 duplicates in dataset with 6189 records.
179 | ```
180 | 
181 | We can also choose to deduplicate based on the similarity of the title and abstract, instead of checking for an exact match. This way we can find duplicates that have small differences but are actually the same record (for example, an additional comma or a fixed typo). This can be done by using the `--similar` flag. This process takes about 4 seconds on a dataset of about 2068 entries.
182 | 
183 | ```bash
184 | asreview data dedup neurips_2020.tsv --similar
185 | ```
186 | ```
187 | Not using doi for deduplication because there is no such data.
188 | Deduplicating: 100%|████████████████████████████████████| 2068/2068 [00:03<00:00, 531.93it/s]
189 | Found 2 duplicates in dataset with 2068 records.
190 | ```
191 | 
192 | If we want to check which entries were found as duplicates, we can use the `--verbose` flag. This will print the lines of the dataset that were found as duplicates, as well as the difference between them. Any text that has to be removed from the first entry to become the second one is shown in red with a strikethrough, and any text that has to be added to the first entry is shown in green. All text that is the same in both entries is dimmed.
193 | 
194 | ```bash
195 | asreview data dedup neurips_2020.tsv --similar --verbose
196 | ```
197 | 
198 | ![Verbose drop similar](./dedup_similar.png)
199 | 
200 | The similarity threshold can be set with the `--threshold` flag. The default similarity threshold is `0.98`. We can also choose to only use the title for deduplication by using the `--title_only` flag.
201 | 
202 | ```bash
203 | asreview data dedup neurips_2020.tsv --similar --threshold 0.98 --title_only
204 | ```
205 | ```
206 | Not using doi for deduplication because there is no such data.
207 | Deduplicating: 100%|████████████████████████████████████| 2068/2068 [00:02<00:00, 770.74it/s]
208 | Found 4 duplicates in dataset with 2068 records.
209 | ```
210 | 
211 | Note that you might have to adjust the similarity threshold if you choose to only use the title for deduplication. The similarity score is calculated using the [SequenceMatcher](https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher) class from the `difflib` package, as the ratio of the number of matching characters to the total number of characters in the two strings. For example, the similarity score between the strings "hello" and "hello world" is 0.625. By default, we use the [real_quick_ratio](https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.real_quick_ratio) and [quick_ratio](https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.quick_ratio) methods, which are faster and usually good enough, but less accurate. If you want to use the `ratio` method as well, you can use the `--strict` flag.
212 | 
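To get a feel for these scores, you can try the three methods directly in plain Python (standard library only):

```python
from difflib import SequenceMatcher

m = SequenceMatcher(None, "hello", "hello world")

print(m.real_quick_ratio())  # 0.625 -- cheap upper bound on the similarity
print(m.quick_ratio())       # 0.625 -- tighter upper bound
print(m.ratio())             # 0.625 -- exact Ratcliff-Obershelp score
```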
213 | Now, if we want to discard stopwords during deduplication (for a stricter check on the important words), we can pass a language to the `--stopwords_language` flag, for example `--stopwords_language english`. The supported languages are the ones supported by the [nltk](https://www.nltk.org/index.html) package. To check the list of available languages, you can run the following commands in your Python environment:
214 | 
215 | ```python
216 | from nltk.corpus import stopwords
217 | print(stopwords.fileids())
218 | ```
219 | ```
220 | ['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']
221 | ```
222 | 
223 | ### Data Vstack (Experimental)
224 | 
225 | Vertical stacking: combine as many datasets in the same file format as you want into a single dataset.
226 | 
227 | ❗ Vstack is an experimental feature. We would love to hear your feedback.
228 | Please keep in mind that this feature can change in the future.
229 | 
230 | Stack several datasets on top of each other:
231 | ```
232 | asreview data vstack output.csv MY_DATASET_1.csv MY_DATASET_2.csv MY_DATASET_3.csv
233 | ```
234 | Here, three datasets are exported into a single dataset `output.csv`.
235 | The output path can be followed by any number of datasets to be stacked.
236 | 
237 | This is an example using the [demo datasets](https://github.com/asreview/asreview-datatools/tree/master/tests/demo_data):
238 | 
239 | ```bash
240 | asreview data vstack output.ris dataset_1.ris dataset_2.ris
241 | ```
242 | 
243 | 
244 | ### Data Compose (Experimental)
245 | 
246 | With compose, datasets containing records with different labels (or no
247 | labels) can be assembled into a single dataset.
248 | 
249 | ❗ Compose is an experimental feature. We would love to hear your feedback.
250 | Please keep in mind that this feature can change in the future.
251 | 
252 | Overview of possible input files and corresponding properties; use at least
253 | one of the following arguments:
254 | 
255 | | Arguments            | Action                                      |
256 | |----------------------|---------------------------------------------|
257 | | `--relevant`, `-r`   | Label all records from this dataset as `relevant` in the composed dataset.   |
258 | | `--irrelevant`, `-i` | Label all records from this dataset as `irrelevant` in the composed dataset. |
259 | | `--labeled`, `-l`    | Use existing labels from this dataset in the composed dataset.               |
260 | | `--unlabeled`, `-u`  | Remove all labels from this dataset in the composed dataset.                 |
261 | 
262 | The output path should always be specified.
263 | 
264 | Duplicate checking is based on title/abstract and a persistent identifier
265 | (PID) like the digital object identifier (DOI). By default, `doi` is used as
266 | PID. It is possible to use the flag `--pid` to specify a persistent
267 | identifier other than `doi`. In case duplicate records are detected, the user
268 | is warned, and the conflicting records are shown. To specify what happens in
269 | case of conflicts, use the `--conflict_resolve`/`-c` flag.
This is set to 270 | `keep_one` by default, options are: 271 | 272 | | Resolve method | Action in case of conflict | 273 | |----------------|-----------------------------------------------------------------------------------------| 274 | | `keep_one` | Keep one label, using `--hierarchy` to determine which label to keep | 275 | | `keep_all` | Keep conflicting records as duplicates in the composed dataset (ignoring `--hierarchy`) | 276 | | `abort` | Abort | 277 | 278 | 279 | In case of an ambiguously labeled record (e.g., one record with two different 280 | labels), use `--hierarchy` to specify a hierarchy of labels. Pass the letters 281 | `r` (relevant), `i` (irrelevant), and `u` (unlabeled) in any order to set 282 | label hierarchy. By default, the order is `riu` meaning that relevant labels 283 | are prioritized over irrelevant and unlabeled, and irrelevant labels are 284 | prioritized over unlabeled ones. 285 | 286 | 287 | Asume you have records in `MY_DATASET_1.ris` from which you want to keep all 288 | existing labels and records in `MY_DATASET_2.ris` which you want to keep 289 | unlabeled. Both datasets can be composed into a single dataset using: 290 | 291 | ```bash 292 | asreview data compose composed_output.ris -l DATASET_1.ris -u DATASET_2.ris --hierarchy uir -c abort 293 | ``` 294 | Because of the flag `-c abort` in case of conflicting/contradictory labels, 295 | the user is warned, records with inconsistent labels are shown, and the script 296 | is aborted. The flag `--hierarchy uir` results in the following hierarch if any 297 | duplicate ambiguously labeled records exist: unlabeled is prioritized over 298 | irrelevant and relevant labels, and irrelevant labels are prioritized over 299 | relevant labels. 300 | 301 | ## Snowball 302 | 303 | ASReview Datatools supports snowballing via the `asreview data snowball` subcommand. 304 | It can perform both backwards (outgoing citations) and forwards (incoming citations) 305 | snowballing. The tool works by searching the [OpenAlex](https://openalex.org/) database 306 | for citation data. An example usage would be: 307 | 308 | ```bash 309 | asreview data snowball input_dataset.csv output_dataset.csv --forward 310 | ``` 311 | 312 | This performs forwards snowballing on `input_dataset.csv` and writes the results to 313 | `output_dataset.csv`. For this to work it is necessary that the input dataset contains 314 | a column with DOI's or a column called `openalex_id` containing OpenAlex work 315 | identifiers. The output dataset will contain the columns `id`, `doi`, `title`, `abstract`, `referenced_works` and `publication_date`. In the case of forward snowballing it will 316 | contain all works in OpenAlex that have a reference to one of the included works in the 317 | input dataset. In the case of backward snowballing it will contain all works in OpenAlex 318 | with referenced by one of the included works of the input dataset. 319 | 320 | If you want to find references for all records in your dataset, instead of just the included works, you can include the flag `--all`, so for example: 321 | 322 | ```bash 323 | asreview data snowball input_dataset.csv output_dataset.csv --backward --all 324 | ``` 325 | 326 | One thing to note is that OpenAlex will handle data requests faster if the sender sends along their email with the request (see [OpenAlex Polite Pool](https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication#the-polite-pool 327 | )), you can to this using the `--email` argument. 
An example would be:
328 | 
329 | ```bash
330 | asreview data snowball input_dataset.csv output_dataset.csv --backward --email my_email@provider.com
331 | ```
332 | 
333 | ## Sample
334 | 
335 | This datatool is used to sample old, random, and new records from your dataset by using the `asreview data sample` command. The sampled records are then stored in an output file. This can be useful for detecting concept drift, meaning that the words used for certain concepts change over time. This script assumes that the dataset includes a column named `publication_year`. An example would be:
336 | 
337 | ```bash
338 | asreview data sample input_dataset.xlsx output_dataset.xlsx 50
339 | ```
340 | This samples the `50` oldest and `50` newest records from `input_dataset.xlsx` and samples `50` records randomly (without overlap from the old and new partitions!). The resulting 150 records are written to `output_dataset.xlsx`.
341 | 
342 | ## License
343 | 
344 | This extension is published under the [MIT license](/LICENSE).
345 | 
346 | ## Contact
347 | 
348 | This extension is part of the ASReview project ([asreview.ai](https://asreview.ai)). It is maintained by the
349 | maintainers of ASReview LAB. See [ASReview
350 | LAB](https://github.com/asreview/asreview) for contact information and more
351 | resources.
352 | 
--------------------------------------------------------------------------------
/Tutorials.md:
--------------------------------------------------------------------------------
1 | # Tutorials
2 | 
3 | ---
4 | Below are several examples to illustrate how to use `ASReview-datatools`. Make
5 | sure to have installed
6 | [asreview-datatools](https://github.com/asreview/asreview-datatools) and
7 | [ASReview LAB](https://asreview.nl/download/) v1.1 or higher.
8 | 
9 | Overview of the tutorials:
10 | 1. [Update systematic review](#update-systematic-review)
11 | 2. [Add prior knowledge](#add-prior-knowledge)
12 | 3. [Prepare a dataset for a simulation study](#prepare-a-dataset-for-a-simulation-study)
13 | 
14 | 
15 | Allowed data formats are described in the [ASReview
16 | documentation](https://asreview.readthedocs.io/en/latest/data_format.html).
17 | ASReview converts the labeling decisions in [RIS files](https://asreview.readthedocs.io/en/latest/data_format.html#ris-file-format) to a binary variable:
18 | irrelevant as `0` and relevant as `1`. Records marked as unseen or with
19 | missing labeling decisions are converted to `-1`.
20 | 
21 | ---
22 | 
23 | ## Update Systematic Review
24 | 
25 | Assume you are working on a systematic review and you want to update the
26 | review with newly available records. The original data is stored in
27 | `MY_LABELED_DATASET.csv` and the file contains a
28 | [column](https://asreview.readthedocs.io/en/latest/data_labeled.html#label-format)
29 | containing the labeling decisions. In order to update the systematic review,
30 | you run the original search query again but with a new date. You save the
31 | newly found records in `SEARCH_UPDATE.ris`.
32 | 
33 | 
34 | In the command line interface (CLI), navigate to the directory where the
35 | dataset(s) are stored:
36 | 
37 | ```bash
38 | cd Parent_directory
39 | ```
40 | 
41 | ### Preparing your data
42 | 
43 | The original data and the newly found records are in different file
44 | formats (CSV and RIS). You can convert files to the same file format using the
45 | `convert` script.
For example, to convert `SEARCH_UPDATE.ris` to CSV format,
46 | run the following from the directory where the
47 | dataset(s) are stored:
48 | 
49 | ```bash
50 | asreview data convert SEARCH_UPDATE.ris SEARCH_UPDATE.csv
51 | ```
52 | 
53 | Duplicate records can be removed with the `dedup` script. The algorithm
54 | removes duplicates using the Digital Object Identifier
55 | ([DOI](https://www.doi.org/)) and the title plus abstract.
56 | 
57 | ```bash
58 | asreview data dedup SEARCH_UPDATE.csv -o SEARCH_UPDATE_DEDUP.csv
59 | ```
60 | 
61 | This can also be done considering a similarity threshold between the titles and abstracts.
62 | 
63 | ```bash
64 | asreview data dedup SEARCH_UPDATE.csv -o SEARCH_UPDATE_DEDUP.csv --similar
65 | ```
66 | 
67 | ### Describe input
68 | 
69 | If you want to see descriptive info on your input datasets, run these commands:
70 | 
71 | ```bash
72 | asreview data describe MY_LABELED_DATASET.csv -o MY_LABELED_DATASET_description.json
73 | asreview data describe SEARCH_UPDATE_DEDUP.csv -o SEARCH_UPDATE_description.json
74 | ```
75 | The results will be exported to `MY_LABELED_DATASET_description.json` and `SEARCH_UPDATE_description.json`.
76 | 
77 | ### Compose datasets
78 | 
79 | Use the `compose` script to add `SEARCH_UPDATE_DEDUP.csv` to `MY_LABELED_DATASET.csv`:
80 | 
81 | ```bash
82 | asreview data compose updated_search.csv -l MY_LABELED_DATASET.csv -u SEARCH_UPDATE_DEDUP.csv
83 | ```
84 | The flag `-l` means the labels in `MY_LABELED_DATASET.csv` will be kept.
85 | 
86 | The flag `-u` means all records from `SEARCH_UPDATE_DEDUP.csv` will be
87 | added as unlabeled to the composed dataset.
88 | 
89 | If a record exists in both datasets, the record containing a
90 | label is kept; see the default [conflict resolving
91 | strategy](https://github.com/asreview/asreview-datatools#data-compose-experimental).
92 | To keep both records (with and without label), use
93 | 
94 | ```bash
95 | asreview data compose updated_search.csv -l MY_LABELED_DATASET.csv -u SEARCH_UPDATE_DEDUP.csv -c keep_all
96 | ```
97 | 
98 | The composed dataset will be exported to `updated_search.csv`.
99 | 
100 | ### Describe output
101 | 
102 | To see descriptive info on the composed dataset:
103 | 
104 | ```bash
105 | asreview data describe updated_search.csv -o updated_search_description.json
106 | ```
107 | The result will be exported to `updated_search_description.json`.
108 | 
109 | ### Continue screening in ASReview lab
110 | 
111 | The [partially
112 | labeled](https://asreview.readthedocs.io/en/latest/data_labeled.html#partially-labeled-data)
113 | data, `updated_search.csv`, can be uploaded to [ASReview lab - Oracle
114 | mode](https://asreview.readthedocs.io/en/latest/project_create.html). The
115 | labels will be recognized by ASReview and used to train the first iteration
116 | of the model, and you can continue screening all unlabeled records found in the
117 | new search.
118 | 
119 | ---
120 | ## Add prior knowledge
121 | 
122 | Assume you have just executed a search query for a systematic review and you
123 | want to use a pre-defined set of relevant and irrelevant records as training
124 | data. The search results are stored in `SEARCH_RESULTS.ris`, and the records
125 | you already know to be relevant/irrelevant are saved in
126 | `PRIOR_RELEVANT.ris` and `PRIOR_IRRELEVANT.ris` respectively.
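As a reminder of the label encoding used throughout these tutorials: relevant records become `1`, irrelevant records `0`, and unlabeled records `-1`. After running the compose step below, you can verify this with a minimal sketch using the `asreview` Python API (the file name is the one produced later in this tutorial):

```python
from asreview.data import load_data

# inspect the composed dataset created in the compose step below
asdata = load_data("search_with_priors.ris")
print(asdata.labels)  # 1 = relevant, 0 = irrelevant, -1 = unlabeled
```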
127 | 
128 | 
129 | In the command line interface (CLI), navigate to the directory where the dataset(s) are stored:
130 | ```bash
131 | cd Parent_directory
132 | ```
133 | ### Describe input
134 | If you want to see descriptive info on your input datasets, run these commands:
135 | ```bash
136 | asreview data describe SEARCH_RESULTS.ris -o SEARCH_RESULTS_description.json
137 | asreview data describe PRIOR_RELEVANT.ris -o PRIOR_RELEVANT_description.json
138 | asreview data describe PRIOR_IRRELEVANT.ris -o PRIOR_IRRELEVANT_description.json
139 | ```
140 | 
141 | The results will be exported to `SEARCH_RESULTS_description.json`,
142 | `PRIOR_RELEVANT_description.json` and `PRIOR_IRRELEVANT_description.json`.
143 | 
144 | 
145 | ### Compose datasets
146 | To create one dataset with labels only for the training data to be used in ASReview, run:
147 | 
148 | ```bash
149 | asreview data compose search_with_priors.ris -u SEARCH_RESULTS.ris -r PRIOR_RELEVANT.ris -i PRIOR_IRRELEVANT.ris
150 | ```
151 | 
152 | The flag `-r` means all records from `PRIOR_RELEVANT.ris` will be added as
153 | relevant records to the composed dataset.
154 | 
155 | The flag `-i` means all records from `PRIOR_IRRELEVANT.ris` will be added
156 | as irrelevant.
157 | 
158 | The flag `-u` means all other records from `SEARCH_RESULTS.ris` will be
159 | added as unlabeled.
160 | 
161 | If any duplicate records exist across the datasets, by default the order of
162 | keeping labels is:
163 | 1. relevant
164 | 2. irrelevant
165 | 3. unlabeled
166 | 
167 | You can configure the behavior in resolving conflicting labels by setting the
168 | hierarchy differently. To do so, pass the letters r (relevant), i
169 | (irrelevant), and u (unlabeled) in any order to, for example, `--hierarchy
170 | uir`.
171 | 
172 | 
173 | The composed dataset will be exported to `search_with_priors.ris`.
174 | 
175 | ### Describe output
176 | To see descriptive info on the composed dataset:
177 | 
178 | ```bash
179 | asreview data describe search_with_priors.ris -o search_with_priors_description.json
180 | ```
181 | 
182 | The result will be exported to `search_with_priors_description.json`.
183 | 
184 | 
185 | ### Start screening in ASReview lab
186 | 
187 | The [partially
188 | labeled](https://asreview.readthedocs.io/en/latest/data_labeled.html#partially-labeled-data)
189 | data, `search_with_priors.ris`, can be uploaded to [ASReview lab - Oracle
190 | mode](https://asreview.readthedocs.io/en/latest/project_create.html). The
191 | labels will be recognized by ASReview and used to train the first iteration
192 | of the model, and you can start screening the remaining
193 | unlabeled records.
194 | 
195 | 
196 | ---
197 | ## Prepare a dataset for a simulation study
198 | 
199 | Assume you want to use the [simulation
200 | mode](https://asreview.readthedocs.io/en/latest/simulation_overview.html) of
201 | ASReview but the data is not stored in one single file containing the metadata
202 | and labeling decisions as required by ASReview.
203 | 
204 | Suppose the following files are available:
205 | 
206 | - `SCREENED.ris`: all records that were screened
207 | - `RELEVANT.ris`: the subset of relevant records after manually screening all the records.
208 | 
209 | You need to compose the files into a single file where all records from
210 | `RELEVANT.ris` are relevant and all other records are irrelevant.
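Conceptually, this composition assigns labels as in the following toy sketch (made-up DOIs; `included` is the label column used by ASReview Datatools; this is an illustration, not the actual tool code):

```python
import pandas as pd

# everything in SCREENED is irrelevant (0), unless it also appears in RELEVANT (1)
screened = pd.DataFrame({"doi": ["10.1/a", "10.1/b", "10.1/c"]})
relevant = pd.DataFrame({"doi": ["10.1/b"]})

screened["included"] = screened["doi"].isin(relevant["doi"]).astype(int)
print(screened)
#       doi  included
# 0  10.1/a         0
# 1  10.1/b         1
# 2  10.1/c         0
```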
211 | 
212 | In the command line interface (CLI), navigate to the directory where the
213 | dataset(s) are stored:
214 | 
215 | ```bash
216 | cd Parent_directory
217 | ```
218 | 
219 | ### Describe input
220 | 
221 | If you want to see descriptive info on your input datasets, run these commands:
222 | 
223 | ```bash
224 | asreview data describe SCREENED.ris -o SCREENED_description.json
225 | asreview data describe RELEVANT.ris -o RELEVANT_description.json
226 | ```
227 | The results will be exported to `SCREENED_description.json` and `RELEVANT_description.json`.
228 | 
229 | ### Compose datasets
230 | 
231 | Use the `compose` script to compose a new dataset from `SCREENED.ris` and `RELEVANT.ris`:
232 | 
233 | ```bash
234 | asreview data compose screened_with_labels.ris -i SCREENED.ris -r RELEVANT.ris
235 | ```
236 | 
237 | The flag `-r` means all records from `RELEVANT.ris` will be added as
238 | relevant to the composed dataset.
239 | 
240 | The flag `-i` means all other records from `SCREENED.ris` will be added as
241 | irrelevant.
242 | 
243 | The composed dataset will be exported to `screened_with_labels.ris`.
244 | 
245 | ### Describe output
246 | 
247 | To see descriptive info on the composed dataset:
248 | 
249 | ```bash
250 | asreview data describe screened_with_labels.ris -o screened_with_labels_description.json
251 | ```
252 | The result will be exported to `screened_with_labels_description.json`.
253 | 
254 | ### Run simulation in ASReview lab
255 | 
256 | The resulting file `screened_with_labels.ris` can be uploaded to [ASReview lab
257 | Simulation
258 | mode](https://asreview.readthedocs.io/en/latest/simulation_webapp.html). This
259 | allows you to simulate the screening procedure of the systematic review as if
260 | it were carried out using ASReview lab.
261 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from asreviewcontrib.datatools._version import __version__
3 |     from asreviewcontrib.datatools._version import __version_tuple__
4 | except ImportError:
5 |     __version__ = "0.0.0"
6 |     __version_tuple__ = (0, 0, 0)
7 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/compose.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import warnings
3 | from pathlib import Path
4 | 
5 | import pandas as pd
6 | from asreview import ASReviewData
7 | from asreview.data.base import load_data
8 | 
9 | 
10 | def _check_order_arg(order):
11 |     # if no hierarchy is specified, set to default: "riu"
12 |     if order is None:
13 |         return "riu"
14 | 
15 |     allowed_orders = ["riu", "rui", "uri", "uir", "iru", "iur"]
16 |     if order in allowed_orders:
17 |         return order
18 |     else:
19 |         raise ValueError(
20 |             f"hierarchy '{order}' not found, should be one of the"
21 |             f" following: {allowed_orders}"
22 |         )
23 | 
24 | 
25 | def _check_resolve_arg(resolve):
26 |     # if no resolve method is specified, set to default: "keep_one"
27 |     if resolve is None:
28 |         return "keep_one"
29 | 
30 |     allowed_resolve = ["keep_one", "keep_all", "abort"]
31 |     if resolve in allowed_resolve:
32 |         return resolve
33 |     else:
34 |         raise ValueError(
35 |             f"conflict_resolve '{resolve}' not found, should be one "
36 |             f"of the following: {allowed_resolve}"
37 |         )
38 | 
39 | 
40 | def _check_suffix(input_files, output_file):
41 |     # Also raises ValueError on URLs that do not end with a file extension
42 |     suffixes = [Path(item).suffix for item in input_files if item is not None]
43 |     suffixes.append(Path(output_file).suffix)
44 | 
45 |     set_ris = {".txt", ".ris"}
46 |     set_tabular = {".csv", ".tab", ".tsv", ".xlsx"}
47 |     set_suffixes = set(suffixes)
48 | 
49 |     if len(set_suffixes) > 1:
50 |         if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
51 |             raise ValueError(
52 |                 "Files with different file types were given; all input files, as"
53 |                 " well as the output file, should be of the same type."
54 |             )
55 | 
56 | 
57 | def _check_label_errors(as_lab, path_lab):
58 |     if as_lab is not None:
59 |         if as_lab.labels is None:
60 |             warnings.warn(
61 |                 f"'{path_lab}' was passed as a labeled dataset but no labels were"
62 |                 " found, continuing with its records marked as unlabeled. If this is"
63 |                 " not correct, check if your data format complies with:"
64 |                 " https://asreview.readthedocs.io/en/latest/data_format.html",
65 |                 stacklevel=1,
66 |             )
67 | 
68 | 
69 | def _append_df(list_df, as_obj, label):
70 |     # retrieve part of dataframe with label -1, 0 or 1
71 |     df_slice = as_obj.df[as_obj.labels == label].reset_index(drop=True)
72 | 
73 |     if not df_slice.empty:
74 |         list_df.append(df_slice)
75 | 
76 | 
77 | def _concat_label(list_df, label, pid="doi"):
78 |     # if there are any dataframes with the given label, concatenate and drop
79 |     # duplicates on pid and title/abstract
80 |     if list_df:
81 |         df_all = pd.concat(list_df).reset_index(drop=True)
82 |         df_all["included"] = label
83 |         n_total = len(df_all)
84 | 
85 |         df_all = ASReviewData(df=df_all).drop_duplicates(pid=pid).reset_index(drop=True)
86 | 
87 |         n_total_dedup = n_total - len(df_all)
88 |         print(
89 |             f"Detected {n_total} records with label '{label}', of which"
90 |             f" {n_total_dedup} duplicate records with the same label were removed."
91 | ) 92 | else: 93 | df_all = pd.DataFrame() 94 | 95 | return df_all 96 | 97 | 98 | def create_composition( 99 | rel_path=None, 100 | irr_path=None, 101 | lab_path=None, 102 | unl_path=None, 103 | pid="doi", 104 | order="riu", 105 | resolve="keep_one", 106 | ): 107 | # load all input files and URLs into ASReviewData objects, fill with None 108 | # if input was not specified 109 | input_files = [rel_path, irr_path, lab_path, unl_path] 110 | as_rel, as_irr, as_lab, as_unl = ( 111 | load_data(item) if item is not None else None for item in input_files 112 | ) 113 | 114 | # check whether input files are correctly labeled 115 | _check_label_errors(as_lab, lab_path) 116 | 117 | # create lists to append dataframes with a specific label to 118 | list_df_rel, list_df_irr, list_df_unl = [], [], [] 119 | 120 | # split labeled input data in relevant, irrelevant and unlabeled and add 121 | # to list of dataframes for that label 122 | if as_lab is not None: 123 | if as_lab.labels is not None: 124 | _append_df(list_df_rel, as_lab, 1) 125 | _append_df(list_df_irr, as_lab, 0) 126 | _append_df(list_df_unl, as_lab, -1) 127 | else: 128 | list_df_unl.append(as_lab.df) 129 | 130 | # add dataframe to list of dataframes for that label 131 | if as_rel is not None: 132 | list_df_rel.append(as_rel.df) 133 | if as_irr is not None: 134 | list_df_irr.append(as_irr.df) 135 | if as_unl is not None: 136 | list_df_unl.append(as_unl.df) 137 | 138 | # concatenate all dataframes with the same label, drop duplicates and map 139 | # them in a dictionary 140 | dict_dfs = { 141 | "r": _concat_label(list_df_rel, 1, pid), 142 | "i": _concat_label(list_df_irr, 0, pid), 143 | "u": _concat_label(list_df_unl, -1, pid), 144 | } 145 | 146 | # map letters to corresponding term 147 | dict_terms = {"r": "relevant", "i": "irrelevant", "u": "unlabeled"} 148 | 149 | # concatenate in specified order, only the first duplicate entry is kept 150 | as_conflict = ASReviewData( 151 | df=pd.concat( 152 | [dict_dfs[order[0]], dict_dfs[order[1]], dict_dfs[order[2]]] 153 | ).reset_index(drop=True) 154 | ) 155 | 156 | # check for label conflicts 157 | df_conflicting_dups = as_conflict.df[as_conflict.duplicated(pid)] 158 | if len(df_conflicting_dups) > 0: 159 | as_conflicts_only = ASReviewData(df=df_conflicting_dups.reset_index(drop=True)) 160 | # create a dataframe with the relevant info for the user 161 | if pid in as_conflicts_only.df.columns: 162 | df_info_conflicts = pd.DataFrame( 163 | { 164 | pid: as_conflicts_only.df[pid].fillna(""), 165 | "Title": as_conflicts_only.title, 166 | "Abstract": as_conflicts_only.abstract, 167 | } 168 | ) 169 | else: 170 | df_info_conflicts = pd.DataFrame( 171 | { 172 | "Title": as_conflicts_only.title, 173 | "Abstract": as_conflicts_only.abstract, 174 | } 175 | ) 176 | 177 | # pandas settings to print properly 178 | with pd.option_context( 179 | "display.max_rows", 180 | None, 181 | "display.max_columns", 182 | 3, 183 | "max_colwidth", 184 | 40, 185 | "display.width", 186 | 500, 187 | "display.colheader_justify", 188 | "left", 189 | ): 190 | print( 191 | f"\nSome records have inconsistent labels in the input files. This may" 192 | " be intentional because you are trying to overwrite labels in an input" 193 | " file with labels from another input file. 
However, it may also be"
194 |                 " because some records are unintentionally labeled inconsistently.\n\n"
195 |                 "The following records have inconsistent labels in the input files:\n"
196 |                 f"{df_info_conflicts}\n"
197 |             )
198 | 
199 |         if resolve == "abort":
200 |             raise ValueError("Abort composing because inconsistent labels were found.")
201 | 
202 |         elif resolve == "keep_one":
203 |             warnings.warn(
204 |                 "Continuing, keeping one label for records with inconsistent labels,"
205 |                 " resolving conflicts using the following hierarchy:"
206 |                 f"\n1. {dict_terms[order[0]]}\n2. {dict_terms[order[1]]}"
207 |                 f"\n3. {dict_terms[order[2]]}",
208 |                 stacklevel=1,
209 |             )
210 |             df_composed = as_conflict.drop_duplicates(pid=pid).reset_index(drop=True)
211 | 
212 |         elif resolve == "keep_all":
213 |             warnings.warn(
214 |                 "Continuing, keeping all labels for duplicate records with inconsistent"
215 |                 " labels.",
216 |                 stacklevel=1,
217 |             )
218 |             df_composed = as_conflict.df
219 | 
220 |     else:
221 |         df_composed = as_conflict.df
222 | 
223 |     # move included column to the end of dataframe
224 |     included = df_composed.pop("included")
225 |     df_composed = df_composed.assign(included=included)
226 | 
227 |     return df_composed
228 | 
229 | 
230 | def _output_composition(final_df, output_file):
231 |     # prepare collected labels to pass to the output file
232 |     labels = [[index, row["included"]] for index, row in final_df.iterrows()]
233 |     as_composed = ASReviewData(df=final_df)
234 | 
235 |     as_composed.to_file(output_file, labels=labels)
236 | 
237 |     print(f"Finished, exported composed dataset to {output_file}.")
238 | 
239 | 
240 | def compose(
241 |     output_file, rel, irr, lab, unl, pid="doi", order="riu", resolve="keep_one"
242 | ):
243 |     # check whether all input has the same file extension
244 |     _check_suffix([rel, irr, lab, unl], output_file)
245 | 
246 |     df_composition = create_composition(
247 |         rel, irr, lab, unl, pid=pid, order=order, resolve=resolve
248 |     )
249 |     _output_composition(df_composition, output_file)
250 | 
251 | 
252 | def _parse_arguments_compose():
253 |     parser = argparse.ArgumentParser(prog="asreview data compose")
254 |     parser.add_argument("output_path", type=str, help="The output file path.")
255 |     parser.add_argument(
256 |         "--relevant", "-r", type=str, help="A dataset with relevant records."
257 |     )
258 |     parser.add_argument(
259 |         "--irrelevant", "-i", type=str, help="A dataset with irrelevant records."
260 |     )
261 |     parser.add_argument("--labeled", "-l", type=str, help="A labeled dataset.")
262 |     parser.add_argument("--unlabeled", "-u", type=str, help="An unlabeled dataset.")
263 |     parser.add_argument(
264 |         "--hierarchy",
265 |         dest="hierarchy",
266 |         type=_check_order_arg,
267 |         default="riu",
268 |         help="Hierarchy of labels in case of duplicates. Default: riu.",
269 |     )
270 |     parser.add_argument(
271 |         "--conflict_resolve",
272 |         "-c",
273 |         dest="conflict_resolve",
274 |         type=_check_resolve_arg,
275 |         default="keep_one",
276 |         help="Method for dealing with conflicting labels.",
277 |     )
278 |     parser.add_argument(
279 |         "--pid",
280 |         type=str,
281 |         default="doi",
282 |         help="Persistent identifier used for deduplication. Default: doi.",
" "Default: doi.", 283 | ) 284 | return parser 285 | -------------------------------------------------------------------------------- /asreviewcontrib/datatools/convert.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from asreview.data import ASReviewData 4 | 5 | 6 | def convert(input_path, output_path): 7 | # read data in ASReview data object 8 | asdata = ASReviewData.from_file(input_path) 9 | 10 | asdata.to_file(output_path) 11 | 12 | 13 | def _parse_arguments_convert(): 14 | parser = argparse.ArgumentParser(prog="asreview data convert") 15 | parser.add_argument("input_path", type=str, help="The file path of the dataset.") 16 | parser.add_argument("output_path", type=str, help="The file path of the dataset.") 17 | return parser 18 | -------------------------------------------------------------------------------- /asreviewcontrib/datatools/dedup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from difflib import SequenceMatcher 3 | 4 | import ftfy 5 | import pandas as pd 6 | from asreview import ASReviewData 7 | from pandas.api.types import is_object_dtype 8 | from pandas.api.types import is_string_dtype 9 | from rich.console import Console 10 | from rich.text import Text 11 | from tqdm import tqdm 12 | 13 | 14 | def _print_similar_list( 15 | similar_list: list[tuple[int, int]], 16 | data: pd.Series, 17 | pid: str, 18 | pids: pd.Series = None, 19 | ) -> None: 20 | print_seq_matcher = SequenceMatcher() 21 | console = Console() 22 | 23 | if pids is not None: 24 | print(f"Found similar titles or same {pid} at lines:") 25 | else: 26 | print("Found similar titles at lines:") 27 | 28 | for i, j in similar_list: 29 | print_seq_matcher.set_seq1(data.iloc[i]) 30 | print_seq_matcher.set_seq2(data.iloc[j]) 31 | text = Text() 32 | 33 | if pids is not None: 34 | text.append(f"\nLines {i + 1} and {j + 1} ", style="bold") 35 | if pids.iloc[i] == pids.iloc[j]: 36 | text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style="dim") 37 | else: 38 | text.append( 39 | f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', style="dim" 40 | ) 41 | 42 | else: 43 | text.append(f"\nLines {i + 1} and {j + 1}:\n", style="bold") 44 | 45 | for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes(): 46 | if tag == "replace": 47 | # add rich strikethrough 48 | text.append(f"{data.iloc[i][i1:i2]}", style="red strike") 49 | text.append(f"{data.iloc[j][j1:j2]}", style="green") 50 | if tag == "delete": 51 | text.append(f"{data.iloc[i][i1:i2]}", style="red strike") 52 | if tag == "insert": 53 | text.append(f"{data.iloc[j][j1:j2]}", style="green") 54 | if tag == "equal": 55 | text.append(f"{data.iloc[i][i1:i2]}", style="dim") 56 | 57 | console.print(text) 58 | 59 | print("") 60 | 61 | 62 | def _drop_duplicates_by_similarity( 63 | asdata: ASReviewData, 64 | pid: str, 65 | threshold: float = 0.98, 66 | title_only: bool = False, 67 | stopwords_language: str = None, 68 | strict: bool = False, 69 | verbose: bool = False, 70 | ) -> None: 71 | if title_only: 72 | data = asdata.df["title"] 73 | else: 74 | data = pd.Series(asdata.texts) 75 | 76 | symbols_regex = re.compile(r"[^ \w\d\-_]") 77 | spaces_regex = re.compile(r"\s+") 78 | 79 | # clean the data 80 | s = ( 81 | data.apply(ftfy.fix_text) 82 | .str.replace(symbols_regex, "", regex=True) 83 | .str.replace(spaces_regex, " ", regex=True) 84 | .str.lower() 85 | .str.strip() 86 | .replace("", None) 87 | ) 88 | 89 | if stopwords_language: 90 | try: 91 | from nltk.corpus 
91 |             from nltk.corpus import stopwords
92 | 
93 |             stopwords_set = set(stopwords.words(stopwords_language))
94 |         except LookupError:
95 |             import nltk
96 | 
97 |             nltk.download("stopwords")  # fetch the stopword corpus on first use
98 |             stopwords_set = set(stopwords.words(stopwords_language))
99 | 
100 |         stopwords_regex = re.compile(r"\b(?:" + "|".join(map(re.escape, stopwords_set)) + r")\b")
101 |         s = s.str.replace(stopwords_regex, "", regex=True)
102 | 
103 |     seq_matcher = SequenceMatcher()
104 |     duplicated = [False] * len(s)
105 | 
106 |     similar_list, pids = [], None
107 |     if pid in asdata.df.columns:
108 |         if is_string_dtype(asdata.df[pid]) or is_object_dtype(asdata.df[pid]):
109 |             pids = asdata.df[pid].str.strip().replace("", None)
110 |             if pid == "doi":
111 |                 pids = pids.str.lower().str.replace(
112 |                     r"^https?://(www\.)?doi\.org/", "", regex=True
113 |                 )
114 |         else:
115 |             pids = asdata.df[pid]
116 | 
117 |         for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
118 |             seq_matcher.set_seq2(text)
119 | 
120 |             # loop through the rest of the data if it has the same (normalized) pid or similar length
121 |             for j, t in s.iloc[i + 1 :][
122 |                 (pids == pids.iloc[i])
123 |                 | (abs(s.str.len() - len(text)) < 5)
124 |             ].items():
125 |                 seq_matcher.set_seq1(t)
126 | 
127 |                 # if the texts have the same pid or are similar enough,
128 |                 # mark the second one as duplicate
129 |                 if pids.iloc[i] == pids.iloc[j] or (
130 |                     seq_matcher.real_quick_ratio() > threshold
131 |                     and seq_matcher.quick_ratio() > threshold
132 |                     and (not strict or seq_matcher.ratio() > threshold)
133 |                 ):
134 |                     if not duplicated[j]:
135 |                         similar_list.append((i, j))
136 |                         duplicated[j] = True
137 | 
138 |     else:
139 |         print(f"Not using {pid} for deduplication because there is no such data.")
140 | 
141 |         for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
142 |             seq_matcher.set_seq2(text)
143 | 
144 |             # loop through the rest of the data if it has similar length
145 |             for j, t in s.iloc[i + 1 :][abs(s.str.len() - len(text)) < 5].items():
146 |                 seq_matcher.set_seq1(t)
147 | 
148 |                 # if the texts are similar enough, mark the second one as duplicate
149 |                 if (
150 |                     seq_matcher.real_quick_ratio() > threshold
151 |                     and seq_matcher.quick_ratio() > threshold
152 |                     and (not strict or seq_matcher.ratio() > threshold)
153 |                 ):
154 |                     if not duplicated[j]:
155 |                         similar_list.append((i, j))
156 |                         duplicated[j] = True
157 |     asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)
158 |     if verbose:
159 |         _print_similar_list(similar_list, data, pid, pids)
160 | 
161 | 
162 | def deduplicate_data(
163 |     asdata: ASReviewData,
164 |     output_path: str = None,
165 |     pid: str = "doi",
166 |     similar: bool = False,
167 |     threshold: float = 0.98,
168 |     title_only: bool = False,
169 |     stopwords_language: str = None,
170 |     strict: bool = False,
171 |     verbose: bool = False,
172 | ) -> None:
173 |     """Deduplicate an ASReview data object.
174 | 
175 |     Parameters
176 |     ----------
177 |     asdata : ASReviewData
178 |         The data object.
179 |     output_path : str, optional
180 |         If provided, the deduplicated data object is stored at this location. By
181 |         default None.
182 |     pid : str, optional
183 |         Persistent identifier to use for deduplication, by default "doi"
184 |     similar : bool, optional
185 |         Whether to deduplicate 'similar' records. The similarity of the records is
186 |         calculated using the `SequenceMatcher` from `difflib`. By default False.
187 |     threshold : float, optional
188 |         Threshold score above which two records are considered duplicate.
189 |         By default 0.98. Only applies if `similar` is set to `True`.
190 | title_only : bool, optional 191 | Only use the title for deduplication, by default False 192 | stopwords_language : str, optional 193 | Remove stopwords from this language before deduplicating, for example 'english'. 194 | By default None. Only applies if `similar` is set to `True`. 195 | strict : bool, optional 196 | Use a stricter algorithm to calculate the similarity between records. 197 | By default False. Only applies if `similar` is set to `True`. 198 | verbose : bool, optional 199 | Get verbose output during deduplicating. By default False. Only applies if 200 | `similar` is set to `True`. 201 | """ 202 | initial_length = len(asdata.df) 203 | 204 | if not similar: 205 | if pid not in asdata.df.columns: 206 | print(f"Not using {pid} for deduplication because there is no such data.") 207 | 208 | # retrieve deduplicated ASReview data object 209 | asdata.drop_duplicates(pid=pid, inplace=True) 210 | 211 | else: 212 | _drop_duplicates_by_similarity( 213 | asdata=asdata, 214 | pid=pid, 215 | threshold=threshold, 216 | title_only=title_only, 217 | stopwords_language=stopwords_language, 218 | strict=strict, 219 | verbose=verbose, 220 | ) 221 | 222 | if output_path: 223 | asdata.to_file(output_path) 224 | 225 | # count duplicates 226 | n_dup = initial_length - len(asdata.df) 227 | print(f"Found {n_dup} duplicates in dataset with {initial_length} records.") 228 | -------------------------------------------------------------------------------- /asreviewcontrib/datatools/describe.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import asreview 5 | from asreview.data import load_data 6 | from asreview.data.statistics import n_duplicates 7 | from asreview.data.statistics import n_irrelevant 8 | from asreview.data.statistics import n_missing_abstract 9 | from asreview.data.statistics import n_missing_title 10 | from asreview.data.statistics import n_records 11 | from asreview.data.statistics import n_relevant 12 | from asreview.data.statistics import n_unlabeled 13 | 14 | from asreviewcontrib.datatools import __version__ 15 | 16 | 17 | def describe(input_path, output_path=None): 18 | # read data in ASReview data object 19 | asdata = load_data(input_path) 20 | 21 | # based on https://google.github.io/styleguide/jsoncstyleguide.xml 22 | stats = { 23 | "asreviewVersion": asreview.__version__, 24 | "apiVersion": __version__, 25 | "data": { 26 | "items": [ 27 | { 28 | "id": "n_records", 29 | "title": "Number of records", 30 | "description": "The number of records in the dataset.", 31 | "value": n_records(asdata), 32 | }, 33 | { 34 | "id": "n_relevant", 35 | "title": "Number of relevant records", 36 | "description": "The number of relevant records in the dataset.", 37 | "value": n_relevant(asdata), 38 | }, 39 | { 40 | "id": "n_irrelevant", 41 | "title": "Number of irrelevant records", 42 | "description": "The number of irrelevant records in the dataset.", 43 | "value": n_irrelevant(asdata), 44 | }, 45 | { 46 | "id": "n_unlabeled", 47 | "title": "Number of unlabeled records", 48 | "description": "The number of unlabeled records in the dataset.", 49 | "value": n_unlabeled(asdata), 50 | }, 51 | { 52 | "id": "n_missing_title", 53 | "title": "Number of records with missing title", 54 | "description": ( 55 | "The number of records in the dataset with missing title." 
56 |                     ),
57 |                     "value": n_missing_title(asdata)[0],
58 |                 },
59 |                 {
60 |                     "id": "n_missing_abstract",
61 |                     "title": "Number of records with missing abstract",
62 |                     "description": (
63 |                         "The number of records in the dataset with missing abstract."
64 |                     ),
65 |                     "value": n_missing_abstract(asdata)[0],
66 |                 },
67 |                 {
68 |                     "id": "n_duplicates",
69 |                     "title": "Number of duplicate records (basic algorithm)",
70 |                     "description": (
71 |                         "The number of duplicate records in the dataset based on"
72 |                         " similar text."
73 |                     ),
74 |                     "value": n_duplicates(asdata),
75 |                 },
76 |             ]
77 |         },
78 |     }  # noqa
79 | 
80 |     if output_path:
81 |         with open(output_path, "w") as f:
82 |             json.dump(stats, f, indent=2)
83 | 
84 |     print(json.dumps(stats, indent=2))
85 | 
86 | 
87 | def _parse_arguments_describe():
88 |     parser = argparse.ArgumentParser(prog="asreview data describe")
89 |     parser.add_argument("input_path", type=str, help="The file path of the dataset.")
90 |     parser.add_argument(
91 |         "--output_path",
92 |         "-o",
93 |         default=None,
94 |         type=str,
95 |         help="The file path of the output JSON file.",
96 |     )
97 | 
98 |     return parser
99 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/entrypoint.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from asreview.data import load_data
4 | from asreview.entry_points import BaseEntryPoint
5 | 
6 | from asreviewcontrib.datatools import __version__
7 | from asreviewcontrib.datatools.compose import _parse_arguments_compose
8 | from asreviewcontrib.datatools.compose import compose
9 | from asreviewcontrib.datatools.convert import _parse_arguments_convert
10 | from asreviewcontrib.datatools.convert import convert
11 | from asreviewcontrib.datatools.dedup import deduplicate_data
12 | from asreviewcontrib.datatools.describe import _parse_arguments_describe
13 | from asreviewcontrib.datatools.describe import describe
14 | from asreviewcontrib.datatools.sample import _parse_arguments_sample
15 | from asreviewcontrib.datatools.sample import sample
16 | from asreviewcontrib.datatools.snowball import _parse_arguments_snowball
17 | from asreviewcontrib.datatools.snowball import snowball
18 | from asreviewcontrib.datatools.stack import _parse_arguments_vstack
19 | from asreviewcontrib.datatools.stack import vstack
20 | 
21 | DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball", "sample"]
22 | 
23 | 
24 | class DataEntryPoint(BaseEntryPoint):
25 |     description = "Home of all data tools for ASReview."
26 |     extension_name = "asreview-datatools"
27 | 
28 |     def __init__(self):
29 |         from asreviewcontrib.datatools.__init__ import __version__
30 | 
31 |         super().__init__()
32 | 
33 |         self.version = __version__
34 | 
35 |     def execute(self, argv):
36 |         if len(argv) > 1 and argv[0] in DATATOOLS:
37 |             if argv[0] == "describe":
38 |                 args_describe_parser = _parse_arguments_describe()
39 |                 args_describe = vars(args_describe_parser.parse_args(argv[1:]))
40 |                 describe(**args_describe)
41 |             if argv[0] == "convert":
42 |                 args_convert_parser = _parse_arguments_convert()
43 |                 args_convert = vars(args_convert_parser.parse_args(argv[1:]))
44 |                 convert(**args_convert)
45 |             if argv[0] == "dedup":
46 |                 dedup_parser = argparse.ArgumentParser(prog="asreview data dedup")
47 |                 dedup_parser.add_argument(
48 |                     "input_path", type=str, help="The file path of the dataset."
49 |                 )
50 |                 dedup_parser.add_argument(
51 |                     "--output_path",
52 |                     "-o",
53 |                     default=None,
54 |                     type=str,
55 |                     help="The file path of the output dataset.",
56 |                 )
57 |                 dedup_parser.add_argument(
58 |                     "--pid",
59 |                     default="doi",
60 |                     type=str,
61 |                     help="Persistent identifier used for deduplication. Default: doi.",
62 |                 )
63 |                 dedup_parser.add_argument(
64 |                     "--similar",
65 |                     action="store_true",
66 |                     help=(
67 |                         "Drop similar records, not only exactly matching records. The"
68 |                         " Ratcliff-Obershelp algorithm is used to calculate the"
69 |                         " similarity of records."
70 |                     ),
71 |                 )
72 |                 dedup_parser.add_argument(
73 |                     "--threshold",
74 |                     default=0.98,
75 |                     type=float,
76 |                     help=(
77 |                         "Records with a similarity score above this threshold are"
78 |                         " considered duplicates. Default: 0.98. Only applies if"
79 |                         " similarity is set to True."
80 |                     ),
81 |                 )
82 |                 dedup_parser.add_argument(
83 |                     "--title_only",
84 |                     action="store_true",
85 |                     help=(
86 |                         "Use only the title for deduplication. Only applies if"
87 |                         " similarity is set to True."
88 |                     ),
89 |                 )
90 |                 dedup_parser.add_argument(
91 |                     "--strict",
92 |                     action="store_true",
93 |                     help=(
94 |                         "Use a stricter version of the similarity algorithm. Only"
95 |                         " applies if similarity is set to True."
96 |                     ),
97 |                 )
98 |                 dedup_parser.add_argument(
99 |                     "--stopwords_language",
100 |                     default=None,
101 |                     type=str,
102 |                     help=(
103 |                         "Remove stopwords from this language before calculating"
104 |                         " similarity. For example 'english'. Only applies if similarity"
105 |                         " is set to True."
106 |                     ),
107 |                 )
108 |                 dedup_parser.add_argument(
109 |                     "--verbose",
110 |                     action="store_true",
111 |                     help=(
112 |                         "Print verbose output. Only applies if similarity is set to"
113 |                         " True."
114 |                     ),
115 |                 )
116 | 
117 |                 args_dedup = dedup_parser.parse_args(argv[1:])
118 | 
119 |                 # read data in ASReview data object
120 |                 asdata = load_data(args_dedup.input_path)
121 |                 deduplicate_data(
122 |                     asdata=asdata,
123 |                     output_path=args_dedup.output_path,
124 |                     pid=args_dedup.pid,
125 |                     similar=args_dedup.similar,
126 |                     threshold=args_dedup.threshold,
127 |                     title_only=args_dedup.title_only,
128 |                     stopwords_language=args_dedup.stopwords_language,
129 |                     strict=args_dedup.strict,
130 |                     verbose=args_dedup.verbose,
131 |                 )
132 | 
133 |             if argv[0] == "compose":
134 |                 args_compose_parser = _parse_arguments_compose()
135 |                 args_compose = args_compose_parser.parse_args(argv[1:])
136 |                 compose(
137 |                     args_compose.output_path,
138 |                     args_compose.relevant,
139 |                     args_compose.irrelevant,
140 |                     args_compose.labeled,
141 |                     args_compose.unlabeled,
142 |                     pid=args_compose.pid,
143 |                     order=args_compose.hierarchy,
144 |                     resolve=args_compose.conflict_resolve,
145 |                 )
146 |             if argv[0] == "snowball":
147 |                 args_snowballing_parser = _parse_arguments_snowball()
148 |                 args_snowballing = vars(args_snowballing_parser.parse_args(argv[1:]))
149 |                 snowball(**args_snowballing)
150 |             if argv[0] == "sample":
151 |                 args_sample_parser = _parse_arguments_sample()
152 |                 args_sample = vars(args_sample_parser.parse_args(argv[1:]))
153 |                 sample(**args_sample)
154 |             if argv[0] == "vstack":
155 |                 args_vstack_parser = _parse_arguments_vstack()
156 |                 args_vstack = args_vstack_parser.parse_args(argv[1:])
157 |                 vstack(args_vstack.output_path, args_vstack.datasets)
158 | 
159 |         # Print help message if subcommand not given or incorrect
160 |         else:
161 |             parser = argparse.ArgumentParser(
162 |                 prog="asreview data",
163 |                 formatter_class=argparse.RawTextHelpFormatter,
164 |                 description="Tools for data preprocessing for ASReview.",
165 |             )
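166 |             # Fallback parser: it recognises only an optional subcommand name and
167 |             # `-V`/`--version`; anything else results in the help text below.
168 |             # Illustrative invocations handled by the branches above (the file
169 |             # names are hypothetical):
170 |             #   asreview data describe records.csv -o stats.json
171 |             #   asreview data dedup records.csv -o deduplicated.csv --similar
172 |             #   asreview data vstack merged.ris part_1.ris part_2.ris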
173 |             parser.add_argument(
174 |                 "subcommand",
175 |                 nargs="?",
176 |                 default=None,
177 |                 help=f"The datatool to launch. Available commands:\n\n{DATATOOLS}",
178 |             )
179 |             parser.add_argument(
180 |                 "-V",
181 |                 "--version",
182 |                 action="version",
183 |                 default=False,
184 |                 version=f"{self.extension_name}: {self.version}",
185 |             )
186 |             args, _ = parser.parse_known_args()
187 | 
188 |             # output the version
189 |             if args.version:
190 |                 print(__version__)
191 |                 return
192 | 
193 |             parser.print_help()
194 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/sample.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | import pandas as pd
4 | from asreview import ASReviewData
5 | from asreview.data.base import load_data
6 | 
7 | 
8 | def sample(input_path, output_path, nr_records, year_column="publication_year"):
9 |     df_input = load_data(input_path).df
10 | 
11 |     # Check for the presence of the year column
12 |     if year_column not in df_input.columns:
13 |         raise ValueError(f"• The input file should have a {year_column} column.")
14 | 
15 |     # Check that nr_records is not too large
16 |     if nr_records * 3 > len(df_input):
17 |         raise ValueError(
18 |             "• The number of records to sample is too large."
19 |             f" Only {len(df_input)} records are present in the input file."
20 |             f" You are trying to sample {nr_records*3} records."
21 |         )
22 | 
23 |     if nr_records < 1:
24 |         raise ValueError("• The number of records to sample should be at least 1.")
25 | 
26 |     # Keep only records with a publication year
27 |     dated_records = df_input[df_input[year_column].notnull()]
28 | 
29 |     if dated_records.empty:
30 |         raise ValueError(f"• The input file has no {year_column} values.")
31 | 
32 |     if len(dated_records) < nr_records * 2:
33 |         raise ValueError("• Not enough dated records to sample from.")
34 | 
35 |     sorted_records = dated_records.sort_values(year_column, ascending=True)
36 | 
37 |     # Take the nr_records oldest and nr_records newest records
38 |     old_records = sorted_records.head(nr_records)
39 |     new_records = sorted_records.tail(nr_records)
40 | 
41 |     # Sample nr_records records without overlap with the old/new records
42 |     records_to_exclude = pd.concat([old_records, new_records]).index
43 |     remaining_records = df_input[~df_input.index.isin(records_to_exclude)]
44 | 
45 |     sampled_records = remaining_records.sample(nr_records)
46 | 
47 |     # Combine old, new, and sampled records
48 |     df_out = pd.concat([old_records, sampled_records, new_records])
49 | 
50 |     asdata = ASReviewData(df=df_out)
51 |     asdata.to_file(output_path)
52 | 
53 | 
54 | def _parse_arguments_sample():
55 |     parser = argparse.ArgumentParser(prog="asreview data sample")
56 |     parser.add_argument("input_path", type=str, help="The input file path.")
57 |     parser.add_argument("output_path", type=str, help="The output file path.")
58 |     parser.add_argument(
59 |         "nr_records",
60 |         type=int,
61 |         help="The number of records for old, random, and new records each.",
62 |     )
63 |     parser.add_argument(
64 |         "--year_column",
65 |         default="publication_year",
66 |         type=str,
67 |         help="The name of the column containing the publication year.",
68 |     )
69 | 
70 |     return parser
71 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/snowball.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import argparse
4 | from pathlib import Path
5 | 
6 | import pandas as pd
7 | import pyalex
8 | from asreview import ASReviewData
9 | from asreview import load_data
10 | 
11 | # Maximum number of statements joined by a logical OR in a call to OpenAlex.
12 | OPENALEX_MAX_OR_LENGTH = 100
13 | OPENALEX_MAX_PAGE_LENGTH = 200
14 | OPENALEX_PREFIX = "https://openalex.org/"
15 | DOI_PREFIX = "https://doi.org/"
16 | 
17 | # OpenAlex data fields to retrieve.
18 | USED_FIELDS = [
19 |     "id",
20 |     "doi",
21 |     "title",
22 |     "abstract_inverted_index",
23 |     "referenced_works",
24 |     "publication_date",
25 | ]
26 | 
27 | 
28 | def forward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]:
29 |     """Get all works citing a work with the OpenAlex identifier from the list.
30 | 
31 |     Parameters
32 |     ----------
33 |     identifiers : list[str]
34 |         List of OpenAlex identifiers.
35 | 
36 |     Returns
37 |     -------
38 |     dict[str, list[dict]]
39 |         Dictionary of the form
40 |         `{input OpenAlex identifier : list of OpenAlex works}`
41 |         where each work in the list references the work with the input identifier and
42 |         it is a dictionary of the form `{field_name : field_value}`.
43 |     """
44 |     citing_works = {}
45 |     for idx, openalex_id in enumerate(identifiers):
46 |         print(f"{idx}. Getting works citing {openalex_id}")
47 |         pager = (
48 |             pyalex.Works()
49 |             .filter(cites=openalex_id)
50 |             .select(USED_FIELDS)
51 |             .paginate(per_page=OPENALEX_MAX_PAGE_LENGTH, n_max=None)
52 |         )
53 |         citing_works[openalex_id] = []
54 |         for page in pager:
55 |             citing_works[openalex_id] += [
56 |                 {
57 |                     key: work[key]
58 |                     for key in [
59 |                         col if col != "abstract_inverted_index" else "abstract"
60 |                         for col in USED_FIELDS
61 |                     ]
62 |                 }
63 |                 for work in page
64 |             ]
65 |     return citing_works
66 | 
67 | 
68 | def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]:
69 |     """Get all works cited by a work with the OpenAlex identifier from the list.
70 | 
71 |     Parameters
72 |     ----------
73 |     identifiers : list[str]
74 |         List of OpenAlex identifiers.
75 | 
76 |     Returns
77 |     -------
78 |     dict[str, list[dict]]
79 |         Dictionary of the form
80 |         `{input OpenAlex identifier : list of OpenAlex works}`
81 |         where each work in the list is referenced by the work with the input identifier
82 |         and it is a dictionary of the form `{field_name : field_value}`.
83 |     """
84 |     # Get the referenced works.
85 |     referenced_works = {}
86 |     page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH)
87 | 
88 |     for i in range(0, len(identifiers), page_length):
89 |         print(f"Getting works cited by records {i}-{i+page_length}")
90 |         # We need to remove the prefix here because otherwise the URL is too long.
91 |         fltr = "|".join(
92 |             identifier.removeprefix(OPENALEX_PREFIX)
93 |             for identifier in identifiers[i : i + page_length]
94 |         )
95 |         for work in (
96 |             pyalex.Works()
97 |             .filter(openalex=fltr)
98 |             .select("id,referenced_works")
99 |             .get(per_page=page_length)
100 |         ):
101 |             referenced_works[work["id"]] = work["referenced_works"]
102 | 
103 |     # Get the fields for the referenced works.
104 |     all_identifiers = []
105 |     for reference_list in referenced_works.values():
106 |         all_identifiers += reference_list
107 |     all_identifiers = list(set(all_identifiers))
108 |     print(f"Found {len(all_identifiers)} records")
109 | 
110 |     all_referenced_works = {}
111 |     for i in range(0, len(all_identifiers), page_length):
112 |         # We need to remove the prefix here because otherwise the URL is too long.
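113 |         # The resulting filter is an OR query over bare identifiers, e.g.
114 |         # "W2051970045|W104454400|..." (at most `page_length` IDs per request).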
115 |         fltr = "|".join(
116 |             identifier.removeprefix(OPENALEX_PREFIX)
117 |             for identifier in all_identifiers[i : i + page_length]
118 |         )
119 |         for work in (
120 |             pyalex.Works()
121 |             .filter(openalex=fltr)
122 |             .select(USED_FIELDS)
123 |             .get(per_page=page_length)
124 |         ):
125 |             all_referenced_works[work["id"]] = {
126 |                 key: work[key]
127 |                 for key in [
128 |                     col if col != "abstract_inverted_index" else "abstract"
129 |                     for col in USED_FIELDS
130 |                 ]
131 |             }
132 | 
133 |     # Connect the referenced works back to the input works.
134 |     output = {}
135 |     for identifier, ref_id_list in referenced_works.items():
136 |         # We need the last check if 'ref_id' is in 'all_referenced_works': If a work
137 |         # references an ID that redirects to another ID, it won't be present here.
138 |         # Example: https://openalex.org/W2015370450 has in the references the identifier
139 |         # https://openalex.org/W2008744335, but this redirects to
140 |         # https://openalex.org/W4233569835
141 |         output[identifier] = [
142 |             all_referenced_works[ref_id]
143 |             for ref_id in ref_id_list
144 |             if ref_id in all_referenced_works
145 |         ]
146 |     return output
147 | 
148 | 
149 | def openalex_from_doi(dois: list[str]) -> dict[str, str]:
150 |     """Get the OpenAlex identifiers corresponding to a list of DOIs.
151 | 
152 |     Parameters
153 |     ----------
154 |     dois : list[str]
155 |         List of DOIs.
156 | 
157 |     Returns
158 |     -------
159 |     dict[str, str]
160 |         Dictionary {doi: openalex_id}. If there was no OpenAlex identifier found for a
161 |         DOI, the corresponding value will be None.
162 |     """
163 |     page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH)
164 |     id_mapping = {doi.removeprefix(DOI_PREFIX): None for doi in dois}
165 |     for i in range(0, len(dois), page_length):
166 |         fltr = "|".join(dois[i : i + page_length])
167 |         for work in (
168 |             pyalex.Works()
169 |             .filter(doi=fltr)
170 |             .select(["id", "doi"])
171 |             .get(per_page=page_length)
172 |         ):
173 |             id_mapping[work["doi"].removeprefix(DOI_PREFIX)] = work["id"]
174 |     return id_mapping
175 | 
176 | 
177 | def snowball(
178 |     input_path: Path,
179 |     output_path: Path,
180 |     forward: bool,
181 |     backward: bool,
182 |     use_all: bool = False,
183 |     email: str = None,
184 | ) -> None:
185 |     """Perform snowballing on an ASReview dataset.
186 | 
187 |     Parameters
188 |     ----------
189 |     input_path : Path
190 |         Location of the input ASReview dataset.
191 |     output_path : Path
192 |         Location where to save the output dataset.
193 |     forward : bool
194 |         Perform forward snowballing. At least one of `forward` or `backward` should be
195 |         True.
196 |     backward : bool
197 |         Perform backward snowballing. At least one of `forward` or `backward` should be
198 |         True.
199 |     use_all : bool, optional
200 |         Perform snowballing on all records in the dataset or only the included
201 |         records, by default False
202 |     email : str, optional
203 |         Email address to send along with requests to OpenAlex, by default None
204 | 
205 |     Raises
206 |     ------
207 |     ValueError
208 |         If `forward` and `backward` are both False.
209 |     ValueError
210 |         If the dataset contains no column named `openalex_id` and no column named `doi`.
211 |     """
212 |     if not (forward or backward):
213 |         raise ValueError("At least one of 'forward' or 'backward' should be True.")
214 | 
215 |     data = load_data(input_path)
216 |     if use_all or (data.included is None):
217 |         data = data.df
218 |     else:
219 |         data = data.df.loc[data.included.astype(bool)]
220 | 
221 |     # Add OpenAlex identifiers if not available.
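222 |     # Note: records whose DOI has no OpenAlex match keep `openalex_id = None` and
223 |     # are skipped below, since only non-null identifiers are snowballed.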
224 |     if "openalex_id" not in data.columns:
225 |         if "doi" not in data.columns:
226 |             raise ValueError(
227 |                 "Dataset should contain a column 'openalex_id' containing OpenAlex"
228 |                 " identifiers or a column 'doi' containing DOIs."
229 |             )
230 |         id_mapping = openalex_from_doi(data.doi.dropna().to_list())
231 |         n_openalex_ids = len(
232 |             [
233 |                 openalex_id
234 |                 for openalex_id in id_mapping.values()
235 |                 if openalex_id is not None
236 |             ]
237 |         )
238 |         print(
239 |             f"Found OpenAlex identifiers for {n_openalex_ids} out of {len(data)}"
240 |             " records. Performing snowballing for those records."
241 |         )
242 |         data["openalex_id"] = None
243 |         data.loc[data.doi.notna(), "openalex_id"] = (
244 |             data.loc[data.doi.notna(), "doi"]
245 |             .str.removeprefix(DOI_PREFIX)
246 |             .apply(lambda doi: id_mapping[doi])
247 |         )
248 | 
249 |     identifiers = data["openalex_id"].dropna().to_list()
250 | 
251 |     if email is not None:
252 |         pyalex.config.email = email
253 | 
254 |     if forward:
255 |         print("Starting forward snowballing")
256 |         forward_data = forward_snowballing(identifiers)
257 |     else:
258 |         forward_data = {}
259 |     if backward:
260 |         print("Starting backward snowballing")
261 |         backward_data = backward_snowballing(identifiers)
262 |     else:
263 |         backward_data = {}
264 | 
265 |     all_works = []
266 |     for works_list in forward_data.values():
267 |         all_works += works_list
268 |     for works_list in backward_data.values():
269 |         all_works += works_list
270 | 
271 |     output_data = pd.DataFrame(all_works)
272 |     output_data.drop_duplicates(subset=["id"], inplace=True)
273 |     output_data.rename({"id": "openalex_id"}, axis=1, inplace=True)
274 |     output_data = ASReviewData(output_data)
275 |     output_data.to_file(output_path)
276 |     print("Saved dataset")
277 | 
278 | 
279 | def _parse_arguments_snowball():
280 |     parser = argparse.ArgumentParser(prog="asreview data snowball")
281 |     parser.add_argument(
282 |         "input_path", type=str, help="The file path of the input dataset."
283 |     )
284 |     parser.add_argument(
285 |         "output_path", type=str, help="The file path of the output dataset."
286 |     )
287 |     parser.add_argument(
288 |         "--forward", "-f", action="store_true", help="Do forward snowballing."
289 |     )
290 |     parser.add_argument(
291 |         "--backward", "-b", action="store_true", help="Do backward snowballing."
292 |     )
293 |     parser.add_argument(
294 |         "--all",
295 |         "-a",
296 |         action="store_true",
297 |         dest="use_all",
298 |         help=(
299 |             "Do snowballing on all records in the dataset, not just the included ones."
300 |         ),
301 |     )
302 |     parser.add_argument(
303 |         "--email",
304 |         "-e",
305 |         type=str,
306 |         required=False,
307 |         help=(
308 |             "Email address to send along with requests to OpenAlex. This will make"
309 |             " requests faster. See also "
310 |             "https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication#the-polite-pool"
311 |         ),
312 |     )
313 |     return parser
314 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/stack.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from pathlib import Path
3 | 
4 | import pandas as pd
5 | from asreview import ASReviewData
6 | from asreview.data.base import load_data
7 | 
8 | 
9 | def _check_suffix(input_files, output_file):
10 |     # Also raises ValueError on URLs that do not end with a file extension
11 |     suffixes = [Path(item).suffix for item in input_files if item is not None]
12 |     suffixes.append(Path(output_file).suffix)
13 | 
14 |     set_ris = {".txt", ".ris"}
15 |     set_tabular = {".csv", ".tab", ".tsv", ".xlsx"}
16 |     set_suffixes = set(suffixes)
17 | 
18 |     if len(set_suffixes) > 1:
19 |         if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
20 |             raise ValueError(
21 |                 "• Several file types were given; all input files, as well as the"
22 |                 " output file, should be of the same type."
23 |             )
24 | 
25 | 
26 | def vstack(output_file, input_files):
27 |     _check_suffix(input_files, output_file)
28 | 
29 |     list_dfs = [load_data(item).df for item in input_files]
30 |     df_vstacked = pd.concat(list_dfs).reset_index(drop=True)
31 |     as_vstacked = ASReviewData(df=df_vstacked)
32 | 
33 |     as_vstacked.to_file(output_file)
34 | 
35 | 
36 | def _parse_arguments_vstack():
37 |     parser = argparse.ArgumentParser(prog="asreview data vstack")
38 |     parser.add_argument("output_path", type=str, help="The output file path.")
39 |     parser.add_argument(
40 |         "datasets",
41 |         type=str,
42 |         nargs="+",
43 |         help="Any number of datasets to stack vertically.",
44 |     )
45 | 
46 |     return parser
47 | 
--------------------------------------------------------------------------------
/dedup_similar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/asreview/asreview-datatools/32c5e3b5e65042716bd70bad17f0ff4da84f908b/dedup_similar.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "asreview-datatools"
3 | description = "Powerful command line tools for data handling in ASReview"
4 | authors = [
5 |     { name = "ASReview LAB developers", email = "asreview@uu.nl" }
6 | ]
7 | readme = "README.md"
8 | classifiers = [
9 |     "Development Status :: 5 - Production/Stable",
10 |     "License :: OSI Approved :: MIT License",
11 |     "Programming Language :: Python :: 3.8",
12 |     "Programming Language :: Python :: 3.9",
13 |     "Programming Language :: Python :: 3.10",
14 |     "Programming Language :: Python :: 3.11"
15 | ]
16 | license = {text = "MIT License"}
17 | dependencies = ["asreview>=1.1,<2", "ftfy", "nltk", "pandas", "pyalex", "rich", "tqdm"]
18 | dynamic = ["version"]
19 | requires-python = ">=3.8"
20 | 
21 | [project.urls]
22 | homepage = "https://asreview.ai"
23 | repository = "https://github.com/asreview/asreview-datatools"
24 | issues = "https://github.com/asreview/asreview-datatools/issues"
25 | 
26 | [project.entry-points."asreview.entry_points"]
27 | data = "asreviewcontrib.datatools.entrypoint:DataEntryPoint"
28 | 
29 | [project.optional-dependencies]
30 | lint = ["ruff"]
31 | test = ["pytest"]
32 | 
33 | [build-system]
34 | build-backend = 'setuptools.build_meta'
35 |
requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] 36 | 37 | [tool.setuptools] 38 | packages = ["asreviewcontrib"] 39 | 40 | [tool.setuptools_scm] 41 | write_to = "asreviewcontrib/datatools/_version.py" 42 | 43 | [tool.ruff.lint] 44 | select = ["E", "F", "UP", "I", "B"] 45 | 46 | [tool.ruff.lint.isort] 47 | force-single-line = true -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asreview/asreview-datatools/32c5e3b5e65042716bd70bad17f0ff4da84f908b/tests/__init__.py -------------------------------------------------------------------------------- /tests/demo_data/dataset_1.ris: -------------------------------------------------------------------------------- 1 | TY - JOUR 2 | TI - Maintenance treatment with capecitabine and bevacizumab versus observation after induction treatment with chemotherapy and bevacizumab in metastatic colorectal cancer (mCRC): The phase III CAIRO3 study of the Dutch Colorectal Cancer Group (DCCG). 3 | AU - Koopman, Miriam 4 | AU - Simkens, Lieke HJ 5 | AU - Ten Tije, Albert J. 6 | AU - Creemers, Geert-Jan 7 | AU - Loosveld, Olaf JL 8 | AU - de Jongh, Felix E. 9 | AU - Erdkamp, Frans 10 | AU - Erjavec, Zoran 11 | AU - van der Torren, Adelheid ME 12 | AU - Van der Hoeven, Jacobus JM 13 | AU - Nieboer, Peter 14 | AU - Braun, J. J. 15 | AU - Jansen, Rob L. 16 | AU - Haasjes, Janny G. 17 | AU - Cats, Annemieke 18 | AU - Wals, Jacob J. 19 | AU - Mol, Linda 20 | AU - Dalesio, Otilia 21 | AU - van Tinteren, Harm 22 | AU - Punt, Cornelis J. A. 23 | T2 - Journal of Clinical Oncology 24 | AB - 3502 25 | 26 | Background: The optimal duration of chemotherapy and bevacizumab in mCRC is not well established. The CAIRO3 study investigated the efficacy of maintenance treatment with capecitabine plus bevacizumab versus observation in mCRC pts not progressing during induction treatment with capecitabine, oxaliplatin and bevacizumab (CAPOX-B). Methods: Previously untreated mCRC pts, PS 0-1, with stable disease or better after 6 cycles of CAPOX-B, not eligible for metastasectomy and eligible for future treatment with oxaliplatin, were randomized between observation (arm A) or maintenance treatment with capecitabine 625 mg/m2 bid dailycontinuouslyand bevacizumab 7.5 mg/kg iv q 3 weeks (arm B). Upon first progression (PFS1), pts in both arms were treated with CAPOX-B until second progression (PFS2, primary endpoint). For pts not able to receive CAPOX-B upon PFS1, PFS2 was considered equal to PFS1. Secondary endpoints were overall survival (OS) and time to second progression (TTP2), which was defined as the time to progression or death on any treatment following PFS1. All endpoints were calculated from the time of randomization. Results: A total of 558 pts were randomized. Median follow-up is 33 months. The median number of maintenance cycles in arm B was 9 (range 1-54). The median PFS1 in arm A vs B was 4.1 vs 7.4 months (HR 0.44, 95% CI 0.37-0.54, p<0.0001). Upon PFS1, 72% of pts received CAPOX-B in arm A and 44% in arm B. The median PFS2 was 10.4 vs 10.4 months (HR 0.86, 95% CI 0.7-1.04, p=0.12). The median TTP2 in arm A vs B was 11.5 vs 15.4 months (HR 0.58, 95% CI 0.48-0.72, p<0.0001), and the median OS was 17.9 vs 21.7 months (HR 0.77, 95% CI 0.62-0.96, p=0.02), respectively. 
Conclusions: Maintenance treatment with capecitabine plus bevacizumab after 6 cycles CAPOX-B did not significantly prolong PFS2, which may be due to the lower number of pts in arm B that received CAPOX-B following PFS1. Maintenance treatment significantly prolonged PFS1, TTP2 and OS. Our data support the use of bevacizumab plus capecitabine until progression or unacceptable toxicity. Updated results will be presented. Clinical trial information: NCT00442637. 27 | DA - 2013/05/20/ 28 | PY - 2013 29 | DO - 10.1200/jco.2013.31.15_suppl.3502 30 | DP - ascopubs-org.proxy.library.uu.nl (Atypon) 31 | VL - 31 32 | IS - 15_suppl 33 | SP - 3502 34 | EP - 3502 35 | J2 - JCO 36 | SN - 0732-183X 37 | ST - Maintenance treatment with capecitabine and bevacizumab versus observation after induction treatment with chemotherapy and bevacizumab in metastatic colorectal cancer (mCRC) 38 | UR - https://ascopubs-org.proxy.library.uu.nl/doi/abs/10.1200/jco.2013.31.15_suppl.3502 39 | Y2 - 2022/09/20/ 40 | ER - 41 | 42 | TY - ELEC 43 | TI - Full article: Public transport planning adaption under the COVID-19 pandemic crisis: literature review of research needs and directions 44 | AB - Lets think of somethings 45 | UR - https://www-tandfonline-com.proxy.library.uu.nl/doi/full/10.1080/01441647.2020.1857886 46 | Y2 - 2022/09/20/ 47 | L2 - https://www-tandfonline-com.proxy.library.uu.nl/doi/full/10.1080/01441647.2020.1857886 48 | N1 -

ASReview_relevant

49 |
50 | ER - 51 | 52 | TY - JOUR 53 | TI - An Overview of the Incidences and Costs of Low Back Pain 54 | AU - Frymoyer, John W. 55 | AU - Cats-Baril, William L. 56 | T2 - Orthopedic Clinics of North America 57 | AB - The basic premise of this article is that low back disorders are extremely prevalent in all societies, and probably have not increased substantially over the past two decades. What has increased is the rate of disability, the reasons for which are uncertain. Not only has this phenomenon heightened the awareness of low back pain, but it has led to an explosion in costs. Although a precise estimate is impossible, it is plausible that the direct medical and indirect costs of these conditions are in the range of more than $50 billion per annum, and could be as high as $100 billion at the extreme. Of these costs, 75% or more can be attributed to the 5% of people who become disabled temporarily or permanently from back pain—a phenomenon that seems more rooted in psychosocial rather than disease determinants. Within this overall equation, spinal surgery plays a relatively small role, although the contribution to disability probably has more than passing significance. The future challenge, if costs are to be controlled, appears to lie squarely with prevention and optimum management of disability, rather than perpetrating a myth that low back pain is a serious health disorder. 58 | DA - 1991/04/01/ 59 | PY - 1991 60 | DO - 10.1016/S0030-5898(20)31652-7 61 | DP - ScienceDirect 62 | VL - 22 63 | IS - 2 64 | SP - 263 65 | EP - 271 66 | J2 - Orthopedic Clinics of North America 67 | LA - en 68 | SN - 0030-5898 69 | UR - https://www.sciencedirect.com/science/article/pii/S0030589820316527 70 | Y2 - 2022/09/20/ 71 | L2 - http://www.sciencedirect.com/science/article/abs/pii/S0030589820316527 72 | N1 -

ASReview_irrelevant

73 | ER - 74 | 75 | TY - JOUR 76 | TI - Hereditary diffuse gastric cancer: updated clinical guidelines with an emphasis on germline CDH1 mutation carriers 77 | AU - Post, Rachel S. van der 78 | AU - Vogelaar, Ingrid P. 79 | AU - Carneiro, Fátima 80 | AU - Guilford, Parry 81 | AU - Huntsman, David 82 | AU - Hoogerbrugge, Nicoline 83 | AU - Caldas, Carlos 84 | AU - Schreiber, Karen E. Chelcun 85 | AU - Hardwick, Richard H. 86 | AU - Ausems, Margreet G. E. M. 87 | AU - Bardram, Linda 88 | AU - Benusiglio, Patrick R. 89 | AU - Bisseling, Tanya M. 90 | AU - Blair, Vanessa 91 | AU - Bleiker, Eveline 92 | AU - Boussioutas, Alex 93 | AU - Cats, Annemieke 94 | AU - Coit, Daniel 95 | AU - DeGregorio, Lynn 96 | AU - Figueiredo, Joana 97 | AU - Ford, James M. 98 | AU - Heijkoop, Esther 99 | AU - Hermens, Rosella 100 | AU - Humar, Bostjan 101 | AU - Kaurah, Pardeep 102 | AU - Keller, Gisella 103 | AU - Lai, Jennifer 104 | AU - Ligtenberg, Marjolijn J. L. 105 | AU - O'Donovan, Maria 106 | AU - Oliveira, Carla 107 | AU - Pinheiro, Hugo 108 | AU - Ragunath, Krish 109 | AU - Rasenberg, Esther 110 | AU - Richardson, Susan 111 | AU - Roviello, Franco 112 | AU - Schackert, Hans 113 | AU - Seruca, Raquel 114 | AU - Taylor, Amy 115 | AU - Huurne, Anouk ter 116 | AU - Tischkowitz, Marc 117 | AU - Joe, Sheena Tjon A. 118 | AU - Dijck, Benjamin van 119 | AU - Grieken, Nicole C. T. van 120 | AU - Hillegersberg, Richard van 121 | AU - Sandick, Johanna W. van 122 | AU - Vehof, Rianne 123 | AU - Krieken, J. Han van 124 | AU - Fitzgerald, Rebecca C. 125 | T2 - Journal of Medical Genetics 126 | AB - Germline CDH1 mutations confer a high lifetime risk of developing diffuse gastric (DGC) and lobular breast cancer (LBC). A multidisciplinary workshop was organised to discuss genetic testing, surgery, surveillance strategies, pathology reporting and the patient's perspective on multiple aspects, including diet post gastrectomy. The updated guidelines include revised CDH1 testing criteria (taking into account first-degree and second-degree relatives): (1) families with two or more patients with gastric cancer at any age, one confirmed DGC; (2) individuals with DGC before the age of 40 and (3) families with diagnoses of both DGC and LBC (one diagnosis before the age of 50). Additionally, CDH1 testing could be considered in patients with bilateral or familial LBC before the age of 50, patients with DGC and cleft lip/palate, and those with precursor lesions for signet ring cell carcinoma. Given the high mortality associated with invasive disease, prophylactic total gastrectomy at a centre of expertise is advised for individuals with pathogenic CDH1 mutations. Breast cancer surveillance with annual breast MRI starting at age 30 for women with a CDH1 mutation is recommended. Standardised endoscopic surveillance in experienced centres is recommended for those opting not to have gastrectomy at the current time, those with CDH1 variants of uncertain significance and those that fulfil hereditary DGC criteria without germline CDH1 mutations. Expert histopathological confirmation of (early) signet ring cell carcinoma is recommended. The impact of gastrectomy and mastectomy should not be underestimated; these can have severe consequences on a psychological, physiological and metabolic level. Nutritional problems should be carefully monitored. 
127 | DA - 2015/06/01/ 128 | PY - 2015 129 | DO - 10.1136/jmedgenet-2015-103094 130 | DP - jmg.bmj.com 131 | VL - 52 132 | IS - 6 133 | SP - 361 134 | EP - 374 135 | LA - en 136 | SN - 0022-2593, 1468-6244 137 | ST - Hereditary diffuse gastric cancer 138 | UR - https://jmg.bmj.com/content/52/6/361 139 | Y2 - 2022/09/20/ 140 | L2 - http://www.ncbi.nlm.nih.gov/pubmed/25979631 141 | L2 - https://jmg.bmj.com/content/52/6/361.short 142 | L4 - https://jmg.bmj.com/content/jmedgenet/52/6/361.full.pdf 143 | KW - Cancer: breast 144 | KW - Cancer: gastric 145 | KW - Clinical genetics 146 | KW - Diagnostics 147 | KW - Stomach and duodenum 148 | ER - 149 | 150 | TY - BOOK 151 | TI - Zinne- en minnebeelden 152 | AU - Cats, Jacob 153 | DA - 1729/// 154 | PY - 1729 155 | DP - Google Books 156 | SP - 674 157 | LA - nl 158 | PB - by Evert Visscher 159 | L2 - https://books.google.nl/books?id=x4FYAAAAcAAJ 160 | N1 -

ASReview_irrelevant

161 | ER - 162 | 163 | TY - JOUR 164 | TI - Epidemiology of osteoarthritis: Zoetermeer survey. Comparison of radiological osteoarthritis in a Dutch population with that in 10 other populations. 165 | AU - Saase, J. L. van 166 | AU - Romunde, L. K. van 167 | AU - Cats, A. 168 | AU - Vandenbroucke, J. P. 169 | AU - Valkenburg, H. A. 170 | T2 - Annals of the Rheumatic Diseases 171 | AB - The prevalence of mild and severe radiological osteoarthritis was investigated in a random sample of 6585 inhabitants of a Dutch village. Radiographs were graded 0-4 according to the criteria described by Kellgren and Lawrence. The prevalence of radiological osteoarthritis increased strongly with age and was highest for cervical spine (peak: men 84.8%, women 84.3%), lumbar spine (peak: 71.9%, women 67.3%), and distal interphalangeal joints of the hands (peak: men 64.4%, women 76%). Prevalence did not exceed 10% in sacroiliac joints, lateral carpometacarpal joints, and tarsometatarsal joints. Severe radiological osteoarthritis (grade 3 or grade 4) was uncommon under age 45; in elderly persons the prevalence of severe radiological osteoarthritis did not exceed 20% except for the cervical and lumbar spine, distal interphalangeal joints of the hands and, in women only, metacarpophalangeal joints, first carpometacarpal joints, first metatarsophalangeal joints, and knees. Overall, differences between men and women were small except for hips and knees; however, severe radiological osteoarthritis was found in a higher proportion in most of the joints in women. Our data were compared with data from similar population surveys. The slope between joint involvement and age was strikingly constant for most of the joints. Differences between populations were mainly differences in level. These differences of prevalence of radiological osteoarthritis may be attributed to interobserver differences--that is, different criteria used to establish radiological osteoarthritis, in addition to genetic or environmental factors, or both. 172 | DA - 1989/04/01/ 173 | PY - 1989 174 | DO - 10.1136/ard.48.4.271 175 | DP - ard.bmj.com 176 | VL - 48 177 | IS - 4 178 | SP - 271 179 | EP - 280 180 | LA - en 181 | SN - 0003-4967, 1468-2060 182 | ST - Epidemiology of osteoarthritis 183 | UR - https://ard.bmj.com/content/48/4/271 184 | Y2 - 2022/09/20/ 185 | L2 - http://www.ncbi.nlm.nih.gov/pubmed/2712610 186 | L2 - https://ard.bmj.com/content/48/4/271.short 187 | L4 - https://ard.bmj.com/content/annrheumdis/48/4/271.full.pdf 188 | N1 -

ASReview_irrelevant

189 | ER - 190 | 191 | -------------------------------------------------------------------------------- /tests/demo_data/dataset_2.ris: -------------------------------------------------------------------------------- 1 | TY - JOUR 2 | TI - Maintenance treatment with capecitabine and bevacizumab versus observation after induction treatment with chemotherapy and bevacizumab in metastatic colorectal cancer (mCRC): The phase III CAIRO3 study of the Dutch Colorectal Cancer Group (DCCG). 3 | AU - Koopman, Miriam 4 | AU - Simkens, Lieke HJ 5 | AU - Ten Tije, Albert J. 6 | AU - Creemers, Geert-Jan 7 | AU - Loosveld, Olaf JL 8 | AU - de Jongh, Felix E. 9 | AU - Erdkamp, Frans 10 | AU - Erjavec, Zoran 11 | AU - van der Torren, Adelheid ME 12 | AU - Van der Hoeven, Jacobus JM 13 | AU - Nieboer, Peter 14 | AU - Braun, J. J. 15 | AU - Jansen, Rob L. 16 | AU - Haasjes, Janny G. 17 | AU - Cats, Annemieke 18 | AU - Wals, Jacob J. 19 | AU - Mol, Linda 20 | AU - Dalesio, Otilia 21 | AU - van Tinteren, Harm 22 | AU - Punt, Cornelis J. A. 23 | T2 - Journal of Clinical Oncology 24 | AB - 3502 25 | 26 | Background: The optimal duration of chemotherapy and bevacizumab in mCRC is not well established. The CAIRO3 study investigated the efficacy of maintenance treatment with capecitabine plus bevacizumab versus observation in mCRC pts not progressing during induction treatment with capecitabine, oxaliplatin and bevacizumab (CAPOX-B). Methods: Previously untreated mCRC pts, PS 0-1, with stable disease or better after 6 cycles of CAPOX-B, not eligible for metastasectomy and eligible for future treatment with oxaliplatin, were randomized between observation (arm A) or maintenance treatment with capecitabine 625 mg/m2 bid dailycontinuouslyand bevacizumab 7.5 mg/kg iv q 3 weeks (arm B). Upon first progression (PFS1), pts in both arms were treated with CAPOX-B until second progression (PFS2, primary endpoint). For pts not able to receive CAPOX-B upon PFS1, PFS2 was considered equal to PFS1. Secondary endpoints were overall survival (OS) and time to second progression (TTP2), which was defined as the time to progression or death on any treatment following PFS1. All endpoints were calculated from the time of randomization. Results: A total of 558 pts were randomized. Median follow-up is 33 months. The median number of maintenance cycles in arm B was 9 (range 1-54). The median PFS1 in arm A vs B was 4.1 vs 7.4 months (HR 0.44, 95% CI 0.37-0.54, p<0.0001). Upon PFS1, 72% of pts received CAPOX-B in arm A and 44% in arm B. The median PFS2 was 10.4 vs 10.4 months (HR 0.86, 95% CI 0.7-1.04, p=0.12). The median TTP2 in arm A vs B was 11.5 vs 15.4 months (HR 0.58, 95% CI 0.48-0.72, p<0.0001), and the median OS was 17.9 vs 21.7 months (HR 0.77, 95% CI 0.62-0.96, p=0.02), respectively. Conclusions: Maintenance treatment with capecitabine plus bevacizumab after 6 cycles CAPOX-B did not significantly prolong PFS2, which may be due to the lower number of pts in arm B that received CAPOX-B following PFS1. Maintenance treatment significantly prolonged PFS1, TTP2 and OS. Our data support the use of bevacizumab plus capecitabine until progression or unacceptable toxicity. Updated results will be presented. Clinical trial information: NCT00442637. 
27 | DA - 2013/05/20/ 28 | PY - 2013 29 | DO - 10.1200/jco.2013.31.15_suppl.3502 30 | DP - ascopubs-org.proxy.library.uu.nl (Atypon) 31 | VL - 31 32 | IS - 15_suppl 33 | SP - 3502 34 | EP - 3502 35 | J2 - JCO 36 | SN - 0732-183X 37 | ST - Maintenance treatment with capecitabine and bevacizumab versus observation after induction treatment with chemotherapy and bevacizumab in metastatic colorectal cancer (mCRC) 38 | UR - https://ascopubs-org.proxy.library.uu.nl/doi/abs/10.1200/jco.2013.31.15_suppl.3502 39 | Y2 - 2022/09/20/ 40 | ER - 41 | 42 | TY - ELEC 43 | TI - Full article: Public transport planning adaption under the COVID-19 pandemic crisis: literature review of research needs and directions 44 | AB - Lets think of somethings 45 | UR - https://www-tandfonline-com.proxy.library.uu.nl/doi/full/10.1080/01441647.2020.1857886 46 | Y2 - 2022/09/20/ 47 | L2 - https://www-tandfonline-com.proxy.library.uu.nl/doi/full/10.1080/01441647.2020.1857886 48 | N1 -

ASReview_relevant

49 |
50 | ER - 51 | 52 | TY - JOUR 53 | TI - Hereditary diffuse gastric cancer: updated clinical guidelines with an emphasis on germline CDH1 mutation carriers 54 | AU - Post, Rachel S. van der 55 | AU - Vogelaar, Ingrid P. 56 | AU - Carneiro, Fátima 57 | AU - Guilford, Parry 58 | AU - Huntsman, David 59 | AU - Hoogerbrugge, Nicoline 60 | AU - Caldas, Carlos 61 | AU - Schreiber, Karen E. Chelcun 62 | AU - Hardwick, Richard H. 63 | AU - Ausems, Margreet G. E. M. 64 | AU - Bardram, Linda 65 | AU - Benusiglio, Patrick R. 66 | AU - Bisseling, Tanya M. 67 | AU - Blair, Vanessa 68 | AU - Bleiker, Eveline 69 | AU - Boussioutas, Alex 70 | AU - Cats, Annemieke 71 | AU - Coit, Daniel 72 | AU - DeGregorio, Lynn 73 | AU - Figueiredo, Joana 74 | AU - Ford, James M. 75 | AU - Heijkoop, Esther 76 | AU - Hermens, Rosella 77 | AU - Humar, Bostjan 78 | AU - Kaurah, Pardeep 79 | AU - Keller, Gisella 80 | AU - Lai, Jennifer 81 | AU - Ligtenberg, Marjolijn J. L. 82 | AU - O'Donovan, Maria 83 | AU - Oliveira, Carla 84 | AU - Pinheiro, Hugo 85 | AU - Ragunath, Krish 86 | AU - Rasenberg, Esther 87 | AU - Richardson, Susan 88 | AU - Roviello, Franco 89 | AU - Schackert, Hans 90 | AU - Seruca, Raquel 91 | AU - Taylor, Amy 92 | AU - Huurne, Anouk ter 93 | AU - Tischkowitz, Marc 94 | AU - Joe, Sheena Tjon A. 95 | AU - Dijck, Benjamin van 96 | AU - Grieken, Nicole C. T. van 97 | AU - Hillegersberg, Richard van 98 | AU - Sandick, Johanna W. van 99 | AU - Vehof, Rianne 100 | AU - Krieken, J. Han van 101 | AU - Fitzgerald, Rebecca C. 102 | T2 - Journal of Medical Genetics 103 | AB - Germline CDH1 mutations confer a high lifetime risk of developing diffuse gastric (DGC) and lobular breast cancer (LBC). A multidisciplinary workshop was organised to discuss genetic testing, surgery, surveillance strategies, pathology reporting and the patient's perspective on multiple aspects, including diet post gastrectomy. The updated guidelines include revised CDH1 testing criteria (taking into account first-degree and second-degree relatives): (1) families with two or more patients with gastric cancer at any age, one confirmed DGC; (2) individuals with DGC before the age of 40 and (3) families with diagnoses of both DGC and LBC (one diagnosis before the age of 50). Additionally, CDH1 testing could be considered in patients with bilateral or familial LBC before the age of 50, patients with DGC and cleft lip/palate, and those with precursor lesions for signet ring cell carcinoma. Given the high mortality associated with invasive disease, prophylactic total gastrectomy at a centre of expertise is advised for individuals with pathogenic CDH1 mutations. Breast cancer surveillance with annual breast MRI starting at age 30 for women with a CDH1 mutation is recommended. Standardised endoscopic surveillance in experienced centres is recommended for those opting not to have gastrectomy at the current time, those with CDH1 variants of uncertain significance and those that fulfil hereditary DGC criteria without germline CDH1 mutations. Expert histopathological confirmation of (early) signet ring cell carcinoma is recommended. The impact of gastrectomy and mastectomy should not be underestimated; these can have severe consequences on a psychological, physiological and metabolic level. Nutritional problems should be carefully monitored. 
104 | DA - 2015/06/01/ 105 | PY - 2015 106 | DO - 10.1136/jmedgenet-2015-103094 107 | DP - jmg.bmj.com 108 | VL - 52 109 | IS - 6 110 | SP - 361 111 | EP - 374 112 | LA - en 113 | SN - 0022-2593, 1468-6244 114 | ST - Hereditary diffuse gastric cancer 115 | UR - https://jmg.bmj.com/content/52/6/361 116 | Y2 - 2022/09/20/ 117 | L2 - http://www.ncbi.nlm.nih.gov/pubmed/25979631 118 | L2 - https://jmg.bmj.com/content/52/6/361.short 119 | L4 - https://jmg.bmj.com/content/jmedgenet/52/6/361.full.pdf 120 | KW - Cancer: breast 121 | KW - Cancer: gastric 122 | KW - Clinical genetics 123 | KW - Diagnostics 124 | KW - Stomach and duodenum 125 | ER - 126 | 127 | TY - JOUR 128 | TI - An open source machine learning framework for efficient and transparent systematic reviews 129 | AU - van de Schoot, Rens 130 | AU - de Bruin, Jonathan 131 | AU - Schram, Raoul 132 | AU - Zahedi, Parisa 133 | AU - de Boer, Jan 134 | AU - Weijdema, Felix 135 | AU - Kramer, Bianca 136 | AU - Huijts, Martijn 137 | AU - Hoogerwerf, Maarten 138 | AU - Ferdinands, Gerbrich 139 | AU - Harkema, Albert 140 | AU - Willemsen, Joukje 141 | AU - Ma, Yongchao 142 | AU - Fang, Qixiang 143 | AU - Hindriks, Sybren 144 | AU - Tummers, Lars 145 | AU - Oberski, Daniel L. 146 | T2 - Nature Machine Intelligence 147 | AB - To help researchers conduct a systematic review or meta-analysis as efficiently and transparently as possible, we designed a tool to accelerate the step of screening titles and abstracts. For many tasks—including but not limited to systematic reviews and meta-analyses—the scientific literature needs to be checked systematically. Scholars and practitioners currently screen thousands of studies by hand to determine which studies to include in their review or meta-analysis. This is error prone and inefficient because of extremely imbalanced data: only a fraction of the screened studies is relevant. The future of systematic reviewing will be an interaction with machine learning algorithms to deal with the enormous increase of available text. We therefore developed an open source machine learning-aided pipeline applying active learning: ASReview. We demonstrate by means of simulation studies that active learning can yield far more efficient reviewing than manual reviewing while providing high quality. Furthermore, we describe the options of the free and open source research software and present the results from user experience tests. We invite the community to contribute to open source projects such as our own that provide measurable and reproducible improvements over current practice. 148 | DA - 2021/02// 149 | PY - 2021 150 | DO - 10.1038/s42256-020-00287-7 151 | DP - www-nature-com.proxy.library.uu.nl 152 | VL - 3 153 | IS - 2 154 | SP - 125 155 | EP - 133 156 | J2 - Nat Mach Intell 157 | LA - en 158 | SN - 2522-5839 159 | UR - http://www.nature.com/articles/s42256-020-00287-7 160 | Y2 - 2022/09/12/09:02:50 161 | L2 - http://www.nature.com/articles/s42256-020-00287-7 162 | L4 - http://www.nature.com/articles/s42256-020-00287-7.pdf 163 | KW - Computational biology and bioinformatics 164 | KW - Computer science 165 | KW - Medical research 166 | KW - SARS-CoV-2 167 | ER - 168 | 169 | TY - JOUR 170 | TI - Machine learning for screening prioritization in systematic reviews: comparative performance of Abstrackr and EPPI-Reviewer 171 | AU - Tsou, Amy Y. 172 | AU - Treadwell, Jonathan R. 
173 | AU - Erinoff, Eileen 174 | AU - Schoelles, Karen 175 | T2 - Systematic Reviews 176 | AB - Improving the speed of systematic review (SR) development is key to supporting evidence-based medicine. Machine learning tools which semi-automate citation screening might improve efficiency. Few studies have assessed use of screening prioritization functionality or compared two tools head to head. In this project, we compared performance of two machine-learning tools for potential use in citation screening. 177 | DA - 2020/04/02/ 178 | PY - 2020 179 | DO - 10.1186/s13643-020-01324-7 180 | DP - Springer Link 181 | VL - 9 182 | IS - 1 183 | SP - 73 184 | J2 - Syst Rev 185 | LA - en 186 | SN - 2046-4053 187 | ST - Machine learning for screening prioritization in systematic reviews 188 | UR - https://doi.org/10.1186/s13643-020-01324-7 189 | Y2 - 2022/09/12/09:04:15 190 | L4 - http://link.springer.com/content/pdf/10.1186%2Fs13643-020-01324-7.pdf 191 | KW - Abstrackr 192 | KW - Citation screening 193 | KW - Efficiency 194 | KW - EPPI-Reviewer 195 | KW - Machine learning 196 | KW - Methodology 197 | KW - Screening burden 198 | KW - Screening prioritization 199 | KW - Text-mining 200 | ER - 201 | 202 | TY - JOUR 203 | TI - Machine learning techniques for the automation of literature reviews and systematic reviews in EFSA 204 | AU - Jaspers, Stijn 205 | AU - De Troyer, Ewoud 206 | AU - Aerts, Marc 207 | T2 - EFSA Supporting Publications 208 | AB - This Report presents the results from EFSA project RC/EFSA/AMU/2016/01 related to the implementation of machine learning techniques for literature reviews and systematic reviews in EFSA. An overview of the different steps of a systematic review is provided, along with possible ways for automation. Although it was found that most steps could benefit from automation, it was also observed that some steps require more sophisticated methods than those encompassed within the machine learning framework. Availability of data and methodology allowed for the development of an automatic screening tool based on several machine learning techniques. The developed shiny R application can be used for the screening of abstracts and full texts. Properties of machine learning techniques are discussed in this Report together with their most important advantages and disadvantages. The latter discussion includes both general properties, as well as context-specific properties based on their performance in three case studies. Although creating a universal automatic data extraction tool was considered to be infeasible in this stage, this step of the systematic review was addressed to allow the reviewer to scan the uploaded pdf files for certain words or string of words. Based on observations from the performed case studies, recommendations were made regarding which methods are preferred in specific situations. More explicitly, a discussion is made about the performance of the classifiers with respect to the magnitude of the pool of papers to be screened as well as to the amount of imbalance, referring to the proportion of relevant and irrelevant papers. Finally, it was concluded that the results presented in this report provide proof that the developed shiny application could be efficiently used in combination with other software such as DistillerSR. 
209 | DA - 2018/// 210 | PY - 2018 211 | DO - 10.2903/sp.efsa.2018.EN-1427 212 | DP - Wiley Online Library 213 | VL - 15 214 | IS - 6 215 | SP - 1427E 216 | LA - en 217 | SN - 2397-8325 218 | UR - https://onlinelibrary.wiley.com/doi/abs/10.2903/sp.efsa.2018.EN-1427 219 | Y2 - 2022/09/12/09:04:41 220 | L2 - https://efsa.onlinelibrary.wiley.com/doi/abs/10.2903/sp.efsa.2018.EN-1427 221 | L4 - https://onlinelibrary.wiley.com/doi/pdfdirect/10.2903/sp.efsa.2018.EN-1427 222 | ER - 223 | 224 | TY - JOUR 225 | TI - Living systematic reviews: 2. Combining human and machine effort 226 | AU - Thomas, James 227 | AU - Noel-Storr, Anna 228 | AU - Marshall, Iain 229 | AU - Wallace, Byron 230 | AU - McDonald, Steven 231 | AU - Mavergames, Chris 232 | AU - Glasziou, Paul 233 | AU - Shemilt, Ian 234 | AU - Synnot, Anneliese 235 | AU - Turner, Tari 236 | AU - Elliott, Julian 237 | AU - Agoritsas, Thomas 238 | AU - Hilton, John 239 | AU - Perron, Caroline 240 | AU - Akl, Elie 241 | AU - Hodder, Rebecca 242 | AU - Pestridge, Charlotte 243 | AU - Albrecht, Lauren 244 | AU - Horsley, Tanya 245 | AU - Platt, Joanne 246 | AU - Armstrong, Rebecca 247 | AU - Nguyen, Phi Hung 248 | AU - Plovnick, Robert 249 | AU - Arno, Anneliese 250 | AU - Ivers, Noah 251 | AU - Quinn, Gail 252 | AU - Au, Agnes 253 | AU - Johnston, Renea 254 | AU - Rada, Gabriel 255 | AU - Bagg, Matthew 256 | AU - Jones, Arwel 257 | AU - Ravaud, Philippe 258 | AU - Boden, Catherine 259 | AU - Kahale, Lara 260 | AU - Richter, Bernt 261 | AU - Boisvert, Isabelle 262 | AU - Keshavarz, Homa 263 | AU - Ryan, Rebecca 264 | AU - Brandt, Linn 265 | AU - Kolakowsky-Hayner, Stephanie A. 266 | AU - Salama, Dina 267 | AU - Brazinova, Alexandra 268 | AU - Nagraj, Sumanth Kumbargere 269 | AU - Salanti, Georgia 270 | AU - Buchbinder, Rachelle 271 | AU - Lasserson, Toby 272 | AU - Santaguida, Lina 273 | AU - Champion, Chris 274 | AU - Lawrence, Rebecca 275 | AU - Santesso, Nancy 276 | AU - Chandler, Jackie 277 | AU - Les, Zbigniew 278 | AU - Schünemann, Holger J. 279 | AU - Charidimou, Andreas 280 | AU - Leucht, Stefan 281 | AU - Shemilt, Ian 282 | AU - Chou, Roger 283 | AU - Low, Nicola 284 | AU - Sherifali, Diana 285 | AU - Churchill, Rachel 286 | AU - Maas, Andrew 287 | AU - Siemieniuk, Reed 288 | AU - Cnossen, Maryse C. 289 | AU - MacLehose, Harriet 290 | AU - Simmonds, Mark 291 | AU - Cossi, Marie-Joelle 292 | AU - Macleod, Malcolm 293 | AU - Skoetz, Nicole 294 | AU - Counotte, Michel 295 | AU - Marshall, Iain 296 | AU - Soares-Weiser, Karla 297 | AU - Craigie, Samantha 298 | AU - Marshall, Rachel 299 | AU - Srikanth, Velandai 300 | AU - Dahm, Philipp 301 | AU - Martin, Nicole 302 | AU - Sullivan, Katrina 303 | AU - Danilkewich, Alanna 304 | AU - Martínez García, Laura 305 | AU - Synnot, Anneliese 306 | AU - Danko, Kristen 307 | AU - Mavergames, Chris 308 | AU - Taylor, Mark 309 | AU - Donoghue, Emma 310 | AU - Maxwell, Lara J. 311 | AU - Thayer, Kris 312 | AU - Dressler, Corinna 313 | AU - McAuley, James 314 | AU - Thomas, James 315 | AU - Egan, Cathy 316 | AU - McDonald, Steve 317 | AU - Tritton, Roger 318 | AU - Elliott, Julian 319 | AU - McKenzie, Joanne 320 | AU - Tsafnat, Guy 321 | AU - Elliott, Sarah A. 
322 | AU - Meerpohl, Joerg 323 | AU - Tugwell, Peter 324 | AU - Etxeandia, Itziar 325 | AU - Merner, Bronwen 326 | AU - Turgeon, Alexis 327 | AU - Featherstone, Robin 328 | AU - Mondello, Stefania 329 | AU - Turner, Tari 330 | AU - Foxlee, Ruth 331 | AU - Morley, Richard 332 | AU - van Valkenhoef, Gert 333 | AU - Garner, Paul 334 | AU - Munafo, Marcus 335 | AU - Vandvik, Per 336 | AU - Gerrity, Martha 337 | AU - Munn, Zachary 338 | AU - Wallace, Byron 339 | AU - Glasziou, Paul 340 | AU - Murano, Melissa 341 | AU - Wallace, Sheila A. 342 | AU - Green, Sally 343 | AU - Newman, Kristine 344 | AU - Watts, Chris 345 | AU - Grimshaw, Jeremy 346 | AU - Nieuwlaat, Robby 347 | AU - Weeks, Laura 348 | AU - Gurusamy, Kurinchi 349 | AU - Nikolakopoulou, Adriani 350 | AU - Weigl, Aaron 351 | AU - Haddaway, Neal 352 | AU - Noel-Storr, Anna 353 | AU - Wells, George 354 | AU - Hartling, Lisa 355 | AU - O'Connor, Annette 356 | AU - Wiercioch, Wojtek 357 | AU - Hayden, Jill 358 | AU - Page, Matthew 359 | AU - Wolfenden, Luke 360 | AU - Helfand, Mark 361 | AU - Pahwa, Manisha 362 | AU - Yepes Nuñez, Juan José 363 | AU - Higgins, Julian 364 | AU - Pardo, Jordi Pardo 365 | AU - Yost, Jennifer 366 | AU - Hill, Sophie 367 | AU - Pearson, Leslea 368 | T2 - Journal of Clinical Epidemiology 369 | AB - New approaches to evidence synthesis, which use human effort and machine automation in mutually reinforcing ways, can enhance the feasibility and sustainability of living systematic reviews. Human effort is a scarce and valuable resource, required when automation is impossible or undesirable, and includes contributions from online communities (“crowds”) as well as more conventional contributions from review authors and information specialists. Automation can assist with some systematic review tasks, including searching, eligibility assessment, identification and retrieval of full-text reports, extraction of data, and risk of bias assessment. Workflows can be developed in which human effort and machine automation can each enable the other to operate in more effective and efficient ways, offering substantial enhancement to the productivity of systematic reviews. This paper describes and discusses the potential—and limitations—of new ways of undertaking specific tasks in living systematic reviews, identifying areas where these human/machine “technologies” are already in use, and where further research and development is needed. While the context is living systematic reviews, many of these enabling technologies apply equally to standard approaches to systematic reviewing. 
370 | DA - 2017/11/01/ 371 | PY - 2017 372 | DP - ScienceDirect 373 | VL - 91 374 | SP - 31 375 | EP - 37 376 | J2 - Journal of Clinical Epidemiology 377 | LA - en 378 | SN - 0895-4356 379 | ST - Living systematic reviews 380 | UR - https://www.sciencedirect.com/science/article/pii/S0895435617306042 381 | Y2 - 2022/09/12/09:04:49 382 | L2 - http://www.sciencedirect.com/science/article/pii/S0895435617306042 383 | L4 - http://www.sciencedirect.com/science/article/pii/S0895435617306042/pdfft?md5=4979be51940c68e214145bce3c4183f9&pid=1-s2.0-S0895435617306042-main.pdf&isDTMRedir=Y 384 | KW - Machine learning 385 | KW - Automation 386 | KW - Citizen science 387 | KW - Crowdsourcing 388 | KW - Systematic review 389 | KW - Text mining 390 | ER - 391 | 392 | TY - JOUR 393 | TI - Machine learning to assist risk-of-bias assessments in systematic reviews 394 | AU - Millard, Louise AC 395 | AU - Flach, Peter A 396 | AU - Higgins, Julian PT 397 | T2 - International Journal of Epidemiology 398 | AB - Background: Risk-of-bias assessments are now a standard component of systematic reviews. At present, reviewers need to manually identify relevant parts of research articles for a set of methodological elements that affect the risk of bias, in order to make a risk-of-bias judgement for each of these elements. We investigate the use of text mining methods to automate risk-of-bias assessments in systematic reviews. We aim to identify relevant sentences within the text of included articles, to rank articles by risk of bias and to reduce the number of risk-of-bias assessments that the reviewers need to perform by hand. Methods: We use supervised machine learning to train two types of models, for each of the three risk-of-bias properties of sequence generation, allocation concealment and blinding. The first model predicts whether a sentence in a research article contains relevant information. The second model predicts a risk-of-bias value for each research article. We use logistic regression, where each independent variable is the frequency of a word in a sentence or article, respectively. Results: We found that sentences can be successfully ranked by relevance with area under the receiver operating characteristic (ROC) curve (AUC) > 0.98. Articles can be ranked by risk of bias with AUC > 0.72. We estimate that more than 33% of articles can be assessed by just one reviewer, where two reviewers are normally required. Conclusions: We show that text mining can be used to assist risk-of-bias assessments. 
399 | DA - 2016/02/01/ 400 | PY - 2016 401 | DO - 10.1093/ije/dyv306 402 | DP - Silverchair 403 | VL - 45 404 | IS - 1 405 | SP - 266 406 | EP - 277 407 | J2 - International Journal of Epidemiology 408 | SN - 0300-5771 409 | UR - https://doi.org/10.1093/ije/dyv306 410 | Y2 - 2022/09/12/09:05:00 411 | L2 - https://academic-oup-com.proxy.library.uu.nl/ije/article/45/1/266/2363602 412 | L4 - https://academic-oup-com.proxy.library.uu.nl/ije/article-pdf/45/1/266/24170552/dyv306.pdf 413 | ER - 414 | 415 | -------------------------------------------------------------------------------- /tests/demo_data/duplicate_data_with_doi.csv: -------------------------------------------------------------------------------- 1 | title, abstract, doi, publication_year 2 | exact copy title without stopwords, exact copy abstract without stopwords, doi1, 2005 3 | exact copy title without stopwords, exact copy abstract without stopwords, doi1, 2005 4 | other title without stopwords same doi, other abstract without stopwords same doi, doi1, 2006 5 | 1 exact copy title without stopwords, 1 exact copy abstract without stopwords, doi2, 2005 6 | an "exact" copy of title without stopwords, an "exact" copy of abstract without stopwords, doi3, 2007 -------------------------------------------------------------------------------- /tests/demo_data/duplicate_data_without_doi.csv: -------------------------------------------------------------------------------- 1 | title, abstract, publication_year 2 | exact copy title without stopwords, exact copy abstract without stopwords, 2005 3 | exact copy title without stopwords, exact copy abstract without stopwords, 2005 4 | other title without stopwords, other abstract without stopwords, 2006 5 | 1 exact copy title without stopwords, 1 exact copy abstract without stopwords, 2005 6 | an "exact" copy of title without stopwords, an "exact" copy of abstract without stopwords, 2007 -------------------------------------------------------------------------------- /tests/demo_data/sample_data.csv: -------------------------------------------------------------------------------- 1 | title, doi, publication_year 2 | title1, doi1, 2005 3 | title2, doi2, 2001 4 | title3, doi3, 5 | title4, doi4, 2003 6 | title5, doi5, 2004 7 | title6, doi6, 2000 -------------------------------------------------------------------------------- /tests/demo_data/snowballing_doi.csv: -------------------------------------------------------------------------------- 1 | ,title,doi,included 2 | 0,"Myrmecochorous plants in Australia and their dispersal by ants",https://doi.org/10.1071/bt9750475,1 3 | 1,"Mimicking the one-dimensional marginal distributions of processes having an ito differential",https://doi.org/10.1007/bf00699039,0 4 | -------------------------------------------------------------------------------- /tests/demo_data/snowballing_openalex.csv: -------------------------------------------------------------------------------- 1 | ,openalex_id,title,included 2 | 0,https://openalex.org/W2051970045,"Myrmecochorous plants in Australia and their dispersal by ants",1 3 | 1,https://openalex.org/W104454400,"Mimicking the one-dimensional marginal distributions of processes having an ito differential",0 4 | -------------------------------------------------------------------------------- /tests/test_compose.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from asreviewcontrib.datatools.compose import _check_order_arg 6 | from 
asreviewcontrib.datatools.compose import _check_resolve_arg 7 | from asreviewcontrib.datatools.compose import _check_suffix 8 | from asreviewcontrib.datatools.compose import create_composition 9 | 10 | parent_dir = Path(__file__).parent 11 | file_1 = Path(parent_dir, "demo_data", "dataset_1.ris") 12 | file_2 = Path(parent_dir, "demo_data", "dataset_2.ris") 13 | 14 | # labeling action on input paths in list = [relevant, irrelevant, labeled, unlabeled] 15 | input_files_1 = [ 16 | file_1, 17 | file_1, 18 | file_1, 19 | file_1, 20 | ] 21 | 22 | input_files_2 = [None, None, file_1, file_2] 23 | 24 | 25 | # test whether input and output suffixes are compatible 26 | def test_suffixes(): 27 | with pytest.raises(ValueError): 28 | _check_suffix(input_files_1, "conflicting_suffix.csv") 29 | 30 | 31 | # test whether wrong input hierarchy/order raises error 32 | def test_input_hierarchy(): 33 | with pytest.raises(ValueError): 34 | _check_order_arg("abc") 35 | with pytest.raises(ValueError): 36 | _check_order_arg("riur") 37 | 38 | 39 | # test whether wrong input conflict resolve raises error 40 | def test_input_resolve(): 41 | with pytest.raises(ValueError): 42 | _check_resolve_arg("fly") 43 | 44 | 45 | def test_label_prioritization(): 46 | # input identical datasets and overwrite everything with the relevant labels 47 | df_1 = create_composition(*input_files_1, order="riu") 48 | assert df_1["included"].value_counts()[1] == len(df_1) 49 | 50 | # input identical datasets and overwrite everything with the irrelevant labels 51 | df_2 = create_composition(*input_files_1, order="iru") 52 | assert df_2["included"].value_counts()[0] == len(df_2) 53 | 54 | # input identical datasets and overwrite everything as unlabeled 55 | df_3 = create_composition(*input_files_1, order="uri") 56 | assert df_3["included"].value_counts()[-1] == len(df_3) 57 | 58 | # input different datasets with some identical records, combining as labeled and 59 | # unlabeled data 60 | df_4 = create_composition(*input_files_2, order="riu") 61 | df_4_counts = df_4["included"].value_counts() 62 | assert df_4_counts[-1] == 7 and df_4_counts[0] == 3 and df_4_counts[1] == 1 63 | -------------------------------------------------------------------------------- /tests/test_dedup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from asreview.data import ASReviewData 4 | 5 | from asreviewcontrib.datatools.dedup import deduplicate_data 6 | 7 | test_dir = Path(__file__).parent 8 | file_without_doi = Path(test_dir, "demo_data", "duplicate_data_without_doi.csv") 9 | file_with_doi = Path(test_dir, "demo_data", "duplicate_data_with_doi.csv") 10 | 11 | 12 | def test_dedup_without_doi(): 13 | """ 14 | Test deduplication without DOI. 15 | 16 | The test data contains 5 records, 1 of which is an exact duplicate. 17 | 18 | Same as: 19 | 20 | asreview data dedup tests/demo_data/duplicate_data_without_doi.csv 21 | Not using doi for deduplication because there is no such data. 22 | Found 1 duplicates in dataset with 5 records. 
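    Note that deduplicate_data modifies the ASReviewData object in place (no
    return value is used), so the assertion below checks that 5 - 1 = 4
    records remain after deduplication.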
23 | """ 24 | data = ASReviewData.from_file(file_without_doi) 25 | deduplicate_data(data) 26 | assert len(data.df) == 4 27 | 28 | 29 | def test_output(tmpdir): 30 | data = ASReviewData.from_file(file_without_doi) 31 | output_path = Path(tmpdir, "test_dedup.csv") 32 | deduplicate_data(data, output_path=output_path) 33 | as_test = ASReviewData.from_file(output_path) 34 | assert len(data.df) == 4 35 | assert len(as_test.df) == 4 36 | 37 | 38 | def test_dedup_with_doi(): 39 | """ 40 | Test deduplication with DOI. 41 | 42 | The test data contains 5 records, 1 of which is an exact duplicate 43 | and 1 of which is a duplicate based on DOI. 44 | 45 | Same as: 46 | 47 | asreview data dedup tests/demo_data/duplicate_data_with_doi.csv 48 | Found 2 duplicates in dataset with 5 records. 49 | """ 50 | data = ASReviewData.from_file(file_with_doi) 51 | deduplicate_data(data) 52 | assert len(data.df) == 3 53 | 54 | 55 | def test_dedup_with_similarity_without_doi(): 56 | """ 57 | Test deduplication with similarity without DOI. 58 | 59 | The test data contains 5 records, 1 of which is an exact duplicate 60 | and 1 of which is a duplicate based on similarity. 61 | 62 | Same as: 63 | 64 | asreview data dedup tests/demo_data/duplicate_data_without_doi.csv --similar \ 65 | --threshold 0.95 66 | Not using doi for deduplication because there is no such data. 67 | Found 2 duplicates in dataset with 5 records. 68 | """ 69 | data = ASReviewData.from_file(file_without_doi) 70 | deduplicate_data(data, similar=True, threshold=0.95) 71 | assert len(data.df) == 3, "Original data should have 5 records." 72 | 73 | 74 | def test_dedup_with_similarity_with_doi(): 75 | """ 76 | Test deduplication with similarity with DOI. 77 | 78 | The test data contains 5 records, 1 of which is an exact duplicate, 79 | 1 of which is a duplicate based on DOI, and 1 of which is a duplicate 80 | based on similarity. 81 | 82 | Same as: 83 | 84 | asreview data dedup tests/demo_data/duplicate_data_with_doi.csv --similar \ 85 | --threshold 0.95 86 | Found 3 duplicates in dataset with 5 records. 87 | """ 88 | data = ASReviewData.from_file(file_with_doi) 89 | deduplicate_data(data, similar=True, threshold=0.95) 90 | assert len(data.df) == 2 91 | 92 | 93 | def test_dedup_with_similarity_without_doi_stopwords(): 94 | """ 95 | Test deduplication with similarity without DOI and removing stopwords. 96 | 97 | The test data contains 5 records, 1 of which is an exact duplicate, 98 | 1 of which is a duplicate based on similarity, and 1 of which is a 99 | duplicate based on similarity without stopwords. 100 | 101 | Same as: 102 | 103 | asreview data dedup tests/demo_data/duplicate_data_without_doi.csv --similar \ 104 | --threshold 0.95 --stopwords 105 | Not using doi for deduplication because there is no such data. 106 | Found 3 duplicates in dataset with 5 records. 107 | """ 108 | data = ASReviewData.from_file(file_without_doi) 109 | deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english") 110 | assert len(data.df) == 2 111 | 112 | 113 | def test_dedup_with_similarity_with_doi_stopwords(): 114 | """ 115 | Test deduplication with similarity with DOI and removing stopwords. 116 | 117 | The test data contains 5 records, 1 of which is an exact duplicate, 118 | 1 of which is a duplicate based on DOI, 1 of which is a duplicate 119 | based on similarity, and 1 of which is a duplicate based on similarity 120 | without stopwords. 
121 | 
122 |     Same as:
123 | 
124 |     asreview data dedup tests/demo_data/duplicate_data_with_doi.csv --similar \
125 |         --threshold 0.95 --stopwords
126 |     Found 4 duplicates in dataset with 5 records.
127 |     """
128 |     data = ASReviewData.from_file(file_with_doi)
129 |     deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english")
130 |     assert len(data.df) == 1
131 | 
132 | 
133 | def test_threshold_zero():
134 |     data = ASReviewData.from_file(file_with_doi)
135 |     deduplicate_data(data, similar=True, threshold=0)
136 |     assert len(data.df) == 1
137 | 
--------------------------------------------------------------------------------
/tests/test_describe.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | 
4 | def test_describe():
5 |     subprocess.run(["asreview", "data-describe", "benchmark:van_de_schoot2017"], check=True)
6 | 
--------------------------------------------------------------------------------
/tests/test_sample.py:
--------------------------------------------------------------------------------
1 | # Unit tests for sample.py.
2 | from pathlib import Path
3 | 
4 | import pandas as pd
5 | 
6 | from asreviewcontrib.datatools.sample import sample
7 | 
8 | INPUT_PATH = Path(__file__).parent / "demo_data" / "sample_data.csv"
9 | 
10 | 
11 | def test_sample(tmpdir):
12 |     sample(INPUT_PATH, tmpdir / "output.csv", 1, "publication_year")
13 |     df = pd.read_csv(tmpdir / "output.csv")
14 |     assert len(df) == 3
15 |     assert "publication_year" in df.columns
16 |     assert df.iloc[0]["publication_year"] == 2000
17 |     assert df.iloc[2]["publication_year"] == 2005
18 | 
--------------------------------------------------------------------------------
/tests/test_snowball.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | import pandas as pd
4 | import pyalex
5 | 
6 | from asreviewcontrib.datatools.snowball import backward_snowballing
7 | from asreviewcontrib.datatools.snowball import forward_snowballing
8 | from asreviewcontrib.datatools.snowball import openalex_from_doi
9 | from asreviewcontrib.datatools.snowball import snowball
10 | 
11 | INPUT_DIR = Path(__file__).parent / "demo_data"
12 | EMAIL = "asreview@uu.nl"
13 | 
14 | pyalex.config.email = EMAIL
15 | 
16 | # These works were chosen for testing forward snowballing. Each has a DOI,
17 | # cites other works, and is cited by other works. Their cited_by_count is
18 | # below 400, so collecting all citing works takes at most two requests, and
19 | # they are from the previous century, so the cited_by_count is unlikely to
20 | # change much. These are the same records as in the demo datasets
21 | # 'snowballing_doi.csv' and 'snowballing_openalex.csv'.
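# A minimal sketch for spot-checking these numbers by hand (assuming the
# pyalex Works interface; the counts can drift as OpenAlex updates its index):
#
#     from pyalex import Works
#     Works()["W2051970045"]["cited_by_count"]  # 372 at the time of writing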
22 | WORKS = [ 23 | { 24 | "id": "https://openalex.org/W2051970045", 25 | "doi": "https://doi.org/10.1071/bt9750475", 26 | "title": "Myrmecochorous plants in Australia and their dispersal by ants", 27 | "cited_by_count": 372, 28 | "cited_by": "https://openalex.org/W2174650845", 29 | "cites": "https://openalex.org/W1538725992", 30 | }, 31 | { 32 | "id": "https://openalex.org/W104454400", 33 | "doi": "https://doi.org/10.1007/bf00699039", 34 | "title": ( 35 | "Mimicking the one-dimensional marginal distributions of processes having" 36 | " an ito differential" 37 | ), 38 | "cited_by_count": 299, 39 | "cited_by": "https://openalex.org/W1842249978", 40 | "cites": "https://openalex.org/W1513091520", 41 | }, 42 | ] 43 | 44 | 45 | def test_openalex_from_doi(): 46 | dois = [ 47 | "https://doi.org/10.1042/cs20220150", 48 | "https://doi.org/10.1042/bst20220734", 49 | "not_a_doi", 50 | ] 51 | 52 | assert openalex_from_doi(dois) == { 53 | "10.1042/cs20220150": "https://openalex.org/W4386305682", 54 | "10.1042/bst20220734": "https://openalex.org/W4312006214", 55 | "not_a_doi": None, 56 | } 57 | 58 | 59 | def test_backward_snowballing(): 60 | identifiers = [ 61 | "https://openalex.org/W4281483266", 62 | "https://openalex.org/W2008620264", 63 | ] 64 | 65 | backwards_citations = backward_snowballing(identifiers) 66 | 67 | assert "https://openalex.org/W1864285629" in [ 68 | field_dict["id"] for field_dict in backwards_citations[identifiers[0]] 69 | ] 70 | assert "https://openalex.org/W950821216" in [ 71 | field_dict["id"] for field_dict in backwards_citations[identifiers[1]] 72 | ] 73 | 74 | 75 | def test_forward_snowballing(): 76 | identifiers = [work["id"] for work in WORKS] 77 | 78 | forwards_citations = forward_snowballing(identifiers) 79 | 80 | assert WORKS[0]["cited_by"] in [ 81 | field_dict["id"] for field_dict in forwards_citations[identifiers[0]] 82 | ] 83 | assert WORKS[1]["cited_by"] in [ 84 | field_dict["id"] for field_dict in forwards_citations[identifiers[1]] 85 | ] 86 | 87 | 88 | def test_openalex_id_forward(tmpdir): 89 | out_fp = Path(tmpdir, "forward.csv") 90 | snowball( 91 | input_path=INPUT_DIR / "snowballing_openalex.csv", 92 | output_path=out_fp, 93 | forward=True, 94 | backward=False, 95 | use_all=False, 96 | email=EMAIL, 97 | ) 98 | df = pd.read_csv(out_fp) 99 | assert len(df) >= 364 100 | 101 | all_out_fp = Path(tmpdir, "forward_all.csv") 102 | snowball( 103 | input_path=INPUT_DIR / "snowballing_openalex.csv", 104 | output_path=all_out_fp, 105 | forward=True, 106 | backward=False, 107 | use_all=True, 108 | email=EMAIL, 109 | ) 110 | df_all = pd.read_csv(all_out_fp) 111 | assert len(df_all) >= 656 112 | 113 | 114 | def test_openalex_id_backward(tmpdir): 115 | out_fp = Path(tmpdir, "backward.csv") 116 | snowball( 117 | input_path=INPUT_DIR / "snowballing_openalex.csv", 118 | output_path=out_fp, 119 | forward=False, 120 | backward=True, 121 | use_all=False, 122 | email=EMAIL, 123 | ) 124 | df = pd.read_csv(out_fp) 125 | # Actual value at time of writing test is 40 (2024-08-26). 126 | # In theory the number of results should be stable for backward snowballing, 127 | # but OpenAlex sometimes makes changes, so we allow for a margin. 128 | # The margins of this assert and the next assert should not overlap, 129 | # otherwise we don't test if 'use_all' works. 
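    # Note: use_all=False snowballs only from the records labeled as included
    # (1 of the 2 demo records); use_all=True also uses the remaining records,
    # which is why the second run below is expected to find more references.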
130 | assert 38 <= len(df) <= 42 131 | 132 | all_out_fp = Path(tmpdir, "backward_all.csv") 133 | snowball( 134 | input_path=INPUT_DIR / "snowballing_openalex.csv", 135 | output_path=all_out_fp, 136 | forward=False, 137 | backward=True, 138 | use_all=True, 139 | email=EMAIL, 140 | ) 141 | df_all = pd.read_csv(all_out_fp) 142 | # Actual value at time of writing test is 46 (2024-08-26). 143 | assert 43 <= len(df_all) <= 49 144 | 145 | 146 | def test_snowballing_from_doi(tmpdir): 147 | out_fp = Path(tmpdir, "doi_all.csv") 148 | snowball( 149 | input_path=INPUT_DIR / "snowballing_doi.csv", 150 | output_path=out_fp, 151 | forward=False, 152 | backward=True, 153 | use_all=True, 154 | email=EMAIL, 155 | ) 156 | df = pd.read_csv(out_fp) 157 | # Actual value at time of writing test is 46 (2024-08-26). 158 | # See comments in 'test_openalex_id_backward'. 159 | assert 43 <= len(df) <= 49 160 | -------------------------------------------------------------------------------- /tests/test_stack.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from asreview.data import ASReviewData 4 | 5 | from asreviewcontrib.datatools.stack import vstack 6 | 7 | test_dir = Path(__file__).parent 8 | file_1 = Path(test_dir, "demo_data", "dataset_1.ris") 9 | file_2 = Path(test_dir, "demo_data", "dataset_2.ris") 10 | 11 | 12 | def test_stack(tmpdir): 13 | output_path = Path(tmpdir, "test_output.ris") 14 | vstack(output_path, [file_1, file_2]) 15 | as_test = ASReviewData.from_file(output_path) 16 | 17 | assert len(as_test.df) == 14 18 | assert as_test.df["included"].value_counts()[-1] == 9 19 | assert as_test.df["included"].value_counts()[0] == 3 20 | assert as_test.df["included"].value_counts()[1] == 2 21 | --------------------------------------------------------------------------------