├── .gitattributes ├── .github └── workflows │ ├── ci-workflow.yml │ └── pythonpublish.yml ├── .gitignore ├── .zenodo.json ├── LICENSE ├── README.md ├── Tutorials.md ├── asreviewcontrib └── datatools │ ├── __init__.py │ ├── compose.py │ ├── convert.py │ ├── dedup.py │ ├── describe.py │ ├── entrypoint.py │ ├── sample.py │ ├── snowball.py │ └── stack.py ├── dedup_similar.png ├── pyproject.toml └── tests ├── __init__.py ├── demo_data ├── dataset_1.ris ├── dataset_2.ris ├── duplicate_data_with_doi.csv ├── duplicate_data_without_doi.csv ├── sample_data.csv ├── snowballing_doi.csv └── snowballing_openalex.csv ├── test_compose.py ├── test_dedup.py ├── test_describe.py ├── test_sample.py ├── test_snowball.py └── test_stack.py /.gitattributes: -------------------------------------------------------------------------------- 1 | asreviewcontrib/datatools/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/ci-workflow.yml: -------------------------------------------------------------------------------- 1 | name: test-suite 2 | on: [push, pull_request] 3 | jobs: 4 | lint-python: 5 | name: lint-python 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | - uses: actions/setup-python@v5 10 | with: 11 | python-version: '3.11' 12 | architecture: 'x64' 13 | - name: Install ruff 14 | run: | 15 | pip install ruff 16 | - name: Lint python with ruff 17 | run: | 18 | ruff check . 19 | test-master: 20 | name: pytest 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v4 24 | - uses: actions/setup-python@v5 25 | - name: Install packages and run tests 26 | run: | 27 | python3 -m pip install pip -U 28 | pip install pytest 29 | pip install . 30 | pytest 31 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | id-token: write 10 | 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.x" 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install build 24 | - name: Build package 25 | run: python -m build 26 | - name: Publish package 27 | uses: pypa/gh-action-pypi-publish@release/v1 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | asreviewcontrib/datatools/_version.py -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "description":"ASReview Datatools is an extension to ASReview LAB that can be used for describing basic properties of a dataset (e.g., number of papers, number of inclusions, the amount of missing data and duplicates), converting file formats via the command line, and cleaning your (input) data by removing duplicate records.", 3 | "title":"ASReview Datatools", 4 | "creators":[ 5 | { 6 | "name":"ASReview LAB developers", 7 | "affiliation":"Utrecht University" 8 | } 9 | ], 10 | "keywords":[ 11 | "data", 12 | "systematic review", 13 | "active learning", 14 | "statistics", 15 | "machine learning", 16 | "text data", 17 | "natural language processing" 18 | ], 19 | "related_identifiers":[ 20 | { 21 | "scheme":"doi", 22 | "relation":"isSupplementTo", 23 | "identifier":"10.1038/s42256-020-00287-7" 24 | }, 25 | { 26 | "scheme": "doi", 27 | "identifier": "10.5281/zenodo.3345592", 28 | "relation": "isSupplementTo" 29 | } 30 | ], 31 | "license":"MIT", 32 | "upload_type":"software" 33 | } 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ASReview - Utrecht University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ASReview Datatools 2 | 3 | [![PyPI version](https://badge.fury.io/py/asreview-datatools.svg)](https://badge.fury.io/py/asreview-datatools) [![Downloads](https://pepy.tech/badge/asreview-datatools)](https://pepy.tech/project/asreview-datatools) [![DOI](https://zenodo.org/badge/239740436.svg)](https://zenodo.org/badge/latestdoi/239740436) 4 | 5 | ASReview Datatools is an extension to [ASReview 6 | LAB](https://github.com/asreview/asreview) that can be used to: 7 | - [**Describe**](#data-describe) basic properties of a dataset 8 | - [**Convert**](#data-convert) file formats 9 | - [**Deduplicate**](#data-dedup) data 10 | - [**Stack**](#data-vstack-experimental) multiple datasets 11 | - [**Compose**](#data-compose-experimental) a single (labeled, partly labeled, or unlabeled) dataset from multiple datasets 12 | - [**Snowball**](#snowball) a dataset to find incoming or outgoing citations 13 | - [**Sample**](#sample) old, random, and new papers in order to check if the terminology has changed over time. 14 | 15 | Several [tutorials](Tutorials.md) are available that show how 16 | `ASReview-Datatools` can be used in different scenarios. 17 | 18 | ASReview datatools is available for ASReview LAB version 1 or later. 19 | If you are using ASReview LAB version 0.x, use [ASReview-statistics](https://pypi.org/project/asreview-statistics/) instead of ASReview datatools. 20 | 21 | ## Installation 22 | ASReview Datatools requires Python 3.7+ and [ASReview LAB](https://github.com/asreview/asreview) version 1.1 or later. 23 | 24 | The easiest way to install the extension is to install it from PyPI: 25 | 26 | ``` bash 27 | pip install asreview-datatools 28 | ``` 29 | 30 | After installation of the datatools extension, `asreview` should automatically 31 | detect it. Test this with the following command: 32 | 33 | ```bash 34 | asreview --help 35 | ``` 36 | 37 | The extension is successfully installed if it lists `asreview data`. 38 | 39 | To make sure that you are working with the latest version of datatools you can use: 40 | 41 | ```bash 42 | pip install asreview-datatools --upgrade 43 | ``` 44 | 45 | ## Getting started 46 | 47 | ASReview Datatools is a command line tool that extends ASReview LAB. Each 48 | subsection below describes one of the tools. 
The structure is
49 | 
50 | ```bash
51 | asreview data NAME_OF_TOOL
52 | ```
53 | 
54 | where `NAME_OF_TOOL` is the name of one of the tools below (`describe`, `convert`, `dedup`, `vstack`, `compose`, `snowball`, or `sample`)
55 | followed by positional arguments and optional arguments.
56 | 
57 | Each tool has its own help description which is available with
58 | 
59 | ```bash
60 | asreview data NAME_OF_TOOL -h
61 | ```
62 | 
63 | ## Tools
64 | ### Data Describe
65 | 
66 | Describe the content of a dataset
67 | 
68 | ```bash
69 | asreview data describe MY_DATASET.csv
70 | ```
71 | 
72 | Export the results to a file (`output.json`)
73 | 
74 | ```bash
75 | asreview data describe MY_DATASET.csv -o output.json
76 | ```
77 | 
78 | Describe the `van_de_schoot_2018` dataset from the [benchmark
79 | platform](https://github.com/asreview/systematic-review-datasets).
80 | 
81 | ```bash
82 | asreview data describe synergy:van_de_schoot_2018 -o output.json
83 | ```
84 | ```
85 | {
86 |   "asreviewVersion": "1.1",
87 |   "apiVersion": "1.1.1",
88 |   "data": {
89 |     "items": [
90 |       {
91 |         "id": "n_records",
92 |         "title": "Number of records",
93 |         "description": "The number of records in the dataset.",
94 |         "value": 6189
95 |       },
96 |       {
97 |         "id": "n_relevant",
98 |         "title": "Number of relevant records",
99 |         "description": "The number of relevant records in the dataset.",
100 |         "value": 43
101 |       },
102 |       {
103 |         "id": "n_irrelevant",
104 |         "title": "Number of irrelevant records",
105 |         "description": "The number of irrelevant records in the dataset.",
106 |         "value": 6146
107 |       },
108 |       {
109 |         "id": "n_unlabeled",
110 |         "title": "Number of unlabeled records",
111 |         "description": "The number of unlabeled records in the dataset.",
112 |         "value": 0
113 |       },
114 |       {
115 |         "id": "n_missing_title",
116 |         "title": "Number of records with missing title",
117 |         "description": "The number of records in the dataset with missing title.",
118 |         "value": 5
119 |       },
120 |       {
121 |         "id": "n_missing_abstract",
122 |         "title": "Number of records with missing abstract",
123 |         "description": "The number of records in the dataset with missing abstract.",
124 |         "value": 764
125 |       },
126 |       {
127 |         "id": "n_duplicates",
128 |         "title": "Number of duplicate records (basic algorithm)",
129 |         "description": "The number of duplicate records in the dataset based on similar text.",
130 |         "value": 104
131 |       }
132 |     ]
133 |   }
134 | }
135 | ```
136 | 
137 | ### Data Convert
138 | 
139 | Convert the format of a dataset. For example, convert a RIS dataset into a
140 | CSV, Excel, or TAB dataset.
141 | 
142 | ```
143 | asreview data convert MY_DATASET.ris MY_OUTPUT.csv
144 | ```
145 | 
146 | ### Data Dedup
147 | 
148 | Remove duplicate records with a simple and straightforward deduplication
149 | [algorithm](https://asreview.readthedocs.io/en/latest/generated/asreview.ASReviewData.duplicated.html#asreview.ASReviewData.duplicated). The algorithm first removes all duplicates based on a persistent
150 | identifier (PID). It then concatenates the title and abstract, strips all
151 | non-alphanumeric characters, and removes the remaining duplicates (a conceptual sketch of this text-matching step is shown after the examples below).
152 | 
153 | ```
154 | asreview data dedup MY_DATASET.ris
155 | ```
156 | 
157 | Export the deduplicated dataset to a file (`output.csv`)
158 | 
159 | ```
160 | asreview data dedup MY_DATASET.ris -o output.csv
161 | ```
162 | 
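For intuition, here is the text-matching step as a simplified, conceptual sketch in pandas (the `title` and `abstract` column names match the ASReview data format; this is an illustration, not the actual implementation):

```python
import pandas as pd

# two records that differ only in punctuation and capitalization
df = pd.DataFrame(
    {
        "title": ["A study of X", "A Study of X!"],
        "abstract": ["Some results.", "Some results"],
    }
)

# concatenate title and abstract, strip non-alphanumeric characters, lowercase
text = (
    (df["title"].fillna("") + " " + df["abstract"].fillna(""))
    .str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
    .str.lower()
    .str.strip()
)

# keep only the first record of each duplicate pair
print(df[~text.duplicated()])
```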
163 | By default, the PID is set to 'doi'. The `dedup` function offers the option to
164 | use a different PID. Consider a dataset with PubMed identifiers (`PMID`); this
165 | identifier can then be used for deduplication.
166 | 
167 | ```
168 | asreview data dedup MY_DATASET.csv -o output.csv --pid PMID
169 | ```
170 | 
171 | Using the `van_de_schoot_2018` dataset from the [benchmark
172 | platform](https://github.com/asreview/systematic-review-datasets).
173 | 
174 | ```bash
175 | asreview data dedup synergy:van_de_schoot_2018 -o van_de_schoot_2018_dedup.csv
176 | ```
177 | ```
178 | Found 104 duplicates in dataset with 6189 records.
179 | ```
180 | 
181 | We can also choose to deduplicate based on the similarity of the title and abstract, instead of checking for an exact match. This way we can find duplicates that have small differences but are actually the same record (for example, an additional comma or a fixed typo). This can be done by using the `--similar` flag. This process takes about 4 seconds on a dataset of about 2068 entries.
182 | 
183 | ```bash
184 | asreview data dedup neurips_2020.tsv --similar
185 | ```
186 | ```
187 | Not using doi for deduplication because there is no such data.
188 | Deduplicating: 100%|████████████████████████████████████| 2068/2068 [00:03<00:00, 531.93it/s]
189 | Found 2 duplicates in dataset with 2068 records.
190 | ```
191 | 
192 | If we want to check which entries were found as duplicates, we can use the `--verbose` flag. This will print the lines of the dataset that were found as duplicates, as well as the difference between them. Any text that has to be removed from the first entry to become the second one is shown in red with a strikethrough, and any text that has to be added to the first entry is shown in green. All text that is the same in both entries is dimmed.
193 | 
194 | ```bash
195 | asreview data dedup neurips_2020.tsv --similar --verbose
196 | ```
197 | 
198 | ![Verbose drop similar](./dedup_similar.png)
199 | 
200 | The similarity threshold can be set with the `--threshold` flag. The default similarity threshold is `0.98`. We can also choose to only use the title for deduplication by using the `--title_only` flag.
201 | 
202 | ```bash
203 | asreview data dedup neurips_2020.tsv --similar --threshold 0.98 --title_only
204 | ```
205 | ```
206 | Not using doi for deduplication because there is no such data.
207 | Deduplicating: 100%|████████████████████████████████████| 2068/2068 [00:02<00:00, 770.74it/s]
208 | Found 4 duplicates in dataset with 2068 records.
209 | ```
210 | 
211 | Note that you might have to adjust the similarity threshold if you choose to only use the title for deduplication. The similarity score is calculated using the [SequenceMatcher](https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher) class from the `difflib` package, as the ratio of the number of matching characters to the total number of characters in the two strings. For example, the similarity score between the strings "hello" and "hello world" is 0.625. By default, we use the [real_quick_ratio](https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.real_quick_ratio) and [quick_ratio](https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.quick_ratio) methods, which are faster and usually good enough, but less accurate. If you want to use the `ratio` method as well, you can use the `--strict` flag.
212 | 
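To get a feel for these scores, you can try the three methods directly in plain Python (standard library only):

```python
from difflib import SequenceMatcher

m = SequenceMatcher(None, "hello", "hello world")

print(m.real_quick_ratio())  # 0.625 -- cheap upper bound on the similarity
print(m.quick_ratio())       # 0.625 -- tighter upper bound
print(m.ratio())             # 0.625 -- exact Ratcliff-Obershelp score
```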
213 | Now, if we want to discard stopwords during deduplication (for a stricter check on the important words), we can pass a language to the `--stopwords_language` flag, for example `--stopwords_language english`. The supported languages are the ones supported by the [nltk](https://www.nltk.org/index.html) package. To check the list of available languages, you can run the following commands in your Python environment:
214 | 
215 | ```python
216 | from nltk.corpus import stopwords
217 | print(stopwords.fileids())
218 | ```
219 | ```
220 | ['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']
221 | ```
222 | 
223 | ### Data Vstack (Experimental)
224 | 
225 | Vertical stacking: combine as many datasets in the same file format as you want into a single dataset.
226 | 
227 | ❗ Vstack is an experimental feature. We would love to hear your feedback.
228 | Please keep in mind that this feature can change in the future.
229 | 
230 | Stack several datasets on top of each other:
231 | ```
232 | asreview data vstack output.csv MY_DATASET_1.csv MY_DATASET_2.csv MY_DATASET_3.csv
233 | ```
234 | Here, three datasets are exported into a single dataset `output.csv`.
235 | The output path can be followed by any number of datasets to be stacked.
236 | 
237 | This is an example using the [demo datasets](https://github.com/asreview/asreview-datatools/tree/master/tests/demo_data):
238 | 
239 | ```bash
240 | asreview data vstack output.ris dataset_1.ris dataset_2.ris
241 | ```
242 | 
243 | 
244 | ### Data Compose (Experimental)
245 | 
246 | With compose, datasets containing records with different labels (or no
247 | labels) can be assembled into a single dataset.
248 | 
249 | ❗ Compose is an experimental feature. We would love to hear your feedback.
250 | Please keep in mind that this feature can change in the future.
251 | 
252 | Overview of possible input files and corresponding properties; use at least
253 | one of the following arguments:
254 | 
255 | | Arguments            | Action                                      |
256 | |----------------------|---------------------------------------------|
257 | | `--relevant`, `-r`   | Label all records from this dataset as `relevant` in the composed dataset.   |
258 | | `--irrelevant`, `-i` | Label all records from this dataset as `irrelevant` in the composed dataset. |
259 | | `--labeled`, `-l`    | Use existing labels from this dataset in the composed dataset.               |
260 | | `--unlabeled`, `-u`  | Remove all labels from this dataset in the composed dataset.                 |
261 | 
262 | The output path should always be specified.
263 | 
264 | Duplicate checking is based on title/abstract and a persistent identifier
265 | (PID) like the digital object identifier (DOI). By default, `doi` is used as
266 | PID. It is possible to use the flag `--pid` to specify a persistent
267 | identifier other than `doi`. In case duplicate records are detected, the user
268 | is warned, and the conflicting records are shown. To specify what happens in
269 | case of conflicts, use the `--conflict_resolve`/`-c` flag.
This is set to 270 | `keep_one` by default, options are: 271 | 272 | | Resolve method | Action in case of conflict | 273 | |----------------|-----------------------------------------------------------------------------------------| 274 | | `keep_one` | Keep one label, using `--hierarchy` to determine which label to keep | 275 | | `keep_all` | Keep conflicting records as duplicates in the composed dataset (ignoring `--hierarchy`) | 276 | | `abort` | Abort | 277 | 278 | 279 | In case of an ambiguously labeled record (e.g., one record with two different 280 | labels), use `--hierarchy` to specify a hierarchy of labels. Pass the letters 281 | `r` (relevant), `i` (irrelevant), and `u` (unlabeled) in any order to set 282 | label hierarchy. By default, the order is `riu` meaning that relevant labels 283 | are prioritized over irrelevant and unlabeled, and irrelevant labels are 284 | prioritized over unlabeled ones. 285 | 286 | 287 | Asume you have records in `MY_DATASET_1.ris` from which you want to keep all 288 | existing labels and records in `MY_DATASET_2.ris` which you want to keep 289 | unlabeled. Both datasets can be composed into a single dataset using: 290 | 291 | ```bash 292 | asreview data compose composed_output.ris -l DATASET_1.ris -u DATASET_2.ris --hierarchy uir -c abort 293 | ``` 294 | Because of the flag `-c abort` in case of conflicting/contradictory labels, 295 | the user is warned, records with inconsistent labels are shown, and the script 296 | is aborted. The flag `--hierarchy uir` results in the following hierarch if any 297 | duplicate ambiguously labeled records exist: unlabeled is prioritized over 298 | irrelevant and relevant labels, and irrelevant labels are prioritized over 299 | relevant labels. 300 | 301 | ## Snowball 302 | 303 | ASReview Datatools supports snowballing via the `asreview data snowball` subcommand. 304 | It can perform both backwards (outgoing citations) and forwards (incoming citations) 305 | snowballing. The tool works by searching the [OpenAlex](https://openalex.org/) database 306 | for citation data. An example usage would be: 307 | 308 | ```bash 309 | asreview data snowball input_dataset.csv output_dataset.csv --forward 310 | ``` 311 | 312 | This performs forwards snowballing on `input_dataset.csv` and writes the results to 313 | `output_dataset.csv`. For this to work it is necessary that the input dataset contains 314 | a column with DOI's or a column called `openalex_id` containing OpenAlex work 315 | identifiers. The output dataset will contain the columns `id`, `doi`, `title`, `abstract`, `referenced_works` and `publication_date`. In the case of forward snowballing it will 316 | contain all works in OpenAlex that have a reference to one of the included works in the 317 | input dataset. In the case of backward snowballing it will contain all works in OpenAlex 318 | with referenced by one of the included works of the input dataset. 319 | 320 | If you want to find references for all records in your dataset, instead of just the included works, you can include the flag `--all`, so for example: 321 | 322 | ```bash 323 | asreview data snowball input_dataset.csv output_dataset.csv --backward --all 324 | ``` 325 | 326 | One thing to note is that OpenAlex will handle data requests faster if the sender sends along their email with the request (see [OpenAlex Polite Pool](https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication#the-polite-pool 327 | )), you can to this using the `--email` argument. 
An example would be:
328 | 
329 | ```bash
330 | asreview data snowball input_dataset.csv output_dataset.csv --backward --email my_email@provider.com
331 | ```
332 | 
333 | ## Sample
334 | 
335 | This datatool is used to sample old, random, and new records from your dataset by using the `asreview data sample` command. The sampled records are then stored in an output file. This can be useful for detecting concept drift, meaning that the words used for certain concepts change over time. This script assumes that the dataset includes a column named `publication_year`. An example would be:
336 | 
337 | ```bash
338 | asreview data sample input_dataset.xlsx output_dataset.xlsx 50
339 | ```
340 | This samples the `50` oldest and `50` newest records from `input_dataset.xlsx` and samples `50` records randomly (without overlap from the old and new partitions!). The resulting 150 records are written to `output_dataset.xlsx`.
341 | 
342 | ## License
343 | 
344 | This extension is published under the [MIT license](/LICENSE).
345 | 
346 | ## Contact
347 | 
348 | This extension is part of the ASReview project ([asreview.ai](https://asreview.ai)). It is maintained by the
349 | maintainers of ASReview LAB. See [ASReview
350 | LAB](https://github.com/asreview/asreview) for contact information and more
351 | resources.
352 | 
--------------------------------------------------------------------------------
/Tutorials.md:
--------------------------------------------------------------------------------
1 | # Tutorials
2 | 
3 | ---
4 | Below are several examples to illustrate how to use `ASReview-datatools`. Make
5 | sure to have installed
6 | [asreview-datatools](https://github.com/asreview/asreview-datatools) and
7 | [ASReview LAB](https://asreview.nl/download/) v1.1 or higher.
8 | 
9 | Overview of the tutorials:
10 | 1. [Update systematic review](#update-systematic-review)
11 | 2. [Add prior knowledge](#add-prior-knowledge)
12 | 3. [Prepare a dataset for a simulation study](#prepare-a-dataset-for-a-simulation-study)
13 | 
14 | 
15 | Allowed data formats are described in the [ASReview
16 | documentation](https://asreview.readthedocs.io/en/latest/data_format.html).
17 | ASReview converts the labeling decisions in [RIS files](https://asreview.readthedocs.io/en/latest/data_format.html#ris-file-format) to a binary variable:
18 | irrelevant as `0` and relevant as `1`. Records marked as unseen or with
19 | missing labeling decisions are converted to `-1`.
20 | 
21 | ---
22 | 
23 | ## Update Systematic Review
24 | 
25 | Assume you are working on a systematic review and you want to update the
26 | review with newly available records. The original data is stored in
27 | `MY_LABELED_DATASET.csv` and the file contains a
28 | [column](https://asreview.readthedocs.io/en/latest/data_labeled.html#label-format)
29 | containing the labeling decisions. In order to update the systematic review,
30 | you run the original search query again but with a new date. You save the
31 | newly found records in `SEARCH_UPDATE.ris`.
32 | 
33 | 
34 | In the command line interface (CLI), navigate to the directory where the
35 | dataset(s) are stored:
36 | 
37 | ```bash
38 | cd Parent_directory
39 | ```
40 | 
41 | ### Preparing your data
42 | 
43 | The original data and the newly found records are in different file
44 | formats (CSV and RIS). You can convert files to the same file format using the
45 | `convert` script.
For example, to convert `SEARCH_UPDATE.ris` to CSV format,
46 | run the following from the directory where the
47 | dataset(s) are stored:
48 | 
49 | ```bash
50 | asreview data convert SEARCH_UPDATE.ris SEARCH_UPDATE.csv
51 | ```
52 | 
53 | Duplicate records can be removed with the `dedup` script. The algorithm
54 | removes duplicates using the Digital Object Identifier
55 | ([DOI](https://www.doi.org/)) and the title plus abstract.
56 | 
57 | ```bash
58 | asreview data dedup SEARCH_UPDATE.csv -o SEARCH_UPDATE_DEDUP.csv
59 | ```
60 | 
61 | This can also be done considering a similarity threshold between the titles and abstracts.
62 | 
63 | ```bash
64 | asreview data dedup SEARCH_UPDATE.csv -o SEARCH_UPDATE_DEDUP.csv --similar
65 | ```
66 | 
67 | ### Describe input
68 | 
69 | If you want to see descriptive info on your input datasets, run these commands:
70 | 
71 | ```bash
72 | asreview data describe MY_LABELED_DATASET.csv -o MY_LABELED_DATASET_description.json
73 | asreview data describe SEARCH_UPDATE_DEDUP.csv -o SEARCH_UPDATE_description.json
74 | ```
75 | The results will be exported to `MY_LABELED_DATASET_description.json` and `SEARCH_UPDATE_description.json`.
76 | 
77 | ### Compose datasets
78 | 
79 | Use the `compose` script to add `SEARCH_UPDATE_DEDUP.csv` to `MY_LABELED_DATASET.csv`:
80 | 
81 | ```bash
82 | asreview data compose updated_search.csv -l MY_LABELED_DATASET.csv -u SEARCH_UPDATE_DEDUP.csv
83 | ```
84 | The flag `-l` means the labels in `MY_LABELED_DATASET.csv` will be kept.
85 | 
86 | The flag `-u` means all records from `SEARCH_UPDATE_DEDUP.csv` will be
87 | added as unlabeled to the composed dataset.
88 | 
89 | If a record exists in both datasets, the record containing a
90 | label is kept; see the default [conflict resolving
91 | strategy](https://github.com/asreview/asreview-datatools#data-compose-experimental).
92 | To keep both records (with and without label), use
93 | 
94 | ```bash
95 | asreview data compose updated_search.csv -l MY_LABELED_DATASET.csv -u SEARCH_UPDATE_DEDUP.csv -c keep_all
96 | ```
97 | 
98 | The composed dataset will be exported to `updated_search.csv`.
99 | 
100 | ### Describe output
101 | 
102 | To see descriptive info on the composed dataset:
103 | 
104 | ```bash
105 | asreview data describe updated_search.csv -o updated_search_description.json
106 | ```
107 | The result will be exported to `updated_search_description.json`.
108 | 
109 | ### Continue screening in ASReview lab
110 | 
111 | The [partially
112 | labeled](https://asreview.readthedocs.io/en/latest/data_labeled.html#partially-labeled-data)
113 | data, `updated_search.csv`, can be uploaded to [ASReview lab - Oracle
114 | mode](https://asreview.readthedocs.io/en/latest/project_create.html). The
115 | labels will be recognized by ASReview and used to train the first iteration
116 | of the model, and you can continue screening all unlabeled records found in the
117 | new search.
118 | 
119 | ---
120 | ## Add prior knowledge
121 | 
122 | Assume you have just executed a search query for a systematic review and you
123 | want to use a pre-defined set of relevant and irrelevant records as training
124 | data. The search results are stored in `SEARCH_RESULTS.ris`, and the records
125 | you already know to be relevant/irrelevant are saved in
126 | `PRIOR_RELEVANT.ris` and `PRIOR_IRRELEVANT.ris` respectively.
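As a reminder of the label encoding used throughout these tutorials: relevant records become `1`, irrelevant records `0`, and unlabeled records `-1`. After running the compose step below, you can verify this with a minimal sketch using the `asreview` Python API (the file name is the one produced later in this tutorial):

```python
from asreview.data import load_data

# inspect the composed dataset created in the compose step below
asdata = load_data("search_with_priors.ris")
print(asdata.labels)  # 1 = relevant, 0 = irrelevant, -1 = unlabeled
```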
127 | 
128 | 
129 | In the command line interface (CLI), navigate to the directory where the dataset(s) are stored:
130 | ```bash
131 | cd Parent_directory
132 | ```
133 | ### Describe input
134 | If you want to see descriptive info on your input datasets, run these commands:
135 | ```bash
136 | asreview data describe SEARCH_RESULTS.ris -o SEARCH_RESULTS_description.json
137 | asreview data describe PRIOR_RELEVANT.ris -o PRIOR_RELEVANT_description.json
138 | asreview data describe PRIOR_IRRELEVANT.ris -o PRIOR_IRRELEVANT_description.json
139 | ```
140 | 
141 | The results will be exported to `SEARCH_RESULTS_description.json`,
142 | `PRIOR_RELEVANT_description.json` and `PRIOR_IRRELEVANT_description.json`.
143 | 
144 | 
145 | ### Compose datasets
146 | To create one dataset with labels only for the training data to be used in ASReview, run:
147 | 
148 | ```bash
149 | asreview data compose search_with_priors.ris -u SEARCH_RESULTS.ris -r PRIOR_RELEVANT.ris -i PRIOR_IRRELEVANT.ris
150 | ```
151 | 
152 | The flag `-r` means all records from `PRIOR_RELEVANT.ris` will be added as
153 | relevant records to the composed dataset.
154 | 
155 | The flag `-i` means all records from `PRIOR_IRRELEVANT.ris` will be added
156 | as irrelevant.
157 | 
158 | The flag `-u` means all other records from `SEARCH_RESULTS.ris` will be
159 | added as unlabeled.
160 | 
161 | If any duplicate records exist across the datasets, by default the order of
162 | keeping labels is:
163 | 1. relevant
164 | 2. irrelevant
165 | 3. unlabeled
166 | 
167 | You can configure the behavior in resolving conflicting labels by setting the
168 | hierarchy differently. To do so, pass the letters r (relevant), i
169 | (irrelevant), and u (unlabeled) in any order to, for example, `--hierarchy
170 | uir`.
171 | 
172 | 
173 | The composed dataset will be exported to `search_with_priors.ris`.
174 | 
175 | ### Describe output
176 | To see descriptive info on the composed dataset:
177 | 
178 | ```bash
179 | asreview data describe search_with_priors.ris -o search_with_priors_description.json
180 | ```
181 | 
182 | The result will be exported to `search_with_priors_description.json`.
183 | 
184 | 
185 | ### Start screening in ASReview lab
186 | 
187 | The [partially
188 | labeled](https://asreview.readthedocs.io/en/latest/data_labeled.html#partially-labeled-data)
189 | data, `search_with_priors.ris`, can be uploaded to [ASReview lab - Oracle
190 | mode](https://asreview.readthedocs.io/en/latest/project_create.html). The
191 | labels will be recognized by ASReview and used to train the first iteration
192 | of the model, and you can start screening the remaining
193 | unlabeled records.
194 | 
195 | 
196 | ---
197 | ## Prepare a dataset for a simulation study
198 | 
199 | Assume you want to use the [simulation
200 | mode](https://asreview.readthedocs.io/en/latest/simulation_overview.html) of
201 | ASReview but the data is not stored in one single file containing the metadata
202 | and labeling decisions as required by ASReview.
203 | 
204 | Suppose the following files are available:
205 | 
206 | - `SCREENED.ris`: all records that were screened
207 | - `RELEVANT.ris`: the subset of relevant records after manually screening all the records.
208 | 
209 | You need to compose the files into a single file where all records from
210 | `RELEVANT.ris` are relevant and all other records are irrelevant.
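Conceptually, this composition assigns labels as in the following toy sketch (made-up DOIs; `included` is the label column used by ASReview Datatools; this is an illustration, not the actual tool code):

```python
import pandas as pd

# everything in SCREENED is irrelevant (0), unless it also appears in RELEVANT (1)
screened = pd.DataFrame({"doi": ["10.1/a", "10.1/b", "10.1/c"]})
relevant = pd.DataFrame({"doi": ["10.1/b"]})

screened["included"] = screened["doi"].isin(relevant["doi"]).astype(int)
print(screened)
#       doi  included
# 0  10.1/a         0
# 1  10.1/b         1
# 2  10.1/c         0
```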
211 | 
212 | In the command line interface (CLI), navigate to the directory where the
213 | dataset(s) are stored:
214 | 
215 | ```bash
216 | cd Parent_directory
217 | ```
218 | 
219 | ### Describe input
220 | 
221 | If you want to see descriptive info on your input datasets, run these commands:
222 | 
223 | ```bash
224 | asreview data describe SCREENED.ris -o SCREENED_description.json
225 | asreview data describe RELEVANT.ris -o RELEVANT_description.json
226 | ```
227 | The results will be exported to `SCREENED_description.json` and `RELEVANT_description.json`.
228 | 
229 | ### Compose datasets
230 | 
231 | Use the `compose` script to compose a new dataset from `SCREENED.ris` and `RELEVANT.ris`:
232 | 
233 | ```bash
234 | asreview data compose screened_with_labels.ris -i SCREENED.ris -r RELEVANT.ris
235 | ```
236 | 
237 | The flag `-r` means all records from `RELEVANT.ris` will be added as
238 | relevant to the composed dataset.
239 | 
240 | The flag `-i` means all other records from `SCREENED.ris` will be added as
241 | irrelevant.
242 | 
243 | The composed dataset will be exported to `screened_with_labels.ris`.
244 | 
245 | ### Describe output
246 | 
247 | To see descriptive info on the composed dataset:
248 | 
249 | ```bash
250 | asreview data describe screened_with_labels.ris -o screened_with_labels_description.json
251 | ```
252 | The result will be exported to `screened_with_labels_description.json`.
253 | 
254 | ### Run simulation in ASReview lab
255 | 
256 | The resulting file `screened_with_labels.ris` can be uploaded to [ASReview lab
257 | Simulation
258 | mode](https://asreview.readthedocs.io/en/latest/simulation_webapp.html). This
259 | allows you to simulate the screening procedure of the systematic review as if
260 | it were carried out using ASReview lab.
261 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from asreviewcontrib.datatools._version import __version__
3 |     from asreviewcontrib.datatools._version import __version_tuple__
4 | except ImportError:
5 |     __version__ = "0.0.0"
6 |     __version_tuple__ = (0, 0, 0)
7 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/compose.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import warnings
3 | from pathlib import Path
4 | 
5 | import pandas as pd
6 | from asreview import ASReviewData
7 | from asreview.data.base import load_data
8 | 
9 | 
10 | def _check_order_arg(order):
11 |     # if no hierarchy is specified, set to default: "riu"
12 |     if order is None:
13 |         return "riu"
14 | 
15 |     allowed_orders = ["riu", "rui", "uri", "uir", "iru", "iur"]
16 |     if order in allowed_orders:
17 |         return order
18 |     else:
19 |         raise ValueError(
20 |             f"hierarchy '{order}' not found, should be one of the"
21 |             f" following: {allowed_orders}"
22 |         )
23 | 
24 | 
25 | def _check_resolve_arg(resolve):
26 |     # if no resolve method is specified, set to default: "keep_one"
27 |     if resolve is None:
28 |         return "keep_one"
29 | 
30 |     allowed_resolve = ["keep_one", "keep_all", "abort"]
31 |     if resolve in allowed_resolve:
32 |         return resolve
33 |     else:
34 |         raise ValueError(
35 |             f"conflict_resolve '{resolve}' not found, should be one "
36 |             f"of the following: {allowed_resolve}"
37 |         )
38 | 
39 | 
40 | def _check_suffix(input_files, output_file):
41 |     # Also raises ValueError on URLs that do not end with a file extension
42 |     suffixes = [Path(item).suffix for item in input_files if item is not None]
43 |     suffixes.append(Path(output_file).suffix)
44 | 
45 |     set_ris = {".txt", ".ris"}
46 |     set_tabular = {".csv", ".tab", ".tsv", ".xlsx"}
47 |     set_suffixes = set(suffixes)
48 | 
49 |     if len(set_suffixes) > 1:
50 |         if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
51 |             raise ValueError(
52 |                 "Files with different file types were given; all input files, as"
53 |                 " well as the output file, should be of the same type."
54 |             )
55 | 
56 | 
57 | def _check_label_errors(as_lab, path_lab):
58 |     if as_lab is not None:
59 |         if as_lab.labels is None:
60 |             warnings.warn(
61 |                 f"'{path_lab}' was passed as a labeled dataset but no labels were"
62 |                 " found, continuing with its records marked as unlabeled. If this is"
63 |                 " not correct, check if your data format complies with:"
64 |                 " https://asreview.readthedocs.io/en/latest/data_format.html",
65 |                 stacklevel=1,
66 |             )
67 | 
68 | 
69 | def _append_df(list_df, as_obj, label):
70 |     # retrieve part of dataframe with label -1, 0 or 1
71 |     df_slice = as_obj.df[as_obj.labels == label].reset_index(drop=True)
72 | 
73 |     if not df_slice.empty:
74 |         list_df.append(df_slice)
75 | 
76 | 
77 | def _concat_label(list_df, label, pid="doi"):
78 |     # if there are any dataframes with the given label, concatenate and drop
79 |     # duplicates on pid and title/abstract
80 |     if list_df:
81 |         df_all = pd.concat(list_df).reset_index(drop=True)
82 |         df_all["included"] = label
83 |         n_total = len(df_all)
84 | 
85 |         df_all = ASReviewData(df=df_all).drop_duplicates(pid=pid).reset_index(drop=True)
86 | 
87 |         n_total_dedup = n_total - len(df_all)
88 |         print(
89 |             f"Detected {n_total} records with label '{label}', of which"
90 |             f" {n_total_dedup} duplicate records with the same label were removed."
91 | ) 92 | else: 93 | df_all = pd.DataFrame() 94 | 95 | return df_all 96 | 97 | 98 | def create_composition( 99 | rel_path=None, 100 | irr_path=None, 101 | lab_path=None, 102 | unl_path=None, 103 | pid="doi", 104 | order="riu", 105 | resolve="keep_one", 106 | ): 107 | # load all input files and URLs into ASReviewData objects, fill with None 108 | # if input was not specified 109 | input_files = [rel_path, irr_path, lab_path, unl_path] 110 | as_rel, as_irr, as_lab, as_unl = ( 111 | load_data(item) if item is not None else None for item in input_files 112 | ) 113 | 114 | # check whether input files are correctly labeled 115 | _check_label_errors(as_lab, lab_path) 116 | 117 | # create lists to append dataframes with a specific label to 118 | list_df_rel, list_df_irr, list_df_unl = [], [], [] 119 | 120 | # split labeled input data in relevant, irrelevant and unlabeled and add 121 | # to list of dataframes for that label 122 | if as_lab is not None: 123 | if as_lab.labels is not None: 124 | _append_df(list_df_rel, as_lab, 1) 125 | _append_df(list_df_irr, as_lab, 0) 126 | _append_df(list_df_unl, as_lab, -1) 127 | else: 128 | list_df_unl.append(as_lab.df) 129 | 130 | # add dataframe to list of dataframes for that label 131 | if as_rel is not None: 132 | list_df_rel.append(as_rel.df) 133 | if as_irr is not None: 134 | list_df_irr.append(as_irr.df) 135 | if as_unl is not None: 136 | list_df_unl.append(as_unl.df) 137 | 138 | # concatenate all dataframes with the same label, drop duplicates and map 139 | # them in a dictionary 140 | dict_dfs = { 141 | "r": _concat_label(list_df_rel, 1, pid), 142 | "i": _concat_label(list_df_irr, 0, pid), 143 | "u": _concat_label(list_df_unl, -1, pid), 144 | } 145 | 146 | # map letters to corresponding term 147 | dict_terms = {"r": "relevant", "i": "irrelevant", "u": "unlabeled"} 148 | 149 | # concatenate in specified order, only the first duplicate entry is kept 150 | as_conflict = ASReviewData( 151 | df=pd.concat( 152 | [dict_dfs[order[0]], dict_dfs[order[1]], dict_dfs[order[2]]] 153 | ).reset_index(drop=True) 154 | ) 155 | 156 | # check for label conflicts 157 | df_conflicting_dups = as_conflict.df[as_conflict.duplicated(pid)] 158 | if len(df_conflicting_dups) > 0: 159 | as_conflicts_only = ASReviewData(df=df_conflicting_dups.reset_index(drop=True)) 160 | # create a dataframe with the relevant info for the user 161 | if pid in as_conflicts_only.df.columns: 162 | df_info_conflicts = pd.DataFrame( 163 | { 164 | pid: as_conflicts_only.df[pid].fillna(""), 165 | "Title": as_conflicts_only.title, 166 | "Abstract": as_conflicts_only.abstract, 167 | } 168 | ) 169 | else: 170 | df_info_conflicts = pd.DataFrame( 171 | { 172 | "Title": as_conflicts_only.title, 173 | "Abstract": as_conflicts_only.abstract, 174 | } 175 | ) 176 | 177 | # pandas settings to print properly 178 | with pd.option_context( 179 | "display.max_rows", 180 | None, 181 | "display.max_columns", 182 | 3, 183 | "max_colwidth", 184 | 40, 185 | "display.width", 186 | 500, 187 | "display.colheader_justify", 188 | "left", 189 | ): 190 | print( 191 | f"\nSome records have inconsistent labels in the input files. This may" 192 | " be intentional because you are trying to overwrite labels in an input" 193 | " file with labels from another input file. 
However, it may also be"
194 |                 " because some records are unintentionally labeled inconsistently.\n\n"
195 |                 "The following records have inconsistent labels in the input files:\n"
196 |                 f"{df_info_conflicts}\n"
197 |             )
198 | 
199 |         if resolve == "abort":
200 |             raise ValueError("Abort composing because inconsistent labels were found.")
201 | 
202 |         elif resolve == "keep_one":
203 |             warnings.warn(
204 |                 "Continuing, keeping one label for records with inconsistent labels,"
205 |                 " resolving conflicts using the following hierarchy:"
206 |                 f"\n1. {dict_terms[order[0]]}\n2. {dict_terms[order[1]]}"
207 |                 f"\n3. {dict_terms[order[2]]}",
208 |                 stacklevel=1,
209 |             )
210 |             df_composed = as_conflict.drop_duplicates(pid=pid).reset_index(drop=True)
211 | 
212 |         elif resolve == "keep_all":
213 |             warnings.warn(
214 |                 "Continuing, keeping all labels for duplicate records with inconsistent"
215 |                 " labels.",
216 |                 stacklevel=1,
217 |             )
218 |             df_composed = as_conflict.df
219 | 
220 |     else:
221 |         df_composed = as_conflict.df
222 | 
223 |     # move included column to the end of dataframe
224 |     included = df_composed.pop("included")
225 |     df_composed = df_composed.assign(included=included)
226 | 
227 |     return df_composed
228 | 
229 | 
230 | def _output_composition(final_df, output_file):
231 |     # prepare collected labels to pass to the output file
232 |     labels = [[index, row["included"]] for index, row in final_df.iterrows()]
233 |     as_composed = ASReviewData(df=final_df)
234 | 
235 |     as_composed.to_file(output_file, labels=labels)
236 | 
237 |     print(f"Finished, exported composed dataset to {output_file}.")
238 | 
239 | 
240 | def compose(
241 |     output_file, rel, irr, lab, unl, pid="doi", order="riu", resolve="keep_one"
242 | ):
243 |     # check whether all input has the same file extension
244 |     _check_suffix([rel, irr, lab, unl], output_file)
245 | 
246 |     df_composition = create_composition(
247 |         rel, irr, lab, unl, pid=pid, order=order, resolve=resolve
248 |     )
249 |     _output_composition(df_composition, output_file)
250 | 
251 | 
252 | def _parse_arguments_compose():
253 |     parser = argparse.ArgumentParser(prog="asreview data compose")
254 |     parser.add_argument("output_path", type=str, help="The output file path.")
255 |     parser.add_argument(
256 |         "--relevant", "-r", type=str, help="A dataset with relevant records."
257 |     )
258 |     parser.add_argument(
259 |         "--irrelevant", "-i", type=str, help="A dataset with irrelevant records."
260 |     )
261 |     parser.add_argument("--labeled", "-l", type=str, help="A labeled dataset.")
262 |     parser.add_argument("--unlabeled", "-u", type=str, help="An unlabeled dataset.")
263 |     parser.add_argument(
264 |         "--hierarchy",
265 |         dest="hierarchy",
266 |         type=_check_order_arg,
267 |         default="riu",
268 |         help="Hierarchy of labels in case of duplicates. Default: riu.",
269 |     )
270 |     parser.add_argument(
271 |         "--conflict_resolve",
272 |         "-c",
273 |         dest="conflict_resolve",
274 |         type=_check_resolve_arg,
275 |         default="keep_one",
276 |         help="Method for dealing with conflicting labels.",
277 |     )
278 |     parser.add_argument(
279 |         "--pid",
280 |         type=str,
281 |         default="doi",
282 |         help="Persistent identifier used for deduplication. Default: doi.",
" "Default: doi.", 283 | ) 284 | return parser 285 | -------------------------------------------------------------------------------- /asreviewcontrib/datatools/convert.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from asreview.data import ASReviewData 4 | 5 | 6 | def convert(input_path, output_path): 7 | # read data in ASReview data object 8 | asdata = ASReviewData.from_file(input_path) 9 | 10 | asdata.to_file(output_path) 11 | 12 | 13 | def _parse_arguments_convert(): 14 | parser = argparse.ArgumentParser(prog="asreview data convert") 15 | parser.add_argument("input_path", type=str, help="The file path of the dataset.") 16 | parser.add_argument("output_path", type=str, help="The file path of the dataset.") 17 | return parser 18 | -------------------------------------------------------------------------------- /asreviewcontrib/datatools/dedup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from difflib import SequenceMatcher 3 | 4 | import ftfy 5 | import pandas as pd 6 | from asreview import ASReviewData 7 | from pandas.api.types import is_object_dtype 8 | from pandas.api.types import is_string_dtype 9 | from rich.console import Console 10 | from rich.text import Text 11 | from tqdm import tqdm 12 | 13 | 14 | def _print_similar_list( 15 | similar_list: list[tuple[int, int]], 16 | data: pd.Series, 17 | pid: str, 18 | pids: pd.Series = None, 19 | ) -> None: 20 | print_seq_matcher = SequenceMatcher() 21 | console = Console() 22 | 23 | if pids is not None: 24 | print(f"Found similar titles or same {pid} at lines:") 25 | else: 26 | print("Found similar titles at lines:") 27 | 28 | for i, j in similar_list: 29 | print_seq_matcher.set_seq1(data.iloc[i]) 30 | print_seq_matcher.set_seq2(data.iloc[j]) 31 | text = Text() 32 | 33 | if pids is not None: 34 | text.append(f"\nLines {i + 1} and {j + 1} ", style="bold") 35 | if pids.iloc[i] == pids.iloc[j]: 36 | text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style="dim") 37 | else: 38 | text.append( 39 | f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', style="dim" 40 | ) 41 | 42 | else: 43 | text.append(f"\nLines {i + 1} and {j + 1}:\n", style="bold") 44 | 45 | for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes(): 46 | if tag == "replace": 47 | # add rich strikethrough 48 | text.append(f"{data.iloc[i][i1:i2]}", style="red strike") 49 | text.append(f"{data.iloc[j][j1:j2]}", style="green") 50 | if tag == "delete": 51 | text.append(f"{data.iloc[i][i1:i2]}", style="red strike") 52 | if tag == "insert": 53 | text.append(f"{data.iloc[j][j1:j2]}", style="green") 54 | if tag == "equal": 55 | text.append(f"{data.iloc[i][i1:i2]}", style="dim") 56 | 57 | console.print(text) 58 | 59 | print("") 60 | 61 | 62 | def _drop_duplicates_by_similarity( 63 | asdata: ASReviewData, 64 | pid: str, 65 | threshold: float = 0.98, 66 | title_only: bool = False, 67 | stopwords_language: str = None, 68 | strict: bool = False, 69 | verbose: bool = False, 70 | ) -> None: 71 | if title_only: 72 | data = asdata.df["title"] 73 | else: 74 | data = pd.Series(asdata.texts) 75 | 76 | symbols_regex = re.compile(r"[^ \w\d\-_]") 77 | spaces_regex = re.compile(r"\s+") 78 | 79 | # clean the data 80 | s = ( 81 | data.apply(ftfy.fix_text) 82 | .str.replace(symbols_regex, "", regex=True) 83 | .str.replace(spaces_regex, " ", regex=True) 84 | .str.lower() 85 | .str.strip() 86 | .replace("", None) 87 | ) 88 | 89 | if stopwords_language: 90 | try: 91 | from nltk.corpus 
91 |             from nltk.corpus import stopwords
92 | 
93 |             stopwords_set = set(stopwords.words(stopwords_language))
94 |         except LookupError:
95 |             import nltk
96 | 
97 |             nltk.download("stopwords")  # fetch the stopword corpus on first use
98 |             stopwords_set = set(stopwords.words(stopwords_language))
99 | 
100 |         stopwords_regex = re.compile(r"\b(?:" + "|".join(map(re.escape, stopwords_set)) + r")\b")
101 |         s = s.str.replace(stopwords_regex, "", regex=True)
102 | 
103 |     seq_matcher = SequenceMatcher()
104 |     duplicated = [False] * len(s)
105 | 
106 |     similar_list, pids = [], None
107 |     if pid in asdata.df.columns:
108 |         if is_string_dtype(asdata.df[pid]) or is_object_dtype(asdata.df[pid]):
109 |             pids = asdata.df[pid].str.strip().replace("", None)
110 |             if pid == "doi":
111 |                 pids = pids.str.lower().str.replace(
112 |                     r"^https?://(www\.)?doi\.org/", "", regex=True
113 |                 )
114 |         else:
115 |             pids = asdata.df[pid]
116 | 
117 |         for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
118 |             seq_matcher.set_seq2(text)
119 | 
120 |             # loop through the rest of the data if it has the same (normalized) pid or similar length
121 |             for j, t in s.iloc[i + 1 :][
122 |                 (pids == pids.iloc[i])
123 |                 | (abs(s.str.len() - len(text)) < 5)
124 |             ].items():
125 |                 seq_matcher.set_seq1(t)
126 | 
127 |                 # if the texts have the same pid or are similar enough,
128 |                 # mark the second one as duplicate
129 |                 if pids.iloc[i] == pids.iloc[j] or (
130 |                     seq_matcher.real_quick_ratio() > threshold
131 |                     and seq_matcher.quick_ratio() > threshold
132 |                     and (not strict or seq_matcher.ratio() > threshold)
133 |                 ):
134 |                     if not duplicated[j]:
135 |                         similar_list.append((i, j))
136 |                         duplicated[j] = True
137 | 
138 |     else:
139 |         print(f"Not using {pid} for deduplication because there is no such data.")
140 | 
141 |         for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
142 |             seq_matcher.set_seq2(text)
143 | 
144 |             # loop through the rest of the data if it has similar length
145 |             for j, t in s.iloc[i + 1 :][abs(s.str.len() - len(text)) < 5].items():
146 |                 seq_matcher.set_seq1(t)
147 | 
148 |                 # if the texts are similar enough, mark the second one as duplicate
149 |                 if (
150 |                     seq_matcher.real_quick_ratio() > threshold
151 |                     and seq_matcher.quick_ratio() > threshold
152 |                     and (not strict or seq_matcher.ratio() > threshold)
153 |                 ):
154 |                     if not duplicated[j]:
155 |                         similar_list.append((i, j))
156 |                         duplicated[j] = True
157 |     asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)
158 |     if verbose:
159 |         _print_similar_list(similar_list, data, pid, pids)
160 | 
161 | 
162 | def deduplicate_data(
163 |     asdata: ASReviewData,
164 |     output_path: str = None,
165 |     pid: str = "doi",
166 |     similar: bool = False,
167 |     threshold: float = 0.98,
168 |     title_only: bool = False,
169 |     stopwords_language: str = None,
170 |     strict: bool = False,
171 |     verbose: bool = False,
172 | ) -> None:
173 |     """Deduplicate an ASReview data object.
174 | 
175 |     Parameters
176 |     ----------
177 |     asdata : ASReviewData
178 |         The data object.
179 |     output_path : str, optional
180 |         If provided, the deduplicated data object is stored at this location. By
181 |         default None.
182 |     pid : str, optional
183 |         Persistent identifier to use for deduplication, by default "doi"
184 |     similar : bool, optional
185 |         Whether to deduplicate 'similar' records. The similarity of the records is
186 |         calculated using the `SequenceMatcher` from `difflib`. By default False.
187 |     threshold : float, optional
188 |         Threshold score above which two records are considered duplicate.
189 |         By default 0.98. Only applies if `similar` is set to `True`.
190 | title_only : bool, optional 191 | Only use the title for deduplication, by default False 192 | stopwords_language : str, optional 193 | Remove stopwords from this language before deduplicating, for example 'english'. 194 | By default None. Only applies if `similar` is set to `True`. 195 | strict : bool, optional 196 | Use a stricter algorithm to calculate the similarity between records. 197 | By default False. Only applies if `similar` is set to `True`. 198 | verbose : bool, optional 199 | Get verbose output during deduplicating. By default False. Only applies if 200 | `similar` is set to `True`. 201 | """ 202 | initial_length = len(asdata.df) 203 | 204 | if not similar: 205 | if pid not in asdata.df.columns: 206 | print(f"Not using {pid} for deduplication because there is no such data.") 207 | 208 | # retrieve deduplicated ASReview data object 209 | asdata.drop_duplicates(pid=pid, inplace=True) 210 | 211 | else: 212 | _drop_duplicates_by_similarity( 213 | asdata=asdata, 214 | pid=pid, 215 | threshold=threshold, 216 | title_only=title_only, 217 | stopwords_language=stopwords_language, 218 | strict=strict, 219 | verbose=verbose, 220 | ) 221 | 222 | if output_path: 223 | asdata.to_file(output_path) 224 | 225 | # count duplicates 226 | n_dup = initial_length - len(asdata.df) 227 | print(f"Found {n_dup} duplicates in dataset with {initial_length} records.") 228 | -------------------------------------------------------------------------------- /asreviewcontrib/datatools/describe.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import asreview 5 | from asreview.data import load_data 6 | from asreview.data.statistics import n_duplicates 7 | from asreview.data.statistics import n_irrelevant 8 | from asreview.data.statistics import n_missing_abstract 9 | from asreview.data.statistics import n_missing_title 10 | from asreview.data.statistics import n_records 11 | from asreview.data.statistics import n_relevant 12 | from asreview.data.statistics import n_unlabeled 13 | 14 | from asreviewcontrib.datatools import __version__ 15 | 16 | 17 | def describe(input_path, output_path=None): 18 | # read data in ASReview data object 19 | asdata = load_data(input_path) 20 | 21 | # based on https://google.github.io/styleguide/jsoncstyleguide.xml 22 | stats = { 23 | "asreviewVersion": asreview.__version__, 24 | "apiVersion": __version__, 25 | "data": { 26 | "items": [ 27 | { 28 | "id": "n_records", 29 | "title": "Number of records", 30 | "description": "The number of records in the dataset.", 31 | "value": n_records(asdata), 32 | }, 33 | { 34 | "id": "n_relevant", 35 | "title": "Number of relevant records", 36 | "description": "The number of relevant records in the dataset.", 37 | "value": n_relevant(asdata), 38 | }, 39 | { 40 | "id": "n_irrelevant", 41 | "title": "Number of irrelevant records", 42 | "description": "The number of irrelevant records in the dataset.", 43 | "value": n_irrelevant(asdata), 44 | }, 45 | { 46 | "id": "n_unlabeled", 47 | "title": "Number of unlabeled records", 48 | "description": "The number of unlabeled records in the dataset.", 49 | "value": n_unlabeled(asdata), 50 | }, 51 | { 52 | "id": "n_missing_title", 53 | "title": "Number of records with missing title", 54 | "description": ( 55 | "The number of records in the dataset with missing title." 
56 |                     ),
57 |                     "value": n_missing_title(asdata)[0],
58 |                 },
59 |                 {
60 |                     "id": "n_missing_abstract",
61 |                     "title": "Number of records with missing abstract",
62 |                     "description": (
63 |                         "The number of records in the dataset with missing abstract."
64 |                     ),
65 |                     "value": n_missing_abstract(asdata)[0],
66 |                 },
67 |                 {
68 |                     "id": "n_duplicates",
69 |                     "title": "Number of duplicate records (basic algorithm)",
70 |                     "description": (
71 |                         "The number of duplicate records in the dataset based on"
72 |                         " similar text."
73 |                     ),
74 |                     "value": n_duplicates(asdata),
75 |                 },
76 |             ]
77 |         },
78 |     }  # noqa
79 | 
80 |     if output_path:
81 |         with open(output_path, "w") as f:
82 |             json.dump(stats, f, indent=2)
83 | 
84 |     print(json.dumps(stats, indent=2))
85 | 
86 | 
87 | def _parse_arguments_describe():
88 |     parser = argparse.ArgumentParser(prog="asreview data describe")
89 |     parser.add_argument("input_path", type=str, help="The file path of the dataset.")
90 |     parser.add_argument(
91 |         "--output_path",
92 |         "-o",
93 |         default=None,
94 |         type=str,
95 |         help="The file path of the output JSON file.",
96 |     )
97 | 
98 |     return parser
99 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/entrypoint.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from asreview.data import load_data
4 | from asreview.entry_points import BaseEntryPoint
5 | 
6 | from asreviewcontrib.datatools import __version__
7 | from asreviewcontrib.datatools.compose import _parse_arguments_compose
8 | from asreviewcontrib.datatools.compose import compose
9 | from asreviewcontrib.datatools.convert import _parse_arguments_convert
10 | from asreviewcontrib.datatools.convert import convert
11 | from asreviewcontrib.datatools.dedup import deduplicate_data
12 | from asreviewcontrib.datatools.describe import _parse_arguments_describe
13 | from asreviewcontrib.datatools.describe import describe
14 | from asreviewcontrib.datatools.sample import _parse_arguments_sample
15 | from asreviewcontrib.datatools.sample import sample
16 | from asreviewcontrib.datatools.snowball import _parse_arguments_snowball
17 | from asreviewcontrib.datatools.snowball import snowball
18 | from asreviewcontrib.datatools.stack import _parse_arguments_vstack
19 | from asreviewcontrib.datatools.stack import vstack
20 | 
21 | DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball", "sample"]
22 | 
23 | 
24 | class DataEntryPoint(BaseEntryPoint):
25 |     description = "Home of all data tools for ASReview."
26 |     extension_name = "asreview-datatools"
27 | 
28 |     def __init__(self):
29 |         from asreviewcontrib.datatools.__init__ import __version__
30 | 
31 |         super().__init__()
32 | 
33 |         self.version = __version__
34 | 
35 |     def execute(self, argv):
36 |         if len(argv) > 1 and argv[0] in DATATOOLS:
37 |             if argv[0] == "describe":
38 |                 args_describe_parser = _parse_arguments_describe()
39 |                 args_describe = vars(args_describe_parser.parse_args(argv[1:]))
40 |                 describe(**args_describe)
41 |             if argv[0] == "convert":
42 |                 args_convert_parser = _parse_arguments_convert()
43 |                 args_convert = vars(args_convert_parser.parse_args(argv[1:]))
44 |                 convert(**args_convert)
45 |             if argv[0] == "dedup":
46 |                 dedup_parser = argparse.ArgumentParser(prog="asreview data dedup")
47 |                 dedup_parser.add_argument(
48 |                     "input_path", type=str, help="The file path of the dataset."
49 |                 )
50 |                 dedup_parser.add_argument(
51 |                     "--output_path",
52 |                     "-o",
53 |                     default=None,
54 |                     type=str,
55 |                     help="The file path of the output dataset.",
56 |                 )
57 |                 dedup_parser.add_argument(
58 |                     "--pid",
59 |                     default="doi",
60 |                     type=str,
61 |                     help="Persistent identifier used for deduplication. Default: doi.",
62 |                 )
63 |                 dedup_parser.add_argument(
64 |                     "--similar",
65 |                     action="store_true",
66 |                     help=(
67 |                         "Drop similar records, not only exactly matching records. The"
68 |                         " Ratcliff-Obershelp algorithm is used to calculate the"
69 |                         " similarity of records."
70 |                     ),
71 |                 )
72 |                 dedup_parser.add_argument(
73 |                     "--threshold",
74 |                     default=0.98,
75 |                     type=float,
76 |                     help=(
77 |                         "Records with a similarity score above this threshold are"
78 |                         " considered duplicates. Default: 0.98. Only applies if"
79 |                         " similarity is set to True."
80 |                     ),
81 |                 )
82 |                 dedup_parser.add_argument(
83 |                     "--title_only",
84 |                     action="store_true",
85 |                     help=(
86 |                         "Use only the title for deduplication. Only applies if"
87 |                         " similarity is set to True."
88 |                     ),
89 |                 )
90 |                 dedup_parser.add_argument(
91 |                     "--strict",
92 |                     action="store_true",
93 |                     help=(
94 |                         "Use a stricter version of the similarity algorithm. Only"
95 |                         " applies if similarity is set to True."
96 |                     ),
97 |                 )
98 |                 dedup_parser.add_argument(
99 |                     "--stopwords_language",
100 |                     default=None,
101 |                     type=str,
102 |                     help=(
103 |                         "Remove stopwords from this language before calculating"
104 |                         " similarity. For example 'english'. Only applies if similarity"
105 |                         " is set to True."
106 |                     ),
107 |                 )
108 |                 dedup_parser.add_argument(
109 |                     "--verbose",
110 |                     action="store_true",
111 |                     help=(
112 |                         "Print verbose output. Only applies if similarity is set to"
113 |                         " True."
114 |                     ),
115 |                 )
116 | 
117 |                 args_dedup = dedup_parser.parse_args(argv[1:])
118 | 
119 |                 # read data in ASReview data object
120 |                 asdata = load_data(args_dedup.input_path)
121 |                 deduplicate_data(
122 |                     asdata=asdata,
123 |                     output_path=args_dedup.output_path,
124 |                     pid=args_dedup.pid,
125 |                     similar=args_dedup.similar,
126 |                     threshold=args_dedup.threshold,
127 |                     title_only=args_dedup.title_only,
128 |                     stopwords_language=args_dedup.stopwords_language,
129 |                     strict=args_dedup.strict,
130 |                     verbose=args_dedup.verbose,
131 |                 )
132 | 
133 |             if argv[0] == "compose":
134 |                 args_compose_parser = _parse_arguments_compose()
135 |                 args_compose = args_compose_parser.parse_args(argv[1:])
136 |                 compose(
137 |                     args_compose.output_path,
138 |                     args_compose.relevant,
139 |                     args_compose.irrelevant,
140 |                     args_compose.labeled,
141 |                     args_compose.unlabeled,
142 |                     pid=args_compose.pid,
143 |                     order=args_compose.hierarchy,
144 |                     resolve=args_compose.conflict_resolve,
145 |                 )
146 |             if argv[0] == "snowball":
147 |                 args_snowballing_parser = _parse_arguments_snowball()
148 |                 args_snowballing = vars(args_snowballing_parser.parse_args(argv[1:]))
149 |                 snowball(**args_snowballing)
150 |             if argv[0] == "sample":
151 |                 args_sample_parser = _parse_arguments_sample()
152 |                 args_sample = vars(args_sample_parser.parse_args(argv[1:]))
153 |                 sample(**args_sample)
154 |             if argv[0] == "vstack":
155 |                 args_vstack_parser = _parse_arguments_vstack()
156 |                 args_vstack = args_vstack_parser.parse_args(argv[1:])
157 |                 vstack(args_vstack.output_path, args_vstack.datasets)
158 | 
159 |         # Print help message if subcommand not given or incorrect
160 |         else:
161 |             parser = argparse.ArgumentParser(
162 |                 prog="asreview data",
163 |                 formatter_class=argparse.RawTextHelpFormatter,
164 |                 description="Tools for data preprocessing for ASReview.",
165 |             )
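166 |             # Fallback parser: it recognises only an optional subcommand name and
167 |             # `-V`/`--version`; anything else results in the help text below.
168 |             # Illustrative invocations handled by the branches above (the file
169 |             # names are hypothetical):
170 |             #   asreview data describe records.csv -o stats.json
171 |             #   asreview data dedup records.csv -o deduplicated.csv --similar
172 |             #   asreview data vstack merged.ris part_1.ris part_2.ris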
173 |             parser.add_argument(
174 |                 "subcommand",
175 |                 nargs="?",
176 |                 default=None,
177 |                 help=f"The datatool to launch. Available commands:\n\n{DATATOOLS}",
178 |             )
179 |             parser.add_argument(
180 |                 "-V",
181 |                 "--version",
182 |                 action="version",
183 |                 default=False,
184 |                 version=f"{self.extension_name}: {self.version}",
185 |             )
186 |             args, _ = parser.parse_known_args()
187 | 
188 |             # output the version
189 |             if args.version:
190 |                 print(__version__)
191 |                 return
192 | 
193 |             parser.print_help()
194 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/sample.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | import pandas as pd
4 | from asreview import ASReviewData
5 | from asreview.data.base import load_data
6 | 
7 | 
8 | def sample(input_path, output_path, nr_records, year_column="publication_year"):
9 |     df_input = load_data(input_path).df
10 | 
11 |     # Check for the presence of the year column
12 |     if year_column not in df_input.columns:
13 |         raise ValueError(f"• The input file should have a {year_column} column.")
14 | 
15 |     # Check that nr_records is not too large
16 |     if nr_records * 3 > len(df_input):
17 |         raise ValueError(
18 |             "• The number of records to sample is too large."
19 |             f" Only {len(df_input)} records are present in the input file."
20 |             f" You are trying to sample {nr_records*3} records."
21 |         )
22 | 
23 |     if nr_records < 1:
24 |         raise ValueError("• The number of records to sample should be at least 1.")
25 | 
26 |     # Keep only records with a publication year
27 |     dated_records = df_input[df_input[year_column].notnull()]
28 | 
29 |     if dated_records.empty:
30 |         raise ValueError(f"• The input file has no {year_column} values.")
31 | 
32 |     if len(dated_records) < nr_records * 2:
33 |         raise ValueError("• Not enough dated records to sample from.")
34 | 
35 |     sorted_records = dated_records.sort_values(year_column, ascending=True)
36 | 
37 |     # Take the nr_records oldest and nr_records newest records
38 |     old_records = sorted_records.head(nr_records)
39 |     new_records = sorted_records.tail(nr_records)
40 | 
41 |     # Sample nr_records records without overlap with the old/new records
42 |     records_to_exclude = pd.concat([old_records, new_records]).index
43 |     remaining_records = df_input[~df_input.index.isin(records_to_exclude)]
44 | 
45 |     sampled_records = remaining_records.sample(nr_records)
46 | 
47 |     # Combine old, new, and sampled records
48 |     df_out = pd.concat([old_records, sampled_records, new_records])
49 | 
50 |     asdata = ASReviewData(df=df_out)
51 |     asdata.to_file(output_path)
52 | 
53 | 
54 | def _parse_arguments_sample():
55 |     parser = argparse.ArgumentParser(prog="asreview data sample")
56 |     parser.add_argument("input_path", type=str, help="The input file path.")
57 |     parser.add_argument("output_path", type=str, help="The output file path.")
58 |     parser.add_argument(
59 |         "nr_records",
60 |         type=int,
61 |         help="The number of records for old, random, and new records each.",
62 |     )
63 |     parser.add_argument(
64 |         "--year_column",
65 |         default="publication_year",
66 |         type=str,
67 |         help="The name of the column containing the publication year.",
68 |     )
69 | 
70 |     return parser
71 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/snowball.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import argparse
4 | from pathlib import Path
5 | 
6 | import pandas as pd
7 | import pyalex
8 | from asreview import ASReviewData
9 | from asreview import load_data
10 | 
11 | # Maximum number of statements joined by a logical OR in a call to OpenAlex.
12 | OPENALEX_MAX_OR_LENGTH = 100
13 | OPENALEX_MAX_PAGE_LENGTH = 200
14 | OPENALEX_PREFIX = "https://openalex.org/"
15 | DOI_PREFIX = "https://doi.org/"
16 | 
17 | # OpenAlex data fields to retrieve.
18 | USED_FIELDS = [
19 |     "id",
20 |     "doi",
21 |     "title",
22 |     "abstract_inverted_index",
23 |     "referenced_works",
24 |     "publication_date",
25 | ]
26 | 
27 | 
28 | def forward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]:
29 |     """Get all works citing a work with the OpenAlex identifier from the list.
30 | 
31 |     Parameters
32 |     ----------
33 |     identifiers : list[str]
34 |         List of OpenAlex identifiers.
35 | 
36 |     Returns
37 |     -------
38 |     dict[str, list[dict]]
39 |         Dictionary of the form
40 |         `{input OpenAlex identifier : list of OpenAlex works}`
41 |         where each work in the list references the work with the input identifier and
42 |         it is a dictionary of the form `{field_name : field_value}`.
43 |     """
44 |     citing_works = {}
45 |     for idx, openalex_id in enumerate(identifiers):
46 |         print(f"{idx}. Getting works citing {openalex_id}")
47 |         pager = (
48 |             pyalex.Works()
49 |             .filter(cites=openalex_id)
50 |             .select(USED_FIELDS)
51 |             .paginate(per_page=OPENALEX_MAX_PAGE_LENGTH, n_max=None)
52 |         )
53 |         citing_works[openalex_id] = []
54 |         for page in pager:
55 |             citing_works[openalex_id] += [
56 |                 {
57 |                     key: work[key]
58 |                     for key in [
59 |                         col if col != "abstract_inverted_index" else "abstract"
60 |                         for col in USED_FIELDS
61 |                     ]
62 |                 }
63 |                 for work in page
64 |             ]
65 |     return citing_works
66 | 
67 | 
68 | def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]:
69 |     """Get all works cited by a work with the OpenAlex identifier from the list.
70 | 
71 |     Parameters
72 |     ----------
73 |     identifiers : list[str]
74 |         List of OpenAlex identifiers.
75 | 
76 |     Returns
77 |     -------
78 |     dict[str, list[dict]]
79 |         Dictionary of the form
80 |         `{input OpenAlex identifier : list of OpenAlex works}`
81 |         where each work in the list is referenced by the work with the input identifier
82 |         and it is a dictionary of the form `{field_name : field_value}`.
83 |     """
84 |     # Get the referenced works.
85 |     referenced_works = {}
86 |     page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH)
87 | 
88 |     for i in range(0, len(identifiers), page_length):
89 |         print(f"Getting works cited by records {i}-{i+page_length}")
90 |         # We need to remove the prefix here because otherwise the URL is too long.
91 |         fltr = "|".join(
92 |             identifier.removeprefix(OPENALEX_PREFIX)
93 |             for identifier in identifiers[i : i + page_length]
94 |         )
95 |         for work in (
96 |             pyalex.Works()
97 |             .filter(openalex=fltr)
98 |             .select("id,referenced_works")
99 |             .get(per_page=page_length)
100 |         ):
101 |             referenced_works[work["id"]] = work["referenced_works"]
102 | 
103 |     # Get the fields for the referenced works.
104 |     all_identifiers = []
105 |     for reference_list in referenced_works.values():
106 |         all_identifiers += reference_list
107 |     all_identifiers = list(set(all_identifiers))
108 |     print(f"Found {len(all_identifiers)} records")
109 | 
110 |     all_referenced_works = {}
111 |     for i in range(0, len(all_identifiers), page_length):
112 |         # We need to remove the prefix here because otherwise the URL is too long.
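113 |         # The resulting filter is an OR query over bare identifiers, e.g.
114 |         # "W2051970045|W104454400|..." (at most `page_length` IDs per request).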
115 |         fltr = "|".join(
116 |             identifier.removeprefix(OPENALEX_PREFIX)
117 |             for identifier in all_identifiers[i : i + page_length]
118 |         )
119 |         for work in (
120 |             pyalex.Works()
121 |             .filter(openalex=fltr)
122 |             .select(USED_FIELDS)
123 |             .get(per_page=page_length)
124 |         ):
125 |             all_referenced_works[work["id"]] = {
126 |                 key: work[key]
127 |                 for key in [
128 |                     col if col != "abstract_inverted_index" else "abstract"
129 |                     for col in USED_FIELDS
130 |                 ]
131 |             }
132 | 
133 |     # Connect the referenced works back to the input works.
134 |     output = {}
135 |     for identifier, ref_id_list in referenced_works.items():
136 |         # We need the last check if 'ref_id' is in 'all_referenced_works': If a work
137 |         # references an ID that redirects to another ID, it won't be present here.
138 |         # Example: https://openalex.org/W2015370450 has in the references the identifier
139 |         # https://openalex.org/W2008744335, but this redirects to
140 |         # https://openalex.org/W4233569835
141 |         output[identifier] = [
142 |             all_referenced_works[ref_id]
143 |             for ref_id in ref_id_list
144 |             if ref_id in all_referenced_works
145 |         ]
146 |     return output
147 | 
148 | 
149 | def openalex_from_doi(dois: list[str]) -> dict[str, str]:
150 |     """Get the OpenAlex identifiers corresponding to a list of DOIs.
151 | 
152 |     Parameters
153 |     ----------
154 |     dois : list[str]
155 |         List of DOIs.
156 | 
157 |     Returns
158 |     -------
159 |     dict[str, str]
160 |         Dictionary {doi: openalex_id}. If there was no OpenAlex identifier found for a
161 |         DOI, the corresponding value will be None.
162 |     """
163 |     page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH)
164 |     id_mapping = {doi.removeprefix(DOI_PREFIX): None for doi in dois}
165 |     for i in range(0, len(dois), page_length):
166 |         fltr = "|".join(dois[i : i + page_length])
167 |         for work in (
168 |             pyalex.Works()
169 |             .filter(doi=fltr)
170 |             .select(["id", "doi"])
171 |             .get(per_page=page_length)
172 |         ):
173 |             id_mapping[work["doi"].removeprefix(DOI_PREFIX)] = work["id"]
174 |     return id_mapping
175 | 
176 | 
177 | def snowball(
178 |     input_path: Path,
179 |     output_path: Path,
180 |     forward: bool,
181 |     backward: bool,
182 |     use_all: bool = False,
183 |     email: str = None,
184 | ) -> None:
185 |     """Perform snowballing on an ASReview dataset.
186 | 
187 |     Parameters
188 |     ----------
189 |     input_path : Path
190 |         Location of the input ASReview dataset.
191 |     output_path : Path
192 |         Location where to save the output dataset.
193 |     forward : bool
194 |         Perform forward snowballing. At least one of `forward` or `backward` should be
195 |         True.
196 |     backward : bool
197 |         Perform backward snowballing. At least one of `forward` or `backward` should be
198 |         True.
199 |     use_all : bool, optional
200 |         Perform snowballing on all records in the dataset or only the included
201 |         records, by default False
202 |     email : str, optional
203 |         Email address to send along with requests to OpenAlex, by default None
204 | 
205 |     Raises
206 |     ------
207 |     ValueError
208 |         If `forward` and `backward` are both False.
209 |     ValueError
210 |         If the dataset contains no column named `openalex_id` and no column named `doi`.
211 |     """
212 |     if not (forward or backward):
213 |         raise ValueError("At least one of 'forward' or 'backward' should be True.")
214 | 
215 |     data = load_data(input_path)
216 |     if use_all or (data.included is None):
217 |         data = data.df
218 |     else:
219 |         data = data.df.loc[data.included.astype(bool)]
220 | 
221 |     # Add OpenAlex identifiers if not available.
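222 |     # Note: records whose DOI has no OpenAlex match keep `openalex_id = None` and
223 |     # are skipped below, since only non-null identifiers are snowballed.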
224 |     if "openalex_id" not in data.columns:
225 |         if "doi" not in data.columns:
226 |             raise ValueError(
227 |                 "Dataset should contain a column 'openalex_id' containing OpenAlex"
228 |                 " identifiers or a column 'doi' containing DOIs."
229 |             )
230 |         id_mapping = openalex_from_doi(data.doi.dropna().to_list())
231 |         n_openalex_ids = len(
232 |             [
233 |                 openalex_id
234 |                 for openalex_id in id_mapping.values()
235 |                 if openalex_id is not None
236 |             ]
237 |         )
238 |         print(
239 |             f"Found OpenAlex identifiers for {n_openalex_ids} out of {len(data)}"
240 |             " records. Performing snowballing for those records."
241 |         )
242 |         data["openalex_id"] = None
243 |         data.loc[data.doi.notna(), "openalex_id"] = (
244 |             data.loc[data.doi.notna(), "doi"]
245 |             .str.removeprefix(DOI_PREFIX)
246 |             .apply(lambda doi: id_mapping[doi])
247 |         )
248 | 
249 |     identifiers = data["openalex_id"].dropna().to_list()
250 | 
251 |     if email is not None:
252 |         pyalex.config.email = email
253 | 
254 |     if forward:
255 |         print("Starting forward snowballing")
256 |         forward_data = forward_snowballing(identifiers)
257 |     else:
258 |         forward_data = {}
259 |     if backward:
260 |         print("Starting backward snowballing")
261 |         backward_data = backward_snowballing(identifiers)
262 |     else:
263 |         backward_data = {}
264 | 
265 |     all_works = []
266 |     for works_list in forward_data.values():
267 |         all_works += works_list
268 |     for works_list in backward_data.values():
269 |         all_works += works_list
270 | 
271 |     output_data = pd.DataFrame(all_works)
272 |     output_data.drop_duplicates(subset=["id"], inplace=True)
273 |     output_data.rename({"id": "openalex_id"}, axis=1, inplace=True)
274 |     output_data = ASReviewData(output_data)
275 |     output_data.to_file(output_path)
276 |     print("Saved dataset")
277 | 
278 | 
279 | def _parse_arguments_snowball():
280 |     parser = argparse.ArgumentParser(prog="asreview data snowball")
281 |     parser.add_argument(
282 |         "input_path", type=str, help="The file path of the input dataset."
283 |     )
284 |     parser.add_argument(
285 |         "output_path", type=str, help="The file path of the output dataset."
286 |     )
287 |     parser.add_argument(
288 |         "--forward", "-f", action="store_true", help="Do forward snowballing."
289 |     )
290 |     parser.add_argument(
291 |         "--backward", "-b", action="store_true", help="Do backward snowballing."
292 |     )
293 |     parser.add_argument(
294 |         "--all",
295 |         "-a",
296 |         action="store_true",
297 |         dest="use_all",
298 |         help=(
299 |             "Do snowballing on all records in the dataset, not just the included ones."
300 |         ),
301 |     )
302 |     parser.add_argument(
303 |         "--email",
304 |         "-e",
305 |         type=str,
306 |         required=False,
307 |         help=(
308 |             "Email address to send along with requests to OpenAlex. This will make"
309 |             " requests faster. See also "
310 |             "https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication#the-polite-pool"
311 |         ),
312 |     )
313 |     return parser
314 | 
--------------------------------------------------------------------------------
/asreviewcontrib/datatools/stack.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from pathlib import Path
3 | 
4 | import pandas as pd
5 | from asreview import ASReviewData
6 | from asreview.data.base import load_data
7 | 
8 | 
9 | def _check_suffix(input_files, output_file):
10 |     # Also raises ValueError on URLs that do not end with a file extension
11 |     suffixes = [Path(item).suffix for item in input_files if item is not None]
12 |     suffixes.append(Path(output_file).suffix)
13 | 
14 |     set_ris = {".txt", ".ris"}
15 |     set_tabular = {".csv", ".tab", ".tsv", ".xlsx"}
16 |     set_suffixes = set(suffixes)
17 | 
18 |     if len(set_suffixes) > 1:
19 |         if not (set_suffixes.issubset(set_ris) or set_suffixes.issubset(set_tabular)):
20 |             raise ValueError(
21 |                 "• Several file types were given; all input files, as well as the"
22 |                 " output file, should be of the same type."
23 |             )
24 | 
25 | 
26 | def vstack(output_file, input_files):
27 |     _check_suffix(input_files, output_file)
28 | 
29 |     list_dfs = [load_data(item).df for item in input_files]
30 |     df_vstacked = pd.concat(list_dfs).reset_index(drop=True)
31 |     as_vstacked = ASReviewData(df=df_vstacked)
32 | 
33 |     as_vstacked.to_file(output_file)
34 | 
35 | 
36 | def _parse_arguments_vstack():
37 |     parser = argparse.ArgumentParser(prog="asreview data vstack")
38 |     parser.add_argument("output_path", type=str, help="The output file path.")
39 |     parser.add_argument(
40 |         "datasets",
41 |         type=str,
42 |         nargs="+",
43 |         help="Any number of datasets to stack vertically.",
44 |     )
45 | 
46 |     return parser
47 | 
--------------------------------------------------------------------------------
/dedup_similar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/asreview/asreview-datatools/32c5e3b5e65042716bd70bad17f0ff4da84f908b/dedup_similar.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "asreview-datatools"
3 | description = "Powerful command line tools for data handling in ASReview"
4 | authors = [
5 |     { name = "ASReview LAB developers", email = "asreview@uu.nl" }
6 | ]
7 | readme = "README.md"
8 | classifiers = [
9 |     "Development Status :: 5 - Production/Stable",
10 |     "License :: OSI Approved :: MIT License",
11 |     "Programming Language :: Python :: 3.8",
12 |     "Programming Language :: Python :: 3.9",
13 |     "Programming Language :: Python :: 3.10",
14 |     "Programming Language :: Python :: 3.11"
15 | ]
16 | license = {text = "MIT License"}
17 | dependencies = ["asreview>=1.1,<2", "ftfy", "nltk", "pandas", "pyalex", "rich", "tqdm"]
18 | dynamic = ["version"]
19 | requires-python = ">=3.8"
20 | 
21 | [project.urls]
22 | homepage = "https://asreview.ai"
23 | repository = "https://github.com/asreview/asreview-datatools"
24 | issues = "https://github.com/asreview/asreview-datatools/issues"
25 | 
26 | [project.entry-points."asreview.entry_points"]
27 | data = "asreviewcontrib.datatools.entrypoint:DataEntryPoint"
28 | 
29 | [project.optional-dependencies]
30 | lint = ["ruff"]
31 | test = ["pytest"]
32 | 
33 | [build-system]
34 | build-backend = 'setuptools.build_meta'
35 |
requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] 36 | 37 | [tool.setuptools] 38 | packages = ["asreviewcontrib"] 39 | 40 | [tool.setuptools_scm] 41 | write_to = "asreviewcontrib/datatools/_version.py" 42 | 43 | [tool.ruff.lint] 44 | select = ["E", "F", "UP", "I", "B"] 45 | 46 | [tool.ruff.lint.isort] 47 | force-single-line = true -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asreview/asreview-datatools/32c5e3b5e65042716bd70bad17f0ff4da84f908b/tests/__init__.py -------------------------------------------------------------------------------- /tests/demo_data/dataset_1.ris: -------------------------------------------------------------------------------- 1 | TY - JOUR 2 | TI - Maintenance treatment with capecitabine and bevacizumab versus observation after induction treatment with chemotherapy and bevacizumab in metastatic colorectal cancer (mCRC): The phase III CAIRO3 study of the Dutch Colorectal Cancer Group (DCCG). 3 | AU - Koopman, Miriam 4 | AU - Simkens, Lieke HJ 5 | AU - Ten Tije, Albert J. 6 | AU - Creemers, Geert-Jan 7 | AU - Loosveld, Olaf JL 8 | AU - de Jongh, Felix E. 9 | AU - Erdkamp, Frans 10 | AU - Erjavec, Zoran 11 | AU - van der Torren, Adelheid ME 12 | AU - Van der Hoeven, Jacobus JM 13 | AU - Nieboer, Peter 14 | AU - Braun, J. J. 15 | AU - Jansen, Rob L. 16 | AU - Haasjes, Janny G. 17 | AU - Cats, Annemieke 18 | AU - Wals, Jacob J. 19 | AU - Mol, Linda 20 | AU - Dalesio, Otilia 21 | AU - van Tinteren, Harm 22 | AU - Punt, Cornelis J. A. 23 | T2 - Journal of Clinical Oncology 24 | AB - 3502 25 | 26 | Background: The optimal duration of chemotherapy and bevacizumab in mCRC is not well established. The CAIRO3 study investigated the efficacy of maintenance treatment with capecitabine plus bevacizumab versus observation in mCRC pts not progressing during induction treatment with capecitabine, oxaliplatin and bevacizumab (CAPOX-B). Methods: Previously untreated mCRC pts, PS 0-1, with stable disease or better after 6 cycles of CAPOX-B, not eligible for metastasectomy and eligible for future treatment with oxaliplatin, were randomized between observation (arm A) or maintenance treatment with capecitabine 625 mg/m2 bid dailycontinuouslyand bevacizumab 7.5 mg/kg iv q 3 weeks (arm B). Upon first progression (PFS1), pts in both arms were treated with CAPOX-B until second progression (PFS2, primary endpoint). For pts not able to receive CAPOX-B upon PFS1, PFS2 was considered equal to PFS1. Secondary endpoints were overall survival (OS) and time to second progression (TTP2), which was defined as the time to progression or death on any treatment following PFS1. All endpoints were calculated from the time of randomization. Results: A total of 558 pts were randomized. Median follow-up is 33 months. The median number of maintenance cycles in arm B was 9 (range 1-54). The median PFS1 in arm A vs B was 4.1 vs 7.4 months (HR 0.44, 95% CI 0.37-0.54, p<0.0001). Upon PFS1, 72% of pts received CAPOX-B in arm A and 44% in arm B. The median PFS2 was 10.4 vs 10.4 months (HR 0.86, 95% CI 0.7-1.04, p=0.12). The median TTP2 in arm A vs B was 11.5 vs 15.4 months (HR 0.58, 95% CI 0.48-0.72, p<0.0001), and the median OS was 17.9 vs 21.7 months (HR 0.77, 95% CI 0.62-0.96, p=0.02), respectively. 
Conclusions: Maintenance treatment with capecitabine plus bevacizumab after 6 cycles CAPOX-B did not significantly prolong PFS2, which may be due to the lower number of pts in arm B that received CAPOX-B following PFS1. Maintenance treatment significantly prolonged PFS1, TTP2 and OS. Our data support the use of bevacizumab plus capecitabine until progression or unacceptable toxicity. Updated results will be presented. Clinical trial information: NCT00442637. 27 | DA - 2013/05/20/ 28 | PY - 2013 29 | DO - 10.1200/jco.2013.31.15_suppl.3502 30 | DP - ascopubs-org.proxy.library.uu.nl (Atypon) 31 | VL - 31 32 | IS - 15_suppl 33 | SP - 3502 34 | EP - 3502 35 | J2 - JCO 36 | SN - 0732-183X 37 | ST - Maintenance treatment with capecitabine and bevacizumab versus observation after induction treatment with chemotherapy and bevacizumab in metastatic colorectal cancer (mCRC) 38 | UR - https://ascopubs-org.proxy.library.uu.nl/doi/abs/10.1200/jco.2013.31.15_suppl.3502 39 | Y2 - 2022/09/20/ 40 | ER - 41 | 42 | TY - ELEC 43 | TI - Full article: Public transport planning adaption under the COVID-19 pandemic crisis: literature review of research needs and directions 44 | AB - Lets think of somethings 45 | UR - https://www-tandfonline-com.proxy.library.uu.nl/doi/full/10.1080/01441647.2020.1857886 46 | Y2 - 2022/09/20/ 47 | L2 - https://www-tandfonline-com.proxy.library.uu.nl/doi/full/10.1080/01441647.2020.1857886 48 | N1 -

ASReview_relevant

49 |
50 | ER - 51 | 52 | TY - JOUR 53 | TI - An Overview of the Incidences and Costs of Low Back Pain 54 | AU - Frymoyer, John W. 55 | AU - Cats-Baril, William L. 56 | T2 - Orthopedic Clinics of North America 57 | AB - The basic premise of this article is that low back disorders are extremely prevalent in all societies, and probably have not increased substantially over the past two decades. What has increased is the rate of disability, the reasons for which are uncertain. Not only has this phenomenon heightened the awareness of low back pain, but it has led to an explosion in costs. Although a precise estimate is impossible, it is plausible that the direct medical and indirect costs of these conditions are in the range of more than $50 billion per annum, and could be as high as $100 billion at the extreme. Of these costs, 75% or more can be attributed to the 5% of people who become disabled temporarily or permanently from back pain—a phenomenon that seems more rooted in psychosocial rather than disease determinants. Within this overall equation, spinal surgery plays a relatively small role, although the contribution to disability probably has more than passing significance. The future challenge, if costs are to be controlled, appears to lie squarely with prevention and optimum management of disability, rather than perpetrating a myth that low back pain is a serious health disorder. 58 | DA - 1991/04/01/ 59 | PY - 1991 60 | DO - 10.1016/S0030-5898(20)31652-7 61 | DP - ScienceDirect 62 | VL - 22 63 | IS - 2 64 | SP - 263 65 | EP - 271 66 | J2 - Orthopedic Clinics of North America 67 | LA - en 68 | SN - 0030-5898 69 | UR - https://www.sciencedirect.com/science/article/pii/S0030589820316527 70 | Y2 - 2022/09/20/ 71 | L2 - http://www.sciencedirect.com/science/article/abs/pii/S0030589820316527 72 | N1 -

ASReview_irrelevant

73 | ER - 74 | 75 | TY - JOUR 76 | TI - Hereditary diffuse gastric cancer: updated clinical guidelines with an emphasis on germline CDH1 mutation carriers 77 | AU - Post, Rachel S. van der 78 | AU - Vogelaar, Ingrid P. 79 | AU - Carneiro, Fátima 80 | AU - Guilford, Parry 81 | AU - Huntsman, David 82 | AU - Hoogerbrugge, Nicoline 83 | AU - Caldas, Carlos 84 | AU - Schreiber, Karen E. Chelcun 85 | AU - Hardwick, Richard H. 86 | AU - Ausems, Margreet G. E. M. 87 | AU - Bardram, Linda 88 | AU - Benusiglio, Patrick R. 89 | AU - Bisseling, Tanya M. 90 | AU - Blair, Vanessa 91 | AU - Bleiker, Eveline 92 | AU - Boussioutas, Alex 93 | AU - Cats, Annemieke 94 | AU - Coit, Daniel 95 | AU - DeGregorio, Lynn 96 | AU - Figueiredo, Joana 97 | AU - Ford, James M. 98 | AU - Heijkoop, Esther 99 | AU - Hermens, Rosella 100 | AU - Humar, Bostjan 101 | AU - Kaurah, Pardeep 102 | AU - Keller, Gisella 103 | AU - Lai, Jennifer 104 | AU - Ligtenberg, Marjolijn J. L. 105 | AU - O'Donovan, Maria 106 | AU - Oliveira, Carla 107 | AU - Pinheiro, Hugo 108 | AU - Ragunath, Krish 109 | AU - Rasenberg, Esther 110 | AU - Richardson, Susan 111 | AU - Roviello, Franco 112 | AU - Schackert, Hans 113 | AU - Seruca, Raquel 114 | AU - Taylor, Amy 115 | AU - Huurne, Anouk ter 116 | AU - Tischkowitz, Marc 117 | AU - Joe, Sheena Tjon A. 118 | AU - Dijck, Benjamin van 119 | AU - Grieken, Nicole C. T. van 120 | AU - Hillegersberg, Richard van 121 | AU - Sandick, Johanna W. van 122 | AU - Vehof, Rianne 123 | AU - Krieken, J. Han van 124 | AU - Fitzgerald, Rebecca C. 125 | T2 - Journal of Medical Genetics 126 | AB - Germline CDH1 mutations confer a high lifetime risk of developing diffuse gastric (DGC) and lobular breast cancer (LBC). A multidisciplinary workshop was organised to discuss genetic testing, surgery, surveillance strategies, pathology reporting and the patient's perspective on multiple aspects, including diet post gastrectomy. The updated guidelines include revised CDH1 testing criteria (taking into account first-degree and second-degree relatives): (1) families with two or more patients with gastric cancer at any age, one confirmed DGC; (2) individuals with DGC before the age of 40 and (3) families with diagnoses of both DGC and LBC (one diagnosis before the age of 50). Additionally, CDH1 testing could be considered in patients with bilateral or familial LBC before the age of 50, patients with DGC and cleft lip/palate, and those with precursor lesions for signet ring cell carcinoma. Given the high mortality associated with invasive disease, prophylactic total gastrectomy at a centre of expertise is advised for individuals with pathogenic CDH1 mutations. Breast cancer surveillance with annual breast MRI starting at age 30 for women with a CDH1 mutation is recommended. Standardised endoscopic surveillance in experienced centres is recommended for those opting not to have gastrectomy at the current time, those with CDH1 variants of uncertain significance and those that fulfil hereditary DGC criteria without germline CDH1 mutations. Expert histopathological confirmation of (early) signet ring cell carcinoma is recommended. The impact of gastrectomy and mastectomy should not be underestimated; these can have severe consequences on a psychological, physiological and metabolic level. Nutritional problems should be carefully monitored. 
127 | DA - 2015/06/01/ 128 | PY - 2015 129 | DO - 10.1136/jmedgenet-2015-103094 130 | DP - jmg.bmj.com 131 | VL - 52 132 | IS - 6 133 | SP - 361 134 | EP - 374 135 | LA - en 136 | SN - 0022-2593, 1468-6244 137 | ST - Hereditary diffuse gastric cancer 138 | UR - https://jmg.bmj.com/content/52/6/361 139 | Y2 - 2022/09/20/ 140 | L2 - http://www.ncbi.nlm.nih.gov/pubmed/25979631 141 | L2 - https://jmg.bmj.com/content/52/6/361.short 142 | L4 - https://jmg.bmj.com/content/jmedgenet/52/6/361.full.pdf 143 | KW - Cancer: breast 144 | KW - Cancer: gastric 145 | KW - Clinical genetics 146 | KW - Diagnostics 147 | KW - Stomach and duodenum 148 | ER - 149 | 150 | TY - BOOK 151 | TI - Zinne- en minnebeelden 152 | AU - Cats, Jacob 153 | DA - 1729/// 154 | PY - 1729 155 | DP - Google Books 156 | SP - 674 157 | LA - nl 158 | PB - by Evert Visscher 159 | L2 - https://books.google.nl/books?id=x4FYAAAAcAAJ 160 | N1 -

ASReview_irrelevant

161 | ER - 162 | 163 | TY - JOUR 164 | TI - Epidemiology of osteoarthritis: Zoetermeer survey. Comparison of radiological osteoarthritis in a Dutch population with that in 10 other populations. 165 | AU - Saase, J. L. van 166 | AU - Romunde, L. K. van 167 | AU - Cats, A. 168 | AU - Vandenbroucke, J. P. 169 | AU - Valkenburg, H. A. 170 | T2 - Annals of the Rheumatic Diseases 171 | AB - The prevalence of mild and severe radiological osteoarthritis was investigated in a random sample of 6585 inhabitants of a Dutch village. Radiographs were graded 0-4 according to the criteria described by Kellgren and Lawrence. The prevalence of radiological osteoarthritis increased strongly with age and was highest for cervical spine (peak: men 84.8%, women 84.3%), lumbar spine (peak: 71.9%, women 67.3%), and distal interphalangeal joints of the hands (peak: men 64.4%, women 76%). Prevalence did not exceed 10% in sacroiliac joints, lateral carpometacarpal joints, and tarsometatarsal joints. Severe radiological osteoarthritis (grade 3 or grade 4) was uncommon under age 45; in elderly persons the prevalence of severe radiological osteoarthritis did not exceed 20% except for the cervical and lumbar spine, distal interphalangeal joints of the hands and, in women only, metacarpophalangeal joints, first carpometacarpal joints, first metatarsophalangeal joints, and knees. Overall, differences between men and women were small except for hips and knees; however, severe radiological osteoarthritis was found in a higher proportion in most of the joints in women. Our data were compared with data from similar population surveys. The slope between joint involvement and age was strikingly constant for most of the joints. Differences between populations were mainly differences in level. These differences of prevalence of radiological osteoarthritis may be attributed to interobserver differences--that is, different criteria used to establish radiological osteoarthritis, in addition to genetic or environmental factors, or both. 172 | DA - 1989/04/01/ 173 | PY - 1989 174 | DO - 10.1136/ard.48.4.271 175 | DP - ard.bmj.com 176 | VL - 48 177 | IS - 4 178 | SP - 271 179 | EP - 280 180 | LA - en 181 | SN - 0003-4967, 1468-2060 182 | ST - Epidemiology of osteoarthritis 183 | UR - https://ard.bmj.com/content/48/4/271 184 | Y2 - 2022/09/20/ 185 | L2 - http://www.ncbi.nlm.nih.gov/pubmed/2712610 186 | L2 - https://ard.bmj.com/content/48/4/271.short 187 | L4 - https://ard.bmj.com/content/annrheumdis/48/4/271.full.pdf 188 | N1 -

ASReview_irrelevant

189 | ER - 190 | 191 | -------------------------------------------------------------------------------- /tests/demo_data/dataset_2.ris: -------------------------------------------------------------------------------- 1 | TY - JOUR 2 | TI - Maintenance treatment with capecitabine and bevacizumab versus observation after induction treatment with chemotherapy and bevacizumab in metastatic colorectal cancer (mCRC): The phase III CAIRO3 study of the Dutch Colorectal Cancer Group (DCCG). 3 | AU - Koopman, Miriam 4 | AU - Simkens, Lieke HJ 5 | AU - Ten Tije, Albert J. 6 | AU - Creemers, Geert-Jan 7 | AU - Loosveld, Olaf JL 8 | AU - de Jongh, Felix E. 9 | AU - Erdkamp, Frans 10 | AU - Erjavec, Zoran 11 | AU - van der Torren, Adelheid ME 12 | AU - Van der Hoeven, Jacobus JM 13 | AU - Nieboer, Peter 14 | AU - Braun, J. J. 15 | AU - Jansen, Rob L. 16 | AU - Haasjes, Janny G. 17 | AU - Cats, Annemieke 18 | AU - Wals, Jacob J. 19 | AU - Mol, Linda 20 | AU - Dalesio, Otilia 21 | AU - van Tinteren, Harm 22 | AU - Punt, Cornelis J. A. 23 | T2 - Journal of Clinical Oncology 24 | AB - 3502 25 | 26 | Background: The optimal duration of chemotherapy and bevacizumab in mCRC is not well established. The CAIRO3 study investigated the efficacy of maintenance treatment with capecitabine plus bevacizumab versus observation in mCRC pts not progressing during induction treatment with capecitabine, oxaliplatin and bevacizumab (CAPOX-B). Methods: Previously untreated mCRC pts, PS 0-1, with stable disease or better after 6 cycles of CAPOX-B, not eligible for metastasectomy and eligible for future treatment with oxaliplatin, were randomized between observation (arm A) or maintenance treatment with capecitabine 625 mg/m2 bid dailycontinuouslyand bevacizumab 7.5 mg/kg iv q 3 weeks (arm B). Upon first progression (PFS1), pts in both arms were treated with CAPOX-B until second progression (PFS2, primary endpoint). For pts not able to receive CAPOX-B upon PFS1, PFS2 was considered equal to PFS1. Secondary endpoints were overall survival (OS) and time to second progression (TTP2), which was defined as the time to progression or death on any treatment following PFS1. All endpoints were calculated from the time of randomization. Results: A total of 558 pts were randomized. Median follow-up is 33 months. The median number of maintenance cycles in arm B was 9 (range 1-54). The median PFS1 in arm A vs B was 4.1 vs 7.4 months (HR 0.44, 95% CI 0.37-0.54, p<0.0001). Upon PFS1, 72% of pts received CAPOX-B in arm A and 44% in arm B. The median PFS2 was 10.4 vs 10.4 months (HR 0.86, 95% CI 0.7-1.04, p=0.12). The median TTP2 in arm A vs B was 11.5 vs 15.4 months (HR 0.58, 95% CI 0.48-0.72, p<0.0001), and the median OS was 17.9 vs 21.7 months (HR 0.77, 95% CI 0.62-0.96, p=0.02), respectively. Conclusions: Maintenance treatment with capecitabine plus bevacizumab after 6 cycles CAPOX-B did not significantly prolong PFS2, which may be due to the lower number of pts in arm B that received CAPOX-B following PFS1. Maintenance treatment significantly prolonged PFS1, TTP2 and OS. Our data support the use of bevacizumab plus capecitabine until progression or unacceptable toxicity. Updated results will be presented. Clinical trial information: NCT00442637. 
27 | DA - 2013/05/20/ 28 | PY - 2013 29 | DO - 10.1200/jco.2013.31.15_suppl.3502 30 | DP - ascopubs-org.proxy.library.uu.nl (Atypon) 31 | VL - 31 32 | IS - 15_suppl 33 | SP - 3502 34 | EP - 3502 35 | J2 - JCO 36 | SN - 0732-183X 37 | ST - Maintenance treatment with capecitabine and bevacizumab versus observation after induction treatment with chemotherapy and bevacizumab in metastatic colorectal cancer (mCRC) 38 | UR - https://ascopubs-org.proxy.library.uu.nl/doi/abs/10.1200/jco.2013.31.15_suppl.3502 39 | Y2 - 2022/09/20/ 40 | ER - 41 | 42 | TY - ELEC 43 | TI - Full article: Public transport planning adaption under the COVID-19 pandemic crisis: literature review of research needs and directions 44 | AB - Lets think of somethings 45 | UR - https://www-tandfonline-com.proxy.library.uu.nl/doi/full/10.1080/01441647.2020.1857886 46 | Y2 - 2022/09/20/ 47 | L2 - https://www-tandfonline-com.proxy.library.uu.nl/doi/full/10.1080/01441647.2020.1857886 48 | N1 -

ASReview_relevant

49 |
50 | ER - 51 | 52 | TY - JOUR 53 | TI - Hereditary diffuse gastric cancer: updated clinical guidelines with an emphasis on germline CDH1 mutation carriers 54 | AU - Post, Rachel S. van der 55 | AU - Vogelaar, Ingrid P. 56 | AU - Carneiro, Fátima 57 | AU - Guilford, Parry 58 | AU - Huntsman, David 59 | AU - Hoogerbrugge, Nicoline 60 | AU - Caldas, Carlos 61 | AU - Schreiber, Karen E. Chelcun 62 | AU - Hardwick, Richard H. 63 | AU - Ausems, Margreet G. E. M. 64 | AU - Bardram, Linda 65 | AU - Benusiglio, Patrick R. 66 | AU - Bisseling, Tanya M. 67 | AU - Blair, Vanessa 68 | AU - Bleiker, Eveline 69 | AU - Boussioutas, Alex 70 | AU - Cats, Annemieke 71 | AU - Coit, Daniel 72 | AU - DeGregorio, Lynn 73 | AU - Figueiredo, Joana 74 | AU - Ford, James M. 75 | AU - Heijkoop, Esther 76 | AU - Hermens, Rosella 77 | AU - Humar, Bostjan 78 | AU - Kaurah, Pardeep 79 | AU - Keller, Gisella 80 | AU - Lai, Jennifer 81 | AU - Ligtenberg, Marjolijn J. L. 82 | AU - O'Donovan, Maria 83 | AU - Oliveira, Carla 84 | AU - Pinheiro, Hugo 85 | AU - Ragunath, Krish 86 | AU - Rasenberg, Esther 87 | AU - Richardson, Susan 88 | AU - Roviello, Franco 89 | AU - Schackert, Hans 90 | AU - Seruca, Raquel 91 | AU - Taylor, Amy 92 | AU - Huurne, Anouk ter 93 | AU - Tischkowitz, Marc 94 | AU - Joe, Sheena Tjon A. 95 | AU - Dijck, Benjamin van 96 | AU - Grieken, Nicole C. T. van 97 | AU - Hillegersberg, Richard van 98 | AU - Sandick, Johanna W. van 99 | AU - Vehof, Rianne 100 | AU - Krieken, J. Han van 101 | AU - Fitzgerald, Rebecca C. 102 | T2 - Journal of Medical Genetics 103 | AB - Germline CDH1 mutations confer a high lifetime risk of developing diffuse gastric (DGC) and lobular breast cancer (LBC). A multidisciplinary workshop was organised to discuss genetic testing, surgery, surveillance strategies, pathology reporting and the patient's perspective on multiple aspects, including diet post gastrectomy. The updated guidelines include revised CDH1 testing criteria (taking into account first-degree and second-degree relatives): (1) families with two or more patients with gastric cancer at any age, one confirmed DGC; (2) individuals with DGC before the age of 40 and (3) families with diagnoses of both DGC and LBC (one diagnosis before the age of 50). Additionally, CDH1 testing could be considered in patients with bilateral or familial LBC before the age of 50, patients with DGC and cleft lip/palate, and those with precursor lesions for signet ring cell carcinoma. Given the high mortality associated with invasive disease, prophylactic total gastrectomy at a centre of expertise is advised for individuals with pathogenic CDH1 mutations. Breast cancer surveillance with annual breast MRI starting at age 30 for women with a CDH1 mutation is recommended. Standardised endoscopic surveillance in experienced centres is recommended for those opting not to have gastrectomy at the current time, those with CDH1 variants of uncertain significance and those that fulfil hereditary DGC criteria without germline CDH1 mutations. Expert histopathological confirmation of (early) signet ring cell carcinoma is recommended. The impact of gastrectomy and mastectomy should not be underestimated; these can have severe consequences on a psychological, physiological and metabolic level. Nutritional problems should be carefully monitored. 
104 | DA - 2015/06/01/ 105 | PY - 2015 106 | DO - 10.1136/jmedgenet-2015-103094 107 | DP - jmg.bmj.com 108 | VL - 52 109 | IS - 6 110 | SP - 361 111 | EP - 374 112 | LA - en 113 | SN - 0022-2593, 1468-6244 114 | ST - Hereditary diffuse gastric cancer 115 | UR - https://jmg.bmj.com/content/52/6/361 116 | Y2 - 2022/09/20/ 117 | L2 - http://www.ncbi.nlm.nih.gov/pubmed/25979631 118 | L2 - https://jmg.bmj.com/content/52/6/361.short 119 | L4 - https://jmg.bmj.com/content/jmedgenet/52/6/361.full.pdf 120 | KW - Cancer: breast 121 | KW - Cancer: gastric 122 | KW - Clinical genetics 123 | KW - Diagnostics 124 | KW - Stomach and duodenum 125 | ER - 126 | 127 | TY - JOUR 128 | TI - An open source machine learning framework for efficient and transparent systematic reviews 129 | AU - van de Schoot, Rens 130 | AU - de Bruin, Jonathan 131 | AU - Schram, Raoul 132 | AU - Zahedi, Parisa 133 | AU - de Boer, Jan 134 | AU - Weijdema, Felix 135 | AU - Kramer, Bianca 136 | AU - Huijts, Martijn 137 | AU - Hoogerwerf, Maarten 138 | AU - Ferdinands, Gerbrich 139 | AU - Harkema, Albert 140 | AU - Willemsen, Joukje 141 | AU - Ma, Yongchao 142 | AU - Fang, Qixiang 143 | AU - Hindriks, Sybren 144 | AU - Tummers, Lars 145 | AU - Oberski, Daniel L. 146 | T2 - Nature Machine Intelligence 147 | AB - To help researchers conduct a systematic review or meta-analysis as efficiently and transparently as possible, we designed a tool to accelerate the step of screening titles and abstracts. For many tasks—including but not limited to systematic reviews and meta-analyses—the scientific literature needs to be checked systematically. Scholars and practitioners currently screen thousands of studies by hand to determine which studies to include in their review or meta-analysis. This is error prone and inefficient because of extremely imbalanced data: only a fraction of the screened studies is relevant. The future of systematic reviewing will be an interaction with machine learning algorithms to deal with the enormous increase of available text. We therefore developed an open source machine learning-aided pipeline applying active learning: ASReview. We demonstrate by means of simulation studies that active learning can yield far more efficient reviewing than manual reviewing while providing high quality. Furthermore, we describe the options of the free and open source research software and present the results from user experience tests. We invite the community to contribute to open source projects such as our own that provide measurable and reproducible improvements over current practice. 148 | DA - 2021/02// 149 | PY - 2021 150 | DO - 10.1038/s42256-020-00287-7 151 | DP - www-nature-com.proxy.library.uu.nl 152 | VL - 3 153 | IS - 2 154 | SP - 125 155 | EP - 133 156 | J2 - Nat Mach Intell 157 | LA - en 158 | SN - 2522-5839 159 | UR - http://www.nature.com/articles/s42256-020-00287-7 160 | Y2 - 2022/09/12/09:02:50 161 | L2 - http://www.nature.com/articles/s42256-020-00287-7 162 | L4 - http://www.nature.com/articles/s42256-020-00287-7.pdf 163 | KW - Computational biology and bioinformatics 164 | KW - Computer science 165 | KW - Medical research 166 | KW - SARS-CoV-2 167 | ER - 168 | 169 | TY - JOUR 170 | TI - Machine learning for screening prioritization in systematic reviews: comparative performance of Abstrackr and EPPI-Reviewer 171 | AU - Tsou, Amy Y. 172 | AU - Treadwell, Jonathan R. 
173 | AU - Erinoff, Eileen 174 | AU - Schoelles, Karen 175 | T2 - Systematic Reviews 176 | AB - Improving the speed of systematic review (SR) development is key to supporting evidence-based medicine. Machine learning tools which semi-automate citation screening might improve efficiency. Few studies have assessed use of screening prioritization functionality or compared two tools head to head. In this project, we compared performance of two machine-learning tools for potential use in citation screening. 177 | DA - 2020/04/02/ 178 | PY - 2020 179 | DO - 10.1186/s13643-020-01324-7 180 | DP - Springer Link 181 | VL - 9 182 | IS - 1 183 | SP - 73 184 | J2 - Syst Rev 185 | LA - en 186 | SN - 2046-4053 187 | ST - Machine learning for screening prioritization in systematic reviews 188 | UR - https://doi.org/10.1186/s13643-020-01324-7 189 | Y2 - 2022/09/12/09:04:15 190 | L4 - http://link.springer.com/content/pdf/10.1186%2Fs13643-020-01324-7.pdf 191 | KW - Abstrackr 192 | KW - Citation screening 193 | KW - Efficiency 194 | KW - EPPI-Reviewer 195 | KW - Machine learning 196 | KW - Methodology 197 | KW - Screening burden 198 | KW - Screening prioritization 199 | KW - Text-mining 200 | ER - 201 | 202 | TY - JOUR 203 | TI - Machine learning techniques for the automation of literature reviews and systematic reviews in EFSA 204 | AU - Jaspers, Stijn 205 | AU - De Troyer, Ewoud 206 | AU - Aerts, Marc 207 | T2 - EFSA Supporting Publications 208 | AB - This Report presents the results from EFSA project RC/EFSA/AMU/2016/01 related to the implementation of machine learning techniques for literature reviews and systematic reviews in EFSA. An overview of the different steps of a systematic review is provided, along with possible ways for automation. Although it was found that most steps could benefit from automation, it was also observed that some steps require more sophisticated methods than those encompassed within the machine learning framework. Availability of data and methodology allowed for the development of an automatic screening tool based on several machine learning techniques. The developed shiny R application can be used for the screening of abstracts and full texts. Properties of machine learning techniques are discussed in this Report together with their most important advantages and disadvantages. The latter discussion includes both general properties, as well as context-specific properties based on their performance in three case studies. Although creating a universal automatic data extraction tool was considered to be infeasible in this stage, this step of the systematic review was addressed to allow the reviewer to scan the uploaded pdf files for certain words or string of words. Based on observations from the performed case studies, recommendations were made regarding which methods are preferred in specific situations. More explicitly, a discussion is made about the performance of the classifiers with respect to the magnitude of the pool of papers to be screened as well as to the amount of imbalance, referring to the proportion of relevant and irrelevant papers. Finally, it was concluded that the results presented in this report provide proof that the developed shiny application could be efficiently used in combination with other software such as DistillerSR. 
209 | DA - 2018/// 210 | PY - 2018 211 | DO - 10.2903/sp.efsa.2018.EN-1427 212 | DP - Wiley Online Library 213 | VL - 15 214 | IS - 6 215 | SP - 1427E 216 | LA - en 217 | SN - 2397-8325 218 | UR - https://onlinelibrary.wiley.com/doi/abs/10.2903/sp.efsa.2018.EN-1427 219 | Y2 - 2022/09/12/09:04:41 220 | L2 - https://efsa.onlinelibrary.wiley.com/doi/abs/10.2903/sp.efsa.2018.EN-1427 221 | L4 - https://onlinelibrary.wiley.com/doi/pdfdirect/10.2903/sp.efsa.2018.EN-1427 222 | ER - 223 | 224 | TY - JOUR 225 | TI - Living systematic reviews: 2. Combining human and machine effort 226 | AU - Thomas, James 227 | AU - Noel-Storr, Anna 228 | AU - Marshall, Iain 229 | AU - Wallace, Byron 230 | AU - McDonald, Steven 231 | AU - Mavergames, Chris 232 | AU - Glasziou, Paul 233 | AU - Shemilt, Ian 234 | AU - Synnot, Anneliese 235 | AU - Turner, Tari 236 | AU - Elliott, Julian 237 | AU - Agoritsas, Thomas 238 | AU - Hilton, John 239 | AU - Perron, Caroline 240 | AU - Akl, Elie 241 | AU - Hodder, Rebecca 242 | AU - Pestridge, Charlotte 243 | AU - Albrecht, Lauren 244 | AU - Horsley, Tanya 245 | AU - Platt, Joanne 246 | AU - Armstrong, Rebecca 247 | AU - Nguyen, Phi Hung 248 | AU - Plovnick, Robert 249 | AU - Arno, Anneliese 250 | AU - Ivers, Noah 251 | AU - Quinn, Gail 252 | AU - Au, Agnes 253 | AU - Johnston, Renea 254 | AU - Rada, Gabriel 255 | AU - Bagg, Matthew 256 | AU - Jones, Arwel 257 | AU - Ravaud, Philippe 258 | AU - Boden, Catherine 259 | AU - Kahale, Lara 260 | AU - Richter, Bernt 261 | AU - Boisvert, Isabelle 262 | AU - Keshavarz, Homa 263 | AU - Ryan, Rebecca 264 | AU - Brandt, Linn 265 | AU - Kolakowsky-Hayner, Stephanie A. 266 | AU - Salama, Dina 267 | AU - Brazinova, Alexandra 268 | AU - Nagraj, Sumanth Kumbargere 269 | AU - Salanti, Georgia 270 | AU - Buchbinder, Rachelle 271 | AU - Lasserson, Toby 272 | AU - Santaguida, Lina 273 | AU - Champion, Chris 274 | AU - Lawrence, Rebecca 275 | AU - Santesso, Nancy 276 | AU - Chandler, Jackie 277 | AU - Les, Zbigniew 278 | AU - Schünemann, Holger J. 279 | AU - Charidimou, Andreas 280 | AU - Leucht, Stefan 281 | AU - Shemilt, Ian 282 | AU - Chou, Roger 283 | AU - Low, Nicola 284 | AU - Sherifali, Diana 285 | AU - Churchill, Rachel 286 | AU - Maas, Andrew 287 | AU - Siemieniuk, Reed 288 | AU - Cnossen, Maryse C. 289 | AU - MacLehose, Harriet 290 | AU - Simmonds, Mark 291 | AU - Cossi, Marie-Joelle 292 | AU - Macleod, Malcolm 293 | AU - Skoetz, Nicole 294 | AU - Counotte, Michel 295 | AU - Marshall, Iain 296 | AU - Soares-Weiser, Karla 297 | AU - Craigie, Samantha 298 | AU - Marshall, Rachel 299 | AU - Srikanth, Velandai 300 | AU - Dahm, Philipp 301 | AU - Martin, Nicole 302 | AU - Sullivan, Katrina 303 | AU - Danilkewich, Alanna 304 | AU - Martínez García, Laura 305 | AU - Synnot, Anneliese 306 | AU - Danko, Kristen 307 | AU - Mavergames, Chris 308 | AU - Taylor, Mark 309 | AU - Donoghue, Emma 310 | AU - Maxwell, Lara J. 311 | AU - Thayer, Kris 312 | AU - Dressler, Corinna 313 | AU - McAuley, James 314 | AU - Thomas, James 315 | AU - Egan, Cathy 316 | AU - McDonald, Steve 317 | AU - Tritton, Roger 318 | AU - Elliott, Julian 319 | AU - McKenzie, Joanne 320 | AU - Tsafnat, Guy 321 | AU - Elliott, Sarah A. 
322 | AU - Meerpohl, Joerg 323 | AU - Tugwell, Peter 324 | AU - Etxeandia, Itziar 325 | AU - Merner, Bronwen 326 | AU - Turgeon, Alexis 327 | AU - Featherstone, Robin 328 | AU - Mondello, Stefania 329 | AU - Turner, Tari 330 | AU - Foxlee, Ruth 331 | AU - Morley, Richard 332 | AU - van Valkenhoef, Gert 333 | AU - Garner, Paul 334 | AU - Munafo, Marcus 335 | AU - Vandvik, Per 336 | AU - Gerrity, Martha 337 | AU - Munn, Zachary 338 | AU - Wallace, Byron 339 | AU - Glasziou, Paul 340 | AU - Murano, Melissa 341 | AU - Wallace, Sheila A. 342 | AU - Green, Sally 343 | AU - Newman, Kristine 344 | AU - Watts, Chris 345 | AU - Grimshaw, Jeremy 346 | AU - Nieuwlaat, Robby 347 | AU - Weeks, Laura 348 | AU - Gurusamy, Kurinchi 349 | AU - Nikolakopoulou, Adriani 350 | AU - Weigl, Aaron 351 | AU - Haddaway, Neal 352 | AU - Noel-Storr, Anna 353 | AU - Wells, George 354 | AU - Hartling, Lisa 355 | AU - O'Connor, Annette 356 | AU - Wiercioch, Wojtek 357 | AU - Hayden, Jill 358 | AU - Page, Matthew 359 | AU - Wolfenden, Luke 360 | AU - Helfand, Mark 361 | AU - Pahwa, Manisha 362 | AU - Yepes Nuñez, Juan José 363 | AU - Higgins, Julian 364 | AU - Pardo, Jordi Pardo 365 | AU - Yost, Jennifer 366 | AU - Hill, Sophie 367 | AU - Pearson, Leslea 368 | T2 - Journal of Clinical Epidemiology 369 | AB - New approaches to evidence synthesis, which use human effort and machine automation in mutually reinforcing ways, can enhance the feasibility and sustainability of living systematic reviews. Human effort is a scarce and valuable resource, required when automation is impossible or undesirable, and includes contributions from online communities (“crowds”) as well as more conventional contributions from review authors and information specialists. Automation can assist with some systematic review tasks, including searching, eligibility assessment, identification and retrieval of full-text reports, extraction of data, and risk of bias assessment. Workflows can be developed in which human effort and machine automation can each enable the other to operate in more effective and efficient ways, offering substantial enhancement to the productivity of systematic reviews. This paper describes and discusses the potential—and limitations—of new ways of undertaking specific tasks in living systematic reviews, identifying areas where these human/machine “technologies” are already in use, and where further research and development is needed. While the context is living systematic reviews, many of these enabling technologies apply equally to standard approaches to systematic reviewing. 
370 | DA - 2017/11/01/ 371 | PY - 2017 372 | DP - ScienceDirect 373 | VL - 91 374 | SP - 31 375 | EP - 37 376 | J2 - Journal of Clinical Epidemiology 377 | LA - en 378 | SN - 0895-4356 379 | ST - Living systematic reviews 380 | UR - https://www.sciencedirect.com/science/article/pii/S0895435617306042 381 | Y2 - 2022/09/12/09:04:49 382 | L2 - http://www.sciencedirect.com/science/article/pii/S0895435617306042 383 | L4 - http://www.sciencedirect.com/science/article/pii/S0895435617306042/pdfft?md5=4979be51940c68e214145bce3c4183f9&pid=1-s2.0-S0895435617306042-main.pdf&isDTMRedir=Y 384 | KW - Machine learning 385 | KW - Automation 386 | KW - Citizen science 387 | KW - Crowdsourcing 388 | KW - Systematic review 389 | KW - Text mining 390 | ER - 391 | 392 | TY - JOUR 393 | TI - Machine learning to assist risk-of-bias assessments in systematic reviews 394 | AU - Millard, Louise AC 395 | AU - Flach, Peter A 396 | AU - Higgins, Julian PT 397 | T2 - International Journal of Epidemiology 398 | AB - Background: Risk-of-bias assessments are now a standard component of systematic reviews. At present, reviewers need to manually identify relevant parts of research articles for a set of methodological elements that affect the risk of bias, in order to make a risk-of-bias judgement for each of these elements. We investigate the use of text mining methods to automate risk-of-bias assessments in systematic reviews. We aim to identify relevant sentences within the text of included articles, to rank articles by risk of bias and to reduce the number of risk-of-bias assessments that the reviewers need to perform by hand. Methods: We use supervised machine learning to train two types of models, for each of the three risk-of-bias properties of sequence generation, allocation concealment and blinding. The first model predicts whether a sentence in a research article contains relevant information. The second model predicts a risk-of-bias value for each research article. We use logistic regression, where each independent variable is the frequency of a word in a sentence or article, respectively. Results: We found that sentences can be successfully ranked by relevance with area under the receiver operating characteristic (ROC) curve (AUC) > 0.98. Articles can be ranked by risk of bias with AUC > 0.72. We estimate that more than 33% of articles can be assessed by just one reviewer, where two reviewers are normally required. Conclusions: We show that text mining can be used to assist risk-of-bias assessments. 
399 | DA - 2016/02/01/ 400 | PY - 2016 401 | DO - 10.1093/ije/dyv306 402 | DP - Silverchair 403 | VL - 45 404 | IS - 1 405 | SP - 266 406 | EP - 277 407 | J2 - International Journal of Epidemiology 408 | SN - 0300-5771 409 | UR - https://doi.org/10.1093/ije/dyv306 410 | Y2 - 2022/09/12/09:05:00 411 | L2 - https://academic-oup-com.proxy.library.uu.nl/ije/article/45/1/266/2363602 412 | L4 - https://academic-oup-com.proxy.library.uu.nl/ije/article-pdf/45/1/266/24170552/dyv306.pdf 413 | ER - 414 | 415 | -------------------------------------------------------------------------------- /tests/demo_data/duplicate_data_with_doi.csv: -------------------------------------------------------------------------------- 1 | title, abstract, doi, publication_year 2 | exact copy title without stopwords, exact copy abstract without stopwords, doi1, 2005 3 | exact copy title without stopwords, exact copy abstract without stopwords, doi1, 2005 4 | other title without stopwords same doi, other abstract without stopwords same doi, doi1, 2006 5 | 1 exact copy title without stopwords, 1 exact copy abstract without stopwords, doi2, 2005 6 | an "exact" copy of title without stopwords, an "exact" copy of abstract without stopwords, doi3, 2007 -------------------------------------------------------------------------------- /tests/demo_data/duplicate_data_without_doi.csv: -------------------------------------------------------------------------------- 1 | title, abstract, publication_year 2 | exact copy title without stopwords, exact copy abstract without stopwords, 2005 3 | exact copy title without stopwords, exact copy abstract without stopwords, 2005 4 | other title without stopwords, other abstract without stopwords, 2006 5 | 1 exact copy title without stopwords, 1 exact copy abstract without stopwords, 2005 6 | an "exact" copy of title without stopwords, an "exact" copy of abstract without stopwords, 2007 -------------------------------------------------------------------------------- /tests/demo_data/sample_data.csv: -------------------------------------------------------------------------------- 1 | title, doi, publication_year 2 | title1, doi1, 2005 3 | title2, doi2, 2001 4 | title3, doi3, 5 | title4, doi4, 2003 6 | title5, doi5, 2004 7 | title6, doi6, 2000 -------------------------------------------------------------------------------- /tests/demo_data/snowballing_doi.csv: -------------------------------------------------------------------------------- 1 | ,title,doi,included 2 | 0,"Myrmecochorous plants in Australia and their dispersal by ants",https://doi.org/10.1071/bt9750475,1 3 | 1,"Mimicking the one-dimensional marginal distributions of processes having an ito differential",https://doi.org/10.1007/bf00699039,0 4 | -------------------------------------------------------------------------------- /tests/demo_data/snowballing_openalex.csv: -------------------------------------------------------------------------------- 1 | ,openalex_id,title,included 2 | 0,https://openalex.org/W2051970045,"Myrmecochorous plants in Australia and their dispersal by ants",1 3 | 1,https://openalex.org/W104454400,"Mimicking the one-dimensional marginal distributions of processes having an ito differential",0 4 | -------------------------------------------------------------------------------- /tests/test_compose.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from asreviewcontrib.datatools.compose import _check_order_arg 6 | from 
asreviewcontrib.datatools.compose import _check_resolve_arg 7 | from asreviewcontrib.datatools.compose import _check_suffix 8 | from asreviewcontrib.datatools.compose import create_composition 9 | 10 | parent_dir = Path(__file__).parent 11 | file_1 = Path(parent_dir, "demo_data", "dataset_1.ris") 12 | file_2 = Path(parent_dir, "demo_data", "dataset_2.ris") 13 | 14 | # labeling action on input paths in list = [relevant, irrelevant, labeled, unlabeled] 15 | input_files_1 = [ 16 | file_1, 17 | file_1, 18 | file_1, 19 | file_1, 20 | ] 21 | 22 | input_files_2 = [None, None, file_1, file_2] 23 | 24 | 25 | # test whether input and output suffixes are compatible 26 | def test_suffixes(): 27 | with pytest.raises(ValueError): 28 | _check_suffix(input_files_1, "conflicting_suffix.csv") 29 | 30 | 31 | # test whether wrong input hierarchy/order raises error 32 | def test_input_hierarchy(): 33 | with pytest.raises(ValueError): 34 | _check_order_arg("abc") 35 | with pytest.raises(ValueError): 36 | _check_order_arg("riur") 37 | 38 | 39 | # test whether wrong input conflict resolve raises error 40 | def test_input_resolve(): 41 | with pytest.raises(ValueError): 42 | _check_resolve_arg("fly") 43 | 44 | 45 | def test_label_prioritization(): 46 | # input identical datasets and overwrite everything with the relevant labels 47 | df_1 = create_composition(*input_files_1, order="riu") 48 | assert df_1["included"].value_counts()[1] == len(df_1) 49 | 50 | # input identical datasets and overwrite everything with the irrelevant labels 51 | df_2 = create_composition(*input_files_1, order="iru") 52 | assert df_2["included"].value_counts()[0] == len(df_2) 53 | 54 | # input identical datasets and overwrite everything as unlabeled 55 | df_3 = create_composition(*input_files_1, order="uri") 56 | assert df_3["included"].value_counts()[-1] == len(df_3) 57 | 58 | # input different datasets with some identical records, combining as labeled and 59 | # unlabeled data 60 | df_4 = create_composition(*input_files_2, order="riu") 61 | df_4_counts = df_4["included"].value_counts() 62 | assert df_4_counts[-1] == 7 and df_4_counts[0] == 3 and df_4_counts[1] == 1 63 | -------------------------------------------------------------------------------- /tests/test_dedup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from asreview.data import ASReviewData 4 | 5 | from asreviewcontrib.datatools.dedup import deduplicate_data 6 | 7 | test_dir = Path(__file__).parent 8 | file_without_doi = Path(test_dir, "demo_data", "duplicate_data_without_doi.csv") 9 | file_with_doi = Path(test_dir, "demo_data", "duplicate_data_with_doi.csv") 10 | 11 | 12 | def test_dedup_without_doi(): 13 | """ 14 | Test deduplication without DOI. 15 | 16 | The test data contains 5 records, 1 of which is an exact duplicate. 17 | 18 | Same as: 19 | 20 | asreview data dedup tests/demo_data/duplicate_data_without_doi.csv 21 | Not using doi for deduplication because there is no such data. 22 | Found 1 duplicates in dataset with 5 records. 
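    Note that deduplicate_data modifies the ASReviewData object in place (no
    return value is used), so the assertion below checks that 5 - 1 = 4
    records remain after deduplication.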
23 | """ 24 | data = ASReviewData.from_file(file_without_doi) 25 | deduplicate_data(data) 26 | assert len(data.df) == 4 27 | 28 | 29 | def test_output(tmpdir): 30 | data = ASReviewData.from_file(file_without_doi) 31 | output_path = Path(tmpdir, "test_dedup.csv") 32 | deduplicate_data(data, output_path=output_path) 33 | as_test = ASReviewData.from_file(output_path) 34 | assert len(data.df) == 4 35 | assert len(as_test.df) == 4 36 | 37 | 38 | def test_dedup_with_doi(): 39 | """ 40 | Test deduplication with DOI. 41 | 42 | The test data contains 5 records, 1 of which is an exact duplicate 43 | and 1 of which is a duplicate based on DOI. 44 | 45 | Same as: 46 | 47 | asreview data dedup tests/demo_data/duplicate_data_with_doi.csv 48 | Found 2 duplicates in dataset with 5 records. 49 | """ 50 | data = ASReviewData.from_file(file_with_doi) 51 | deduplicate_data(data) 52 | assert len(data.df) == 3 53 | 54 | 55 | def test_dedup_with_similarity_without_doi(): 56 | """ 57 | Test deduplication with similarity without DOI. 58 | 59 | The test data contains 5 records, 1 of which is an exact duplicate 60 | and 1 of which is a duplicate based on similarity. 61 | 62 | Same as: 63 | 64 | asreview data dedup tests/demo_data/duplicate_data_without_doi.csv --similar \ 65 | --threshold 0.95 66 | Not using doi for deduplication because there is no such data. 67 | Found 2 duplicates in dataset with 5 records. 68 | """ 69 | data = ASReviewData.from_file(file_without_doi) 70 | deduplicate_data(data, similar=True, threshold=0.95) 71 | assert len(data.df) == 3, "Original data should have 5 records." 72 | 73 | 74 | def test_dedup_with_similarity_with_doi(): 75 | """ 76 | Test deduplication with similarity with DOI. 77 | 78 | The test data contains 5 records, 1 of which is an exact duplicate, 79 | 1 of which is a duplicate based on DOI, and 1 of which is a duplicate 80 | based on similarity. 81 | 82 | Same as: 83 | 84 | asreview data dedup tests/demo_data/duplicate_data_with_doi.csv --similar \ 85 | --threshold 0.95 86 | Found 3 duplicates in dataset with 5 records. 87 | """ 88 | data = ASReviewData.from_file(file_with_doi) 89 | deduplicate_data(data, similar=True, threshold=0.95) 90 | assert len(data.df) == 2 91 | 92 | 93 | def test_dedup_with_similarity_without_doi_stopwords(): 94 | """ 95 | Test deduplication with similarity without DOI and removing stopwords. 96 | 97 | The test data contains 5 records, 1 of which is an exact duplicate, 98 | 1 of which is a duplicate based on similarity, and 1 of which is a 99 | duplicate based on similarity without stopwords. 100 | 101 | Same as: 102 | 103 | asreview data dedup tests/demo_data/duplicate_data_without_doi.csv --similar \ 104 | --threshold 0.95 --stopwords 105 | Not using doi for deduplication because there is no such data. 106 | Found 3 duplicates in dataset with 5 records. 107 | """ 108 | data = ASReviewData.from_file(file_without_doi) 109 | deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english") 110 | assert len(data.df) == 2 111 | 112 | 113 | def test_dedup_with_similarity_with_doi_stopwords(): 114 | """ 115 | Test deduplication with similarity with DOI and removing stopwords. 116 | 117 | The test data contains 5 records, 1 of which is an exact duplicate, 118 | 1 of which is a duplicate based on DOI, 1 of which is a duplicate 119 | based on similarity, and 1 of which is a duplicate based on similarity 120 | without stopwords. 
121 | 
122 |     Same as:
123 | 
124 |     asreview data dedup tests/demo_data/duplicate_data_with_doi.csv --similar \
125 |         --threshold 0.95 --stopwords
126 |     Found 4 duplicates in dataset with 5 records.
127 |     """
128 |     data = ASReviewData.from_file(file_with_doi)
129 |     deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english")
130 |     assert len(data.df) == 1
131 | 
132 | 
133 | def test_threshold_zero():
134 |     data = ASReviewData.from_file(file_with_doi)
135 |     deduplicate_data(data, similar=True, threshold=0)
136 |     assert len(data.df) == 1
137 | 
--------------------------------------------------------------------------------
/tests/test_describe.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | 
4 | def test_describe():
5 |     subprocess.run(["asreview", "data-describe", "benchmark:van_de_schoot2017"], check=True)
6 | 
--------------------------------------------------------------------------------
/tests/test_sample.py:
--------------------------------------------------------------------------------
1 | # Unit tests for sample.py.
2 | from pathlib import Path
3 | 
4 | import pandas as pd
5 | 
6 | from asreviewcontrib.datatools.sample import sample
7 | 
8 | INPUT_PATH = Path(__file__).parent / "demo_data" / "sample_data.csv"
9 | 
10 | 
11 | def test_sample(tmpdir):
12 |     sample(INPUT_PATH, tmpdir / "output.csv", 1, "publication_year")
13 |     df = pd.read_csv(tmpdir / "output.csv")
14 |     assert len(df) == 3
15 |     assert "publication_year" in df.columns
16 |     assert df.iloc[0]["publication_year"] == 2000
17 |     assert df.iloc[2]["publication_year"] == 2005
18 | 
--------------------------------------------------------------------------------
/tests/test_snowball.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | import pandas as pd
4 | import pyalex
5 | 
6 | from asreviewcontrib.datatools.snowball import backward_snowballing
7 | from asreviewcontrib.datatools.snowball import forward_snowballing
8 | from asreviewcontrib.datatools.snowball import openalex_from_doi
9 | from asreviewcontrib.datatools.snowball import snowball
10 | 
11 | INPUT_DIR = Path(__file__).parent / "demo_data"
12 | EMAIL = "asreview@uu.nl"
13 | 
14 | pyalex.config.email = EMAIL
15 | 
16 | # These works were chosen for testing forward snowballing. Each has a DOI,
17 | # cites other works, and is cited by other works. Their cited_by_count is
18 | # below 400, so collecting all citing works takes at most two requests, and
19 | # they are from the previous century, so the cited_by_count is unlikely to
20 | # change much. These are the same records as in the demo datasets
21 | # 'snowballing_doi.csv' and 'snowballing_openalex.csv'.
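# A minimal sketch for spot-checking these numbers by hand (assuming the
# pyalex Works interface; the counts can drift as OpenAlex updates its index):
#
#     from pyalex import Works
#     Works()["W2051970045"]["cited_by_count"]  # 372 at the time of writing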
22 | WORKS = [ 23 | { 24 | "id": "https://openalex.org/W2051970045", 25 | "doi": "https://doi.org/10.1071/bt9750475", 26 | "title": "Myrmecochorous plants in Australia and their dispersal by ants", 27 | "cited_by_count": 372, 28 | "cited_by": "https://openalex.org/W2174650845", 29 | "cites": "https://openalex.org/W1538725992", 30 | }, 31 | { 32 | "id": "https://openalex.org/W104454400", 33 | "doi": "https://doi.org/10.1007/bf00699039", 34 | "title": ( 35 | "Mimicking the one-dimensional marginal distributions of processes having" 36 | " an ito differential" 37 | ), 38 | "cited_by_count": 299, 39 | "cited_by": "https://openalex.org/W1842249978", 40 | "cites": "https://openalex.org/W1513091520", 41 | }, 42 | ] 43 | 44 | 45 | def test_openalex_from_doi(): 46 | dois = [ 47 | "https://doi.org/10.1042/cs20220150", 48 | "https://doi.org/10.1042/bst20220734", 49 | "not_a_doi", 50 | ] 51 | 52 | assert openalex_from_doi(dois) == { 53 | "10.1042/cs20220150": "https://openalex.org/W4386305682", 54 | "10.1042/bst20220734": "https://openalex.org/W4312006214", 55 | "not_a_doi": None, 56 | } 57 | 58 | 59 | def test_backward_snowballing(): 60 | identifiers = [ 61 | "https://openalex.org/W4281483266", 62 | "https://openalex.org/W2008620264", 63 | ] 64 | 65 | backwards_citations = backward_snowballing(identifiers) 66 | 67 | assert "https://openalex.org/W1864285629" in [ 68 | field_dict["id"] for field_dict in backwards_citations[identifiers[0]] 69 | ] 70 | assert "https://openalex.org/W950821216" in [ 71 | field_dict["id"] for field_dict in backwards_citations[identifiers[1]] 72 | ] 73 | 74 | 75 | def test_forward_snowballing(): 76 | identifiers = [work["id"] for work in WORKS] 77 | 78 | forwards_citations = forward_snowballing(identifiers) 79 | 80 | assert WORKS[0]["cited_by"] in [ 81 | field_dict["id"] for field_dict in forwards_citations[identifiers[0]] 82 | ] 83 | assert WORKS[1]["cited_by"] in [ 84 | field_dict["id"] for field_dict in forwards_citations[identifiers[1]] 85 | ] 86 | 87 | 88 | def test_openalex_id_forward(tmpdir): 89 | out_fp = Path(tmpdir, "forward.csv") 90 | snowball( 91 | input_path=INPUT_DIR / "snowballing_openalex.csv", 92 | output_path=out_fp, 93 | forward=True, 94 | backward=False, 95 | use_all=False, 96 | email=EMAIL, 97 | ) 98 | df = pd.read_csv(out_fp) 99 | assert len(df) >= 364 100 | 101 | all_out_fp = Path(tmpdir, "forward_all.csv") 102 | snowball( 103 | input_path=INPUT_DIR / "snowballing_openalex.csv", 104 | output_path=all_out_fp, 105 | forward=True, 106 | backward=False, 107 | use_all=True, 108 | email=EMAIL, 109 | ) 110 | df_all = pd.read_csv(all_out_fp) 111 | assert len(df_all) >= 656 112 | 113 | 114 | def test_openalex_id_backward(tmpdir): 115 | out_fp = Path(tmpdir, "backward.csv") 116 | snowball( 117 | input_path=INPUT_DIR / "snowballing_openalex.csv", 118 | output_path=out_fp, 119 | forward=False, 120 | backward=True, 121 | use_all=False, 122 | email=EMAIL, 123 | ) 124 | df = pd.read_csv(out_fp) 125 | # Actual value at time of writing test is 40 (2024-08-26). 126 | # In theory the number of results should be stable for backward snowballing, 127 | # but OpenAlex sometimes makes changes, so we allow for a margin. 128 | # The margins of this assert and the next assert should not overlap, 129 | # otherwise we don't test if 'use_all' works. 
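    # Note: use_all=False snowballs only from the records labeled as included
    # (1 of the 2 demo records); use_all=True also uses the remaining records,
    # which is why the second run below is expected to find more references.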
130 | assert 38 <= len(df) <= 42 131 | 132 | all_out_fp = Path(tmpdir, "backward_all.csv") 133 | snowball( 134 | input_path=INPUT_DIR / "snowballing_openalex.csv", 135 | output_path=all_out_fp, 136 | forward=False, 137 | backward=True, 138 | use_all=True, 139 | email=EMAIL, 140 | ) 141 | df_all = pd.read_csv(all_out_fp) 142 | # Actual value at time of writing test is 46 (2024-08-26). 143 | assert 43 <= len(df_all) <= 49 144 | 145 | 146 | def test_snowballing_from_doi(tmpdir): 147 | out_fp = Path(tmpdir, "doi_all.csv") 148 | snowball( 149 | input_path=INPUT_DIR / "snowballing_doi.csv", 150 | output_path=out_fp, 151 | forward=False, 152 | backward=True, 153 | use_all=True, 154 | email=EMAIL, 155 | ) 156 | df = pd.read_csv(out_fp) 157 | # Actual value at time of writing test is 46 (2024-08-26). 158 | # See comments in 'test_openalex_id_backward'. 159 | assert 43 <= len(df) <= 49 160 | -------------------------------------------------------------------------------- /tests/test_stack.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from asreview.data import ASReviewData 4 | 5 | from asreviewcontrib.datatools.stack import vstack 6 | 7 | test_dir = Path(__file__).parent 8 | file_1 = Path(test_dir, "demo_data", "dataset_1.ris") 9 | file_2 = Path(test_dir, "demo_data", "dataset_2.ris") 10 | 11 | 12 | def test_stack(tmpdir): 13 | output_path = Path(tmpdir, "test_output.ris") 14 | vstack(output_path, [file_1, file_2]) 15 | as_test = ASReviewData.from_file(output_path) 16 | 17 | assert len(as_test.df) == 14 18 | assert as_test.df["included"].value_counts()[-1] == 9 19 | assert as_test.df["included"].value_counts()[0] == 3 20 | assert as_test.df["included"].value_counts()[1] == 2 21 | --------------------------------------------------------------------------------