├── .devcontainer └── devcontainer.json ├── .editorconfig ├── .gitattributes ├── .gitignore ├── .vscode ├── launch.json └── settings.json ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── images └── pmc_15m_pipeline.jpg ├── pmc15_pipeline ├── __init__.py ├── constants.py ├── data.py ├── types.py └── utils │ ├── __init__.py │ └── fs_utils.py ├── requirements.txt ├── run_pmc15_pipeline.ipynb └── tox.ini /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/python 3 | { 4 | "name": "Python 3", 5 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye" 7 | 8 | // Features to add to the dev container. More info: https://containers.dev/features. 9 | // "features": {}, 10 | 11 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 12 | // "forwardPorts": [], 13 | 14 | // Use 'postCreateCommand' to run commands after the container is created. 15 | // "postCreateCommand": "pip3 install --user -r requirements.txt", 16 | 17 | // Configure tool-specific properties. 18 | // "customizations": {}, 19 | 20 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
21 | // "remoteUser": "root" 22 | } 23 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | # Unix-style newlines with a newline ending every file 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | charset = utf-8 9 | 10 | [*.py] 11 | max_line_length = 79 12 | 13 | # 4 space indentation 14 | [*.{py,json}] 15 | indent_style = space 16 | indent_size = 4 17 | 18 | # 2 space indentation 19 | [*.{md,sh,yaml,yml}] 20 | indent_style = space 21 | indent_size = 2 22 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # https://git-scm.com/docs/gitattributes 2 | 3 | # Set the default behavior, in case people don't have core.autocrlf set. 4 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion 5 | * text=auto 6 | 7 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | *.pyz text diff=python 16 | *.pyi text diff=python 17 | 18 | # Binary files 19 | # ============ 20 | *.db binary 21 | *.p binary 22 | *.pkl binary 23 | *.pickle binary 24 | *.pyc binary export-ignore 25 | *.pyo binary export-ignore 26 | *.pyd binary 27 | 28 | # Jupyter notebook 29 | *.ipynb text eol=lf 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _results/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | 
# Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | docs/source/generated 27 | .pytest_cache 28 | 29 | # Jupyter Notebook 30 | .ipynb_checkpoints 31 | 32 | # virtualenv 33 | .venv 34 | venv/ 35 | ENV/ 36 | 37 | # Experimental Folder 38 | experimental/* 39 | 40 | # Weights and Biases logs 41 | wandb/ 42 | 43 | # Debug results dir 44 | _results/* 45 | 46 | # PyTorch Checkpoints 47 | *.pt 48 | 49 | # YOLO output 50 | runs/ 51 | 52 | # SAM output 53 | *.jpg 54 | 55 | # MLFlow output 56 | mlruns/ 57 | 58 | # Output from unit tests 59 | expansion_pipeline/code/tests/test_results/* 60 | 61 | # Environment Variables 62 | .env* 63 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "UI: Caption Splitter", 9 | "type": "python", 10 | "request": "launch", 11 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 12 | "module": "components.caption_splitter.caption_splitter_prefilter_ui", 13 | "console": "integratedTerminal", 14 | "justMyCode": false 15 | }, 16 | { 17 | "name": "UI: Citance Adder", 18 | "type": "python", 19 | "request": "launch", 20 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 21 | "module": "components.citance_adder.citance_adder_ui", 22 | "console": "integratedTerminal", 23 | "justMyCode": false 24 | }, 25 | { 26 | "name": "UI: Text Label Matcher", 27 | "type": "python", 28 | "request": "launch", 29 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 30 | "module": "components.text_label_matcher.text_label_matcher_ui", 31 | "console": "integratedTerminal", 32 | "justMyCode": false 33 | }, 34 | { 35 | "name": "UI: Panel Matcher", 36 | "type": "python", 37 | "request": "launch", 38 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 39 | "module": "components.panel_matcher.panel_matcher_ui", 40 | "console": "integratedTerminal", 41 | "justMyCode": false 42 | }, 43 | { 44 | "name": "UI: Data Merger", 45 | "type": "python", 46 | "request": "launch", 47 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 48 | "module": "components.data_merger.data_merger_ui", 49 | "console": "integratedTerminal", 50 | "justMyCode": false 51 | }, 52 | { 53 | "name": "UI: Data Expander", 54 | "type": "python", 55 | "request": "launch", 56 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 57 | "module": "components.data_expander.data_expander_ui", 58 | "console": "integratedTerminal", 59 | "justMyCode": false 60 | }, 61 | { 62 | "name": "UI: Component Comparitor", 63 | "type": "python", 64 | "request": "launch", 65 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 66 | "module": 
"components.component_comparator.component_comparator_ui", 67 | "console": "integratedTerminal", 68 | "justMyCode": false 69 | }, 70 | { 71 | "name": "Local Run: Caption Splitter", 72 | "type": "python", 73 | "request": "launch", 74 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 75 | "module": "components.caption_splitter.caption_splitter_prefilter_runner", 76 | "console": "integratedTerminal", 77 | "justMyCode": false 78 | }, 79 | { 80 | "name": "Local Run: Citance Adder", 81 | "type": "python", 82 | "request": "launch", 83 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 84 | "module": "components.citance_adder.citance_adder_runner", 85 | "console": "integratedTerminal", 86 | "justMyCode": false 87 | }, 88 | { 89 | "name": "Local Run: Text Label Matcher Input", 90 | "type": "python", 91 | "request": "launch", 92 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 93 | "module": "components.text_label_matcher_input.text_label_matcher_input_runner", 94 | "console": "integratedTerminal", 95 | "justMyCode": false 96 | }, 97 | { 98 | "name": "Local Run: Text Label Matcher", 99 | "type": "python", 100 | "request": "launch", 101 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 102 | "module": "components.text_label_matcher.text_label_matcher_runner", 103 | "console": "integratedTerminal", 104 | "justMyCode": false 105 | }, 106 | { 107 | "name": "Local Run: Panel Matcher Input", 108 | "type": "python", 109 | "request": "launch", 110 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 111 | "module": "components.panel_matcher_input.panel_matcher_input_runner", 112 | "console": "integratedTerminal", 113 | "justMyCode": false 114 | }, 115 | { 116 | "name": "Local Run: Panel Matcher", 117 | "type": "python", 118 | "request": "launch", 119 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 120 | "module": "components.panel_matcher.panel_matcher_runner", 121 | "console": "integratedTerminal", 122 | "justMyCode": false 123 | }, 124 | { 125 | 
"name": "Local Run: Data Expander", 126 | "type": "python", 127 | "request": "launch", 128 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 129 | "module": "components.data_expander.data_expander_runner", 130 | "console": "integratedTerminal", 131 | "justMyCode": false 132 | }, 133 | { 134 | "name": "Local Run: Data Merger", 135 | "type": "python", 136 | "request": "launch", 137 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 138 | "module": "components.data_merger.data_merger_runner", 139 | "console": "integratedTerminal", 140 | "justMyCode": false 141 | }, 142 | { 143 | "name": "Local Run: Analyze Data", 144 | "type": "python", 145 | "request": "launch", 146 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 147 | "module": "shared.analyze_pipeline_results", 148 | "console": "integratedTerminal", 149 | "justMyCode": false 150 | }, 151 | { 152 | "name": "Python: Text Label Matcher", 153 | "type": "python", 154 | "request": "launch", 155 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 156 | "module": "components.text_label_matcher.cli_runner", 157 | "console": "integratedTerminal", 158 | "justMyCode": false, 159 | "args": [ 160 | "--input_folder_path", 161 | "${workspaceFolder}/_results/3.text_label_matcher_input", 162 | "--output_folder_path", 163 | "${workspaceFolder}/_results/3.text_label_matcher" 164 | ], 165 | "env": { 166 | "LOGLEVEL": "DEBUG" 167 | } 168 | }, 169 | { 170 | "name": "Python: OpenCV Panel Split", 171 | "type": "python", 172 | "request": "launch", 173 | "module": "segmentation.opencv_panel_split", 174 | "console": "integratedTerminal", 175 | "justMyCode": false, 176 | "args": [ 177 | "--input_image_folder_path", 178 | "${workspaceFolder}/_results/sample_multi_panel_images" 179 | ], 180 | }, 181 | { 182 | "name": "Python: Caption Splitter", 183 | "type": "python", 184 | "request": "launch", 185 | "module": "text_caption_segmentation.segment_captions", 186 | "console": "integratedTerminal", 187 | "justMyCode": false 188 | 
}, 189 | { 190 | "name": "Python: Download Pub Med Articles", 191 | "type": "python", 192 | "request": "launch", 193 | "module": "data.download_pubmed_files", 194 | "console": "integratedTerminal", 195 | "justMyCode": false, 196 | "args": [ 197 | ], 198 | }, 199 | { 200 | "name": "Python: Decompress Pub Med Articles", 201 | "type": "python", 202 | "request": "launch", 203 | "module": "data.decompress_pubmed_files", 204 | "console": "integratedTerminal", 205 | "justMyCode": false, 206 | "args": [ 207 | ], 208 | }, 209 | { 210 | "name": "Python: Optical Character Recognition (v3.2)", 211 | "type": "python", 212 | "request": "launch", 213 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 214 | "module": "components.image_label_detector.aml_component_cli_runner", 215 | "console": "integratedTerminal", 216 | "justMyCode": false, 217 | "args": [ 218 | "--input_folder_path", 219 | "${workspaceFolder}/_results/image_label_detector/input", 220 | "--output_folder_path", 221 | "${workspaceFolder}/_results/image_label_detector/output", 222 | "--batch_size", 223 | "2", 224 | "--max_images", 225 | "5", 226 | // "--log_figure_texts", 227 | // "True", 228 | ], 229 | "envFile": "${workspaceFolder}/.env", 230 | "env": { 231 | "AZURE_VISION_ENDPOINT": "https://westus2.api.cognitive.microsoft.com/", 232 | "LOGLEVEL": "DEBUG" 233 | } 234 | }, 235 | { 236 | "name": "Python: Image Analysis (v4.0)", 237 | "type": "python", 238 | "request": "launch", 239 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 240 | "module": "components.image_label_detector.aml_component_cli_runner", 241 | "console": "integratedTerminal", 242 | "justMyCode": false, 243 | "args": [ 244 | "--input_folder_path", 245 | "${workspaceFolder}/_results/2.1.caption_splitter/2023-07-24T18-04/success", 246 | "--output_folder_path", 247 | "${workspaceFolder}/_results/image_label_detector/output", 248 | "--max_images", 249 | "5", 250 | // "--log_figure_texts", 251 | // "True", 252 | "--use_version_4", 253 | "True", 
254 | ], 255 | "envFile": "${workspaceFolder}/.env", 256 | "env": { 257 | "AZURE_VISION_ENDPOINT": "https://westus2.api.cognitive.microsoft.com/", 258 | "LOGLEVEL": "DEBUG" 259 | } 260 | }, 261 | { 262 | "name": "Python: Image Label Comparer", 263 | "type": "python", 264 | "request": "launch", 265 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 266 | "module": "components.image_label_detector.figure_texts_comparer", 267 | "console": "integratedTerminal", 268 | "justMyCode": false, 269 | "args": [ 270 | "--figures_file_path", 271 | "${workspaceFolder}/_results/image_label_detector/input/figures.jsonl", 272 | "--figure_texts_objects_a_file_path", 273 | "${workspaceFolder}/_results/image_label_detector/output/20230717.190855_v3.2/success/figure_texts.jsonl", 274 | // "${workspaceFolder}/_results/image_label_detector/output/20230713.164600/output.jsonl", 275 | "--figure_texts_objects_b_file_path", 276 | "${workspaceFolder}/_results/image_label_detector/output/20230717.190918_v4.0/success/figure_texts.jsonl", 277 | // "${workspaceFolder}/_results/image_label_detector/output/20230718.174032_v4.0/success/figure_texts.jsonl", 278 | "--output_folder_path", 279 | "${workspaceFolder}/_results/image_label_detector/output", 280 | ], 281 | "envFile": "${workspaceFolder}/.env", 282 | "env": { 283 | "LOGLEVEL": "DEBUG" 284 | } 285 | }, 286 | { 287 | "name": "Python: Figure Splitter", 288 | "type": "python", 289 | "request": "launch", 290 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 291 | "module": "components.figure_splitter.aml_component_cli_runner", 292 | "console": "integratedTerminal", 293 | "justMyCode": false, 294 | "args": [ 295 | "--input_folder_path", 296 | "${workspaceFolder}/_results/2.1.caption_splitter/2023-07-24T18-04/success", 297 | "--output_folder_path", 298 | "${workspaceFolder}/_results/figure_splitter/output/", 299 | "--max_images", 300 | "5", 301 | ], 302 | "env": { 303 | "LOGLEVEL": "DEBUG" 304 | } 305 | }, 306 | { 307 | "name": "Python: Data 
Expander CLI Runner", 308 | "type": "python", 309 | "request": "launch", 310 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 311 | "module": "components.data_expander.data_expander_cli_runner", 312 | "console": "integratedTerminal", 313 | "justMyCode": false, 314 | "args": [ 315 | "--input_folder_path", 316 | "${workspaceFolder}/_results/4.panel_matcher/2023-07-26T16-41/success", 317 | "--output_folder_path", 318 | "${workspaceFolder}/_results/5.data_expander", 319 | "--output_dbfs_paths", 320 | "True", 321 | "--max_figures", 322 | "5", 323 | ], 324 | "envFile": "${workspaceFolder}/.env", 325 | "env": { 326 | "LOGLEVEL": "DEBUG" 327 | } 328 | }, 329 | ] 330 | } 331 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "autoDocstring.docstringFormat": "google", 3 | "python.analysis.autoImportCompletions": true, 4 | "python.analysis.completeFunctionParens": true, 5 | "python.linting.flake8Enabled": true, 6 | "python.testing.pytestEnabled": true, 7 | "python.testing.pytestArgs": [ 8 | "expansion_pipeline/code" 9 | ], 10 | "python.testing.unittestEnabled": false, 11 | "python.analysis.typeCheckingMode": "basic" 12 | } 13 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BiomedCLIP Data Pipeline 2 | 3 | [![Code License](https://img.shields.io/badge/Code%20License-MIT%20License-red)](LICENSE) 4 | 5 | *A pipeline to construct millions of image-caption figures from PubMed.* 6 | 7 | [[NEJM AI Article](https://ai.nejm.org/stoken/default+domain/9VPKUGJYJ5BPFXY83IBS/full?redirectUri=doi/full/10.1056/AIoa2400640)] 8 | 9 | **BiomedCLIP: a multimodal biomedical foundation model pretrained from fifteen million scientific image-text pairs**
10 | 11 | Sheng Zhang, Yanbo Xu, Naoto Usuyama, Hanwen Xu, Jaspreet Bagga, Robert Tinn, Sam Preston, Rajesh Rao, Mu Wei, Naveen Valluri, Cliff Wong, Andrea Tupini, Yu Wang, Matt Mazzola, Swadheen Shukla, Lars Liden, Jianfeng Gao, Angela Crabtree, Brian Piening, Carlo Bifulco, Matthew P. Lungren, Tristan Naumann, Sheng Wang, Hoifung Poon 12 | 13 |

14 | <img src="images/pmc_15m_pipeline.jpg" alt="PMC-15M data pipeline">
15 | </p>
16 | 17 | 18 | This repository hosts the **BiomedCLIP Data Pipeline**, which automatically downloads and processes a set of articles from the PubMed Central Open Access dataset. The end result is a JSONL file containing figures and associated captions, which can be used to train the **BiomedCLIP** model. 19 | 20 | For a hands-on demonstration, refer to the [example notebook](run_pmc15_pipeline.ipynb). 21 | 22 | 23 | ## Environment Setup 24 | 25 | ```bash 26 | # it is recommended to use a virtual environment but not required 27 | python -m venv .venv 28 | source .venv/bin/activate 29 | 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | ## Reference 34 | ```bibtex 35 | @article{zhang2024biomedclip, 36 | title={A Multimodal Biomedical Foundation Model Trained from Fifteen Million Image–Text Pairs}, 37 | author={Sheng Zhang and Yanbo Xu and Naoto Usuyama and Hanwen Xu and Jaspreet Bagga and Robert Tinn and Sam Preston and Rajesh Rao and Mu Wei and Naveen Valluri and Cliff Wong and Andrea Tupini and Yu Wang and Matt Mazzola and Swadheen Shukla and Lars Liden and Jianfeng Gao and Angela Crabtree and Brian Piening and Carlo Bifulco and Matthew P. Lungren and Tristan Naumann and Sheng Wang and Hoifung Poon}, 38 | journal={NEJM AI}, 39 | year={2024}, 40 | volume={2}, 41 | number={1}, 42 | doi={10.1056/AIoa2400640}, 43 | url={https://ai.nejm.org/doi/full/10.1056/AIoa2400640} 44 | } 45 | ``` 46 | 47 | ## Contributing 48 | 49 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 50 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 51 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 52 | 53 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 54 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 55 | provided by the bot.
You will only need to do this once across all repos using our CLA. 56 | 57 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 58 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 59 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 60 | 61 | ## Trademarks 62 | 63 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 64 | trademarks or logos is subject to and must follow 65 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 66 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 67 | Any use of third-party trademarks or logos are subject to those third-party's policies. 68 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 
26 | -------------------------------------------------------------------------------- /images/pmc_15m_pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/BiomedCLIP_data_pipeline/686a465267787aba96b6eca8cd9ee1dede28c2c2/images/pmc_15m_pipeline.jpg -------------------------------------------------------------------------------- /pmc15_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/BiomedCLIP_data_pipeline/686a465267787aba96b6eca8cd9ee1dede28c2c2/pmc15_pipeline/__init__.py -------------------------------------------------------------------------------- /pmc15_pipeline/constants.py: -------------------------------------------------------------------------------- 1 | 2 | PUBMED_OPEN_ACCESS_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/" 3 | PUBMED_OPEN_ACCESS_FILE_LIST_URL = PUBMED_OPEN_ACCESS_BASE_URL + "oa_file_list.txt" 4 | -------------------------------------------------------------------------------- /pmc15_pipeline/data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tarfile 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | import pubmed_parser 7 | import requests 8 | from lxml import etree 9 | from tqdm import tqdm 10 | 11 | from .constants import PUBMED_OPEN_ACCESS_BASE_URL, PUBMED_OPEN_ACCESS_FILE_LIST_URL 12 | from .types import PubMedFile 13 | from .utils import fs_utils 14 | 15 | repo_root = fs_utils.get_repo_root_path() 16 | 17 | 18 | def download_pubmed_file_list( 19 | url=PUBMED_OPEN_ACCESS_FILE_LIST_URL, 20 | output_file_path: Path = ( 21 | repo_root / "_results" / "data" / "pubmed_open_access_file_list.txt" 22 | ), 23 | ): 24 | # Ensure output directory exists 25 | Path(output_file_path).parent.mkdir(parents=True, exist_ok=True) 26 | 27 | # Download file 28 | print(f"Downloading 
OpenAccess file list from: {url} to {output_file_path}") 29 | 30 | if Path(output_file_path).exists(): 31 | print(f"File already exists: {output_file_path}") 32 | return 33 | 34 | response = requests.get(url) 35 | 36 | with open(output_file_path, "wb") as file: 37 | file.write(response.content) 38 | 39 | print(f"Saved to: {output_file_path}") 40 | 41 | 42 | def download_pubmed_files_from_list( 43 | file_list_path: Path = ( 44 | repo_root / "_results/data/pubmed_open_access_file_list.txt" 45 | ), 46 | output_folder_path: Path = ( 47 | repo_root / "_results" / "data" / "pubmed_open_access_files_compressed" 48 | ), 49 | subset_size: Optional[int] = None, 50 | file_extension=".tar.gz", 51 | ): 52 | """Download files from PubMed Open Access file list 53 | 54 | Args: 55 | file_list_path (Path, optional): Path to PubMed Open Access Files list. Defaults to "_results/data/pubmed_open_access_file_list_top_100.txt". 56 | output_folder_path (Path, optional): Path to save directory. Defaults to repo_root/"_results"/"data"/"pubmed_open_access_files_compressed". 57 | subset_size (int, optional): Number of files to download. Defaults to None (download all files). 
58 | 59 | Example: 60 | 61 | python3 -m data.download_pubmed_files 62 | """ 63 | 64 | # Get dicts from files list 65 | pubmed_files: list[PubMedFile] = [] 66 | 67 | with open(file_list_path, "r") as file: 68 | lines = file.readlines() 69 | 70 | # Skip header 71 | lines = lines[1:] 72 | 73 | for line_idx, line in enumerate(lines): 74 | if subset_size and line_idx + 1 > subset_size: 75 | break 76 | 77 | [path, title, pmcid, pmid, code] = line.strip().split("\t") 78 | pubmed_file: PubMedFile = { 79 | "path": path, 80 | "title": title, 81 | "pmcid": pmcid, 82 | "pmid": pmid, 83 | "code": code, 84 | } 85 | pubmed_files.append(pubmed_file) 86 | 87 | # Create output folder 88 | output_folder_path.mkdir(parents=True, exist_ok=True) 89 | skipped_files = [] 90 | 91 | def _get_file_size(url): 92 | response = requests.head(url) 93 | if "Content-Length" in response.headers: 94 | return int(response.headers["Content-Length"]) 95 | else: 96 | return None 97 | 98 | for pubmed_file in tqdm(pubmed_files): 99 | file_name = pubmed_file["pmcid"] + file_extension 100 | file_path = output_folder_path / file_name 101 | 102 | # Check if the file already exists 103 | if file_path.exists(): 104 | tqdm.write(f"File: {file_name} already exists. Not downloading again.") 105 | continue 106 | 107 | article_url = PUBMED_OPEN_ACCESS_BASE_URL + pubmed_file["path"] 108 | file_size = _get_file_size(article_url) 109 | 110 | if file_size is not None: 111 | tqdm.write(f"File: {file_name} size: {file_size} bytes") 112 | try: 113 | response = requests.get(article_url) 114 | response.raise_for_status() # Raise an HTTPError for bad responses 115 | 116 | with open(file_path, "wb") as file: 117 | file.write(response.content) 118 | 119 | except requests.exceptions.RequestException as e: 120 | tqdm.write(f"File: {file_name} Skipped! 
Error occurred: {e}") 121 | skipped_files.append(pubmed_file) 122 | 123 | with open(file_path, "wb") as file: 124 | file.write(response.content) 125 | 126 | else: 127 | tqdm.write(f"File: {file_name} Skipped! Could not get file size!") 128 | skipped_files.append(pubmed_file) 129 | 130 | print(f"Skipped {len(skipped_files)} files.") 131 | 132 | 133 | def decompress_pubmed_files( 134 | input_folder_path: Path = ( 135 | repo_root / "_results" / "data" / "pubmed_open_access_files_compressed" 136 | ), 137 | output_folder_path: Path = ( 138 | repo_root / "_results" / "data" / "pubmed_open_access_files" 139 | ), 140 | file_extension="*.tar.gz", 141 | ): 142 | """Decompress article files from PubMed Open Access folder 143 | 144 | Args: 145 | input_folder_path (Path, optional): _description_. Defaults to repo_root/"_results"/"data"/"pubmed_open_access_files_compressed". 146 | output_folder_path (Path, optional): _description_. Defaults to repo_root/"_results"/"data"/"pubmed_open_access_files". 147 | file_extension (str, optional): _description_. Defaults to ".tar.gz". 148 | 149 | Example: 150 | 151 | python3 -m data.decompress_pubmed_files 152 | """ 153 | 154 | # Get dicts from files list 155 | file_paths = list(input_folder_path.glob(file_extension)) 156 | 157 | print( 158 | f"Found {len(file_paths)} files that match {file_extension} in {input_folder_path}" 159 | ) 160 | 161 | for file_path in tqdm(file_paths): 162 | with tarfile.open(file_path, "r:gz") as tar_file: 163 | # TODO: Use article folder path instead of output folder path? 
def generate_pmc15_pipeline_outputs(
    decompressed_folder: Path = (
        repo_root / "_results" / "data" / "pubmed_open_access_files"
    ),
    output_file_path: Path = (
        repo_root / "_results" / "data" / "pubmed_parsed_data.json"
    ),
):
    """Parse decompressed PubMed articles into a JSON-lines output file.

    Walks ``decompressed_folder`` recursively for ``*.nxml`` article files,
    extracts figure/caption metadata with ``pubmed_parser``, and appends one
    JSON object per article to ``output_file_path`` (one object per line,
    i.e. JSONL, despite the ``.json`` default suffix).

    Args:
        decompressed_folder (Path, optional): Root folder containing the
            extracted article packages.
        output_file_path (Path, optional): Destination file; truncated and
            rewritten on every call.
    """

    # input - path to .nxml file for each article in the article package
    # output - json object with pmid, pmc id, location (path to article
    # package in storage blobs), figures - list of figure objects which
    # include inline references (mentions of figure throughout the article),
    # caption for the figure, id, label, graphic_ref (filepath to figure jpg
    # in storage blobs), pair_id (a unique id to identify each figure in the
    # article, using pmid + figure_id)
    def parse_single_pubmed_file(nxml_path: Path):
        """Parse one .nxml article; return [article_dict] or [] on failure."""
        print(nxml_path)

        if nxml_path is None or not nxml_path.exists():
            print("error")
            return []

        try:
            print("starting...")
            output = pubmed_parser.parse_pubmed_caption(str(nxml_path.absolute()))
            print("parsed", nxml_path)
        except AttributeError as ae:
            print("Attribute Error: " + str(ae) + " path: " + str(nxml_path))
            return []
        except etree.XMLSyntaxError as xmle:
            print("XML Syntax Error: " + str(xmle) + " path: " + str(nxml_path))
            return []
        except Exception as e:
            # Best-effort pipeline: skip any article that fails to parse.
            print("Exception: " + str(e) + " path: " + str(nxml_path))
            return []

        if not output:
            print("no output")
            return []

        else:
            figures = []
            pmid = output[0]["pmid"]  # same for all figures in the article
            pmc = output[0]["pmc"]  # same for all figures in the article
            location = Path(nxml_path).parent

            # for all figures in the article, create a figure object with
            # inline references (text, section, reference_id), and caption,
            # id, label, graphic_ref, pair_id
            for figure_dict in output:
                inline_references = figure_dict.get(
                    "fig_refs", {}
                )  # from pubmed parser
                ir_objects = []
                for inline_reference in inline_references:
                    inline_reference_object = {
                        "text": str(inline_reference.get("text", "")),
                        "section": str(inline_reference.get("section", "")),
                        "reference_id": str(inline_reference.get("reference_id", "")),
                    }

                    ir_objects.append(inline_reference_object)

                # NOTE(review): deliberately aborts the whole run when an
                # article actually has inline references — confirm this guard
                # is still wanted before processing the full corpus.
                if len(ir_objects) > 0:
                    raise NotImplementedError("Inline references not implemented")

                figure_object = {
                    "fig_caption": str(figure_dict.get("fig_caption", "")),
                    "fig_id": str(figure_dict.get("fig_id", "")),
                    "fig_label": str(figure_dict.get("fig_label", "")),
                    "graphic_ref": (
                        str(location / (figure_dict["graphic_ref"] + ".jpg"))
                        if "graphic_ref" in figure_dict
                        else ""
                    ),  # set this to the path of the jpg image in storage blobs
                    "pair_id": str(pmid) + "_" + str(figure_dict.get("fig_id", "")),
                    "inline_references": ir_objects,  # add inline references
                }

                figures.append(figure_object)

            article = {
                "pmid": pmid,
                "pmc": pmc,
                "location": str(location),
                "figures": figures,
            }

            return [article]

    # Explicit counter: the previous version read the loop variable after the
    # loop, which raised NameError when no .nxml files were found.
    processed_count = 0

    with output_file_path.open("w+") as f:
        for nxml_file in decompressed_folder.rglob("*.nxml"):
            parsed = parse_single_pubmed_file(nxml_file)

            for article in parsed:
                for figure in article["figures"]:
                    # remove inline references since we're not using them
                    figure.pop("inline_references")

                f.write(json.dumps(article) + "\n")

            processed_count += 1

    print(f"Processed {processed_count} files")
from typing import TypedDict


# One parsed row of the PMC Open Access file list (tab-separated columns).
# Functional TypedDict form: identical runtime and typing behavior to the
# equivalent class-based declaration.
PubMedFile = TypedDict(
    "PubMedFile",
    {
        "path": str,   # archive path column from the file list
        "title": str,  # article citation/title column
        "pmcid": str,  # PMC accession id column
        "pmid": str,   # PubMed id column
        "code": str,   # final column — presumably the license code; TODO confirm
    },
)
@functools.cache
def get_repo_root_path(known_root_foldername=".vscode"):
    """Return the repository root directory.

    Walks up the ancestors of this file and returns the first one that
    contains a folder named ``known_root_foldername``. The result is cached,
    so the filesystem is probed at most once per marker name.

    Args:
        known_root_foldername (str, optional): Name of a folder that marks
            the repo root. Defaults to ".vscode".

    Raises:
        ValueError: If no ancestor (up to the filesystem root) contains the
            marker folder.
    """
    for parent in Path(__file__).parents:
        if (parent / known_root_foldername).exists():
            return parent

    # check we're not at the root of the drive `/`
    raise ValueError(
        f"Repo root could not be found! Did not find `{known_root_foldername}`"
        f" as child of any folders in path {Path(__file__)}"
    )


def get_line_count(file_path: Path) -> int:
    """Return the number of lines in a UTF-8 text file.

    Uses a context manager so the file handle is always closed; the previous
    version left closing to garbage collection (a handle leak).
    """
    with open(file_path, "r", encoding="utf8") as f:
        return sum(1 for _ in f)
Where the figure is referenced in the article\",\n", 27 | " \"graphic_ref\": \"GRAPHIC_REFERENCE_PATH: path to where the imape is stored on disk\",\n", 28 | " \"pair_id\": \"PAIR_ID: {pmid}_{fig_id}\",\n", 29 | " },\n", 30 | " ]\n", 31 | "}\n", 32 | "```" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# this controls how many articles will be downloaded and processed. Set to `None` to process all articles in the PMCOA list\n", 42 | "MAX_ITEMS_TO_PROCESS = 100" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from pmc15_pipeline import data\n", 52 | "from pmc15_pipeline.utils import fs_utils" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "repo_root = fs_utils.get_repo_root_path()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "Downloading OpenAccess file list from: https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt to /workspaces/biomedclip_data_pipeline/_results/data/pubmed_open_access_file_list.txt\n", 74 | "Saved to: /workspaces/biomedclip_data_pipeline/_results/data/pubmed_open_access_file_list.txt\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "list_output_path = repo_root / \"_results\" / \"data\" / \"pubmed_open_access_file_list.txt\"\n", 80 | "\n", 81 | "data.download_pubmed_file_list(\n", 82 | " output_file_path=list_output_path,\n", 83 | ")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stderr", 93 | "output_type": "stream", 94 | "text": [ 95 | " 0%| | 0/100 [00:00