├── .devcontainer └── devcontainer.json ├── .editorconfig ├── .gitattributes ├── .gitignore ├── .vscode ├── launch.json └── settings.json ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── images └── pmc_15m_pipeline.jpg ├── pmc15_pipeline ├── __init__.py ├── constants.py ├── data.py ├── types.py └── utils │ ├── __init__.py │ └── fs_utils.py ├── requirements.txt ├── run_pmc15_pipeline.ipynb └── tox.ini /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/python 3 | { 4 | "name": "Python 3", 5 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye" 7 | 8 | // Features to add to the dev container. More info: https://containers.dev/features. 9 | // "features": {}, 10 | 11 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 12 | // "forwardPorts": [], 13 | 14 | // Use 'postCreateCommand' to run commands after the container is created. 15 | // "postCreateCommand": "pip3 install --user -r requirements.txt", 16 | 17 | // Configure tool-specific properties. 18 | // "customizations": {}, 19 | 20 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
21 | // "remoteUser": "root" 22 | } 23 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | # Unix-style newlines with a newline ending every file 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | charset = utf-8 9 | 10 | [*.py] 11 | max_line_length = 79 12 | 13 | # 4 space indentation 14 | [*.{py,json}] 15 | indent_style = space 16 | indent_size = 4 17 | 18 | # 2 space indentation 19 | [*.{md,sh,yaml,yml}] 20 | indent_style = space 21 | indent_size = 2 22 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # https://git-scm.com/docs/gitattributes 2 | 3 | # Set the default behavior, in case people don't have core.autocrlf set. 4 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion 5 | * text=auto 6 | 7 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | *.pyz text diff=python 16 | *.pyi text diff=python 17 | 18 | # Binary files 19 | # ============ 20 | *.db binary 21 | *.p binary 22 | *.pkl binary 23 | *.pickle binary 24 | *.pyc binary export-ignore 25 | *.pyo binary export-ignore 26 | *.pyd binary 27 | 28 | # Jupyter notebook 29 | *.ipynb text eol=lf 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _results/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | 
# Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | docs/source/generated 27 | .pytest_cache 28 | 29 | # Jupyter Notebook 30 | .ipynb_checkpoints 31 | 32 | # virtualenv 33 | .venv 34 | venv/ 35 | ENV/ 36 | 37 | # Experimental Folder 38 | experimental/* 39 | 40 | # Weights and Biases logs 41 | wandb/ 42 | 43 | # Debug results dir 44 | _results/* 45 | 46 | # PyTorch Checkpoints 47 | *.pt 48 | 49 | # YOLO output 50 | runs/ 51 | 52 | # SAM output 53 | *.jpg 54 | 55 | # MLFlow output 56 | mlruns/ 57 | 58 | # Output from unit tests 59 | expansion_pipeline/code/tests/test_results/* 60 | 61 | # Environment Variables 62 | .env* 63 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "UI: Caption Splitter", 9 | "type": "python", 10 | "request": "launch", 11 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 12 | "module": "components.caption_splitter.caption_splitter_prefilter_ui", 13 | "console": "integratedTerminal", 14 | "justMyCode": false 15 | }, 16 | { 17 | "name": "UI: Citance Adder", 18 | "type": "python", 19 | "request": "launch", 20 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 21 | "module": "components.citance_adder.citance_adder_ui", 22 | "console": "integratedTerminal", 23 | "justMyCode": false 24 | }, 25 | { 26 | "name": "UI: Text Label Matcher", 27 | "type": "python", 28 | "request": "launch", 29 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 30 | "module": "components.text_label_matcher.text_label_matcher_ui", 31 | "console": "integratedTerminal", 32 | "justMyCode": false 33 | }, 34 | { 35 | "name": "UI: Panel Matcher", 36 | "type": "python", 37 | "request": "launch", 38 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 39 | "module": "components.panel_matcher.panel_matcher_ui", 40 | "console": "integratedTerminal", 41 | "justMyCode": false 42 | }, 43 | { 44 | "name": "UI: Data Merger", 45 | "type": "python", 46 | "request": "launch", 47 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 48 | "module": "components.data_merger.data_merger_ui", 49 | "console": "integratedTerminal", 50 | "justMyCode": false 51 | }, 52 | { 53 | "name": "UI: Data Expander", 54 | "type": "python", 55 | "request": "launch", 56 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 57 | "module": "components.data_expander.data_expander_ui", 58 | "console": "integratedTerminal", 59 | "justMyCode": false 60 | }, 61 | { 62 | "name": "UI: Component Comparitor", 63 | "type": "python", 64 | "request": "launch", 65 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 66 | "module": 
"components.component_comparator.component_comparator_ui", 67 | "console": "integratedTerminal", 68 | "justMyCode": false 69 | }, 70 | { 71 | "name": "Local Run: Caption Splitter", 72 | "type": "python", 73 | "request": "launch", 74 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 75 | "module": "components.caption_splitter.caption_splitter_prefilter_runner", 76 | "console": "integratedTerminal", 77 | "justMyCode": false 78 | }, 79 | { 80 | "name": "Local Run: Citance Adder", 81 | "type": "python", 82 | "request": "launch", 83 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 84 | "module": "components.citance_adder.citance_adder_runner", 85 | "console": "integratedTerminal", 86 | "justMyCode": false 87 | }, 88 | { 89 | "name": "Local Run: Text Label Matcher Input", 90 | "type": "python", 91 | "request": "launch", 92 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 93 | "module": "components.text_label_matcher_input.text_label_matcher_input_runner", 94 | "console": "integratedTerminal", 95 | "justMyCode": false 96 | }, 97 | { 98 | "name": "Local Run: Text Label Matcher", 99 | "type": "python", 100 | "request": "launch", 101 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 102 | "module": "components.text_label_matcher.text_label_matcher_runner", 103 | "console": "integratedTerminal", 104 | "justMyCode": false 105 | }, 106 | { 107 | "name": "Local Run: Panel Matcher Input", 108 | "type": "python", 109 | "request": "launch", 110 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 111 | "module": "components.panel_matcher_input.panel_matcher_input_runner", 112 | "console": "integratedTerminal", 113 | "justMyCode": false 114 | }, 115 | { 116 | "name": "Local Run: Panel Matcher", 117 | "type": "python", 118 | "request": "launch", 119 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 120 | "module": "components.panel_matcher.panel_matcher_runner", 121 | "console": "integratedTerminal", 122 | "justMyCode": false 123 | }, 124 | { 125 | 
"name": "Local Run: Data Expander", 126 | "type": "python", 127 | "request": "launch", 128 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 129 | "module": "components.data_expander.data_expander_runner", 130 | "console": "integratedTerminal", 131 | "justMyCode": false 132 | }, 133 | { 134 | "name": "Local Run: Data Merger", 135 | "type": "python", 136 | "request": "launch", 137 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 138 | "module": "components.data_merger.data_merger_runner", 139 | "console": "integratedTerminal", 140 | "justMyCode": false 141 | }, 142 | { 143 | "name": "Local Run: Analyze Data", 144 | "type": "python", 145 | "request": "launch", 146 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 147 | "module": "shared.analyze_pipeline_results", 148 | "console": "integratedTerminal", 149 | "justMyCode": false 150 | }, 151 | { 152 | "name": "Python: Text Label Matcher", 153 | "type": "python", 154 | "request": "launch", 155 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 156 | "module": "components.text_label_matcher.cli_runner", 157 | "console": "integratedTerminal", 158 | "justMyCode": false, 159 | "args": [ 160 | "--input_folder_path", 161 | "${workspaceFolder}/_results/3.text_label_matcher_input", 162 | "--output_folder_path", 163 | "${workspaceFolder}/_results/3.text_label_matcher" 164 | ], 165 | "env": { 166 | "LOGLEVEL": "DEBUG" 167 | } 168 | }, 169 | { 170 | "name": "Python: OpenCV Panel Split", 171 | "type": "python", 172 | "request": "launch", 173 | "module": "segmentation.opencv_panel_split", 174 | "console": "integratedTerminal", 175 | "justMyCode": false, 176 | "args": [ 177 | "--input_image_folder_path", 178 | "${workspaceFolder}/_results/sample_multi_panel_images" 179 | ], 180 | }, 181 | { 182 | "name": "Python: Caption Splitter", 183 | "type": "python", 184 | "request": "launch", 185 | "module": "text_caption_segmentation.segment_captions", 186 | "console": "integratedTerminal", 187 | "justMyCode": false 188 | 
}, 189 | { 190 | "name": "Python: Download Pub Med Articles", 191 | "type": "python", 192 | "request": "launch", 193 | "module": "data.download_pubmed_files", 194 | "console": "integratedTerminal", 195 | "justMyCode": false, 196 | "args": [ 197 | ], 198 | }, 199 | { 200 | "name": "Python: Decompress Pub Med Articles", 201 | "type": "python", 202 | "request": "launch", 203 | "module": "data.decompress_pubmed_files", 204 | "console": "integratedTerminal", 205 | "justMyCode": false, 206 | "args": [ 207 | ], 208 | }, 209 | { 210 | "name": "Python: Optical Character Recognition (v3.2)", 211 | "type": "python", 212 | "request": "launch", 213 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 214 | "module": "components.image_label_detector.aml_component_cli_runner", 215 | "console": "integratedTerminal", 216 | "justMyCode": false, 217 | "args": [ 218 | "--input_folder_path", 219 | "${workspaceFolder}/_results/image_label_detector/input", 220 | "--output_folder_path", 221 | "${workspaceFolder}/_results/image_label_detector/output", 222 | "--batch_size", 223 | "2", 224 | "--max_images", 225 | "5", 226 | // "--log_figure_texts", 227 | // "True", 228 | ], 229 | "envFile": "${workspaceFolder}/.env", 230 | "env": { 231 | "AZURE_VISION_ENDPOINT": "https://westus2.api.cognitive.microsoft.com/", 232 | "LOGLEVEL": "DEBUG" 233 | } 234 | }, 235 | { 236 | "name": "Python: Image Analysis (v4.0)", 237 | "type": "python", 238 | "request": "launch", 239 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 240 | "module": "components.image_label_detector.aml_component_cli_runner", 241 | "console": "integratedTerminal", 242 | "justMyCode": false, 243 | "args": [ 244 | "--input_folder_path", 245 | "${workspaceFolder}/_results/2.1.caption_splitter/2023-07-24T18-04/success", 246 | "--output_folder_path", 247 | "${workspaceFolder}/_results/image_label_detector/output", 248 | "--max_images", 249 | "5", 250 | // "--log_figure_texts", 251 | // "True", 252 | "--use_version_4", 253 | "True", 
254 | ], 255 | "envFile": "${workspaceFolder}/.env", 256 | "env": { 257 | "AZURE_VISION_ENDPOINT": "https://westus2.api.cognitive.microsoft.com/", 258 | "LOGLEVEL": "DEBUG" 259 | } 260 | }, 261 | { 262 | "name": "Python: Image Label Comparer", 263 | "type": "python", 264 | "request": "launch", 265 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 266 | "module": "components.image_label_detector.figure_texts_comparer", 267 | "console": "integratedTerminal", 268 | "justMyCode": false, 269 | "args": [ 270 | "--figures_file_path", 271 | "${workspaceFolder}/_results/image_label_detector/input/figures.jsonl", 272 | "--figure_texts_objects_a_file_path", 273 | "${workspaceFolder}/_results/image_label_detector/output/20230717.190855_v3.2/success/figure_texts.jsonl", 274 | // "${workspaceFolder}/_results/image_label_detector/output/20230713.164600/output.jsonl", 275 | "--figure_texts_objects_b_file_path", 276 | "${workspaceFolder}/_results/image_label_detector/output/20230717.190918_v4.0/success/figure_texts.jsonl", 277 | // "${workspaceFolder}/_results/image_label_detector/output/20230718.174032_v4.0/success/figure_texts.jsonl", 278 | "--output_folder_path", 279 | "${workspaceFolder}/_results/image_label_detector/output", 280 | ], 281 | "envFile": "${workspaceFolder}/.env", 282 | "env": { 283 | "LOGLEVEL": "DEBUG" 284 | } 285 | }, 286 | { 287 | "name": "Python: Figure Splitter", 288 | "type": "python", 289 | "request": "launch", 290 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 291 | "module": "components.figure_splitter.aml_component_cli_runner", 292 | "console": "integratedTerminal", 293 | "justMyCode": false, 294 | "args": [ 295 | "--input_folder_path", 296 | "${workspaceFolder}/_results/2.1.caption_splitter/2023-07-24T18-04/success", 297 | "--output_folder_path", 298 | "${workspaceFolder}/_results/figure_splitter/output/", 299 | "--max_images", 300 | "5", 301 | ], 302 | "env": { 303 | "LOGLEVEL": "DEBUG" 304 | } 305 | }, 306 | { 307 | "name": "Python: Data 
Expander CLI Runner", 308 | "type": "python", 309 | "request": "launch", 310 | "cwd": "${workspaceFolder}/expansion_pipeline/code", 311 | "module": "components.data_expander.data_expander_cli_runner", 312 | "console": "integratedTerminal", 313 | "justMyCode": false, 314 | "args": [ 315 | "--input_folder_path", 316 | "${workspaceFolder}/_results/4.panel_matcher/2023-07-26T16-41/success", 317 | "--output_folder_path", 318 | "${workspaceFolder}/_results/5.data_expander", 319 | "--output_dbfs_paths", 320 | "True", 321 | "--max_figures", 322 | "5", 323 | ], 324 | "envFile": "${workspaceFolder}/.env", 325 | "env": { 326 | "LOGLEVEL": "DEBUG" 327 | } 328 | }, 329 | ] 330 | } 331 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "autoDocstring.docstringFormat": "google", 3 | "python.analysis.autoImportCompletions": true, 4 | "python.analysis.completeFunctionParens": true, 5 | "python.linting.flake8Enabled": true, 6 | "python.testing.pytestEnabled": true, 7 | "python.testing.pytestArgs": [ 8 | "expansion_pipeline/code" 9 | ], 10 | "python.testing.unittestEnabled": false, 11 | "python.analysis.typeCheckingMode": "basic" 12 | } 13 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BiomedCLIP Data Pipeline 2 | 3 | [![Code License](https://img.shields.io/badge/Code%20License-MIT%20License-red)](LICENSE) 4 | 5 | *A pipeline to construct millions of image-caption figures from PubMed.* 6 | 7 | [[NEJM AI Article](https://ai.nejm.org/stoken/default+domain/9VPKUGJYJ5BPFXY83IBS/full?redirectUri=doi/full/10.1056/AIoa2400640)] 8 | 9 | **BiomedCLIP: a multimodal biomedical foundation model pretrained from fifteen million scientific image-text pairs**
10 | 11 | Sheng Zhang, Yanbo Xu, Naoto Usuyama, Hanwen Xu, Jaspreet Bagga, Robert Tinn, Sam Preston, Rajesh Rao, Mu Wei, Naveen Valluri, Cliff Wong, Andrea Tupini, Yu Wang, Matt Mazzola, Swadheen Shukla, Lars Liden, Jianfeng Gao, Angela Crabtree, Brian Piening, Carlo Bifulco, Matthew P. Lungren, Tristan Naumann, Sheng Wang, Hoifung Poon 12 | 13 |

14 | <img src="images/pmc_15m_pipeline.jpg" alt="PMC-15M data pipeline">
15 | </p>
16 | 17 | 18 | This repository hosts the **BiomedCLIP Data Pipeline**, which automatically downloads and processes a set of articles from the PubMed Central Open Access dataset. The end result is a JSONL file containing figures and associated captions, which can be used to train the **BiomedCLIP** model. 19 | 20 | For a hands-on demonstration, refer to the [example notebook](run_pmc15_pipeline.ipynb). 21 | 22 | 23 | ## Environment Setup 24 | 25 | ```bash 26 | # it is recommended to use a virtual environment but not required 27 | python -m venv .venv 28 | source .venv/bin/activate 29 | 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | ## Reference 34 | ```bibtex 35 | @article{zhang2024biomedclip, 36 | title={A Multimodal Biomedical Foundation Model Trained from Fifteen Million Image–Text Pairs}, 37 | author={Sheng Zhang and Yanbo Xu and Naoto Usuyama and Hanwen Xu and Jaspreet Bagga and Robert Tinn and Sam Preston and Rajesh Rao and Mu Wei and Naveen Valluri and Cliff Wong and Andrea Tupini and Yu Wang and Matt Mazzola and Swadheen Shukla and Lars Liden and Jianfeng Gao and Angela Crabtree and Brian Piening and Carlo Bifulco and Matthew P. Lungren and Tristan Naumann and Sheng Wang and Hoifung Poon}, 38 | journal={NEJM AI}, 39 | year={2024}, 40 | volume={2}, 41 | number={1}, 42 | doi={10.1056/AIoa2400640}, 43 | url={https://ai.nejm.org/doi/full/10.1056/AIoa2400640} 44 | } 45 | ``` 46 | 47 | ## Contributing 48 | 49 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 50 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 51 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 52 | 53 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 54 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 55 | provided by the bot.
You will only need to do this once across all repos using our CLA. 56 | 57 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 58 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 59 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 60 | 61 | ## Trademarks 62 | 63 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 64 | trademarks or logos is subject to and must follow 65 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 66 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 67 | Any use of third-party trademarks or logos are subject to those third-party's policies. 68 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 
26 | -------------------------------------------------------------------------------- /images/pmc_15m_pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/BiomedCLIP_data_pipeline/686a465267787aba96b6eca8cd9ee1dede28c2c2/images/pmc_15m_pipeline.jpg -------------------------------------------------------------------------------- /pmc15_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/BiomedCLIP_data_pipeline/686a465267787aba96b6eca8cd9ee1dede28c2c2/pmc15_pipeline/__init__.py -------------------------------------------------------------------------------- /pmc15_pipeline/constants.py: -------------------------------------------------------------------------------- 1 | 2 | PUBMED_OPEN_ACCESS_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/" 3 | PUBMED_OPEN_ACCESS_FILE_LIST_URL = PUBMED_OPEN_ACCESS_BASE_URL + "oa_file_list.txt" 4 | -------------------------------------------------------------------------------- /pmc15_pipeline/data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tarfile 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | import pubmed_parser 7 | import requests 8 | from lxml import etree 9 | from tqdm import tqdm 10 | 11 | from .constants import PUBMED_OPEN_ACCESS_BASE_URL, PUBMED_OPEN_ACCESS_FILE_LIST_URL 12 | from .types import PubMedFile 13 | from .utils import fs_utils 14 | 15 | repo_root = fs_utils.get_repo_root_path() 16 | 17 | 18 | def download_pubmed_file_list( 19 | url=PUBMED_OPEN_ACCESS_FILE_LIST_URL, 20 | output_file_path: Path = ( 21 | repo_root / "_results" / "data" / "pubmed_open_access_file_list.txt" 22 | ), 23 | ): 24 | # Ensure output directory exists 25 | Path(output_file_path).parent.mkdir(parents=True, exist_ok=True) 26 | 27 | # Download file 28 | print(f"Downloading 
OpenAccess file list from: {url} to {output_file_path}") 29 | 30 | if Path(output_file_path).exists(): 31 | print(f"File already exists: {output_file_path}") 32 | return 33 | 34 | response = requests.get(url) 35 | 36 | with open(output_file_path, "wb") as file: 37 | file.write(response.content) 38 | 39 | print(f"Saved to: {output_file_path}") 40 | 41 | 42 | def download_pubmed_files_from_list( 43 | file_list_path: Path = ( 44 | repo_root / "_results/data/pubmed_open_access_file_list.txt" 45 | ), 46 | output_folder_path: Path = ( 47 | repo_root / "_results" / "data" / "pubmed_open_access_files_compressed" 48 | ), 49 | subset_size: Optional[int] = None, 50 | file_extension=".tar.gz", 51 | ): 52 | """Download files from PubMed Open Access file list 53 | 54 | Args: 55 | file_list_path (Path, optional): Path to PubMed Open Access Files list. Defaults to "_results/data/pubmed_open_access_file_list_top_100.txt". 56 | output_folder_path (Path, optional): Path to save directory. Defaults to repo_root/"_results"/"data"/"pubmed_open_access_files_compressed". 57 | subset_size (int, optional): Number of files to download. Defaults to None (download all files). 
58 | 59 | Example: 60 | 61 | python3 -m data.download_pubmed_files 62 | """ 63 | 64 | # Get dicts from files list 65 | pubmed_files: list[PubMedFile] = [] 66 | 67 | with open(file_list_path, "r") as file: 68 | lines = file.readlines() 69 | 70 | # Skip header 71 | lines = lines[1:] 72 | 73 | for line_idx, line in enumerate(lines): 74 | if subset_size and line_idx + 1 > subset_size: 75 | break 76 | 77 | [path, title, pmcid, pmid, code] = line.strip().split("\t") 78 | pubmed_file: PubMedFile = { 79 | "path": path, 80 | "title": title, 81 | "pmcid": pmcid, 82 | "pmid": pmid, 83 | "code": code, 84 | } 85 | pubmed_files.append(pubmed_file) 86 | 87 | # Create output folder 88 | output_folder_path.mkdir(parents=True, exist_ok=True) 89 | skipped_files = [] 90 | 91 | def _get_file_size(url): 92 | response = requests.head(url) 93 | if "Content-Length" in response.headers: 94 | return int(response.headers["Content-Length"]) 95 | else: 96 | return None 97 | 98 | for pubmed_file in tqdm(pubmed_files): 99 | file_name = pubmed_file["pmcid"] + file_extension 100 | file_path = output_folder_path / file_name 101 | 102 | # Check if the file already exists 103 | if file_path.exists(): 104 | tqdm.write(f"File: {file_name} already exists. Not downloading again.") 105 | continue 106 | 107 | article_url = PUBMED_OPEN_ACCESS_BASE_URL + pubmed_file["path"] 108 | file_size = _get_file_size(article_url) 109 | 110 | if file_size is not None: 111 | tqdm.write(f"File: {file_name} size: {file_size} bytes") 112 | try: 113 | response = requests.get(article_url) 114 | response.raise_for_status() # Raise an HTTPError for bad responses 115 | 116 | with open(file_path, "wb") as file: 117 | file.write(response.content) 118 | 119 | except requests.exceptions.RequestException as e: 120 | tqdm.write(f"File: {file_name} Skipped! 
Error occurred: {e}") 121 | skipped_files.append(pubmed_file) 122 | 123 | with open(file_path, "wb") as file: 124 | file.write(response.content) 125 | 126 | else: 127 | tqdm.write(f"File: {file_name} Skipped! Could not get file size!") 128 | skipped_files.append(pubmed_file) 129 | 130 | print(f"Skipped {len(skipped_files)} files.") 131 | 132 | 133 | def decompress_pubmed_files( 134 | input_folder_path: Path = ( 135 | repo_root / "_results" / "data" / "pubmed_open_access_files_compressed" 136 | ), 137 | output_folder_path: Path = ( 138 | repo_root / "_results" / "data" / "pubmed_open_access_files" 139 | ), 140 | file_extension="*.tar.gz", 141 | ): 142 | """Decompress article files from PubMed Open Access folder 143 | 144 | Args: 145 | input_folder_path (Path, optional): _description_. Defaults to repo_root/"_results"/"data"/"pubmed_open_access_files_compressed". 146 | output_folder_path (Path, optional): _description_. Defaults to repo_root/"_results"/"data"/"pubmed_open_access_files". 147 | file_extension (str, optional): _description_. Defaults to ".tar.gz". 148 | 149 | Example: 150 | 151 | python3 -m data.decompress_pubmed_files 152 | """ 153 | 154 | # Get dicts from files list 155 | file_paths = list(input_folder_path.glob(file_extension)) 156 | 157 | print( 158 | f"Found {len(file_paths)} files that match {file_extension} in {input_folder_path}" 159 | ) 160 | 161 | for file_path in tqdm(file_paths): 162 | with tarfile.open(file_path, "r:gz") as tar_file: 163 | # TODO: Use article folder path instead of output folder path? 
def generate_pmc15_pipeline_outputs(
    decompressed_folder: Path = (
        repo_root / "_results" / "data" / "pubmed_open_access_files"
    ),
    output_file_path: Path = (
        repo_root / "_results" / "data" / "pubmed_parsed_data.json"
    ),
):
    """Parse decompressed PubMed articles into a JSON-lines output file.

    Walks ``decompressed_folder`` recursively for ``*.nxml`` article files,
    extracts figure/caption metadata with ``pubmed_parser``, and appends one
    JSON object per article to ``output_file_path`` (one object per line,
    i.e. JSONL, despite the ``.json`` default suffix).

    Args:
        decompressed_folder (Path, optional): Root folder containing the
            extracted article packages.
        output_file_path (Path, optional): Destination file; truncated and
            rewritten on every call.
    """

    # input - path to .nxml file for each article in the article package
    # output - json object with pmid, pmc id, location (path to article
    # package in storage blobs), figures - list of figure objects which
    # include inline references (mentions of figure throughout the article),
    # caption for the figure, id, label, graphic_ref (filepath to figure jpg
    # in storage blobs), pair_id (a unique id to identify each figure in the
    # article, using pmid + figure_id)
    def parse_single_pubmed_file(nxml_path: Path):
        """Parse one .nxml article; return [article_dict] or [] on failure."""
        print(nxml_path)

        if nxml_path is None or not nxml_path.exists():
            print("error")
            return []

        try:
            print("starting...")
            output = pubmed_parser.parse_pubmed_caption(str(nxml_path.absolute()))
            print("parsed", nxml_path)
        except AttributeError as ae:
            print("Attribute Error: " + str(ae) + " path: " + str(nxml_path))
            return []
        except etree.XMLSyntaxError as xmle:
            print("XML Syntax Error: " + str(xmle) + " path: " + str(nxml_path))
            return []
        except Exception as e:
            # Best-effort pipeline: skip any article that fails to parse.
            print("Exception: " + str(e) + " path: " + str(nxml_path))
            return []

        if not output:
            print("no output")
            return []

        else:
            figures = []
            pmid = output[0]["pmid"]  # same for all figures in the article
            pmc = output[0]["pmc"]  # same for all figures in the article
            location = Path(nxml_path).parent

            # for all figures in the article, create a figure object with
            # inline references (text, section, reference_id), and caption,
            # id, label, graphic_ref, pair_id
            for figure_dict in output:
                inline_references = figure_dict.get(
                    "fig_refs", {}
                )  # from pubmed parser
                ir_objects = []
                for inline_reference in inline_references:
                    inline_reference_object = {
                        "text": str(inline_reference.get("text", "")),
                        "section": str(inline_reference.get("section", "")),
                        "reference_id": str(inline_reference.get("reference_id", "")),
                    }

                    ir_objects.append(inline_reference_object)

                # NOTE(review): deliberately aborts the whole run when an
                # article actually has inline references — confirm this guard
                # is still wanted before processing the full corpus.
                if len(ir_objects) > 0:
                    raise NotImplementedError("Inline references not implemented")

                figure_object = {
                    "fig_caption": str(figure_dict.get("fig_caption", "")),
                    "fig_id": str(figure_dict.get("fig_id", "")),
                    "fig_label": str(figure_dict.get("fig_label", "")),
                    "graphic_ref": (
                        str(location / (figure_dict["graphic_ref"] + ".jpg"))
                        if "graphic_ref" in figure_dict
                        else ""
                    ),  # set this to the path of the jpg image in storage blobs
                    "pair_id": str(pmid) + "_" + str(figure_dict.get("fig_id", "")),
                    "inline_references": ir_objects,  # add inline references
                }

                figures.append(figure_object)

            article = {
                "pmid": pmid,
                "pmc": pmc,
                "location": str(location),
                "figures": figures,
            }

            return [article]

    # Explicit counter: the previous version read the loop variable after the
    # loop, which raised NameError when no .nxml files were found.
    processed_count = 0

    with output_file_path.open("w+") as f:
        for nxml_file in decompressed_folder.rglob("*.nxml"):
            parsed = parse_single_pubmed_file(nxml_file)

            for article in parsed:
                for figure in article["figures"]:
                    # remove inline references since we're not using them
                    figure.pop("inline_references")

                f.write(json.dumps(article) + "\n")

            processed_count += 1

    print(f"Processed {processed_count} files")
from typing import TypedDict


# One parsed row of the PMC Open Access file list (tab-separated columns).
# Functional TypedDict form: identical runtime and typing behavior to the
# equivalent class-based declaration.
PubMedFile = TypedDict(
    "PubMedFile",
    {
        "path": str,   # archive path column from the file list
        "title": str,  # article citation/title column
        "pmcid": str,  # PMC accession id column
        "pmid": str,   # PubMed id column
        "code": str,   # final column — presumably the license code; TODO confirm
    },
)
@functools.cache
def get_repo_root_path(known_root_foldername=".vscode"):
    """Return the repository root directory.

    Walks up the ancestors of this file and returns the first one that
    contains a folder named ``known_root_foldername``. The result is cached,
    so the filesystem is probed at most once per marker name.

    Args:
        known_root_foldername (str, optional): Name of a folder that marks
            the repo root. Defaults to ".vscode".

    Raises:
        ValueError: If no ancestor (up to the filesystem root) contains the
            marker folder.
    """
    for parent in Path(__file__).parents:
        if (parent / known_root_foldername).exists():
            return parent

    # check we're not at the root of the drive `/`
    raise ValueError(
        f"Repo root could not be found! Did not find `{known_root_foldername}`"
        f" as child of any folders in path {Path(__file__)}"
    )


def get_line_count(file_path: Path) -> int:
    """Return the number of lines in a UTF-8 text file.

    Uses a context manager so the file handle is always closed; the previous
    version left closing to garbage collection (a handle leak).
    """
    with open(file_path, "r", encoding="utf8") as f:
        return sum(1 for _ in f)
Where the figure is referenced in the article\",\n", 27 | " \"graphic_ref\": \"GRAPHIC_REFERENCE_PATH: path to where the imape is stored on disk\",\n", 28 | " \"pair_id\": \"PAIR_ID: {pmid}_{fig_id}\",\n", 29 | " },\n", 30 | " ]\n", 31 | "}\n", 32 | "```" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# this controls how many articles will be downloaded and processed. Set to `None` to process all articles in the PMCOA list\n", 42 | "MAX_ITEMS_TO_PROCESS = 100" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from pmc15_pipeline import data\n", 52 | "from pmc15_pipeline.utils import fs_utils" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "repo_root = fs_utils.get_repo_root_path()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "Downloading OpenAccess file list from: https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt to /workspaces/biomedclip_data_pipeline/_results/data/pubmed_open_access_file_list.txt\n", 74 | "Saved to: /workspaces/biomedclip_data_pipeline/_results/data/pubmed_open_access_file_list.txt\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "list_output_path = repo_root / \"_results\" / \"data\" / \"pubmed_open_access_file_list.txt\"\n", 80 | "\n", 81 | "data.download_pubmed_file_list(\n", 82 | " output_file_path=list_output_path,\n", 83 | ")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stderr", 93 | "output_type": "stream", 94 | "text": [ 95 | " 0%| | 0/100 [00:00