├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Pipfile ├── README.md ├── bin ├── scibot-bookmarklet ├── scibot-dashboard └── scibot-dbsetup ├── docs ├── architecture.graphml ├── setup.md ├── workflow-paper-id.graphml ├── workflow-rrid.graphml └── workflows.org ├── resources ├── config_files │ └── etc │ │ ├── nginx │ │ ├── nginx.conf │ │ └── scibot.conf │ │ ├── systemd │ │ └── system │ │ │ ├── env.conf │ │ │ ├── scibot-bookmarklet-sync.service │ │ │ ├── scibot-bookmarklet.service │ │ │ ├── scibot-bookmarklet.socket │ │ │ ├── scibot-dashboard.service │ │ │ └── scibot-dashboard.socket │ │ └── tmpfiles.d │ │ └── scibot-bookmarklet.conf ├── rpmbuild │ ├── .gitignore │ ├── SOURCES │ │ ├── env.conf │ │ ├── nginx.conf │ │ ├── scibot-bookmarklet-sync.service │ │ ├── scibot-bookmarklet.conf │ │ ├── scibot-bookmarklet.service │ │ ├── scibot-bookmarklet.socket │ │ └── scibot.conf │ └── SPECS │ │ └── scibot.spec └── scripts │ └── scibot-monkey-button.user.js ├── scibot ├── __init__.py ├── anno.py ├── bookmarklet.py ├── bookmarklet_server.py ├── check.py ├── cli.py ├── config.py ├── dash.py ├── dashboard.py ├── db.py ├── export.py ├── extract.py ├── get_annos.py ├── papers.py ├── release.py ├── release_report.py ├── rridxp.py ├── services.py ├── submit.py ├── sync.py ├── uri.py ├── utils.py └── workflow.py ├── setup.cfg ├── setup.py ├── sql ├── extensions.sql ├── permissions.sql ├── postgres.sql └── schemas.sql ├── templates ├── _formhelpers.html ├── main.html ├── results.html ├── search.html └── table.html └── test ├── __init__.py ├── test_extract.py ├── test_resolver.py ├── test_routes.py ├── test_sync.py └── testing_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.json 3 | *.log 4 | *.swp 5 | *.pyc 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Jon Udell and Tom Gillespie 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include test *
2 | exclude .gitignore
3 | exclude .travis.yml
4 | exclude MANIFEST.in
5 | recursive-exclude * *.pyc
6 | recursive-exclude * *.swp
7 | recursive-exclude * *.swo
8 | 
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | url = "https://pypi.org/simple"
3 | verify_ssl = true
4 | name = "pypi"
5 | 
6 | [requires]
7 | python_version = '3.7'
8 | 
9 | [packages]
10 | hyputils = {editable = true, ref = "master", git = "https://github.com/tgbugs/hyputils.git"}
11 | ontquery = {editable = true, ref = "master", git = "https://github.com/tgbugs/ontquery.git"}
12 | pyontutils = {editable = true, ref = "master", git = "https://github.com/tgbugs/pyontutils.git"}
13 | "e1839a8" = {path = ".", editable = true}
14 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SciBot
2 | [![PyPI version](https://badge.fury.io/py/scibot.svg)](https://pypi.org/project/scibot/)
3 | 
4 | curation workflow automation and coordination
5 | 
6 | * find RRIDs in articles
7 | * look them up in the SciCrunch resolver
8 | * create Hypothesis annotations that anchor to the RRIDs and display lookup results
9 | 
10 | ## Getting Started
11 | 
12 | * [Create a Hypothesis](https://web.hypothes.is/start/) account which will post the annotations.
13 | * Generate an API token at https://hypothes.is/profile/developer (you must be logged in to see the page).
14 | * Create a group to store the annotations at https://hypothes.is/groups/new (you must be logged in to see the page).
15 | * See [Setup on amazon](#setup-on-amazon)
16 | 
17 | ## Capturing the bookmarklet
18 | 
19 | Visit https://HOST:PORT/bookmarklet and follow the instructions.
20 | 
21 | ## Using the bookmarklet
22 | 
23 | Visit an article that contains RRIDs and click the bookmarklet.
24 | 
25 | ## Checking results in the browser
26 | 
27 | The found RRIDs are logged to the JavaScript console.
28 | 
29 | ## Checking results on the server
30 | 
31 | The found RRIDs are logged to timestamped files, along with the text and html of the article that was scanned for RRIDs.
32 | 
33 | ## Setup on Gentoo
34 | As root.
35 | ```bash
36 | layman -a tgbugs-overlay
37 | emerge scibot
38 | rc-config add scibot-bookmarklet default
39 | vim /etc/conf.d/scibot-bookmarklet # set username, group, api key, etc.
40 | /etc/init.d/scibot-bookmarklet start
41 | ```
42 | 
43 | ## Setup on ubuntu 18.04
44 | Set `export PATH=~/.local/bin:${PATH}` in `~/.bashrc`
45 | 1. `sudo apt-get install build-essential python3-dev libxml2-dev libxslt1-dev`
46 | 2. `pip3 install --user pip pipenv`
47 | 3. `git clone https://github.com/SciCrunch/scibot.git`
48 | 4. `cd scibot && pipenv install --skip-lock`
49 | 5. `pipenv shell` to get an environment with access to all the required libraries.
50 | 6. Inside the pipenv shell (after following steps 6-10 of the manual setup section below) you should
51 |    be able to run commands like `python scibot/export.py`.
52 | 
53 | ## Setup on amazon
54 | 
55 | Building the rpm
56 | ```
57 | pushd resources/rpmbuild
58 | rpmbuild --nodeps --define "_topdir `pwd`" -ba SPECS/scibot.spec
59 | popd
60 | ```
61 | Once this is done scp the rpm to the host.
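For example, with `_topdir` set as above the rpm lands under `resources/rpmbuild/RPMS/noarch/`; a minimal sketch of the copy step (the `ec2-user` login is a placeholder, adjust to your host):
```bash
# copy the freshly built rpm to the deployment host
scp resources/rpmbuild/RPMS/noarch/scibot-9999-0.noarch.rpm ec2-user@scibot.scicrunch.io:
```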
62 | Also scp the ssl certs over, or use letsencrypt to obtain a cert.
63 | If you are using a cert from another registrar you may need to
64 | bundle your certs `cat my-cert.crt existing-bundle.crt > scicrunch.io.crt`
65 | (see https://gist.github.com/bradmontgomery/6487319 for details).
66 | See [scibot.conf](./resources/config_files/etc/nginx/scibot.conf)
67 | for details on where to put the certs after scping them over.
68 | 
69 | Install steps run as root or via sudo.
70 | ```bash
71 | amazon-linux-extras install nginx1.12
72 | yum install scibot-9999-0.noarch.rpm # update with yum reinstall
73 | pip3 install pipenv wheel
74 | vim /etc/systemd/system/scibot-bookmarklet.service.d/env.conf # set api keys etc
75 | ```
76 | 
77 | Install the scibot codebase as the scibot user
78 | ```bash
79 | git clone https://github.com/SciCrunch/scibot.git
80 | pushd scibot
81 | pipenv install --skip-lock
82 | ```
83 | Hopefully this step will become simpler once we start pushing releases
84 | (`pipenv install scibot`). Alternately it may also be possible to package
85 | everything we need in the rpm and install only that, with none of the other
86 | steps needed at all.
87 | 
88 | Start services as root
89 | ```bash
90 | systemctl start nginx scibot-bookmarklet-sync scibot-bookmarklet
91 | ```
92 | 
93 | ### Updating
94 | On the scibot host
95 | ```bash
96 | sudo su - scibot
97 | pushd scibot
98 | echo "$(date -Is) $(git rev-parse HEAD)" >> ~/previous-scibot-hashes
99 | git pull
100 | mv Pipfile.lock "Pipfile.lock.$(date -Is)"
101 | ~/.local/bin/pipenv install --skip-lock
102 | ```
103 | 
104 | Restart as root
105 | ```bash
106 | systemctl restart scibot-bookmarklet-sync scibot-bookmarklet
107 | ```
108 | 
109 | ### manual setup
110 | Install steps
111 | 0. ssh in to the host that will serve the script
112 | 1. `sudo yum install gcc libxml2 libxml2-devel libxslt libxslt-devel python36 python36-devel python36-pip`
113 | 2. `sudo alternatives --set python /usr/bin/python3.6`
114 | 3. `sudo pip install pipenv`
115 | 4. `git clone https://github.com/SciCrunch/scibot.git`
116 | 5. `cd scibot && python3.6 setup.py bdist_wheel && pipenv install dist/*.whl`
117 | 6. `export SCIBOT_USERNAME=someusername`
118 | 7. `export SCIBOT_GROUP=somegroupname`
119 | 8. `unset HISTFILE`
120 | 9. `export SCIBOT_API_TOKEN=sometoken`
121 | 10. `export SCIBOT_SYNC=somerandomnumber` (e.g. run `head -c 100 /dev/urandom | tr -dc 'a-zA-Z0-9'` every time)
122 | 11. create a screen session
123 | 12. in the screen session run `pipenv run scibot-server`; you should create a link to the log files folder in `~/scibot/`
124 | 13. get letsencrypt certs using certbot, follow the directions [here](https://certbot.eff.org/docs/using.html) (prefer standalone)
125 | 
126 | 
127 | ## Development setup
128 | To set up scibot for development (for example if you want to run manual releases)
129 | 0. Install python3 and pip for your OS (e.g. on macOS use `brew`)
130 | 1. From your git folder run `git clone https://github.com/tgbugs/scibot.git`
131 | 2. `pushd scibot`
132 | 3. `pip3 install --user -e .` will install requirements and register the
133 |    scibot folder that is under version control with python as the scibot module.
134 | 4. `popd`
135 | 
136 | ## Dev server
137 | You should change `lol` to something less guessable even if it is only
138 | running on localhost.
139 | 
140 | Run the following blocks in two separate terminals and connect to
141 | `https://localhost:4443/bookmarklet`. If you try `http` you will get
142 | a connection error.
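Once both blocks below are running you can also exercise the `/rrid` endpoint directly instead of clicking the bookmarklet; a minimal sketch (the article URL and RRID are dummy values, `-k` skips verification of the self-signed cert, and the form fields match what the bookmarklet sends):
```bash
curl -k https://localhost:4443/rrid \
     --data-urlencode 'uri=https://example.com/some-article' \
     --data-urlencode 'head=<head><title>example</title></head>' \
     --data-urlencode 'body=<p>We used an antibody (RRID:AB_123456).</p>' \
     --data-urlencode 'data=We used an antibody (RRID:AB_123456).'
```
The RRIDs that were found (or a note that the URI is already running) come back as plain text.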
143 | 144 | You will need to generate the self signed certs using openssl as 145 | described in the comments in [bookmarklet.py::main()](./scibot/bookmarklet.py). 146 | Alternately comment out the ssl wrapping of the socket and use `http`. 147 | 148 | ```bash 149 | SCIBOT_SYNC=lol python -m scibot.sync 150 | ``` 151 | 152 | ```bash 153 | SCIBOT_USERNAME=scibot \ 154 | SCIBOT_GROUP=dev-group \ 155 | SCIBOT_GROUP2=dev-group \ 156 | SCIBOT_GROUP_STAGING=__world__ \ 157 | SCIBOT_API_TOKEN=hypothesis-api-key \ 158 | SCIBOT_SYNC=lol \ 159 | python -m scibot.bookmarklet 160 | ``` 161 | 162 | ## If all else fails 163 | Make sure you have >=python3.6 and pip installed. Clone the repo and 164 | run `python setup.py develop --user`. 165 | 166 | ## CoC 167 | SciBot adheres to the Contributor Covenant: 168 | https://www.contributor-covenant.org/ 169 | -------------------------------------------------------------------------------- /bin/scibot-bookmarklet: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | gunicorn -b unix:/run/scibot-bookmarklet/socket \ 3 | --pid /run/scibot-bookmarklet/pid \ 4 | -n scibot-bookmarklet \ 5 | -w 4 \ 6 | -k gevent \ 7 | -t 600 \ 8 | --preload \ 9 | --capture-output \ 10 | --log-level debug \ 11 | scibot.bookmarklet_server:app 12 | -------------------------------------------------------------------------------- /bin/scibot-dashboard: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | gunicorn -b unix:/run/scibot-dashboard/socket --pid /run/scibot-dashboard/pid -n scibot-dashboard -w 4 -k gevent -t 600 --preload --log-level debug scibot.dash:app 3 | #gunicorn -b localhost:5000 -n scibot -w 4 -k gevent -t 600 --log-level debug server:app 4 | -------------------------------------------------------------------------------- /bin/scibot-dbsetup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # scibot-dbsetup [PORT] [DATABASE] 3 | 4 | SOURCE="${BASH_SOURCE[0]}" 5 | while [ -h "$SOURCE" ]; do # resolve all symlinks 6 | DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 7 | SOURCE="$(readlink "$SOURCE")" 8 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # resolve relative symlinks 9 | done 10 | ABS_PATH="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 11 | 12 | SQL="${ABS_PATH}/../sql/" 13 | RESOURCES="${ABS_PATH}/../resources/" 14 | 15 | if [ -z $1 ]; then 16 | PORT=5432 17 | else 18 | PORT=$1 19 | fi 20 | 21 | if [ -z $2 ]; then 22 | DATABASE=scibot_test 23 | else 24 | DATABASE=$2 25 | fi 26 | 27 | # postgres setup 28 | psql -U postgres -h localhost -p $PORT -d postgres -f "${SQL}/postgres.sql" -v database=$DATABASE && 29 | psql -U postgres -h localhost -p $PORT -d $DATABASE -f "${SQL}/extensions.sql" && 30 | 31 | # scibot-admin setup 32 | psql -U scibot-admin -h localhost -p $PORT -d $DATABASE -f "${SQL}/schemas.sql" 33 | #psql -U scibot-admin -h localhost -p $PORT -d $DATABASE -f "${SQL}/permissions.sql" -v database=$DATABASE # FIXME this has to be run after populating the database via python 34 | -------------------------------------------------------------------------------- /docs/setup.md: -------------------------------------------------------------------------------- 1 | # Database setup 2 | ```bash 3 | export DBNAME=scibot_ASDF # WARNING this WILL overwrite existing databases 4 | scibot-dbsetup 5432 ${DBNAME} # initial db, user, extension, and schema creation 5 | scibot db-init ${DBNAME} # create the schema 
from the hypothesis orm code 6 | scibot api-sync ${DBNAME} # retrieve and load existing annotations 7 | ``` 8 | 9 | # Installing services 10 | TODO 11 | 12 | # Starting services 13 | ## openrc 14 | ```bash 15 | /etc/init.d/scibot-ws-sync start 16 | ``` 17 | ## systemd 18 | ```bash 19 | systemctl start scibot-ws-sync 20 | ``` 21 | -------------------------------------------------------------------------------- /docs/workflows.org: -------------------------------------------------------------------------------- 1 | * clear space and backup 2 | This is the workflow as it exists now, it is not remotely ideal. 3 | 4 | scibot.scicrunch.io 5 | #+begin_src bash 6 | pushd /var/lib/scibot/logs 7 | sudo gzip *.json 8 | sudo mv *.gz gzipped 9 | #+end_src 10 | 11 | orpheus 12 | #+begin_src bash 13 | DATE=2023-04-01 14 | scibot.scicrunch.io:/var/lib/scibot/logs/gzipped/* /home/tom/files/scibot/${DATE}/ 15 | pushd ${DATE} 16 | gunzip *.json.gz 17 | popd 18 | XZ_OPT=-e9 tar -cvJf ${DATE}.tar.xz ${DATE} 19 | # confirm backup 20 | #+end_src 21 | 22 | athena 23 | #+begin_src bash 24 | DATE=2023-04-01 25 | pushd ~/nas/logs/scibot-backup 26 | rsync --progress -vapX orpheus:/home/tom/files/scibot/${DATE}.tar.xz . 27 | #+end_src 28 | 29 | orpheus 30 | #+begin_src bash 31 | DATE=2023-04-01 32 | # rm ${DATE}/*.json 33 | # rmdir ${DATE} 34 | #+end_src 35 | 36 | scibot.scicrunch.io 37 | #+begin_src bash 38 | pushd /var/lib/scibot/logs 39 | # sudo rm gzipped/*.json.gz 40 | #+end_src 41 | 42 | -------------------------------------------------------------------------------- /resources/config_files/etc/nginx/nginx.conf: -------------------------------------------------------------------------------- 1 | user nginx nginx; 2 | worker_processes 1; 3 | 4 | error_log /var/log/nginx/error_log info; 5 | 6 | events { 7 | worker_connections 1024; 8 | use epoll; 9 | } 10 | 11 | http { 12 | include /etc/nginx/mime.types; 13 | default_type application/octet-stream; 14 | 15 | log_format main 16 | '$remote_addr - $remote_user [$time_local] ' 17 | '"$request" $status $bytes_sent ' 18 | '"$http_referer" "$http_user_agent" ' 19 | '"$gzip_ratio"'; 20 | 21 | client_header_timeout 10m; 22 | client_body_timeout 10m; 23 | proxy_read_timeout 900s; 24 | send_timeout 10m; 25 | 26 | connection_pool_size 256; 27 | client_header_buffer_size 1k; 28 | large_client_header_buffers 4 2k; 29 | request_pool_size 4k; 30 | 31 | gzip on; 32 | gzip_http_version 1.0; 33 | gzip_proxied any; 34 | gzip_min_length 500; 35 | gzip_disable "MSIE [1-6]\."; 36 | gzip_types text/plain text/xml text/css 37 | text/comma-separated-values 38 | text/javascript 39 | text/json 40 | application/json 41 | application/x-javascript 42 | application/atom+xml; 43 | 44 | output_buffers 1 32k; 45 | postpone_output 1460; 46 | 47 | sendfile on; 48 | tcp_nopush on; 49 | tcp_nodelay on; 50 | 51 | keepalive_timeout 75 20; 52 | 53 | ignore_invalid_headers on; 54 | 55 | include /etc/nginx/scibot.conf; 56 | 57 | server { 58 | listen 80; 59 | listen [::]:80; 60 | server_name localhost; 61 | 62 | access_log /var/log/nginx/default.access_log main; 63 | error_log /var/log/nginx/default.error_log info; 64 | location / { 65 | return 404; 66 | } 67 | } 68 | 69 | server { 70 | listen 443; 71 | listen [::]:443; 72 | server_name localhost; 73 | 74 | access_log /var/log/nginx/default.ssl_access_log main; 75 | error_log /var/log/nginx/default.ssl_error_log info; 76 | location / { 77 | return 404; 78 | } 79 | } 80 | 81 | } 82 | 
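This nginx.conf pulls the scibot-specific servers in via the `include` of scibot.conf; if you edit either file on the host, the usual validate-and-reload cycle applies (standard nginx/systemctl commands, nothing scibot-specific):
```bash
nginx -t                # parse check of nginx.conf and everything it includes
systemctl reload nginx  # apply the change without dropping connections
```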
-------------------------------------------------------------------------------- /resources/config_files/etc/nginx/scibot.conf: -------------------------------------------------------------------------------- 1 | upstream scibot-bookmarklet { 2 | # change this to point where it is actually running 3 | server unix:/run/scibot-bookmarklet/socket; 4 | } 5 | 6 | upstream scibot-dashboard { 7 | # change this to point where it is actually running 8 | server unix:/run/scibot-dashboard/socket; 9 | } 10 | 11 | server { 12 | listen 80; 13 | listen [::]:80; 14 | server_name scibot.scicrunch.io; 15 | return 301 https://$server_name$request_uri; 16 | 17 | access_log /var/log/nginx/scibot.scicrunch.io.access_log main; 18 | error_log /var/log/nginx/scibot.scicrunch.io.error_log info; 19 | } 20 | 21 | server { 22 | listen 443; 23 | listen [::]:443 ssl; 24 | server_name scibot.scicrunch.io; 25 | ssl on; 26 | 27 | # *.scicrunch.io certs 28 | ssl_certificate /etc/scicrunch.io/scicrunch.io.crt; 29 | ssl_certificate_key /etc/scicrunch.io/scicrunch.io.key; 30 | 31 | access_log /var/log/nginx/scibot.scicrunch.io.ssl_access_log main; 32 | error_log /var/log/nginx/scibot.scicrunch.io.ssl_error_log info; 33 | 34 | # from https://cipherli.st/ 35 | # and https://raymii.org/s/tutorials/Strong_SSL_Security_On_nginx.html 36 | 37 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3; 38 | ssl_prefer_server_ciphers on; 39 | ssl_ciphers "EECDH+AESGCM:EDH+AESGCM:AES256+EECDH:AES256+EDH"; 40 | ssl_ecdh_curve secp384r1; 41 | ssl_session_cache shared:SSL:10m; 42 | ssl_session_tickets off; 43 | ssl_stapling on; 44 | ssl_stapling_verify on; 45 | resolver 8.8.8.8 8.8.4.4 valid=300s; 46 | resolver_timeout 5s; 47 | # disable HSTS header for now 48 | #add_header Strict-Transport-Security "max-age=63072000; includeSubDomains; preload"; 49 | add_header X-Frame-Options DENY; 50 | add_header X-Content-Type-Options nosniff; 51 | ssl_dhparam /etc/ssl/certs/dhparam.pem; # openssl dhparam -out /tmp/dhparam.pem 4096 # DO NOT RUN ON AMAZON scp it over 52 | location / { 53 | proxy_pass http://scibot-bookmarklet; 54 | client_max_body_size 20m; 55 | proxy_redirect off; 56 | proxy_set_header Host $host; 57 | proxy_set_header X-Real-IP $remote_addr; 58 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 59 | proxy_set_header X-Forwarded-Host $server_name; 60 | proxy_set_header X-Forwarded-Scheme $scheme; 61 | } 62 | location /dashboard { 63 | proxy_pass http://scibot-dashboard; 64 | client_max_body_size 20m; 65 | proxy_redirect off; 66 | proxy_set_header Host $host; 67 | proxy_set_header X-Real-IP $remote_addr; 68 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 69 | proxy_set_header X-Forwarded-Host $server_name; 70 | proxy_set_header X-Forwarded-Scheme $scheme; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /resources/config_files/etc/systemd/system/env.conf: -------------------------------------------------------------------------------- 1 | SCIBOT_GROUP= 2 | SCIBOT_USERNAME= 3 | SCIBOT_API_TOKEN= 4 | SCIBOT_SYNC= 5 | SOURCE_LOG_LOC=/var/lib/scibot/logs 6 | -------------------------------------------------------------------------------- /resources/config_files/etc/systemd/system/scibot-bookmarklet-sync.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=scibot bookmarket sync daemon 3 | After=network.target 4 | 5 | [Service] 6 | PIDFile=/run/scibot-bookemarklet-sync/pid 7 | User=scibot 8 | Group=scibot 9 | 
RuntimeDirectory=scibot-bookmarklet-sync 10 | WorkingDirectory=/var/lib/scibot/scibot 11 | EnvironmentFile=/etc/systemd/system/scibot-bookmarklet.service.d/env.conf 12 | ExecStart=/var/lib/scibot/.local/bin/pipenv run scibot-sync 13 | ExecReload=/bin/kill -s HUP $MAINPID 14 | ExecStop=/bin/kill -s TERM $MAINPID 15 | PrivateTmp=true 16 | 17 | [Install] 18 | WantedBy=multi-user.target 19 | WantedBy=scibot-bookmarklet.service 20 | -------------------------------------------------------------------------------- /resources/config_files/etc/systemd/system/scibot-bookmarklet.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=scibot bookmarket gunicorn daemon 3 | Requires=scibot-bookmarklet-sync.service 4 | After=network.target 5 | 6 | [Service] 7 | PIDFile=/run/scibot-bookemarklet/pid 8 | User=scibot 9 | Group=scibot 10 | RuntimeDirectory=scibot-bookmarklet 11 | WorkingDirectory=/var/lib/scibot/scibot 12 | EnvironmentFile=/etc/systemd/system/scibot-bookmarklet.service.d/env.conf 13 | ExecStart=/var/lib/scibot/.local/bin/pipenv run gunicorn \ 14 | --bind unix:/run/scibot-bookmarklet/socket \ 15 | --name scibot-bookmarklet \ 16 | --workers 4 \ 17 | --worker-class gevent \ 18 | --timeout 600 \ 19 | --group scibot \ 20 | --user scibot \ 21 | --log-level DEBUG \ 22 | --log-file /var/log/scibot/bookmarklet.log \ 23 | --capture-output \ 24 | scibot.bookmarklet_server:app 25 | ExecReload=/bin/kill -s HUP $MAINPID 26 | ExecStop=/bin/kill -s TERM $MAINPID 27 | PrivateTmp=true 28 | 29 | [Install] 30 | WantedBy=multi-user.target 31 | -------------------------------------------------------------------------------- /resources/config_files/etc/systemd/system/scibot-bookmarklet.socket: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=scibot bookmarklet gunicorn socket 3 | 4 | [Socket] 5 | ListenStream=/run/scibot-bookmarket/socket 6 | 7 | [Install] 8 | WantedBy=sockets.target 9 | -------------------------------------------------------------------------------- /resources/config_files/etc/systemd/system/scibot-dashboard.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=scibot dashboard gunicorn daemon 3 | Requires=scibot-dashboard.socket 4 | After=network.target 5 | 6 | [Service] 7 | PIDFile=/run/scibot-dashboard/pid 8 | User={scibot-user} 9 | Group={scibot-user} 10 | RuntimeDirectory=scibot-dashboard 11 | WorkingDirectory=/home/{scibot-user}/run 12 | ExecStart=/home/{scibot-user}/.local/bin/pipenv run scibot-dashboard 13 | ExecReload=/bin/kill -s HUP $MAINPID 14 | ExecStop=/bin/kill -s TERM $MAINPID 15 | PrivateTmp=true 16 | 17 | [Install] 18 | WantedBy=multi-user.target 19 | -------------------------------------------------------------------------------- /resources/config_files/etc/systemd/system/scibot-dashboard.socket: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=scibot dashboard gunicorn socket 3 | 4 | [Socket] 5 | ListenStream=/run/scibot-dashboard/socket 6 | 7 | [Install] 8 | WantedBy=sockets.target 9 | -------------------------------------------------------------------------------- /resources/config_files/etc/tmpfiles.d/scibot-bookmarklet.conf: -------------------------------------------------------------------------------- 1 | d /run/scibot-bookmarklet 0755 scibot scibot - 2 | d /run/scibot-bookmarklet-sync 0755 scibot scibot - 3 | 
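These tmpfiles entries create the runtime socket/pid directories referenced by the units above. The rpm's `%post` script runs `systemd-tmpfiles --create` at install time, but they can also be applied and checked by hand:
```bash
systemd-tmpfiles --create /etc/tmpfiles.d/scibot-bookmarklet.conf
ls -ld /run/scibot-bookmarklet /run/scibot-bookmarklet-sync  # both should be 0755 scibot:scibot
```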
-------------------------------------------------------------------------------- /resources/rpmbuild/.gitignore: -------------------------------------------------------------------------------- 1 | BUILD/* 2 | RPMS/* 3 | SRPMS/* 4 | -------------------------------------------------------------------------------- /resources/rpmbuild/SOURCES/env.conf: -------------------------------------------------------------------------------- 1 | ../../config_files/etc/systemd/system/env.conf -------------------------------------------------------------------------------- /resources/rpmbuild/SOURCES/nginx.conf: -------------------------------------------------------------------------------- 1 | ../../config_files/etc/nginx/nginx.conf -------------------------------------------------------------------------------- /resources/rpmbuild/SOURCES/scibot-bookmarklet-sync.service: -------------------------------------------------------------------------------- 1 | ../../config_files/etc/systemd/system/scibot-bookmarklet-sync.service -------------------------------------------------------------------------------- /resources/rpmbuild/SOURCES/scibot-bookmarklet.conf: -------------------------------------------------------------------------------- 1 | ../../config_files/etc/tmpfiles.d/scibot-bookmarklet.conf -------------------------------------------------------------------------------- /resources/rpmbuild/SOURCES/scibot-bookmarklet.service: -------------------------------------------------------------------------------- 1 | ../../config_files/etc/systemd/system/scibot-bookmarklet.service -------------------------------------------------------------------------------- /resources/rpmbuild/SOURCES/scibot-bookmarklet.socket: -------------------------------------------------------------------------------- 1 | ../../config_files/etc/systemd/system/scibot-bookmarklet.socket -------------------------------------------------------------------------------- /resources/rpmbuild/SOURCES/scibot.conf: -------------------------------------------------------------------------------- 1 | ../../config_files/etc/nginx/scibot.conf -------------------------------------------------------------------------------- /resources/rpmbuild/SPECS/scibot.spec: -------------------------------------------------------------------------------- 1 | # you must build this with --nodeps if you are not on a RHEL alike 2 | %define _unitdir /lib/systemd/system 3 | %define _etcdir /etc/systemd/system 4 | 5 | # building on gentoo makes this /var/lib for some reason :/ 6 | %define _localstatedir /var 7 | 8 | %define scibot_user scibot 9 | %define scibot_group %{scibot_user} 10 | %define scibot_home %{_localstatedir}/lib/scibot 11 | %define scibot_log %{_localstatedir}/log/scibot 12 | %define scibot_source_log %{scibot_home}/logs 13 | 14 | %define name scibot 15 | %define version 9999 16 | Name: %{name} 17 | Version: %{version} 18 | Release: 0 19 | Summary: curation workflow automation and coordination 20 | License: Apache-2.0 21 | Url: https://github.com/SciCrunch/scibot 22 | BuildArch: noarch 23 | BuildRequires: systemd 24 | BuildRequires: git 25 | Requires: gcc # eventually this should be a build requires 26 | Requires: bash 27 | Requires: nginx 28 | Requires: python3 29 | Requires: python3-devel # eventual build requires 30 | Requires(post): systemd 31 | Requires(preun): systemd 32 | Requires(postun): systemd 33 | 34 | Source1: scibot-bookmarklet.socket 35 | Source2: scibot-bookmarklet.service 36 | Source3: scibot-bookmarklet-sync.service 37 | Source4: 
env.conf 38 | Source5: scibot-bookmarklet.conf 39 | Source6: nginx.conf 40 | Source7: scibot.conf 41 | 42 | %description 43 | curation workflow automation and coordination 44 | 45 | %prep 46 | 47 | if [[ ! -d %{buildroot} ]]; then 48 | mkdir %{buildroot}; 49 | fi 50 | 51 | %define gitroot scibot 52 | if [[ ! -d %{gitroot} ]]; then 53 | git clone https://github.com/SciCrunch/scibot.git 54 | fi 55 | 56 | %build 57 | #pushd %{gitroot} 58 | #python3 setup.py bdist_wheel 59 | #%py3_build 60 | 61 | %install 62 | install -p -D -m 644 %{SOURCE1} %{buildroot}/%{_unitdir}/scibot-bookmarklet.socket 63 | install -p -D -m 644 %{SOURCE2} %{buildroot}/%{_unitdir}/scibot-bookmarklet.service 64 | install -p -D -m 644 %{SOURCE3} %{buildroot}/%{_unitdir}/scibot-bookmarklet-sync.service 65 | install -p -D -m 600 %{SOURCE4} %{buildroot}/%{_etcdir}/scibot-bookmarklet.service.d/env.conf 66 | install -p -D -m 644 %{SOURCE5} %{buildroot}/etc/tmpfiles.d/scibot-bookmarklet.conf 67 | install -p -D -m 644 %{SOURCE6} %{buildroot}/etc/nginx/nginx.conf 68 | install -p -D -m 644 %{SOURCE7} %{buildroot}/etc/nginx/scibot.conf 69 | #%py3_install 70 | 71 | %pre 72 | getent group %{scibot_group} > /dev/null || groupadd -r %{scibot_group} 73 | getent passwd %{scibot_user} > /dev/null || \ 74 | useradd -r -m -d %{scibot_home} -g %{scibot_group} \ 75 | -s /bin/bash -c "scibot services" %{scibot_user} 76 | if [[ ! -d %{scibot_log} ]]; then 77 | mkdir %{scibot_log} # owner? 78 | chown %{scibot_user}:%{scibot_group} %{scibot_log} 79 | fi 80 | if [[ ! -d %{scibot_source_log} ]]; then 81 | mkdir %{scibot_source_log} 82 | chown %{scibot_user}:%{scibot_group} %{scibot_source_log} 83 | fi 84 | 85 | %post 86 | systemd-tmpfiles --create 87 | systemctl enable nginx 88 | systemctl enable scibot-bookmarklet 89 | systemctl enable scibot-bookmarklet-sync 90 | 91 | %clean 92 | rm -rf %{buildroot} 93 | 94 | %files 95 | %{_unitdir}/scibot-bookmarklet.socket 96 | %{_unitdir}/scibot-bookmarklet.service 97 | %{_unitdir}/scibot-bookmarklet-sync.service 98 | %{_etcdir}/scibot-bookmarklet.service.d/env.conf 99 | /etc/tmpfiles.d/scibot-bookmarklet.conf 100 | /etc/nginx/nginx.conf 101 | /etc/nginx/scibot.conf 102 | 103 | %changelog 104 | # skip this for now 105 | -------------------------------------------------------------------------------- /resources/scripts/scibot-monkey-button.user.js: -------------------------------------------------------------------------------- 1 | // ==UserScript== 2 | // @name SciBot Button 3 | // @namespace https://github.com/SciCrunch/scibot/tree/master/resources/scripts 4 | // @description Run SciBot in a way that ignores CORS 5 | // @match *://*/* 6 | // @exclude *://*.google.com/* 7 | // @exclude *://*.github.com/* 8 | // @exclude *://github.com/* 9 | // @version 1.0 10 | // @grant GM_addStyle 11 | // ==/UserScript== 12 | 13 | var zNode = document.createElement ('div'); 14 | zNode.innerHTML = ''; 15 | zNode.setAttribute ('id', 'scibotButtonContainer'); 16 | document.body.appendChild (zNode); 17 | 18 | //--- Activate the newly added button. 19 | document.getElementById ("runSciBot").addEventListener ( 20 | "click", ButtonClickAction, false 21 | ); 22 | 23 | function ButtonClickAction (zEvent) { 24 | /*--- For our dummy action, we'll just add a line of text to the top 25 | of the screen. 
26 | */ 27 | document.getElementById ("scibotButtonContainer").remove(); 28 | var xhr=new XMLHttpRequest(); 29 | var params=('uri=' + location.href + 30 | '&head=' + encodeURIComponent(document.head.innerHTML) + 31 | '&body=' + encodeURIComponent(document.body.innerHTML) + 32 | '&data=' + encodeURIComponent(document.body.innerText)); 33 | xhr.open('POST', 'https://scibot.scicrunch.io/rrid', true); 34 | xhr.setRequestHeader('Content-type', 'application/x-www-form-urlencoded'); 35 | xhr.setRequestHeader('Access-Control-Allow-Origin', '*'); 36 | xhr.onreadystatechange=function(){if(xhr.readyState==4) console.log('rrids: ' + xhr.responseText)}; 37 | xhr.send(params) 38 | } 39 | 40 | GM_addStyle ( ` 41 | #scibotButtonContainer { 42 | position: absolute; 43 | top: 0; 44 | left: 0; 45 | font-size: 20px; 46 | background: orange; 47 | border: 3px outset black; 48 | margin: 5px; 49 | opacity: 0.9; 50 | z-index: 9999; 51 | padding: 5px 20px; 52 | } 53 | #runSciBot { 54 | cursor: pointer; 55 | } 56 | #scibotButtonContainer p { 57 | color: red; 58 | background: white; 59 | } 60 | ` ); 61 | -------------------------------------------------------------------------------- /scibot/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.3' 2 | -------------------------------------------------------------------------------- /scibot/anno.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from hyputils.memex import models 3 | from hyputils.memex.util.uri import normalize as uri_normalize 4 | from hyputils.memex.models.document import update_document_metadata 5 | from hyputils.memex.schemas.annotation import CreateAnnotationSchema 6 | from pyontutils.utils import anyMembers, noneMembers 7 | 8 | 9 | class FakeRequest: 10 | def __init__(self, json): 11 | self.json = json 12 | self.authenticated_userid = json['user'] 13 | 14 | 15 | def validate(j): 16 | request = FakeRequest(j) 17 | schema = CreateAnnotationSchema(request) 18 | appstruct = schema.validate(j) 19 | return appstruct 20 | 21 | 22 | def extract_extra(j): 23 | return j['id'], j['created'], j['updated'] 24 | 25 | 26 | def make_anno(data, dbdocs): 27 | #document_uri_dicts = data['document']['document_uri_dicts'] 28 | #document_meta_dicts = data['document']['document_meta_dicts'] 29 | #del data['document'] 30 | #data = {k:v for k, v in data.items() if k != 'document'} # prevent overwrite on batch load 31 | 32 | annotation = models.Annotation(**data) # FIXME for batch the overhead here is stupid beyond belief 33 | annotation.document_id = dbdocs[uri_normalize(annotation.target_uri)].id 34 | #for k, v in data.items(): 35 | #print(k, v) 36 | #setattr(annotation, k, v) 37 | #id, created, updated = extra 38 | #annotation.id = id 39 | #annotation.created = created 40 | #annotation.updated = updated 41 | 42 | return annotation 43 | 44 | # this baby is super slow 45 | document = update_document_metadata( 46 | session, 47 | annotation.target_uri, 48 | document_meta_dicts, 49 | document_uri_dicts, 50 | created=created, 51 | updated=updated) 52 | annotation.document = document 53 | 54 | return annotation 55 | 56 | 57 | def quickload(j): 58 | """ a quickload routine for json that comes from the hypothes.is api 59 | and that has already passed the json schema validate checks """ 60 | 61 | return { 62 | 'id':j['id'], 63 | 'created':j['created'], 64 | 'updated':j['updated'], 65 | #'document':{}, 66 | 'extra':{}, 67 | 
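        # the remaining keys rename fields from the Hypothesis API payload to the
        # memex Annotation column names; 'shared' is derived from the (since removed)
        # 'hidden' flag when present, and selectors are flattened out of the target list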
'groupid':j['group'], 68 | 'references':j['references'] if 'references' in j else [], 69 | 'shared':not j['hidden'] if 'hidden' in j else True, # some time in august hidden was dropped 70 | 'tags':j['tags'], 71 | 'target_selectors':[selector 72 | for selector_sources in j['target'] 73 | if 'selector' in selector_sources 74 | for selector in selector_sources['selector']] , 75 | 'target_uri':j['uri'], # FIXME check on this vs selectors 76 | 'text':j['text'], 77 | 'userid':j['user'], 78 | } 79 | 80 | 81 | def doc(j): 82 | # FIXME this skips the normalize routines ... 83 | return {'document_meta_dicts': ([{'claimant': j['uri'], 84 | 'type': 'title', # FIXME see if more 85 | 'value': j['document']['title']}] 86 | if 'title' in j['document'] 87 | else []), 88 | 'document_uri_dicts': [{'claimant': j['uri'], 89 | 'content_type': '', # FIXME see if more 90 | 'type': 'self-claim', # FIXME see if more 91 | 'uri': j['uri']}]} 92 | 93 | 94 | def mdoc(uri, claims): 95 | return {'document_meta_dicts': claims, 96 | 'document_uri_dicts': [{'claimant': uri, 97 | 'content_type': '', # FIXME see if more 98 | 'type': 'self-claim', # FIXME see if more 99 | 'uri': uri}]} 100 | 101 | 102 | def add_doc_all(uri, created, updated, claims): # batch only run once 103 | doc = models.Document(created=created, updated=updated) 104 | duri = models.DocumentURI(document=doc, # how does this play out w/o creating explicitly? 105 | claimant=uri, 106 | uri=uri, 107 | type='self-claim', 108 | created=created, 109 | updated=updated) 110 | #yield doc 111 | #yield duri 112 | for claim in claims: 113 | #yield 114 | models.DocumentMeta(document=doc, 115 | created=created, 116 | updated=updated, 117 | # FIXME for this we may need to pull the latest??? or no 118 | **claim) 119 | 120 | return doc 121 | 122 | 123 | def quickuri(j): 124 | return (j['created'], 125 | j['updated'], 126 | [{'claimant':j['uri'], 'type':k, 'value':v} 127 | for k, v in j['document'].items()]) 128 | -------------------------------------------------------------------------------- /scibot/bookmarklet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """SciBot server implementation 3 | 4 | Usage: 5 | bookmarklet [options] 6 | 7 | Options: 8 | -s --sync-port=PORT the port that the sync services is running on 9 | """ 10 | 11 | import re 12 | import csv 13 | import ssl 14 | import gzip 15 | import json 16 | from io import StringIO 17 | from typing import Callable, Iterable, Tuple, Any, Generator 18 | from pathlib import Path 19 | from datetime import datetime 20 | from curio import run 21 | from curio.channel import AuthenticationError 22 | from flask import Flask, request, abort 23 | from hyputils.hypothesis import HypothesisUtils 24 | from scibot.config import source_log_location 25 | from scibot.utils import log 26 | from scibot.export import export_impl, export_json_impl 27 | 28 | try: 29 | from scibot.workflow import curatorTags 30 | except ImportError: 31 | # FIXME don't want a hard rdflib dependency here 32 | curatorTags = lambda : [] 33 | 34 | # logging 35 | 36 | def write_stdout(target_uri, document, doi, pmid, found_rrids, head, body, text, h): 37 | log.info(f'DOI:{doi}') 38 | log.info(pmid) 39 | 40 | 41 | def write_log(target_uri, document, doi, pmid, found_rrids, head, body, text, h): 42 | now = datetime.now().isoformat()[0:19].replace(':','').replace('-','') 43 | frv = list(set(found_rrids.values())) 44 | if len(frv) == 1 and frv[0] == 'Already Annotated': 45 | head, body, text = None, 
None, None 46 | log = {'target_uri':target_uri, 47 | 'group':h.group, 48 | 'doi':doi, 49 | 'pmid':pmid, 50 | 'found_rrids':found_rrids, 51 | 'count':len(found_rrids), 52 | 'head':head, 53 | 'body':body, 54 | 'text':text, 55 | 'document': document, 56 | } 57 | fname = Path(source_log_location, f'rrid-{now}.json') 58 | with open(fname.as_posix(), 'wt') as f: 59 | json.dump(log, f, sort_keys=True, indent=4) 60 | 61 | # types 62 | 63 | Found = Tuple[str, str, str, str] 64 | Finder = Callable[[str], Iterable[Found]] 65 | Checker = Callable[[Found], bool] 66 | Resolved = Tuple[str, int, str] 67 | Resolver = Callable[[Found], Resolved] 68 | Submitter = Callable[[Found, Resolved], Any] 69 | Processor = Callable[[str, str], Generator] 70 | 71 | # bookmarklet endpoint 72 | 73 | bookmarklet_base = r""" 74 | javascript:(function(){var xhr=new XMLHttpRequest(); 75 | 76 | var params='uri='+location.href+ 77 | '&head='+encodeURIComponent(document.head.innerHTML)+ 78 | '&body='+encodeURIComponent(document.body.innerHTML)+ 79 | '&data='+encodeURIComponent(document.body.innerText); 80 | 81 | xhr.open('POST','%s/%s',true); 82 | xhr.setRequestHeader('Content-type','application/x-www-form-urlencoded'); 83 | xhr.setRequestHeader('Access-Control-Allow-Origin','*'); 84 | xhr.onreadystatechange=function(){if(xhr.readyState==4)console.log('rrids: '+xhr.responseText)}; 85 | xhr.send(params)}()); 86 | """ 87 | 88 | html_base = """ 89 | 90 | 94 | SciBot bookmarklet 95 | 96 |

SciBot

97 |

To install the bookmarklet, drag this link -- SciBot %s -- to your bookmarks bar.

98 |

If you need to copy/paste the bookmarklet's code into a bookmarklet, it's here:

99 | %s 100 | 101 | 102 | """ 103 | 104 | 105 | def bookmarklet_wrapper(request, endpoint): 106 | """ Return text of the SciBot bookmarklet """ 107 | normalized = 'https://' + request.host 108 | code = bookmarklet_base % (normalized, endpoint) 109 | bookmarklet = code.replace('"', '"').replace('\n','') 110 | html = html_base % (bookmarklet, request.host.split('.', 1)[-1], code) 111 | return html 112 | 113 | 114 | # rrid endpoint 115 | 116 | from scibot.extract import process_POST_request, find_rrids as finder 117 | from scibot.check import check_already_submitted 118 | from scibot.services import existing_tags, get_pmid, rrid_resolver_xml 119 | from scibot.submit import annotate_doi_pmid, submit_to_h 120 | 121 | 122 | def make_find_check_resolve_submit(finder: Finder, notSubmittedCheck: Checker, 123 | resolver: Resolver, submitter: Submitter) -> Processor: 124 | def inner(text: str) -> Generator: 125 | for found in finder(text): 126 | log.info(found) 127 | if notSubmittedCheck(found): 128 | resolved = resolver(found) 129 | yield submitter(found, resolved) 130 | 131 | return inner 132 | 133 | 134 | def pmid_logic(doi, pmid_from_source, target_uri=None, document=None, h=None, tags=None): 135 | # TODO move the annotation of errors out of this 136 | if doi: 137 | pmid_from_doi = get_pmid(doi) 138 | else: 139 | pmid_from_doi = None 140 | 141 | if pmid_from_source and pmid_from_doi: 142 | if pmid_from_source == pmid_from_doi: 143 | pmid = pmid_from_source 144 | else: 145 | # TODO responses -> db 146 | # TODO tag for marking errors explicitly without the dashboard? 147 | r1 = annotate_doi_pmid(target_uri, document, None, pmid_from_doi, h, tags, 'ERROR\nPMID from DOI') 148 | r2 = annotate_doi_pmid(target_uri, document, None, pmid_from_source, h, tags, 'ERROR\nPMID from source') 149 | pmid = None 150 | elif pmid_from_source: 151 | pmid = pmid_from_source 152 | elif pmid_from_doi: 153 | pmid = pmid_from_doi 154 | else: 155 | pmid = None 156 | 157 | return pmid 158 | 159 | 160 | def rrid_POST(request, h, logloc, URL_LOCK): 161 | (target_uri, document, doi, pmid_from_source, 162 | head, body, text, cleaned_text) = process_POST_request(request) 163 | running = URL_LOCK.start_uri(target_uri) 164 | log.info(target_uri) 165 | if running: 166 | log.info('################# EARLY EXIT') 167 | return 'URI Already running ' + target_uri 168 | 169 | try: 170 | tags, unresolved_exacts = existing_tags(target_uri, h) 171 | pmid = pmid_logic(doi, pmid_from_source, target_uri, document, h, tags) 172 | r = annotate_doi_pmid(target_uri, document, doi, pmid, h, tags) # todo r -> db with responses 173 | 174 | # these values are defined up here as shared state that will be 175 | # mutated across multiple calls to checker, resolver, and submitter 176 | # this is a really bad design because it is not clear that processText 177 | # actually does this ... 
once again, python is best if you just use the 178 | # objects and give up any hope for an alternative approach, the way it 179 | # is done here also makes the scope where these values could be used 180 | # completely ambiguous and hard to understand/reason about 181 | 182 | found_rrids = {} 183 | existing = [] 184 | existing_with_suffixes = [] 185 | 186 | def checker(found): 187 | prefix, exact, exact_for_hypothesis, suffix = found 188 | return not check_already_submitted(exact, exact_for_hypothesis, 189 | found_rrids, tags, unresolved_exacts) 190 | 191 | def resolver(found): 192 | prefix, exact, exact_for_hypothesis, suffix = found 193 | return rrid_resolver_xml(exact, found_rrids) 194 | 195 | def submitter(found, resolved): 196 | return submit_to_h(target_uri, document, found, resolved, h, found_rrids, 197 | existing, existing_with_suffixes) 198 | 199 | processText = make_find_check_resolve_submit(finder, checker, resolver, submitter) 200 | 201 | responses = list(processText(cleaned_text)) # this call runs everything 202 | 203 | results = ', '.join(found_rrids.keys()) 204 | write_stdout(target_uri, document, doi, pmid, found_rrids, head, body, text, h) 205 | write_log(target_uri, document, doi, pmid, found_rrids, head, body, text, h) 206 | 207 | except BaseException as e: 208 | # there are some other linger issues that are what was causing 209 | # uris to get stuck as always running in sync 210 | log.exception(e) 211 | raise e 212 | 213 | finally: 214 | URL_LOCK.stop_uri(target_uri) 215 | 216 | return results, 200, {'Content-Type': 'text/plain', 217 | 'Access-Control-Allow-Origin':'*'} 218 | 219 | 220 | def rrid_OPTIONS(request): 221 | try: 222 | request_headers = request.headers['Access-Control-Request-Headers'].lower() 223 | request_headers = re.findall('\w(?:[-\w]*\w)', request_headers) 224 | except KeyError: 225 | request_headers = [] 226 | response_headers = ['access-control-allow-origin'] 227 | for req_acoa_header in request_headers: 228 | if req_acoa_header not in response_headers: 229 | response_headers.append(req_acoa_header) 230 | response_headers = ','.join(response_headers) 231 | return '', 204, {'Access-Control-Allow-Origin': '*', 232 | 'Access-Control-Allow-Headers': response_headers} 233 | 234 | 235 | def rrid_wrapper(request, h, logloc, URL_LOCK): 236 | """ Receive an article, parse RRIDs, resolve them, create annotations, log results """ 237 | if request.method == 'OPTIONS': 238 | return rrid_OPTIONS(request) 239 | elif request.method == 'POST': 240 | return rrid_POST(request, h, logloc, URL_LOCK) 241 | else: 242 | return abort(405) 243 | 244 | 245 | def main(local=False): 246 | from scibot.config import api_token, username, group, group2 247 | print(username, group, group2) # sanity check 248 | from scibot.sync import __doc__ as sync__doc__, Locker, client 249 | from scibot.config import syncword 250 | if syncword is None: 251 | raise KeyError('Please set the SCIBOT_SYNC environment variable') 252 | 253 | from docopt import docopt, parse_defaults 254 | _sdefaults = {o.name:o.value if o.argcount else None for o in parse_defaults(sync__doc__)} 255 | _backup_sync_port = int(_sdefaults['--port']) 256 | 257 | app = Flask('scibot bookmarklet server') 258 | 259 | h = HypothesisUtils(username=username, token=api_token, group=group) 260 | h2 = HypothesisUtils(username=username, token=api_token, group=group2) 261 | 262 | if __name__ == '__main__': 263 | args = docopt(__doc__) 264 | _sync_port = args['--sync-port'] 265 | 266 | if _sync_port: 267 | sync_port = int(_sync_port) 268 
| else: 269 | sync_port = _backup_sync_port 270 | else: 271 | sync_port = _backup_sync_port 272 | 273 | chan = 'localhost', sync_port 274 | 275 | # TODO 276 | #try: 277 | #except AuthenticationError as e: 278 | #raise e 279 | send = run(client, chan, syncword) 280 | URL_LOCK = Locker(send) 281 | app.URL_LOCK = URL_LOCK 282 | 283 | #@app.route('/synctest', methods=['GET']) 284 | def synctest(): 285 | URL_LOCK.start_uri('a-test-uri') 286 | URL_LOCK.stop_uri('a-test-uri') 287 | return 'test-passed?' 288 | 289 | synctest() 290 | 291 | @app.route('/controlled-tags', methods=['GET']) 292 | def route_controlled_tags(): 293 | curator_tags = curatorTags() # TODO need client support for workflow:RRID -> * here 294 | return '\n'.join(curator_tags), 200, {'Content-Type':'text/plain; charset=utf-8'} 295 | 296 | @app.route('/rrid', methods=['POST', 'OPTIONS']) 297 | def rrid(): 298 | return rrid_wrapper(request, h, 'logs/rrid/', URL_LOCK) 299 | 300 | @app.route('/validaterrid', methods=['POST', 'OPTIONS']) 301 | def validaterrid(request): 302 | return rrid_wrapper(request, h2, 'logs/validaterrid/', URL_LOCK) 303 | 304 | @app.route('/bookmarklet', methods=['GET']) 305 | def bookmarklet(): 306 | return bookmarklet_wrapper(request, 'rrid') 307 | 308 | @app.route('/validatebookmarklet', methods=['GET']) 309 | def validatebookmarklet(): 310 | return bookmarklet_wrapper(request, 'validaterrid') 311 | 312 | @app.route('/export', methods=['GET']) 313 | def export(): 314 | print('starting csv export') 315 | output_rows, DATE = export_impl() 316 | data = StringIO() 317 | writer = csv.writer(data) 318 | writer.writerows(sorted(output_rows)) 319 | return gzip.compress(data.getvalue().encode()), 200, { 320 | 'Content-Type': 'text/csv', 321 | 'Content-Disposition': 'attachment;filename = RRID-data-%s.csv' % DATE, 322 | 'Content-Encoding': 'gzip'} 323 | 324 | @app.route('/export.json', methods=['GET']) 325 | def export_json(): 326 | print('starting json export') 327 | output_json, DATE = export_json_impl() 328 | data = json.dumps(output_json, sort_keys=True, indent=4) 329 | 330 | return gzip.compress(data.encode()), 200, { 331 | 'Content-Type': 'application/json', 332 | 'Content-Encoding': 'gzip'} 333 | 334 | if not local: 335 | return app 336 | else: 337 | from os.path import expanduser 338 | from wsgiref.simple_server import make_server 339 | from scibot.config import test_host, port_bookmarklet 340 | 341 | print('no login detected, running on localhost only') 342 | host = test_host 343 | port = port_bookmarklet 344 | 345 | print('host: %s, port %s' % ( host, port )) 346 | server = make_server(host, port, app) 347 | # openssl req -new -x509 -keyout scibot-self-sign-temp.pem -out scibot-self-sign-temp.pem -days 365 -nodes 348 | #server.socket = ssl.wrap_socket(server.socket, 349 | #keyfile='/etc/letsencrypt/live/scibot.scicrunch.io/privkey.pem', 350 | #certfile='/etc/letsencrypt/live/scibot.scicrunch.io/fullchain.pem', 351 | #server_side=True) 352 | server.socket = ssl.wrap_socket(server.socket, 353 | keyfile=expanduser('~/files/certs/scibot_test/tmp-nginx.key'), 354 | certfile=expanduser('~/files/certs/scibot_test/tmp-nginx.crt'), 355 | server_side=True) 356 | log.debug('serving forever') 357 | server.serve_forever() 358 | 359 | 360 | if __name__ == '__main__': 361 | main(local=True) 362 | -------------------------------------------------------------------------------- /scibot/bookmarklet_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 
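# set the terminal window title to the module name using the xterm OSC 2
# escape sequence (ESC ] 2 ; title BEL) so this process is easy to spot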
os.sys.stdout.write(f'\x1b]2;{os.path.basename(__name__)}\x07\n') 4 | 5 | from scibot.bookmarklet import main 6 | app = main() 7 | 8 | if __name__ == '__main__': 9 | from scibot import config 10 | app.run(host='localhost', port=config.port_bookmarklet, threaded=True) 11 | -------------------------------------------------------------------------------- /scibot/check.py: -------------------------------------------------------------------------------- 1 | 2 | def check_already_submitted(exact, exact_for_hypothesis, found_rrids, tags, unresolved_exacts): 3 | if exact in tags or exact_for_hypothesis in unresolved_exacts: 4 | print('\tskipping %s, already annotated' % exact) 5 | found_rrids[exact] = 'Already Annotated' 6 | return True 7 | 8 | -------------------------------------------------------------------------------- /scibot/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """SciBot command line utilities 3 | 4 | Usage: 5 | scibot db-init [options] [] 6 | scibot api-sync [options] [] 7 | scibot ws-sync [options] [] 8 | scibot debug [options] [] 9 | 10 | Options: 11 | -h --help show this 12 | -d --debug enable echo and embed 13 | -k --check when syncing run checks (required to insert) 14 | """ 15 | 16 | import os 17 | try: 18 | breakpoint 19 | except NameError: 20 | from IPython import embed as breakpoint 21 | 22 | 23 | def main(): 24 | from docopt import docopt 25 | args = docopt(__doc__) 26 | database = args[''] 27 | if database is not None: 28 | os.environ.update({'SCIBOT_DATABASE': database}) 29 | 30 | from scibot import config 31 | from scibot.db import getSession, init_scibot, AnnoSyncFactory, WebsocketSyncFactory 32 | 33 | if args['db-init']: 34 | # insurace, it is passed into init direclty as well 35 | #os.system(f'scibot-dbsetup {config.dbPort()} {database}') 36 | # the above should be done manually to prevent fat fingers 37 | init_scibot(database) 38 | 39 | elif args['api-sync']: 40 | check = args['--check'] 41 | dburi = config.dbUri(user='scibot-admin') # needed ofr update 42 | session = getSession(dburi=dburi, echo=args['--debug']) 43 | AnnoSync = AnnoSyncFactory(session) 44 | cur_sync = AnnoSync(config.api_token, config.username, 45 | config.group, config.memfile) 46 | cur_sync.sync_annos(check=check) 47 | pub_sync = AnnoSync(config.api_token, config.username, 48 | config.group_staging, config.pmemfile) 49 | pub_sync.sync_annos(check=check) 50 | 51 | elif args['ws-sync']: 52 | session = getSession(echo=args['--debug']) 53 | WebsocketSync = WebsocketSyncFactory(session) 54 | wss = WebsocketSync(config.api_token, config.username, config.group) 55 | wss.run() 56 | 57 | elif args['debug']: 58 | from time import time 59 | session = getSession(echo=args['--debug']) 60 | if True: 61 | dcount = {r.uri:r.document_id 62 | for r in session.execute('SELECT uri, document_id FROM document_uri')} 63 | from hyputils.memex import models 64 | from hyputils.hypothesis import Memoizer 65 | from scibot.anno import disambiguate_uris 66 | from interlex.core import makeParamsValues 67 | mem = Memoizer(config.memfile, config.api_token, config.username, config.group) 68 | annos, last_updated = mem.get_annos_from_file() 69 | uris = set(a.uri for a in annos) 70 | dd = disambiguate_uris(uris) 71 | multi = [v for v in dd.values() if len(v) > 1] 72 | _rows = [a._row for a in annos] 73 | AnnoSync = AnnoSyncFactory(session) 74 | cur_sync = AnnoSync(config.api_token, config.username, config.group) 75 | 76 | rows = _rows 77 | 78 | # rows = [r 
for r in _rows if 'articles/4-42/' in r['uri']] 79 | # rows = [r for r in _rows if '10.1002/jnr.23615' in r['uri']] 80 | # rows = [r for r in _rows if 'ncomms8028' in r['uri']] # TODO res chain these 81 | # rows = [r for r in _rows if '?term=Gene' in r['uri']] 82 | # rows = [r for r in _rows if 'index.php?' in r['uri']] 83 | # rows = [r for r in _rows if 'govhttp' in r['uri']] # maximum wat 84 | # rows = [r for r in _rows if 'fasebj.org' in r['uri']] 85 | 86 | check = False 87 | 88 | cur_sync.memoization_file = config.memfile 89 | cur_sync.sync_annos(check=check) 90 | 91 | 92 | return 93 | cur_sync.sync_annos(api_rows=rows, check=check) 94 | # when remote the upload bandwidth is now the limiting factor 95 | session.rollback() 96 | cur_sync.sync_annos(check=check) 97 | session.rollback() 98 | breakpoint() 99 | 100 | 101 | if __name__ == '__main__': 102 | main() 103 | -------------------------------------------------------------------------------- /scibot/config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from os import environ 3 | from socket import gethostname 4 | from pathlib import Path 5 | from hyputils.hypothesis import group_to_memfile, ucd 6 | 7 | # ports 8 | port_bookmarklet = 4443 9 | port_dashboard = 8080 10 | 11 | ## WARNING if you change one of these update the file in bin/ 12 | port_guni_bookmarket = 5000 # scibot-bookmarklet 13 | port_guni_dashboard = 5005 # scibot-dashboard 14 | 15 | # dev 16 | dev_remote_hosts = 'athena', 'arachne' 17 | 18 | # testing 19 | test_host = 'localhost' 20 | test_port = port_bookmarklet 21 | test_database = '__scibot_testing' 22 | 23 | # db 24 | user = 'scibot-user' 25 | database = environ.get('SCIBOT_DATABASE', test_database) 26 | 27 | 28 | def dbPort(): 29 | return 54321 if gethostname() in dev_remote_hosts else 5432 30 | 31 | 32 | def dbUri(user=user, host='localhost', port=dbPort(), database=database): 33 | if hasattr(sys, 'pypy_version_info'): 34 | dialect = 'psycopg2cffi' 35 | else: 36 | dialect = 'psycopg2' 37 | return f'postgresql+{dialect}://{user}@{host}:{port}/{database}' 38 | 39 | 40 | # mq 41 | vhost = 'scibot' 42 | broker_url = environ.get('CELERY_BROKER_URL', 43 | environ.get('BROKER_URL', 44 | 'amqp://guest:guest@localhost:5672//')) 45 | broker_backend = environ.get('CELERY_BROKER_BACKEND', 46 | environ.get('BROKER_BACKEND', 47 | 'rpc://')) 48 | accept_content = ('pickle', 'json') 49 | 50 | # logging 51 | source_log_location = environ.get('SOURCE_LOG_LOC', 52 | (Path(__file__).parent.parent / 53 | 'logs').as_posix()) 54 | 55 | # hypothesis 56 | api_token = environ.get('SCIBOT_API_TOKEN', 'TOKEN') # Hypothesis API token 57 | username = environ.get('SCIBOT_USERNAME', 'USERNAME') # Hypothesis username 58 | group = environ.get('SCIBOT_GROUP', '__world__') 59 | group2 = environ.get('SCIBOT_GROUP2', '__world__') 60 | group_staging = environ.get('SCIBOT_GROUP_STAGING', '__world__') 61 | syncword = environ.get('SCIBOT_SYNC') 62 | 63 | READ_ONLY = True 64 | if group_staging == '__world__' and not READ_ONLY: 65 | raise IOError('WARNING YOU ARE DOING THIS FOR REAL PLEASE COMMENT OUT THIS LINE') 66 | 67 | def _post(group_hash): 68 | if group_hash.startswith('f'): 69 | print('Real annos') 70 | elif group_hash.startswith('9'): 71 | print('Test annos') 72 | 73 | memfile = group_to_memfile(group, _post) 74 | 75 | pmemfile = f'{ucd}/scibot/annos-__world__-{username}.json' 76 | 77 | if group_staging == '__world__': 78 | smemfile = f'{ucd}/scibot/annos-__world__-{username}.json' 79 | else: 80 
| smemfile = group_to_memfile(group_staging) 81 | 82 | # rrid resolver 83 | resolver_xml_filepath = Path('~/ni/dev/rrid/scibot/scibot_rrid_xml.pickle').expanduser() # FIXME 84 | -------------------------------------------------------------------------------- /scibot/dash.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey 2 | monkey.patch_all() 3 | 4 | import os 5 | 6 | os.sys.stdout.write(f'\x1b]2;{os.path.basename(__name__)}\x07\n') 7 | 8 | from scibot.dashboard import setup 9 | app = setup() 10 | -------------------------------------------------------------------------------- /scibot/dashboard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import atexit 3 | from os import environ 4 | from pathlib import Path 5 | from jinja2 import ChoiceLoader, FileSystemLoader 6 | from scibot.utils import PMID, DOI 7 | from scibot.config import api_token, username, group, group_staging, memfile, pmemfile 8 | from scibot.release import Curation, PublicAnno 9 | from scibot.export import bad_tags 10 | from pyontutils.utils import anyMembers, noneMembers 11 | from htmlfn import render_table, htmldoc, atag, divtag 12 | from htmlfn import table_style, navbar_style, cur_style 13 | 14 | from hyputils.subscribe import preFilter, AnnotationStream 15 | from hyputils.handlers import helperSyncHandler, filterHandler 16 | from hyputils.hypothesis import Memoizer 17 | from flask import Flask, render_template, request, url_for 18 | try: 19 | breakpoint 20 | except NameError: 21 | from IPython import embed as breakpoint 22 | 23 | print('END IMPORTS') 24 | 25 | def route(route_name): 26 | def wrapper(function): 27 | def inner(*args, **kwargs): 28 | print(route_name) 29 | return function(*args, **kwargs) 30 | return inner 31 | return wrapper 32 | 33 | def make_app(annos, pannos=[]): 34 | 35 | app = Flask('scibot dashboard') 36 | 37 | template_loader = ChoiceLoader([app.jinja_loader, 38 | FileSystemLoader([(Path(__file__).parent.parent / 'templates').as_posix()])]) 39 | app.jinja_loader = template_loader 40 | 41 | [Curation(a, annos) for a in annos] 42 | [PublicAnno(a, pannos) for a in pannos] 43 | base_url = '/dashboard/' 44 | names = ['missing', 'incorrect', 'papers', 'unresolved', 45 | 'no-pmid', 'no-doi', 'no-annos', 'table', 'Journals'] 46 | 47 | def tag_string(c): 48 | return ' '.join(sorted(t.replace('RRIDCUR:', '') 49 | for t in c.tags if 'RRIDCUR' in t)) 50 | 51 | def filter_rows(filter=lambda c: True): 52 | yield from ((str(i + 1), 53 | tag_string(c), 54 | atag(PMID(c.pmid), c.pmid, new_tab=True) 55 | if c.pmid 56 | else (atag(DOI(c.doi), c.doi, new_tab=True) 57 | if c.doi 58 | else ''), 59 | atag(c.shareLink, 'Annotation', new_tab=True) 60 | if c # FIXME how does this work? 61 | else atag(c.uri, 'Paper', new_tab=True), 62 | atag(c.htmlLink, 'Anno HTML', new_tab=True), 63 | c.user, 64 | '\n'.join(c.curator_notes)) 65 | for i, c in enumerate(sorted((c for c in Curation 66 | if c.isAstNode 67 | and not c.Duplicate 68 | and not c.corrected # FIXME need a better way... 
69 | and not c.public_id 70 | and filter(c)), 71 | key=tag_string))) 72 | k = 0 73 | kList = [] 74 | URLDict = {} 75 | for h in Curation: 76 | if BaseURL(h._anno) in URLDict.keys(): 77 | URLDict[BaseURL(h._anno)] += 1 78 | else: 79 | URLDict[BaseURL(h._anno)] = 1 80 | kList.append(k) 81 | class NavBar: 82 | def atag(self, route, name): 83 | if route == self.current_route: 84 | return atag(url_for(route), name, cls='navbar-select') 85 | else: 86 | return atag(url_for(route), name) 87 | 88 | def __call__(self, route=None): 89 | self.current_route = route 90 | out = divtag(self.atag('route_base', 'Home'), 91 | self.atag('route_papers', 'Papers'), 92 | self.atag('route_anno_help_needed', 'Help Needed'), 93 | self.atag('route_anno_incorrect', 'Incorrect'), 94 | self.atag('route_anno_unresolved', 'Unresolved'), 95 | self.atag('route_anno_missing', 'Missing'), 96 | self.atag('route_no_pmid', 'No PMID'), 97 | self.atag('route_no_doi', 'No DOI'), 98 | self.atag('route_no_id', 'No ID'), 99 | #self.atag('route_no_annos', 'No annos'), 100 | self.atag('route_table', 'All'), 101 | # TODO search box 102 | atag('https://github.com/SciCrunch/scibot/issues', 103 | 'GitHub issues', new_tab=True), 104 | cls='navbar') 105 | self.current_route = None 106 | return out 107 | 108 | navbar = NavBar() 109 | 110 | def table_rows(rows, title, route): 111 | return htmldoc(navbar(route), 112 | divtag(render_table(rows, '#', 'Problem', 'Identifier', 'Link', 'HTML Link', 'Curator', 'Notes'), 113 | cls='main'), 114 | title=title, 115 | styles=(table_style, cur_style, navbar_style)) 116 | 117 | def nonestr(thing): 118 | return '' if thing is None else thing 119 | 120 | def done_rrids(rrids): 121 | for rrid, s in rrids.items(): 122 | for a in s: 123 | if a.Validated: 124 | yield rrid 125 | break 126 | 127 | def todo_rrids(rrids): 128 | done = set(done_rrids(rrids)) 129 | for rrid in rrids: 130 | if rrid not in done: 131 | yield rrid 132 | 133 | def render_papers(rows): 134 | return divtag(render_table(rows, 135 | '#', 'Paper', 'PMID', 'DOI', 136 | 'TODO', 'Done', 'RRIDs', 'Annotations'), 137 | cls='main') 138 | 139 | def papers(filter=lambda a:True): 140 | return [(str(i + 1),) + t 141 | for i, t in 142 | enumerate(sorted(((atag(url, '...' 
+ url[-20:], new_tab=True), 143 | nonestr(rrids.pmid), 144 | '' if 145 | rrids.doi is None else 146 | atag(DOI(rrids.doi), rrids.doi, new_tab=True), 147 | str(len(list(todo_rrids(rrids)))), 148 | str(len(list(done_rrids(rrids)))), 149 | str(len(rrids)), 150 | str(len([a for r in rrids.values() 151 | for a in r]))) 152 | for url, rrids in Curation._papers.items() 153 | if filter(next(a for s in rrids.values() 154 | for a in s))), 155 | key=lambda r: int(r[3]), 156 | reverse=True))] 157 | 158 | def no_pmid(): 159 | return papers(lambda a:a.pmid is None) 160 | 161 | def no_doi(): 162 | return papers(lambda a:a.doi is None) 163 | 164 | def no_id(): 165 | return papers(lambda a:a.doi is None and a.pmid is None) 166 | 167 | def no_annos(): # TODO 168 | return [] 169 | 170 | @app.route('/css/table.css') 171 | def route_css_table_style(): 172 | return table_style, 200, {'Content-Type':'text/css'} 173 | 174 | @app.route('/dashboard', methods=('GET', 'POST')) 175 | @app.route('/dashboard/', methods=('GET', 'POST')) 176 | def route_base(): 177 | return render_template('main.html', method='get', 178 | navbar=navbar(request.url_rule.endpoint), 179 | navbar_style = navbar_style, 180 | var='We have a lot of work to do!', 181 | nmissing='??', 182 | nures='??', 183 | incor='??', 184 | npapers=str(len(Curation._papers)), 185 | nnopmid=str(len(no_pmid())), 186 | nnodoi=str(len(no_doi())), 187 | #nnoboth=str(len(no_both())), 188 | #nnoannos=str(len(no_annos())) 189 | nnoannos='??', 190 | allp='??',) 191 | 192 | @app.route('/dashboard/anno-count') 193 | def route_anno_count(): 194 | return str(len(Curation._annos_list)) 195 | 196 | #@app.route(PurePath(base_url, 'anno-tags').as_posix()) 197 | @app.route('/dashboard/anno-user/') 198 | def route_anno_tags(user): 199 | print(user) 200 | out = '\n'.join([f'{anno.user} {anno.text} {anno.tags}
' 201 | for anno in Curation._annos_list if anno.user == user]) 202 | return out 203 | 204 | @app.route('/dashboard/journals') 205 | def route_Journals(): 206 | file = open("Journals.txt","r") 207 | paperStr = file.read() 208 | file.close() 209 | if paperStr == '': 210 | h = 0 211 | URLList = [] 212 | counter = 0 213 | paperStr = str(counter) + ' Results:

' 214 | print("PROCESSING") 215 | for h in Curation: 216 | journal = Journal(h._anno) 217 | if "urn:x-pdf" in journal or "file:" in journal: 218 | URLList.append(journal) 219 | if journal == "": 220 | print(h.shareLink) 221 | if journal not in URLList: 222 | paperStr += "
Journal Link
" 223 | paperStr += journal 224 | counter += 1 225 | URLList.append(journal) 226 | paperStr = str(counter) + paperStr[1:] 227 | file = open("Journals.txt", "w") 228 | file.write(paperStr) 229 | file.close() 230 | return (paperStr) 231 | 232 | @app.route('/dashboard/DOI') 233 | def route_DOI(): 234 | DOIStr = "" 235 | DOIList = [] 236 | counter = 0 237 | for h in Curation: 238 | if [t for t in h.tags if t.startswith("DOI")]: 239 | if h.doi not in DOIList: 240 | DOIStr += '
Anno #:%s
' % h 241 | DOIStr += ' Anno Link
' 242 | DOIStr += h.doi 243 | counter += 1 244 | if h.doi: 245 | DOIList.append(h.doi) 246 | return (str(counter) + "

" + DOIStr) 247 | 248 | @app.route('/dashboard/done') 249 | def route_done(): 250 | return 'TODO' 251 | 252 | @app.route('/dashboard/public') 253 | def route_public(): 254 | #return 'TODO' 255 | rows = ((str(i + 1),) + r for i, r in 256 | enumerate((nonestr(pa.curation_paper.pmid), 257 | nonestr(pa.curation_paper.doi), 258 | pa.rrid,) 259 | for pa in PublicAnno 260 | # skip incorrectly formatted and errors for now 261 | if pa.curation_ids and 262 | None not in pa.curation_annos and 263 | pa.rrid is not None # FIXME make clear these are page notes 264 | )) 265 | return htmldoc(navbar(request.url_rule.endpoint), 266 | divtag(render_table(rows, '#', 'PMID', 'DOI', 'RRID'), 267 | cls='main'), 268 | title='SciBot public release', 269 | styles=(table_style, cur_style, navbar_style)) 270 | 271 | @app.route('/dashboard/table') 272 | def route_table(): 273 | rows = filter_rows(lambda c: c.very_bad or c._Missing and not c.rrid or c.Incorrect or c.Unresolved) 274 | return table_rows(rows, 'All SciBot curation problems', request.url_rule.endpoint) 275 | 276 | """ 277 | """ 304 | 305 | @app.route('/dashboard/no-annos') 306 | def route_no_annos(): 307 | return htmldoc(navbar(request.url_rule.endpoint), 308 | divtag('There shouldn\'t be anything here...', 309 | cls='main'), 310 | title='SciBot No Anno Papers', 311 | styles=(navbar_style,)) 312 | 313 | @app.route('/dashboard/papers') 314 | def route_papers(): 315 | rows = papers() 316 | return htmldoc(navbar(request.url_rule.endpoint), 317 | render_papers(rows), 318 | title='SciBot papers', 319 | styles=(table_style, cur_style, navbar_style)) 320 | 321 | @app.route('/dashboard/no-pmid') 322 | def route_no_pmid(): 323 | rows = no_pmid() 324 | return htmldoc(navbar(request.url_rule.endpoint), 325 | render_papers(rows), 326 | title='SciBot No PMID Papers', 327 | styles=(table_style, cur_style, navbar_style)) 328 | 329 | @app.route('/dashboard/no-doi') 330 | def route_no_doi(): 331 | rows = no_doi() 332 | return htmldoc(navbar(request.url_rule.endpoint), 333 | render_papers(rows), 334 | title='SciBot No DOI Papers', 335 | styles=(table_style, cur_style, navbar_style)) 336 | 337 | @app.route('/dashboard/no-id') 338 | def route_no_id(): 339 | rows = no_id() 340 | return htmldoc(navbar(request.url_rule.endpoint), 341 | render_papers(rows), 342 | title='SciBot No ID Papers', 343 | styles=(table_style, cur_style, navbar_style)) 344 | 345 | @app.route('/dashboard/help-needed') 346 | def route_anno_help_needed(): 347 | rows = filter_rows(lambda c: c.very_bad) 348 | 349 | return table_rows(rows, 'Help needed RRIDs', request.url_rule.endpoint) 350 | 351 | @app.route('/dashboard/incorrect') 352 | def route_anno_incorrect(): 353 | rows = filter_rows(lambda c: not c.very_bad and c.Incorrect) 354 | return table_rows(rows, 'Incorrect RRIDs', request.url_rule.endpoint) 355 | 356 | @app.route('/dashboard/unresolved') 357 | def route_anno_unresolved(): 358 | rows = filter_rows(lambda c: c.Unresolved and not c.very_bad and not c.Incorrect) 359 | 360 | return table_rows(rows, 'Unresolved RRIDs', request.url_rule.endpoint) 361 | 362 | @app.route('/dashboard/missing', methods=('GET', 'POST')) 363 | def route_anno_missing(): 364 | rows = filter_rows(lambda c: c._Missing and not c.rrid) 365 | return table_rows(rows, 'Missing RRIDs', request.url_rule.endpoint) 366 | 367 | @app.route('/dashboard/no-replies') 368 | def route_no_replies(): 369 | # this should be the table with no replies 370 | return 'TODO' 371 | 372 | @app.route('/dashboard/results') 373 | def search_results(search): 
374 | h = 0 375 | hlist = [] 376 | hstr = '' 377 | counter = 0 378 | # if search.data['search'] == '': 379 | # h = 0 380 | # hstr = '' 381 | # for h in Curation: 382 | # hstr += repr(h) 383 | # h += 1 384 | # return(hstr) 385 | # else: 386 | if search.data['select'] == 'ID': 387 | for h in Curation: 388 | if search.data['search'] in h.id: 389 | hstr += '
Anno #:%s
' % h 390 | hstr += ' Anno Link
' 391 | hstr += repr(h) 392 | counter += 1 393 | if hstr == '': 394 | return('no results') 395 | return (str(counter) + ' Results:

' + hstr) 396 | #return render_template('results.html', results=html.unescape(hstr)) 397 | elif search.data['select'] == 'Tags': 398 | for h in Curation: 399 | if [t for t in h.tags if search.data['search'] in t]: 400 | hstr += '
Anno #:%s
' % h 401 | hstr += ' Anno Link
' 402 | hstr += repr(h) 403 | counter += 1 404 | if hstr == '': 405 | return('no results') 406 | print (str(len(hlist))) 407 | print(len(Curation._annos_list)) 408 | return (str(counter) + ' Results:

' + hstr) 409 | #return render_template('results.html', results=hstr) 410 | elif search.data['select'] == 'User': 411 | for h in Curation: 412 | if h._anno.user == search.data['search']: 413 | hstr += '
Anno #:%s
' % h 414 | hstr += ' Anno Link
' 415 | hstr += repr(h) 416 | counter += 1 417 | if hstr == '': 418 | return('no results') 419 | return (str(counter) + ' Results:

' + hstr) 420 | else: 421 | return search_text(search.data['select'], Curation._annos_list, list(Curation), search.data['search']) 422 | 423 | #new_function = route('/my/route')(route_base) 424 | 425 | #return new_function 426 | #breakpoint() 427 | return app 428 | #new_function_outside = make_app('not really annos') 429 | 430 | def search_text(text, annos, search): 431 | h = 0 432 | hlist = [] 433 | hstr = '' 434 | counter = 0 435 | for h in Curation: 436 | hsplit = h.text.split('

',h.text.count('

')) 437 | t = 0 438 | Data = '' 439 | for t in range(0, len(hsplit)): 440 | if text in hsplit[t]: 441 | Data = hsplit[t].replace(text + ': ', '') 442 | 443 | if search.upper() in Data.upper(): 444 | hstr += '
Anno #:%s
' % h 445 | hstr += ' Anno Link
' 446 | hstr += repr(h) 447 | hstr += "
" + BaseURL(annos[h]) 448 | counter += 1 449 | if hstr == '': 450 | return('no results') 451 | return (str(counter) + ' Results:

' + hstr) 452 | 453 | def BaseURL(anno): 454 | URL = anno.uri.replace(".long", "").replace("/abstract", "").replace("/full","").replace(".short", "").replace(".full", "").replace("http://","").replace("https://","").replace("/FullText","").replace("/Abstract","").replace("/enhanced","") 455 | SplitURL = URL.split("/", URL.count("/")) 456 | if SplitURL[-1] == '': 457 | URL = SplitURL[0] + SplitURL[-2] 458 | else: 459 | URL = SplitURL[0] + SplitURL[-1] 460 | return URL 461 | 462 | def Journal(anno): 463 | URL = anno.uri.replace(".long", "").replace("/abstract", "").replace("/full","").replace(".short", "").replace(".full", "").replace("http://","").replace("https://","").replace("/FullText","").replace("/Abstract","").replace("/enhanced","") 464 | SplitURL = URL.split("/", URL.count("/")) 465 | if len(SplitURL) == 1 or len(SplitURL) == 0: 466 | print(URL) 467 | URL = SplitURL[0] 468 | return URL 469 | 470 | def annoSync(memfile, group, helpers=tuple(), world_ok=False): 471 | if group == '__world__' and not world_ok: 472 | raise ValueError('Group is set to __world__ please run the usual `export HYP_ ...` command.') 473 | get_annos = Memoizer(memfile, api_token, username, group) 474 | yield get_annos 475 | prefilter = preFilter(groups=[group]).export() 476 | hsh = type(f'helperSyncHandler{group}', 477 | (helperSyncHandler,), 478 | dict(memoizer=get_annos, 479 | helpers=helpers)) 480 | annos = get_annos() 481 | yield annos 482 | stream_thread, exit_loop = AnnotationStream(annos, prefilter, hsh)() 483 | yield stream_thread 484 | yield exit_loop 485 | 486 | def setup(): 487 | get_annos, annos, stream_thread, exit_loop = annoSync(memfile, group, (Curation,)) 488 | get_pannos, pannos, pstream_thread, pexit_loop = annoSync(pmemfile, group_staging, 489 | (PublicAnno,), world_ok=True) 490 | def close_stuff(): 491 | exit_loop() 492 | stream_thread.join() 493 | 494 | atexit.register(close_stuff) 495 | 496 | stream_thread.start() 497 | app = make_app(annos, pannos) 498 | app.debug=False 499 | return app 500 | 501 | def main(): 502 | get_annos, annos, stream_thread, exit_loop = annoSync(memfile, group, (Curation,)) 503 | get_pannos, pannos, pstream_thread, exit_loop = annoSync(pmemfile, group_staging, 504 | (PublicAnno,), world_ok=True) 505 | 506 | app = make_app(annos, pannos) 507 | #stream_loop.start() 508 | #pstream_loop.start() # FIXME eventloop already running error... 
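# --- Illustrative aside (sketch, not from this codebase) ---
# annoSync() above is a generator that yields exactly four values in order: the
# Memoizer, the fetched annotations, the websocket stream thread, and the
# loop-exit callback. setup() and main() rely on plain iterable unpacking, which
# works because the number of yields matches the number of targets:
#
#   get_annos, annos, stream_thread, exit_loop = annoSync(memfile, group,
#                                                         (Curation,))
#   stream_thread.start()        # begin streaming live annotation updates
#   atexit.register(exit_loop)   # roughly what setup()'s close_stuff() does
# --- end aside ---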
509 | app.secret_key = 'super secret key' 510 | app.config['SESSION_TYPE'] = 'filesystem' 511 | print(app.view_functions) 512 | app.debug = False 513 | from scibot.config import test_host, port_dashboard 514 | app.run(host=test_host, port=port_dashboard) 515 | 516 | if __name__ == '__main__': 517 | main() 518 | -------------------------------------------------------------------------------- /scibot/db.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import asyncio 3 | from pathlib import Path 4 | from datetime import datetime 5 | from itertools import chain 6 | from collections import namedtuple, defaultdict 7 | import json 8 | from hyputils.memex import models 9 | from hyputils.memex.db import init 10 | from hyputils.memex.util.uri import normalize as uri_normalize 11 | from hyputils.memex.db.types import _get_hex_from_urlsafe, _get_urlsafe_from_hex, URLSafeUUID 12 | from hyputils.memex.util.user import split_user 13 | from hyputils.memex.models.document import update_document_metadata 14 | from sqlalchemy import create_engine 15 | from sqlalchemy.sql import text 16 | from sqlalchemy.sql.expression import bindparam 17 | from sqlalchemy.orm.session import sessionmaker 18 | from sqlalchemy.dialects.postgresql import ARRAY 19 | from hyputils.hypothesis import Memoizer 20 | from hyputils.handlers import dbSyncHandler 21 | from hyputils.subscribe import setup_websocket, preFilter 22 | from scibot import config 23 | from scibot.anno import quickload, quickuri, add_doc_all, validate 24 | from scibot.utils import makeSimpleLogger, uri_normalization 25 | from interlex.core import makeParamsValues # FIXME probably need a common import ... 26 | try: 27 | breakpoint 28 | except NameError: 29 | from IPython import embed as breakpoint 30 | 31 | 32 | def getSession(dburi=config.dbUri(), echo=False): 33 | engine = create_engine(dburi, echo=echo) 34 | 35 | Session = sessionmaker() 36 | Session.configure(bind=engine) 37 | session = Session() 38 | return session 39 | 40 | 41 | def init_scibot(database): 42 | dburi = config.dbUri(user='scibot-admin', database=database) 43 | #dburi = dbUri('postgres') 44 | engine = create_engine(dburi) 45 | init(engine, should_create=True, authority='scicrunch') 46 | 47 | Session = sessionmaker() 48 | Session.configure(bind=engine) 49 | session = Session() 50 | file = Path(__file__).parent / '../sql/permissions.sql' 51 | with open(file.as_posix(), 'rt') as f: 52 | sql = f.read() 53 | #args = dict(database=database) 54 | # FIXME XXX evil replace 55 | sql_icky = sql.replace(':database', f'"{database}"') 56 | session.execute(sql_icky) 57 | session.commit() 58 | 59 | 60 | class DbQueryFactory: 61 | """ parent class for creating converters for queries with uniform results """ 62 | 63 | convert = tuple() 64 | query = '' 65 | 66 | def ___new__(cls, *args, **kwargs): 67 | return super().__new__(cls) 68 | 69 | def __new__(cls, session): 70 | newcls = cls.bindSession(session) 71 | newcls.__new__ = cls.___new__ 72 | return newcls 73 | 74 | @classmethod 75 | def bindSession(cls, session): 76 | # this approach seems better than overloading what __new__ does 77 | # and doing unexpected things in new 78 | classTypeInstance = type(cls.__name__.replace('Factory',''), 79 | (cls,), 80 | dict(session=session)) 81 | return classTypeInstance 82 | 83 | def __init__(self, condition=''): 84 | self.condition = condition 85 | 86 | def execute(self, params=None, raw=False): 87 | if params is None: 88 | params = {} 89 | gen = 
self.session.execute(self.query + ' ' + self.condition, params) 90 | first = next(gen) 91 | if raw: 92 | yield first 93 | yield from gen 94 | else: 95 | Result = namedtuple(self.__class__.__name__ + 'Result', list(first.keys())) # TODO check perf, seems ok? 96 | for result in chain((first,), gen): 97 | yield Result(*(c(v) if c else v for c, v in zip(self.convert, result))) 98 | 99 | @staticmethod 100 | def get_cols(model, no_id=True): 101 | cols = model.__table__.columns.keys() 102 | if no_id: 103 | cols.remove('id') 104 | 105 | return cols 106 | 107 | def _table_insert(self, table_things, table, cols): 108 | def do_defaults(thing, cols): 109 | for col in cols: 110 | value = getattr(thing, col) 111 | if value is None: 112 | c = table.columns[col] 113 | if c.default: # TODO nullable check here? 114 | value = c.default.arg 115 | 116 | yield value 117 | 118 | *templates, params = makeParamsValues( 119 | *([list(do_defaults(t, cols))] 120 | for t in table_things)) 121 | 122 | col_expr = f'({", ".join(cols)})' 123 | sql = (f'INSERT INTO {table.name} {col_expr} VALUES ' + 124 | ', '.join(templates) + 125 | 'RETURNING id') 126 | 127 | for thing, rp in zip(table_things, self.session.execute(sql, params)): 128 | thing.id = rp.id 129 | 130 | def insert_bulk(self, things, column_mapping=None, keep_id=False): 131 | # this works around the orm so be sure 132 | # to call session.expunge_all when done 133 | tables = set(t.__table__ for t in things) 134 | for table in tables: 135 | table_things = [t for t in things if t.__table__ == table] 136 | if column_mapping and table.name in column_mapping: 137 | cols = column_mapping[table.name] 138 | else: 139 | cols = self.get_cols(table_things[0].__class__) 140 | self._table_insert(table_things, table, cols) 141 | 142 | def __call__(self, params=None): 143 | return self.execute(params) 144 | 145 | def __iter__(self): 146 | """ works for cases without params """ 147 | return self.execute() 148 | 149 | 150 | class AnnoSyncFactory(Memoizer, DbQueryFactory): 151 | log = makeSimpleLogger('scibot.db.sync') 152 | convert = (lambda d: datetime.isoformat(d) + '+00:00',) # FIXME hack 153 | query = 'SELECT updated FROM annotation' 154 | condition = 'WHERE groupid = :groupid ORDER BY updated DESC LIMIT 1' # default condition 155 | 156 | def __init__(self, api_token=config.api_token, username=config.username, 157 | group=config.group, memoization_file=None, condition=''): 158 | super().__init__(memoization_file, api_token=api_token, username=username, group=group) 159 | if condition: 160 | self.condition = condition 161 | 162 | def __call__(self): 163 | """ block upstream call which does undesirable things """ 164 | raise NotImplemented 165 | 166 | def get_api_rows(self, search_after=None, stop_at=None): 167 | try: 168 | if self.group == '__world__': 169 | self.condition = 'WHERE groupid = :groupid AND userid = :userid ORDER BY updated DESC LIMIT 1' 170 | userid = f'acct:{self.username}@hypothes.is' # FIXME other registration authorities 171 | last_updated = next(self.execute(params={'groupid':self.group, 172 | 'userid':userid})).updated 173 | else: 174 | last_updated = next(self.execute(params={'groupid':self.group})).updated 175 | self.log.debug(f'last updated at {last_updated} for {self.group}') 176 | except StopIteration: 177 | last_updated = None 178 | self.log.debug(f'no annotations in database for {self.group}') 179 | 180 | if self.memoization_file is None: 181 | rows = list(self.yield_from_api(search_after=last_updated, stop_at=stop_at)) 182 | else: 183 | if 
last_updated: 184 | rows = [a._row for a in self.get_annos() if a.updated > last_updated] 185 | else: 186 | rows = [a._row for a in self.get_annos()] 187 | 188 | return rows 189 | 190 | def sync_annos(self, search_after=None, stop_at=None, api_rows=None, check=False): 191 | """ batch sync """ 192 | 193 | if not api_rows: 194 | # TODO stream this using generators? 195 | api_rows = self.get_api_rows(search_after, stop_at) 196 | if not api_rows: 197 | self.log.info(f'all annotations are up to date') 198 | return 199 | 200 | anno_records = [quickload(r) for r in api_rows] 201 | 202 | #qsql = 'SELECT distinct(id, updated) FROM annotation WHERE groupid=:groupid' # makes it a string :/ 203 | qsql = 'SELECT id, updated, document_id FROM annotation WHERE groupid=:groupid' 204 | params = dict(groupid=api_rows[0]['group']) 205 | existing = self.session.execute(qsql, params) 206 | dext = {_get_urlsafe_from_hex(id.hex):(up, did) for id, up, did in existing} 207 | dupes = [(a, dext[a['id']][0].isoformat() + '+00:00') for a in anno_records if a['id'] in dext] 208 | maybe_update = [a['id'] for a, u in dupes if a['updated'] > u] 209 | assert len(dupes) == len(maybe_update) 210 | #to_update = tuple(_get_hex_from_urlsafe(i) i for i in maybe_update) 211 | to_delete = {f'id{i}':v for i, v in enumerate(maybe_update)} 212 | if to_delete: 213 | names_or = ' OR '.join(f'id = :{p}' for p in to_delete) 214 | _dsql = text(f'DELETE FROM annotation WHERE {names_or}') 215 | bindparams=tuple(bindparam(name, type_=URLSafeUUID) for name in to_delete) 216 | dsql = _dsql.bindparams(*bindparams) 217 | # delete to avoid collisions, they will be added again later and 218 | # then finalized when the transaction finishes 219 | self.session.execute(dsql, to_delete) 220 | 221 | self.log.debug(f'quickload complete for {len(api_rows)} api_rows') 222 | 223 | anno_id_to_doc_id = self.q_create_docs(api_rows) 224 | self.q_create_annos(anno_records, anno_id_to_doc_id) 225 | 226 | def do_check(): 227 | api_rows # so that it is accessible in function scope 228 | self.log.debug('checking for consistency') 229 | annos = self.session.query(models.Annotation).\ 230 | filter(models.Annotation.groupid == self.group).all() 231 | #docs = self.session.query(models.Document).all() 232 | durs = self.session.query(models.DocumentURI).all() 233 | doc_uris = defaultdict(set) 234 | _ = [doc_uris[d.document_id].add(d.uri) for d in durs] 235 | doc_uris = dict(doc_uris) 236 | #dms = self.session.query(models.DocumentMeta).all() 237 | #doc_mismatch = [a for a in annos if anno_id_to_doc_id[a.id] != a.document.id] # super slow due to orm fetches 238 | doc_missing = [a for a in annos if a.id not in anno_id_to_doc_id] 239 | assert not doc_missing 240 | doc_mismatch = [a for a in annos if anno_id_to_doc_id[a.id] != a.document_id] 241 | assert not doc_mismatch, doc_mismatch 242 | # don't use the orm to do this, it is too slow even if you send the other queries above 243 | #breakpoint() 244 | uri_mismatch = [(a.target_uri, doc_uris[a.document_id], a) 245 | for a in annos 246 | if a.target_uri not in doc_uris[a.document_id]] 247 | # NOTE hypothesis only allows 1 record per normalized uri, so we have to normalize here as well 248 | maybe_mismatch = set(frozenset(s) for u, s, a in uri_mismatch if not s.add(u)) 249 | h_mismatch = set(s for s in maybe_mismatch if len(frozenset(uri_normalize(u) for u in s)) > 1) 250 | self.log.debug(f'h mismatch has {len(h_mismatch)} cases') 251 | # the above normalization is not sufficient for cases where there are two 252 | # 
hypothes.is normalized uris AND a scibot normalized uri as well 253 | super_mismatch = set(s for s in h_mismatch if len(frozenset(uri_normalization(u) for u in s)) > 1) 254 | assert not super_mismatch, super_mismatch 255 | 256 | if check: 257 | self.session.flush() # have to run this to get the doc ids to work? 258 | do_check() 259 | 260 | self.session.commit() 261 | self.log.debug('commit done') 262 | else: 263 | breakpoint() 264 | 265 | def q_create_annos(self, anno_records, anno_id_to_doc_id): 266 | # NOTE values_sets adds the document_id field and 267 | # so self.types must be called after values_sets completes 268 | values_sets = tuple(self.values_sets(anno_records, anno_id_to_doc_id)) 269 | *values_templates, values, bindparams = makeParamsValues(*values_sets, 270 | types=self.types(anno_records)) 271 | rec_keys = self.get_rec_keys(anno_records) 272 | sql = text(f'INSERT INTO annotation ({", ".join(rec_keys)}) VALUES {", ".join(values_templates)}') 273 | sql = sql.bindparams(*bindparams) 274 | 275 | def debug_type(column): 276 | # FIXME column name collisions 277 | col = models.Annotation.__table__.columns[column] 278 | ctype = col.type.python_type 279 | ind = rec_keys.index(column) 280 | for values, in values_sets: 281 | if type(values[ind]) != ctype: 282 | print('ERROR IN ', values) 283 | 284 | def debug_templates(column): 285 | col = models.Annotation.__table__.columns[column] 286 | ctype = col.type.python_type 287 | for t in values_templates: 288 | for k, ws_c_vn_ws in zip(rec_keys, t.strip('(').rstrip(')').split(',')): 289 | vn = ws_c_vn_ws.strip().rstrip().strip(':') 290 | v = values[vn] 291 | if k == column and type(v) != ctype: 292 | print('ERROR IN', t) 293 | 294 | try: 295 | self.session.execute(sql, values) 296 | self.log.debug('anno execute done') 297 | except BaseException as e: 298 | self.log.error('YOU ARE IN ERROR SPACE') 299 | breakpoint() 300 | 301 | self.session.flush() 302 | self.log.debug('anno flush done') 303 | 304 | def get_rec_keys(self, anno_records): 305 | def fix_reserved(k): 306 | if k == 'references': 307 | k = '"references"' 308 | 309 | return k 310 | 311 | return [fix_reserved(k) for k in anno_records[0].keys()] 312 | 313 | def values_sets(self, anno_records, anno_id_to_doc_id): 314 | def type_fix(k, v): # TODO is this faster or is type_fix? 315 | if isinstance(v, dict): 316 | return json.dumps(v) # FIXME perf? 317 | elif isinstance(v, list): 318 | if any(isinstance(e, dict) for e in v): 319 | return json.dumps(v) # FIXME perf? 320 | return v 321 | 322 | def make_vs(d): 323 | id = d['id'] 324 | document_id = anno_id_to_doc_id[id] 325 | d['document_id'] = document_id 326 | # FIXME does ordering matter here!? 
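# --- Illustrative aside (sketch, not from this codebase) ---
# Two details about the return statement below: the trailing comma wraps each
# record's value list in a 1-tuple, which is the per-row shape this code passes
# to makeParamsValues; and on the ordering FIXME, dicts preserve insertion order
# in Python 3.7+, so d.items() lines up with rec_keys (taken from the first
# record) provided every record is built with the same key order -- an
# assumption the surrounding code already makes. Shape in isolation:
#
#   d = {'id': 'abc', 'text': 'hello'}
#   vs = [v for v in d.values()],    # trailing comma -> a one-element tuple
#   assert vs == (['abc', 'hello'],)
# --- end aside ---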
327 | return [type_fix(k, v) for k, v in d.items()], # don't miss the , to make this a value set 328 | 329 | yield from (make_vs(d) for d in anno_records) 330 | self.log.debug('anno values sets done') 331 | 332 | def types(self, datas): 333 | def make_types(d): 334 | def inner(k): 335 | if k == 'id': 336 | return URLSafeUUID 337 | elif k == 'references': 338 | return ARRAY(URLSafeUUID) 339 | else: 340 | return None 341 | return [inner(k) for k in d] 342 | 343 | yield from (make_types(d) for d in datas) 344 | 345 | @staticmethod 346 | def uri_records(row): 347 | uri = row['uri'] 348 | return uri, uri_normalization(uri), quickuri(row) 349 | 350 | def q_prepare_docs(self, rows): 351 | existing_unnormed = {r.uri:(r.document_id, 352 | self.convert[0](r.created), 353 | self.convert[0](r.updated)) 354 | for r in self.session.execute('SELECT uri, document_id, created, ' 355 | 'updated FROM document_uri')} 356 | created_updated = {docid:(created, updated) 357 | for _, (docid, created, updated) in existing_unnormed.items()} 358 | _existing = defaultdict(set) 359 | _ = [_existing[uri_normalization(uri)].add(docid) 360 | for uri, (docid, created, updated) in existing_unnormed.items()] 361 | assert not [_ for _ in _existing.values() if len(_) > 1] # TODO proper handling for this case 362 | h_existing_unnormed = {uri_normalize(uri):docid 363 | for uri, (docid, created, updated) in existing_unnormed.items()} 364 | existing = {k:next(iter(v)) for k, v in _existing.items()} # FIXME issues when things get big 365 | latest_existing = max(u for c, u in created_updated.values()) if created_updated else None 366 | 367 | new_docs = {} # FIXME this is completely opaque since it is not persisted anywhere 368 | for row in sorted(rows, key=lambda r: r['created']): 369 | id = row['id'] 370 | uri, uri_normed, (created, updated, claims) = self.uri_records(row) 371 | try: 372 | docid = existing[uri_normed] 373 | dc, du = created_updated[docid] 374 | doc = models.Document(id=docid, created=dc, updated=du) 375 | if doc.updated < updated: 376 | # FIXME TODO update the record? 377 | #self.log.warning('YOU ARE NOT UPDATING A DOC WHEN YOU SHOULD!!!!!!\n' 378 | #f'{docid} {doc.updated} {updated}') 379 | pass 380 | 381 | do_claims = False 382 | except KeyError as e: 383 | if existing: 384 | if row['updated'] <= latest_existing: 385 | # only need to worry if we are recreating 386 | raise e 387 | if uri_normed not in new_docs: 388 | do_claims = True 389 | doc = models.Document(created=created, updated=updated) 390 | self.session.add(doc) # TODO perf testing vs add_all 391 | new_docs[uri_normed] = doc 392 | else: 393 | do_claims = False 394 | doc = new_docs[uri_normed] 395 | 396 | #if type(doc.created) == str: 397 | #breakpoint() 398 | yield id, doc 399 | 400 | if uri_normalize(uri) not in h_existing_unnormed: 401 | # NOTE allowing only the normalized uri can cause confusion (i.e. see checks in sync_annos) 402 | h_existing_unnormed[uri_normalize(uri)] = doc 403 | # TODO do these get added automatically if their doc gets added but exists? 
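# --- Illustrative aside (sketch, not from this codebase) ---
# q_prepare_docs yields two kinds of pairs: (annotation_id, Document) for the
# document each annotation should point at, and (None, DocumentURI) or
# (None, DocumentMeta) for related rows that only need inserting. q_create_docs
# below separates them again on that first element, roughly:
#
#   ids_docs = list(self.q_prepare_docs(rows))
#   docs     = [d for i, d in ids_docs if i]       # keyed by annotation id
#   related  = [d for i, d in ids_docs if not i]   # DocumentURI / DocumentMeta
# --- end aside ---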
404 | doc_uri = models.DocumentURI(document=doc, 405 | claimant=uri, 406 | uri=uri, 407 | type='self-claim', 408 | created=created, 409 | updated=updated) 410 | yield None, doc_uri 411 | 412 | # because of how this schema is designed 413 | # the only way that this can be fast is 414 | # if we assume that all claims are identical 415 | # FIXME if there is a new claim type then we are toast though :/ 416 | # the modelling here assumes that title etc can't change 417 | #print(id, uri, uri_normed, row['user'], row['uri'], row['created']) 418 | if do_claims: 419 | for claim in claims: 420 | #print(id, uri, uri_normed, claim['claimant'], claim['type'], claim['value']) 421 | dm = models.DocumentMeta(document=doc, 422 | created=created, 423 | updated=updated, 424 | **claim) 425 | yield None, dm 426 | 427 | def q_create_docs(self, rows): 428 | ids_docs = list(self.q_prepare_docs(rows)) 429 | docs = sorted(set(d for i, d in ids_docs if i), key=lambda d:d.created) 430 | uri_meta = list(d for i, d in ids_docs if not i) 431 | assert len(uri_meta) == len(set(uri_meta)) 432 | 433 | # TODO skip the ones with document ids 434 | self.insert_bulk(docs, {'document':['created', 'updated']}) 435 | 436 | for um in uri_meta: 437 | um.document_id = um.document.id 438 | um.document = None 439 | del um.document # have to have this or doceument overrides document_id 440 | 441 | self.insert_bulk(uri_meta) 442 | self.session.expunge_all() # preven attempts to add unpersisted 443 | self.session.flush() 444 | self.log.debug('finished inserting docs') 445 | anno_id_to_doc_id = {i:d.id for i, d in ids_docs} 446 | return anno_id_to_doc_id 447 | 448 | def sync_anno_stream(self, search_after=None, stop_at=None): 449 | """ streaming one anno at a time version of sync """ 450 | for row in self.yield_from_api(search_after=last_updated, stop_at=stop_at): 451 | yield row, 'TODO' 452 | continue 453 | # TODO 454 | datum = validate(row) # roughly 30x slower than quickload 455 | # the h code I'm calling assumes these are new annos 456 | datum['id'] = row['id'] 457 | datum['created'] = row['created'] 458 | datum['updated'] = row['updated'] 459 | document_dict = datum.pop('document') 460 | document_uri_dicts = document_dict['document_uri_dicts'] 461 | document_meta_dicts = document_dict['document_meta_dicts'] 462 | a = [models.Annotation(**d, 463 | document_id=dbdocs[uri_normalize(d['target_uri'])].id) 464 | for d in datas] # slow 465 | self.log.debug('making annotations') 466 | self.session.add_all(a) 467 | self.log.debug('adding all annotations') 468 | 469 | 470 | class WebsocketSyncFactory(AnnoSyncFactory): 471 | 472 | def __init__(self, 473 | api_token=config.api_token, 474 | username=config.username, 475 | group=config.group, 476 | helpers=tuple(), 477 | threaded=False): 478 | super().__init__(api_token, username, group) 479 | self.prefilter = preFilter(groups=[group]).export() 480 | handler = type(f'dbSyncHandler{group}', # TODO this is where we customize 481 | (dbSyncHandler,), 482 | {} 483 | #dict(session=self.session) 484 | # self.handler gives access to the session 485 | # so in theory don't need this unless treading breaks something 486 | ) 487 | self.filter_handlers = [handler(self.handler)] 488 | self.ws_loop, self.exit_loop = setup_websocket(self.api_token, self.prefilter, self.filter_handlers) 489 | self.threaded = threaded 490 | self.loop = asyncio.get_event_loop() 491 | if self.threaded: # yes do this at init and not at call time, you should know by then 492 | self.stream_thread = Thread(target=self.loop_target, 493 | 
args=(self.loop, self.ws_loop)) 494 | 495 | def handler(self, message): 496 | act = message['options']['action'] 497 | print('act', act) 498 | print(message) 499 | if act != 'delete': 500 | row = message['payload'][0] 501 | self.create_anno(row) 502 | if act == 'create': 503 | pass 504 | elif act == 'update': 505 | pass 506 | elif act == 'delete': 507 | pass 508 | elif act == 'flag': 509 | 'lol' 510 | else: 511 | raise UnknownAction(act) # email the maintainer basically 512 | 513 | def create_anno(self, row): 514 | datum = validate(row) 515 | 516 | document_dict = datum.pop('document') 517 | document_uri_dicts = document_dict['document_uri_dicts'] 518 | document_meta_dicts = document_dict['document_meta_dicts'] 519 | 520 | id = row['id'] 521 | target_uri = datum['target_uri'] 522 | created = row['created'] 523 | updated = row['updated'] 524 | 525 | annotation = models.Annotation(**datum) 526 | 527 | document = update_document_metadata( # TODO update normalization rules 528 | self.session, 529 | target_uri, 530 | document_meta_dicts, 531 | document_uri_dicts, 532 | created=created, # FIXME doesn't quite seem right, would klobber 533 | updated=updated) 534 | 535 | print(id) 536 | annotation.document = document 537 | annotation.id = id 538 | annotation.target_uri = target_uri 539 | annotation.created = created 540 | annotation.updated = updated 541 | self.session.add(annotation) 542 | self.session.flush() 543 | self.session.commit() # FIXME hypothesis doesn't call this 544 | 545 | @staticmethod 546 | def loop_target(loop, ws_loop): 547 | asyncio.set_event_loop(loop) 548 | loop.run_until_complete(ws_loop(loop)) 549 | 550 | def close_stuff(self): 551 | self.exit_loop() 552 | if self.threaded: # FIXME 553 | self.stream_thread.join() 554 | 555 | def run(self): 556 | atexit.register(self.close_stuff) 557 | if self.threaded: 558 | self.stream_thread.start() 559 | else: 560 | try: 561 | self.loop.run_until_complete(self.ws_loop(self.loop)) 562 | except KeyboardInterrupt: 563 | return # at exist will deal with it 564 | 565 | 566 | def uuid_to_urlsafe(uuid): 567 | return _get_urlsafe_from_hex(uuid.hex) 568 | 569 | 570 | class AnnoQueryFactory(DbQueryFactory): 571 | convert = ( 572 | uuid_to_urlsafe, 573 | lambda d: datetime.isoformat(d) + '+00:00', # FIXME hack WARNING MAKE SURE ALL TIMESTAMPS THAT GO IN 574 | lambda d: datetime.isoformat(d) + '+00:00', # FIXME hack ARE DERIVED FROM datetime.utcnow() 575 | None, 576 | lambda userid: split_user(userid)['username'], 577 | None, 578 | lambda lst: [uuid_to_urlsafe(uuid) for uuid in lst], 579 | ) 580 | query = ('SELECT id, created, updated, target_uri, userid, tags, a.references ' 581 | 'FROM annotation AS a') 582 | 583 | 584 | def bindSession(cls, session): 585 | return cls.bindSession(session) 586 | -------------------------------------------------------------------------------- /scibot/export.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import print_function 3 | import re 4 | import csv 5 | import pickle 6 | from datetime import date 7 | from collections import defaultdict 8 | from collections import namedtuple, defaultdict 9 | from lxml import etree 10 | from hyputils.hypothesis import HypothesisUtils, HypothesisAnnotation, Memoizer 11 | from scibot.config import memfile, api_token, username, group 12 | 13 | bad_tags = { 14 | 'RRID:Incorrect', 15 | 'RRID:InsufficientMetadata', 16 | 'RRID:Missing', 17 | 'RRID:Unrecognized', 18 | 'RRID:Unresolved', 19 | 
'RRID:Validated', 20 | 'RRID:Duplicate', 21 | } 22 | 23 | def get_proper_citation(xml): 24 | root = etree.fromstring(xml) 25 | if root.findall('error'): 26 | proper_citation = '' 27 | else: 28 | data_elements = root.findall('data')[0] 29 | data_elements = [(e.find('name').text, e.find('value').text) for e in data_elements] # these shouldn't duplicate 30 | a = [v for n, v in data_elements if n == 'Proper Citation'] 31 | proper_citation = a[0] if a else '' 32 | 33 | return proper_citation 34 | 35 | def fix_trailing_slash(annotated_urls): 36 | for key in [k for k in annotated_urls.keys()]: 37 | if key.endswith('/'): 38 | new_key = key.rstrip('/') 39 | print(new_key) 40 | if new_key in annotated_urls: 41 | annotated_urls[key].extend(annotated_urls.pop(new_key)) 42 | 43 | def export_impl(): 44 | get_annos = Memoizer(memfile, username=username, api_token=api_token, group=group) 45 | annos = get_annos() 46 | 47 | annotated_urls = defaultdict(list) 48 | for anno in annos: 49 | annotated_urls[anno.uri].append(anno) 50 | 51 | fix_trailing_slash(annotated_urls) 52 | 53 | output_rows = [] 54 | for annotated_url in annotated_urls.keys(): 55 | #print(annotated_url) 56 | annos = annotated_urls[annotated_url] 57 | replies = defaultdict(list) 58 | PMID = [] 59 | for anno in annos: # gotta build the reply structure and get pmid 60 | #print('id:', anno.id) 61 | #print('user:', anno.user) 62 | #print('exact:', anno.exact) 63 | #print('text:', anno.text) 64 | #print('tags:', anno.tags) 65 | #print('type:', anno.type) 66 | #print('references:', anno.references) 67 | if anno.references: 68 | for reference in anno.references: # shouldn't there only be one??? 69 | replies[reference].append(anno) 70 | PMID.extend([tag for tag in anno.tags if tag.startswith('PMID:') and '_' not in tag]) # bad tags with PMID:SCR_ 71 | #curators didn't put the pmid in as tags :( 72 | if anno.text.startswith('PMID:'): # DANGER ZONE 73 | if '_' in anno.text: 74 | print('PMIDS DONT HAVE UNDERSCORES PROBABLY CURATION BUG', anno.text) 75 | else: 76 | PMID.append(anno.text.strip()) # because, yep, when you don't tag sometimes you get \n :/ 77 | 78 | if PMID: 79 | if len(PMID) > 1: 80 | print(PMID, annotated_url) 81 | if PMID[0] == PMID[1]: 82 | PMID = PMID[0] 83 | print('WARNING: more than one pmid tag') 84 | else: 85 | print("raise BaseException('more than one pmid tag')") # irritating 86 | PMID = PMID[0] # FIXME 87 | else: 88 | PMID = PMID[0] 89 | #print(PMID) 90 | else: 91 | all_tags = [] 92 | for a in annos: 93 | all_tags.extend(a.tags) 94 | #print('NO PMID FOR', annotated_url) 95 | #print(set([a.user for a in annos])) 96 | #print(all_tags) 97 | PMID = annotated_url 98 | 99 | RRIDs = defaultdict(list) 100 | EXACTs = {} 101 | CITEs = {} 102 | #USERs = {} 103 | for anno in annos: 104 | RRID = None 105 | additional = [] 106 | for tag in anno.tags: 107 | if re.match('RRID:.+[0-9]+.+', tag): # ARRRRGGGGHHHHHHH ARRRRGGHHHH 108 | #if re.match('RRID:.+', tag): # ARRRRGGGGHHHHHHH ARRRRGGHHHH 109 | if RRID is not None: 110 | raise BaseException('MORE THAN ONE RRID PER ENTRY!') 111 | RRID = tag # :/ this works for now but ARHGHHGHASFHAS 112 | else: 113 | additional.append(tag) # eg Unresolved 114 | 115 | if tag == 'RRIDCUR:Missing': # fix for bad curation process 116 | maybe_rrid = anno.text.strip() 117 | if re.match('RRID:.+[0-9]+', maybe_rrid): # ARRRRGGGGHHHHHHH ARRRRGGHHHH 118 | RRID = maybe_rrid # RRIDCUR:Missing was already added above 119 | 120 | if RRID is not None: 121 | EXACTs[RRID] = anno.exact.strip() if anno.exact else '' 122 | 
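# --- Illustrative aside (sketch, not from this codebase) ---
# Each RRID collected here eventually becomes one or more CSV rows of the form
# [PMID, RRID, curation_tag, annotated_url, exact_text, proper_citation]
# (see the loop over RRIDs.items() further down): one row per distinct extra
# curation tag, or a single row with an empty tag column when there are none.
# Shape only, the values are invented:
#
#   ['PMID:12345', 'RRID:AB_123456', 'RRIDCUR:Unresolved',
#    'https://example.org/article', 'RRID:AB_123456',
#    '(Example Antibody, RRID:AB_123456)']
# --- end aside ---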
RRIDs[RRID].extend(additional) 123 | #USERs[RRID] = anno.user 124 | if RRID not in CITEs: 125 | if anno.text: 126 | if 'Proper Citation:' in anno.text: 127 | CITEs[RRID] = anno.text.split('Proper Citation:')[1].strip().split('<',1)[0] 128 | 129 | if anno.id in replies: 130 | for r_anno in replies[anno.id]: 131 | RRIDs[RRID].extend(r_anno.tags) # not worrying about the text here 132 | elif not anno.references and PMID not in anno.tags: # this is an independent annotation which will not be included 133 | new = 'NONE:' + anno.id 134 | RRIDs[new].append('') 135 | EXACTs[new] = anno.exact 136 | #USERs[RRID] = anno.user 137 | 138 | for rrid, more in RRIDs.items(): 139 | #FIXME TOOOOOO SLOW 140 | #r = requests.get('https://scicrunch.org/resolver/{RRID}.xml'.format(RRID=rrid)) 141 | #if r.status_code < 300: 142 | #proper_citation = get_proper_citation(r.content) 143 | #else: 144 | #proper_citation = '' 145 | 146 | try: 147 | proper_citation = CITEs[rrid] 148 | except KeyError: # FIXME this is a hack to avoid some cases of LWW for citations 149 | proper_citation = '' 150 | 151 | if not more: 152 | row = [PMID, rrid, '', annotated_url, EXACTs[rrid], proper_citation] 153 | output_rows.append(row) 154 | else: 155 | for val in set(more): # cull dupes 156 | row = [PMID, rrid, val, annotated_url, EXACTs[rrid], proper_citation] 157 | output_rows.append(row) 158 | 159 | DATE = date.today().strftime('%Y-%m-%d') 160 | return output_rows, DATE 161 | 162 | 163 | class NormalizedAnno(HypothesisAnnotation): 164 | @property 165 | def tags(self): 166 | tags = super().tags 167 | out = [] 168 | for tag in tags: 169 | if tag in bad_tags: 170 | # scibot made a mistake early 171 | # might be able to correct tags in bulk someday 172 | out.append(tag.replace('RRID:', 'RRIDCUR:')) 173 | else: 174 | out.append(tag) 175 | 176 | text = self.text 177 | exact = self.exact 178 | if text.startswith('RRID:'): 179 | # catch cases where the RRID was put in text instead of in tags 180 | if 'RRIDCUR:Missing' in out or 'RRIDCUR:Unrecognized' in out: 181 | # trap for cases where there is more text after an RRID... 
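# --- Illustrative aside (sketch, not from this codebase) ---
# The split on the next line keeps only the first whitespace-delimited token,
# which recovers a usable tag when a curator pasted the RRID plus a note into
# the annotation text:
#
#   'RRID:AB_123456 not sure about the lot number'.split(None, 1)[0]
#   # -> 'RRID:AB_123456'
# --- end aside ---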
182 | rtag = text.split(None, 1)[0] 183 | if rtag not in out: 184 | out.append(rtag) 185 | print('TEXT ISSUE for %s at https://hyp.is/%s' % (self.user, self.id)) 186 | 187 | elif exact and exact.startswith('RRID:'): # this needs to go second in case of RRIDCUR:Incorrect 188 | if exact.startswith('RRID: '): # deal with nospace first 189 | rtag = exact.replace('RRID: ', 'RRID:') 190 | else: 191 | rtag = exact 192 | 193 | rtag = rtag.split(None, 1)[0] # trap more 194 | if rtag not in out: 195 | if self.user == 'scibot' and len(out) == 1 and tags[0].startswith('RRID:RRID:'): # FIXME HACK 196 | out = [rtag] 197 | else: 198 | # anything else we detect in the data doesn't need 199 | # to be corrected or used to fix tags 200 | pass 201 | 202 | return out 203 | 204 | 205 | def export_json_impl(): 206 | get_annos = Memoizer(memfile, username=username, api_token=api_token, group=group) 207 | annos = get_annos() 208 | 209 | # clean up bugs from old curation workflow 210 | for anno in annos: 211 | yield NormalizedAnno(anno)._normalized() 212 | continue 213 | 214 | ### URG 215 | 216 | def oldmain(): 217 | output_rows, DATE = export_impl() 218 | with open('RRID-data-%s.csv' % DATE, 'wt') as f: 219 | writer = csv.writer(f, lineterminator='\n') 220 | writer.writerows(sorted(output_rows)) 221 | 222 | import json 223 | output_json, DATE = export_json_impl() 224 | with open('RRID-data-%s.json' % DATE, 'wt') as f: 225 | json.dump(output_json, f, sort_keys=True, indent=4) 226 | 227 | def main(): 228 | import json 229 | DATE = date.today().strftime('%Y-%m-%d') 230 | with open('RRID-data-%s.json' % DATE, 'wt') as f: 231 | f.write('[') 232 | gen = export_json_impl() 233 | s = json.dumps(next(gen), sort_keys=True, indent=1) 234 | for d in gen: 235 | f.write(s) 236 | f.write(',\n') 237 | s = json.dumps(d, sort_keys=True, indent=1) 238 | 239 | f.write(s) 240 | f.write(']') 241 | 242 | if __name__ == '__main__': 243 | main() 244 | 245 | -------------------------------------------------------------------------------- /scibot/extract.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from bs4 import BeautifulSoup 4 | from scibot.utils import makeSimpleLogger 5 | 6 | log = makeSimpleLogger('extract') 7 | 8 | 9 | # utility 10 | def col0(pairs): return list(zip(*pairs))[0] 11 | def col1(pairs): return list(zip(*pairs))[1] 12 | 13 | # prefixes 14 | 15 | agprefixes = ( 16 | ('Addgene', 'Addgene'), 17 | ('addgene', 'Addgene'), 18 | ('plasmid', 'Addgene'), 19 | ('Plasmid', 'Addgene'), 20 | ('addgene cat', 'Addgene'), 21 | ('addgene.org', 'Addgene'), 22 | ('addgene ID', 'Addgene'), # probably won't work 23 | ('addgene cat. 
no.', 'Addgene'), 24 | ('Jackson Laboratory Cat', 'IMSR_JAX'), 25 | ('Jackson Laboratory Cat', 'IMSR_JAX'), 26 | ('Jackson Laboratory Stock', 'IMSR_JAX'), 27 | ('Jackson Laboratory Stock', 'IMSR_JAX'), 28 | ('The Jackson Laboratory', 'IMSR_JAX'), 29 | ('The Jackson Laboratory Stock', 'IMSR_JAX'), 30 | ('The Jackson Laboratory Stock', 'IMSR_JAX'), 31 | ('The Jackson Laboratory Cat', 'IMSR_JAX'), 32 | ('The Jackson Laboratory Cat', 'IMSR_JAX'), 33 | ('Jackson Laboratories Cat', 'IMSR_JAX'), 34 | ('Jackson Laboratories Cat', 'IMSR_JAX'), 35 | ) 36 | 37 | prefixes = ( 38 | ('AB', 'AB'), 39 | ('AGSC', 'AGSC'), 40 | ('ARC', 'IMSR_ARC'), 41 | ('BCBC', 'BCBC'), 42 | ('BDSC', 'BDSC'), 43 | ('DGRC', 'DGRC'), 44 | ('CGC', 'CGC'), 45 | ('CRL', 'IMSR_CRL'), 46 | #('CVCL', 'CVCL'), # numbers + letters :/ 47 | ('DGGR', 'DGGR'), 48 | ('FBst', 'FBst'), 49 | ('FlyBase', 'FlyBase'), 50 | ('HAR', 'IMSR_HAR'), 51 | ('JAX', 'IMSR_JAX'), 52 | ('KOMP', 'IMSR_KOMP'), 53 | ('MGI', 'MGI'), 54 | ('MMRRC', 'MMRRC'), 55 | ('NCIMR', 'IMSR_NCIMR'), 56 | ('NSRRC', 'NSRRC'), 57 | ('NXR', 'NXR'), 58 | ('RBRC', 'IMSR_RBRC'), 59 | ('RGD', 'RGD'), 60 | ('SCR', 'SCR'), 61 | ('TAC', 'IMSR_TAC'), 62 | ('TIGM', 'IMSR_TIGM'), 63 | ('TSC', 'TSC'), 64 | ('WB-STRAIN', 'WB-STRAIN'), 65 | ('WTSI', 'IMSR_WTSI'), 66 | ('ZDB', 'ZFIN_ZDB'), 67 | ('ZFIN', 'ZFIN'), 68 | ('ZIRC', 'ZIRC'), 69 | ) 70 | prefix_lookup = {k:v for k, v in prefixes} 71 | prefix_lookup['CVCL'] = 'CVCL' # ah special cases 72 | prefix_lookup.update({k:v for k, v in agprefixes}) 73 | 74 | # paper identifiers 75 | 76 | def searchSoup(soup): 77 | def search(tag, prop, val, key, additional_prop_vals=None): 78 | if additional_prop_vals is None: 79 | pvs = {prop:val} 80 | else: 81 | additional_prop_vals.update({prop:val}) 82 | pvs = additional_prop_vals 83 | matches = soup.find_all(tag, pvs) 84 | if matches: 85 | return matches[0][key] 86 | return search 87 | 88 | 89 | def normalizeDoi(doi): 90 | doi = doi.replace(' ', '') 91 | if 'http' in doi or 'doi.org' in doi: 92 | doi = '10.' + doi.split('.org/10.', 1)[-1] 93 | elif doi.startswith('doi:'): 94 | doi = doi.strip('doi:') 95 | elif doi.startswith('DOI:'): 96 | doi = doi.strip('DOI:') 97 | return doi 98 | 99 | 100 | def getDoi(*soups): 101 | argslist = ( # these go in order so best returns first 102 | # TODO bind a handler for these as well... 103 | ('meta', 'name', 'DC.Identifier', 'content'), # elife pmc etc. 
104 | ('meta', 'name', 'DOI', 'content'), # nature pref 105 | ('meta', 'name', 'dc.identifier', 'content'), # nature 106 | ('meta', 'name', 'citation_doi', 'content'), # wiley jove f1000 ok 107 | ('a', 'class', 'doi', 'href'), # evilier 108 | ('a', 'class', 'S_C_ddDoi', 'href'), # evilier 109 | ('a', 'id', 'ddDoi', 'href'), # evilier 110 | ('meta', 'name', 'DC.identifier', 'content'), # f1000 worst 111 | ('meta', 'name', 'dc.Identifier', 'content', {'scheme':'doi'}), # tandf 112 | ('meta', 'name', 'dc.Source', 'content'), # mit press jounals wat 113 | ('meta', 'name', 'dc.identifier', 'content'), 114 | ('meta', 'name', 'prism.doi', 'content'), 115 | ) 116 | for soup in soups: 117 | for args in argslist: 118 | doi = searchSoup(soup)(*args) 119 | if doi is not None: 120 | return normalizeDoi(doi) 121 | 122 | 123 | def getUri(uri, *soups): 124 | argslist = ( 125 | ('meta', 'property', 'og:url', 'content'), # FIXME mitpressjournals has an idiot in it somewhere 126 | ('link', 'rel', 'canonical', 'href'), 127 | ) 128 | for soup in soups: 129 | for args in argslist: 130 | cu = searchSoup(soup)(*args) 131 | if cu is not None and cu.startswith('http'): 132 | if cu != uri: 133 | log.warning('canonical and uri do not match, ' 134 | f'preferring canonical\n{cu}\n{uri}') 135 | return cu 136 | return uri 137 | 138 | 139 | def getPmid(*soups): 140 | argslist = ( 141 | ('meta', 'property', 'citation_pmid', 'content'), 142 | ) 143 | for soup in soups: 144 | for args in argslist: 145 | cu = searchSoup(soup)(*args) 146 | if cu is not None: 147 | return cu # FIXME TODO yeild here 148 | 149 | 150 | def getTitle(*soups): 151 | for soup in soups: 152 | for t in soup.find_all('title'): 153 | yield t.text 154 | 155 | 156 | def chooseTitle(document, titles): 157 | meta_titles = [] 158 | for k, d_ in document.items(): 159 | rank = 0 # TODO 160 | if 'title' in d_: 161 | t = d_['title'][0] # FIXME warn on > 1 ? 
162 | meta_titles.append((rank, t)) 163 | 164 | if meta_titles: 165 | title = sorted(meta_titles)[0][1] 166 | 167 | elif titles: 168 | title = sorted(titles, key=lambda t: -len(t))[0] 169 | 170 | else: 171 | log.warning(f'no title for {document}') 172 | title = 'Spooky nameless page' 173 | 174 | document['title'] = title 175 | 176 | 177 | def getLinks(*soups): 178 | for soup in soups: 179 | for l in soup.find_all('link'): 180 | yield l.attrs 181 | 182 | 183 | def chooseLinks(document, links): 184 | meta_links = [] 185 | for link in links: 186 | if 'rel' in link and 'canonical' in link['rel']: 187 | l = {'rel': 'canonical', 'href': link['href']} 188 | if 'type' in link and link['type']: 189 | l['type'] = link['type'] 190 | 191 | meta_links.append(l) 192 | 193 | # TODO pull out other links as well 194 | document['link'].extend(meta_links) 195 | 196 | 197 | def searchSoups(argslist, *soups): 198 | for soup in soups: 199 | for args in argslist: 200 | cu = searchSoup(soup)(*args) 201 | if cu is not None: 202 | yield cu 203 | 204 | 205 | def getDocument(target_uri, *soups): 206 | # TODO probably want to detect when there are tags in the header that 207 | # we are missing/skipping since this takes a closed world approach to detection 208 | # rather than prefix based detection 209 | # TODO pull these out into a more visible file 210 | dc_fields = 'identifier', 'title', 'publisher', 'format', 'creator', 'date' 211 | eprints_fields = ('title', 'creators_name', 'type', 'datestamp', 'ispublished', 212 | 'date', 'date_type', 'publication', 'volume', 'pagerange') 213 | prism_fields = ('volume', 'number', 'startingPage', 'endingPage', 'publicationName', 214 | 'issn', 'publicationDate', 'doi') 215 | highwire_fields = ('title', 'journal_title', 'publisher', 'issue', 'volume', 'doi', 216 | 'firstpage', 'lastpage', 'date', 'abstract_html_url', 'fulltext_html_url', 217 | 'pdf_url', 'pii', 'article_type', 'online_date', 'publication_date', 218 | 'issn', 'keywords', 'language', 'author', 'author_institution', 219 | ) 220 | og_fields = ('title', 'type', 'image', 'url', 'audio', 'description', 'determiner', 221 | 'locale', 'locale:alternate', 'site_name', 'video') 222 | 223 | def dmeta(rexp, fields, props=('name',)): 224 | return {field: [('meta', prop, re.compile(rexp.format(field=field), re.I), 'content') 225 | for prop in props] for field in fields} 226 | 227 | todo = { 228 | 'dc': dmeta('^(dc|dcterms).{field}$', dc_fields), 229 | 'eprints': dmeta('^eprints.{field}$', eprints_fields), 230 | 'facebook': dmeta('^og:{field}$', og_fields, props=('name', 'property')), 231 | # some people use name for og instead of property (spec expects property) 232 | 'highwire': dmeta('^citation_{field}$', highwire_fields), 233 | 'prism': dmeta('^prism.{field}$', prism_fields), 234 | 'twitter':{}, # TODO?? 
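        # note: each value above maps a metadata field name to a list of searchSoup
        # argument tuples (built by dmeta); matches are gathered into `document` below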
235 | } 236 | 237 | document = {key: {field: results 238 | for field, argslist in dict_.items() 239 | for results in (list(searchSoups(argslist, *soups)),) 240 | if results} 241 | for key, dict_ in todo.items()} 242 | 243 | doi = getDoi(*soups) 244 | pmid = getPmid(*soups) 245 | titles = list(getTitle(*soups)) 246 | chooseTitle(document, titles) 247 | links = list(getLinks(*soups)) 248 | document['link'] = [{'href': target_uri}] 249 | chooseLinks(document, links) 250 | return document, doi, pmid 251 | 252 | 253 | def document_from_url(url): 254 | resp = requests.get(url) 255 | soup = BeautifulSoup(resp.content, 'lxml') 256 | return getDocument(url, resp, soup), soup 257 | 258 | 259 | # rrids 260 | 261 | def clean_text(text): 262 | # cleanup the inner text 263 | text = text.replace('–','-') 264 | text = text.replace('‐','-') # what the wat 265 | text = text.replace('\xa0',' ') # nonbreaking space fix 266 | 267 | mids = (r'', 268 | r'\ ', 269 | r'_\ ', 270 | r'\ _', 271 | r': ', 272 | r'-', 273 | ) 274 | tail = r'([\s,;\)])' 275 | replace = r'\1\2_\3\4' 276 | def make_cartesian_product(prefix, suffix=r'(\d+)'): 277 | return [(prefix + mid + suffix + tail, replace) for mid in mids] 278 | 279 | fixes = [] 280 | prefixes_digit = [r'([^\w])(%s)' % _ for _ in ('AB', 'SCR', 'MGI')] 281 | for p in prefixes_digit: 282 | fixes.extend(make_cartesian_product(p)) 283 | fixes.extend(make_cartesian_product(r'([^\w])(CVCL)', r'([0-9A-Z]+)')) # FIXME r'(\w{0,5})' better \w+ ok 284 | fixes.append((r'\(RRID\):', r'RRID:')) 285 | 286 | for f, r in fixes: 287 | text = re.sub(f, r, text) 288 | return text 289 | 290 | 291 | def find_rrids(text): 292 | # first round 293 | regex1 = '(.{0,32})(RRID(:|\)*,*)[ \t]*)(\w+[_\-:]+[\w\-]+)([^\w].{0,31})' 294 | matches = re.findall(regex1, text) 295 | for prefix, rrid, sep, id_, suffix in matches: 296 | #print((prefix, rrid, sep, id_, suffix)) 297 | exact = 'RRID:' + id_ 298 | exact_for_hypothesis = exact 299 | yield prefix, exact, exact_for_hypothesis, suffix 300 | 301 | # second round 302 | orblock = '(' + '|'.join(col0(prefixes)) + ')' 303 | sep = '(:|_)([ \t]*)' 304 | agsep = '([ \t]*#)([ \t]*)' # FIXME doesn't work with "Stock No." or "Stock No:" 305 | agorblock = '(' + '|'.join(col0(agprefixes)) + ')' 306 | regex2 = ('(.{0,32})(?:' + orblock + f'{sep}(\d+)|(CVCL){sep}(\w+)|' 307 | + agorblock + f'{agsep}(\w+))([^\w].{{0,31}})') # the first 0,32 always greedy matches??? 
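    # second round: bare catalog forms with no explicit RRID: label, e.g.
    # 'JAX: 000664', 'CVCL_1234', or 'Addgene #12345'; anything regex1 already
    # caught is skipped below, and prefixes map to resolver namespaces via prefix_lookup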
308 | matches2 = re.findall(regex2, text) # FIXME this doesn't work since our prefix/suffix can be 'wrong' 309 | for (prefix, namespace, sep, spaces, nums, 310 | cvcl, cvcl_sep, cvcl_spaces, cvcl_nums, 311 | ag, ag_sep, ag_spaces, ag_nums, 312 | suffix) in matches2: 313 | if cvcl: 314 | #print('\t\t', (prefix, namespace, sep, spaces, nums, cvcl, cvcl_sep, cvcl_spaces, cvcl_nums, suffix)) 315 | namespace, sep, spaces, nums = cvcl, cvcl_sep, cvcl_spaces, cvcl_nums # sigh 316 | elif ag: 317 | namespace, sep, spaces, nums = ag, ag_sep, ag_spaces, ag_nums # sigh 318 | if re.match(regex1, ''.join((prefix, namespace, sep, spaces, nums, suffix))) is not None: 319 | #print('already matched') 320 | continue # already caught it above and don't want to add it again 321 | if ag: # switch sep for addgene after match 322 | sep = '_' 323 | exact_for_hypothesis = namespace + sep + nums 324 | resolver_namespace = prefix_lookup[namespace] 325 | exact = 'RRID:' + resolver_namespace + sep + nums 326 | yield prefix, exact, exact_for_hypothesis, suffix 327 | 328 | # third round for BDSC 329 | regex3 = '(.{0,32})(BDSC|BL|Bl|Bloomington)(\s?)(stock)?(\s)?(#|no|no\.)?(\s?)([0-9]{2,10})([^\w].{0,31})' 330 | matches3 = re.findall(regex3, text) 331 | for prefix, a, b, c, d, e, f, nums, suffix in matches3: 332 | if nums in ['17', '21']: 333 | # special case to avoid false positives 334 | continue 335 | yield prefix, f'RRID:BDSC_{nums.strip()}', f'{a}{b}{c}{d}{e}{f}{nums}', suffix 336 | 337 | # fourth round for SAMN 338 | regex4 = '(.{0,32})(SAMN)(\s?)([0-9]{3,15})([^\w].{0,31})' 339 | matches4 = re.findall(regex4, text) 340 | for prefix, a, b, nums, suffix in matches4: 341 | yield prefix, f'RRID:SAMN{nums.strip()}', f'{a}{b}{nums}', suffix 342 | 343 | 344 | # extract from post 345 | 346 | def process_POST_request(request): 347 | dict_ = dict(request.form) 348 | def htmlify(thing): 349 | try: 350 | html = dict_[thing] 351 | except KeyError as e: 352 | html = '' 353 | return '' + html + '' 354 | uri = dict_['uri'] 355 | head = htmlify('head') 356 | body = htmlify('body') 357 | try: 358 | text = dict_['data'] 359 | except KeyError as e: 360 | text = '' 361 | 362 | headsoup = BeautifulSoup(head, 'lxml') 363 | bodysoup = BeautifulSoup(body, 'lxml') 364 | 365 | target_uri = getUri(uri, headsoup, bodysoup) 366 | #doi = getDoi(headsoup, bodysoup) 367 | #pmid = getPmid(headsoup, bodysoup) 368 | document, doi, pmid = getDocument(target_uri, headsoup, bodysoup) 369 | cleaned_text = clean_text(text) 370 | return target_uri, document, doi, pmid, head, body, text, cleaned_text 371 | 372 | 373 | class PaperId: 374 | id_types = ( 375 | 'uri_normalized', 376 | 'doi', 377 | 'pmid', 378 | 'hypothesis_normalized', 379 | 'uri', 380 | ) 381 | def __init__(self, 382 | uri_normalized, 383 | doi=None, 384 | pmid=None, 385 | hypothesis_normalized=None, 386 | uri=None): 387 | 388 | # names 389 | self.uri_normalized = uri_normalized 390 | self.doi = OntId(doi) if doi else doi 391 | self.pmid = OntId(pmid) if pmid else pmid 392 | self.hypothesis_normalized = hypothesis_normalized 393 | self.uri = uri 394 | 395 | # amusingly the actual identities 396 | self.urn = None # pdf fingerprint 397 | self.text_hash = None 398 | self.html_hash = None 399 | self.head_hash = None 400 | self.body_hash = None 401 | self.jats_hash = None 402 | self.stripped_hash = None 403 | 404 | @property 405 | def _existing_ids(self): 406 | for id_type in self.id_types: 407 | id = getattr(self, id_type, None) 408 | if id is not None: 409 | yield id 410 | 411 | @property 412 | 
def existing_ids(self): 413 | return set(self._existing_ids) 414 | 415 | @property 416 | def _resolvable_ids(self): 417 | yield self.doi.iri 418 | yield self.uri 419 | 420 | @property 421 | def resolvable_ids(self): 422 | return set(self._resolvable_ids) 423 | 424 | @property 425 | def _chains(self): 426 | for id in self.resolvable_ids: 427 | yield id, tuple(resolution_chain(id)) 428 | 429 | @property 430 | def chains(self): 431 | return {id:chain for id, chain in self._chains} 432 | 433 | def idPaper(self): 434 | if self.doi is None: 435 | paper = self 436 | doi = paper['DOI'] 437 | pmid = paper['PMID'] 438 | log.info(url) 439 | if not self.doi and self.uri.startswith('http'): # we've go some weird ones in there... 440 | doi = scrapeDoi(uri) 441 | # scrapeIds(uri) 442 | if doi is not None: 443 | log.info(doi) 444 | pmid = get_pmid(doi) 445 | log.warning('json malformed in get_pmid') 446 | log.info(pmid) 447 | resp = annotate_doi_pmid(url, doi, pmid, rrcu.h_curation, []) 448 | log.info('new doi') 449 | return resp 450 | else: 451 | log.info(doi) 452 | log.info('already found') 453 | 454 | def scrapeDoi(self): 455 | env = os.environ.copy() 456 | cmd_line = ['timeout', '30s', 'google-chrome-unstable', '--headless', '--dump-dom', url] 457 | p = subprocess.Popen(cmd_line, stdin=subprocess.PIPE, 458 | stdout=subprocess.PIPE, 459 | stderr=subprocess.STDOUT, 460 | env=env) 461 | out, err = p.communicate() 462 | if p.returncode: 463 | log.critical('UTOH') 464 | return None 465 | elif b'ERROR:headless_shell.cc' in out: 466 | log.critical(out) 467 | raise IOError('Something is wrong...') 468 | qurl = quote(url, '') 469 | if len(qurl) > 200: 470 | qurl = qurl[:200] 471 | with open(os.path.expanduser(f'~/files/scibot/{qurl}'), 'wb') as f: 472 | f.write(out) 473 | both = BeautifulSoup(out, 'lxml') 474 | doi = getDoi(both, both) 475 | return doi 476 | 477 | 478 | 479 | 480 | 481 | -------------------------------------------------------------------------------- /scibot/get_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Get the 10k most recent annotations from a group. """ 3 | import json 4 | import hashlib 5 | from os import environ, chmod 6 | from datetime import date 7 | from hyputils.hypothesis import HypothesisUtils 8 | 9 | def main(): 10 | TODAY = date.isoformat(date.today()) 11 | 12 | api_token = environ.get('SCIBOT_API_TOKEN', 'TOKEN') # Hypothesis API token 13 | username = environ.get('SCIBOT_USERNAME', 'USERNAME') # Hypothesis username 14 | group = environ.get('SCIBOT_GROUP', '__world__') 15 | group_staging = environ.get('SCIBOT_GROUP_STAGING', '__world__') 16 | 17 | m = hashlib.sha256() 18 | m.update(group.encode()) 19 | group_hash = m.hexdigest()[:16] # 16 is 2x the length of the original group... 
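    # the hash, not the raw group id, goes into the output filename below,
    # presumably so the dump name does not leak the private group id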
20 | h = HypothesisUtils(username=username, token=api_token, group=group, max_results=10000) 21 | # FIXME there seems to be an additiona bug here in hyputils 22 | # there will be an error at the end when we overrun 10k 23 | recent_annos = list(h.search_all({'group': h.group})) 24 | fn = f'annos-{group_hash}-{TODAY}.json' 25 | with open(fn, 'wt') as f: 26 | json.dump(recent_annos, f, indent=4) 27 | chmod(fn, 0o600) 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /scibot/papers.py: -------------------------------------------------------------------------------- 1 | class KeyAccessor: 2 | # XXX whenever annos changes these need to be recreated as well... 3 | # if that is too slow then we will need to implement proper add 4 | # and delete for only affected items... 5 | """ For when your identifiers aren't python compatible. """ 6 | prop = 'id' 7 | object_container_class = set 8 | def __init__(self, objects=tuple(), id_prop=None): 9 | self._propagate = issubclass(self.object_container_class, KeyAccessor) 10 | self._objects = {} 11 | errors = [] 12 | for o in objects: 13 | k = getattr(o, self.prop) 14 | if k not in self._objects: 15 | self._objects[k] = self._make_cont() 16 | 17 | try: 18 | self._objects[k].add(o) 19 | except BaseException as e: 20 | if errors: 21 | try: 22 | raise e from errors[-1] 23 | except BaseException as ne: 24 | errors.append(ne) 25 | else: 26 | errors.append(e) 27 | 28 | if errors: 29 | raise errors[-1] 30 | 31 | self._id_prop = None 32 | if id_prop is not None: 33 | if objects: 34 | print('setting id prop') 35 | setattr(self, id_prop, getattr(o, id_prop)) # o from the for loop 36 | else: 37 | self._id_prop = id_prop 38 | 39 | def _make_cont(self): 40 | if self._propagate: 41 | cont = self.object_container_class(id_prop=self.prop) 42 | else: 43 | cont = self.object_container_class() 44 | return cont 45 | 46 | def remove(self, object_): 47 | k = getattr(object_, self.prop) 48 | self._objects[k].remove(object_) 49 | if not self._objects[k]: 50 | self._objects.pop(k) 51 | 52 | def discard(self, object_): 53 | k = getattr(object_, self.prop) 54 | self._objects[k].discard(object_) 55 | if not self._objects[k]: 56 | self._objects.pop(k) 57 | 58 | def add(self, object_): 59 | if self._id_prop is not None: 60 | setattr(self, self._id_prop, getattr(object_, self._id_prop)) 61 | self._id_prop = None 62 | k = getattr(object_, self.prop) 63 | if k not in self._objects: 64 | self._objects[k] = self._make_cont() 65 | 66 | self._objects[k].add(object_) 67 | 68 | def keys(self): 69 | return sorted(self._objects, key=lambda k: '0' if k is None else k) 70 | 71 | def values(self): 72 | for v in list(self._objects.values()): 73 | yield v 74 | 75 | def items(self): 76 | # we use list() here to simplify synchronization issues with the websocket 77 | # since yield allows the thread to shift 78 | for k, v in list(self._objects.items()): 79 | yield k, v 80 | 81 | def __iter__(self): 82 | for k in self._objects: 83 | yield k 84 | 85 | def __contains__(self, key): 86 | return key in self._objects 87 | 88 | def __getitem__(self, key): 89 | try: 90 | return self._objects[key] 91 | except KeyError: 92 | self.__missing__(key) 93 | 94 | #def __setitem__(self, key): 95 | #raise TypeError('Cannot set values on this class, it is immutable') 96 | 97 | def __missing__(self, key): 98 | raise KeyError(f'{key} not found') 99 | 100 | def __len__(self): 101 | return len(self._objects) 102 | 103 | def __repr__(self): 104 | 
return repr({k:v for k,v in self.items()}) 105 | 106 | def __str__(self): 107 | return str({k:v for k,v in self.items()}) 108 | 109 | 110 | class RRIDs(KeyAccessor): 111 | """ AKA a Paper """ 112 | prop = 'rrid' 113 | object_container_class = set 114 | 115 | @property 116 | def doi(self): 117 | if None in self._objects: 118 | for o in self._objects[None]: 119 | if o.KillPageNote: 120 | continue 121 | if o._anno.is_page_note or o.user != 'scibot': # FIXME some curators did these as annotations too... 122 | for t in o.tags: 123 | if t.startswith('DOI:') and ' ' not in t and '\n' not in t and t.count(':') == 1: 124 | return t 125 | 126 | @property 127 | def pmid(self): 128 | if None in self._objects: 129 | for o in self._objects[None]: 130 | if o.KillPageNote: 131 | continue 132 | for t in o.tags: 133 | if t.startswith('PMID:') and ' ' not in t and '\n' not in t and t.count(':') == 1: 134 | return t 135 | 136 | 137 | class Papers(KeyAccessor): 138 | prop = 'uri_normalized' 139 | object_container_class = RRIDs 140 | 141 | 142 | class SameDOI(KeyAccessor): 143 | prop = 'doi' 144 | object_container_class = Papers 145 | 146 | 147 | class SamePMID(KeyAccessor): 148 | prop = 'pmid' 149 | object_container_class = Papers 150 | 151 | 152 | class MultiplePMID(KeyAccessor): 153 | prop = 'doi' 154 | object_container_class = SamePMID 155 | 156 | 157 | class MultipleDOI(KeyAccessor): 158 | prop = 'pmid' 159 | object_container_class = SameDOI 160 | 161 | 162 | class RRIDSimple(KeyAccessor): 163 | prop = 'rrid' 164 | object_container_class = set 165 | 166 | 167 | class PMIDRRIDs(KeyAccessor): 168 | prop = 'pmid' 169 | object_container_class = RRIDSimple 170 | 171 | 172 | class DOIRRIDs(KeyAccessor): 173 | prop = 'doi' 174 | object_container_class = RRIDSimple 175 | 176 | 177 | class MPP(KeyAccessor): 178 | prop = 'uri_normalized' 179 | object_container_class = PMIDRRIDs 180 | 181 | 182 | class MPD(KeyAccessor): 183 | prop = 'uri_normalized' 184 | object_container_class = DOIRRIDs 185 | -------------------------------------------------------------------------------- /scibot/release_report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import socket 4 | from collections import namedtuple, Counter 5 | from pyontutils.utils import mysql_conn_helper 6 | from scibot.release import Curation, PublicAnno, get_annos, get_pannos 7 | from scibot.utils import uri_normalization 8 | from scibot import config 9 | try: 10 | breakpoint 11 | except NameError: 12 | from IPython import embed as breakpoint 13 | 14 | 15 | def dbUri(user='nif_eelg_secure', host='nif-mysql.crbs.ucsd.edu', port=3306, database='nif_eelg'): 16 | DB_URI = 'mysql+pymysql://{user}:{password}@{host}:{port}/{db}' # FIXME db => pyontutils refactor 17 | if socket.gethostname() in config.dev_remote_hosts: 18 | db_cfg_kwargs = mysql_conn_helper('localhost', database, user, 33060) # see .ssh/config 19 | else: 20 | db_cfg_kwargs = mysql_conn_helper(host, database, user, port) 21 | 22 | return DB_URI.format(**db_cfg_kwargs) 23 | 24 | 25 | class RRIDData: 26 | def __init__(self, session): 27 | self.session = session 28 | 29 | def james_rrids(self): 30 | sql = '''SELECT m.uri, v.pmid, v.rrid, m.annotation_id, 31 | m.hypothesis_user, v.journal, v.title, v.year 32 | FROM rrid_mentions_view2 AS v JOIN rrid_mentions AS m ON v.rrid_mention_id = m.id 33 | WHERE source = 'Hypothesis' ''' 34 | yield from self.session.execute(sql) 35 | 36 | def missing_pmids(self): 37 | sql = '''SELECT 
distinct(pmid) FROM rrid_mentions''' 38 | sql2 = '''SELECT distinct(concat("PMID:", pmid)) FROM rrid_mentions_literature_records''' 39 | all_pmids = set(p for p, in self.session.execute(sql)) 40 | pmids_in_lit_table = set(p for p, in self.session.execute(sql2)) 41 | missing = all_pmids - pmids_in_lit_table 42 | return missing 43 | 44 | def combine(self, rrid_recs=None): 45 | if rrid_recs is None: 46 | rrid_recs = self.james_rrids() 47 | 48 | header = ('release', 'issue', 'urin', 'uri', 49 | 'pmid', 'cpmid', 'doi', 'rrid', 'crrid', 50 | 'rlink', 'alink', 'plink', 51 | 'user', 'journal', 'title', 'year')#, 'ptext') 52 | rr = namedtuple('reprow', header) 53 | yield rr(*header) 54 | pa_done = set() 55 | for row in rrid_recs: 56 | #print(row) 57 | c = Curation.byId(row.annotation_id) 58 | errorl = [] 59 | if c is None: 60 | 61 | yield rr(None, 'BAD-ANNOTATION-ID', None, row.uri, 62 | row.pmid, None, None, row.rrid, None, 63 | None, row.annotation_id, None, 64 | row.hypothesis_user, row.journal, row.title, row.year)#, None) 65 | continue 66 | 67 | else: 68 | if row.uri != c.uri: 69 | errorl.append('URI-mismatch') 70 | if row.pmid != c.pmid: 71 | errorl.append('PMID-mismatch') 72 | if row.rrid != c.rrid: 73 | errorl.append('RRID-mismatch') 74 | 75 | issue = ' '.join(errorl) 76 | pa = c._public_anno if c else None 77 | pasl = pa.shareLink if pa is not None else None 78 | pat = pa.text if pa is not None else None 79 | if pa is not None: 80 | pa_done.add(pa.id) 81 | yield rr(c.isReleaseNode, issue, c.uri_normalized, row.uri, 82 | row.pmid, c.pmid, c.doi, row.rrid, c.rrid, 83 | c.rridLink, c.shareLink, pasl, 84 | row.hypothesis_user, row.journal, row.title, row.year)#, pat) 85 | 86 | rissue = 'RELEASED-BUT-NOT-IN-TABLE' 87 | for pa in PublicAnno: 88 | if pa.id not in pa_done: 89 | cs = list(pa.curation_annos) 90 | if not cs: 91 | irn = None 92 | issue = 'DELETE-THIS' 93 | cpmid = None 94 | cdoi = None 95 | cshareLink = None 96 | curators = '' # FIXME look this up? 97 | else: 98 | issue = rissue 99 | irn = all(c.isReleaseNode if c is not None else c for c in cs) 100 | if not irn: 101 | if None in cs: 102 | issue = '!?-WHAT-HAVE-YOU-DONE!?-WHERE-IS-MY-CURATION-ANNO!? 
' + issue 103 | cs = [c for c in cs if c is not None] 104 | 105 | if not cs: 106 | irn = None 107 | issue = 'DELETE-THIS ' + issue 108 | cpmid = None 109 | cdoi = None 110 | cshareLink = None 111 | else: 112 | cdoi = cs[0].doi # NOTE this MASSIVE CHANGES semantics of the rrid column 113 | if cdoi != pa.doi: 114 | issue = 'DOI-mismatch ' + issue 115 | 116 | cpmid = cs[0].pmid # NOTE this changes semantics of the pmid column 117 | if cpmid != pa.pmid: 118 | issue = 'PMID-mismatch ' + issue 119 | 120 | 121 | cshareLink = cs[0].shareLink 122 | 123 | curators = ' '.join([c for ca in cs for c in ca.curators]) 124 | 125 | issue = 'z ' + issue # ordering 126 | yield rr(irn, issue, pa.uri_normalized, pa.uri, 127 | pa.pmid, cpmid, cdoi, pa.doi, pa.rrid, 128 | pa.rridLink, cshareLink, pa.shareLink, 129 | curators, None, None, None)#, pa.text) 130 | 131 | 132 | def main(): 133 | from sqlalchemy import create_engine 134 | from sqlalchemy.orm.session import sessionmaker 135 | engine = create_engine(dbUri(), echo=True) 136 | Session = sessionmaker() 137 | Session.configure(bind=engine) 138 | session = Session() 139 | rd = RRIDData(session) 140 | 141 | annos = get_annos() 142 | Curation._annos_list = annos 143 | pannos = get_pannos() 144 | PublicAnno._annos_list = pannos 145 | 146 | for helper in (Curation, PublicAnno): 147 | [helper(a, helper._annos_list) for a in helper._annos_list] 148 | 149 | def key(r): 150 | i = r.issue if r.issue else '' 151 | p = r.pmid if r.pmid else '' 152 | u = r.urin if r.urin else '' 153 | c = r.crrid if r.crrid else '' 154 | return i, p, u, c 155 | 156 | rrid_recs = [r for r in rd.james_rrids() if r.uri] 157 | 158 | rd = RRIDData(session) 159 | gen = rd.combine(rrid_recs) 160 | report = [next(gen)] + sorted(gen, key=key) 161 | bads = [r for r in report if r.issue] 162 | noanno = [r for r in bads if not r.urin] 163 | noalink = [r for r in noanno if not r.alink] 164 | noannoalink = [r for r in noanno if r.alink] 165 | 166 | badwanno = [r for r in bads if r.urin] 167 | 168 | irep = {} 169 | for r in report: 170 | if r.alink not in irep: 171 | irep[r.alink] = set() 172 | 173 | irep[r.alink].add(r) 174 | 175 | duplicates = {k:v for k, v in irep.items() if len(v) > 1} 176 | 177 | with open('scibot-rrid-bads.csv', 'wt') as f: 178 | writer = csv.writer(f, lineterminator='\n') 179 | writer.writerows(bads) 180 | 181 | with open('scibot-rrid-all.csv', 'wt') as f: 182 | writer = csv.writer(f, lineterminator='\n') 183 | writer.writerows(report) 184 | 185 | # FIXME PublicAnno pmid and doi do not work correctly ... 186 | # FIXME there are 233 annotations with public ids and public rrids 187 | # that do not have an rrid, meaning that the backlinks got scrambled somehow ! 
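    # simple public report: one row per released curation annotation (public_id set)
    # that has a pmid, an rrid, and a canonical rrid, and is not flagged NotRRID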
188 | public_report = [(c.pmid, c.canonical_rrid, c.rrid, 189 | c._public_anno.shareLink, 190 | c._anno.prefix, c.exact, c._anno.suffix,) 191 | for c in Curation 192 | if c.isAstNode and c.public_id is not None and 193 | c.pmid is not None and c.rrid is not None and 194 | not c.NotRRID and c.canonical_rrid is not None] 195 | wat = [r for r in public_report if None in r] 196 | public_report.sort() 197 | pheader = 'pmid', 'canonical rrid', 'rrid', 'public share link', 'prefix', 'exact', 'suffix' 198 | public_report = [pheader] + public_report 199 | 200 | with open('scibot-rrid-public-simple.csv', 'wt') as f: 201 | writer = csv.writer(f, lineterminator='\n') 202 | writer.writerows(public_report) 203 | 204 | breakpoint() 205 | 206 | 207 | if __name__ == '__main__': 208 | main() 209 | -------------------------------------------------------------------------------- /scibot/rridxp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """rridxp: export and crunch RRID data 3 | 4 | Usage: 5 | rridxp csv [options] [...] 6 | rridxp multi-id-report [options] 7 | 8 | Examples: 9 | rridxp csv 10 | rridxp csv MGI IMSR 11 | 12 | Options: 13 | -h --help show this 14 | -d --debug drop into embed after jobs finish 15 | """ 16 | 17 | # Run these commands (with variables filled in) to retrieve the data 18 | # the first run may take ~30mins to synchronize all annotations 19 | # export SCIBOT_USERNAME=scibot 20 | # export SCIBOT_GROUP=${SCIBOT_CURATION_GROUP} 21 | # export SCIBOT_GROUP2=${SCIBOT_CURATION_GROUP} 22 | # export SCIBOT_GROUP_STAGING=__world__ 23 | # export SCIBOT_API_TOKEN=${SCIBOT_API_TOKEN} 24 | # export SCIBOT_SYNC=$(head -c 100 /dev/urandom | tr -dc 'a-zA-Z0-9') 25 | 26 | import csv 27 | import json 28 | from datetime import datetime 29 | from pyontutils.utils import anyMembers 30 | try: 31 | breakpoint 32 | except NameError: 33 | from IPython import embed as breakpoint 34 | 35 | 36 | def UTCNOW(): 37 | return datetime.isoformat(datetime.utcnow()) 38 | 39 | def deNone(*things): 40 | return tuple('' if thing is None else thing for thing in things) 41 | 42 | def multiIssue(mp): 43 | return {d:{p:set(_.uri for _ in c.values()) 44 | for p, c in r.items() 45 | if p is not None} 46 | for d, r in mp.items() 47 | if d is not None and 48 | len([k for k in r.keys() if k is not None]) > 1} 49 | 50 | class Encode(json.JSONEncoder): 51 | def default(self, thing): 52 | if isinstance(thing, set): 53 | return list(thing) 54 | else: 55 | return super().default(thing) 56 | 57 | def main(): 58 | from docopt import docopt 59 | args = docopt(__doc__, version='rridxp 0.0.0') 60 | print(args) 61 | from scibot.release import get_annos, Curation, SamePMID, MultiplePMID, MultipleDOI, MPP, MPD 62 | annos = get_annos() 63 | [Curation(a, annos) for a in annos] 64 | def midr(): 65 | mp = multiIssue(MultiplePMID(Curation)) 66 | md = multiIssue(MultipleDOI(Curation)) 67 | # filtering by url first removes any detectable instances of multiple dois/pmids 68 | #mpp = multiIssue(MPP(Curation)) 69 | #mpd = multiIssue(MPD(Curation)) 70 | with open('multiple-pmids.json', 'wt') as f: 71 | json.dump(mp, f, sort_keys=True, indent=4, cls=Encode) 72 | with open('multiple-dois.json', 'wt') as f: 73 | json.dump(md, f, sort_keys=True, indent=4, cls=Encode) 74 | 75 | if args['multi-id-report']: 76 | midr() 77 | 78 | elif args['csv']: 79 | substrings = args[''] # ['MGI', 'IMSR'] 80 | if substrings: 81 | ssj = '-'.join(ss.lower() for ss in substrings) + '-' 82 | else: 83 | substrings 
= [''] 84 | ssj = 'all-' 85 | 86 | pmids2 = SamePMID(set(annotation 87 | for paper in Curation._papers.values() 88 | for rrid, annotations in paper.items() 89 | if rrid is not None and anyMembers(rrid, *substrings) 90 | for annotation in annotations)) 91 | 92 | now = UTCNOW() 93 | rows = [['PMID', 'DOI', 'URI', 'shareLink', 'exact', 'rrid', 'public_tags']] 94 | rows += sorted(deNone(anno.pmid, anno.doi, anno.uri, anno.shareLink, anno.exact, anno.rrid, ','.join([t for t in anno.public_tags if 'RRID:' not in t])) 95 | for pmid, papers in pmids2.items() 96 | for rrids in papers.values() 97 | for annos in rrids.values() 98 | for anno in annos) 99 | with open(f'{ssj}rrids-{now}.csv', 'wt') as f: 100 | csv.writer(f, lineterminator='\n').writerows(rows) 101 | 102 | nomatch = [['PMID', 'DOI', 'URI', 'shareLink', 'exact', 'rrid', 'public_tags']] 103 | nomatch += sorted(deNone(anno.pmid, anno.doi, anno.uri, anno.shareLink, anno.exact, anno.rrid, ','.join([t for t in anno.public_tags if 'RRID:' not in t])) 104 | for pmid, papers in pmids2.items() 105 | for rrids in papers.values() 106 | for annos in rrids.values() 107 | for anno in annos 108 | if anno.exact and anno.rrid and anno.exact not in anno.rrid) 109 | 110 | with open(f'{ssj}rrids-nomatch-{now}.csv', 'wt') as f: 111 | csv.writer(f, lineterminator='\n').writerows(nomatch) 112 | 113 | if args['--debug']: 114 | breakpoint() 115 | 116 | if __name__ == '__main__': 117 | main() 118 | -------------------------------------------------------------------------------- /scibot/services.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from scibot.utils import log 4 | try: 5 | from urllib.parse import urlencode, quote 6 | except ImportError: 7 | from urllib import urlencode, quote 8 | 9 | # existing identifiers and rrids 10 | 11 | def existing_tags(target_uri, h): 12 | params = { 13 | 'limit':200, 14 | 'uri':target_uri, 15 | 'group':h.group, 16 | 'user':h.username, 17 | } 18 | query_url = h.query_url_template.format(query=urlencode(params, True)) 19 | obj = h.authenticated_api_query(query_url) 20 | rows = obj['rows'] 21 | tags = {} 22 | unresolved_exacts = {} 23 | for row in rows: 24 | # FIXME make sure that target_uri actually matches the URI returned here! 
25 | # it is ok to to have multiple page notes appear as a result because we 26 | # don't have access to hypothes.is's document table 27 | for tag in row['tags']: 28 | if tag.startswith('RRID:'): 29 | tags[tag] = row['id'] 30 | elif tag.startswith('PMID:'): 31 | tags[tag] = row['id'] 32 | elif tag.startswith('DOI:'): 33 | tags[tag] = row['id'] 34 | elif tag == 'RRIDCUR:Unresolved': 35 | unresolved_exacts[row['target'][0]['selector'][0]['exact']] = row['id'] 36 | return tags, unresolved_exacts 37 | 38 | # PMIDs 39 | 40 | def get_pmid(doi): # TODO 41 | url = f'https://www.ncbi.nlm.nih.gov/pubmed/?term={quote(doi)}[Location ID]&report=uilist&format=text' 42 | body = requests.get(url).text 43 | soup = BeautifulSoup(body, 'lxml') 44 | matches = soup.find_all('pre') 45 | if matches: 46 | pmid = matches[0].get_text().strip() 47 | if '\n' in pmid: # in the event that we get multiple PMIDs it means something is wrong 48 | pmid = None 49 | if pmid: 50 | log.info(f'got pmid from pubmed: {pmid}') 51 | return 'PMID:' + pmid 52 | params={'idtype':'auto', 'format':'json', 'ids':doi,} 53 | endpoint = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/' 54 | resp = requests.get(endpoint, params=params) 55 | if resp.ok: 56 | pj = resp.json() 57 | else: 58 | resp.raise_for_status() 59 | 60 | log.debug(pj) 61 | for rec in pj['records']: 62 | try: 63 | return 'PMID:' + rec['pmid'] 64 | except KeyError: 65 | pass 66 | 67 | # RRIDs 68 | 69 | def rrid_resolver_xml(exact, found_rrids): 70 | print('\t' + exact) 71 | resolver_uri = 'https://scicrunch.org/resolver/%s.xml' % exact 72 | r = requests.get(resolver_uri) 73 | status_code = r.status_code 74 | xml = r.content 75 | print(status_code) 76 | found_rrids[exact] = status_code 77 | return xml, status_code, resolver_uri 78 | -------------------------------------------------------------------------------- /scibot/submit.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | from scibot.utils import DOI, PMID, rrid_from_citation, log 3 | 4 | def make_extra(document, expanded_exact=None): 5 | out = {'document': document,} 6 | if expanded_exact: 7 | out['expanded_exact'] = expanded_exact 8 | 9 | return out 10 | 11 | def annotate_doi_pmid(target_uri, document, doi, pmid, h, tags, extra_text=None): # TODO 12 | # need to check for existing ... 
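    # creates a single annotation on target_uri carrying DOI:/PMID: tags (plus
    # resolver links in the body text), but only for identifiers not already in `tags`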
13 | extra = make_extra(document) 14 | text_list = [] 15 | tags_to_add = [] 16 | if extra_text is not None: 17 | text_list.append(extra_text) 18 | if doi is not None: 19 | doi_ = 'DOI:' + doi 20 | if doi_ not in tags: 21 | text_list.append(DOI(doi)) 22 | tags_to_add.append(doi_) 23 | if pmid and pmid not in tags: 24 | text_list.append(PMID(pmid)) 25 | tags_to_add.append(pmid) 26 | if tags_to_add: 27 | r = h.create_annotation_with_target_using_only_text_quote(url=target_uri, 28 | document=document, 29 | text='\n'.join(text_list), 30 | tags=tags_to_add, 31 | extra=extra,) 32 | log.info(r) 33 | log.info(r.text) 34 | return r 35 | 36 | 37 | def submit_to_h(target_uri, document, found, resolved, h, found_rrids, existing, existing_with_suffixes): 38 | prefix, exact, exact_for_hypothesis, suffix = found 39 | xml, status_code, resolver_uri = resolved 40 | extra = make_extra(document, exact) 41 | 42 | if exact.startswith('RRID:'): 43 | tail = exact[len('RRID:'):] 44 | else: 45 | tail = exact 46 | 47 | duplicate = exact in existing 48 | suffix_match = (tail, suffix) in existing_with_suffixes # FIXME prefix tree for fuzzier matches 49 | new_tags = [] 50 | if duplicate: 51 | new_tags.append('RRIDCUR:Duplicate') 52 | elif suffix_match: 53 | log.info(f'suffix matches, skipping entirely, {tail} {suffix}') 54 | return 55 | else: 56 | # note that we use the normalized exact here to detect 57 | # duplicates but provide the canonical RRID: as the tag 58 | existing.append(exact) 59 | existing_with_suffixes.append((tail, suffix)) 60 | 61 | if status_code < 300: 62 | root = etree.fromstring(xml) 63 | if duplicate: 64 | # just mark the duplicate so that it will anchor in the client 65 | # but don't add the RRID: tag and don't include the resolver metadata 66 | r = h.create_annotation_with_target_using_only_text_quote(url=target_uri, 67 | document=document, 68 | prefix=prefix, 69 | exact=exact_for_hypothesis, 70 | suffix=suffix, 71 | text='', 72 | tags=new_tags, 73 | extra=extra,) 74 | 75 | elif root.findall('error'): 76 | s = 'Resolver lookup failed.' 77 | s += '
<p><a href="%s">resolver lookup</a></p>
' % resolver_uri 78 | r = h.create_annotation_with_target_using_only_text_quote(url=target_uri, 79 | document=document, 80 | prefix=prefix, 81 | exact=exact_for_hypothesis, 82 | suffix=suffix, 83 | text=s, 84 | tags=new_tags + ['RRIDCUR:Unresolved'], 85 | extra=extra,) 86 | log.error(f'rrid unresolved {exact}') 87 | 88 | else: 89 | s = '' 90 | title = root.findall('title')[0].text 91 | s += f'Title: {title}\n' 92 | data_elements = root.findall('data')[0] 93 | data_elements = [(e.find('name').text, e.find('value').text) for e in data_elements] # these shouldn't duplicate 94 | citation = [(n, v) for n, v in data_elements if n == 'Proper Citation'] 95 | rrid = [rrid_from_citation(c) for _, c in citation] if citation else [exact] 96 | name = [(n, v) for n, v in data_elements if n == 'Name'] 97 | data_elements = citation + name + sorted([(n, v) for n, v in 98 | data_elements if (n != 'Proper Citation' or 99 | n != 'Name') and v is not None]) 100 | for name, value in data_elements: 101 | if ((name == 'Reference' or name == 'Mentioned In Literature') 102 | and value is not None and value.startswith(' 500: 104 | continue # nif-0000-30467 fix keep those pubmed links short! 105 | s += '
<p>%s: %s</p>
' % (name, value) 106 | s += '
<p><a href="%s">resolver lookup</a></p>
' % resolver_uri 107 | r = h.create_annotation_with_target_using_only_text_quote(url=target_uri, 108 | document=document, 109 | prefix=prefix, 110 | exact=exact_for_hypothesis, 111 | suffix=suffix, 112 | text=s, 113 | tags=new_tags + rrid, 114 | extra=extra,) 115 | 116 | elif status_code >= 500: 117 | s = 'Resolver lookup failed due to server error.' 118 | s += '
<p><a href="%s">resolver lookup</a></p>
' % resolver_uri 119 | r = None 120 | log.error(f'{status_code} error for {resolver_uri}') 121 | else: 122 | s = 'Resolver lookup failed.' 123 | s += '
<p><a href="%s">resolver lookup</a></p>
' % resolver_uri 124 | r = h.create_annotation_with_target_using_only_text_quote(url=target_uri, 125 | document=document, 126 | prefix=prefix, 127 | exact=exact_for_hypothesis, 128 | suffix=suffix, 129 | text=s, 130 | tags=new_tags + ['RRIDCUR:Unresolved'], 131 | extra=extra,) 132 | if r is not None: 133 | if r.ok: 134 | found_rrids[exact] = r.json()['links']['incontext'] 135 | else: 136 | try: 137 | r.raise_for_status() 138 | except Exception as e: 139 | # FIXME the fact that submission can fail silently and 140 | # that it does not get resubmitted or retried is a 141 | # major oversight ... 142 | log.exception(e) 143 | 144 | return r 145 | 146 | def api_row_to_db(api_row): 147 | # TODO insert the created annotation into our local store 148 | # check for contention/consistency with the websocket 149 | pass 150 | -------------------------------------------------------------------------------- /scibot/sync.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Run the scibot sync service 3 | 4 | Usage: 5 | scibot-sync [options] 6 | 7 | Options: 8 | -p --port=PORT the port that the service should run on [default: 12345] 9 | """ 10 | 11 | from curio import run, socket, UniversalEvent, TaskGroup 12 | from curio.time import timeout_after, sleep 13 | from curio.errors import TaskTimeout, TaskCancelled 14 | from curio.channel import Channel, Connection, AuthenticationError 15 | from scibot.utils import log as _log 16 | 17 | log = _log.getChild('aChannel') 18 | slog = _log.getChild('sync') 19 | clog = slog.getChild('client') # TODO add process info here 20 | mlog = slog.getChild('manager') 21 | 22 | done = UniversalEvent() 23 | 24 | 25 | class aChannel(Channel): 26 | async def connect(self, *, authkey=None, attempts=None): 27 | nattempts = 0 28 | while True: 29 | try: 30 | sock = socket.socket(self.family, socket.SOCK_STREAM) 31 | await sock.connect(self.address) 32 | sock_stream = sock.as_stream() 33 | c = Connection(sock_stream, sock_stream) 34 | #raise BaseException 35 | try: 36 | async with timeout_after(1): 37 | if authkey: 38 | await c.authenticate_client(authkey) 39 | return c 40 | except TaskTimeout: 41 | log.warning('Channel connection to %s timed out', self.address) 42 | await c.close() 43 | del c 44 | del sock_stream 45 | 46 | except OSError as e: 47 | if attempts is not None: 48 | if nattempts >= attempts: 49 | raise e 50 | else: 51 | nattempts += 1 52 | else: 53 | log.error('Channel connection to %s failed', self.address, exc_info=True) 54 | 55 | await sock.close() 56 | await sleep(1) 57 | 58 | 59 | class Locker: 60 | def __init__(self, send): 61 | self.send = send 62 | 63 | def _getQ(self): 64 | asdf = set() 65 | while 1: 66 | try: 67 | asdf.add(self.urls.get_nowait()) 68 | print('oh boy') 69 | except Empty: 70 | break 71 | print('current queue', asdf) 72 | #print(id(self)) 73 | return asdf 74 | 75 | def _setQ(self, uris): 76 | for uri in uris: 77 | log.info('putting uri', uri) 78 | self.urls.put(uri) 79 | print('done putting', uris, 'in queue') 80 | 81 | def start_uri(self, uri): 82 | val = run(self.send, 'add ' + uri) 83 | if val: 84 | return True 85 | else: 86 | return 87 | 88 | print(self.lock, id(self.urls)) 89 | with self.lock: 90 | print(self.lock, id(self.urls)) 91 | uris = self._getQ() 92 | if uri in uris: 93 | log.info(uri, 'is already running') 94 | return True 95 | else: 96 | log.info('starting work for', uri) 97 | uris.add(uri) 98 | self._setQ(uris) 99 | 100 | def stop_uri(self, uri): 101 | run(self.send, 
'del ' + uri) 102 | return 103 | #print(self.lock, id(self.urls)) 104 | with self.lock: 105 | #print(self.lock, id(self.urls)) 106 | uris = self._getQ() 107 | uris.discard(uri) 108 | print('completed work for', uri) 109 | self._setQ(uris) 110 | 111 | 112 | async def exit(task_group): 113 | await done.wait() 114 | clog.info(f'sync {task_group} exiting ...') # have to call this before cancel 115 | await task_group.cancel_remaining() 116 | 117 | 118 | async def manage_single_connection(connection, currently_running_urls): 119 | while True: 120 | try: 121 | msg = await connection.recv() 122 | except (EOFError, ConnectionResetError) as e: # in the event that the client closes 123 | mlog.info(f'connection {connection} closed due to {e}') 124 | break 125 | else: 126 | op, uri = msg.split(' ', 1) 127 | mlog.debug(f'{op} :: {uri}') 128 | if op == 'add': 129 | if uri in currently_running_urls: 130 | await connection.send(True) 131 | else: 132 | currently_running_urls.add(uri) 133 | await connection.send(False) 134 | elif op == 'del': 135 | currently_running_urls.discard(uri) 136 | await connection.send(False) 137 | else: 138 | await connection.send('ERROR') 139 | mlog.debug(currently_running_urls) 140 | 141 | 142 | async def manager(chan, syncword): 143 | encoded = syncword.encode() 144 | currently_running_urls = set() 145 | async def listen_for_new_conns(task_group): 146 | while True: 147 | ch = Channel(chan) 148 | try: 149 | connection = await ch.accept(authkey=encoded) 150 | mlog.info(f'new connection created {connection}') 151 | await task_group.spawn(manage_single_connection, 152 | connection, 153 | currently_running_urls) 154 | await ch.close() # sort of strange that we need this? can we connect again later !? 155 | except ConnectionResetError as e: 156 | mlog.warning('client connection attempt did not terminate property') 157 | 158 | async with TaskGroup() as connection_tasks: 159 | await connection_tasks.spawn(exit, connection_tasks) 160 | await connection_tasks.spawn(listen_for_new_conns, connection_tasks) 161 | 162 | 163 | # synchronization setup 164 | async def client(chan, syncword): 165 | encoded = syncword.encode() 166 | async def auth(): 167 | ch = Channel(chan) 168 | async def connect(_ch=ch, authkey=encoded): 169 | connection = await _ch.connect(authkey=encoded) 170 | clog.debug(f'got connection {connection}') 171 | return connection 172 | 173 | async with TaskGroup(wait=any) as auth_or_exit: 174 | clog.info('waiting for sync services to start') 175 | exit_task = await auth_or_exit.spawn(exit, auth_or_exit) 176 | conn_task = await auth_or_exit.spawn(connect) 177 | 178 | connection = conn_task.result 179 | clog.debug(str(connection)) 180 | return connection 181 | 182 | clog.info('starting auth') 183 | heh = [await auth()] 184 | async def send(uri): 185 | c = heh[0] 186 | async def sendit(): 187 | await c.send(uri) 188 | resp = await c.recv() 189 | _uri = uri.split(' ', 1)[-1] 190 | msg = f'not :: {_uri}' if resp else f'run :: {_uri}' 191 | clog.debug(msg) 192 | return resp 193 | 194 | try: 195 | async with TaskGroup(wait=any) as send_or_exit: 196 | exit_task = await send_or_exit.spawn(exit, send_or_exit) 197 | send_task = await send_or_exit.spawn(sendit) 198 | 199 | try: 200 | resp = send_task.result 201 | return resp 202 | except TaskCancelled: 203 | return 204 | except RuntimeError as e: # FIXME not quite right? 205 | clog.error(e) # not eure what is causing this ... maybe a connection error? 
206 | 207 | except (EOFError, BrokenPipeError) as e: 208 | c = await auth() 209 | heh[0] = c 210 | return await send(uri) 211 | 212 | return send 213 | 214 | 215 | def main(): 216 | from scibot.config import syncword 217 | if syncword is None: 218 | raise KeyError('Please set the SCIBOT_SYNC environment variable') 219 | 220 | import os 221 | try: 222 | # This is in a try block because colorama (used by colorlog) wraps 223 | # redirected stdout to strip certain control codes which can cause an 224 | # AttributeError: 'NoneType' object has no attribute 'set_title' 225 | # because colorama changes os.sys.stdout.write in a way that 226 | # removes the call to set_title 227 | os.sys.stdout.write(f'\x1b]2;{os.path.basename(__file__)}\x07\n') 228 | except AttributeError as e: 229 | slog.exception(e) 230 | 231 | from docopt import docopt 232 | args = docopt(__doc__) 233 | 234 | chan = ('localhost', int(args['--port'])) 235 | run(manager, chan, syncword) 236 | 237 | 238 | if __name__ == '__main__': 239 | main() 240 | -------------------------------------------------------------------------------- /scibot/uri.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for dealing with URIs within the Hypothesis API. 3 | 4 | There are two main ways of considering the relationship between annotations and 5 | annotated objects: 6 | 7 | 1. Annotations are, practically speaking, made on web pages, and thus they have 8 | a URL associated with them. 9 | 10 | 2. Annotations are made on documents, and the particular HTML or PDF page being 11 | annotated is merely a specific manifestation of the abstract document that 12 | is being annotated. In this scenario, a document may be identified by one or 13 | more URIs. 14 | 15 | The second model is more complex from both a conceptual point of view and in 16 | terms of implementation, but it offers substantial benefits. If we talk of 17 | annotations attached to documents, without regard to presentation format or 18 | location, we are able to do many interesting things: 19 | 20 | - Alice makes an annotation on a PDF; Bob visits an HTML version of the same 21 | document, and sees Alice's annotation. 22 | - Alice makes an annotation on an Instapaper-hosted version of a web page which 23 | contains a tag. Bob visits the original article and sees 24 | Alice's annotation. 25 | - Bob makes an annotation on a PDF which is on his local machine. Alice opens 26 | the same PDF on her machine, and see's Bob's annotations even if the PDF has 27 | never been uploaded to a webserver. (We can do this because of the 28 | immutability of PDF documents -- we can uniquely fingerprint each one and 29 | form a "URN" of the form "urn:x-pdf:".) 30 | 31 | The challenge, then, is to enable these features without making the public API 32 | for creating and updating annotations overly complex. It turns out this is 33 | possible if we can answer two questions: 34 | 35 | 1. Given two URI strings, do they both refer to the same URI, practically 36 | speaking? (AKA "normalization".) 37 | 38 | e.g. on the web, the following URLs will *usually* refer to the same web 39 | page:: 40 | 41 | http://example.com/foo?a=hello&b=world 42 | http://exAMPle.com/foo?a=hello&b=world 43 | http://example.com/foo/?a=hello&b=world 44 | http://example.com/foo?b=world&a=hello 45 | http://example.com/foo?a=hello&b=world#somefragment 46 | 47 | 2. Given a URI, what are all the known URIs of the underlying *document* (in 48 | the sense given above). (AKA "expansion".) 
49 | 50 | e.g. we may know (from page metadata or otherwise) that all the following 51 | URIs refer to the same content, even if in differing formats:: 52 | 53 | http://example.com/research/papers/2015-discoveries.html 54 | http://example.com/research/papers/2015-discoveries.pdf 55 | http://example.org/reprints/example-com-2015-discoveries.pdf 56 | urn:x-pdf:c83fa94bd1d522276a32f81682a43d29 57 | urn:doi:10.1000/12345 58 | 59 | This package is responsible for defining URI normalization routines for use 60 | elsewhere in the Hypothesis application. URI expansion is handled by 61 | :py:func:`h.storage.expand_uri`. 62 | """ 63 | import re 64 | from urllib.parse import ( 65 | SplitResult, 66 | parse_qsl, 67 | quote, 68 | quote_plus, 69 | unquote, 70 | unquote_plus, 71 | urlparse, 72 | urlsplit, 73 | ) 74 | 75 | URL_SCHEMES = {"http", "https"} 76 | 77 | # List of regular expressions matching the names of query parameters that we 78 | # strip from URLs as part of normalization. 79 | BLACKLISTED_QUERY_PARAMS = [ 80 | re.compile(regex) 81 | for regex in ( 82 | # Google AdWords tracking identifier. Reference: 83 | # 84 | # https://support.google.com/analytics/answer/2938246?hl=en 85 | # 86 | r"^gclid$", 87 | # Google Analytics campaigns. Reference: 88 | # 89 | # https://support.google.com/analytics/answer/1033867?hl=en 90 | # 91 | r"^utm_(campaign|content|medium|source|term)$", 92 | # WebTrends Analytics query params. Reference: 93 | # 94 | # http://help.webtrends.com/en/analytics10/#qpr_about.html 95 | # 96 | r"^WT\..+$", 97 | # Amazon security access token. Reference: 98 | # 99 | # https://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html 100 | # 101 | r"(?i)^x-amz-security-token$", 102 | # 103 | # Google Drive resource key. Reference: 104 | # 105 | # https://support.google.com/a/answer/10685032 106 | r"^resourcekey$", 107 | ) 108 | ] 109 | 110 | # From RFC3986. The ABNF for path segments is 111 | # 112 | # path-abempty = *( "/" segment ) 113 | # ... 114 | # segment = *pchar 115 | # ... 116 | # pchar = unreserved / pct-encoded / sub-delims / ":" / "@" 117 | # ... 118 | # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 119 | # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" 120 | # / "*" / "+" / "," / ";" / "=" 121 | # 122 | # Taken together, this implies the following set of "unreserved" characters for 123 | # path segments (excluding ALPHA and DIGIT which are handled already). 124 | UNRESERVED_PATHSEGMENT = "-._~:@!$&'()*+,;=" 125 | 126 | # From RFC3986. The ABNF for query strings is 127 | # 128 | # query = *( pchar / "/" / "?" ) 129 | # 130 | # Where the definition of pchar is as given above. 131 | # 132 | # We exclude "&" and ";" from both names and values, and "=" from names, as 133 | # they are used as delimiters in HTTP URL query strings. In addition, "+" is 134 | # used to denote the space character, so for legacy reasons this is also 135 | # excluded. 136 | UNRESERVED_QUERY_NAME = "-._~:@!$'()*," 137 | UNRESERVED_QUERY_VALUE = "-._~:@!$'()*,=" 138 | 139 | # The string that gets prefixed onto a URI if you paste the URI into our Via 140 | # form. For example pasting https://example.com would 141 | # redirect your browser to https://via.hypothes.is/https://example.com. 142 | VIA_PREFIX = "https://via.hypothes.is/" 143 | 144 | 145 | def normalize(uristr): 146 | """ 147 | Translate the given URI into a normalized form. 
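    For example (a sketch of the behaviour implemented below, assuming the
    default BLACKLISTED_QUERY_PARAMS):
    'HTTP://exAMPle.com:80/foo/?b=2&a=1&utm_source=x#frag' normalizes to
    'httpx://example.com/foo?a=1&b=2'.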
148 | 149 | :type uristr: unicode 150 | :rtype: unicode 151 | """ 152 | 153 | # Strip proxy prefix for proxied URLs 154 | for scheme in URL_SCHEMES: 155 | if uristr.startswith(VIA_PREFIX + scheme + ":"): 156 | uristr = uristr[len(VIA_PREFIX) :] 157 | break 158 | 159 | # Try to extract the scheme 160 | uri = urlsplit(uristr) 161 | 162 | # If this isn't a URL, we don't perform any normalization 163 | if uri.scheme.lower() not in URL_SCHEMES: 164 | return uristr 165 | 166 | # Don't perform normalization on URLs with no hostname. 167 | if uri.hostname is None: 168 | return uristr 169 | 170 | scheme = _normalize_scheme(uri) 171 | netloc = _normalize_netloc(uri) 172 | path = _normalize_path(uri) 173 | query = _normalize_query(uri) 174 | fragment = None 175 | 176 | uri = SplitResult(scheme, netloc, path, query, fragment) 177 | 178 | return uri.geturl() 179 | 180 | 181 | def origin(url): 182 | """ 183 | Return a copy of ``url`` with the path, query string and fragment removed. 184 | 185 | ``url`` is assumed to be an HTTP(S) URL. 186 | """ 187 | url_parts = urlsplit(url) 188 | return SplitResult(url_parts.scheme, url_parts.netloc, "", "", "").geturl() 189 | 190 | 191 | def _normalize_scheme(uri): 192 | scheme = uri.scheme 193 | 194 | if scheme in URL_SCHEMES: # pragma: no cover 195 | scheme = "httpx" 196 | 197 | return scheme 198 | 199 | 200 | def _normalize_netloc(uri): 201 | netloc = uri.netloc 202 | ipv6_hostname = "[" in netloc and "]" in netloc 203 | 204 | username = uri.username 205 | password = uri.password 206 | hostname = uri.hostname 207 | port = uri.port 208 | 209 | # Normalise hostname to lower case 210 | hostname = hostname.lower() 211 | 212 | # Remove port if default for the scheme 213 | if uri.scheme == "http" and port == 80: 214 | port = None 215 | elif uri.scheme == "https" and port == 443: 216 | port = None 217 | 218 | # Put it all back together again... 219 | userinfo = None 220 | if username is not None: 221 | userinfo = username 222 | if password is not None: 223 | userinfo += ":" + password 224 | 225 | if ipv6_hostname: 226 | hostname = "[" + hostname + "]" 227 | 228 | hostinfo = hostname 229 | if port is not None: 230 | hostinfo += ":" + str(port) 231 | 232 | if userinfo is not None: 233 | netloc = "@".join([userinfo, hostinfo]) 234 | else: 235 | netloc = hostinfo 236 | 237 | return netloc 238 | 239 | 240 | def _normalize_path(uri): 241 | path = uri.path 242 | 243 | while path.endswith("/"): 244 | path = path[:-1] 245 | 246 | segments = path.split("/") 247 | segments = [_normalize_pathsegment(s) for s in segments] 248 | path = "/".join(segments) 249 | 250 | return path 251 | 252 | 253 | def _normalize_pathsegment(segment): 254 | return quote(unquote(segment), safe=UNRESERVED_PATHSEGMENT) 255 | 256 | 257 | def _normalize_query(uri): 258 | query = uri.query 259 | 260 | try: 261 | items = parse_qsl(query, keep_blank_values=True, strict_parsing=True) 262 | except ValueError: 263 | # If we can't parse the query string, we better preserve it as it was. 
264 | return query 265 | 266 | # Python sorts are stable, so preserving relative ordering of items with 267 | # the same key doesn't require any work from us 268 | items = sorted(items, key=lambda x: x[0]) 269 | 270 | # Remove query params that are blacklisted 271 | items = [i for i in items if not _blacklisted_query_param(i[0])] 272 | 273 | # Normalise percent-encoding for query items 274 | query = _normalize_queryitems(items) 275 | 276 | return query 277 | 278 | 279 | def _normalize_queryitems(items): 280 | segments = [ 281 | "=".join([_normalize_queryname(i[0]), _normalize_queryvalue(i[1])]) 282 | for i in items 283 | ] 284 | return "&".join(segments) 285 | 286 | 287 | def _normalize_queryname(name): 288 | return quote_plus(unquote_plus(name), safe=UNRESERVED_QUERY_NAME) 289 | 290 | 291 | def _normalize_queryvalue(value): 292 | return quote_plus(unquote_plus(value), safe=UNRESERVED_QUERY_VALUE) 293 | 294 | 295 | def _blacklisted_query_param(string): 296 | """Return True if the given string matches any BLACKLISTED_QUERY_PARAMS.""" 297 | return any(patt.match(string) for patt in BLACKLISTED_QUERY_PARAMS) 298 | 299 | 300 | def render_url_template(template, example_url): 301 | """ 302 | Update a URL template to have the same scheme and host as the example. 303 | 304 | This function is primarily used in development to support creating 305 | absolute links to h or other Hypothesis services which work when h is 306 | accessed from the same system (where the h dev server is "localhost:") 307 | or a different device (when the h dev server is "machine-name.local:"). 308 | """ 309 | parsed = urlparse(example_url) 310 | 311 | url = template.replace("{current_host}", parsed.hostname) 312 | url = url.replace("{current_scheme}", parsed.scheme) 313 | return url 314 | -------------------------------------------------------------------------------- /scibot/utils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from .uri import normalize as uri_normalize 3 | from pyontutils.utils import Async, deferred, chunk_list, anyMembers, noneMembers, makeSimpleLogger 4 | try: 5 | breakpoint 6 | except NameError: 7 | from IPython import embed as breakpoint 8 | 9 | log = makeSimpleLogger('scibot') 10 | logd = log.getChild('data') 11 | 12 | from hyputils.hypothesis import log as _hlog 13 | _hlog.removeHandler(_hlog.handlers[0]) 14 | _hlog.addHandler(log.handlers[0]) 15 | 16 | 17 | class ResolutionError(Exception): 18 | """ something messed up """ 19 | 20 | 21 | def DOI(doi): 22 | return 'https://doi.org/' + doi 23 | 24 | 25 | def PMID(pmid): 26 | return pmid.replace('PMID:', 'https://www.ncbi.nlm.nih.gov/pubmed/') 27 | 28 | 29 | def rrid_from_citation(citation): 30 | _, suffix_stuff = citation.split('RRID:') 31 | suffix = suffix_stuff.rstrip(')') 32 | return 'RRID:' + suffix 33 | 34 | 35 | def get_pmid_from_url(url): 36 | if anyMembers(url, 37 | 'www.ncbi.nlm.nih.gov/pubmed/', 38 | 'europepmc.org/abstract/MED/'): 39 | # TODO validate the suffix 40 | _, suffix = url.rsplit('/', 1) 41 | return 'PMID:' + suffix 42 | 43 | 44 | def zap_deleted(get_annos): 45 | annos = get_annos() 46 | new_annos = get_annos.get_annos_from_api(len(annos), 200) 47 | n_deleted = len([a for a in new_annos if a in annos]) 48 | print('there are', n_deleted, 'potentially deleted annotations') 49 | missing = [] 50 | h = get_annos.h() 51 | 52 | def thing(id): 53 | return id, h.head_annotation(id).ok 54 | 55 | # work backwards to cull deleted annotations 56 | size = 500 57 | n_chunks = 
len(annos) // size 58 | for i, anno_chunk in enumerate(chunk_list(list(reversed(annos)), size)): 59 | if i < 10: 60 | continue 61 | print('chunk size', size, 'number', i + 1 , 'of', n_chunks, 'found', len(missing)) 62 | if len(missing) >= n_deleted: 63 | break 64 | responses = Async(25)(deferred(thing)(a.id) for a in anno_chunk) 65 | missing += [id for id, ok in responses if not ok] 66 | 67 | # TODO actually remove them 68 | breakpoint() 69 | 70 | 71 | def resolution_chain(iri): 72 | #doi = doi # TODO 73 | s = requests.Session() 74 | head = requests.head(iri) 75 | yield head.url 76 | while head.is_redirect and head.status_code < 400: # FIXME redirect loop issue 77 | yield head.next.url 78 | head = s.send(head.next) 79 | yield head.url 80 | if not head.is_redirect: 81 | break 82 | 83 | if head.status_code >= 400: 84 | raise ResolutionError(f'Nothing found at {head.url}\n') 85 | 86 | 87 | bad_uris = ('/articles/6-124/v2', # FIXME don't hardcode this >_< 88 | '//bmcbiol.biomedcentral.com/articles/10.1186/s12915-016-0257-2') 89 | 90 | 91 | def uri_normalization(uri): 92 | """ NOTE: this does NOT produce uris """ 93 | try: 94 | # strip hypothesis extension prefix 95 | if uri.startswith('chrome-extension://bjfhmglciegochdpefhhlphglcehbmek/content/web/viewer.html?file='): 96 | junk, uri = uri.split('=', 1) 97 | 98 | # universal fixes 99 | no_fragment, *_frag = uri.rsplit('#', 1) 100 | no_trailing_slash = no_fragment.rstrip('/') # annoying 101 | _scheme, no_scheme = no_trailing_slash.split('://', 1) 102 | 103 | # special cases 104 | if 'frontiersin.org' in no_scheme: 105 | # og:url on frontiers is incorrect 106 | no_scheme = no_scheme.replace('article/', 'articles/') 107 | elif 'fasebj.org' in no_scheme: # FIXME this one has _all_ the variants :/ 108 | no_scheme = (no_scheme 109 | .replace('.abstract', '') 110 | .replace('.full', '') 111 | .replace('.pdf', '') 112 | ) 113 | elif no_scheme.endswith('?needAccess=true'): 114 | no_scheme = no_scheme[:-len('?needAccess=true')] 115 | elif '?systemMessage' in no_scheme: 116 | no_scheme, junk = no_scheme.rsplit('?systemMessage', 1) 117 | 118 | # specific fixes 119 | if anyMembers(no_scheme, 120 | 'acs.org', 121 | 'ahajournals.org', 122 | 'biologicalpsychiatryjournal.com', 123 | 'ebiomedicine.com', 124 | 'fasebj.org', 125 | 'frontiersin.org', 126 | 'future-science.com', 127 | 'hindawi.com', 128 | 'ieee.org', 129 | 'jclinepi.com', 130 | 'jpeds.com', 131 | 'liebertpub.com', 132 | 'mitpressjournals.org', 133 | 'molbiolcell.org', 134 | 'molmetab.com', 135 | 'neurobiologyofaging.org', 136 | 'physiology.org', 137 | 'sagepub.com', 138 | 'sciencedirect.com', 139 | 'tandfonline.com', 140 | 'theriojournal.com', 141 | 'wiley.com',): 142 | # NOTE not all the above hit all of these 143 | # almost all still resolve 144 | normalized = (no_scheme 145 | .replace('/abstract', '') 146 | .replace('/abs', '') 147 | .replace('/fulltext', '') 148 | .replace('/full', '') 149 | .replace('/pdf', '')) 150 | #elif ('sciencedirect.com' in no_scheme): 151 | #normalized = (no_scheme 152 | #.replace('/abs', '')) 153 | elif ('cell.com' in no_scheme): 154 | normalized = (no_scheme # FIXME looks like cell uses /abstract in og:url 155 | .replace('/abstract', '/XXX') 156 | .replace('/fulltext', '/XXX')) 157 | elif 'jneurosci.org' in no_scheme: 158 | # TODO content/early -> resolution_chain(doi) 159 | normalized = (no_scheme 160 | .replace('.short', '') 161 | .replace('.long', '') 162 | .replace('.full', '') 163 | .replace('.pdf', '') 164 | # note .full.pdf is a thing 165 | ) 166 | elif 
'pnas.org' in no_scheme: 167 | normalized = (no_scheme 168 | .replace('.short', '') 169 | .replace('.long', '') 170 | .replace('.full', '')) 171 | elif 'mdpi.com' in no_scheme: 172 | normalized = (no_scheme 173 | .replace('/htm', '')) 174 | elif 'f1000research.com' in no_scheme: 175 | # you should be ashamed of yourselves for being in here for this reason 176 | normalized, *maybe_version = no_scheme.rsplit('/v', 1) 177 | elif 'academic.oup.com' in no_scheme: 178 | normalized, *maybesr = no_scheme.rsplit('?searchresult=', 1) 179 | _normalized, maybe_junk = normalized.rsplit('/', 1) 180 | numbers = '0123456789' 181 | if (maybe_junk[0] not in numbers or # various ways to detect the human readable junk after the id 182 | maybe_junk[-1] not in numbers or 183 | '-' in maybe_junk or 184 | len(maybe_junk) > 20): 185 | normalized = _normalized 186 | elif anyMembers(no_scheme, 187 | 'jci.org', 188 | 'nature.com'): 189 | # cases where safe to remove query fragment 190 | normalized, *_query = no_scheme.rsplit('?', 1) 191 | normalized, *table_number = normalized.rsplit('/tables/', 1) 192 | elif 'pubmed/?term=' in no_scheme and noneMembers(no_scheme, ' ', '+'): 193 | normalized = no_scheme.replace('?term=', '') 194 | elif 'nih.gov/pubmed/?' in no_scheme: 195 | # FIXME scibot vs client norm? 196 | normalized = no_scheme.replace(' ', '+') 197 | elif 'govhttp' in no_scheme: 198 | # lol oh dear 199 | hrm, oops = no_scheme.split('govhttp') 200 | ded, wat = oops.split('//', 1) 201 | blargh, suffix = wat.split('/', 1) 202 | normalized = hrm + 'gov/pmc/' + suffix 203 | elif 'table/undtbl' in no_scheme: 204 | normalized, table_number = no_scheme.rsplit('table/undtbl') 205 | elif anyMembers(no_scheme, 206 | 'index.php?', 207 | ): 208 | # cases where we just use hypothes.is normalization 209 | _scheme, normalized = uri_normalize(uri).split('://') # FIXME h dependency 210 | else: 211 | normalized = no_scheme 212 | 213 | 'onlinelibrary.wiley.com/doi/10.1002/cne.23727?wol1URL=/doi/10.1002/cne.23727®ionCode=US-CA&identityKey=e2523300-b934-48c9-b08e-940de05d7335' 214 | 'www.jove.com/video/55441/?language=Japanese' 215 | 'www.nature.com/neuro/journal/v19/n5/full/nn.4282.html' 216 | 'www.nature.com/cr/journal/vaop/ncurrent/full/cr201669a.html' 217 | 'https://www.nature.com/articles/cr201669' 218 | 219 | #{'www.ingentaconnect.com/content/umrsmas/bullmar/2017/00000093/00000002/art00006': 220 | #[OntId('DOI:10.5343/bms.2016.1044'), OntId('DOI:info:doi/10.5343/bms.2016.1044')]} 221 | 222 | # pmid extract from pmc 223 | # 224 | return normalized 225 | 226 | 227 | except ValueError as e: # split fail 228 | pdf_prefix = 'urn:x-pdf:' 229 | if uri.startswith(pdf_prefix): 230 | return uri 231 | elif uri in bad_uris: 232 | print('AAAAAAAAAAAAAAAAAAAAAAAAAAA', uri) 233 | return 'THIS URI IS GARBAGE AND THIS IS ITS NORMALIZED FORM' 234 | else: 235 | raise TypeError(uri) from e 236 | 237 | 238 | def disambiguate_uris(uris): 239 | dd = defaultdict(set) 240 | _ = [dd[uri_normalization(uri)].add(uri) for uri in uris if uri not in bad_uris] 241 | return dict(dd) 242 | 243 | 244 | class mproperty: 245 | def __init__(self, fget=None, fset=None, fdel=None, doc=None): 246 | if doc is None and fget is not None and hasattr(fget, "__doc__"): 247 | doc = fget.__doc__ 248 | self.__get = fget 249 | self.__set = fset 250 | self.__del = fdel 251 | self.__doc__ = doc 252 | if fget is not None: 253 | self._attr_name = '___' + fget.__name__ 254 | 255 | def __get__(self, inst, type=None): 256 | if inst is None: 257 | return self 258 | if self.__get is 
None: 259 | raise AttributeError('unreadable attribute') 260 | 261 | if not hasattr(inst, self._attr_name): 262 | result = self.__get(inst) 263 | setattr(inst, self._attr_name, result) 264 | return getattr(inst, self._attr_name) 265 | 266 | def __set__(self, inst, value): 267 | if self.__set is None: 268 | raise AttributeError('can\'t set attribute') 269 | delattr(inst, self._attr_name) 270 | return self.__set(inst, value) 271 | 272 | def __delete__(self, inst): 273 | if self.__del is None: 274 | raise AttributeError('can\'t delete attribute') 275 | delattr(inst, self._attr_name) 276 | return self.__del(inst) 277 | 278 | def mproperty_set(inst, func_name, value): 279 | if isinstance(func_name, str): 280 | property_name = '___' + func_name 281 | elif hasattr(func_name, '__name__'): 282 | property_name = '___' + func_name.__name__ 283 | else: 284 | raise TypeError(f'cannot derive a property name from {func_name!r}') 285 | setattr(inst, property_name, value) 286 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | [tool:pytest] 4 | testpaths=test 5 | addopts=--verbose --color=yes -W ignore 6 | [bdist_wheel] 7 | universal=1 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from setuptools import setup 3 | 4 | 5 | def find_version(filename): 6 | _version_re = re.compile(r"__version__ = ['\"](.*)['\"]") 7 | last = None # match python semantics 8 | for line in open(filename): 9 | version_match = _version_re.match(line) 10 | if version_match: 11 | return version_match.group(1) 12 | 13 | return last 14 | 15 | 16 | __version__ = find_version('scibot/__init__.py') 17 | 18 | with open('README.md', 'rt') as f: 19 | long_description = f.read() 20 | 21 | tests_require = ['pytest', 'pytest-runner'] 22 | setup(name='scibot', 23 | version=__version__, 24 | description='curation workflow automation and coordination', 25 | long_description=long_description, 26 | long_description_content_type='text/markdown', 27 | url='https://github.com/SciCrunch/scibot', 28 | author='Tom Gillespie', 29 | author_email='tgbugs@gmail.com', 30 | license='Apache 2.0', 31 | classifiers=[ 32 | 'Development Status :: 4 - Beta', 33 | #'License :: OSI Approved :: Apache 2', # pypi doesn't have v2 34 | 'Programming Language :: Python :: 3.6', 35 | 'Programming Language :: Python :: 3.7', 36 | 'Programming Language :: Python :: 3.8', 37 | 'Programming Language :: Python :: 3.9', 38 | 'Programming Language :: Python :: 3.10', 39 | 'Programming Language :: Python :: 3.11', 40 | 'Programming Language :: Python :: Implementation :: CPython', 41 | 'Programming Language :: Python :: Implementation :: PyPy', 42 | 'Operating System :: POSIX :: Linux', 43 | ], 44 | keywords='rrid curation biocuration hypothesis hypothes.is web annotation', 45 | packages=['scibot'], 46 | tests_require=tests_require, 47 | install_requires=[ 48 | 'beautifulsoup4', 49 | 'curio>=1.6', 50 | 'docopt', 51 | 'flask', 52 | #'gevent', 53 | #'gunicorn', 54 | #'hyputils[memex]>=0.0.4', 55 | 'hyputils>=0.0.4', 56 | "ipython; python_version < '3.7'", 57 | 'lxml', 58 | 'pyontutils>=0.1.4', 59 | ], 60 | extras_require={'dev':['pyontutils',], 61 | 'test': tests_require}, 62 | scripts=['bin/scibot-bookmarklet', 'bin/scibot-dashboard'], 63 | entry_points={ 64 | 'console_scripts': [ 65 | 'scibot-sync=scibot.sync:main' 66 | ], 67 | }, 68 | )
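# find_version() above returns the first match of a line of the form __version__ = '<value>'
# in scibot/__init__.py; for instance a (hypothetical) line __version__ = '1.2.3' would yield
# '1.2.3', and None (the `last` default) is returned if no such line exists.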
69 | -------------------------------------------------------------------------------- /sql/extensions.sql: -------------------------------------------------------------------------------- 1 | -- postgres scibot_test 2 | -- CONNECT TO scibot_test USER postgres; 3 | 4 | -- NOTE dev-db/postgresql uuid use flag is required 5 | 6 | CREATE EXTENSION "uuid-ossp"; -- keep this on public schema for safety 7 | -------------------------------------------------------------------------------- /sql/permissions.sql: -------------------------------------------------------------------------------- 1 | -- scibot-admin scibot_test 2 | -- CONNECT TO :database USER "scibot-admin"; 3 | 4 | GRANT CONNECT ON DATABASE :database TO "scibot-user"; 5 | GRANT USAGE ON SCHEMA scibot TO "scibot-user"; 6 | 7 | GRANT SELECT, INSERT, UPDATE ON ALL TABLES IN SCHEMA scibot TO "scibot-user"; -- tables includes views 8 | GRANT USAGE ON ALL SEQUENCES IN SCHEMA scibot TO "scibot-user"; 9 | -------------------------------------------------------------------------------- /sql/postgres.sql: -------------------------------------------------------------------------------- 1 | -- postgres postgres 2 | -- CONNECT TO postgres USER postgres; 3 | 4 | DO 5 | $body$ 6 | BEGIN 7 | IF NOT EXISTS ( SELECT * FROM pg_catalog.pg_user 8 | WHERE usename = 'scibot-user') THEN 9 | CREATE ROLE "scibot-user" LOGIN 10 | NOSUPERUSER INHERIT NOCREATEDB NOCREATEROLE; 11 | END IF; 12 | IF NOT EXISTS ( SELECT * FROM pg_catalog.pg_user 13 | WHERE usename = 'scibot-admin') THEN 14 | CREATE ROLE "scibot-admin" LOGIN 15 | NOSUPERUSER INHERIT NOCREATEDB NOCREATEROLE; 16 | END IF; 17 | END; 18 | $body$ language plpgsql; 19 | 20 | -- postgres postgres 21 | 22 | ALTER ROLE "scibot-admin" SET search_path = scibot, public; 23 | ALTER ROLE "scibot-user" SET search_path = scibot, public; 24 | 25 | -- postgres postgres 26 | 27 | DROP DATABASE IF EXISTS :database; 28 | 29 | -- postgres postgres 30 | 31 | CREATE DATABASE :database -- scibot 32 | WITH OWNER = 'scibot-admin' 33 | ENCODING = 'UTF8' 34 | TABLESPACE = pg_default 35 | LC_COLLATE = 'en_US.UTF-8' 36 | LC_CTYPE = 'en_US.UTF-8' 37 | CONNECTION LIMIT = -1; 38 | 39 | -------------------------------------------------------------------------------- /sql/schemas.sql: -------------------------------------------------------------------------------- 1 | -- interlex-admin interlex_test 2 | 3 | CREATE SCHEMA IF NOT EXISTS scibot; 4 | -------------------------------------------------------------------------------- /templates/_formhelpers.html: -------------------------------------------------------------------------------- 1 | {% macro render_field(field) %} 2 |
{{ field.label }} 3 |
{{ field(**kwargs)|safe }} 4 | {% if field.errors %} 5 |
    6 | {% for error in field.errors %} 7 |
  • {{ error }}
  • 8 | {% endfor %} 9 |
10 | {% endif %} 11 |
12 | {% endmacro %} 13 | -------------------------------------------------------------------------------- /templates/main.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | SciBot Curation Dashboard 5 | 6 | 11 | 12 | 13 | 14 | {{ navbar|safe }} 15 |
16 |

SciBot Curation Dashboard

17 | {{ var }}

18 | 19 | 20 | 23 | 26 | 29 | 32 | 35 | 38 | 41 | 44 | 45 |
21 | Missing ({{ nmissing }}) 22 | 24 | Unresolved ({{ nures }}) 25 | 27 | Incorrect ({{ incor }}) 28 | 30 | Papers ({{ npapers }}) 31 | 33 | No PMID ({{ nnopmid }}) 34 | 36 | No DOI ({{ nnodoi }}) 37 | 39 | No Annos ({{ nnoannos }}) 40 | 42 | All Problems ({{ allp }}) 43 |
46 | 47 |
48 | 49 | 50 | -------------------------------------------------------------------------------- /templates/results.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Search Results: 5 | 6 | 7 | {% with messages = get_flashed_messages() %} 8 | {% if messages %} 9 |
    10 | {% for message in messages %} 11 |
  • {{ message }}
  • 12 | {% endfor %} 13 |
14 | {% endif %} 15 | {% endwith %} 16 | 17 |

Search Results

18 | 19 | {{results}} 20 | 21 | 22 | -------------------------------------------------------------------------------- /templates/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Search 5 | 8 | 9 | 10 | 11 | {{ navbar|safe }} 12 |
13 |

Search

14 | 15 | {% with messages = get_flashed_messages() %} 16 | {% if messages %} 17 |
    18 | {% for message in messages %} 19 |
  • {{ message }}
  • 20 | {% endfor %} 21 |
22 | {% endif %} 23 | {% endwith %} 24 | 25 | {% from "_formhelpers.html" import render_field %} 26 |
27 |
28 | {{ render_field(form.select) }} 29 |

30 | {{ render_field(form.search) }} 31 |

32 |

33 |

34 |
35 | 36 | 37 | -------------------------------------------------------------------------------- /templates/table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | {{returnStr}} 16 |
Problem, PMID, Link, Annotated By, Notes
17 | 18 | 19 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/scibot/3badc8d1ca4f441958d5062d1c5ba925996d22a2/test/__init__.py -------------------------------------------------------------------------------- /test/test_extract.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from scibot.extract import clean_text, find_rrids 3 | 4 | 5 | class TestFind(unittest.TestCase): 6 | def test_regex(self): 7 | text = """ 8 | REAGENT or RESOURCE SOURCE IDENTIFIER 9 | Antibodies 10 | 11 | 12 | anti-mouse CD3, biotin, clone 17A2 BioLegend Cat# 100244; RRID: AB_2563947 13 | anti-mouse CD19, biotin, clone 6D5 BioLegend Cat# 115504; RRID: AB_312823 14 | anti-mouse Ly-6G and Ly-6C, biotin, clone RB6-8C5 BD Biosciences Cat# 553124; RRID: AB_394640 15 | anti-mouse CD16/CD32, biotin, clone 2.4G2 BD Biosciences Cat# 553143; RRID: AB_394658 16 | anti-mouse CD11b, biotin, clone M1/70 BD Biosciences Cat# 553309; RRID: AB_394773 17 | Bacterial and Virus Strains 18 | 19 | 20 | Lactobacillus plantarum (Lp 39 [IAM 12477]) ATCC ATCC14917 21 | 22 | 23 | Biological Samples 24 | 25 | 26 | Healthy adult intestine tissue Hospital Universitario de La Princesa (Madrid, Spain) N/A 27 | """ 28 | found = list(find_rrids(clean_text(text))) 29 | assert len(found) == 5 30 | 31 | -------------------------------------------------------------------------------- /test/test_resolver.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from scibot.services import rrid_resolver_xml 3 | 4 | 5 | class TestResolver(unittest.TestCase): 6 | def test_redirect(self): 7 | found_rrids = {} 8 | out = rrid_resolver_xml('%20rid_000041', found_rrids) 9 | 10 | -------------------------------------------------------------------------------- /test/test_routes.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from flask import request 3 | from scibot.bookmarklet import main as bookmarklet_main 4 | from test.testing_data import form_data 5 | 6 | 7 | def start_uri(uri): 8 | # nothing real is being done, so alway return not running 9 | return False 10 | 11 | 12 | class TestRoutes(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | cls.app = bookmarklet_main() 16 | cls.app.URL_LOCK.start_uri = start_uri 17 | print(list(cls.app.view_functions.keys())) 18 | 19 | def test_bookmarket(self): 20 | func = self.app.view_functions['bookmarklet'] 21 | with self.app.test_request_context('/bookmarket'): 22 | hrm = func() 23 | 24 | def test_rrid_post(self): 25 | func = self.app.view_functions['rrid'] 26 | with self.app.test_request_context('/rrid', method='POST'): 27 | request.form = form_data 28 | hrm = func() 29 | 30 | def test_rrid_options(self): 31 | func = self.app.view_functions['rrid'] 32 | with self.app.test_request_context('/rrid', method='OPTIONS'): 33 | hrm = func() 34 | -------------------------------------------------------------------------------- /test/test_sync.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from time import sleep 3 | from threading import Thread 4 | from curio import run 5 | from scibot.sync import manager, client, Locker, done 6 | from scibot.utils import makeSimpleLogger 7 | 8 | log = makeSimpleLogger('sync test 
log') 9 | host = 'localhost' 10 | port = 11111 11 | syncword = 'syncword!' 12 | 13 | 14 | def manager_main(): 15 | run(manager, (host, port), syncword) 16 | 17 | 18 | def client_main(uri='aaaaaaaaaaaaaaaaaaaaaaaaaa'): 19 | send = run(client, (host, port), syncword) 20 | URL_LOCK = Locker(send) 21 | while not done.is_set(): 22 | URL_LOCK.start_uri(uri) 23 | sleep(0.5) 24 | URL_LOCK.stop_uri(uri) 25 | sleep(1) 26 | 27 | 28 | def main(): 29 | manager_thread = Thread(target=manager_main, args=tuple()) 30 | manager_thread.start() 31 | n_clients = 7 32 | messages = [chr(x + 97) * 10 for x in range(n_clients)] 33 | cthreads = [] 34 | for i, msg in enumerate(messages): 35 | sleep(0.25) 36 | client_thread = Thread(target=client_main, args=(msg,)) 37 | cthreads.append(client_thread) 38 | client_thread.start() 39 | 40 | sleep(4) 41 | done.set() 42 | log.info('waiting on clients') 43 | for i, t in enumerate(cthreads): 44 | t.join() 45 | log.info(f'joined client thread {i}') 46 | 47 | log.info('waiting on manager') # this doesn't work 48 | manager_thread.join() 49 | 50 | 51 | class TestSync(unittest.TestCase): 52 | def test_all(self): 53 | main() 54 | -------------------------------------------------------------------------------- /test/testing_data.py: -------------------------------------------------------------------------------- 1 | form_data = {'body': ['\n' 2 | '

SciBot

\n' 3 | '

To install the bookmarklet, drag this link -- SciBot ' 7 | 'localhost:4443 -- to your bookmarks bar.

\n' 8 | "

If you need to copy/paste the bookmarklet's code into a " 9 | "bookmarklet, it's here:

\n" 10 | '\n' 11 | 'javascript:(function(){var xhr=new XMLHttpRequest();\n' 12 | '\n' 13 | "var params='uri='+location.href+\n" 14 | "'&head='+encodeURIComponent(document.head.innerHTML)+\n" 15 | "'&body='+encodeURIComponent(document.body.innerHTML)+\n" 16 | "'&data='+encodeURIComponent(document.body.innerText);\n" 17 | '\n' 18 | "xhr.open('POST','https://localhost:4443/rrid',true);\n" 19 | "xhr.setRequestHeader('Content-type','application/x-www-form-urlencoded');\n" 20 | "xhr.setRequestHeader('Access-Control-Allow-Origin','*');\n" 21 | "xhr.onreadystatechange=function(){if(xhr.readyState==4)console.log('rrids: " 22 | "'+xhr.responseText)};\n" 23 | 'xhr.send(params)}());\n' 24 | '\n' 25 | '\n' 26 | '\n'], 27 | 'data': ['SciBot\n' 28 | '\n' 29 | 'To install the bookmarklet, drag this link -- SciBot localhost:4443 ' 30 | '-- to your bookmarks bar.\n' 31 | '\n' 32 | "If you need to copy/paste the bookmarklet's code into a " 33 | "bookmarklet, it's here:\n" 34 | '\n' 35 | 'javascript:(function(){var xhr=new XMLHttpRequest(); var ' 36 | "params='uri='+location.href+ " 37 | "'&head='+encodeURIComponent(document.head.innerHTML)+ " 38 | "'&body='+encodeURIComponent(document.body.innerHTML)+ " 39 | "'&data='+encodeURIComponent(document.body.innerText); " 40 | "xhr.open('POST','https://localhost:4443/rrid',true); " 41 | "xhr.setRequestHeader('Content-type','application/x-www-form-urlencoded'); " 42 | "xhr.setRequestHeader('Access-Control-Allow-Origin','*'); " 43 | "xhr.onreadystatechange=function(){if(xhr.readyState==4)console.log('rrids: " 44 | "'+xhr.responseText)}; xhr.send(params)}());"], 45 | 'head': ['\n' 46 | '\n' 51 | 'SciBot bookmarklet'], 52 | 'uri': ['https://localhost:4443/bookmarklet']} 53 | --------------------------------------------------------------------------------