├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Pipfile
├── README.md
├── bin
├── scibot-bookmarklet
├── scibot-dashboard
└── scibot-dbsetup
├── docs
├── architecture.graphml
├── setup.md
├── workflow-paper-id.graphml
├── workflow-rrid.graphml
└── workflows.org
├── resources
├── config_files
│ └── etc
│ │ ├── nginx
│ │ ├── nginx.conf
│ │ └── scibot.conf
│ │ ├── systemd
│ │ └── system
│ │ │ ├── env.conf
│ │ │ ├── scibot-bookmarklet-sync.service
│ │ │ ├── scibot-bookmarklet.service
│ │ │ ├── scibot-bookmarklet.socket
│ │ │ ├── scibot-dashboard.service
│ │ │ └── scibot-dashboard.socket
│ │ └── tmpfiles.d
│ │ └── scibot-bookmarklet.conf
├── rpmbuild
│ ├── .gitignore
│ ├── SOURCES
│ │ ├── env.conf
│ │ ├── nginx.conf
│ │ ├── scibot-bookmarklet-sync.service
│ │ ├── scibot-bookmarklet.conf
│ │ ├── scibot-bookmarklet.service
│ │ ├── scibot-bookmarklet.socket
│ │ └── scibot.conf
│ └── SPECS
│ │ └── scibot.spec
└── scripts
│ └── scibot-monkey-button.user.js
├── scibot
├── __init__.py
├── anno.py
├── bookmarklet.py
├── bookmarklet_server.py
├── check.py
├── cli.py
├── config.py
├── dash.py
├── dashboard.py
├── db.py
├── export.py
├── extract.py
├── get_annos.py
├── papers.py
├── release.py
├── release_report.py
├── rridxp.py
├── services.py
├── submit.py
├── sync.py
├── uri.py
├── utils.py
└── workflow.py
├── setup.cfg
├── setup.py
├── sql
├── extensions.sql
├── permissions.sql
├── postgres.sql
└── schemas.sql
├── templates
├── _formhelpers.html
├── main.html
├── results.html
├── search.html
└── table.html
└── test
├── __init__.py
├── test_extract.py
├── test_resolver.py
├── test_routes.py
├── test_sync.py
└── testing_data.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 | *.json
3 | *.log
4 | *.swp
5 | *.pyc
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2017 Jon Udell and Tom Gillespie
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include test *
2 | exclude .gitignore
3 | exclude .travis.yml
4 | exclude MANIFEST.in
5 | recursive-exclude * *.pyc
6 | recursive-exclude * *.swp
7 | recursive-exclude * *.swo
8 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | url = "https://pypi.org/simple"
3 | verify_ssl = true
4 | name = "pypi"
5 |
6 | [requires]
7 | python_version = '3.7'
8 |
9 | [packages]
10 | hyputils = {editable = true, ref = "master", git = "https://github.com/tgbugs/hyputils.git"}
11 | ontquery = {editable = true, ref = "master", git = "https://github.com/tgbugs/ontquery.git"}
12 | pyontutils = {editable = true, ref = "master", git = "https://github.com/tgbugs/pyontutils.git"}
13 | "e1839a8" = {path = ".", editable = true}
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SciBot
2 | [](https://pypi.org/project/scibot/)
3 |
4 | curation workflow automation and coordination
5 |
6 | * find RRIDs in articles
7 | * look them up in the SciCrunch resolver
8 | * create Hypothesis annotations that anchor to the RRIDs and display lookup results
9 |
10 | ## Getting Started
11 |
12 | * [Create a Hypothesis](https://web.hypothes.is/start/) account which will post the annotations.
13 | * Generate an api token at https://hypothes.is/profile/developer (must be logged in to see page).
14 | * Create a group to store the annotations at https://hypothes.is/groups/new (must be logged in to see page).
15 | * See [Setup on amazon](#setup-on-amazon)
16 |
17 | ## Capturing the bookmarklet
18 |
19 | Visit https://HOST:PORT/bookmarklet and follow the instructions.
20 |
21 | ## Using the bookmarklet
22 |
23 | Visit an article that contains RRIDs, click the bookmarklet
24 |
25 | ## Checking results in the browser
26 |
27 | The found RRIDs are logged to the JavaScript console
28 |
29 | ## Checking results on the server
30 |
31 | The found RRIDs are logged to timestamped files, along with the text and html of the article that was scanned for RRIDs
32 |
33 | ## Setup on Gentoo
34 | As root.
35 | ```bash
36 | layman -a tgbugs-overlay
37 | emerge scibot
38 | rc-config add scibot-bookmarklet default
39 | vim /etc/conf.d/scibot-bookmarklet # set username, group, api key, etc.
40 | /etc/init.d/scibot-bookmarklet start
41 | ```
42 |
43 | ## Setup on ubuntu 18.04
44 | Set `export PATH=~/.local/bin:${PATH}` in `~/.bashrc`
45 | 1. `sudo apt-get install build-essential python3-dev libxml2-dev libxslt1-dev`
46 | 2. `pip3 install --user pip pipenv`
47 | 3. `git clone https://github.com/SciCrunch/scibot.git`
48 | 4. `cd scibot && pipenv install --skip-lock`
 49 | 5. `pipenv shell` to get an environment with access to all the required libraries.
50 | 6. Inside the pipenv shell (after following steps 6-10 below) you should
51 | be able to run commands like `python scibot/export.py`.
52 |
53 | ## Setup on amazon
54 |
55 | Building the rpm
56 | ```
57 | pushd resources/rpmbuild
58 | rpmbuild --nodeps --define "_topdir `pwd`" -ba SPECS/scibot.spec
59 | popd
60 | ```
61 | Once this is done scp the rpm to the host.
62 | Also scp the ssl certs over, or use letsencrypt to obtain a cert.
 63 | If you are using a cert from another registrar you may need to
64 | bundle your certs `cat my-cert.crt existing-bundle.crt > scicrunch.io.crt`
65 | (see https://gist.github.com/bradmontgomery/6487319 for details)
 66 | See [scibot.conf](./resources/config_files/etc/nginx/scibot.conf)
67 | for details on where to put the certs after scping them over.
68 |
69 | Install steps run as root or via sudo.
70 | ```bash
71 | amazon-linux-extras install nginx1.12
72 | yum install scibot-9999-0.noarch.rpm # update with yum reinstall
73 | pip3 install pipenv wheel
74 | vim /etc/systemd/system/scibot-bookmarklet.service.d/env.conf # set api keys etc
75 | ```
76 |
77 | Install scibot codebase as the scibot user
78 | ```bash
79 | git clone https://github.com/SciCrunch/scibot.git
80 | pushd scibot
81 | pipenv install --skip-lock
82 | ```
83 | Hopefully this step will become simpler once we start pushing releases.
84 | `pipenv install scibot` or alternately it may also be possible to package
85 | everything we need in the rpm and only install that. With none of the other
86 | steps needed at all.
87 |
88 | Start services as root
89 | ```bash
90 | systemctl start nginx scibot-bookmarklet-sync scibot-bookmarklet
91 | ```
92 |
93 | ### Updating
94 | On the scibot host
95 | ```bash
96 | sudo su scibot -
97 | pushd scibot
98 | echo "$(date -Is) $(git rev-parse HEAD)" >> ~/previous-scibot-hashes
99 | git pull
 100 | mv Pipfile.lock "Pipfile.lock.$(date -Is)"
101 | ~/.local/bin/pipenv install --skip-lock
102 | ```
103 |
104 | Restart as root
105 | ```bash
106 | systemctl restart scibot-bookmarklet-sync scibot-bookmarklet
107 | ```
108 |
109 | ### manual setup
110 | Install steps
111 | 0. ssh in to the host that will serve the script
112 | 1. `sudo yum install gcc libxml2 libxml2-devel libxslt libxslt-devel python36 python36-devel python36-pip`
113 | 2. `sudo alternatives --set python /usr/bin/python3.6`
114 | 3. `sudo pip install pipenv`
115 | 4. `git clone https://github.com/SciCrunch/scibot.git`
116 | 5. `cd scibot && python3.6 setup.py wheel && pipenv install dist/*.whl`
117 | 6. `export SCIBOT_USERNAME=someusername`
118 | 7. `export SCIBOT_GROUP=somegroupname`
119 | 8. `unset HISTFILE`
120 | 9. `export SCIBOT_API_TOKEN=sometoken`
121 | 10. `export SCIBOT_SYNC=somerandomnumber` (e.g. run `head -c 100 /dev/urandom | tr -dc 'a-zA-Z0-9'` every time)
122 | 11. create a screen session
123 | 12. in the screen session run `pipenv run scibot-server` you should create a link to the log files folder in ~/scibot/
124 | 13. get letsencrypt certs using certbot, follow directions [here](https://certbot.eff.org/docs/using.html) (prefer standalone)
125 |
126 |
127 | ## Development setup
128 | To set up scibot for development (for example if you want to run manual releases)
129 | 0. Install python3 and pip for your os (e.g. on macos use `brew`)
130 | 1. From your git folder run `git clone https://github.com/tgbugs/scibot.git`
131 | 2. `pushd scibot`
132 | 3. `pip3 install --user -e .` will install requirements and register the
133 | scibot folder that is under version control with python as the scibot module.
134 | 4. `popd`
135 |
136 | ## Dev server
137 | You should change `lol` to something less guessable even if it is only
138 | running on localhost.
139 |
140 | Run the following blocks in two separate terminals and connect to
141 | `https://localhost:4443/bookmarklet`. If you try `http` you will get
142 | a connection error.
143 |
144 | You will need to generate the self signed certs using openssl as
145 | described in the comments in [bookmarklet.py::main()](./scibot/bookmarklet.py).
146 | Alternately comment out the ssl wrapping of the socket and use `http`.
147 |
148 | ```bash
149 | SCIBOT_SYNC=lol python -m scibot.sync
150 | ```
151 |
152 | ```bash
153 | SCIBOT_USERNAME=scibot \
154 | SCIBOT_GROUP=dev-group \
155 | SCIBOT_GROUP2=dev-group \
156 | SCIBOT_GROUP_STAGING=__world__ \
157 | SCIBOT_API_TOKEN=hypothesis-api-key \
158 | SCIBOT_SYNC=lol \
159 | python -m scibot.bookmarklet
160 | ```
161 |
162 | ## If all else fails
163 | Make sure you have >=python3.6 and pip installed. Clone the repo and
164 | run `python setup.py develop --user`.
165 |
166 | ## CoC
167 | SciBot adheres to the Contributor Covenant:
168 | https://www.contributor-covenant.org/
169 |
--------------------------------------------------------------------------------
/bin/scibot-bookmarklet:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | gunicorn -b unix:/run/scibot-bookmarklet/socket \
3 | --pid /run/scibot-bookmarklet/pid \
4 | -n scibot-bookmarklet \
5 | -w 4 \
6 | -k gevent \
7 | -t 600 \
8 | --preload \
9 | --capture-output \
10 | --log-level debug \
11 | scibot.bookmarklet_server:app
12 |
--------------------------------------------------------------------------------
/bin/scibot-dashboard:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | gunicorn -b unix:/run/scibot-dashboard/socket --pid /run/scibot-dashboard/pid -n scibot-dashboard -w 4 -k gevent -t 600 --preload --log-level debug scibot.dash:app
3 | #gunicorn -b localhost:5000 -n scibot -w 4 -k gevent -t 600 --log-level debug server:app
4 |
--------------------------------------------------------------------------------
/bin/scibot-dbsetup:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # scibot-dbsetup [PORT] [DATABASE]
3 |
4 | SOURCE="${BASH_SOURCE[0]}"
5 | while [ -h "$SOURCE" ]; do # resolve all symlinks
6 | DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
7 | SOURCE="$(readlink "$SOURCE")"
8 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # resolve relative symlinks
9 | done
10 | ABS_PATH="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
11 |
12 | SQL="${ABS_PATH}/../sql/"
13 | RESOURCES="${ABS_PATH}/../resources/"
14 |
15 | if [ -z $1 ]; then
16 | PORT=5432
17 | else
18 | PORT=$1
19 | fi
20 |
21 | if [ -z $2 ]; then
22 | DATABASE=scibot_test
23 | else
24 | DATABASE=$2
25 | fi
26 |
27 | # postgres setup
28 | psql -U postgres -h localhost -p $PORT -d postgres -f "${SQL}/postgres.sql" -v database=$DATABASE &&
29 | psql -U postgres -h localhost -p $PORT -d $DATABASE -f "${SQL}/extensions.sql" &&
30 |
31 | # scibot-admin setup
32 | psql -U scibot-admin -h localhost -p $PORT -d $DATABASE -f "${SQL}/schemas.sql"
33 | #psql -U scibot-admin -h localhost -p $PORT -d $DATABASE -f "${SQL}/permissions.sql" -v database=$DATABASE # FIXME this has to be run after populating the database via python
34 |
--------------------------------------------------------------------------------
/docs/setup.md:
--------------------------------------------------------------------------------
1 | # Database setup
2 | ```bash
3 | export DBNAME=scibot_ASDF # WARNING this WILL overwrite existing databases
4 | scibot-dbsetup 5432 ${DBNAME} # initial db, user, extension, and schema creation
5 | scibot db-init ${DBNAME} # create the schema from the hypothesis orm code
6 | scibot api-sync ${DBNAME} # retrieve and load existing annotations
7 | ```
8 |
9 | # Installing services
10 | TODO
11 |
12 | # Starting services
13 | ## openrc
14 | ```bash
15 | /etc/init.d/scibot-ws-sync start
16 | ```
17 | ## systemd
18 | ```bash
19 | systemctl start scibot-ws-sync
20 | ```
21 |
--------------------------------------------------------------------------------
/docs/workflows.org:
--------------------------------------------------------------------------------
1 | * clear space and backup
2 | This is the workflow as it exists now, it is not remotely ideal.
3 |
4 | scibot.scicrunch.io
5 | #+begin_src bash
6 | pushd /var/lib/scibot/logs
7 | sudo gzip *.json
8 | sudo mv *.gz gzipped
9 | #+end_src
10 |
11 | orpheus
12 | #+begin_src bash
13 | DATE=2023-04-01
 14 | rsync --progress -vapX scibot.scicrunch.io:/var/lib/scibot/logs/gzipped/* /home/tom/files/scibot/${DATE}/
15 | pushd ${DATE}
16 | gunzip *.json.gz
17 | popd
18 | XZ_OPT=-e9 tar -cvJf ${DATE}.tar.xz ${DATE}
19 | # confirm backup
20 | #+end_src
21 |
22 | athena
23 | #+begin_src bash
24 | DATE=2023-04-01
25 | pushd ~/nas/logs/scibot-backup
26 | rsync --progress -vapX orpheus:/home/tom/files/scibot/${DATE}.tar.xz .
27 | #+end_src
28 |
29 | orpheus
30 | #+begin_src bash
31 | DATE=2023-04-01
32 | # rm ${DATE}/*.json
33 | # rmdir ${DATE}
34 | #+end_src
35 |
36 | scibot.scicrunch.io
37 | #+begin_src bash
38 | pushd /var/lib/scibot/logs
39 | # sudo rm gzipped/*.json.gz
40 | #+end_src
41 |
42 |
--------------------------------------------------------------------------------
/resources/config_files/etc/nginx/nginx.conf:
--------------------------------------------------------------------------------
1 | user nginx nginx;
2 | worker_processes 1;
3 |
4 | error_log /var/log/nginx/error_log info;
5 |
6 | events {
7 | worker_connections 1024;
8 | use epoll;
9 | }
10 |
11 | http {
12 | include /etc/nginx/mime.types;
13 | default_type application/octet-stream;
14 |
15 | log_format main
16 | '$remote_addr - $remote_user [$time_local] '
17 | '"$request" $status $bytes_sent '
18 | '"$http_referer" "$http_user_agent" '
19 | '"$gzip_ratio"';
20 |
21 | client_header_timeout 10m;
22 | client_body_timeout 10m;
23 | proxy_read_timeout 900s;
24 | send_timeout 10m;
25 |
26 | connection_pool_size 256;
27 | client_header_buffer_size 1k;
28 | large_client_header_buffers 4 2k;
29 | request_pool_size 4k;
30 |
31 | gzip on;
32 | gzip_http_version 1.0;
33 | gzip_proxied any;
34 | gzip_min_length 500;
35 | gzip_disable "MSIE [1-6]\.";
36 | gzip_types text/plain text/xml text/css
37 | text/comma-separated-values
38 | text/javascript
39 | text/json
40 | application/json
41 | application/x-javascript
42 | application/atom+xml;
43 |
44 | output_buffers 1 32k;
45 | postpone_output 1460;
46 |
47 | sendfile on;
48 | tcp_nopush on;
49 | tcp_nodelay on;
50 |
51 | keepalive_timeout 75 20;
52 |
53 | ignore_invalid_headers on;
54 |
55 | include /etc/nginx/scibot.conf;
56 |
57 | server {
58 | listen 80;
59 | listen [::]:80;
60 | server_name localhost;
61 |
62 | access_log /var/log/nginx/default.access_log main;
63 | error_log /var/log/nginx/default.error_log info;
64 | location / {
65 | return 404;
66 | }
67 | }
68 |
69 | server {
70 | listen 443;
71 | listen [::]:443;
72 | server_name localhost;
73 |
74 | access_log /var/log/nginx/default.ssl_access_log main;
75 | error_log /var/log/nginx/default.ssl_error_log info;
76 | location / {
77 | return 404;
78 | }
79 | }
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/resources/config_files/etc/nginx/scibot.conf:
--------------------------------------------------------------------------------
1 | upstream scibot-bookmarklet {
2 | # change this to point where it is actually running
3 | server unix:/run/scibot-bookmarklet/socket;
4 | }
5 |
6 | upstream scibot-dashboard {
7 | # change this to point where it is actually running
8 | server unix:/run/scibot-dashboard/socket;
9 | }
10 |
11 | server {
12 | listen 80;
13 | listen [::]:80;
14 | server_name scibot.scicrunch.io;
15 | return 301 https://$server_name$request_uri;
16 |
17 | access_log /var/log/nginx/scibot.scicrunch.io.access_log main;
18 | error_log /var/log/nginx/scibot.scicrunch.io.error_log info;
19 | }
20 |
21 | server {
22 | listen 443;
23 | listen [::]:443 ssl;
24 | server_name scibot.scicrunch.io;
25 | ssl on;
26 |
27 | # *.scicrunch.io certs
28 | ssl_certificate /etc/scicrunch.io/scicrunch.io.crt;
29 | ssl_certificate_key /etc/scicrunch.io/scicrunch.io.key;
30 |
31 | access_log /var/log/nginx/scibot.scicrunch.io.ssl_access_log main;
32 | error_log /var/log/nginx/scibot.scicrunch.io.ssl_error_log info;
33 |
34 | # from https://cipherli.st/
35 | # and https://raymii.org/s/tutorials/Strong_SSL_Security_On_nginx.html
36 |
37 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3;
38 | ssl_prefer_server_ciphers on;
39 | ssl_ciphers "EECDH+AESGCM:EDH+AESGCM:AES256+EECDH:AES256+EDH";
40 | ssl_ecdh_curve secp384r1;
41 | ssl_session_cache shared:SSL:10m;
42 | ssl_session_tickets off;
43 | ssl_stapling on;
44 | ssl_stapling_verify on;
45 | resolver 8.8.8.8 8.8.4.4 valid=300s;
46 | resolver_timeout 5s;
47 | # disable HSTS header for now
48 | #add_header Strict-Transport-Security "max-age=63072000; includeSubDomains; preload";
49 | add_header X-Frame-Options DENY;
50 | add_header X-Content-Type-Options nosniff;
51 | ssl_dhparam /etc/ssl/certs/dhparam.pem; # openssl dhparam -out /tmp/dhparam.pem 4096 # DO NOT RUN ON AMAZON scp it over
52 | location / {
53 | proxy_pass http://scibot-bookmarklet;
54 | client_max_body_size 20m;
55 | proxy_redirect off;
56 | proxy_set_header Host $host;
57 | proxy_set_header X-Real-IP $remote_addr;
58 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
59 | proxy_set_header X-Forwarded-Host $server_name;
60 | proxy_set_header X-Forwarded-Scheme $scheme;
61 | }
62 | location /dashboard {
63 | proxy_pass http://scibot-dashboard;
64 | client_max_body_size 20m;
65 | proxy_redirect off;
66 | proxy_set_header Host $host;
67 | proxy_set_header X-Real-IP $remote_addr;
68 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
69 | proxy_set_header X-Forwarded-Host $server_name;
70 | proxy_set_header X-Forwarded-Scheme $scheme;
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/resources/config_files/etc/systemd/system/env.conf:
--------------------------------------------------------------------------------
1 | SCIBOT_GROUP=
2 | SCIBOT_USERNAME=
3 | SCIBOT_API_TOKEN=
4 | SCIBOT_SYNC=
5 | SOURCE_LOG_LOC=/var/lib/scibot/logs
6 |
--------------------------------------------------------------------------------
/resources/config_files/etc/systemd/system/scibot-bookmarklet-sync.service:
--------------------------------------------------------------------------------
1 | [Unit]
 2 | Description=scibot bookmarklet sync daemon
3 | After=network.target
4 |
5 | [Service]
 6 | PIDFile=/run/scibot-bookmarklet-sync/pid
7 | User=scibot
8 | Group=scibot
9 | RuntimeDirectory=scibot-bookmarklet-sync
10 | WorkingDirectory=/var/lib/scibot/scibot
11 | EnvironmentFile=/etc/systemd/system/scibot-bookmarklet.service.d/env.conf
12 | ExecStart=/var/lib/scibot/.local/bin/pipenv run scibot-sync
13 | ExecReload=/bin/kill -s HUP $MAINPID
14 | ExecStop=/bin/kill -s TERM $MAINPID
15 | PrivateTmp=true
16 |
17 | [Install]
18 | WantedBy=multi-user.target
19 | WantedBy=scibot-bookmarklet.service
20 |
--------------------------------------------------------------------------------
/resources/config_files/etc/systemd/system/scibot-bookmarklet.service:
--------------------------------------------------------------------------------
1 | [Unit]
 2 | Description=scibot bookmarklet gunicorn daemon
3 | Requires=scibot-bookmarklet-sync.service
4 | After=network.target
5 |
6 | [Service]
 7 | PIDFile=/run/scibot-bookmarklet/pid
8 | User=scibot
9 | Group=scibot
10 | RuntimeDirectory=scibot-bookmarklet
11 | WorkingDirectory=/var/lib/scibot/scibot
12 | EnvironmentFile=/etc/systemd/system/scibot-bookmarklet.service.d/env.conf
13 | ExecStart=/var/lib/scibot/.local/bin/pipenv run gunicorn \
14 | --bind unix:/run/scibot-bookmarklet/socket \
15 | --name scibot-bookmarklet \
16 | --workers 4 \
17 | --worker-class gevent \
18 | --timeout 600 \
19 | --group scibot \
20 | --user scibot \
21 | --log-level DEBUG \
22 | --log-file /var/log/scibot/bookmarklet.log \
23 | --capture-output \
24 | scibot.bookmarklet_server:app
25 | ExecReload=/bin/kill -s HUP $MAINPID
26 | ExecStop=/bin/kill -s TERM $MAINPID
27 | PrivateTmp=true
28 |
29 | [Install]
30 | WantedBy=multi-user.target
31 |
--------------------------------------------------------------------------------
/resources/config_files/etc/systemd/system/scibot-bookmarklet.socket:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=scibot bookmarklet gunicorn socket
3 |
4 | [Socket]
5 | ListenStream=/run/scibot-bookmarklet/socket
6 |
7 | [Install]
8 | WantedBy=sockets.target
9 |
--------------------------------------------------------------------------------
/resources/config_files/etc/systemd/system/scibot-dashboard.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=scibot dashboard gunicorn daemon
3 | Requires=scibot-dashboard.socket
4 | After=network.target
5 |
6 | [Service]
7 | PIDFile=/run/scibot-dashboard/pid
8 | User={scibot-user}
9 | Group={scibot-user}
10 | RuntimeDirectory=scibot-dashboard
11 | WorkingDirectory=/home/{scibot-user}/run
12 | ExecStart=/home/{scibot-user}/.local/bin/pipenv run scibot-dashboard
13 | ExecReload=/bin/kill -s HUP $MAINPID
14 | ExecStop=/bin/kill -s TERM $MAINPID
15 | PrivateTmp=true
16 |
17 | [Install]
18 | WantedBy=multi-user.target
19 |
--------------------------------------------------------------------------------
/resources/config_files/etc/systemd/system/scibot-dashboard.socket:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=scibot dashboard gunicorn socket
3 |
4 | [Socket]
5 | ListenStream=/run/scibot-dashboard/socket
6 |
7 | [Install]
8 | WantedBy=sockets.target
9 |
--------------------------------------------------------------------------------
/resources/config_files/etc/tmpfiles.d/scibot-bookmarklet.conf:
--------------------------------------------------------------------------------
1 | d /run/scibot-bookmarklet 0755 scibot scibot -
2 | d /run/scibot-bookmarklet-sync 0755 scibot scibot -
3 |
--------------------------------------------------------------------------------
/resources/rpmbuild/.gitignore:
--------------------------------------------------------------------------------
1 | BUILD/*
2 | RPMS/*
3 | SRPMS/*
4 |
--------------------------------------------------------------------------------
/resources/rpmbuild/SOURCES/env.conf:
--------------------------------------------------------------------------------
1 | ../../config_files/etc/systemd/system/env.conf
--------------------------------------------------------------------------------
/resources/rpmbuild/SOURCES/nginx.conf:
--------------------------------------------------------------------------------
1 | ../../config_files/etc/nginx/nginx.conf
--------------------------------------------------------------------------------
/resources/rpmbuild/SOURCES/scibot-bookmarklet-sync.service:
--------------------------------------------------------------------------------
1 | ../../config_files/etc/systemd/system/scibot-bookmarklet-sync.service
--------------------------------------------------------------------------------
/resources/rpmbuild/SOURCES/scibot-bookmarklet.conf:
--------------------------------------------------------------------------------
1 | ../../config_files/etc/tmpfiles.d/scibot-bookmarklet.conf
--------------------------------------------------------------------------------
/resources/rpmbuild/SOURCES/scibot-bookmarklet.service:
--------------------------------------------------------------------------------
1 | ../../config_files/etc/systemd/system/scibot-bookmarklet.service
--------------------------------------------------------------------------------
/resources/rpmbuild/SOURCES/scibot-bookmarklet.socket:
--------------------------------------------------------------------------------
1 | ../../config_files/etc/systemd/system/scibot-bookmarklet.socket
--------------------------------------------------------------------------------
/resources/rpmbuild/SOURCES/scibot.conf:
--------------------------------------------------------------------------------
1 | ../../config_files/etc/nginx/scibot.conf
--------------------------------------------------------------------------------
/resources/rpmbuild/SPECS/scibot.spec:
--------------------------------------------------------------------------------
1 | # you must build this with --nodeps if you are not on a RHEL alike
2 | %define _unitdir /lib/systemd/system
3 | %define _etcdir /etc/systemd/system
4 |
5 | # building on gentoo makes this /var/lib for some reason :/
6 | %define _localstatedir /var
7 |
8 | %define scibot_user scibot
9 | %define scibot_group %{scibot_user}
10 | %define scibot_home %{_localstatedir}/lib/scibot
11 | %define scibot_log %{_localstatedir}/log/scibot
12 | %define scibot_source_log %{scibot_home}/logs
13 |
14 | %define name scibot
15 | %define version 9999
16 | Name: %{name}
17 | Version: %{version}
18 | Release: 0
19 | Summary: curation workflow automation and coordination
20 | License: Apache-2.0
21 | Url: https://github.com/SciCrunch/scibot
22 | BuildArch: noarch
23 | BuildRequires: systemd
24 | BuildRequires: git
25 | Requires: gcc # eventually this should be a build requires
26 | Requires: bash
27 | Requires: nginx
28 | Requires: python3
29 | Requires: python3-devel # eventual build requires
30 | Requires(post): systemd
31 | Requires(preun): systemd
32 | Requires(postun): systemd
33 |
34 | Source1: scibot-bookmarklet.socket
35 | Source2: scibot-bookmarklet.service
36 | Source3: scibot-bookmarklet-sync.service
37 | Source4: env.conf
38 | Source5: scibot-bookmarklet.conf
39 | Source6: nginx.conf
40 | Source7: scibot.conf
41 |
42 | %description
43 | curation workflow automation and coordination
44 |
45 | %prep
46 |
47 | if [[ ! -d %{buildroot} ]]; then
48 | mkdir %{buildroot};
49 | fi
50 |
51 | %define gitroot scibot
52 | if [[ ! -d %{gitroot} ]]; then
53 | git clone https://github.com/SciCrunch/scibot.git
54 | fi
55 |
56 | %build
57 | #pushd %{gitroot}
58 | #python3 setup.py bdist_wheel
59 | #%py3_build
60 |
61 | %install
62 | install -p -D -m 644 %{SOURCE1} %{buildroot}/%{_unitdir}/scibot-bookmarklet.socket
63 | install -p -D -m 644 %{SOURCE2} %{buildroot}/%{_unitdir}/scibot-bookmarklet.service
64 | install -p -D -m 644 %{SOURCE3} %{buildroot}/%{_unitdir}/scibot-bookmarklet-sync.service
65 | install -p -D -m 600 %{SOURCE4} %{buildroot}/%{_etcdir}/scibot-bookmarklet.service.d/env.conf
66 | install -p -D -m 644 %{SOURCE5} %{buildroot}/etc/tmpfiles.d/scibot-bookmarklet.conf
67 | install -p -D -m 644 %{SOURCE6} %{buildroot}/etc/nginx/nginx.conf
68 | install -p -D -m 644 %{SOURCE7} %{buildroot}/etc/nginx/scibot.conf
69 | #%py3_install
70 |
71 | %pre
72 | getent group %{scibot_group} > /dev/null || groupadd -r %{scibot_group}
73 | getent passwd %{scibot_user} > /dev/null || \
74 | useradd -r -m -d %{scibot_home} -g %{scibot_group} \
75 | -s /bin/bash -c "scibot services" %{scibot_user}
76 | if [[ ! -d %{scibot_log} ]]; then
77 | mkdir %{scibot_log} # owner?
78 | chown %{scibot_user}:%{scibot_group} %{scibot_log}
79 | fi
80 | if [[ ! -d %{scibot_source_log} ]]; then
81 | mkdir %{scibot_source_log}
82 | chown %{scibot_user}:%{scibot_group} %{scibot_source_log}
83 | fi
84 |
85 | %post
86 | systemd-tmpfiles --create
87 | systemctl enable nginx
88 | systemctl enable scibot-bookmarklet
89 | systemctl enable scibot-bookmarklet-sync
90 |
91 | %clean
92 | rm -rf %{buildroot}
93 |
94 | %files
95 | %{_unitdir}/scibot-bookmarklet.socket
96 | %{_unitdir}/scibot-bookmarklet.service
97 | %{_unitdir}/scibot-bookmarklet-sync.service
98 | %{_etcdir}/scibot-bookmarklet.service.d/env.conf
99 | /etc/tmpfiles.d/scibot-bookmarklet.conf
100 | /etc/nginx/nginx.conf
101 | /etc/nginx/scibot.conf
102 |
103 | %changelog
104 | # skip this for now
105 |
--------------------------------------------------------------------------------
/resources/scripts/scibot-monkey-button.user.js:
--------------------------------------------------------------------------------
1 | // ==UserScript==
2 | // @name SciBot Button
3 | // @namespace https://github.com/SciCrunch/scibot/tree/master/resources/scripts
4 | // @description Run SciBot in a way that ignores CORS
5 | // @match *://*/*
6 | // @exclude *://*.google.com/*
7 | // @exclude *://*.github.com/*
8 | // @exclude *://github.com/*
9 | // @version 1.0
10 | // @grant GM_addStyle
11 | // ==/UserScript==
12 |
13 | var zNode = document.createElement ('div');
14 | zNode.innerHTML = '';
15 | zNode.setAttribute ('id', 'scibotButtonContainer');
16 | document.body.appendChild (zNode);
17 |
18 | //--- Activate the newly added button.
19 | document.getElementById ("runSciBot").addEventListener (
20 | "click", ButtonClickAction, false
21 | );
22 |
// Handle a click on the injected SciBot button: remove the button, then POST
// the current page (head/body html and plain text) to the scibot /rrid endpoint.
function ButtonClickAction (zEvent) {
    /*--- For our dummy action, we'll just add a line of text to the top
        of the screen.
    */
    // removing the container prevents double submission of the same page
    document.getElementById ("scibotButtonContainer").remove();
    var xhr=new XMLHttpRequest();
    // NOTE(review): location.href is not encodeURIComponent'd like the other
    // fields -- uris containing '&' or '=' would corrupt the form body; confirm
    var params=('uri=' + location.href +
                '&head=' + encodeURIComponent(document.head.innerHTML) +
                '&body=' + encodeURIComponent(document.body.innerHTML) +
                '&data=' + encodeURIComponent(document.body.innerText));
    xhr.open('POST', 'https://scibot.scicrunch.io/rrid', true);
    xhr.setRequestHeader('Content-type', 'application/x-www-form-urlencoded');
    // NOTE(review): Access-Control-Allow-Origin is a *response* header; setting
    // it on the request is a no-op -- likely copy/paste from the server side
    xhr.setRequestHeader('Access-Control-Allow-Origin', '*');
    xhr.onreadystatechange=function(){if(xhr.readyState==4) console.log('rrids: ' + xhr.responseText)};
    xhr.send(params)
}
39 |
40 | GM_addStyle ( `
41 | #scibotButtonContainer {
42 | position: absolute;
43 | top: 0;
44 | left: 0;
45 | font-size: 20px;
46 | background: orange;
47 | border: 3px outset black;
48 | margin: 5px;
49 | opacity: 0.9;
50 | z-index: 9999;
51 | padding: 5px 20px;
52 | }
53 | #runSciBot {
54 | cursor: pointer;
55 | }
56 | #scibotButtonContainer p {
57 | color: red;
58 | background: white;
59 | }
60 | ` );
61 |
--------------------------------------------------------------------------------
/scibot/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.3'
2 |
--------------------------------------------------------------------------------
/scibot/anno.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from hyputils.memex import models
3 | from hyputils.memex.util.uri import normalize as uri_normalize
4 | from hyputils.memex.models.document import update_document_metadata
5 | from hyputils.memex.schemas.annotation import CreateAnnotationSchema
6 | from pyontutils.utils import anyMembers, noneMembers
7 |
8 |
class FakeRequest:
    """Duck-typed stand-in for a pyramid request object.

    CreateAnnotationSchema only reads ``json`` and ``authenticated_userid``,
    so those two attributes are all we provide.
    """

    def __init__(self, json):
        self.authenticated_userid = json['user']
        self.json = json
13 |
14 |
def validate(j):
    """Run the hypothes.is CreateAnnotationSchema checks over raw anno json.

    Returns the validated appstruct produced by the schema.
    """
    schema = CreateAnnotationSchema(FakeRequest(j))
    return schema.validate(j)
20 |
21 |
def extract_extra(j):
    """Return the (id, created, updated) triple from an annotation dict."""
    return tuple(j[key] for key in ('id', 'created', 'updated'))
24 |
25 |
def make_anno(data, dbdocs):
    """Build a models.Annotation row from already-validated annotation data.

    data: dict of annotation fields (as produced by validate/quickload);
        must not contain a 'document' key, which would overwrite on batch load
    dbdocs: mapping of normalized target uri -> document row, used to link
        the annotation directly by id instead of calling the very slow
        update_document_metadata path

    Returns the (unpersisted) Annotation instance.
    """
    # FIXME for batch loads the per-annotation constructor overhead is large
    annotation = models.Annotation(**data)
    annotation.document_id = dbdocs[uri_normalize(annotation.target_uri)].id
    return annotation
55 |
56 |
def quickload(j):
    """ a quickload routine for json that comes from the hypothes.is api
        and that has already passed the json schema validate checks """

    selectors = [selector
                 for selector_sources in j['target']
                 if 'selector' in selector_sources
                 for selector in selector_sources['selector']]
    return {
        'id': j['id'],
        'created': j['created'],
        'updated': j['updated'],
        'extra': {},
        'groupid': j['group'],
        'references': j.get('references', []),
        # some time in august the 'hidden' field was dropped from the api
        'shared': not j.get('hidden', False),
        'tags': j['tags'],
        'target_selectors': selectors,
        'target_uri': j['uri'],  # FIXME check on this vs selectors
        'text': j['text'],
        'userid': j['user'],
    }
79 |
80 |
def doc(j):
    """Build document_meta/uri dicts for one annotation payload.

    FIXME this skips the normalize routines ...
    """
    uri = j['uri']
    meta = []
    if 'title' in j['document']:
        meta.append({'claimant': uri,
                     'type': 'title',  # FIXME see if more
                     'value': j['document']['title']})
    return {'document_meta_dicts': meta,
            'document_uri_dicts': [{'claimant': uri,
                                    'content_type': '',  # FIXME see if more
                                    'type': 'self-claim',  # FIXME see if more
                                    'uri': uri}]}
92 |
93 |
def mdoc(uri, claims):
    """Pair caller-supplied meta claims with the standard self-claim uri dict."""
    self_claim = {'claimant': uri,
                  'content_type': '',  # FIXME see if more
                  'type': 'self-claim',  # FIXME see if more
                  'uri': uri}
    return {'document_meta_dicts': claims,
            'document_uri_dicts': [self_claim]}
100 |
101 |
def add_doc_all(uri, created, updated, claims): # batch only run once
    """Create a Document plus its self-claim DocumentURI and one DocumentMeta
    per claim, all sharing the same created/updated timestamps.

    The DocumentURI and DocumentMeta instances attach themselves to ``doc``
    through the ``document=`` relationship, so only ``doc`` is returned;
    persisting it should cascade to the children -- TODO confirm cascade config.
    """
    doc = models.Document(created=created, updated=updated)
    # NOTE(review): duri is never used after construction; it is kept alive by
    # the relationship to doc -- confirm that is the intent
    duri = models.DocumentURI(document=doc, # how does this play out w/o creating explicitly?
                              claimant=uri,
                              uri=uri,
                              type='self-claim',
                              created=created,
                              updated=updated)
    #yield doc
    #yield duri
    for claim in claims:
        #yield
        models.DocumentMeta(document=doc,
                            created=created,
                            updated=updated,
                            # FIXME for this we may need to pull the latest??? or no
                            **claim)

    return doc
121 |
122 |
def quickuri(j):
    """Return (created, updated, claims) for an annotation's document,
    turning every key/value in j['document'] into a claim dict."""
    claims = [{'claimant': j['uri'], 'type': key, 'value': value}
              for key, value in j['document'].items()]
    return j['created'], j['updated'], claims
128 |
--------------------------------------------------------------------------------
/scibot/bookmarklet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """SciBot server implementation
3 |
4 | Usage:
5 | bookmarklet [options]
6 |
7 | Options:
8 | -s --sync-port=PORT the port that the sync services is running on
9 | """
10 |
11 | import re
12 | import csv
13 | import ssl
14 | import gzip
15 | import json
16 | from io import StringIO
17 | from typing import Callable, Iterable, Tuple, Any, Generator
18 | from pathlib import Path
19 | from datetime import datetime
20 | from curio import run
21 | from curio.channel import AuthenticationError
22 | from flask import Flask, request, abort
23 | from hyputils.hypothesis import HypothesisUtils
24 | from scibot.config import source_log_location
25 | from scibot.utils import log
26 | from scibot.export import export_impl, export_json_impl
27 |
28 | try:
29 | from scibot.workflow import curatorTags
30 | except ImportError:
31 | # FIXME don't want a hard rdflib dependency here
32 | curatorTags = lambda : []
33 |
34 | # logging
35 |
def write_stdout(target_uri, document, doi, pmid, found_rrids, head, body, text, h):
    """Log the doi and pmid for a processed page; the other arguments are
    accepted only to match the write_log signature and are ignored."""
    for message in (f'DOI:{doi}', pmid):
        log.info(message)
39 |
40 |
def write_log(target_uri, document, doi, pmid, found_rrids, head, body, text, h):
    """Dump a json record of one bookmarklet run to source_log_location.

    When every found RRID was 'Already Annotated' the (potentially large)
    head/body/text payloads are dropped from the record to save space.
    The filename embeds a second-resolution timestamp: rrid-YYYYMMDDTHHMMSS.json.
    """
    now = datetime.now().isoformat()[0:19].replace(':','').replace('-','')
    frv = list(set(found_rrids.values()))
    if len(frv) == 1 and frv[0] == 'Already Annotated':
        head, body, text = None, None, None

    # NOTE: this dict was previously named `log`, shadowing the module logger
    entry = {'target_uri': target_uri,
             'group': h.group,
             'doi': doi,
             'pmid': pmid,
             'found_rrids': found_rrids,
             'count': len(found_rrids),
             'head': head,
             'body': body,
             'text': text,
             'document': document,
             }
    fname = Path(source_log_location, f'rrid-{now}.json')
    with open(fname.as_posix(), 'wt') as f:
        json.dump(entry, f, sort_keys=True, indent=4)
60 |
61 | # types
62 |
63 | Found = Tuple[str, str, str, str]
64 | Finder = Callable[[str], Iterable[Found]]
65 | Checker = Callable[[Found], bool]
66 | Resolved = Tuple[str, int, str]
67 | Resolver = Callable[[Found], Resolved]
68 | Submitter = Callable[[Found, Resolved], Any]
69 | Processor = Callable[[str, str], Generator]
70 |
71 | # bookmarklet endpoint
72 |
73 | bookmarklet_base = r"""
74 | javascript:(function(){var xhr=new XMLHttpRequest();
75 |
76 | var params='uri='+location.href+
77 | '&head='+encodeURIComponent(document.head.innerHTML)+
78 | '&body='+encodeURIComponent(document.body.innerHTML)+
79 | '&data='+encodeURIComponent(document.body.innerText);
80 |
81 | xhr.open('POST','%s/%s',true);
82 | xhr.setRequestHeader('Content-type','application/x-www-form-urlencoded');
83 | xhr.setRequestHeader('Access-Control-Allow-Origin','*');
84 | xhr.onreadystatechange=function(){if(xhr.readyState==4)console.log('rrids: '+xhr.responseText)};
85 | xhr.send(params)}());
86 | """
87 |
88 | html_base = """
89 |
90 |
94 | SciBot bookmarklet
95 |
96 |
SciBot
97 |
To install the bookmarklet, drag this link -- SciBot %s -- to your bookmarks bar.
98 |
If you need to copy/paste the bookmarklet's code into a bookmarklet, it's here:
99 | %s
100 |
101 |
102 | """
103 |
104 |
def bookmarklet_wrapper(request, endpoint):
    """ Return text of the SciBot bookmarklet """
    # build the javascript: POST to https://<host>/<endpoint>
    normalized = 'https://' + request.host
    code = bookmarklet_base % (normalized, endpoint)
    # NOTE(review): the first replace looks like a no-op ('"' -> '"'); it
    # presumably escaped quotes for the href (e.g. to '&quot;') before the
    # repository dump mangled it -- confirm against the original source
    bookmarklet = code.replace('"', '"').replace('\n','')
    html = html_base % (bookmarklet, request.host.split('.', 1)[-1], code)
    return html
112 |
113 |
114 | # rrid endpoint
115 |
116 | from scibot.extract import process_POST_request, find_rrids as finder
117 | from scibot.check import check_already_submitted
118 | from scibot.services import existing_tags, get_pmid, rrid_resolver_xml
119 | from scibot.submit import annotate_doi_pmid, submit_to_h
120 |
121 |
def make_find_check_resolve_submit(finder: Finder, notSubmittedCheck: Checker,
                                   resolver: Resolver, submitter: Submitter) -> Processor:
    """Compose finder/checker/resolver/submitter into a single lazy pipeline.

    The returned generator function finds candidates in a text, skips those
    that were already submitted, resolves the rest, and yields the submitter's
    result for each one.
    """
    def processor(text: str) -> Generator:
        for found in finder(text):
            log.info(found)
            if not notSubmittedCheck(found):
                continue
            yield submitter(found, resolver(found))

    return processor
132 |
133 |
def pmid_logic(doi, pmid_from_source, target_uri=None, document=None, h=None, tags=None):
    """Reconcile the PMID scraped from the page with the one derived from the doi.

    Preference order: matching/source pmid, then doi-derived pmid, else None.
    When the two disagree both are annotated as errors and None is returned.
    """
    # TODO move the annotation of errors out of this
    pmid_from_doi = get_pmid(doi) if doi else None

    if pmid_from_source and pmid_from_doi and pmid_from_source != pmid_from_doi:
        # TODO responses -> db
        # TODO tag for marking errors explicitly without the dashboard?
        annotate_doi_pmid(target_uri, document, None, pmid_from_doi, h, tags, 'ERROR\nPMID from DOI')
        annotate_doi_pmid(target_uri, document, None, pmid_from_source, h, tags, 'ERROR\nPMID from source')
        return None

    return pmid_from_source or pmid_from_doi or None
158 |
159 |
def rrid_POST(request, h, logloc, URL_LOCK):
    """Process one bookmarklet POST end to end.

    Extracts the page and its ids, reconciles doi/pmid, then runs the
    find/check/resolve/submit pipeline over the cleaned text, annotating via
    ``h``. Returns a plain-text, CORS-enabled response listing the RRIDs found,
    or an early 'already running' message when the uri is locked.
    """
    (target_uri, document, doi, pmid_from_source,
     head, body, text, cleaned_text) = process_POST_request(request)
    # the sync service serializes concurrent submissions of the same uri
    running = URL_LOCK.start_uri(target_uri)
    log.info(target_uri)
    if running:
        log.info('################# EARLY EXIT')
        return 'URI Already running ' + target_uri

    try:
        tags, unresolved_exacts = existing_tags(target_uri, h)
        pmid = pmid_logic(doi, pmid_from_source, target_uri, document, h, tags)
        r = annotate_doi_pmid(target_uri, document, doi, pmid, h, tags) # todo r -> db with responses

        # these values are defined up here as shared state that will be
        # mutated across multiple calls to checker, resolver, and submitter
        # this is a really bad design because it is not clear that processText
        # actually does this ... once again, python is best if you just use the
        # objects and give up any hope for an alternative approach, the way it
        # is done here also makes the scope where these values could be used
        # completely ambiguous and hard to understand/reason about

        found_rrids = {}
        existing = []
        existing_with_suffixes = []

        def checker(found):
            # True when the RRID has NOT been submitted before
            prefix, exact, exact_for_hypothesis, suffix = found
            return not check_already_submitted(exact, exact_for_hypothesis,
                                               found_rrids, tags, unresolved_exacts)

        def resolver(found):
            # mutates found_rrids with the resolver outcome
            prefix, exact, exact_for_hypothesis, suffix = found
            return rrid_resolver_xml(exact, found_rrids)

        def submitter(found, resolved):
            # creates the hypothes.is annotation for one resolved RRID
            return submit_to_h(target_uri, document, found, resolved, h, found_rrids,
                               existing, existing_with_suffixes)

        processText = make_find_check_resolve_submit(finder, checker, resolver, submitter)

        responses = list(processText(cleaned_text)) # this call runs everything

        results = ', '.join(found_rrids.keys())
        write_stdout(target_uri, document, doi, pmid, found_rrids, head, body, text, h)
        write_log(target_uri, document, doi, pmid, found_rrids, head, body, text, h)

    except BaseException as e:
        # there are some other lingering issues that are what was causing
        # uris to get stuck as always running in sync
        log.exception(e)
        raise e

    finally:
        # always release the uri lock, even on error
        URL_LOCK.stop_uri(target_uri)

    return results, 200, {'Content-Type': 'text/plain',
                          'Access-Control-Allow-Origin':'*'}
218 |
219 |
def rrid_OPTIONS(request):
    """Answer a CORS preflight: allow any origin, echo back every requested
    header on top of the always-allowed access-control-allow-origin."""
    raw = request.headers.get('Access-Control-Request-Headers')
    requested = re.findall('\w(?:[-\w]*\w)', raw.lower()) if raw else []

    allowed = ['access-control-allow-origin']
    for name in requested:
        if name not in allowed:
            allowed.append(name)

    return '', 204, {'Access-Control-Allow-Origin': '*',
                     'Access-Control-Allow-Headers': ','.join(allowed)}
233 |
234 |
def rrid_wrapper(request, h, logloc, URL_LOCK):
    """ Receive an article, parse RRIDs, resolve them, create annotations, log results """
    method = request.method
    if method == 'OPTIONS':
        return rrid_OPTIONS(request)
    if method == 'POST':
        return rrid_POST(request, h, logloc, URL_LOCK)
    return abort(405)
243 |
244 |
def main(local=False):
    """Build (and, when local=True, serve) the scibot bookmarklet Flask app.

    Connects to the scibot-sync daemon for uri locking, then registers the
    bookmarklet, rrid, and export routes. Returns the app unless ``local`` is
    set, in which case a self-signed-TLS wsgiref server runs forever.
    """
    from scibot.config import api_token, username, group, group2
    print(username, group, group2) # sanity check
    from scibot.sync import __doc__ as sync__doc__, Locker, client
    from scibot.config import syncword
    if syncword is None:
        raise KeyError('Please set the SCIBOT_SYNC environment variable')

    # default sync port comes from the sync daemon's own docopt defaults
    from docopt import docopt, parse_defaults
    _sdefaults = {o.name:o.value if o.argcount else None for o in parse_defaults(sync__doc__)}
    _backup_sync_port = int(_sdefaults['--port'])

    app = Flask('scibot bookmarklet server')

    h = HypothesisUtils(username=username, token=api_token, group=group)
    h2 = HypothesisUtils(username=username, token=api_token, group=group2)

    # only parse our own cli args when run as a script, not under gunicorn
    if __name__ == '__main__':
        args = docopt(__doc__)
        _sync_port = args['--sync-port']

        if _sync_port:
            sync_port = int(_sync_port)
        else:
            sync_port = _backup_sync_port
    else:
        sync_port = _backup_sync_port

    chan = 'localhost', sync_port

    # TODO
    #try:
    #except AuthenticationError as e:
    #raise e
    send = run(client, chan, syncword)
    URL_LOCK = Locker(send)
    app.URL_LOCK = URL_LOCK

    #@app.route('/synctest', methods=['GET'])
    def synctest():
        URL_LOCK.start_uri('a-test-uri')
        URL_LOCK.stop_uri('a-test-uri')
        return 'test-passed?'

    # fail fast at startup if the sync channel is broken
    synctest()

    @app.route('/controlled-tags', methods=['GET'])
    def route_controlled_tags():
        curator_tags = curatorTags() # TODO need client support for workflow:RRID -> * here
        return '\n'.join(curator_tags), 200, {'Content-Type':'text/plain; charset=utf-8'}

    @app.route('/rrid', methods=['POST', 'OPTIONS'])
    def rrid():
        return rrid_wrapper(request, h, 'logs/rrid/', URL_LOCK)

    @app.route('/validaterrid', methods=['POST', 'OPTIONS'])
    def validaterrid(request):
        # NOTE(review): the `request` parameter shadows flask.request and Flask
        # calls view functions with no positional args -- this route likely
        # errors when hit; confirm and drop the parameter
        return rrid_wrapper(request, h2, 'logs/validaterrid/', URL_LOCK)

    @app.route('/bookmarklet', methods=['GET'])
    def bookmarklet():
        return bookmarklet_wrapper(request, 'rrid')

    @app.route('/validatebookmarklet', methods=['GET'])
    def validatebookmarklet():
        return bookmarklet_wrapper(request, 'validaterrid')

    @app.route('/export', methods=['GET'])
    def export():
        # gzipped csv snapshot of all curated RRIDs
        print('starting csv export')
        output_rows, DATE = export_impl()
        data = StringIO()
        writer = csv.writer(data)
        writer.writerows(sorted(output_rows))
        return gzip.compress(data.getvalue().encode()), 200, {
            'Content-Type': 'text/csv',
            'Content-Disposition': 'attachment;filename = RRID-data-%s.csv' % DATE,
            'Content-Encoding': 'gzip'}

    @app.route('/export.json', methods=['GET'])
    def export_json():
        # gzipped json snapshot of all curated RRIDs
        print('starting json export')
        output_json, DATE = export_json_impl()
        data = json.dumps(output_json, sort_keys=True, indent=4)

        return gzip.compress(data.encode()), 200, {
            'Content-Type': 'application/json',
            'Content-Encoding': 'gzip'}

    if not local:
        return app
    else:
        from os.path import expanduser
        from wsgiref.simple_server import make_server
        from scibot.config import test_host, port_bookmarklet

        print('no login detected, running on localhost only')
        host = test_host
        port = port_bookmarklet

        print('host: %s, port %s' % ( host, port ))
        server = make_server(host, port, app)
        # openssl req -new -x509 -keyout scibot-self-sign-temp.pem -out scibot-self-sign-temp.pem -days 365 -nodes
        #server.socket = ssl.wrap_socket(server.socket,
                                        #keyfile='/etc/letsencrypt/live/scibot.scicrunch.io/privkey.pem',
                                        #certfile='/etc/letsencrypt/live/scibot.scicrunch.io/fullchain.pem',
                                        #server_side=True)
        server.socket = ssl.wrap_socket(server.socket,
                                        keyfile=expanduser('~/files/certs/scibot_test/tmp-nginx.key'),
                                        certfile=expanduser('~/files/certs/scibot_test/tmp-nginx.crt'),
                                        server_side=True)
        log.debug('serving forever')
        server.serve_forever()
358 |
359 |
360 | if __name__ == '__main__':
361 | main(local=True)
362 |
--------------------------------------------------------------------------------
/scibot/bookmarklet_server.py:
--------------------------------------------------------------------------------
# gunicorn entry point: exposes the bookmarklet Flask app as ``app``.
import os

# set the controlling terminal's window title to this module's name (xterm escape)
os.sys.stdout.write(f'\x1b]2;{os.path.basename(__name__)}\x07\n')

from scibot.bookmarklet import main
app = main()

if __name__ == '__main__':
    # run the dev server directly when invoked as a script
    from scibot import config
    app.run(host='localhost', port=config.port_bookmarklet, threaded=True)
11 |
--------------------------------------------------------------------------------
/scibot/check.py:
--------------------------------------------------------------------------------
1 |
def check_already_submitted(exact, exact_for_hypothesis, found_rrids, tags, unresolved_exacts):
    """Return True when an RRID has already been annotated on this page.

    exact: normalized RRID string matched in the page text
    exact_for_hypothesis: the exact string as anchored by hypothes.is
    found_rrids: mutable per-run results dict; marked 'Already Annotated' on a hit
    tags: tags already present on the target's annotations
    unresolved_exacts: exact strings from prior annotations that never resolved

    Previously the not-submitted case fell through and returned None; it now
    returns False explicitly, which is equivalent under callers' ``not ...``.
    """
    if exact in tags or exact_for_hypothesis in unresolved_exacts:
        print('\tskipping %s, already annotated' % exact)
        found_rrids[exact] = 'Already Annotated'
        return True
    return False
8 |
--------------------------------------------------------------------------------
/scibot/cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """SciBot command line utilities
3 |
4 | Usage:
5 | scibot db-init [options] []
6 | scibot api-sync [options] []
7 | scibot ws-sync [options] []
8 | scibot debug [options] []
9 |
10 | Options:
11 | -h --help show this
12 | -d --debug enable echo and embed
13 | -k --check when syncing run checks (required to insert)
14 | """
15 |
16 | import os
17 | try:
18 | breakpoint
19 | except NameError:
20 | from IPython import embed as breakpoint
21 |
22 |
23 | def main():
24 | from docopt import docopt
25 | args = docopt(__doc__)
26 | database = args['']
27 | if database is not None:
28 | os.environ.update({'SCIBOT_DATABASE': database})
29 |
30 | from scibot import config
31 | from scibot.db import getSession, init_scibot, AnnoSyncFactory, WebsocketSyncFactory
32 |
33 | if args['db-init']:
34 | # insurace, it is passed into init direclty as well
35 | #os.system(f'scibot-dbsetup {config.dbPort()} {database}')
36 | # the above should be done manually to prevent fat fingers
37 | init_scibot(database)
38 |
39 | elif args['api-sync']:
40 | check = args['--check']
41 | dburi = config.dbUri(user='scibot-admin') # needed ofr update
42 | session = getSession(dburi=dburi, echo=args['--debug'])
43 | AnnoSync = AnnoSyncFactory(session)
44 | cur_sync = AnnoSync(config.api_token, config.username,
45 | config.group, config.memfile)
46 | cur_sync.sync_annos(check=check)
47 | pub_sync = AnnoSync(config.api_token, config.username,
48 | config.group_staging, config.pmemfile)
49 | pub_sync.sync_annos(check=check)
50 |
51 | elif args['ws-sync']:
52 | session = getSession(echo=args['--debug'])
53 | WebsocketSync = WebsocketSyncFactory(session)
54 | wss = WebsocketSync(config.api_token, config.username, config.group)
55 | wss.run()
56 |
57 | elif args['debug']:
58 | from time import time
59 | session = getSession(echo=args['--debug'])
60 | if True:
61 | dcount = {r.uri:r.document_id
62 | for r in session.execute('SELECT uri, document_id FROM document_uri')}
63 | from hyputils.memex import models
64 | from hyputils.hypothesis import Memoizer
65 | from scibot.anno import disambiguate_uris
66 | from interlex.core import makeParamsValues
67 | mem = Memoizer(config.memfile, config.api_token, config.username, config.group)
68 | annos, last_updated = mem.get_annos_from_file()
69 | uris = set(a.uri for a in annos)
70 | dd = disambiguate_uris(uris)
71 | multi = [v for v in dd.values() if len(v) > 1]
72 | _rows = [a._row for a in annos]
73 | AnnoSync = AnnoSyncFactory(session)
74 | cur_sync = AnnoSync(config.api_token, config.username, config.group)
75 |
76 | rows = _rows
77 |
78 | # rows = [r for r in _rows if 'articles/4-42/' in r['uri']]
79 | # rows = [r for r in _rows if '10.1002/jnr.23615' in r['uri']]
80 | # rows = [r for r in _rows if 'ncomms8028' in r['uri']] # TODO res chain these
81 | # rows = [r for r in _rows if '?term=Gene' in r['uri']]
82 | # rows = [r for r in _rows if 'index.php?' in r['uri']]
83 | # rows = [r for r in _rows if 'govhttp' in r['uri']] # maximum wat
84 | # rows = [r for r in _rows if 'fasebj.org' in r['uri']]
85 |
86 | check = False
87 |
88 | cur_sync.memoization_file = config.memfile
89 | cur_sync.sync_annos(check=check)
90 |
91 |
92 | return
93 | cur_sync.sync_annos(api_rows=rows, check=check)
94 | # when remote the upload bandwidth is now the limiting factor
95 | session.rollback()
96 | cur_sync.sync_annos(check=check)
97 | session.rollback()
98 | breakpoint()
99 |
100 |
# script entry point; `main` is defined earlier in this module
if __name__ == '__main__':
    main()
103 |
--------------------------------------------------------------------------------
/scibot/config.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from os import environ
3 | from socket import gethostname
4 | from pathlib import Path
5 | from hyputils.hypothesis import group_to_memfile, ucd
6 |
# ports
port_bookmarklet = 4443
port_dashboard = 8080

## WARNING if you change one of these update the file in bin/
port_guni_bookmarket = 5000  # scibot-bookmarklet
port_guni_dashboard = 5005  # scibot-dashboard

# dev
dev_remote_hosts = 'athena', 'arachne'

# testing
test_host = 'localhost'
test_port = port_bookmarklet
test_database = '__scibot_testing'

# db
user = 'scibot-user'
database = environ.get('SCIBOT_DATABASE', test_database)


def dbPort():
    """Return the postgres port to connect to.

    Dev hosts talk to the database over a forwarded port (54321);
    everywhere else uses the stock postgres port.
    """
    if gethostname() in dev_remote_hosts:
        return 54321
    return 5432


def dbUri(user=user, host='localhost', port=dbPort(), database=database):
    """Build a sqlalchemy postgresql connection uri.

    The psycopg2cffi dialect is selected when running under pypy.
    Note that the default ``port`` is computed once, at import time.
    """
    dialect = 'psycopg2cffi' if hasattr(sys, 'pypy_version_info') else 'psycopg2'
    return f'postgresql+{dialect}://{user}@{host}:{port}/{database}'
38 |
39 |
# mq
# rabbitmq virtual host used by the celery workers
vhost = 'scibot'
# CELERY_BROKER_URL takes precedence over the legacy BROKER_URL variable;
# defaults to a local guest amqp broker
broker_url = environ.get('CELERY_BROKER_URL',
                         environ.get('BROKER_URL',
                                     'amqp://guest:guest@localhost:5672//'))
# result backend for celery; defaults to rpc over the broker connection
broker_backend = environ.get('CELERY_BROKER_BACKEND',
                             environ.get('BROKER_BACKEND',
                                         'rpc://'))
# serialization formats celery is allowed to accept
accept_content = ('pickle', 'json')

# logging
# directory for source logs; defaults to <repo root>/logs next to this package
source_log_location = environ.get('SOURCE_LOG_LOC',
                                  (Path(__file__).parent.parent /
                                   'logs').as_posix())
54 |
# hypothesis
api_token = environ.get('SCIBOT_API_TOKEN', 'TOKEN') # Hypothesis API token
username = environ.get('SCIBOT_USERNAME', 'USERNAME') # Hypothesis username
group = environ.get('SCIBOT_GROUP', '__world__')  # primary curation group
group2 = environ.get('SCIBOT_GROUP2', '__world__')
group_staging = environ.get('SCIBOT_GROUP_STAGING', '__world__')
# presumably a shared secret used by the sync services — TODO confirm against callers
syncword = environ.get('SCIBOT_SYNC')

# safety latch: while True, the guard below never fires, so nothing prevents
# group_staging from being the public __world__ group
READ_ONLY = True
if group_staging == '__world__' and not READ_ONLY:
    raise IOError('WARNING YOU ARE DOING THIS FOR REAL PLEASE COMMENT OUT THIS LINE')
66 |
67 | def _post(group_hash):
68 | if group_hash.startswith('f'):
69 | print('Real annos')
70 | elif group_hash.startswith('9'):
71 | print('Test annos')
72 |
# memoization cache for the curation group's annotations; _post prints
# whether the derived group hash looks like real or test data
memfile = group_to_memfile(group, _post)

# cache file for annotations already released to the public __world__ group
pmemfile = f'{ucd}/scibot/annos-__world__-{username}.json'

# staging cache: falls back to the public cache path when the staging group
# is __world__, otherwise gets its own hash-derived file
if group_staging == '__world__':
    smemfile = f'{ucd}/scibot/annos-__world__-{username}.json'
else:
    smemfile = group_to_memfile(group_staging)

# rrid resolver
# pickled resolver xml; hard-coded developer path — see existing FIXME
resolver_xml_filepath = Path('~/ni/dev/rrid/scibot/scibot_rrid_xml.pickle').expanduser() # FIXME
84 |
--------------------------------------------------------------------------------
/scibot/dash.py:
--------------------------------------------------------------------------------
# wsgi entry module for the dashboard service: exposes `app` for the server
# (presumably gunicorn, per port_guni_dashboard in config — verify in bin/)
from gevent import monkey
monkey.patch_all()  # must run before anything else imports sockets/threads

import os

# set the terminal title using the xterm osc-2 escape sequence
os.sys.stdout.write(f'\x1b]2;{os.path.basename(__name__)}\x07\n')

from scibot.dashboard import setup
app = setup()  # the wsgi application object
10 |
--------------------------------------------------------------------------------
/scibot/dashboard.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import atexit
3 | from os import environ
4 | from pathlib import Path
5 | from jinja2 import ChoiceLoader, FileSystemLoader
6 | from scibot.utils import PMID, DOI
7 | from scibot.config import api_token, username, group, group_staging, memfile, pmemfile
8 | from scibot.release import Curation, PublicAnno
9 | from scibot.export import bad_tags
10 | from pyontutils.utils import anyMembers, noneMembers
11 | from htmlfn import render_table, htmldoc, atag, divtag
12 | from htmlfn import table_style, navbar_style, cur_style
13 |
14 | from hyputils.subscribe import preFilter, AnnotationStream
15 | from hyputils.handlers import helperSyncHandler, filterHandler
16 | from hyputils.hypothesis import Memoizer
17 | from flask import Flask, render_template, request, url_for
18 | try:
19 | breakpoint
20 | except NameError:
21 | from IPython import embed as breakpoint
22 |
23 | print('END IMPORTS')
24 |
def route(route_name):
    """Decorator factory that prints *route_name* each time the view runs.

    Fixed to apply ``functools.wraps``: the original returned a wrapper
    always named ``inner``, losing the wrapped function's name/docstring
    (and name-based endpoint registration would collide — TODO confirm
    whether any caller registers these with flask).
    """
    from functools import wraps  # local import keeps the module import block untouched

    def wrapper(function):
        @wraps(function)
        def inner(*args, **kwargs):
            print(route_name)  # lightweight access trace
            return function(*args, **kwargs)
        return inner
    return wrapper
32 |
33 | def make_app(annos, pannos=[]):
34 |
35 | app = Flask('scibot dashboard')
36 |
37 | template_loader = ChoiceLoader([app.jinja_loader,
38 | FileSystemLoader([(Path(__file__).parent.parent / 'templates').as_posix()])])
39 | app.jinja_loader = template_loader
40 |
41 | [Curation(a, annos) for a in annos]
42 | [PublicAnno(a, pannos) for a in pannos]
43 | base_url = '/dashboard/'
44 | names = ['missing', 'incorrect', 'papers', 'unresolved',
45 | 'no-pmid', 'no-doi', 'no-annos', 'table', 'Journals']
46 |
47 | def tag_string(c):
48 | return ' '.join(sorted(t.replace('RRIDCUR:', '')
49 | for t in c.tags if 'RRIDCUR' in t))
50 |
    def filter_rows(filter=lambda c: True):
        """Yield display rows for problem annotations that pass *filter*.

        Each row is (index, problem-tag string, identifier link (pmid
        preferred over doi), share/paper link, html link, curator, notes).
        Only ast-node annos that are not duplicates, not corrected, and not
        yet public are considered; rows are sorted by their RRIDCUR tags.
        """
        yield from ((str(i + 1),
                     tag_string(c),
                     # identifier column: prefer the pmid link, fall back to doi
                     atag(PMID(c.pmid), c.pmid, new_tab=True)
                     if c.pmid
                     else (atag(DOI(c.doi), c.doi, new_tab=True)
                           if c.doi
                           else ''),
                     atag(c.shareLink, 'Annotation', new_tab=True)
                     if c # FIXME how does this work?
                     else atag(c.uri, 'Paper', new_tab=True),
                     atag(c.htmlLink, 'Anno HTML', new_tab=True),
                     c.user,
                     '\n'.join(c.curator_notes))
                    for i, c in enumerate(sorted((c for c in Curation
                                                  if c.isAstNode
                                                  and not c.Duplicate
                                                  and not c.corrected # FIXME need a better way...
                                                  and not c.public_id
                                                  and filter(c)),
                                                 key=tag_string)))
    # NOTE(review): dead/broken bookkeeping — `BaseURL` is never defined or
    # imported in this module (NameError at runtime), `k` is appended but
    # never incremented, and none of k/kList/URLDict are read afterwards.
    # Candidate for removal; left byte-identical pending confirmation.
    k = 0
    kList = []
    URLDict = {}
    for h in Curation:
        if BaseURL(h._anno) in URLDict.keys():
            URLDict[BaseURL(h._anno)] += 1
        else:
            URLDict[BaseURL(h._anno)] = 1
        kList.append(k)
81 | class NavBar:
82 | def atag(self, route, name):
83 | if route == self.current_route:
84 | return atag(url_for(route), name, cls='navbar-select')
85 | else:
86 | return atag(url_for(route), name)
87 |
88 | def __call__(self, route=None):
89 | self.current_route = route
90 | out = divtag(self.atag('route_base', 'Home'),
91 | self.atag('route_papers', 'Papers'),
92 | self.atag('route_anno_help_needed', 'Help Needed'),
93 | self.atag('route_anno_incorrect', 'Incorrect'),
94 | self.atag('route_anno_unresolved', 'Unresolved'),
95 | self.atag('route_anno_missing', 'Missing'),
96 | self.atag('route_no_pmid', 'No PMID'),
97 | self.atag('route_no_doi', 'No DOI'),
98 | self.atag('route_no_id', 'No ID'),
99 | #self.atag('route_no_annos', 'No annos'),
100 | self.atag('route_table', 'All'),
101 | # TODO search box
102 | atag('https://github.com/SciCrunch/scibot/issues',
103 | 'GitHub issues', new_tab=True),
104 | cls='navbar')
105 | self.current_route = None
106 | return out
107 |
108 | navbar = NavBar()
109 |
    def table_rows(rows, title, route):
        """Render a full html page wrapping *rows* in the standard problems table."""
        return htmldoc(navbar(route),
                       divtag(render_table(rows, '#', 'Problem', 'Identifier', 'Link', 'HTML Link', 'Curator', 'Notes'),
                              cls='main'),
                       title=title,
                       styles=(table_style, cur_style, navbar_style))
116 |
117 | def nonestr(thing):
118 | return '' if thing is None else thing
119 |
120 | def done_rrids(rrids):
121 | for rrid, s in rrids.items():
122 | for a in s:
123 | if a.Validated:
124 | yield rrid
125 | break
126 |
127 | def todo_rrids(rrids):
128 | done = set(done_rrids(rrids))
129 | for rrid in rrids:
130 | if rrid not in done:
131 | yield rrid
132 |
    def render_papers(rows):
        """Wrap per-paper statistics *rows* in the standard papers table div."""
        return divtag(render_table(rows,
                                   '#', 'Paper', 'PMID', 'DOI',
                                   'TODO', 'Done', 'RRIDs', 'Annotations'),
                      cls='main')
138 |
    def papers(filter=lambda a:True):
        """Rows (n, paper link, pmid, doi, #todo, #done, #rrids, #annos) per paper.

        Papers whose first annotation passes *filter* are included, sorted by
        TODO count (row index 3) descending.
        """
        return [(str(i + 1),) + t
                for i, t in
                enumerate(sorted(((atag(url, '...' + url[-20:], new_tab=True),
                                   nonestr(rrids.pmid),
                                   '' if
                                   rrids.doi is None else
                                   atag(DOI(rrids.doi), rrids.doi, new_tab=True),
                                   str(len(list(todo_rrids(rrids)))),
                                   str(len(list(done_rrids(rrids)))),
                                   str(len(rrids)),
                                   str(len([a for r in rrids.values()
                                            for a in r])))
                                  for url, rrids in Curation._papers.items()
                                  # filter sees an arbitrary anno from the paper
                                  if filter(next(a for s in rrids.values()
                                                 for a in s))),
                                 key=lambda r: int(r[3]),
                                 reverse=True))]
157 |
158 | def no_pmid():
159 | return papers(lambda a:a.pmid is None)
160 |
161 | def no_doi():
162 | return papers(lambda a:a.doi is None)
163 |
164 | def no_id():
165 | return papers(lambda a:a.doi is None and a.pmid is None)
166 |
167 | def no_annos(): # TODO
168 | return []
169 |
    @app.route('/css/table.css')
    def route_css_table_style():
        """Serve the shared table stylesheet."""
        return table_style, 200, {'Content-Type':'text/css'}

    @app.route('/dashboard', methods=('GET', 'POST'))
    @app.route('/dashboard/', methods=('GET', 'POST'))
    def route_base():
        """Dashboard landing page with summary counts (several still stubbed '??')."""
        return render_template('main.html', method='get',
                               navbar=navbar(request.url_rule.endpoint),
                               navbar_style = navbar_style,
                               var='We have a lot of work to do!',
                               nmissing='??',
                               nures='??',
                               incor='??',
                               npapers=str(len(Curation._papers)),
                               nnopmid=str(len(no_pmid())),
                               nnodoi=str(len(no_doi())),
                               #nnoboth=str(len(no_both())),
                               #nnoannos=str(len(no_annos()))
                               nnoannos='??',
                               allp='??',)

    @app.route('/dashboard/anno-count')
    def route_anno_count():
        """Total number of curation annotations as plain text."""
        return str(len(Curation._annos_list))
195 |
196 | #@app.route(PurePath(base_url, 'anno-tags').as_posix())
197 | @app.route('/dashboard/anno-user/')
198 | def route_anno_tags(user):
199 | print(user)
200 | out = '\n'.join([f'{anno.user} {anno.text} {anno.tags} '
201 | for anno in Curation._annos_list if anno.user == user])
202 | return out
203 |
    @app.route('/dashboard/journals')
    def route_Journals():
        """List journal uris for curated papers, cached in Journals.txt.

        NOTE(review): broken as written — `Journal` is never defined or
        imported in this module; the string literals below were mangled when
        markup was stripped from the source (leaving an unterminated
        literal); and the file handles are not context-managed.  Left
        byte-identical pending a real fix.
        """
        file = open("Journals.txt","r")
        paperStr = file.read()
        file.close()
        # regenerate the cache only when the file is empty
        if paperStr == '':
            h = 0
            URLList = []
            counter = 0
            paperStr = str(counter) + ' Results:
'
            print("PROSSESING")
            for h in Curation:
                journal = Journal(h._anno)
                if "urn:x-pdf" in journal or "file:" in journal:
                    URLList.append(journal)
                if journal == "":
                    print (h.shareLink)
                if not journal in URLList:
                    paperStr += " Journal Link "
                    paperStr += journal
                    counter += 1
                    URLList.append(journal)
            # patch the final count over the placeholder written above
            paperStr = str(counter) + paperStr[1:]
            file = open("Journals.txt", "w")
            file.write(paperStr)
            file.close()
        return (paperStr)
231 |
    @app.route('/dashboard/DOI')
    def route_DOI():
        """List distinct dois found on DOI-tagged annotations.

        NOTE(review): broken as written — `DOIStr += h.doi` runs before the
        `if h.doi:` guard, so a None doi raises TypeError; and the string
        literals / final return were mangled when markup was stripped from
        the source (unterminated literal).  Left byte-identical pending a
        real fix.
        """
        DOIStr = ""
        DOIList = []
        counter = 0
        for h in Curation:
            if [t for t in h.tags if t.startswith("DOI")]:
                if h.doi not in DOIList:
                    DOIStr += ' Anno #:%s ' % h
                    DOIStr += ' Anno Link '
                    DOIStr += h.doi
                    counter += 1
                    if h.doi:
                        DOIList.append(h.doi)
        return (str(counter) + "
" + DOIStr)
247 |
    @app.route('/dashboard/done')
    def route_done():
        """Placeholder page for completed curation work."""
        return 'TODO'

    @app.route('/dashboard/public')
    def route_public():
        """Table of publicly released annotations: pmid, doi, rrid."""
        #return 'TODO'
        rows = ((str(i + 1),) + r for i, r in
                enumerate((nonestr(pa.curation_paper.pmid),
                           nonestr(pa.curation_paper.doi),
                           pa.rrid,)
                          for pa in PublicAnno
                          # skip incorrectly formatted and errors for now
                          if pa.curation_ids and
                          None not in pa.curation_annos and
                          pa.rrid is not None # FIXME make clear these are page notes
                          ))
        return htmldoc(navbar(request.url_rule.endpoint),
                       divtag(render_table(rows, '#', 'PMID', 'DOI', 'RRID'),
                              cls='main'),
                       title='SciBot public release',
                       styles=(table_style, cur_style, navbar_style))

    @app.route('/dashboard/table')
    def route_table():
        """All curation problems: very bad, missing without rrid, incorrect, unresolved."""
        rows = filter_rows(lambda c: c.very_bad or c._Missing and not c.rrid or c.Incorrect or c.Unresolved)
        return table_rows(rows, 'All SciBot curation problems', request.url_rule.endpoint)
275 |
276 | """
277 | """
304 |
    @app.route('/dashboard/no-annos')
    def route_no_annos():
        """Papers with no annotations at all — expected to be empty."""
        return htmldoc(navbar(request.url_rule.endpoint),
                       divtag('There shouldn\'t be anything here...',
                              cls='main'),
                       title='SciBot No Anno Papers',
                       styles=(navbar_style,))

    @app.route('/dashboard/papers')
    def route_papers():
        """All papers with per-paper curation statistics."""
        rows = papers()
        return htmldoc(navbar(request.url_rule.endpoint),
                       render_papers(rows),
                       title='SciBot papers',
                       styles=(table_style, cur_style, navbar_style))

    @app.route('/dashboard/no-pmid')
    def route_no_pmid():
        """Papers for which no pmid was found."""
        rows = no_pmid()
        return htmldoc(navbar(request.url_rule.endpoint),
                       render_papers(rows),
                       title='SciBot No PMID Papers',
                       styles=(table_style, cur_style, navbar_style))

    @app.route('/dashboard/no-doi')
    def route_no_doi():
        """Papers for which no doi was found."""
        rows = no_doi()
        return htmldoc(navbar(request.url_rule.endpoint),
                       render_papers(rows),
                       title='SciBot No DOI Papers',
                       styles=(table_style, cur_style, navbar_style))

    @app.route('/dashboard/no-id')
    def route_no_id():
        """Papers lacking both a doi and a pmid."""
        rows = no_id()
        return htmldoc(navbar(request.url_rule.endpoint),
                       render_papers(rows),
                       title='SciBot No ID Papers',
                       styles=(table_style, cur_style, navbar_style))
344 |
    @app.route('/dashboard/help-needed')
    def route_anno_help_needed():
        """Annotations flagged very bad — curator attention required."""
        rows = filter_rows(lambda c: c.very_bad)

        return table_rows(rows, 'Help needed RRIDs', request.url_rule.endpoint)

    @app.route('/dashboard/incorrect')
    def route_anno_incorrect():
        """Incorrect RRIDs (excluding the very-bad set shown on help-needed)."""
        rows = filter_rows(lambda c: not c.very_bad and c.Incorrect)
        return table_rows(rows, 'Incorrect RRIDs', request.url_rule.endpoint)

    @app.route('/dashboard/unresolved')
    def route_anno_unresolved():
        """Unresolved RRIDs that are neither very bad nor incorrect."""
        rows = filter_rows(lambda c: c.Unresolved and not c.very_bad and not c.Incorrect)

        return table_rows(rows, 'Unresolved RRIDs', request.url_rule.endpoint)

    @app.route('/dashboard/missing', methods=('GET', 'POST'))
    def route_anno_missing():
        """Annotations marked missing that still have no rrid."""
        rows = filter_rows(lambda c: c._Missing and not c.rrid)
        return table_rows(rows, 'Missing RRIDs', request.url_rule.endpoint)

    @app.route('/dashboard/no-replies')
    def route_no_replies():
        """Placeholder: table of annotations without replies."""
        # this should be the table with no replies
        return 'TODO'
371 |
372 | @app.route('/dashboard/results')
373 | def search_results(search):
374 | h = 0
375 | hlist = []
376 | hstr = ''
377 | counter = 0
378 | # if search.data['search'] == '':
379 | # h = 0
380 | # hstr = ''
381 | # for h in Curation:
382 | # hstr += repr(h)
383 | # h += 1
384 | # return(hstr)
385 | # else:
386 | if search.data['select'] == 'ID':
387 | for h in Curation:
388 | if search.data['search'] in h.id:
389 | hstr += ' Anno #:%s ' % h
390 | hstr += ' Anno Link '
391 | hstr += repr(h)
392 | counter += 1
393 | if hstr == '':
394 | return('no results')
395 | return (str(counter) + ' Results:
' + hstr)
396 | #return render_template('results.html', results=html.unescape(hstr))
397 | elif search.data['select'] == 'Tags':
398 | for h in Curation:
399 | if [t for t in h.tags if search.data['search'] in t]:
400 | hstr += ' Anno #:%s ' % h
401 | hstr += ' Anno Link '
402 | hstr += repr(h)
403 | counter += 1
404 | if hstr == '':
405 | return('no results')
406 | print (str(len(hlist)))
407 | print(len(Curation._annos_list))
408 | return (str(counter) + ' Results:
' + hstr)
409 | #return render_template('results.html', results=hstr)
410 | elif search.data['select'] == 'User':
411 | for h in Curation:
412 | if h._anno.user == search.data['search']:
413 | hstr += ' Anno #:%s ' % h
414 | hstr += ' Anno Link '
415 | hstr += repr(h)
416 | counter += 1
417 | if hstr == '':
418 | return('no results')
419 | return (str(counter) + ' Results:
' % resolver_uri
78 | r = h.create_annotation_with_target_using_only_text_quote(url=target_uri,
79 | document=document,
80 | prefix=prefix,
81 | exact=exact_for_hypothesis,
82 | suffix=suffix,
83 | text=s,
84 | tags=new_tags + ['RRIDCUR:Unresolved'],
85 | extra=extra,)
86 | log.error(f'rrid unresolved {exact}')
87 |
88 | else:
89 | s = ''
90 | title = root.findall('title')[0].text
91 | s += f'Title: {title}\n'
92 | data_elements = root.findall('data')[0]
93 | data_elements = [(e.find('name').text, e.find('value').text) for e in data_elements] # these shouldn't duplicate
94 | citation = [(n, v) for n, v in data_elements if n == 'Proper Citation']
95 | rrid = [rrid_from_citation(c) for _, c in citation] if citation else [exact]
96 | name = [(n, v) for n, v in data_elements if n == 'Name']
97 | data_elements = citation + name + sorted([(n, v) for n, v in
98 | data_elements if (n != 'Proper Citation' or
99 | n != 'Name') and v is not None])
100 | for name, value in data_elements:
101 | if ((name == 'Reference' or name == 'Mentioned In Literature')
102 | and value is not None and value.startswith(' 500:
104 | continue # nif-0000-30467 fix keep those pubmed links short!
105 | s += '