├── .gitignore ├── CONTRIBUTING.md ├── Creating a new BigQuery dataset.pdf ├── LICENSE ├── Querying a BigQuery dataset.pdf ├── README.md ├── examples ├── BERT_For_Patents.ipynb ├── Document_representation_from_BERT.ipynb ├── claim-text │ ├── claim_text_extraction.ipynb │ └── data │ │ └── 20k_G_and_H_publication_numbers.csv └── patent_set_expansion.ipynb ├── models ├── BERT for Patents.md ├── claim_breadth │ ├── README.md │ ├── batch_inference.py │ ├── batch_inference_test.py │ ├── generate_embedding_vocab.sql │ ├── hptuning_config.yaml │ ├── preprocess.py │ ├── preprocess_test.py │ ├── requirements.txt │ ├── testdata │ │ └── example-output-from-preprocess-step.tfrecord.gz │ └── trainer │ │ ├── __init__.py │ │ ├── model.py │ │ └── task.py └── landscaping │ ├── AutomatedPatentLandscaping.pdf │ ├── AutomatedPatentLandscaping_2018Update.pdf │ ├── LandscapeNotebook.ipynb │ ├── README.md │ ├── __init__.py │ ├── expansion.py │ ├── figs │ ├── flow.png │ └── project-id.png │ ├── keras_metrics.py │ ├── model.py │ ├── seeds │ ├── README.md │ ├── hair_dryer.seed.csv │ ├── hair_dryer_large.seed.csv │ └── video_codec.seed.csv │ ├── tokenizer.py │ ├── train_data.py │ └── word2vec.py ├── tables ├── dataset_Berkeley Fung.md ├── dataset_Berkeley Fung.md.pdf ├── dataset_CPA Global.md ├── dataset_CPA Global.md.pdf ├── dataset_European Bioinformatics Institute.md ├── dataset_European Bioinformatics Institute.md.pdf ├── dataset_Google Patents Public Datasets.md ├── dataset_Google Patents Public Datasets.md.pdf ├── dataset_Other.md ├── dataset_Other.md.pdf ├── dataset_USPTO.md ├── dataset_USPTO.md.pdf ├── index.md └── index.md.pdf └── tools ├── bigquery-indexer ├── README.md ├── beam-rdkit-runner │ └── Dockerfile └── main.py ├── bq_bulk_cp.pysh ├── bq_ls.pysh ├── csv_upload.pysh ├── dataset_berkeley_fung.json ├── dataset_ebi.json ├── dataset_ifi.json ├── dataset_innography.json ├── dataset_other.json ├── dataset_public.json ├── dataset_report.pysh ├── dataset_uspto.json ├── generate_dataset_docs.py └── sqlite_dump.pysh /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | abstracts/ 3 | checkpoints/ 4 | models/landscaping/bigquery_credentials.dat 5 | models/landscaping/models/ 6 | .ipynb_checkpoints/ 7 | __pycache__/ 8 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution, 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 
24 | 25 | -------------------------------------------------------------------------------- /Creating a new BigQuery dataset.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/Creating a new BigQuery dataset.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 204 | -------------------------------------------------------------------------------- /Querying a BigQuery dataset.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/Querying a BigQuery dataset.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Patent analysis using the Google Patents Public Datasets on BigQuery 2 | 3 | The contents of this repository are not an official Google product. 4 | 5 | [Google Patents Public Datasets](https://console.cloud.google.com/launcher/browse?q=google%20patents%20public%20datasets&filter=solution-type:dataset) is a collection of compatible BigQuery database tables from government, research and private companies for conducting statistical analysis of patent data. The data is available to be queried with SQL through BigQuery, joined with private datasets you upload, and exported and processed using many other compatible analysis tools. This repository is a centralized source for examples which use the data. 6 | 7 | Currently the repo contains three examples: 8 | 9 | 1. [Patent Landscaping](https://github.com/google/patents-public-data/blob/master/models/landscaping/README.md): A demo of an automated process of finding patents related to a particular topic given an initial seed set of patents. Based on the paper by Dave Feltenberger and Aaron Abood, [Automated Patent Landscaping](models/landscaping/AutomatedPatentLandscaping.pdf). 10 | 11 | 2. [Claim Text Extraction](https://github.com/google/patents-public-data/blob/master/examples/claim-text/claim_text_extraction.ipynb): A demo of interacting with patent claim text data using BigQuery and python. 12 | 13 | 3. [Claim Breadth Model](https://github.com/google/patents-public-data/blob/master/models/claim_breadth/README.md): A machine learning method for estimating patent claim breadth using data from BigQuery. 14 | 15 | Other helpful resources from the community: 16 | 17 | 1. 
[Replicable Patent Indicators](https://www.kaggle.com/code/georgeabiyounes/replicable-patent-indicators/notebook) ([paper](https://onlinelibrary.wiley.com/doi/10.1111/1467-8462.12545)) 18 | -------------------------------------------------------------------------------- /examples/Document_representation_from_BERT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Document representation from BERT", 7 | "provenance": [ 8 | { 9 | "file_id": "1hccaqNncyxDG32f5U1Qncz6ipWiLV0TQ", 10 | "timestamp": 1614125265907 11 | }, 12 | { 13 | "file_id": "1d9KurXhXvrV-jo-x2f7DkZ40qx75YAh_", 14 | "timestamp": 1604694308174 15 | } 16 | ], 17 | "collapsed_sections": [], 18 | "last_runtime": { 19 | "build_target": "//corp/legal/patents/colab:dst_colab_notebook", 20 | "kind": "shared" 21 | }, 22 | "toc_visible": true 23 | }, 24 | "kernelspec": { 25 | "name": "python3", 26 | "display_name": "Python 3" 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "ED6tBdZtOjlU" 34 | }, 35 | "source": [ 36 | "# Document representation from BERT" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "CqNm7ioGOgSm" 43 | }, 44 | "source": [ 45 | "Copyright 2021 Google Inc.\n", 46 | "\n", 47 | "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n", 48 | "\n", 49 | "http://www.apache.org/licenses/LICENSE-2.0\n", 50 | "\n", 51 | "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." 
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "id": "c1vLcDJINTGg" 58 | }, 59 | "source": [ 60 | "import collections\n", 61 | "import math\n", 62 | "import random\n", 63 | "import sys\n", 64 | "import time\n", 65 | "from typing import Dict, List, Tuple\n", 66 | "from sklearn.metrics import pairwise\n", 67 | "# Use Tensorflow 2.0\n", 68 | "import tensorflow as tf\n", 69 | "import numpy as np" 70 | ], 71 | "execution_count": null, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "metadata": { 77 | "id": "vfSIZaeaPHpZ", 78 | "colab": { 79 | "height": 53 80 | }, 81 | "executionInfo": { 82 | "status": "ok", 83 | "timestamp": 1614125346371, 84 | "user_tz": 300, 85 | "elapsed": 155, 86 | "user": { 87 | "displayName": "Rob Srebrovic", 88 | "photoUrl": "", 89 | "userId": "06004353344935214283" 90 | } 91 | }, 92 | "outputId": "c0bca557-2962-4f3b-a8f9-71be6d820897" 93 | }, 94 | "source": [ 95 | "# Set BigQuery application credentials\n", 96 | "from google.cloud import bigquery\n", 97 | "import os\n", 98 | "os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"path/to/file.json\"\n", 99 | "\n", 100 | "project_id = \"your_bq_project_id\"\n", 101 | "bq_client = bigquery.Client(project=project_id)" 102 | ], 103 | "execution_count": 2, 104 | "outputs": [ 105 | { 106 | "output_type": "execute_result", 107 | "data": { 108 | "application/vnd.google.colaboratory.intrinsic+json": { 109 | "type": "string" 110 | }, 111 | "text/plain": [ 112 | "'# Set BigQuery application credentials\\nfrom google.cloud import bigquery\\nimport os\\nos.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"path/to/file.json\"\\n\\nproject_id = \"your_bq_project_id\"\\nbq_client = bigquery.Client(project=project_id)'" 113 | ] 114 | }, 115 | "metadata": { 116 | "tags": [] 117 | }, 118 | "execution_count": 2 119 | } 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "7BojUHDYrESY" 126 | }, 127 | "source": [ 128 | "# You will have to clone the BERT repo\n", 129 | "!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo\n", 130 | "if not 'bert_repo' in sys.path:\n", 131 | " sys.path += ['bert_repo']" 132 | ], 133 | "execution_count": null, 134 | "outputs": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "id": "QeoX7LfgPLGP" 140 | }, 141 | "source": [ 142 | "The BERT repo uses Tensorflow 1 and thus a few of the functions have been moved/changed/renamed in Tensorflow 2. In order for the BERT tokenizer to be used, one of the lines in the repo that was just cloned needs to be modified to comply with Tensorflow 2. Line 125 in the BERT tokenization.py file must be changed as follows:\n", 143 | "\n", 144 | "From => `with tf.gfile.GFile(vocab_file, \"r\") as reader:`\n", 145 | "\n", 146 | "To => `with tf.io.gfile.GFile(vocab_file, \"r\") as reader:`\n", 147 | "\n", 148 | "Once that is complete and the file is saved, the tokenization library can be imported." 
149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "metadata": { 154 | "id": "HsSJXKPDPLXn" 155 | }, 156 | "source": [ 157 | "import tokenization" 158 | ], 159 | "execution_count": null, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "id": "JBqRRfigQxxK" 166 | }, 167 | "source": [ 168 | "# Load BERT" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "kp2fx508lWBG" 175 | }, 176 | "source": [ 177 | "MAX_SEQ_LENGTH = 512\n", 178 | "MODEL_DIR = 'path/to/model'\n", 179 | "VOCAB = 'path/to/vocab'\n", 180 | "\n", 181 | "tokenizer = tokenization.FullTokenizer(VOCAB, do_lower_case=True)" 182 | ], 183 | "execution_count": null, 184 | "outputs": [] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "metadata": { 189 | "id": "sNf96pSxxXg2" 190 | }, 191 | "source": [ 192 | "model = tf.compat.v2.saved_model.load(export_dir=MODEL_DIR, tags=['serve'])\n", 193 | "model = model.signatures['serving_default']" 194 | ], 195 | "execution_count": null, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "metadata": { 201 | "id": "-BWnaHqoT7db" 202 | }, 203 | "source": [ 204 | "# Mean pooling layer for combining\n", 205 | "pooling = tf.keras.layers.GlobalAveragePooling1D()" 206 | ], 207 | "execution_count": null, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "id": "rtzZg5LESCxF" 214 | }, 215 | "source": [ 216 | "# Get a couple of Patents\n", 217 | "\n", 218 | "Here we do a simple query from the BigQuery patents data to collect the claims for a sample set of patents." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "metadata": { 224 | "id": "u3iTTJQ5SFba" 225 | }, 226 | "source": [ 227 | "# Put your publications here.\n", 228 | "test_pubs = (\n", 229 | " 'US-8000000-B2', 'US-2007186831-A1', 'US-2009030261-A1', 'US-10722718-B2'\n", 230 | ")\n", 231 | "\n", 232 | "js = r\"\"\"\n", 233 | " // Regex to find the separations of the claims data\n", 234 | " var pattern = new RegExp(/[.][\\\\s]+[0-9]+[\\\\s]*[.]/, 'g');\n", 235 | " if (pattern.test(text)) {\n", 236 | " return text.split(pattern);\n", 237 | " }\n", 238 | "\"\"\"\n", 239 | "\n", 240 | "query = r'''\n", 241 | " #standardSQL\n", 242 | " CREATE TEMPORARY FUNCTION breakout_claims(text STRING) RETURNS ARRAY \n", 243 | " LANGUAGE js AS \"\"\"\n", 244 | " {}\n", 245 | " \"\"\"; \n", 246 | "\n", 247 | " SELECT \n", 248 | " pubs.publication_number, \n", 249 | " title.text as title, \n", 250 | " breakout_claims(claims.text) as claims\n", 251 | " FROM `patents-public-data.patents.publications` as pubs,\n", 252 | " UNNEST(claims_localized) as claims,\n", 253 | " UNNEST(title_localized) as title\n", 254 | " WHERE\n", 255 | " publication_number in {}\n", 256 | "'''.format(js, test_pubs)\n", 257 | "\n", 258 | "df = bq_client.query(query).to_dataframe()" 259 | ], 260 | "execution_count": null, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "metadata": { 266 | "colab": { 267 | "height": 241 268 | }, 269 | "id": "ORcVOefPsT0U", 270 | "executionInfo": { 271 | "status": "ok", 272 | "timestamp": 1614011849900, 273 | "user_tz": 300, 274 | "elapsed": 309, 275 | "user": { 276 | "displayName": "Jay Yonamine", 277 | "photoUrl": "", 278 | "userId": "01949405773282057831" 279 | } 280 | }, 281 | "outputId": "5299f3c1-b64e-4cbd-9206-273d1fb1d300" 282 | }, 283 | "source": [ 284 | "df.head()" 285 | ], 286 | "execution_count": null, 287 | "outputs": [ 288 | { 289 | "output_type": "execute_result", 290 | 
"data": { 291 | "text/html": [ 292 | "
\n", 293 | "\n", 306 | "\n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | "
publication_numbertitleclaims
0US-2009030261-A1Drug delivery system[1 . A drug delivery system comprising:\\n a ca...
1US-2007186831-A1Sewing machine[1 . A sewing machine comprising:\\n a needle b...
2US-8000000-B2Visual prosthesis[1. A visual prosthesis apparatus comprising:\\...
3US-10722718-B2Systems and methods for treatment of dry eye[What is claimed is: \\n \\n 1. A meth...
\n", 342 | "
" 343 | ], 344 | "text/plain": [ 345 | " publication_number ... claims\n", 346 | "0 US-2009030261-A1 ... [1 . A drug delivery system comprising:\\n a ca...\n", 347 | "1 US-2007186831-A1 ... [1 . A sewing machine comprising:\\n a needle b...\n", 348 | "2 US-8000000-B2 ... [1. A visual prosthesis apparatus comprising:\\...\n", 349 | "3 US-10722718-B2 ... [What is claimed is: \\n \\n 1. A meth...\n", 350 | "\n", 351 | "[4 rows x 3 columns]" 352 | ] 353 | }, 354 | "metadata": { 355 | "tags": [] 356 | }, 357 | "execution_count": 8 358 | } 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "metadata": { 364 | "id": "NeFzKlMw1DQd" 365 | }, 366 | "source": [ 367 | "def get_bert_token_input(texts):\n", 368 | " input_ids = []\n", 369 | " input_mask = []\n", 370 | " segment_ids = []\n", 371 | "\n", 372 | " for text in texts:\n", 373 | " tokens = tokenizer.tokenize(text)\n", 374 | " if len(tokens) > MAX_SEQ_LENGTH - 2:\n", 375 | " tokens = tokens[0:(MAX_SEQ_LENGTH - 2)]\n", 376 | " tokens = ['[CLS]'] + tokens + ['[SEP]']\n", 377 | "\n", 378 | "\n", 379 | " ids = tokenizer.convert_tokens_to_ids(tokens)\n", 380 | " token_pad = MAX_SEQ_LENGTH - len(ids)\n", 381 | " input_mask.append([1] * len(ids) + [0] * token_pad)\n", 382 | " input_ids.append(ids + [0] * token_pad)\n", 383 | " segment_ids.append([0] * MAX_SEQ_LENGTH)\n", 384 | " \n", 385 | " return {\n", 386 | " 'segment_ids': tf.convert_to_tensor(segment_ids, dtype=tf.int64),\n", 387 | " 'input_mask': tf.convert_to_tensor(input_mask, dtype=tf.int64),\n", 388 | " 'input_ids': tf.convert_to_tensor(input_ids, dtype=tf.int64),\n", 389 | " 'mlm_positions': tf.convert_to_tensor([], dtype=tf.int64)\n", 390 | " }" 391 | ], 392 | "execution_count": null, 393 | "outputs": [] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "metadata": { 398 | "id": "MlrVU10IOlSZ" 399 | }, 400 | "source": [ 401 | "docs_embeddings = []\n", 402 | "for _, row in df.iterrows():\n", 403 | " inputs = get_bert_token_input(row['claims'])\n", 404 | " response = model(**inputs)\n", 405 | " avg_embeddings = pooling(\n", 406 | " tf.reshape(response['encoder_layer'], shape=[1, -1, 1024]))\n", 407 | " docs_embeddings.append(avg_embeddings.numpy()[0])" 408 | ], 409 | "execution_count": null, 410 | "outputs": [] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "metadata": { 415 | "id": "DhF2-w2yU52U", 416 | "executionInfo": { 417 | "status": "ok", 418 | "timestamp": 1614012215102, 419 | "user_tz": 300, 420 | "elapsed": 240, 421 | "user": { 422 | "displayName": "Jay Yonamine", 423 | "photoUrl": "", 424 | "userId": "01949405773282057831" 425 | } 426 | }, 427 | "outputId": "c6148de6-f1c2-40c3-d75d-90cc0f4e0469" 428 | }, 429 | "source": [ 430 | "pairwise.cosine_similarity(docs_embeddings)" 431 | ], 432 | "execution_count": null, 433 | "outputs": [ 434 | { 435 | "output_type": "execute_result", 436 | "data": { 437 | "text/plain": [ 438 | "array([[0.9999988 , 0.68387157, 0.83200616, 0.86913264],\n", 439 | " [0.68387157, 1.0000013 , 0.7299322 , 0.73105675],\n", 440 | " [0.83200616, 0.7299322 , 0.99999964, 0.9027555 ],\n", 441 | " [0.86913264, 0.73105675, 0.9027555 , 0.9999996 ]], dtype=float32)" 442 | ] 443 | }, 444 | "metadata": { 445 | "tags": [] 446 | }, 447 | "execution_count": 13 448 | } 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "metadata": { 454 | "id": "TFWxL-IGU9-6", 455 | "executionInfo": { 456 | "status": "ok", 457 | "timestamp": 1614012321633, 458 | "user_tz": 300, 459 | "elapsed": 227, 460 | "user": { 461 | "displayName": "Jay Yonamine", 462 | "photoUrl": "", 463 | 
"userId": "01949405773282057831" 464 | } 465 | }, 466 | "outputId": "9fffcf1d-0c2c-4d84-eb8e-847d6054f125" 467 | }, 468 | "source": [ 469 | "docs_embeddings[0].shape" 470 | ], 471 | "execution_count": null, 472 | "outputs": [ 473 | { 474 | "output_type": "execute_result", 475 | "data": { 476 | "text/plain": [ 477 | "(1024,)" 478 | ] 479 | }, 480 | "metadata": { 481 | "tags": [] 482 | }, 483 | "execution_count": 23 484 | } 485 | ] 486 | } 487 | ] 488 | } 489 | -------------------------------------------------------------------------------- /models/BERT for Patents.md: -------------------------------------------------------------------------------- 1 | Copyright 2020 Google Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 8 | 9 | # BERT for Patents 10 | 11 | The BERT exported here has been trained on >100 million patent documents and was trained on all parts of a patent (abstract, claims, description). 12 | 13 | The BERT model exported here comes in two formats: 14 | 15 | * [SavedModel](https://storage.googleapis.com/patents-public-data-github/saved_model.zip) 16 | 17 | * [Checkpoint](https://storage.googleapis.com/patents-public-data-github/checkpoint.zip) 18 | 19 | The models can also be loaded and saved in another format or just the weights can be saved. 20 | 21 | The BERT model has been trained on >100 million patent documents and was trained on all parts of a patent (abstract, claims, description). It has a similar configuration to the BERT-Large model, with a couple of important notes: 22 | 23 | * The maximum input sequence length is 512 tokens and maximum masked words for a sequence is 45. 24 | * The vocabulary has approximately 8000 added words from the standard BERT vocabulary. These represent frequently used patent terms. 25 | * The vocabulary includes "context" tokens indicating what part of a patent the text is from (abstract, claims, summary, invention). Providing context tokens in the examples is optional. 26 | 27 | The full BERT vocabulary can be downloaded [here](https://storage.googleapis.com/patents-public-data-github/bert_for_patents_vocab_39k.txt). The vocabulary also contains 1000 unused tokens so that more tokens can be added. 28 | 29 | The exact configuration for the BERT model is as follows (and downloaded [here](https://storage.googleapis.com/patents-public-data-github/bert_for_patents_large_config.json)): 30 | 31 | * attention_probs_dropout_prob: 0.1 32 | * hidden_act: gelu 33 | * hidden_dropout_prob: 0.1 34 | * hidden_size: 1024 35 | * initializer_range: 0.02 36 | * intermediate_size: 4096 37 | * max_position_embeddings: 512 38 | * num_attention_heads: 16 39 | * num_hidden_layers: 24 40 | * vocab_size: 39859 41 | 42 | The model has requires the following input signatures: 43 | 1. `input_ids` 44 | 2. `input_mask` 45 | 3. `segment_ids` 46 | 4. `mlm_ids` 47 | 48 | And the BERT model contains output signatures for: 49 | 1. `cls_token` 50 | 2. `encoder_layer` is the contextualized word embeddings from the last encoder layer. 51 | 3. 
52 | -------------------------------------------------------------------------------- /models/claim_breadth/README.md: -------------------------------------------------------------------------------- 1 | # Measuring Patent Claim Breadth Using Google Patents Public Data on BigQuery 2 | 3 | The code in this repository is one approach to measuring patent claim breadth 4 | using semi-supervised learning. For more details and background, please see 5 | [this post on the Google Cloud Big Data 6 | Blog.](https://cloud.google.com/blog/big-data/2018/07/measuring-patent-claim-breadth-using-google-patents-public-datasets) 7 | 8 | ## Prerequisites 9 | 10 | This guide assumes access to a Linux-based operating system. Windows users may run into compatibility issues when running the commands below and should consider using a virtual machine. 11 | 12 | #### Set up a Google Cloud project and install the gcloud SDK 13 | 14 | Much of the code in this repository requires access to a Google Cloud Project. 15 | Please set up an account before proceeding. To install gcloud, follow the guide 16 | [here](https://cloud.google.com/sdk/docs/quickstarts). Once it's installed, 17 | set up your SDK to reference your account: 18 | 19 | `gcloud init` 20 | 21 | #### Create a bucket where you'll store relevant data for this project and set some environment variables. 22 | 23 | Each of the steps below relies on Google Cloud Storage for various tasks like 24 | writing logs or output files. You'll need your own bucket to run the steps below. 25 | If you'd like to run only a portion of the steps, you can use our public 26 | bucket gs://patent-claims-data, which includes all the relevant input and output 27 | files. 28 | 29 | ``` 30 | export GCP_PROJECT=`gcloud config get-value project` 31 | export BUCKET=gs://[YOUR BUCKET NAME] 32 | gsutil mb $BUCKET 33 | ``` 34 | 35 | #### Enable the relevant APIs in the GCP console. 36 | 37 | Dataflow and Cloud ML require several APIs to be enabled on your account. 38 | Before running the examples below, you'll need the following two APIs enabled: 39 | 40 | 1. https://console.cloud.google.com/apis/library/dataflow.googleapis.com 41 | 2. https://console.cloud.google.com/apis/library/ml.googleapis.com 42 | 43 | #### Create a service account, download your keys, and set a local environment variable. 44 | 45 | To do this, follow [this 46 | guide](https://cloud.google.com/docs/authentication/getting-started) to set 47 | the GOOGLE_APPLICATION_CREDENTIALS environment variable. 48 | 49 | `export GOOGLE_APPLICATION_CREDENTIALS="[PATH TO DOWNLOADED JSON FILE]"` 50 | 51 | #### Set up a virtual environment and install Python dependencies. 52 | 53 | You'll likely want to work inside a Python virtual environment. 54 | You can set one up with the following commands: 55 | 56 | ``` 57 | virtualenv myenv 58 | source myenv/bin/activate 59 | pip install -r requirements.txt 60 | ``` 61 | 62 | ## A few sample commands 63 | 64 | Below are a handful of sample commands that can be used as a reference on how to 65 | run the scripts in this repository. For more information, see the blog post mentioned 66 | above. Please note that all of the commands below will incur charges on your GCP 67 | account. Most of the commands can be run for less than a dollar at current 68 | prices, but hyperparameter tuning can easily become very expensive if you run 69 | many trials.
Consider setting [billing alerts and 70 | limits](https://cloud.google.com/billing/docs/how-to/budgets) before running any 71 | of the commands below. 72 | 73 | ### To run preprocessing pipeline and produce 1.4m training examples. 74 | 75 | ``` 76 | export OUTPUT_PATH="$BUCKET/training-data/" 77 | python preprocess.py \ 78 | --output_path=$OUTPUT_PATH \ 79 | --project=$GCP_PROJECT \ 80 | --runner=DataflowRunner \ 81 | --pipeline_mode=train \ 82 | --query_kep_pct=0.6 \ 83 | --cpc_code_list='D,E,F,G,H' 84 | ``` 85 | 86 | ### To run a local training job for a few steps to ensure your model trains. 87 | 88 | #### First, set up a vocab file for an embedding column in the model 89 | 90 | The model has an embedding column which is designed to embed CPC codes at the 4 91 | digit level which allows the model to learn differences in feature impact across 92 | technologies (i.e. a claim of the same length might be narrower in one subspace 93 | than in another.) 94 | 95 | To generate a vocab file, the simplest way is to run a query against the Google 96 | Patents Public Data on BigQuery and save the output to a text file which we put 97 | on GCP storage. To do this follow the commands below: 98 | 99 | ``` 100 | # Execute a query from the command line and pipe output to text file. 101 | bq --project=$GCP_PROJECT query --max_rows=100000 --format=csv "$(cat generate_embedding_vocab.sql)" > ./cpc_embedding_vocab.txt 102 | # Strip header and blank lines. 103 | sed -i '2 d' cpc_embedding_vocab.txt 104 | sed -i '/^\s*$/d' cpc_embedding_vocab.txt 105 | # Copy to GCS for use in training and remove local copy. 106 | gsutil cp ./cpc_embedding_vocab.txt $BUCKET 107 | rm ./cpc_embedding_vocab.txt 108 | ``` 109 | 110 | #### Launch the local training job. 111 | 112 | ``` 113 | export CPC_EMBEDDING_VOCAB_FILE="$BUCKET/cpc_embedding_vocab.txt" 114 | export GCS_TRAIN_FILES="$BUCKET/training-data/claim-data-train*.tfrecord.gz" 115 | export GCS_EVAL_FILES="$BUCKET/training-data/claim-data-eval*.tfrecord.gz" 116 | gcloud ml-engine local train \ 117 | --package-path trainer \ 118 | --module-name trainer.task \ 119 | --job-dir './test' \ 120 | -- --train-files $GCS_TRAIN_FILES \ 121 | --eval-files $GCS_EVAL_FILES \ 122 | --cpc-embedding-vocab-file $CPC_EMBEDDING_VOCAB_FILE \ 123 | --train-steps 100 \ 124 | --train-batch-size=10 \ 125 | --eval-batch-size=10 126 | ``` 127 | 128 | ### To run Hyperparameter Tuning and select the best model parameters (CAN BE EXPENSIVE). 129 | 130 | Note - running this command can incur significant charges due to the number of 131 | trials running. Make sure you have billing alerts and budgets set up to avoid 132 | unexpected charges. 133 | 134 | ``` 135 | export JOB_NAME=tuning_$(date +"%s") 136 | export GCS_JOB_DIR="$BUCKET/hptuning/$JOB_NAME" 137 | 138 | gcloud ml-engine jobs submit training $JOB_NAME \ 139 | --config hptuning_config.yaml \ 140 | --runtime-version 1.6 \ 141 | --job-dir $GCS_JOB_DIR \ 142 | --module-name trainer.task \ 143 | --package-path trainer/ \ 144 | --region us-central1 \ 145 | -- --train-steps 50000 \ 146 | --train-files $GCS_TRAIN_FILES \ 147 | --eval-files $GCS_EVAL_FILES \ 148 | --cpc-embedding-vocab-file $CPC_EMBEDDING_VOCAB_FILE 149 | ``` 150 | 151 | ### To run a cloud training job for 30000 steps with the default Hparams. 
152 | 153 | ``` 154 | export JOB_NAME=patent_claims_$(date +"%s") 155 | export GCS_JOB_DIR="$BUCKET/models/$JOB_NAME" 156 | 157 | gcloud ml-engine jobs submit training $JOB_NAME \ 158 | --scale-tier STANDARD_1 \ 159 | --runtime-version 1.6 \ 160 | --job-dir $GCS_JOB_DIR \ 161 | --module-name trainer.task \ 162 | --package-path trainer/ \ 163 | --region us-central1 \ 164 | -- --train-steps 30000 \ 165 | --train-files $GCS_TRAIN_FILES \ 166 | --eval-files $GCS_EVAL_FILES \ 167 | --cpc-embedding-vocab-file $CPC_EMBEDDING_VOCAB_FILE 168 | ``` 169 | 170 | While your training job is running, logs will be written to GCS and you can 171 | monitor progress with tensorboard using the command below. Note, because you're 172 | fetching logs from GCS - there is some latency between starting tensorboard and 173 | seeing results. 174 | 175 | `tensorboard --logdir $GCS_JOB_DIR` 176 | 177 | ### To run preprocessing pipeline and produce input data to run inference on all pubs after 1995 in a D, E, F, G, or H class code: 178 | 179 | ``` 180 | export OUTPUT_PATH="$BUCKET/inference-data" 181 | python preprocess.py \ 182 | --output_path=$OUTPUT_PATH \ 183 | --project=$GCP_PROJECT \ 184 | --runner=DataflowRunner \ 185 | --pipeline_mode=inference \ 186 | --cpc_code_list='D,E,F,G,H' 187 | ``` 188 | 189 | ### Set up Your Model on Cloud ML 190 | 191 | In a previous step, we trained a model and saved the final model to GCS. In the 192 | next step, we'll use this model for batch inference by leveraging GCP's Cloud 193 | ML. To use this service, we need to configure a model for online inference. To 194 | read more about this, see [this 195 | doc](https://cloud.google.com/ml-engine/docs/tensorflow/prediction-overview). 196 | 197 | If you've been following along so far, the following commands will grab the 198 | trained model files from GCP and set up a model version on cloud ML: 199 | 200 | ``` 201 | export MODEL_NAME=patent_claims 202 | export VERSION='v1' 203 | export SAVED_MODEL=`gsutil ls -d "$GCS_JOB_DIR/export/model/[0-9]*/"` 204 | gcloud ml-engine models create $MODEL_NAME 205 | gcloud ml-engine versions create $VERSION --model $MODEL_NAME --origin $SAVED_MODEL --runtime-version=1.4 206 | export MODEL_VERSION_STR="$MODEL_NAME/versions/$VERSION" 207 | ``` 208 | 209 | ### Run batch inference against all US Pubs in a D, E, F, G, or H class code. 210 | 211 | Now that we have a model ready for predictions, we can run batch inference. Note 212 | that the number of workers will affect how many requests are made against your 213 | model's API. 214 | 215 | ``` 216 | export OUTPUT_PATH="$BUCKET/scored" 217 | export INPUT_FILE_PATTERN="$BUCKET/inference-data/*.tfrecord.gz" 218 | python ./batch_inference.py \ 219 | --model_version_str=$MODEL_VERSION_STR \ 220 | --input_file_pattern=$INPUT_FILE_PATTERN \ 221 | --output_path=$OUTPUT_PATH \ 222 | --num_workers=5 \ 223 | --project=$GCP_PROJECT \ 224 | --write_to_bigquery=True \ 225 | --output_dataset='sandbox' \ 226 | --output_table='claim_scores' \ 227 | --runner=DataflowRunner 228 | ``` 229 | 230 | ## Useful Links 231 | 232 | The following links are helpful resources for understanding concepts covered in 233 | this repository. 
234 | 235 | - [Apache Beam programming 236 | guide](https://beam.apache.org/documentation/programming-guide/) 237 | - [Detailed overview of using estimators to train a model locally and on 238 | GCP.](https://github.com/amygdala/code-snippets/blob/master/ml/census_train_and_eval/using_tf.estimator.train_and_evaluate.ipynb) 239 | - [Overview of hyperparameter 240 | tuning](https://cloud.google.com/ml-engine/docs/tensorflow/hyperparameter-tuning-overview) 241 | -------------------------------------------------------------------------------- /models/claim_breadth/batch_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 11 | """A batch inference script to score a set of Patent publications.""" 12 | import argparse 13 | import datetime 14 | import logging 15 | import os 16 | import sys 17 | import apache_beam as beam 18 | from apache_beam.metrics import Metrics 19 | from apache_beam.options.pipeline_options import PipelineOptions 20 | from googleapiclient.discovery import build 21 | import tensorflow as tf 22 | 23 | NOW = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') 24 | FEATURE_NAMES = [ 25 | 'word_cnt', 'word_cnt_unique', 'char_cnt', 'char_cnt_unique', 26 | 'limiting_words_cnt', 'digits_or_decimal_cnt', 'atleastoneofand_cnt', 27 | 'atleastoneofor_cnt', 'counting_cnt', 'excluding_words_cnt', 28 | 'groupconsistingof_cnt', 'element_cnt', 'adding_words_cnt', 29 | ] 30 | 31 | 32 | def default_args(argv): 33 | """Provides default values for Workflow flags.""" 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument( 36 | '--model_version_str', 37 | required=True, 38 | type=str, 39 | help='Path to ML Engine model like `MODEL_NAME/versions/VERSION`') 40 | parser.add_argument( 41 | '--input_file_pattern', 42 | required=True, 43 | type=str, 44 | help='Glob style file pattern to use for selecting input files.') 45 | parser.add_argument( 46 | '--output_path', 47 | required=True, 48 | help='Output directory to write results to if write_to_bigquery is false.' 49 | 'for DataflowRunner use a GCS bucket.') 50 | parser.add_argument( 51 | '--output_prefix', 52 | default='us_patent_claim_scores', 53 | help='Prefix to use on sharded output files.') 54 | parser.add_argument( 55 | '--write_to_bigquery', 56 | default=False, 57 | help='If `True` output will be written directly to a bigquery table as' 58 | 'specified by args.output_dataset args.output_table.' 59 | ) 60 | parser.add_argument( 61 | '--output_dataset', 62 | help='Bigquery Dataset where output should be written if' 63 | 'write_to_bigquery is true. Will be ignored otherwise.' 64 | ) 65 | parser.add_argument( 66 | '--output_table', 67 | help='Bigquery Table name where output should be written if' 68 | 'write_to_bigquery is true. Will be ignored otherwise.' 
69 | ) 70 | parser.add_argument( 71 | '--output_shards', 72 | default=10, 73 | help='Number of shards to write in output_path.') 74 | parser.add_argument( 75 | '--job_name', 76 | type=str, 77 | default='patent-claims-inference' + NOW, 78 | help='A unique job identifier.') 79 | parser.add_argument( 80 | '--num_workers', 81 | default=5, 82 | type=int, 83 | help='The max number of workers to use.') 84 | parser.add_argument( 85 | '--autoscaling_algorithm', 86 | default='NONE', 87 | help='Options are `NONE` or `THROUGHPUT_BASED`. Use None to prevent GCP' 88 | 'from scaling down to 1 worker due to API throughput.' 89 | ) 90 | parser.add_argument( 91 | '--runner', 92 | default='DirectRunner', 93 | choices=['DataflowRunner', 'DirectRunner'], 94 | help='Option to run locally or on GCP, for other options see Beam docs.') 95 | parser.add_argument( 96 | '--project', 97 | type=str, 98 | help='The cloud project name to be used for running this pipeline with' 99 | 'the DataflowRunner option') 100 | 101 | parsed_args, _ = parser.parse_known_args(argv) 102 | 103 | if parsed_args.runner == 'DataflowRunner': 104 | if not parsed_args.project: 105 | msg = 'If running with DataflowRunner please provide a GCP project.' 106 | raise argparse.ArgumentTypeError(msg) 107 | 108 | # Check the output flags when writing to BigQuery. 109 | if parsed_args.write_to_bigquery: 110 | if not parsed_args.output_dataset: 111 | msg = ('When writing to Bigquery, you must specify --output_dataset and ' 112 | '--output_table flags.') 113 | raise argparse.ArgumentTypeError(msg) 114 | 115 | # Setup some additional flags required with DataflowRunner. 116 | # These can be overridden via the command line. 117 | default_cloud_values = { 118 | 'temp_location': os.path.join(parsed_args.output_path, 'tmp'), 119 | 'staging_location': os.path.join(parsed_args.output_path, 'stg'), 120 | 'save_main_session': True, 121 | } 122 | 123 | for kk, vv in default_cloud_values.iteritems(): 124 | if kk not in parsed_args or not vars(parsed_args)[kk]: 125 | vars(parsed_args)[kk] = vv 126 | 127 | return parsed_args 128 | 129 | 130 | def get_tf_feature(proto, feature_name, feature_type='float_list'): 131 | """Helper method to retrieve named features from a TF example proto.""" 132 | return getattr(proto.features.feature[feature_name], feature_type).value[0] 133 | 134 | 135 | class RunInference(beam.DoFn): 136 | """Loads saved model and scores inputs.""" 137 | 138 | def __init__(self, model_endpoint): 139 | self.success_cnt = Metrics.counter('main', 'inference_success') 140 | self.model_endpoint = model_endpoint 141 | self.ml_service = build('ml', 'v1') 142 | 143 | def process(self, element): 144 | """Scores the model using the TF Example input.""" 145 | ex = tf.train.Example.FromString(element) 146 | instance = {ftr: get_tf_feature(ex, ftr) for ftr in FEATURE_NAMES} 147 | instance['cpc4'] = get_tf_feature(ex, 'cpc4', 'bytes_list') 148 | 149 | response = self.ml_service.projects().predict( 150 | name=self.model_endpoint, 151 | body={'instances': [instance]} 152 | ).execute() 153 | 154 | broad_score = response['predictions'][0]['probabilities'][1] 155 | 156 | # Pull the publication number from the TF Example proto. 
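# (This is a bytes feature; values look like b'US-1234567-B2'.)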
157 | pub_number = ex.features.feature['publication_number'].bytes_list.value[0] 158 | self.success_cnt.inc() 159 | yield {'publication_number': pub_number, 'broad_score': float(broad_score)} 160 | 161 | 162 | def format_output(element): 163 | """Converts dictionary element into a CSV style output.""" 164 | pub_number = element.get('publication_number') 165 | broad_score = element.get('broad_score') 166 | return '{0},{1:05f}'.format(pub_number, broad_score) 167 | 168 | 169 | def main(argv, await_completion=False): 170 | """Runs the batch inference pipeline.""" 171 | opt = default_args(argv) 172 | logging.info('Starting pipeline with args: %s', vars(opt)) 173 | pipeline_options = PipelineOptions().from_dictionary(vars(opt)) 174 | p = beam.Pipeline(options=pipeline_options) 175 | output_base = os.path.join(opt.output_path, opt.output_prefix) 176 | model_endpoint = 'projects/{}/models/{}'.format( 177 | opt.project, opt.model_version_str) 178 | data = (p 179 | | 'ReadTFRecords' >> beam.io.ReadFromTFRecord(opt.input_file_pattern) 180 | | 'RunInference' >> beam.ParDo(RunInference(model_endpoint)) 181 | ) 182 | 183 | if opt.write_to_bigquery: 184 | _ = data | 'WriteToBigquery' >> beam.io.gcp.bigquery.WriteToBigQuery( 185 | table=opt.output_table, 186 | dataset=opt.output_dataset, 187 | project=opt.project, 188 | write_disposition=beam.io.gcp.bigquery.BigQueryDisposition.WRITE_APPEND, 189 | schema='publication_number:STRING,broad_score:FLOAT' 190 | ) 191 | else: 192 | # Format a CSV style output and write to text. 193 | formatted = data | 'FormatTextOutput' >> beam.Map(format_output) 194 | _ = formatted | 'WriteToText' >> beam.io.WriteToText( 195 | file_path_prefix=output_base, 196 | num_shards=int(opt.output_shards) 197 | ) 198 | 199 | result = p.run() 200 | print('Pipeline running. visit https://console.cloud.google.com/dataflow to ' 201 | 'monitor progress.') 202 | if await_completion: 203 | result.wait_until_finish() 204 | return result 205 | 206 | 207 | if __name__ == '__main__': 208 | main(sys.argv[1:]) 209 | -------------------------------------------------------------------------------- /models/claim_breadth/batch_inference_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 11 | """End-to-end test for the patent claim model batch inference script.""" 12 | import logging 13 | import os 14 | import shutil 15 | import time 16 | import unittest 17 | from apache_beam.metrics.metric import MetricsFilter 18 | from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher 19 | from apache_beam.testing.test_pipeline import TestPipeline 20 | import batch_inference 21 | from hamcrest.core.core.allof import all_of 22 | from nose.plugins.attrib import attr 23 | 24 | # Assumes your project and model versions are set as ENV variables. See README. 
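# For example (placeholder values):
#   export GCP_PROJECT=my-gcp-project
#   export MODEL_VERSION_STR=patent_claims/versions/v1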
25 | PROJECT = os.environ['GCP_PROJECT'] 26 | MODEL_VERSION_STR = os.environ['MODEL_VERSION_STR'] 27 | 28 | 29 | def get_pipeline_metric(pipeline_results, metric_name, index=0, 30 | result_type='counters'): 31 | """Attempts to return a metrics from an Apache Beam PipelineResults.""" 32 | metrics_filter = MetricsFilter().with_name(metric_name) 33 | query_result = pipeline_results.metrics().query(metrics_filter) 34 | try: 35 | return query_result[result_type][index].committed 36 | except IndexError: 37 | logging.info( 38 | 'No key in metrics for %s at index %s, returning 0', metric_name, index) 39 | return 0 40 | 41 | 42 | class BatchInferenceE2E(unittest.TestCase): 43 | _multiprocess_can_split_ = True 44 | OUTPUT_DIR = os.getcwd() 45 | TEST_DATA_GLOB = os.path.join(OUTPUT_DIR, 'testdata', '*.tfrecord.gz') 46 | TOTAL_RECORDS_IN_TEST_DATA = 17 47 | 48 | @attr('IT') 49 | def test_text_file_output(self): 50 | test_pipeline = TestPipeline() 51 | # Checks that pipeline reaches state "Done" 52 | pipeline_verifiers = [PipelineStateMatcher()] 53 | 54 | # Set extra options to the pipeline for test purpose 55 | test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time()))) 56 | extra_opts = { 57 | 'project': PROJECT, 58 | 'model_version_str': MODEL_VERSION_STR, 59 | 'input_file_pattern': self.TEST_DATA_GLOB, 60 | 'output_path': test_dir, 61 | 'runner': 'DirectRunner', 62 | 'output_shards': 1, 63 | 'on_success_matcher': all_of(*pipeline_verifiers), 64 | } 65 | 66 | # Add cleanup for testdir 67 | self.addCleanup(shutil.rmtree, test_dir) 68 | 69 | result = batch_inference.main( 70 | test_pipeline.get_full_options_as_args(**extra_opts), 71 | await_completion=True 72 | ) 73 | 74 | records_scored = get_pipeline_metric(result, 'inference_success') 75 | self.assertEqual(records_scored, self.TOTAL_RECORDS_IN_TEST_DATA) 76 | 77 | 78 | if __name__ == '__main__': 79 | logging.getLogger().setLevel(logging.DEBUG) 80 | logging.info('Running with MODEL: %s', MODEL_VERSION_STR) 81 | unittest.main() 82 | -------------------------------------------------------------------------------- /models/claim_breadth/generate_embedding_vocab.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | SUBSTR(cpc.code, 1, 4) cpc4 # Trim CPC Code to first 4 digits. 
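# e.g. SUBSTR('G06F17/30', 1, 4) = 'G06F'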
4 | FROM 5 | `patents-public-data.patents.publications`, 6 | UNNEST(cpc) AS cpc 7 | WHERE 8 | country_code = 'US' 9 | AND SUBSTR(cpc.code, 1, 1) IN ('D', 'E', 'G', 'H') 10 | AND FLOOR(priority_date / 10000) > 1995 11 | GROUP BY 12 | 1 13 | HAVING 14 | COUNT(publication_number) > 3000 15 | -------------------------------------------------------------------------------- /models/claim_breadth/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | scaleTier: BASIC_GPU 3 | hyperparameters: 4 | enableTrialEarlyStopping: True 5 | hyperparameterMetricTag: auc 6 | maxTrials: 60 7 | maxParallelTrials: 2 8 | params: 9 | - parameterName: dropout 10 | type: DOUBLE 11 | minValue: 0.2 12 | maxValue: 0.6 13 | scaleType: UNIT_LINEAR_SCALE 14 | - parameterName: learning-rate 15 | type: DOUBLE 16 | minValue: 0.0001 17 | maxValue: 0.01 18 | scaleType: UNIT_REVERSE_LOG_SCALE 19 | - parameterName: first-layer-size 20 | type: INTEGER 21 | minValue: 256 22 | maxValue: 8192 23 | scaleType: UNIT_LINEAR_SCALE 24 | - parameterName: num-layers 25 | type: INTEGER 26 | minValue: 1 27 | maxValue: 10 28 | scaleType: UNIT_LINEAR_SCALE 29 | - parameterName: scale-factor 30 | type: DOUBLE 31 | minValue: 0.3 32 | maxValue: 0.99 33 | scaleType: UNIT_LINEAR_SCALE 34 | - parameterName: cpc-embedding-dim 35 | type: INTEGER 36 | minValue: 5 37 | maxValue: 100 38 | scaleType: UNIT_LINEAR_SCALE 39 | -------------------------------------------------------------------------------- /models/claim_breadth/preprocess_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 11 | 12 | """End-to-end test for the patent claim breadth model preprocessing code.""" 13 | import logging 14 | import os 15 | import shutil 16 | import time 17 | import unittest 18 | from apache_beam.metrics.metric import MetricsFilter 19 | from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher 20 | from apache_beam.testing.test_pipeline import TestPipeline 21 | from hamcrest.core.core.allof import all_of 22 | from nose.plugins.attrib import attr 23 | import preprocess 24 | import tensorflow as tf 25 | 26 | # Assumes you've set an environmental variable for your GCP project. See README. 
27 | PROJECT = os.environ['GCP_PROJECT'] 28 | 29 | 30 | def read_example_proto(test_dir): 31 | filenames = tf.gfile.Glob(os.path.join(test_dir, '*.tfrecord.gz')) 32 | tf_opt = tf.python_io.TFRecordOptions( 33 | tf.python_io.TFRecordCompressionType.GZIP) 34 | record = next(tf.python_io.tf_record_iterator(filenames[0], options=tf_opt)) 35 | example = tf.train.Example() 36 | example.ParseFromString(record) 37 | return example 38 | 39 | 40 | def get_pipeline_metric(results, metric_name, index=0, result_type='counters'): 41 | metric_filter = MetricsFilter().with_name(metric_name) 42 | query_result = results.metrics().query(metric_filter) 43 | try: 44 | return query_result[result_type][index].committed 45 | except IndexError: 46 | logging.info( 47 | 'No key in metrics for %s at index %s, returning 0', metric_name, index) 48 | return 0 49 | 50 | 51 | def get_tf_feature(proto, feature_name, feature_type='float_list'): 52 | """Helper method to retrieve named features from a TF example proto.""" 53 | return getattr(proto.features.feature[feature_name], feature_type).value[0] 54 | 55 | 56 | def get_test_query(max_records): 57 | return ''' 58 | #standardSQL 59 | with fake_applications as ( 60 | SELECT 61 | 'US-1234567-A1' as publication_number, 62 | substr(claims.text, 0, 2000) as fullclaim, 63 | 2000 as priority_yr, 64 | 'C08F' as cpc4, 65 | 2003 as median_priority_yr 66 | FROM `patents-public-data.patents.publications` p 67 | ,UNNEST(claims_localized) claims 68 | WHERE claims.language = 'en' 69 | AND country_code = 'US' 70 | AND claims.text is not null 71 | AND FLOOR(priority_date / 10000) > 2005 72 | limit {half_max} 73 | ) 74 | 75 | , fake_issued as ( 76 | SELECT 77 | 'US-1234567-B2' as publication_number, 78 | substr(claims.text, 0, 2000) as fullclaim, 79 | 2012 as priority_yr, 80 | 'C08F' as cpc4, 81 | 2003 as median_priority_yr 82 | FROM `patents-public-data.patents.publications` p 83 | ,UNNEST(claims_localized) claims 84 | WHERE claims.language = 'en' 85 | AND country_code = 'US' 86 | AND claims.text is not null 87 | AND FLOOR(priority_date / 10000) > 2005 88 | limit {half_max} 89 | ) 90 | 91 | select * from fake_applications 92 | union all 93 | select * from fake_issued 94 | '''.format(half_max=(max_records // 2)) 95 | 96 | 97 | class PreProcessE2E(unittest.TestCase): 98 | # Enable nose tests running in parallel 99 | _multiprocess_can_split_ = True 100 | OUTPUT_DIR = os.getcwd() 101 | TOTAL_RECORDS = 500 102 | TEST_QUERY = get_test_query(TOTAL_RECORDS) 103 | 104 | @attr('IT') 105 | def test_train_mode(self): 106 | """Runs pipeline in train mode outputting train, test and eval filesets.""" 107 | test_pipeline = TestPipeline() 108 | # Set extra options to the pipeline for test purpose 109 | test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time()))) 110 | self.addCleanup(shutil.rmtree, test_dir) 111 | 112 | # Checks that pipeline reaches state "Done" 113 | pipeline_verifiers = [PipelineStateMatcher()] 114 | extra_opts = { 115 | 'project': PROJECT, 116 | 'output_path': test_dir, 117 | 'on_success_matcher': all_of(*pipeline_verifiers), 118 | 'runner': 'DirectRunner', 119 | } 120 | 121 | res = preprocess.main( 122 | test_pipeline.get_full_options_as_args(**extra_opts), 123 | query=self.TEST_QUERY, 124 | await_completion=True 125 | ) 126 | 127 | # Check counts coming out of GetFirstClaim step. 
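# The pipeline bumps the 'parse_firstclaim_success' counter once per record whose first claim is extracted, so in train mode it should equal the number of input rows.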
128 | parse_first_claim_cnt = get_pipeline_metric(res, 'parse_firstclaim_success') 129 | self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt) 130 | 131 | # Check counts coming out of AddFeatures step. 132 | add_features_cnt = get_pipeline_metric(res, 'create_features_success') 133 | self.assertEqual(self.TOTAL_RECORDS, add_features_cnt) 134 | 135 | # Check counts coming out of AddLabel step. 136 | broad_cnt = get_pipeline_metric(res, 'add_label_broad') 137 | narrow_cnt = get_pipeline_metric(res, 'add_label_narrow') 138 | self.assertEqual(self.TOTAL_RECORDS, broad_cnt + narrow_cnt) 139 | 140 | # Check if the number of records coming out of Train/Test = limit step. 141 | splits = ['train_cnt', 'eval_cnt', 'test_cnt'] 142 | train_test_split_cnt = sum( 143 | [get_pipeline_metric(res, m) for m in splits] 144 | ) 145 | self.assertEqual(self.TOTAL_RECORDS, train_test_split_cnt) 146 | 147 | # Check if number of protos created matched output of train/test split. 148 | create_proto_success = sum( 149 | [get_pipeline_metric(res, 'create_proto_success', index=i) 150 | for i in range(3)] 151 | ) 152 | self.assertEqual(self.TOTAL_RECORDS, create_proto_success) 153 | 154 | # Open a tf Example and check fields. 155 | example = read_example_proto(test_dir) 156 | for feature_name in preprocess.FEATURE_NAMES: 157 | self.assertGreaterEqual(get_tf_feature(example, feature_name), 0) 158 | # Make sure label feature is present. 159 | labels = ['broad', 'narrow'] 160 | self.assertIn(get_tf_feature(example, 'label', 'bytes_list'), labels) 161 | 162 | @attr('IT') 163 | def test_inference_mode(self): 164 | """Runs a pipeline in inference mode which should output one fileset.""" 165 | test_pipeline = TestPipeline() 166 | # Set extra options to the pipeline for test purpose 167 | test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time()))) 168 | self.addCleanup(shutil.rmtree, test_dir) 169 | 170 | # Checks that pipeline reaches state "Done" 171 | pipeline_verifiers = [PipelineStateMatcher()] 172 | extra_opts = { 173 | 'project': PROJECT, 174 | 'output_path': test_dir, 175 | 'on_success_matcher': all_of(*pipeline_verifiers), 176 | 'runner': 'DirectRunner', 177 | 'pipeline_mode': 'inference', 178 | } 179 | 180 | res = preprocess.main( 181 | test_pipeline.get_full_options_as_args(**extra_opts), 182 | query=self.TEST_QUERY, 183 | await_completion=True 184 | ) 185 | 186 | # Check counts coming out of GetFirstClaim step. 187 | parse_first_claim_cnt = get_pipeline_metric(res, 'parse_firstclaim_success') 188 | self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt) 189 | 190 | # Ensure a proto is created for all input records 191 | create_proto_success = get_pipeline_metric(res, 'create_proto_success') 192 | self.assertEqual(self.TOTAL_RECORDS, create_proto_success) 193 | 194 | # Open a tf Example and check fields. 195 | example = read_example_proto(test_dir) 196 | for feature_name in preprocess.FEATURE_NAMES: 197 | self.assertGreaterEqual(get_tf_feature(example, feature_name), 0) 198 | 199 | # Make sure label feature is not present since we are in inference. 
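# get_tf_feature reads value[0] of the named feature, so a feature that was never written surfaces as an IndexError rather than an empty value.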
200 | with self.assertRaises(IndexError): 201 | get_tf_feature(example, 'label', 'bytes_list') 202 | 203 | 204 | if __name__ == '__main__': 205 | logging.getLogger().setLevel(logging.DEBUG) 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /models/claim_breadth/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.1.13 2 | apache-beam==2.4.0 3 | astor==0.6.2 4 | avro==1.8.2 5 | backports.weakref==1.0.post1 6 | bleach==1.5.0 7 | cachetools==2.0.1 8 | certifi==2018.1.18 9 | chardet==3.0.4 10 | crcmod==1.7 11 | dill==0.2.6 12 | docopt==0.6.2 13 | enum34==1.1.6 14 | fasteners==0.14.1 15 | funcsigs==1.0.2 16 | future==0.16.0 17 | futures==3.2.0 18 | gapic-google-cloud-pubsub-v1==0.15.4 19 | gast==0.2.0 20 | google-apitools==0.5.20 21 | google-api-python-client==1.6.7 22 | google-auth==1.4.1 23 | google-auth-httplib2==0.0.3 24 | google-cloud-bigquery==0.25.0 25 | google-cloud-core==0.25.0 26 | google-cloud-pubsub==0.26.0 27 | google-gax==0.15.16 28 | googleapis-common-protos==1.5.3 29 | googledatastore==7.0.1 30 | grpc-google-iam-v1==0.11.4 31 | grpcio==1.10.1 32 | hdfs==2.1.0 33 | html5lib==0.9999999 34 | httplib2==0.9.2 35 | idna==2.6 36 | Markdown==2.6.11 37 | mock==2.0.0 38 | monotonic==1.4 39 | nose==1.3.7 40 | numpy==1.14.2 41 | oauth2client==4.1.2 42 | pbr==4.0.2 43 | ply==3.8 44 | proto-google-cloud-datastore-v1==0.90.4 45 | proto-google-cloud-pubsub-v1==0.15.4 46 | protobuf==3.5.2.post1 47 | pyasn1==0.4.2 48 | pyasn1-modules==0.2.1 49 | PyHamcrest==1.9.0 50 | PyVCF==0.6.8 51 | PyYAML==3.12 52 | requests==2.18.4 53 | rsa==3.4.2 54 | six==1.11.0 55 | tensorboard==1.7.0 56 | tensorflow==1.7.0 57 | termcolor==1.1.0 58 | typing==3.6.4 59 | uritemplate==3.0.0 60 | urllib3==1.22 61 | Werkzeug==0.14.1 62 | -------------------------------------------------------------------------------- /models/claim_breadth/testdata/example-output-from-preprocess-step.tfrecord.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/claim_breadth/testdata/example-output-from-preprocess-step.tfrecord.gz -------------------------------------------------------------------------------- /models/claim_breadth/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/claim_breadth/trainer/__init__.py -------------------------------------------------------------------------------- /models/claim_breadth/trainer/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 
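# The helpers below are wired together by trainer/task.py: input_fn feeds the TrainSpec/EvalSpec pair and build_serving_fn is exported for online prediction (see the FinalExporter in task.py).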
11 | """Model definition for the patent claim breadth model.""" 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | import tensorflow as tf 16 | 17 | # Count features created in ../preprocess.py 18 | FEATURE_NAMES = [ 19 | 'word_cnt', 'word_cnt_unique', 'char_cnt', 'char_cnt_unique', 20 | 'limiting_words_cnt', 'digits_or_decimal_cnt', 'atleastoneofand_cnt', 21 | 'atleastoneofor_cnt', 'counting_cnt', 'excluding_words_cnt', 22 | 'groupconsistingof_cnt', 'element_cnt', 'adding_words_cnt', 23 | ] 24 | 25 | 26 | def build_input_columns(embedding_dim, embedding_vocab_file): 27 | """Builds input columns for use with Tensorflow Estimator.""" 28 | categorical = tf.feature_column.categorical_column_with_vocabulary_file( 29 | key='cpc4', 30 | vocabulary_file=embedding_vocab_file, 31 | num_oov_buckets=1, 32 | ) 33 | cpc_embedding = tf.feature_column.embedding_column( 34 | categorical_column=categorical, 35 | dimension=embedding_dim 36 | ) 37 | numeric_columns = [tf.feature_column.numeric_column(k) for k in FEATURE_NAMES] 38 | return [cpc_embedding] + numeric_columns 39 | 40 | 41 | def build_estimator(config, hidden_units=None, learning_rate=0.001, dropout=0.1, 42 | embedding_vocab_file=None, embedding_dim=25): 43 | """Builds an estimator for predicting patent claim complex.""" 44 | input_columns = build_input_columns(embedding_dim, embedding_vocab_file) 45 | return tf.estimator.DNNClassifier( 46 | config=config, 47 | feature_columns=input_columns, 48 | hidden_units=hidden_units or [512, 256, 128], 49 | optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=learning_rate), 50 | dropout=dropout 51 | ) 52 | 53 | 54 | def build_serving_fn(): 55 | """Builds serving function based on Hparams.""" 56 | def _json_serving_input_fn(): 57 | inputs = {} 58 | for feat in FEATURE_NAMES: 59 | inputs[feat] = tf.placeholder(shape=[None], dtype=tf.float32) 60 | inputs['cpc4'] = tf.placeholder(shape=[None], dtype=tf.string) 61 | return tf.estimator.export.ServingInputReceiver(inputs, inputs) 62 | 63 | return _json_serving_input_fn 64 | 65 | 66 | def input_fn(filespec, batch_size, num_epochs=None, shuffle=True): 67 | """Builds a TensorFlow input function for use with our model.""" 68 | def _parse_example(example): 69 | """Parses a TF example protobuffer.""" 70 | feature_spec = { 71 | 'label': tf.FixedLenFeature([], tf.string), 72 | 'cpc4': tf.FixedLenFeature([], tf.string), 73 | 'publication_number': tf.FixedLenFeature([], tf.string), 74 | } 75 | for f in FEATURE_NAMES: 76 | feature_spec[f] = tf.FixedLenFeature([], tf.float32) 77 | features = tf.parse_single_example(example, feature_spec) 78 | labels = tf.to_int32(tf.equal(features.pop('label'), 'broad')) 79 | 80 | return features, labels 81 | 82 | filenames = tf.gfile.Glob(filespec) 83 | dataset = tf.data.TFRecordDataset(filenames, compression_type='GZIP') 84 | dataset = dataset.map(_parse_example) 85 | dataset = dataset.prefetch(batch_size * 5) 86 | dataset = dataset.batch(batch_size).repeat(num_epochs) 87 | if shuffle: 88 | dataset = dataset.shuffle(batch_size * 5) 89 | 90 | iterator = dataset.make_one_shot_iterator() 91 | batch_features, batch_labels = iterator.get_next() 92 | return batch_features, batch_labels 93 | -------------------------------------------------------------------------------- /models/claim_breadth/trainer/task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 
Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 11 | """Experiment definition for the patent claim breadth model.""" 12 | import argparse 13 | import tensorflow as tf 14 | from tensorflow.contrib.training.python.training import hparam 15 | import trainer.model as model 16 | 17 | 18 | def parse_args(): 19 | """Parses command line arguments.""" 20 | parser = argparse.ArgumentParser() 21 | # Input Arguments 22 | parser.add_argument( 23 | '--train-files', 24 | help='GCS or local paths to training data', 25 | nargs='+', 26 | required=True 27 | ) 28 | parser.add_argument( 29 | '--eval-files', 30 | help='GCS or local paths to evaluation data', 31 | nargs='+', 32 | required=True 33 | ) 34 | parser.add_argument( 35 | '--job-dir', 36 | help='GCS location to write checkpoints and export models', 37 | required=True, 38 | ) 39 | 40 | # Training arguments - hparams which can be tuned. 41 | parser.add_argument( 42 | '--dropout', 43 | help='Dropout between layers in DNN.', 44 | default=0.35, 45 | type=float 46 | ) 47 | parser.add_argument( 48 | '--learning-rate', 49 | help='Learning rate for the optimizer.', 50 | default=0.01, 51 | type=float 52 | ) 53 | parser.add_argument( 54 | '--first-layer-size', 55 | help='Number of nodes in the first layer of the DNN', 56 | default=7500, 57 | type=int 58 | ) 59 | parser.add_argument( 60 | '--num-layers', 61 | help='Number of layers in the DNN', 62 | default=1, 63 | type=int 64 | ) 65 | parser.add_argument( 66 | '--scale-factor', 67 | help='How quickly should the size of the layers in the DNN decay', 68 | default=0.8, 69 | type=float 70 | ) 71 | parser.add_argument( 72 | '--cpc-embedding-vocab-file', 73 | help='GCS path to a text file with one CPC4 per line. Any CPC4 codes not ' 74 | 'included will be mapped to a single UNK bucket.
See README.', 75 | required=True, 76 | type=str 77 | ) 78 | parser.add_argument( 79 | '--cpc-embedding-dim', 80 | help='Size of the learned embedding column to represent CPC codes.', 81 | default=85, 82 | type=int 83 | ) 84 | 85 | # Experiment arguments 86 | parser.add_argument( 87 | '--train-steps', 88 | help='Steps to run the training job before exiting.', 89 | type=int, 90 | default=30000 91 | ) 92 | parser.add_argument( 93 | '--train-batch-size', 94 | help='Batch size for training steps', 95 | type=int, 96 | default=512 97 | ) 98 | parser.add_argument( 99 | '--eval-batch-size', 100 | help='Batch size for evaluation steps', 101 | type=int, 102 | default=512 103 | ) 104 | parser.add_argument( 105 | '--eval-secs', 106 | help='Time between evaluations.', 107 | type=int, 108 | default=120 109 | ) 110 | parser.add_argument( 111 | '--eval-steps', 112 | help='Number of steps to run evaluation for at each checkpoint', 113 | default=100, 114 | type=int 115 | ) 116 | parser.add_argument( 117 | '--verbosity', 118 | choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'], 119 | default='INFO', 120 | ) 121 | return parser.parse_args() 122 | 123 | 124 | def main(hparams): 125 | """Run training and evaluation using the high level API.""" 126 | 127 | trn_input = lambda: model.input_fn( 128 | hparams.train_files, 129 | batch_size=hparams.train_batch_size 130 | ) 131 | train_spec = tf.estimator.TrainSpec(trn_input, max_steps=hparams.train_steps) 132 | 133 | eval_input = lambda: model.input_fn( 134 | hparams.eval_files, 135 | batch_size=hparams.eval_batch_size, 136 | ) 137 | 138 | # Construct our JSON serving function for Online Predictions using GCP. 139 | exporter = tf.estimator.FinalExporter('model', model.build_serving_fn()) 140 | eval_spec = tf.estimator.EvalSpec( 141 | eval_input, 142 | throttle_secs=hparams.eval_secs, 143 | steps=hparams.eval_steps, 144 | exporters=[exporter], 145 | ) 146 | 147 | run_config = tf.estimator.RunConfig() 148 | run_config = run_config.replace(model_dir=hparams.job_dir) 149 | # Construct layer sizes with exponential decay 150 | hidden_units = [ 151 | max(2, int(hparams.first_layer_size * hparams.scale_factor**i)) 152 | for i in range(hparams.num_layers) 153 | ] 154 | estimator = model.build_estimator( 155 | config=run_config, 156 | hidden_units=hidden_units, 157 | learning_rate=hparams.learning_rate, 158 | dropout=hparams.dropout, 159 | embedding_vocab_file=hparams.cpc_embedding_vocab_file, 160 | embedding_dim=hparams.cpc_embedding_dim, 161 | ) 162 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) 163 | 164 | 165 | if __name__ == '__main__': 166 | args = parse_args() 167 | tf.logging.set_verbosity(args.verbosity) 168 | hyperparams = hparam.HParams(**args.__dict__) 169 | main(hyperparams) 170 | -------------------------------------------------------------------------------- /models/landscaping/AutomatedPatentLandscaping.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/landscaping/AutomatedPatentLandscaping.pdf -------------------------------------------------------------------------------- /models/landscaping/AutomatedPatentLandscaping_2018Update.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/landscaping/AutomatedPatentLandscaping_2018Update.pdf
-------------------------------------------------------------------------------- /models/landscaping/README.md: -------------------------------------------------------------------------------- 1 | # Purpose 2 | Patent landscaping is the process of finding patents related to a particular topic. It is important for companies, investors, governments, and academics seeking to gauge innovation and assess risk. However, there is no broadly recognized best approach to landscaping. Frequently, patent landscaping is a bespoke human-driven process that relies heavily on complex queries over bibliographic patent databases. In this paper (and repository), we present Automated Patent Landscaping, an approach that jointly leverages human domain expertise, heuristics based on patent metadata, and machine learning to generate high-quality patent landscapes with minimal effort. 3 | 4 | # Creating a Patent Landscape 5 | 6 | The figure 1 shows the high level flow to create a patent landscape. We'll walk through each of these in turn in the accompanying Jupyter Notebook. 7 | 8 | ![Fig 1. High Level Flow of Automated Patent Landscaping](figs/flow.png) 9 | 10 | ## Requirements 11 | Before we get started, you should install some requirements for running this notebook. We rely on TensorFlow, Keras, and Google's Cloud infrastructure such as BigQuery, where we pull public patent data, and that needs to be installed and authorized. You need a few basics before continuing: 12 | * Anaconda 13 | * Jupyter Notebooks 14 | * TensorFlow and Keras 15 | * Google Cloud SDK 16 | * BigQuery Python Client 17 | * A few Python utilities 18 | 19 | ### Platform Support 20 | 21 | Note that this has primarily been tested using Linux (Ubuntu) and Windows 10 22 | with both CPU-based TensorFlow and with GPUs. There's no reason this shouldn't 23 | work just fine with MacOS, but it's not been thoroughly tested. If you encounter 24 | issues with the instructions, please feel free to reach out to 25 | [Dave Feltenberger](https://github.com/seinberg) or send a pull request with a fix. 26 | 27 | ### Anaconda 28 | I strongly recommend using Anaconda for this - it helps manage environments for Python, and these instructions will assume you're using it. Download Anaconda from [https://www.anaconda.com/download/](https://www.anaconda.com/download/). Install the Python 3.6 version, *not* 2.7. 29 | 30 | Once Anaconda is installed, create an environment: 31 | ``` 32 | conda create -n patent-landscape python=3.5 33 | source activate patent-landscape (or just: activate patent-landscape if you're in Windows) 34 | ``` 35 | 36 | ### Jupyter Notebooks 37 | 38 | To run the code in this notebook, you'll also need to install Jupyter. The following installs Jupyter and some utilities that let you toggle between different conda environments while inside a notebook. 39 | 40 | ``` 41 | conda config --add channels conda-forge 42 | conda install jupyter ipython nb_conda=2.2.0 43 | ``` 44 | 45 | ### Installing pip 46 | 47 | Some packages we depend on use `pip` for package management. If you're in Windows or Linux, installing Anaconda should take care of this for you. If you're on a Mac, there's a chance this isn't installed and that you need to install it yourself. You can install it with this command: 48 | ``` 49 | conda install pip 50 | ``` 51 | 52 | ### TensorFlow and Keras 53 | 54 | TensorFlow will work 'out of the box' with just your CPU. 
Since we're going to be building a model using neural networks, however, I highly recommend using GPU acceleration - training will be at least an order of magnitude faster with a modern GPU. You'll need to follow the TensorFlow instructions found [here](https://www.tensorflow.org/install/) for your platform. There are several steps to getting your GPU working for Deep Learning, so pay careful attention to the instructions. Note that only Nvidia chipset-based GPUs will work with TensorFlow. 55 | 56 | To skip all the GPU acceleration and just get started, you can just run this command within your active conda environment: 57 | ``` 58 | pip install tensorflow 59 | ``` 60 | 61 | Keras is an excellent high level Deep Learning library that we'll use to build our models: 62 | ``` 63 | conda install keras 64 | ``` 65 | 66 | Also install tflearn, the high-level library on top of TensorFlow, if you'd like to experiment with another high-level library like Keras (though our example doesn't directly use tflearn): 67 | ``` 68 | pip install tflearn 69 | ``` 70 | 71 | ### Google Cloud SDK 72 | ***Download and install Google Cloud SDK.*** You can download and install using [these](https://cloud.google.com/sdk/docs) instructions. On Linux, you'll need to source the `.sh` file and source the relevant include files to make sure the binaries are in your path. You should also install the python utilities for your Anaconda project: 73 | 74 | ``` 75 | pip install google-cloud 76 | pip install google-cloud-storage 77 | ``` 78 | 79 | Once you have the `gcloud` client installed, you need to authorize it to access Google's Cloud on your behalf. ***Don't forget this, or you'll get difficult to debug errors while running the code!*** From your active conda environment, run this command and follow the prompts: 80 | ``` 81 | gcloud auth application-default login 82 | ``` 83 | 84 | Finally, you'll also need to install the Google API Python Client and BigQuery extension to Pandas: 85 | ``` 86 | pip install google-api-python-client pandas-gbq 87 | ``` 88 | 89 | ### Python Utilities 90 | 91 | ``` 92 | conda install numpy pandas h5py scipy scikit-learn matplotlib seaborn 93 | ``` 94 | 95 | ## Google Cloud Tools Client Authorization 96 | 97 | For this code to run properly, you need to authorize Google Cloud to run. This is important or you'll get weird errors that are hard to debug :) See above, the `gcloud` command. 98 | 99 | # Running the Landscaping Code 100 | 101 | There are two primary steps: 1) cloning the Github repo, and 2) starting 102 | Jupyter. 103 | 104 | ## 1) Clone the Landscaping Github Repo 105 | 106 | ``` 107 | git clone https://github.com/google/patents-public-data 108 | ``` 109 | 110 | 111 | ## 2) Start Jupyter 112 | 113 | Now that you have the Repo, from a command line, change into the root of the 114 | repository you just cloned. The Jupyter notebook we care about is in 115 | `models/landscaping/LandscapeNotebook.ipynb`. Finally, set an environment 116 | variable and start Jupyter: 117 | 118 | ``` 119 | export KERAS_BACKEND=tensorflow; jupyter notebook 120 | ``` 121 | 122 | *Note*: don't forget to change Conda environments to the patent-landscape one. 
123 | You can do this before starting `jupyter` with the following command: 124 | ``` 125 | source activate patent-landscape (or just: activate patent-landscape if you're in Windows) 126 | ``` 127 | 128 | or, once you're in the Jupyter environment, and assuming you've installed all 129 | the packages mentioned above, by choosing the Environment menu option and 130 | selecting the `patent-landscape` environment. 131 | -------------------------------------------------------------------------------- /models/landscaping/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /models/landscaping/figs/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/landscaping/figs/flow.png -------------------------------------------------------------------------------- /models/landscaping/figs/project-id.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/landscaping/figs/project-id.png -------------------------------------------------------------------------------- /models/landscaping/keras_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import keras.backend as K 16 | 17 | def precision(y_true, y_pred): 18 | """Precision metric. 19 | Only computes a batch-wise average of precision. 20 | Computes the precision, a metric for multi-label classification of 21 | how many selected items are relevant. 22 | """ 23 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 24 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 25 | precision = true_positives / (predicted_positives + K.epsilon()) 26 | return precision 27 | 28 | 29 | def recall(y_true, y_pred): 30 | """Recall metric. 31 | Only computes a batch-wise average of recall. 32 | Computes the recall, a metric for multi-label classification of 33 | how many relevant items are selected. 
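Predictions are clipped to [0, 1] and rounded before counting, so scores are effectively thresholded at 0.5.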
34 | """ 35 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 36 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 37 | recall = true_positives / (possible_positives + K.epsilon()) 38 | return recall 39 | 40 | def fbeta_score(y_true, y_pred, beta=1): 41 | """Computes the F score. 42 | The F score is the weighted harmonic mean of precision and recall. 43 | Here it is only computed as a batch-wise average, not globally. 44 | This is useful for multi-label classification, where input samples can be 45 | classified as sets of labels. By only using accuracy (precision) a model 46 | would achieve a perfect score by simply assigning every class to every 47 | input. In order to avoid this, a metric should penalize incorrect class 48 | assignments as well (recall). The F-beta score (ranged from 0.0 to 1.0) 49 | computes this, as a weighted mean of the proportion of correct class 50 | assignments vs. the proportion of incorrect class assignments. 51 | With beta = 1, this is equivalent to a F-measure. With beta < 1, assigning 52 | correct classes becomes more important, and with beta > 1 the metric is 53 | instead weighted towards penalizing incorrect class assignments. 54 | """ 55 | if beta < 0: 56 | raise ValueError('The lowest choosable beta is zero (only precision).') 57 | 58 | # If there are no true positives, fix the F score at 0 like sklearn. 59 | if K.sum(K.round(K.clip(y_true, 0, 1))) == 0: 60 | return 0 61 | 62 | p = precision(y_true, y_pred) 63 | r = recall(y_true, y_pred) 64 | bb = beta ** 2 65 | fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon()) 66 | return fbeta_score 67 | 68 | 69 | def fmeasure(y_true, y_pred): 70 | """Computes the f-measure, the harmonic mean of precision and recall. 71 | Here it is only computed as a batch-wise average, not globally. 72 | """ 73 | return fbeta_score(y_true, y_pred, beta=1) 74 | 75 | fscore = f1score = fmeasure 76 | -------------------------------------------------------------------------------- /models/landscaping/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from sklearn.metrics import classification_report 16 | from sklearn.metrics import confusion_matrix 17 | 18 | import keras 19 | from keras.models import Sequential, Model 20 | from keras.layers import Dense, Input, Embedding, BatchNormalization, ELU, Concatenate 21 | from keras.layers import LSTM, Conv1D, MaxPooling1D 22 | from keras.layers.merge import concatenate 23 | from keras.layers.core import Dropout 24 | from keras_metrics import precision, recall, f1score 25 | 26 | import matplotlib.pyplot as plt 27 | import os 28 | import pandas as pd 29 | import seaborn as sn 30 | 31 | class LandscapeModel: 32 | target_names = ['seed', 'antiseed'] 33 | tf_model = None 34 | td = None 35 | data_path = None 36 | seed_name = None 37 | 38 | def __init__(self, training_data, data_path, seed_name): 39 | self.tf_model = None 40 | self.td = training_data 41 | self.data_path = data_path 42 | self.seed_name = seed_name 43 | 44 | def wire_model_functional(self, lstm_size, dropout_pct, sequence_len): 45 | print('Building Functional model.') 46 | 47 | refs_input = Input(shape=(self.td.trainRefsOneHotX.shape[1],), name='refs_input') 48 | refs = Dense( 49 | 256, 50 | input_dim=self.td.trainRefsOneHotX.shape[1], 51 | activation=None)(refs_input) 52 | refs = Dropout(dropout_pct)(refs) 53 | refs = BatchNormalization()(refs) 54 | refs = ELU()(refs) 55 | refs = Dense(64, activation=None)(refs) 56 | refs = Dropout(dropout_pct)(refs) 57 | refs = BatchNormalization()(refs) 58 | refs = ELU()(refs) 59 | 60 | cpcs_input = Input(shape=(self.td.trainCpcOneHotX.shape[1],), name='cpcs_input') 61 | cpcs = Dense( 62 | 32, 63 | input_dim=self.td.trainCpcOneHotX.shape[1], 64 | activation=None)(cpcs_input) 65 | cpcs = Dropout(dropout_pct)(cpcs) 66 | cpcs = BatchNormalization()(cpcs) 67 | cpcs = ELU()(cpcs) 68 | 69 | # Use pre-trained Word2Vec embeddings 70 | embedding_layer_input = Input(shape=(sequence_len,), name='embed_input') 71 | embedding_layer = Embedding(self.td.w2v_runtime.embedding_weights.shape[0], 72 | self.td.w2v_runtime.embedding_weights.shape[1], 73 | weights=[self.td.w2v_runtime.embedding_weights], 74 | input_length=sequence_len, 75 | trainable=False)(embedding_layer_input) 76 | deep = LSTM( 77 | lstm_size, 78 | dropout=dropout_pct, 79 | recurrent_dropout=dropout_pct, 80 | return_sequences=False, 81 | name='LSTM_1')(embedding_layer) 82 | deep = Dense(300, activation=None)(deep) 83 | deep = Dropout(dropout_pct)(deep) 84 | deep = BatchNormalization()(deep) 85 | deep = ELU()(deep) 86 | 87 | #model_inputs_to_concat = [cpcs, refs, deep] 88 | model_inputs_to_concat = [refs, deep] 89 | 90 | final_layer = Concatenate(name='concatenated_layer')(model_inputs_to_concat) 91 | output = Dense(64, activation=None)(final_layer) 92 | output = Dropout(dropout_pct)(output) 93 | output = BatchNormalization()(output) 94 | output = ELU()(output) 95 | output = Dense(1, activation='sigmoid')(output) 96 | 97 | #model = Model(inputs=[cpcs_input, refs_input, embedding_layer_input], outputs=output, name='model') 98 | model = Model(inputs=[refs_input, embedding_layer_input], outputs=output, name='model') 99 | model.compile(loss='binary_crossentropy', 100 | optimizer='adam', 101 | metrics=['accuracy', precision, recall, f1score]) 102 | 103 | self.tf_model = model 104 | print('Done building graph.') 105 | print(self.tf_model.summary()) 106 | 107 | def train_model(self, model, batch_size, num_epochs=5): 108 | print('Training model.') 109 | model.fit(x={ 110 | 'refs_input': self.td.trainRefsOneHotX, 111 | 'embed_input': 
self.td.padded_train_embed_x, 112 | 'cpcs_input': self.td.trainCpcOneHotX}, 113 | y=self.td.trainY, 114 | batch_size=batch_size, 115 | epochs=num_epochs, 116 | validation_data=( 117 | { 118 | 'refs_input': self.td.testRefsOneHotX, 119 | 'cpcs_input': self.td.testCpcOneHotX, 120 | 'embed_input': self.td.padded_test_embed_x}, 121 | self.td.testY)) 122 | return model 123 | 124 | def train_or_load_model(self, batch_size, num_epochs=5): 125 | model_dir = os.path.join(self.data_path, self.seed_name) 126 | model_path = os.path.join(model_dir, 'model.pb') 127 | 128 | if os.path.exists(model_path): 129 | print('Model exists at {}; loading existing trained model.'.format(model_path)) 130 | self.tf_model = keras.models.load_model( 131 | model_path, 132 | custom_objects={'precision': precision, 'recall': recall, 'fmeasure': f1score}) 133 | else: 134 | print('Model has not been trained yet.') 135 | tf_model = self.train_model(self.tf_model, batch_size, num_epochs) 136 | print('Saving model to {}'.format(model_path)) 137 | if not os.path.exists(model_dir): 138 | os.makedirs(model_dir) 139 | 140 | tf_model.save(model_path) 141 | print('Model persisted and ready for inference!') 142 | 143 | def evaluate_model(self, batch_size): 144 | score, acc, p, r, f1 = self.tf_model.evaluate( 145 | x={ 146 | 'refs_input': self.td.testRefsOneHotX, 147 | 'cpcs_input': self.td.testCpcOneHotX, 148 | 'embed_input': self.td.padded_test_embed_x 149 | }, 150 | y=self.td.testY, 151 | batch_size=batch_size) 152 | 153 | print('') 154 | print('Test score: {:.4f}'.format(score)) 155 | print('Test accuracy: {:.4f}'.format(acc)) 156 | print('Test p/r (f1): {:.2f}/{:.2f} ({:.2f})'.format(p, r, f1)) 157 | 158 | return (score, acc, p, r, f1) 159 | 160 | def batch_predict(self, padded_text_embeddings, refs_one_hot, cpcs_one_hot): 161 | return self.tf_model.predict( 162 | { 163 | 'embed_input': padded_text_embeddings, 164 | 'cpcs_input': cpcs_one_hot, 165 | 'refs_input': refs_one_hot 166 | }) 167 | 168 | def predict(self, train_data_util, text, refs, cpcs): 169 | ''' 170 | ''' 171 | 172 | adhoc_text = pd.Series([text]) 173 | adhoc_refs = pd.Series([refs]) 174 | adhoc_cpcs = pd.Series([cpcs]) 175 | 176 | padded_text_embeddings, refs_one_hot, cpcs_one_hot = \ 177 | train_data_util.prep_for_inference(adhoc_text, adhoc_refs, adhoc_cpcs) 178 | 179 | return self.batch_predict(padded_text_embeddings, refs_one_hot, cpcs_one_hot) 180 | 181 | def binary_prediction_idx(self, score): 182 | if score < .5: 183 | return 0 184 | return 1 185 | 186 | def label_to_idx(self, label): 187 | label = label.lower() 188 | for i in range(0, len(self.target_names)): 189 | if label == self.target_names[i]: 190 | return i 191 | raise ValueError('Label {} has no target name from [{}]'.format(label, self.target_names)) 192 | 193 | def reports(self, prediction_df): 194 | binary_predictions_x = prediction_df.score.apply(self.binary_prediction_idx) 195 | actual_labels_y = prediction_df.label.apply(self.label_to_idx) 196 | 197 | cr = classification_report(binary_predictions_x, actual_labels_y, target_names=self.target_names) 198 | cm = confusion_matrix(binary_predictions_x, actual_labels_y) 199 | 200 | return cr, cm 201 | 202 | def show_confusion_matrix(self, confusion_matrix): 203 | cm_df = pd.DataFrame(confusion_matrix) 204 | plt.figure(figsize = (10,7)) 205 | sn.heatmap(cm_df, xticklabels=self.target_names, yticklabels=self.target_names) 206 | -------------------------------------------------------------------------------- /models/landscaping/seeds/README.md: 
-------------------------------------------------------------------------------- 1 | # Example Patent Seed Sets 2 | This directory contains example seed sets for use in automated patent landscaping as described in the corresponding paper. 3 | 4 | ## Hair Dryer seed set 5 | The first, [hair_dryer_large.seed.csv](hair_dryer_large.seed.csv), contains seed patents related to hair dryers. Use this seed set to build a patent landscaping model and apply it to any text snippet to see whether it is likely to be a hair dryer patent. 6 | 7 | ## Video Codec seed set 8 | The second, [video_codec.seed.csv](video_codec.seed.csv), we have used publicly available data to provide a seed set of patents related to video codecs. 9 | 10 | **DISCLAIMER AND SOURCING FOR video_codec.seed.csv** 11 | 12 | The file video_codec.seed.csv provides a list of patents for a video codec seed set. The list is composed of US patents identified by MPEG LA for HEVC, MPEG2, MPEG4, and AVC in the PDFs linked below. The authors have not reviewed the contents of the patents contained in the list and some of the listed patents may not actually relate to video codecs. The inclusion of this file should not be construed as an acknowledgement that any of the listed patents actually cover any video codec standard. 13 | 14 | [http://www.mpegla.com/main/programs/M4v/Documents/m4v-att1.pdf](http://www.mpegla.com/main/programs/M4v/Documents/m4v-att1.pdf) 15 | [http://www.mpegla.com/main/programs/HEVC/Documents/hevc-att1.pdf](http://www.mpegla.com/main/programs/HEVC/Documents/hevc-att1.pdf) 16 | [http://www.mpegla.com/main/programs/M2/Documents/m2-att1.pdf](http://www.mpegla.com/main/programs/M2/Documents/m2-att1.pdf) 17 | [http://www.mpegla.com/main/programs/AVC/Documents/avc-att1.pdf](http://www.mpegla.com/main/programs/AVC/Documents/avc-att1.pdf) 18 | -------------------------------------------------------------------------------- /models/landscaping/seeds/hair_dryer.seed.csv: -------------------------------------------------------------------------------- 1 | 8407913 2 | 6907678 3 | 9144286 4 | 5956863 5 | 8893400 6 | 8782920 7 | 6798982 8 | 8256132 9 | 8307948 10 | 8459273 11 | 6305325 12 | 6739071 13 | 20150366316 14 | 5725159 15 | 6718651 16 | 6935046 17 | 7784750 18 | 8904663 19 | 5841943 20 | 20170079401 21 | 6199295 22 | 20140290087 23 | 6285828 24 | 20130326898 25 | 6671460 26 | 6011903 27 | 6269549 28 | 6885810 29 | 6038782 30 | 5875562 31 | 5647007 32 | 6314236 33 | 8517318 34 | 20080235980 35 | 20040093756 36 | 6986212 37 | 20060196075 38 | 6191930 39 | 7096597 40 | 5488783 41 | 6491267 42 | 4596921 43 | 7380347 44 | 4659907 45 | 20110203128 46 | 6199805 47 | 20040047620 48 | 20050069303 49 | 6889445 50 | 5701681 51 | 4827105 52 | 6725562 53 | 6354016 54 | 6449870 55 | 4493975 56 | 6026590 57 | 8720078 58 | 20030177657 59 | 20090100698 60 | 9675157 61 | 5790749 62 | 4955145 63 | 6067724 64 | 6732450 65 | 4524263 66 | 20120024620 67 | 7913416 68 | 7584759 69 | 20090065661 70 | 4711988 71 | 5434946 72 | 5842286 73 | 4453695 74 | 5351417 75 | 7350317 76 | 4316077 77 | 5651190 78 | 7047660 79 | 20100282810 80 | 5884008 81 | 4254324 82 | 4683369 83 | 5689896 84 | 6393718 85 | 6901936 86 | 6591516 87 | 20050108889 88 | 5606640 89 | 20110079239 90 | 4918289 91 | 20040231180 92 | 4700049 93 | 4972065 94 | 5448677 95 | 20060254073 96 | 5612849 97 | 4197448 98 | 20100014844 99 | 6188837 100 | 4687906 101 | 4759135 102 | 9675158 103 | 20060201016 104 | 3955065 105 | 4904847 106 | 5216822 107 | 5649370 108 | 4263500 109 | 6367162 110 | 
20040163274 111 | 4323761 112 | 4538362 113 | 20080216339 114 | 5613305 115 | 20160022004 116 | 4260875 117 | 20020189128 118 | 4602146 119 | 5572800 120 | 20040088878 121 | 3691646 122 | 20130276321 123 | 5729907 124 | 6408533 125 | 20090188126 126 | 8103155 127 | 20020112362 128 | 5148512 129 | 20130111777 130 | 4198558 131 | 4309595 132 | 4391047 133 | 4635382 134 | 5243682 135 | 4395619 136 | 4936027 137 | 4794225 138 | 6910281 139 | 20070294909 140 | 6026821 141 | 4114022 142 | 4977306 143 | 4767914 144 | 7801423 145 | 8732976 146 | 20010020668 147 | 4295283 148 | 5784800 149 | 7676952 150 | 20130277517 151 | 20040172847 152 | 4327278 153 | 20050284495 154 | 20060064892 155 | 20070137060 156 | 7264209 157 | 20080116753 158 | 4712313 159 | 4800654 160 | 3943329 161 | 4225106 162 | 6085435 163 | 4615347 164 | 5598640 165 | 8230615 166 | 3946498 167 | 3947659 168 | 3981314 169 | 4196343 170 | 8081873 171 | 6029364 172 | 4910385 173 | 5195253 174 | 4132360 175 | 4382174 176 | 5243683 177 | 4430808 178 | 20110177711 179 | 20060006294 180 | 4848007 181 | 4967060 182 | 20040159002 183 | 5235759 184 | 20060075654 185 | 4691451 186 | 5155925 187 | 4658511 188 | 5996243 189 | 20150335128 190 | 6049994 191 | 7204038 192 | 20040020070 193 | 4634836 194 | 5036601 195 | 5157757 196 | 4003388 197 | 20070245590 198 | 5013891 199 | 4706153 200 | 4701595 201 | 20120266483 202 | 20150037015 203 | 6041514 204 | 20130153461 205 | 4039774 206 | 4525623 207 | 20090188125 208 | 20100212177 209 | 7123823 210 | 20070119070 211 | 4490602 212 | 5473824 213 | 20140345156 214 | 4890395 215 | 4308670 216 | 4287673 217 | 20050229424 218 | 6089239 219 | 3775861 220 | 5317815 221 | 3846047 222 | 4195217 223 | 7086176 224 | 20040001707 225 | 7165341 226 | 6130991 227 | 20110162225 228 | 20050091867 229 | 6691429 230 | 4218608 231 | 4365141 232 | 9072358 233 | 20030196344 234 | 4647757 235 | 20090320873 236 | 8020827 237 | 4424437 238 | 20020108264 239 | 20070062058 240 | 4321456 241 | 6370326 242 | 20160213122 243 | 9498039 244 | 4297564 245 | 4924602 246 | 4471213 247 | 20160242524 248 | 4603246 249 | 20010025430 250 | 6108934 251 | 3818600 252 | 3717936 253 | 4556782 254 | 5172880 255 | 9578945 256 | 4939345 257 | 6377749 258 | 20170127804 259 | 5765792 260 | 4088869 261 | 20130104415 262 | 20130263464 263 | 4667086 264 | 20100162585 265 | 4118874 266 | 20070114219 267 | 20110073735 268 | 3132232 269 | 4896020 270 | 3202797 271 | 20050204577 272 | 3303325 273 | 9149105 274 | 20050139226 275 | 3109912 276 | 4225775 277 | 20160353854 278 | 2514528 279 | 20020174559 280 | 4896021 281 | 20160220005 282 | 3832789 283 | 20070169369 284 | 3550285 285 | 4514618 286 | 5325809 287 | 7308899 288 | 4365426 289 | 20010005943 290 | 20150289623 291 | 20160367005 292 | 20160367007 293 | 20170105502 294 | 3836750 295 | 3095496 296 | 3889693 297 | 20160262520 298 | 3978314 299 | 4320283 300 | 4370544 301 | 4300280 302 | 3981313 303 | 3594916 304 | 20120317829 305 | 4214149 306 | 3849902 307 | 20160367006 308 | 20050011534 309 | 3284611 310 | 3308268 311 | 20010051042 312 | 6266893 313 | 20060098962 314 | 20160120286 315 | 20060213074 316 | 3304625 317 | 3949487 318 | 20170105503 319 | 20080263887 320 | 3953710 321 | 994259 322 | 5271160 323 | 3937231 324 | 3555699 325 | 20170006991 326 | 4173231 327 | 4641014 328 | 5610990 329 | 3702031 330 | 3348020 331 | 3596371 332 | 4021930 333 | 20130091724 334 | 20160166036 335 | 20160367003 336 | 20140202020 337 | 20150189967 338 | 20070274696 339 | 20110198421 340 | 3782002 341 | 20170156471 
342 | 5531032 343 | 20080032543 344 | 3872336 345 | 3872607 346 | 20120291301 347 | 3731396 348 | 3362086 349 | 9185958 350 | 20160367014 351 | 20150201730 352 | 3831000 353 | 20020073573 354 | 6301800 355 | 20160309873 356 | 3836749 357 | 20140326713 358 | 8578623 359 | 20060026858 360 | 3992785 361 | 20150296954 362 | 3791045 363 | 5149209 364 | 9603430 365 | 3769718 366 | 20160051026 367 | 20050204578 368 | 3777406 369 | 20140047727 370 | 20130014402 371 | 20050204576 372 | 3763573 373 | 3358383 374 | 20150216283 375 | -------------------------------------------------------------------------------- /models/landscaping/seeds/hair_dryer_large.seed.csv: -------------------------------------------------------------------------------- 1 | 8407913 2 | 6907678 3 | 9144286 4 | 5956863 5 | 8893400 6 | 8782920 7 | 6798982 8 | 8256132 9 | 8307948 10 | 8459273 11 | 6305325 12 | 6739071 13 | 20150366316 14 | 5725159 15 | 6718651 16 | 6935046 17 | 7784750 18 | 8904663 19 | 5841943 20 | 20170079401 21 | 6199295 22 | 20140290087 23 | 6285828 24 | 20130326898 25 | 6671460 26 | 6011903 27 | 6269549 28 | 6885810 29 | 6038782 30 | 5875562 31 | 5647007 32 | 6314236 33 | 8517318 34 | 20080235980 35 | 20040093756 36 | 6986212 37 | 20060196075 38 | 6191930 39 | 7096597 40 | 5488783 41 | 6491267 42 | 4596921 43 | 7380347 44 | 4659907 45 | 20110203128 46 | 6199805 47 | 20040047620 48 | 20050069303 49 | 6889445 50 | 5701681 51 | 4827105 52 | 6725562 53 | 6354016 54 | 6449870 55 | 4493975 56 | 6026590 57 | 8720078 58 | 20030177657 59 | 20090100698 60 | 9675157 61 | 5790749 62 | 4955145 63 | 6067724 64 | 6732450 65 | 4524263 66 | 20120024620 67 | 7913416 68 | 7584759 69 | 20090065661 70 | 4711988 71 | 5434946 72 | 5842286 73 | 4453695 74 | 5351417 75 | 7350317 76 | 4316077 77 | 5651190 78 | 7047660 79 | 20100282810 80 | 5884008 81 | 4254324 82 | 4683369 83 | 5689896 84 | 6393718 85 | 6901936 86 | 6591516 87 | 20050108889 88 | 5606640 89 | 20110079239 90 | 4918289 91 | 20040231180 92 | 4700049 93 | 4972065 94 | 5448677 95 | 20060254073 96 | 5612849 97 | 4197448 98 | 20100014844 99 | 6188837 100 | 4687906 101 | 4759135 102 | 9675158 103 | 20060201016 104 | 3955065 105 | 4904847 106 | 5216822 107 | 5649370 108 | 4263500 109 | 6367162 110 | 20040163274 111 | 4323761 112 | 4538362 113 | 20080216339 114 | 5613305 115 | 20160022004 116 | 4260875 117 | 20020189128 118 | 4602146 119 | 5572800 120 | 20040088878 121 | 3691646 122 | 20130276321 123 | 5729907 124 | 6408533 125 | 20090188126 126 | 8103155 127 | 20020112362 128 | 5148512 129 | 20130111777 130 | 4198558 131 | 4309595 132 | 4391047 133 | 4635382 134 | 5243682 135 | 4395619 136 | 4936027 137 | 4794225 138 | 6910281 139 | 20070294909 140 | 6026821 141 | 4114022 142 | 4977306 143 | 4767914 144 | 7801423 145 | 8732976 146 | 20010020668 147 | 4295283 148 | 5784800 149 | 7676952 150 | 20130277517 151 | 20040172847 152 | 4327278 153 | 20050284495 154 | 20060064892 155 | 20070137060 156 | 7264209 157 | 20080116753 158 | 4712313 159 | 4800654 160 | 3943329 161 | 4225106 162 | 6085435 163 | 4615347 164 | 5598640 165 | 8230615 166 | 3946498 167 | 3947659 168 | 3981314 169 | 4196343 170 | 8081873 171 | 6029364 172 | 4910385 173 | 5195253 174 | 4132360 175 | 4382174 176 | 5243683 177 | 4430808 178 | 20110177711 179 | 20060006294 180 | 4848007 181 | 4967060 182 | 20040159002 183 | 5235759 184 | 20060075654 185 | 4691451 186 | 5155925 187 | 4658511 188 | 5996243 189 | 20150335128 190 | 6049994 191 | 7204038 192 | 20040020070 193 | 4634836 194 | 5036601 195 | 5157757 196 | 4003388 
197 | 20070245590 198 | 5013891 199 | 4706153 200 | 4701595 201 | 20120266483 202 | 20150037015 203 | 6041514 204 | 20130153461 205 | 4039774 206 | 4525623 207 | 20090188125 208 | 20100212177 209 | 7123823 210 | 20070119070 211 | 4490602 212 | 5473824 213 | 20140345156 214 | 4890395 215 | 4308670 216 | 4287673 217 | 20050229424 218 | 6089239 219 | 3775861 220 | 5317815 221 | 3846047 222 | 4195217 223 | 7086176 224 | 20040001707 225 | 7165341 226 | 6130991 227 | 20110162225 228 | 20050091867 229 | 6691429 230 | 4218608 231 | 4365141 232 | 9072358 233 | 20030196344 234 | 4647757 235 | 20090320873 236 | 8020827 237 | 4424437 238 | 20020108264 239 | 20070062058 240 | 4321456 241 | 6370326 242 | 20160213122 243 | 9498039 244 | 4297564 245 | 4924602 246 | 4471213 247 | 20160242524 248 | 4603246 249 | 20010025430 250 | 6108934 251 | 3818600 252 | 3717936 253 | 4556782 254 | 5172880 255 | 9578945 256 | 4939345 257 | 6377749 258 | 20170127804 259 | 5765792 260 | 4088869 261 | 20130104415 262 | 20130263464 263 | 4667086 264 | 20100162585 265 | 4118874 266 | 20070114219 267 | 20110073735 268 | 3132232 269 | 4896020 270 | 3202797 271 | 20050204577 272 | 3303325 273 | 9149105 274 | 20050139226 275 | 3109912 276 | 4225775 277 | 20160353854 278 | 2514528 279 | 20020174559 280 | 4896021 281 | 20160220005 282 | 3832789 283 | 20070169369 284 | 3550285 285 | 4514618 286 | 5325809 287 | 7308899 288 | 4365426 289 | 20010005943 290 | 20150289623 291 | 20160367005 292 | 20160367007 293 | 20170105502 294 | 3836750 295 | 3095496 296 | 3889693 297 | 20160262520 298 | 3978314 299 | 4320283 300 | 4370544 301 | 4300280 302 | 3981313 303 | 3594916 304 | 20120317829 305 | 4214149 306 | 3849902 307 | 20160367006 308 | 20050011534 309 | 3284611 310 | 3308268 311 | 20010051042 312 | 6266893 313 | 20060098962 314 | 20160120286 315 | 20060213074 316 | 3304625 317 | 3949487 318 | 20170105503 319 | 20080263887 320 | 3953710 321 | 994259 322 | 5271160 323 | 3937231 324 | 3555699 325 | 20170006991 326 | 4173231 327 | 4641014 328 | 5610990 329 | 3702031 330 | 3348020 331 | 3596371 332 | 4021930 333 | 20130091724 334 | 20160166036 335 | 20160367003 336 | 20140202020 337 | 20150189967 338 | 20070274696 339 | 20110198421 340 | 3782002 341 | 20170156471 342 | 5531032 343 | 20080032543 344 | 3872336 345 | 3872607 346 | 20120291301 347 | 3731396 348 | 3362086 349 | 9185958 350 | 20160367014 351 | 20150201730 352 | 3831000 353 | 20020073573 354 | 6301800 355 | 20160309873 356 | 3836749 357 | 20140326713 358 | 8578623 359 | 20060026858 360 | 3992785 361 | 20150296954 362 | 3791045 363 | 5149209 364 | 9603430 365 | 3769718 366 | 20160051026 367 | 20050204578 368 | 3777406 369 | 20140047727 370 | 20130014402 371 | 20050204576 372 | 3763573 373 | 3358383 374 | 20150216283 375 | 20110099832 376 | 4524263 377 | 5606640 378 | 5485931 379 | 9127885 380 | 5404419 381 | 20140190033 382 | 5031778 383 | 4195416 384 | 5107603 385 | 20090173286 386 | 4934855 387 | 5996249 388 | 5388344 389 | 5655257 390 | 5675907 391 | 4634836 392 | 5642572 393 | 5829162 394 | 6130991 395 | 8371246 396 | 4742199 397 | 4199873 398 | 4406071 399 | 20010055540 400 | 4035927 401 | 5765792 402 | 3985102 403 | RE30266 404 | 20160348301 405 | 20170055807 406 | 5170038 407 | 20080032543 408 | 3872336 409 | 4044474 410 | 20060281664 411 | -------------------------------------------------------------------------------- /models/landscaping/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from keras.preprocessing import text 16 | 17 | import re 18 | import string 19 | 20 | class TextTokenizer: 21 | punct_regex = re.compile('([%s])' % (string.punctuation + '‘’')) 22 | spaces_regex = re.compile(r'\s{2,}') 23 | number_regex = re.compile(r'\d+') 24 | keras_tokenizer = None 25 | 26 | def __init__( 27 | self): 28 | ''' 29 | ''' 30 | 31 | 32 | 33 | def tokenize_to_onehot_matrix(self, text_series, vocab_size, keras_tokenizer=None): 34 | ''' 35 | ''' 36 | if keras_tokenizer is None: 37 | print('No Keras tokenizer supplied so using vocab size ({}) and series to build new one'.format(vocab_size)) 38 | 39 | keras_tokenizer = text.Tokenizer( 40 | num_words=vocab_size, 41 | split=",", 42 | # filter should be same as default, minus the '-' 43 | filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n', 44 | lower=False) 45 | keras_tokenizer.fit_on_texts(text_series) 46 | keras_tokenizer.index_word = {idx: word for word, idx in keras_tokenizer.word_index.items()} 47 | 48 | text_one_hot = keras_tokenizer.texts_to_matrix(text_series) 49 | 50 | return keras_tokenizer, text_one_hot 51 | 52 | 53 | def tokenize( 54 | self, 55 | text, 56 | normalize_numbers=True, 57 | lowercase=True, 58 | remove_punct=True, 59 | lemmatize=False): 60 | ''' 61 | ''' 62 | 63 | #plain_text = html2text.html2text(text) 64 | plain_text = text 65 | if not isinstance(plain_text, str): 66 | raise Exception(plain_text, type(plain_text)) 67 | 68 | preprocessed = plain_text.replace('\'', '') 69 | if lowercase: 70 | preprocessed = preprocessed.lower() 71 | 72 | # Replace punctuation with spaces which handles cases like "searching/filter", 73 | # "nothing:)" and "writing.like.this" very well. 74 | # The double spaces that often result are then collased by the next method 75 | if remove_punct: 76 | preprocessed = self.punct_regex.sub(' ', preprocessed) 77 | else: 78 | preprocessed = self.punct_regex.sub(r' \1 ', preprocessed) 79 | 80 | preprocessed = self.spaces_regex.sub(' ', preprocessed) 81 | if normalize_numbers: 82 | preprocessed = self.number_regex.sub('_NUMBER_', preprocessed) 83 | 84 | if lemmatize: 85 | preprocessed = shared_funcs.NltkLemmatize( 86 | preprocessed, stem_post_lemmatize=False 87 | ) 88 | 89 | return preprocessed.split() 90 | 91 | 92 | def tokenize_series( 93 | self, 94 | text_series, 95 | normalize_numbers=True, 96 | lowercase=True, 97 | remove_punct=True, 98 | lemmatize=False): 99 | ''' 100 | ''' 101 | 102 | return text_series.apply(self.tokenize) 103 | 104 | -------------------------------------------------------------------------------- /models/landscaping/train_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import tokenizer 16 | import importlib 17 | import random 18 | import numpy as np 19 | 20 | from keras.preprocessing import sequence 21 | 22 | importlib.reload(tokenizer) 23 | 24 | class LandscapeTrainingDataUtil: 25 | RAND_SEED=314159 26 | refs_vocab_size = 50000 27 | training_df = None 28 | series_text_to_embed = None 29 | prepped_embedding_train = None 30 | prepped_refs = None 31 | prepped_labels = None 32 | w2v_runtime = None 33 | ref_to_id = None 34 | id_to_ref = None 35 | tokenizer = None 36 | sequence_len = None 37 | 38 | def __init__( 39 | self, training_df, w2v_runtime): 40 | ''' 41 | ''' 42 | 43 | self.w2v_runtime = w2v_runtime 44 | self.training_df = training_df 45 | 46 | self.tokenizer = tokenizer.TextTokenizer() 47 | 48 | def label_text_to_id(self, label_name): 49 | if label_name == 'antiseed': 50 | return 1 51 | else: 52 | return 0 53 | 54 | def label_id_to_text(self, label_idx): 55 | if label_idx == 1: 56 | return 'antiseed' 57 | else: 58 | return 'seed' 59 | 60 | def label_series_to_index(self, labels_series): 61 | labels_indexed = [] 62 | for idx in range(0, len(labels_series)): 63 | label = labels_series[idx] 64 | # 'tokenize' on the label is basically normalizing it 65 | tokenized_label = self.tokenizer.tokenize(label)[0] 66 | label_idx = self.label_text_to_id(tokenized_label) 67 | labels_indexed.append(label_idx) 68 | 69 | return labels_indexed 70 | 71 | def text_series_to_embeddings(self, raw_series_text): 72 | ''' 73 | Takes as input a series of text and associated labels 74 | ''' 75 | 76 | tokenized_text = self.tokenizer.tokenize_series(raw_series_text) 77 | word_to_index_dict = self.w2v_runtime.word_to_index 78 | tokenized_indexed_text = [] 79 | 80 | for idx in range(0, len(tokenized_text)): 81 | text = tokenized_text[idx] 82 | text_word_indexes = [] 83 | for word in text: 84 | if word in word_to_index_dict: 85 | word_idx = word_to_index_dict[word] 86 | else: 87 | word_idx = word_to_index_dict['UNK'] 88 | # this skips 'the' so it can be used for dynamic rnn 89 | if word_idx > 0: 90 | text_word_indexes.append(word_idx) 91 | 92 | tokenized_indexed_text.append(text_word_indexes) 93 | 94 | return tokenized_indexed_text 95 | 96 | def to_text(self, integerized): 97 | words = [] 98 | for word_int in integerized: 99 | words.append(self.w2v_runtime.index_to_word[word_int]) 100 | return ' '.join(words) 101 | 102 | def randomize_and_split(self, percent_train): 103 | training_data_to_shuffle = list( 104 | zip( 105 | self.prepped_embedding_train, 106 | self.refs_one_hot, 107 | self.cpc_one_hot, 108 | self.prepped_labels)) 109 | 110 | print('Randomizing training data') 111 | random.seed(self.RAND_SEED) 112 | random.shuffle(training_data_to_shuffle) 113 | 114 | train_embed_arr, refs_one_hot, cpc_one_hot, label_arr = zip(*training_data_to_shuffle) 115 | 116 | train_idx = int(len(train_embed_arr) * percent_train) 117 | 118 | print('Creating NumPy arrays for train/test set out of randomized training data.') 119 | self.trainEmbedX = np.array(train_embed_arr[:train_idx]) 120 | self.trainRefsOneHotX = np.array(refs_one_hot[:train_idx]) 
121 | self.trainCpcOneHotX = np.array(cpc_one_hot[:train_idx]) 122 | 123 | self.testEmbedX = np.array(train_embed_arr[train_idx:]) 124 | self.testRefsOneHotX = np.array(refs_one_hot[train_idx:]) 125 | self.testCpcOneHotX = np.array(cpc_one_hot[train_idx:]) 126 | 127 | self.trainY = np.array(label_arr[:train_idx]) 128 | self.testY = np.array(label_arr[train_idx:]) 129 | 130 | def prepare_training_data( 131 | self, labels_series, series_text_to_embed, refs_series, cpc_series, percent_train, refs_vocab_size, cpc_vocab_size): 132 | 133 | self.series_text_to_embed = series_text_to_embed 134 | self.prepped_embedding_train = self.text_series_to_embeddings(self.series_text_to_embed) 135 | self.prepped_labels = self.label_series_to_index(labels_series) 136 | self.refs_tokenizer, self.refs_one_hot = \ 137 | self.tokenizer.tokenize_to_onehot_matrix(refs_series, refs_vocab_size) 138 | self.cpc_tokenizer, self.cpc_one_hot = \ 139 | self.tokenizer.tokenize_to_onehot_matrix(cpc_series, cpc_vocab_size) 140 | 141 | self.randomize_and_split(percent_train) 142 | 143 | print('Train (embed) data shapes: train: {}, train labels shape: {}'.format( 144 | self.trainEmbedX.shape, self.trainY.shape)) 145 | print('Test (embed) data shape: {}, test labels shape: {}'.format( 146 | self.testEmbedX.shape, self.testY.shape)) 147 | 148 | doc_lengths = list(map(len, self.trainEmbedX)) 149 | median_doc_length = int(np.median(doc_lengths)) 150 | max_doc_length = np.max(doc_lengths) 151 | print('doc lengths for embedding layer: median: {}, mean: {}, max: {}'.format( 152 | median_doc_length, np.mean(doc_lengths), max_doc_length)) 153 | 154 | sequence_len = max_doc_length 155 | self.sequence_len = sequence_len 156 | 157 | print('Using sequence length of {} to pad LSTM sequences.'.format(sequence_len)) 158 | self.padded_train_embed_x = sequence.pad_sequences( 159 | self.trainEmbedX, maxlen=sequence_len, padding='pre', truncating='post') 160 | self.padded_test_embed_x = sequence.pad_sequences( 161 | self.testEmbedX, maxlen=sequence_len, padding='pre', truncating='post') 162 | 163 | print('Training data ready.') 164 | 165 | return self 166 | 167 | def prep_for_inference( 168 | self, series_text_to_embed, refs_series, cpc_series): 169 | 170 | prepped_embedding = self.text_series_to_embeddings(series_text_to_embed) 171 | 172 | _, refs_one_hot = \ 173 | self.tokenizer.tokenize_to_onehot_matrix(refs_series, None, self.refs_tokenizer) 174 | _, cpc_one_hot = \ 175 | self.tokenizer.tokenize_to_onehot_matrix(cpc_series, None, self.cpc_tokenizer) 176 | 177 | prepped_embedding = np.array(prepped_embedding) 178 | refs_one_hot = np.array(refs_one_hot) 179 | cpc_one_hot = np.array(cpc_one_hot) 180 | 181 | doc_lengths = list(map(len, self.trainEmbedX)) 182 | sequence_len = np.max(doc_lengths) 183 | 184 | padded_embed = sequence.pad_sequences( 185 | prepped_embedding, maxlen=sequence_len, padding='pre', truncating='post') 186 | 187 | return padded_embed, refs_one_hot, cpc_one_hot 188 | 189 | def show_instance_details(self, train_instance_idx): 190 | print('\nOriginal: {}\nTokenized: {}\nIntegerized: {}\nLabelIntegerized: {}'.format( 191 | self.series_text_to_embed[train_instance_idx], 192 | self.to_text(self.prepped_embedding_train[train_instance_idx]), 193 | self.prepped_embedding_train[train_instance_idx], 194 | self.prepped_labels[train_instance_idx])) 195 | -------------------------------------------------------------------------------- /tables/dataset_Berkeley Fung.md: 
-------------------------------------------------------------------------------- 1 | 2 | --- 3 | geometry: margin=0.6in 4 | --- 5 | 6 | # Berkeley Fung 7 | 8 | 9 | ***** 10 | ## erudite-marker-539:JEMS16.assignee_disambiguation 11 | 12 | 13 | 14 | > Accompanying materials to 15 | > 16 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 17 | > 18 | > Additional links: 19 | > 20 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 21 | > 22 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 23 | > 24 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 25 | 26 | 27 | 28 | 29 | 30 | | Stat | Value | 31 | |----------|----------| 32 | | Last updated | 2018-02-15 | 33 | | Rows | 5,272,283 | 34 | | Size | 239.7 MB | 35 | 36 | ### Schema 37 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.assignee_disambiguation) 38 | 39 | * `PatentNo` STRING NULLABLE 40 | 41 | > Patent number 42 | 43 | * `pdpass` STRING NULLABLE 44 | 45 | > Pdpass (unique identifier of assignees) 46 | 47 | * `assignee_disambiguated` STRING NULLABLE 48 | 49 | > Standardized assignee name 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | ***** 65 | ## erudite-marker-539:JEMS16.assignee_raw 66 | 67 | 68 | 69 | > Accompanying materials to 70 | > 71 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 72 | > 73 | > Additional links: 74 | > 75 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 76 | > 77 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 78 | > 79 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 80 | 81 | 82 | 83 | 84 | 85 | | Stat | Value | 86 | |----------|----------| 87 | | Last updated | 2018-02-15 | 88 | | Rows | 8,579,322 | 89 | | Size | 767.6 MB | 90 | 91 | ### Schema 92 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.assignee_raw) 93 | 94 | * `id` INTEGER NULLABLE 95 | 96 | > System generated 97 | 98 | * `PatentNo` STRING NULLABLE 99 | 100 | > Patent number 101 | 102 | * `Company` STRING NULLABLE 103 | 104 | > Assignee name (can be companies, universities, government agencies, or simply person name) 105 | 106 | * `Geography` STRING NULLABLE 107 | 108 | > Raw (city, state, country) tuple of assginee 109 | 110 | * `Country` STRING NULLABLE 111 | 112 | > Country Code derived from field 'Geography' 113 | 114 | * `State` STRING NULLABLE 115 | 116 | > State Code derived from field 'Geography' (if in U.S.) 
117 | 118 | * `City` STRING NULLABLE 119 | 120 | > City Name derived from field 'Geography' 121 | 122 | * `Sequence` STRING NULLABLE 123 | 124 | > Order of appearance (0 means the first assignee, 1 means the second assignee, ..., etc) 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | ***** 150 | ## erudite-marker-539:JEMS16.citation 151 | 152 | 153 | 154 | > Accompanying materials to 155 | > 156 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 157 | > 158 | > Additional links: 159 | > 160 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 161 | > 162 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 163 | > 164 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 165 | 166 | 167 | 168 | 169 | 170 | | Stat | Value | 171 | |----------|----------| 172 | | Last updated | 2018-02-15 | 173 | | Rows | 174,205,746 | 174 | | Size | 10.4 GB | 175 | 176 | ### Schema 177 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.citation) 178 | 179 | * `id` INTEGER NULLABLE 180 | 181 | > System generated 182 | 183 | * `PatentNo_citing` STRING NULLABLE 184 | 185 | > Patent number (citing) 186 | 187 | * `CountryCodeOrNPL_cited` STRING NULLABLE 188 | 189 | > U.S. or a foreign country code or NPL (non-patent literature) of the cited art 190 | 191 | * `PatentNoOrNPL_cited` STRING NULLABLE 192 | 193 | > Patent number of non-patent literature (cited) 194 | 195 | * `sequence` STRING NULLABLE 196 | 197 | > Order of appearance (0 means the first cited art, 1 means the second cited art, ..., etc) 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | ***** 217 | ## erudite-marker-539:JEMS16.citation_self 218 | 219 | 220 | 221 | > Accompanying materials to 222 | > 223 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 224 | > 225 | > Additional links: 226 | > 227 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 228 | > 229 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 230 | > 231 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 
232 | 233 | 234 | 235 | 236 | 237 | | Stat | Value | 238 | |----------|----------| 239 | | Last updated | 2018-02-15 | 240 | | Rows | 1,667,637 | 241 | | Size | 20.1 MB | 242 | 243 | ### Schema 244 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.citation_self) 245 | 246 | * `PatentNo` STRING NULLABLE 247 | 248 | > Patent number 249 | 250 | * `Self_Citation_Flag` STRING NULLABLE 251 | 252 | > Backward prior art cites to the same pdpass 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | ***** 266 | ## erudite-marker-539:JEMS16.cpc 267 | 268 | 269 | 270 | > Accompanying materials to 271 | > 272 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 273 | > 274 | > Additional links: 275 | > 276 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 277 | > 278 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 279 | > 280 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 281 | 282 | 283 | 284 | 285 | 286 | | Stat | Value | 287 | |----------|----------| 288 | | Last updated | 2018-02-15 | 289 | | Rows | 65,896,459 | 290 | | Size | 3.7 GB | 291 | 292 | ### Schema 293 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.cpc) 294 | 295 | * `id` INTEGER NULLABLE 296 | 297 | > System generated 298 | 299 | * `PatentNo` STRING NULLABLE 300 | 301 | > Patent number 302 | 303 | * `Type` STRING NULLABLE 304 | 305 | > CPC 306 | 307 | * `CPC_Full` STRING NULLABLE 308 | 309 | > Full CPC 310 | 311 | * `CPC_Layer_1` STRING NULLABLE 312 | 313 | > CPC top layer 1: before {space} 314 | 315 | * `CPC_Layer_2` STRING NULLABLE 316 | 317 | > CPC top layer 2: before {slash} 318 | 319 | * `Sequence` STRING NULLABLE 320 | 321 | > Order of appearance (0 means the first CPC, 1 means the second CPC, ..., etc) 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | ***** 345 | ## erudite-marker-539:JEMS16.inventor_disambiguated_2 346 | 347 | 348 | Old table version `2`, schema skipped. 349 | 350 | 351 | 352 | 353 | 354 | ***** 355 | ## erudite-marker-539:JEMS16.inventor_disambiguated_3 356 | 357 | 358 | 359 | > Accompanying materials to 360 | > 361 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 362 | > 363 | > Additional links: 364 | > 365 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 366 | > 367 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 368 | > 369 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 
370 | 371 | 372 | > same table to inventor_disambiguated_2, except for data type differences easier for table joining 373 | 374 | 375 | 376 | 377 | | Stat | Value | 378 | |----------|----------| 379 | | Last updated | 2018-02-23 | 380 | | Rows | 13,345,776 | 381 | | Size | 492.6 MB | 382 | 383 | ### Schema 384 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.inventor_disambiguated_3) 385 | 386 | * `PatentNo` STRING NULLABLE 387 | 388 | * `InventorFullname` STRING NULLABLE 389 | 390 | * `InventorID` STRING NULLABLE 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | ***** 406 | ## erudite-marker-539:JEMS16.inventor_raw 407 | 408 | 409 | 410 | > Accompanying materials to 411 | > 412 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 413 | > 414 | > Additional links: 415 | > 416 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 417 | > 418 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 419 | > 420 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 421 | 422 | 423 | 424 | 425 | 426 | | Stat | Value | 427 | |----------|----------| 428 | | Last updated | 2018-02-15 | 429 | | Rows | 14,745,325 | 430 | | Size | 1.2 GB | 431 | 432 | ### Schema 433 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.inventor_raw) 434 | 435 | * `PatentNo` STRING NULLABLE 436 | 437 | > Patent number 438 | 439 | * `Sequence` STRING NULLABLE 440 | 441 | > Order of appearance (0 means the first inventor, 1 means the second inventor, ..., etc) 442 | 443 | * `FullName` STRING NULLABLE 444 | 445 | > Full name (in form of Last Name {semicolon} First Name {single space} Middle Name) 446 | 447 | * `LastName` STRING NULLABLE 448 | 449 | > Last Name 450 | 451 | * `FirstMiddleName` STRING NULLABLE 452 | 453 | > First Name {single space} Middle Name 454 | 455 | * `Geography` STRING NULLABLE 456 | 457 | > Raw (city, state, country) tuple of assginee 458 | 459 | * `Country` STRING NULLABLE 460 | 461 | > Country Code derived from field 'Geography' 462 | 463 | * `State` STRING NULLABLE 464 | 465 | > State Code derived from field 'Geography' (if in U.S.) 466 | 467 | * `City` STRING NULLABLE 468 | 469 | > City Code derived from field 'Geography' 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | ***** 497 | ## erudite-marker-539:JEMS16.patent_metadata_2 498 | 499 | 500 | 501 | > Accompanying materials to 502 | > 503 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 
504 | > 505 | > Additional links: 506 | > 507 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 508 | > 509 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 510 | > 511 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 512 | 513 | 514 | 515 | 516 | 517 | | Stat | Value | 518 | |----------|----------| 519 | | Last updated | 2018-02-15 | 520 | | Rows | 6,492,363 | 521 | | Size | 5.4 GB | 522 | 523 | ### Schema 524 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.patent_metadata_2) 525 | 526 | * `ApplNo` STRING NULLABLE 527 | 528 | > Application number 529 | 530 | * `ApplDate` STRING NULLABLE 531 | 532 | > Application date 533 | 534 | * `PatentNo` STRING NULLABLE 535 | 536 | > Patent number 537 | 538 | * `IssueDate` STRING NULLABLE 539 | 540 | > Patent issue date or grant date 541 | 542 | * `FamilyID` STRING NULLABLE joins on **family_id** 543 | 544 | > Patent Family ID derived from USPTO HTML page of the focal patent 545 | 546 | * `LawFirm` STRING NULLABLE 547 | 548 | > Agent / Law Firm / Correspondent 549 | 550 | * `AssistExaminer` STRING NULLABLE 551 | 552 | > Assistant examiner 553 | 554 | * `PrimaryExaminer` STRING NULLABLE 555 | 556 | > Primary examiner 557 | 558 | * `Title` STRING NULLABLE 559 | 560 | > Patent title 561 | 562 | * `Abstract` STRING NULLABLE 563 | 564 | > Patent abstract 565 | 566 | * `GovernmentInterests` STRING NULLABLE 567 | 568 | > Full text statement acknowledging U.S. government supports (if any) 569 | 570 | 571 | 572 | ### Join columns 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | #### FamilyID 584 | 585 | joins to `patents-public-data:patents.publications::family_id` on **family_id** (87.45%, 5,677,428 rows) 586 | 587 | | Key | Percent | Rows | Sample values | 588 | |------|-----|--------|--------------------------------------------------------| 589 | | `all` | 87.45% | 5,677,428 | `['41164314', '45348360', '46277349', '25524495', '44708394']` | 590 | 591 | 592 | #standardSQL 593 | SELECT 594 | COUNT(*) AS cnt, 595 | COUNT(second.second_column) AS second_cnt, 596 | ARRAY_AGG(first.FamilyID IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 597 | FROM `erudite-marker-539.JEMS16.patent_metadata_2`AS first 598 | LEFT JOIN ( 599 | SELECT family_id AS second_column, COUNT(*) AS cnt 600 | FROM `patents-public-data.patents.publications` 601 | GROUP BY 1 602 | ) AS second ON first.FamilyID = second.second_column 603 | 604 | 605 | 606 | joins from `patents-public-data:patents.publications::family_id` on **family_id** (25.67%, 25,206,642 rows) 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | ***** 626 | ## erudite-marker-539:JEMS16.patent_novelty 627 | 628 | 629 | 630 | > Accompanying materials to 631 | > 632 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 
633 | > 634 | > Additional links: 635 | > 636 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 637 | > 638 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 639 | > 640 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 641 | 642 | 643 | 644 | 645 | 646 | | Stat | Value | 647 | |----------|----------| 648 | | Last updated | 2018-02-15 | 649 | | Rows | 2,816,425 | 650 | | Size | 90.7 MB | 651 | 652 | ### Schema 653 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.patent_novelty) 654 | 655 | * `PatentNo` STRING NULLABLE 656 | 657 | > Patent number 658 | 659 | * `Word` STRING NULLABLE 660 | 661 | > New word (unigram) 662 | 663 | * `CurrentUse` STRING NULLABLE 664 | 665 | > Number of occurrence of the new word in the focal patent 666 | 667 | * `FutureUse` STRING NULLABLE 668 | 669 | > Number of appearances of the new word in subsequent patents (up until Dec 31, 2014) 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | -------------------------------------------------------------------------------- /tables/dataset_Berkeley Fung.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_Berkeley Fung.md.pdf -------------------------------------------------------------------------------- /tables/dataset_CPA Global.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | geometry: margin=0.6in 4 | --- 5 | 6 | # CPA Global 7 | 8 | 9 | ***** 10 | ## innography-174118:technical_standards.etsi 11 | 12 | 13 | 14 | > European Telecommunications Standards Institute (ETSI) IPR dataset for technical standards. 15 | > These are the US assets disclosed by companies as related to technical standards in ETSI. The two major ones included are 3GPP and LTE. 16 | 17 | 18 | > “Innography ETSI Data” by Innography (through ETSI IPR) is licensed under a Creative Commons Attribution 4.0 International License. 
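For a quick look at the data, the standards table can be joined to the main publications table directly from Python. The snippet below is an illustrative sketch only (it is not part of this dataset): it assumes the `google-cloud-bigquery` client library and default application credentials, and uses the `PublicationNumber` and `TechnicalStandard` columns shown in the schema below.

    from google.cloud import bigquery

    client = bigquery.Client()  # uses your default project and credentials
    query = """
    #standardSQL
    SELECT e.TechnicalStandard, COUNT(*) AS assets
    FROM `innography-174118.technical_standards.etsi` AS e
    JOIN `patents-public-data.patents.publications` AS p
      ON e.PublicationNumber = p.publication_number
    GROUP BY 1
    ORDER BY assets DESC
    """
    for row in client.query(query).result():
        print(row.TechnicalStandard, row.assets)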
19 | 20 | 21 | 22 | 23 | | Stat | Value | 24 | |----------|----------| 25 | | Last updated | 2017-07-25 | 26 | | Rows | 34,465 | 27 | | Size | 1.0 MB | 28 | 29 | ### Schema 30 | [View in BigQuery](https://bigquery.cloud.google.com/table/innography-174118:technical_standards.etsi) 31 | 32 | * `PublicationNumber` STRING REQUIRED joins on **publication_number** 33 | 34 | * `StandardBody` STRING REQUIRED 35 | 36 | * `TechnicalStandard` STRING REQUIRED 37 | 38 | 39 | 40 | ### Join columns 41 | 42 | 43 | #### PublicationNumber 44 | 45 | joins to `patents-public-data:patents.publications::publication_number` on **publication_number** (99.98%, 34,458 rows) 46 | 47 | | Key | Percent | Rows | Sample values | 48 | |------|-----|--------|--------------------------------------------------------| 49 | | `3GPP, LTE` | 99.99% | 8,381 | `['US-2009141670-A1', 'US-9673942-B2', 'US-2003185390-A1', 'US-7489672-B2', 'US-8347177-B2']` | 50 | | `LTE` | 99.99% | 7,071 | `['US-2011064120-A1', 'US-2009325504-A1', 'US-2014094175-A1', 'US-6163533-A', 'US-8009661-B2']` | 51 | | `3GPP` | 99.97% | 19,006 | `['US-8594035-B2', 'US-2012014344-A1', 'US-2017012727-A1', 'US-9648048-B2', 'US-2005065801-A1']` | 52 | 53 | 54 | #standardSQL 55 | SELECT 56 | COUNT(*) AS cnt, 57 | COUNT(second.second_column) AS second_cnt, 58 | first.TechnicalStandard AS grouped, 59 | ARRAY_AGG(first.PublicationNumber IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 60 | FROM `innography-174118.technical_standards.etsi`AS first 61 | LEFT JOIN ( 62 | SELECT publication_number AS second_column, COUNT(*) AS cnt 63 | FROM `patents-public-data.patents.publications` 64 | GROUP BY 1 65 | ) AS second ON first.PublicationNumber = second.second_column 66 | GROUP BY 3 67 | 68 | 69 | 70 | joins from `patents-public-data:patents.publications::publication_number` on **publication_number** (0.04%, 34,458 rows) 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /tables/dataset_CPA Global.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_CPA Global.md.pdf -------------------------------------------------------------------------------- /tables/dataset_European Bioinformatics Institute.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_European Bioinformatics Institute.md.pdf -------------------------------------------------------------------------------- /tables/dataset_Google Patents Public Datasets.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_Google Patents Public Datasets.md.pdf -------------------------------------------------------------------------------- /tables/dataset_Other.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_Other.md.pdf -------------------------------------------------------------------------------- /tables/dataset_USPTO.md.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_USPTO.md.pdf -------------------------------------------------------------------------------- /tables/index.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/index.md.pdf -------------------------------------------------------------------------------- /tools/bigquery-indexer/README.md: -------------------------------------------------------------------------------- 1 | # BigQuery column indexer 2 | 3 | This tool supports indexing and normalizing various columns in BigQuery tables. 4 | It reads an input BigQuery SQL statement to select the columns and outputs a new 5 | BigQuery table with the indexed columns. 6 | 7 | 8 | # Running locally (development) 9 | 10 | Build the runner container image with RDKit and Beam dependencies installed. Install the gcloud SDK and authenticate. 11 | 12 | ``` 13 | $ podman --cgroup-manager=cgroupfs build ./beam-rdkit-runner --format docker 14 | ... 15 | STEP 14: COMMIT 16 | --> 49b365fef6f 17 | $ podman run -it --entrypoint "/bin/bash" -v .:/opt/bigquery-indexer 49b365fef6f 18 | (beam-env) root@94bb44368d14$ wget https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-352.0.0-linux-x86_64.tar.gz 19 | (beam-env) root@94bb44368d14$ tar -xzf google-cloud-sdk-352.0.0-linux-x86_64.tar.gz 20 | (beam-env) root@94bb44368d14$ google-cloud-sdk/install.sh 21 | (beam-env) root@94bb44368d14$ gcloud init 22 | (beam-env) root@94bb44368d14$ gcloud auth application-default login 23 | (beam-env) root@94bb44368d14$ cd /opt/bigquery-indexer && python3 -m main --input_sql "SELECT * FROM nih-nci-cbiit-chem-prod.savi.all LIMIT 100" --output_table :savi.fingerprints --project --temp_location gs:///tmp/ --skip_fingerprint_columns reaction_smiles 24 | ``` 25 | 26 | # Running in Dataflow on GCP 27 | 28 | ``` 29 | $ pip install 'apache-beam[gcp]=2.31.0' 30 | ``` 31 | 32 | See https://cloud.google.com/dataflow/docs/quickstarts/quickstart-python, 33 | specifically setting GOOGLE_APPLICATION_CREDENTIALS is required. 34 | 35 | Use your local GCP account credentials by executing: 36 | 37 | ``` 38 | $ gcloud init 39 | $ gcloud auth application-default login 40 | ``` 41 | 42 | Build the runner container. 43 | 44 | ``` 45 | patents-public-data$ cd tools/bigquery-indexer/beam-rdkit-runner 46 | beam-rdkit-runner$ gcloud builds submit --tag gcr.io//beam-rdkit-runner:latest 47 | ``` 48 | 49 | This example indexes a column containing SMILES (the computer representation of a chemical). 50 | 51 | ```$ python3 -m main --input_sql "SELECT * FROM nih-nci-cbiit-chem-prod.savi.all LIMIT 100" --output_table :savi.fingerprints --project --temp_location gs:///tmp/ --skip_fingerprint_columns reaction_smiles --runner DataflowRunner --max_num_workers=20 --region us-central1 --machine_type=n2-highcpu-16 --disk_size_gb=50 --experiment=use_runner_v2 --sdk_container_image=gcr.io//beam-rdkit-runner:latest --save_main_session``` 52 | 53 | See more configuration flags at https://cloud.google.com/dataflow/docs/guides/flexrs and regions at https://cloud.google.com/dataflow/docs/resources/locations. 
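
# Checking fingerprints locally

The pipeline stores each fingerprint as the base64 form of an RDKit bit vector. As a rough local sanity check (this is an illustration, not part of the pipeline), the snippet below rebuilds the same four fingerprint types that `main.py` generates and compares two example molecules with Tanimoto similarity. It assumes an environment with RDKit installed, such as the `beam-rdkit-runner` image.

```
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator

def fingerprints(smiles):
    # Same generators as used by index_row()/generate_fingeprints() in main.py.
    mol = Chem.MolFromSmiles(smiles)
    return {
        'morgan_fp': rdFingerprintGenerator.GetMorganGenerator().GetFingerprint(mol),
        'rdkit_fp': rdFingerprintGenerator.GetRDKitFPGenerator().GetFingerprint(mol),
        'atompair_fp': rdFingerprintGenerator.GetAtomPairGenerator().GetFingerprint(mol),
        'tt_fp': rdFingerprintGenerator.GetTopologicalTorsionGenerator().GetFingerprint(mol),
    }

a = fingerprints('CC(=O)Oc1ccccc1C(=O)O')  # aspirin
b = fingerprints('OC(=O)c1ccccc1O')        # salicylic acid
for name, fp in a.items():
    print(name, round(DataStructs.TanimotoSimilarity(fp, b[name]), 3))
```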
54 | -------------------------------------------------------------------------------- /tools/bigquery-indexer/beam-rdkit-runner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM index.docker.io/library/debian:stable-slim 2 | 3 | RUN apt-get update 4 | 5 | RUN apt-get install -y wget build-essential 6 | 7 | RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 8 | 9 | RUN bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local 10 | 11 | RUN conda create -n beam-env python=3.8 12 | 13 | # Run the next commands inside the conda environment. 14 | RUN conda init bash 15 | RUN echo "conda activate beam-env" >> ~/.bashrc 16 | SHELL ["/bin/bash", "--login", "-c"] 17 | 18 | RUN conda install -q -y -c conda-forge rdkit pip 19 | 20 | RUN pip install --no-cache-dir apache-beam[gcp]==2.31.0 21 | 22 | # Copy files from official SDK image, including script/dependencies 23 | COPY --from=registry.hub.docker.com/apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam 24 | 25 | ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "beam-env", "/opt/apache/beam/boot"] 26 | -------------------------------------------------------------------------------- /tools/bigquery-indexer/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | """ 18 | Index/normalize/match various columns in BigQuery tables. 19 | 20 | Supports: 21 | * chemistry (SMILES) column fingerprinting for similarity search 22 | * (future) patent publication and application number normalization 23 | * (future) OCID matching 24 | """ 25 | 26 | import sys 27 | 28 | import argparse 29 | 30 | import apache_beam as beam 31 | 32 | from google.cloud import bigquery 33 | 34 | # The additional column suffixes added to each input row containing 'smiles'. 35 | fingerprint_columns = set(['morgan_fp', 'rdkit_fp', 'atompair_fp', 'tt_fp']) 36 | 37 | def index_row(row, skip_cols): 38 | orig_keys = list(row.keys()) 39 | for key in orig_keys: 40 | if 'smiles' in key and key not in skip_cols: 41 | fingerprints = generate_fingeprints(row[key]) 42 | for col, fp in fingerprints.items(): 43 | if col not in fingerprint_columns: 44 | raise RuntimeError(f'fingerprints generated column {col} not in {fingerprint_columns}') 45 | row[f'{key}_{col}'] = fp 46 | return row 47 | 48 | def generate_fingeprints(smiles): 49 | # Load these here so they're only needed on the worker machines. 
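    # RDKit is only installed in the custom beam-rdkit-runner image, so the
    # imports are deferred to call time: the pipeline can be constructed on a
    # machine without RDKit, and only the Dataflow workers pay the import cost.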
50 | from rdkit import Chem 51 | from rdkit.Chem import rdFingerprintGenerator 52 | 53 | morgan_fp = '' 54 | rdkit_fp = '' 55 | atompair_fp = '' 56 | tt_fp = '' 57 | 58 | try: 59 | mol = Chem.MolFromSmiles(smiles) 60 | 61 | # Morgan 62 | morgan_fp = rdFingerprintGenerator.GetMorganGenerator().GetFingerprint(mol).ToBase64() 63 | 64 | # Feature Morgan 65 | # TODO 66 | 67 | # RDKit 68 | rdkit_fp = rdFingerprintGenerator.GetRDKitFPGenerator().GetFingerprint(mol).ToBase64() 69 | 70 | # Layered 71 | # TODO 72 | 73 | # Atom pairs 74 | atompair_fp = rdFingerprintGenerator.GetAtomPairGenerator().GetFingerprint(mol).ToBase64() 75 | 76 | # MACCS 77 | # TODO 78 | 79 | # Topological Torsion 80 | tt_fp = rdFingerprintGenerator.GetTopologicalTorsionGenerator().GetFingerprint(mol).ToBase64() 81 | 82 | # Pattern 83 | # TODO 84 | 85 | # E-state 86 | # TODO 87 | 88 | except Exception as e: 89 | print(f'Exception {e} processing {smiles}') 90 | return {} 91 | # NOTE: add any new fingerprints to fingerprint_columns. 92 | return {'morgan_fp': morgan_fp, 'rdkit_fp': rdkit_fp, 'atompair_fp': atompair_fp, 'tt_fp': tt_fp} 93 | 94 | def get_query_output_schema(bq_client, query): 95 | try: 96 | # TODO: add support for accessing the schema to bq_client.query(). 97 | result = bq_client._connection.api_request( 98 | method="POST", 99 | path="/projects/jefferson-1790/queries", 100 | data={ 101 | "query": query, 102 | "dryRun": True, 103 | "useLegacySql": False, 104 | }) 105 | except Exception as exc: 106 | raise ValueError(f'Error testing SQL query "{query}"') from exc 107 | return result['schema'] 108 | 109 | def add_fingerprint_schema(orig_schema, skip_cols): 110 | to_add = [] 111 | for field in orig_schema['fields']: 112 | key = field['name'] 113 | if 'smiles' in key and key not in skip_cols: 114 | for fp_col in fingerprint_columns: 115 | to_add.append({ 116 | 'name': f'{key}_{fp_col}', 117 | 'type': 'BYTES', 118 | 'mode': 'NULLABLE', 119 | }) 120 | 121 | return {'fields': orig_schema['fields'] + to_add} 122 | 123 | def run(argv=None): # pylint: disable=missing-docstring 124 | parser = argparse.ArgumentParser() 125 | 126 | parser.add_argument( 127 | '--input_sql', 128 | dest='input_sql', 129 | default='', 130 | help='SQL statement to extract SMILES from. Fields containing `smiles` ' 131 | 'will generate fingerprints, and any additional fields will be ' 132 | 'passed through to the output row.') 133 | parser.add_argument( 134 | '--output_table', 135 | dest='output_table', 136 | required=True, 137 | help='Output BigQuery table with indexed chemistry.') 138 | parser.add_argument( 139 | '--skip_fingerprint_columns', 140 | dest='skip_fingerprint_columns', 141 | default=[], 142 | help='Column names to skip fingerprinting.') 143 | known_args, pipeline_args = parser.parse_known_args(argv) 144 | 145 | skip_cols = set(known_args.skip_fingerprint_columns.split(',')) 146 | 147 | # Query the output schema first so we know the schema to set. 148 | bq_client = bigquery.Client() 149 | 150 | # Get the output schema. 151 | orig_schema = get_query_output_schema(bq_client, known_args.input_sql) 152 | # Add the new fingerprint columns to the schema. 
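    # The dry-run schema describes only the pass-through columns from
    # --input_sql; add_fingerprint_schema() appends a NULLABLE BYTES column per
    # fingerprint type for every column whose name contains 'smiles' (except
    # --skip_fingerprint_columns), matching the keys emitted by index_row().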
153 | schema = add_fingerprint_schema(orig_schema, skip_cols) 154 | 155 | print(f'Output schema: {schema}') 156 | 157 | with beam.Pipeline(argv=pipeline_args) as p: 158 | input_rows = (p | 'Read' >> beam.io.Read(beam.io.ReadFromBigQuery( 159 | query=known_args.input_sql, 160 | use_standard_sql=True))) 161 | 162 | # Each row is a dictionary where the keys are the BigQuery columns 163 | fingerprints = input_rows | beam.Map( 164 | lambda row: index_row(row, skip_cols)) 165 | 166 | (fingerprints | 'Write' >> beam.io.WriteToBigQuery( 167 | known_args.output_table, 168 | schema=schema, 169 | write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, 170 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)) 171 | 172 | 173 | if __name__ == "__main__": 174 | run(sys.argv) 175 | -------------------------------------------------------------------------------- /tools/bq_bulk_cp.pysh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copy a bunch of bigquery tables matching a pattern to a new pattern. 16 | # bq_bulk_cp.pysh jefferson-1790:ebi_chembl.*_23 patents-public-data.ebi_chembl.* 17 | import sh 18 | import sys 19 | import re 20 | import argparse 21 | 22 | parser = argparse.ArgumentParser(description="Copy a set of BigQuery tables") 23 | parser.add_argument("--dry_run", default=False, action="store_true", help="do not copy") 24 | parser.add_argument("source", help="source table pattern, 'jefferson-1790:ebi_chembl.*_23'") 25 | parser.add_argument("target", help="target table pattern, 'patents-public-data:ebi_chembl.*_23'") 26 | args = parser.parse_args() 27 | 28 | source_dataset, source_pattern = args.source.split(".") 29 | if "*" in source_dataset: 30 | print("Wildcards are only supported on tables, not datasets.") 31 | sys.exit(1) 32 | 33 | # List all tables in a dataset. 34 | bq = sh.Command("bq") 35 | 36 | if "*" not in source_pattern: 37 | bq("cp", args.source, args.target) 38 | sys.exit(0) 39 | 40 | tables = bq("ls", "-n", "100000", source_dataset).stdout.split("\n")[2:] 41 | 42 | source_re = source_pattern.replace("*", "(.*)") 43 | 44 | for row in tables: 45 | if row == "": 46 | continue 47 | table = row.split()[0] 48 | 49 | match = re.match(source_re, table) 50 | if match: 51 | src = source_dataset + "." + table 52 | dest = args.target.replace("*", match.group(1)) 53 | print("bq cp %s %s" % (src, dest)) 54 | if not args.dry_run: 55 | bq("--debug_mode=true", "--headless=true", "cp", "--force", src, dest, _fg=True) 56 | else: 57 | print("Skipping %s" % source_dataset + "." + table) 58 | -------------------------------------------------------------------------------- /tools/bq_ls.pysh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # List datasets and tables matching a pattern. Collapse versions. 16 | # bq_ls.pysh jefferson-1790:* 17 | import sh 18 | import sys 19 | import re 20 | 21 | source_project = sys.argv[1] 22 | 23 | #source_project_dataset, source_pattern = source.split(".") 24 | #source_project, source_dataset = source_project_dataset.split(":") 25 | 26 | # List all tables in a dataset. 27 | bq = sh.Command("bq") 28 | 29 | datasets = bq("ls", source_project + ":").stdout.split("\n")[2:] 30 | 31 | for row in datasets: 32 | if row == "": 33 | continue 34 | dataset = row.split()[0] 35 | print("-"*50) 36 | print("Dataset: %s" % dataset) 37 | tables = bq("ls", "-n", "100000", source_project + ":" + dataset).stdout.split("\n")[2:] 38 | for x in tables: 39 | if x == "": 40 | continue 41 | table = x.split()[0] 42 | print("\t%s" % table) 43 | -------------------------------------------------------------------------------- /tools/csv_upload.pysh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Uploads one or more CSV files into one or more BigQuery tables. 16 | # 17 | # Single file, single table: 18 | # python3 csv_upload.pysh --source '~/Downloads/table.csv' --tables=jefferson-1790:dataset.table 19 | # 20 | # Multiple files, single table: 21 | # python3 csv_upload.pysh --source '~/Downloads/table_*.csv' --tables=jefferson-1790:dataset.table 22 | # 23 | # Multiple files per table, multiple tables: 24 | # python3 csv_upload.pysh --source '~/Downloads/patstat/Data/{}_part*.txt' --tables=jefferson-1790:epo_patstat.{} 25 | # table1_part00.txt, table1_part01.txt, ... 
-> jefferson-1790:epo_patstat.table1 26 | # table2_part00.txt -> jefferson-1790:epo_patstat.table2 27 | # etc 28 | import sys 29 | try: 30 | import sh 31 | except: 32 | print("Missing 'sh' library, run 'pip3 install sh'") 33 | sys.exit(1) 34 | import re 35 | import os 36 | import argparse 37 | import glob 38 | import hashlib 39 | import queue 40 | import io 41 | import csv 42 | 43 | parser = argparse.ArgumentParser(description="Upload a CSV file to a BigQuery table") 44 | parser.add_argument("--dry_run", default=False, action="store_true", help="Do not upload.") 45 | parser.add_argument("--bq_bin", default="bq", help="Path to the BigQuery CLI") 46 | parser.add_argument("--gsutil_bin", default="gsutil", help="Path to the GSUtil CLI") 47 | parser.add_argument("--project_id", default="", help="Google Cloud Project ID to store temporary Google Cloud Storage files in. If empty, uses the project from the table name.") 48 | parser.add_argument("--storage_bucket", default="", help="Google Cloud Storage bucket name. This bucket must be in the same region as --location. If empty, creates a new bucket under this project_id.") 49 | parser.add_argument("--overwrite", default=False, action="store_true", help="Overwrite the table if it exists.") 50 | parser.add_argument("--field_delimiter", default=",", help="Field delimiter between data.") 51 | parser.add_argument("--read_header", default=True, action="store_true", help="Set the schema from the first row of the first CSV input, else --header must be set.") 52 | parser.add_argument("--header", default="", help="Comma-separated header names for each column. Only for single tables.") 53 | parser.add_argument("--column_types", default="", help="Comma-separated types for each column. Only for single tables.") 54 | parser.add_argument("--location", default="US", help="Geographical location for the dataset, either US or EU. US is preferred, since JOINs must be between tables in the same region.") 55 | parser.add_argument("--tables", help="BigQuery destination tables. Use '{}' as a placeholder for a matching name in --sources ('project-id:dataset.{}').") 56 | parser.add_argument("--sources", help="CSV source file pattern. Use '{}' to generate multiple table names in --tables ('reg{}_part*.txt', '**/{}.csv').") 57 | args = parser.parse_args() 58 | 59 | # Argument checking. 60 | if not args.location in ["US", "EU"]: 61 | print("--location must be US or EU") 62 | ost.exit(1) 63 | 64 | # Find the source files and destinatination tables. 65 | sources = os.path.expanduser(args.sources) 66 | source_files = glob.glob(sources.replace("{}", "*")) 67 | 68 | table_files = {} 69 | 70 | file_re = sources.replace("*", ".*").replace("{}", "(.*)") + "$" 71 | for file in source_files: 72 | matches = re.search(file_re, file) 73 | if not matches: 74 | continue 75 | table_name = args.tables 76 | if "{}" in args.sources: 77 | table_part = matches.group(1) 78 | else: 79 | table_part = os.path.basename(file).replace('.', '_') 80 | table_name = args.tables.replace('{}', table_part) 81 | 82 | if table_name not in table_files: 83 | table_files[table_name] = [] 84 | table_files[table_name].append(file) 85 | 86 | for table in sorted(table_files.keys()): 87 | print(table) 88 | for v in table_files[table]: 89 | print(" " + v) 90 | 91 | if args.header and len(table_files) > 1: 92 | print("--header can only be set for a single table upload") 93 | os.exit(1) 94 | 95 | # Upload to bucket. 
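# BigQuery loads from Cloud Storage, so the files are first staged in a
# reusable bucket named "<project_id>-bq-uploads-tool" unless --storage_bucket
# is given; the bucket is created in a region matching --location so the later
# `bq load` does not read across regions, and staged objects are deleted once
# the table has been created.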
96 | # Clear bucket space 97 | gsutil = sh.Command(args.gsutil_bin) 98 | 99 | project_id = args.project_id 100 | if not project_id: 101 | if not ":" in args.tables: 102 | print("--tables must use project-id:dataset_name.table_name format") 103 | os.exit(1) 104 | project_id = args.tables.split(":")[0] 105 | print("Using --project_id=%s" % project_id) 106 | 107 | 108 | bucket = args.storage_bucket 109 | if not bucket: 110 | bucket = "%s-bq-uploads-tool" % project_id 111 | 112 | bucket = "gs://" + bucket 113 | 114 | try: 115 | gsutil("ls", bucket) 116 | print("Bucket %s exists" % bucket) 117 | except: 118 | if args.location == "EU": 119 | bucket_location = "europe-west1" 120 | else: 121 | bucket_location = "us-east1" 122 | 123 | mb_args = ["mb", "-c", "regional", "-l", bucket_location, "-p", project_id, bucket] 124 | print("gsutil %s" % mb_args) 125 | if not args.dry_run: 126 | gsutil(*mb_args) 127 | print("Created new bucket") 128 | 129 | # Split to 4G, gzip and upload CSV files. Skip the header lines. 130 | bq = sh.Command(args.bq_bin) 131 | 132 | buf = 8 * 2 ** 20 133 | 134 | class Splitter: 135 | def __init__(self, max_size, path): 136 | self.max_size = max_size 137 | self.path = path 138 | self.size = 0 139 | self.parts = 0 140 | self.upload_paths = [] 141 | self.upload_pipe = None 142 | self.upload_proc = None 143 | self.done_pipe = queue.Queue() 144 | 145 | def data(self, data_chunk): 146 | # Pipe this through to the gzip and upload commands. 147 | if not data_chunk: 148 | self.done() 149 | return 150 | chunk_size = len(data_chunk) 151 | if self.size + chunk_size > self.max_size: 152 | self.flush() 153 | self.size = 0 154 | self.upload_proc = None 155 | if self.upload_proc is None: 156 | self.upload_pipe = queue.Queue(maxsize=1) 157 | gzip_pipe = sh.gzip("-f", _in=self.upload_pipe, _in_bufsize=buf, _out_bufsize=buf, _bg=True) 158 | path_split = self.path + "_chunk%09d.gz" % self.parts 159 | self.upload_paths.append(path_split) 160 | self.parts += 1 161 | print("Uploading %s" % path_split) 162 | self.upload_proc = gsutil(gzip_pipe, "cp", "-", path_split, _in_bufsize=buf, _bg=True, _internal_bufsize=16 * 2 ** 20) 163 | print("Upload proc: %s" % self.upload_proc.pid) 164 | 165 | self.size += chunk_size 166 | print("%.4f GB" % (self.size / (2 ** 30))) 167 | self.upload_pipe.put(data_chunk) 168 | 169 | def done(self, *args): 170 | print("Splitter parent done") 171 | self.flush() 172 | self.done_pipe.put(True) 173 | 174 | def done_wait(self): 175 | # Block until done() is finished. 176 | self.done_pipe.get() 177 | 178 | def flush(self): 179 | print("Closing upload pipe") 180 | self.upload_pipe.put(None) 181 | print("Waiting for upload to finish") 182 | self.upload_proc.wait() 183 | print("Upload finished") 184 | 185 | 186 | for table in sorted(table_files.keys()): 187 | files = table_files[table] 188 | print("Uploading files for table %s" % table) 189 | uploaded_paths = [] 190 | for file in files: 191 | dest = bucket + "/%s_%s_%s" % (re.sub('[^a-zA-Z0-9_]', '', table), hashlib.sha1(file.encode('utf-8')).hexdigest(), os.path.basename(file)) 192 | # Split into 4G chunks, gzip and upload. 193 | print("Copying %s to %s..." % (file, dest)) 194 | if not args.dry_run: 195 | splitter = Splitter(4 * 2 ** 30, dest) # 4G 196 | # Read each file and forward the stream to splitter.data(chunk). 
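            # Splitter gzips the byte stream on the fly and pipes it into
            # `gsutil cp -`, starting a new "_chunkNNNNNNNNN.gz" object once
            # 4 GB of uncompressed input has been consumed, so nothing is
            # written to local disk.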
197 | with open(file, 'rb') as f: 198 | while True: 199 | chunk = f.read(buf) 200 | splitter.data(chunk) 201 | if not chunk: 202 | break 203 | splitter.done_wait() 204 | uploaded_paths.extend(splitter.upload_paths) 205 | else: 206 | uploaded_paths.append(dest + "...[dry run]") 207 | 208 | # Get the header. 209 | 210 | 211 | # build header and column datatypes 212 | header = args.header 213 | skip_leading_rows = 0 214 | if not header: 215 | if args.field_delimiter == "\\t": 216 | col_sep = "\t" 217 | elif args.field_delimiter == "\\s": 218 | col_sep = "\s" 219 | else: 220 | col_sep = args.field_delimiter 221 | with open(files[0], 'r') as f: 222 | header_list = next(csv.reader(f, delimiter=col_sep)) 223 | print(header_list) 224 | skip_leading_rows = 1 225 | else: 226 | header_list = header.split(",") 227 | # A column name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_) and 228 | # start with a letter. 229 | clean_headers = [] 230 | for header in header_list: 231 | h = re.sub('[^a-zA-Z0-9_]', '', header) 232 | if re.match('^[0-9].*$', h): 233 | h = "f" + h 234 | clean_headers.append(h) 235 | header_list = clean_headers 236 | 237 | print("Headers: %s" % header_list) 238 | 239 | column_types = args.column_types 240 | if not column_types: 241 | column_types_list = ['STRING'] * len(header_list) 242 | else: 243 | column_types_list = column_types.split(",") 244 | 245 | if len(header_list) != len(column_types_list): 246 | print("Number of header fields and column types must be equal.") 247 | os.exit(1) 248 | 249 | schema = ",".join([header_name + ":" + header_type.upper() for header_name, header_type in zip(header_list, column_types_list)]) 250 | 251 | # bq create table uploaded_paths 252 | bq_args = [ 253 | "--location", args.location, 254 | "--project_id", project_id, 255 | "load", 256 | "--source_format", "CSV", 257 | "--replace", 258 | "--field_delimiter", args.field_delimiter, 259 | "--schema", schema, 260 | "--allow_quoted_newlines", 261 | "--skip_leading_rows", "%d" % skip_leading_rows, 262 | table, 263 | ",".join(uploaded_paths), 264 | ] 265 | print("Creating table %s" % table) 266 | try: 267 | dataset = table.split(".")[0] 268 | bq("show", dataset) 269 | except: 270 | print("Creating dataset %s" % dataset) 271 | bq_mk_args = ["--location", args.location, "mk", "--project_id", project_id, dataset] 272 | print("bq %s" % bq_mk_args) 273 | if not args.dry_run: 274 | bq(*bq_mk_args) 275 | 276 | print("bq %s" % bq_args) 277 | if not args.dry_run: 278 | bq(*bq_args) 279 | print("Removing uploaded files %s" % uploaded_paths) 280 | gsutil("rm", *uploaded_paths) 281 | print("Done creating %s" % table) 282 | -------------------------------------------------------------------------------- /tools/dataset_berkeley_fung.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "Berkeley Fung": [ 4 | "erudite-marker-539:JEMS16.*" 5 | ] 6 | }, 7 | "groups": {}, 8 | "joins": { 9 | "family_id": [ 10 | "erudite-marker-539:JEMS16.patent_metadata_2|FamilyID" 11 | ] 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tools/dataset_ebi.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "European Bioinformatics Institute": [ 4 | "patents-public-data:ebi_chembl.*", 5 | "patents-public-data:ebi_surechembl.*" 6 | ] 7 | }, 8 | "groups": {}, 9 | "joins": { 10 | "publication_number": [ 11 | 
"patents-public-data:ebi_surechembl.match|publication_number", 12 | "patents-public-data:ebi_chembl.match_24|publication_number" 13 | ], 14 | "SureChEMBL patent_id": [ 15 | "patents-public-data:ebi_surechembl.match|patent_id", 16 | "+patents-public-data:ebi_surechembl.map|patent_id" 17 | ], 18 | "ChEMBL patent_no": [ 19 | "patents-public-data:ebi_chembl.match_24|patent_no", 20 | "+patents-public-data:ebi_chembl.product_patents_24|patent_no" 21 | ], 22 | "ChEMBL molregno": [ 23 | "+patents-public-data:ebi_chembl.compound_properties_24|molregno", 24 | "patents-public-data:ebi_chembl.*|molregno" 25 | ] 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tools/dataset_ifi.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "IFI Claims": [ 4 | "jefferson-1790:ifi_claims.xml", 5 | "jefferson-1790:ifi_claims.publications" 6 | ] 7 | }, 8 | "groups": { 9 | "jefferson-1790:ifi_claims.xml": "country" 10 | }, 11 | "joins": { 12 | "publication_number": [ 13 | "+jefferson-1790:ifi_claims.xml|publication_number" 14 | ] 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /tools/dataset_innography.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "CPA Global": [ 4 | "innography-174118:technical_standards.etsi" 5 | ] 6 | }, 7 | "groups": { 8 | "innography-174118:technical_standards.etsi": "TechnicalStandard" 9 | }, 10 | "joins": { 11 | "publication_number": [ 12 | "innography-174118:technical_standards.etsi|PublicationNumber" 13 | ] 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tools/dataset_other.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "Other": [ 4 | "patents-public-data:cpc.*", 5 | "patents-public-data:dsep.*", 6 | "patents-public-data:marec.*", 7 | "patents-public-data:usitc_investigations.*", 8 | "patents-public-data:worldbank_wdi.*" 9 | ] 10 | }, 11 | "groups": {}, 12 | "joins": { 13 | "publication_number": [ 14 | "patents-public-data:usitc_investigations.match|publication_number", 15 | "patents-public-data:marec.publications|publication_number" 16 | ], 17 | "family_id": [ 18 | "patents-public-data:dsep.disclosures_13|family_id" 19 | ] 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tools/dataset_public.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "Google Patents Public Datasets": [ 4 | "patents-public-data:patents.*", 5 | "patents-public-data:google_patents_research.*" 6 | ] 7 | }, 8 | "groups": { 9 | "patents-public-data:patents.publications": "country_code" 10 | }, 11 | "joins": { 12 | "publication_number": [ 13 | "+patents-public-data:patents.publications|publication_number", 14 | "patents-public-data:google_patents_research.publications|publication_number" 15 | ], 16 | "family_id": [ 17 | "+patents-public-data:patents.publications|family_id" 18 | ], 19 | "application_number": [ 20 | "+patents-public-data:patents.publications|application_number" 21 | ] 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /tools/dataset_report.pysh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Generate a report on the database tables. 16 | # 17 | # $ python3 dataset_report.pysh --project_id= --configs dataset_public.json dataset_uspto.json ... --output_dir=../tables --formats=pdf 18 | import sh 19 | import sys 20 | import re 21 | import os 22 | import json 23 | import collections 24 | import datetime 25 | import jinja2 26 | import argparse 27 | 28 | parser = argparse.ArgumentParser(description="Generate a set of documentation pages for BigQuery tables.") 29 | parser.add_argument("--project_id", help="Project ID used to query tables.") 30 | parser.add_argument("--configs", nargs="+", help="List of JSON configuration files.") 31 | parser.add_argument("--output_dir", help="Output directory for files.") 32 | parser.add_argument("--formats", help="Comma-separated list of output formats (pandoc-supported extensions)") 33 | args = parser.parse_args() 34 | 35 | if not args.output_dir: 36 | print("--output_dir is required") 37 | sys.exit(1) 38 | if not args.project_id: 39 | print("--project_id is required") 40 | sys.exit(1) 41 | 42 | output_dir = os.path.expanduser(args.output_dir) 43 | 44 | bq = sh.Command("bq") 45 | 46 | # Read config files. 47 | table_config = {} 48 | group_config = {} 49 | join_config = {} 50 | 51 | for name in args.configs: 52 | print("Reading config %s" % name) 53 | with open(os.path.expanduser(name), "r") as f: 54 | try: 55 | c = json.loads(f.read()) 56 | except Exception as e: 57 | print("Error parsing JSON (this is usually caused by a trailing comma)") 58 | raise e 59 | for k, v in c.get("tables", {}).items(): 60 | if k in table_config: 61 | table_config[k].extend(v) 62 | else: 63 | table_config[k] = v 64 | group_config.update(c.get("groups", {})) 65 | for k, v in c.get("joins", {}).items(): 66 | if k in join_config: 67 | join_config[k].extend(v) 68 | else: 69 | join_config[k] = v 70 | 71 | print(table_config) 72 | print(group_config) 73 | print(join_config) 74 | 75 | # Keep track of printed objects from __repr__. 
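# namedtuple() below builds small mutable record classes (Dataset, Table,
# Field, ...); because those objects reference each other, __repr__ uses this
# module-level set to avoid infinite recursion when printing.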
76 | __repr_recursion_set = None 77 | 78 | def namedtuple(name, field_list): 79 | fields = field_list.split(" ") 80 | def init(self, **kwargs): 81 | for k, v in kwargs.items(): 82 | if not k in fields: 83 | raise AttributeError("%s not in %s" % (k, fields)) 84 | setattr(self, k, v) 85 | def repr(self): 86 | global __repr_recursion_set 87 | top = False 88 | if not __repr_recursion_set: 89 | top = True 90 | __repr_recursion_set = set() 91 | if self in __repr_recursion_set: 92 | result = "%s<...>" % name 93 | else: 94 | __repr_recursion_set.add(self) 95 | result = "%s<%s>" % (name, ", ".join(["%s=%s" % (k, getattr(self, k)) for k in fields])) 96 | if top: 97 | __repr_recursion_set = None 98 | return result 99 | return type(name, (), dict({k: None for k in fields}, __init__=init, __repr__=repr)) 100 | 101 | Dataset = namedtuple("Dataset", "name last_updated tables") 102 | 103 | Table = namedtuple("Table", "name version dataset_description description dataset fields last_updated num_rows from_joins num_bytes old_version") 104 | 105 | Field = namedtuple("Field", "name table description type mode from_joins to_joins") 106 | 107 | Join = namedtuple("Join", "name from_field to_field percent num_rows join_stats sql") 108 | 109 | JoinStat = namedtuple("JoinStat", "percent num_rows key sample_value") 110 | 111 | datasets = collections.OrderedDict() 112 | 113 | def find_field(table_name, column): 114 | for dataset in datasets.values(): 115 | for t in dataset.tables: 116 | if t.name == table_name: 117 | for f in t.fields: 118 | if column == f.name: 119 | return f 120 | return None 121 | 122 | def ts_to_string(unix): 123 | return datetime.datetime.utcfromtimestamp(unix).strftime("%Y-%m-%d") 124 | 125 | def tsql(table): 126 | return table.replace(":", ".") 127 | 128 | # Fetch a list of all tables and schemas for those tables. 129 | for nice_name, table_fmts in table_config.items(): 130 | for table_fmt in table_fmts: 131 | dataset_name, table_name = table_fmt.split(".") 132 | if nice_name not in datasets: 133 | dataset = Dataset(name=nice_name) 134 | datasets[nice_name] = dataset 135 | else: 136 | dataset = datasets[nice_name] 137 | show_info = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "show", dataset_name).stdout.decode('utf-8')) 138 | 139 | if not dataset.tables: 140 | dataset.tables = [] 141 | 142 | print("Loading dataset %s" % dataset_name) 143 | tables = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "ls", "-n", "100000", dataset_name).stdout.decode('utf-8')) 144 | for table_data in tables: 145 | name = table_data["tableReference"]["tableId"] 146 | if re.match(table_name.replace("*", ".*"), name): 147 | table = Table(name=dataset_name + "." + name, dataset=dataset, dataset_description=show_info.get("description", "")) 148 | dataset.tables.append(table) 149 | print(table.name) 150 | 151 | # Detect table and dataset versions, mark older versions. 
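# A trailing "_<version>" suffix marks a versioned table: for example,
# "patents-public-data:ebi_chembl.compound_properties_24" has base name
# "...compound_properties" and version "24". Only the latest version of each base name
# (by string comparison) keeps its schema in the report; older versions are flagged
# below and skipped.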
152 | latest_table_base = {} # map[base name]latest name 153 | no_version_tables = {} 154 | for dataset in datasets.values(): 155 | for table in dataset.tables: 156 | def sub_fn(m): 157 | return m.group(1) 158 | m = re.match("^(.+)_([0-9]+[0-9a-zA-Z]*)", table.name) 159 | if not m: 160 | no_version_tables[table.name] = True 161 | latest_table_base[table.name] = table.name 162 | else: 163 | base = m.group(1) 164 | table.version = m.group(2) 165 | if not base in latest_table_base: 166 | latest_table_base[base] = table.name 167 | elif latest_table_base[base] < table.name and not no_version_tables.get(base, ""): 168 | latest_table_base[base] = table.name 169 | 170 | 171 | latest_tables = {} 172 | for latest in latest_table_base.values(): 173 | latest_tables[latest] = True 174 | 175 | for dataset in datasets.values(): 176 | for table in dataset.tables: 177 | if table.name not in latest_tables: 178 | table.old_version = True 179 | 180 | for dataset in datasets.values(): 181 | for table in dataset.tables: 182 | if table.old_version: 183 | print("Skipping old table %s" % table.name) 184 | continue 185 | print("Loading table %s" % table.name) 186 | table_info = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "show", table.name).stdout.decode('utf-8')) 187 | table_fields = [] 188 | def add_fields(parent, fields): 189 | for field in fields: 190 | name = field["name"] 191 | if parent: 192 | name = parent + "." + name 193 | table_fields.append(Field( 194 | name=name, 195 | table=table, 196 | description=field.get("description", ""), 197 | type=field.get("type", ""), 198 | mode=field.get("mode", ""), 199 | )) 200 | if "fields" in field: 201 | add_fields(name, field["fields"]) 202 | 203 | add_fields("", table_info["schema"]["fields"]) 204 | table.fields = table_fields 205 | table.description = table_info.get("description", "") 206 | table.last_updated = ts_to_string(int(table_info["lastModifiedTime"]) / 1000) 207 | if not dataset.last_updated or dataset.last_updated < table.last_updated: 208 | dataset.last_updated = table.last_updated 209 | table.num_rows = table_info["numRows"] 210 | table.num_bytes = table_info["numBytes"] 211 | # Possibly calculate group-by stats. 212 | if table.name in group_config: 213 | column = group_config[table.name] 214 | query = "SELECT COUNT(*) AS cnt, {column} AS grouped FROM `{table}` GROUP BY 2 ORDER BY 1".format(table=tsql(table.name), column=column) 215 | result = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "query", "--use_legacy_sql=false", query).stdout.decode('utf-8')) 216 | table.stats = {} 217 | for row in result: 218 | js = JoinStat(key=row["grouped"], num_rows=int(row["cnt"])) 219 | table.stats[js.key] = js 220 | 221 | 222 | # Support wildcards in join groups: dataset:*|molregno 223 | for join_group in join_config.values(): 224 | i = 0 225 | while i < len(join_group): 226 | if not "*" in join_group[i]: 227 | i += 1 228 | continue 229 | table_fmt, column_fmt = join_group[i].split("|") 230 | # Loop over all tables and columns and look for matches. 231 | matches = [] 232 | for dataset in datasets.values(): 233 | for table in dataset.tables: 234 | if not re.match(table_fmt.replace("*", ".*"), table.name) or table.old_version: 235 | continue 236 | for field in table.fields: 237 | if re.match(column_fmt.replace("*", ".*"), field.name): 238 | matches.append("%s|%s" % (table.name, field.name)) 239 | # Replace join_group[i] with the matched values. 
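# For example, the USPTO config's "patents-public-data:patentsview.*|patent_id" entry
# expands here into one "table|column" entry for every current patentsview table that
# has a patent_id field.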
240 | join_group.pop(i) 241 | for v in matches: 242 | join_group.insert(i, v) 243 | i += 1 244 | 245 | join_done = set() 246 | 247 | # Enumerate all possible joins inside each group of matching columns. 248 | for join_name, join_group in join_config.items(): 249 | for i in range(len(join_group)): 250 | self = join_group[i] 251 | for j in range(len(join_group)): 252 | if j == i: 253 | continue 254 | first_table, first_column = join_group[i].split("|") 255 | second_table, second_column = join_group[j].split("|") 256 | # Only join tables if one or more has a + as the prefix. 257 | if not first_table.startswith("+") and not second_table.startswith("+"): 258 | continue 259 | first_table = first_table.lstrip("+") 260 | second_table = second_table.lstrip("+") 261 | key = first_table + first_column + second_table + second_column 262 | if key in join_done or (first_table == second_table and first_column == second_column): 263 | continue 264 | join_done.add(key) 265 | print("Running join between %s and %s" % (join_group[i], join_group[j])) 266 | from_field = find_field(first_table, first_column) 267 | to_field = find_field(second_table, second_column) 268 | if not from_field or not to_field: 269 | raise TypeError("fields not found: %s:%s %s:%s" % (join_group[i], from_field is not None, join_group[j], to_field is not None)) 270 | group_by = group_config.get(first_table, None) 271 | if not group_by: 272 | query = """#standardSQL 273 | SELECT 274 | COUNT(*) AS cnt, 275 | COUNT(second.second_column) AS second_cnt, 276 | ARRAY_AGG(first.{first_column} IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 277 | FROM `{first_table}`AS first 278 | LEFT JOIN ( 279 | SELECT {second_column} AS second_column, COUNT(*) AS cnt 280 | FROM `{second_table}` 281 | GROUP BY 1 282 | ) AS second ON first.{first_column} = second.second_column""".format(first_table=tsql(first_table), first_column=first_column, second_table=tsql(second_table), second_column=second_column) 283 | else: 284 | query = """#standardSQL 285 | SELECT 286 | COUNT(*) AS cnt, 287 | COUNT(second.second_column) AS second_cnt, 288 | first.{group_by} AS grouped, 289 | ARRAY_AGG(first.{first_column} IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 290 | FROM `{first_table}`AS first 291 | LEFT JOIN ( 292 | SELECT {second_column} AS second_column, COUNT(*) AS cnt 293 | FROM `{second_table}` 294 | GROUP BY 1 295 | ) AS second ON first.{first_column} = second.second_column 296 | GROUP BY 3""".format(first_table=tsql(first_table), first_column=first_column, second_table=tsql(second_table), second_column=second_column, group_by=group_by) 297 | 298 | result = json.loads(bq("--format=prettyjson", "query", "--use_legacy_sql=false", query).stdout.decode('utf-8')) 299 | total_rows = 0 300 | joined_rows = 0 301 | 302 | join_stats = {} 303 | join = Join(name=join_name, from_field=from_field, to_field=to_field, join_stats=join_stats, sql=query) 304 | if not from_field.from_joins: 305 | from_field.from_joins = [] 306 | from_field.from_joins.append(join) 307 | if not to_field.to_joins: 308 | to_field.to_joins = [] 309 | to_field.to_joins.append(join) 310 | if not from_field.table.from_joins: 311 | from_field.table.from_joins = [] 312 | from_field.table.from_joins.append(join) 313 | for row in result: 314 | cnt = int(row["cnt"]) 315 | second_cnt = int(row["second_cnt"]) 316 | total_rows += cnt 317 | joined_rows += second_cnt 318 | if not group_by: 319 | join_stats[""] = JoinStat(percent=second_cnt / cnt, num_rows=second_cnt, key="all", 
sample_value=row["sample_value"]) 320 | else: 321 | join_stats[row["grouped"]] = JoinStat(percent=second_cnt / cnt, num_rows=second_cnt, key=row["grouped"], sample_value=row["sample_value"]) 322 | join.percent = joined_rows / total_rows 323 | join.num_rows = joined_rows 324 | 325 | def other_formats(name): 326 | if not args.formats: 327 | return 328 | for fmt in args.formats.split(","): 329 | sh.pandoc(name, "--from", "markdown", "-s", "-o", "%s.%s" % (name, fmt)) 330 | 331 | # "index.md" 332 | # Links to every dataset and description of each dataset 333 | # DOT graph of links between tables 334 | # Link statistics: % of rows that link together 335 | main_page_template = jinja2.Template(""" 336 | --- 337 | geometry: margin=0.6in 338 | --- 339 | 340 | # Datasets 341 | 342 | {% for dataset in datasets.values() %} 343 | ## [{{dataset.name}}](dataset_{{dataset.name}}.md) 344 | 345 | | Name | Last updated | Rows | Joins | 346 | |-------------------------------------------|-------|--------|-----------------| 347 | {% for table in dataset.tables -%} 348 | | [{{table.name}}](https://bigquery.cloud.google.com/table/{{table.name}}) | {% if table.last_updated %}{{table.last_updated }}{% endif %} | {% if table.num_rows %}{{"{0:,}".format(table.num_rows|int)}}{% endif %} | 349 | {%- if table.from_joins %}{% for group in table.from_joins|groupby("name") -%} 350 | {{group.grouper}} {% endfor %}{% endif %} | 351 | {% endfor %} 352 | {% endfor %} 353 | """) 354 | 355 | index_output = os.path.join(output_dir, "index.md") 356 | with open(index_output, "w") as f: 357 | f.write(main_page_template.render(datasets=datasets)) 358 | other_formats(index_output) 359 | 360 | # "dataset_.md" 361 | # Description of dataset 362 | # List of all tables in dataset 363 | # Sample rows in each table 364 | # Links to other datasets 365 | # Inner-dataset links 366 | # DOT graph of links 367 | dataset_page_template = jinja2.Template(""" 368 | --- 369 | geometry: margin=0.6in 370 | --- 371 | 372 | # {{dataset.name}} 373 | 374 | {% for table in dataset.tables %} 375 | ***** 376 | ## {{table.name}} 377 | 378 | {% if table.old_version %} 379 | Old table version `{{ table.version }}`, schema skipped. 
380 | {% else %} 381 | {% if table.dataset_description %} 382 | > {{table.dataset_description|replace("\n", "\n> ")}} 383 | {% endif %} 384 | {% if table.description %} 385 | > {{table.description|replace("\n", "\n> ")}} 386 | {% endif %} 387 | {% endif %} 388 | 389 | {% if table.fields %} 390 | | Stat | Value | 391 | |----------|----------| 392 | | Last updated | {{table.last_updated}} | 393 | | Rows | {{"{0:,}".format(table.num_rows|int)}} | 394 | | Size | {{table.num_bytes|filesizeformat}} | 395 | 396 | ### Schema 397 | [View in BigQuery](https://bigquery.cloud.google.com/table/{{table.name}}) 398 | 399 | {% for field in table.fields -%} 400 | * `{{field.name}}` {{field.type}} {{field.mode}} {% if field.from_joins %} joins on **{{ field.from_joins[0].name }}**{% endif %} 401 | {% if field.description %} 402 | > {{field.description|replace("\n", "\n> ")}} 403 | {% endif %} 404 | {% endfor %} 405 | 406 | {% if table.from_joins %}### Join columns{% endif %} 407 | {% for field in table.fields %} 408 | {% if field.from_joins %} 409 | #### {{field.name}} 410 | {% for join in field.from_joins %} 411 | joins to `{{ join.to_field.table.name }}::{{ join.to_field.name }}` on **{{ join.name }}** ({{"%.2f" % (100 * join.percent)}}%, {{"{0:,}".format(join.num_rows|int)}} rows) 412 | 413 | | Key | Percent | Rows | Sample values | 414 | |------|-----|--------|--------------------------------------------------------| 415 | {% for stat in join.join_stats.values() -%} 416 | | `{{stat.key}}` | {% if stat.percent > 0.0 %}{{"%.2f" % (100 * stat.percent)}}%{% else %}*none*{% endif %} | {{"{0:,}".format(stat.num_rows|int)}} | `{{stat.sample_value}}` | 417 | {% endfor %} 418 | 419 | {{join.sql|indent}} 420 | 421 | {% endfor %} 422 | {% for join in field.to_joins %} 423 | joins from `{{ join.from_field.table.name }}::{{ join.from_field.name }}` on **{{ join.name }}** ({{"%.2f" % (100 * join.percent)}}%, {{"{0:,}".format(join.num_rows|int)}} rows) 424 | {% endfor %} 425 | {% endif %} 426 | {% endfor %} 427 | {% endif %} 428 | 429 | {% endfor %} 430 | """) 431 | 432 | for dataset in datasets.values(): 433 | output = os.path.join(output_dir, "dataset_%s.md" % dataset.name) 434 | with open(output, "w") as f: 435 | f.write(dataset_page_template.render(dataset=dataset)) 436 | other_formats(output) 437 | -------------------------------------------------------------------------------- /tools/dataset_uspto.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "USPTO": [ 4 | "patents-public-data:patentsview.*", 5 | "patents-public-data:uspto_oce_assignment.*", 6 | "patents-public-data:uspto_oce_cancer.*", 7 | "patents-public-data:uspto_oce_claims.*", 8 | "patents-public-data:uspto_oce_litigation.*", 9 | "patents-public-data:uspto_oce_office_actions.*", 10 | "patents-public-data:uspto_oce_pair.*", 11 | "patents-public-data:uspto_peds.*", 12 | "patents-public-data:uspto_ptab.*" 13 | ] 14 | }, 15 | "groups": { 16 | "patents-public-data:patents.publications": "country_code" 17 | }, 18 | "joins": { 19 | "publication_number": [ 20 | "patents-public-data:patentsview.match|publication_number", 21 | "patents-public-data:uspto_oce_assignment.match|publication_number", 22 | "patents-public-data:uspto_oce_cancer.match|publication_number", 23 | "patents-public-data:uspto_oce_claims.match|publication_number" 24 | ], 25 | "family_id": [ 26 | "patents-public-data:uspto_oce_cancer.publications|Family_ID" 27 | ], 28 | "application_number": [ 29 | 
"patents-public-data:uspto_oce_office_actions.match_app|application_number", 30 | "patents-public-data:uspto_oce_pair.match|application_number", 31 | "patents-public-data:uspto_peds.match|application_number", 32 | "patents-public-data:uspto_ptab.match|application_number" 33 | ], 34 | "OCE Assignment pgpub_doc_num": [ 35 | "patents-public-data:uspto_oce_assignment.match|pgpub_doc_num", 36 | "+patents-public-data:uspto_oce_assignment.documentid|pgpub_doc_num" 37 | ], 38 | "OCE Assignment grant_doc_num": [ 39 | "patents-public-data:uspto_oce_assignment.match|grant_doc_num", 40 | "+patents-public-data:uspto_oce_assignment.documentid|grant_doc_num" 41 | ], 42 | "OCE Cancer id": [ 43 | "patents-public-data:uspto_oce_cancer.match|Patent_or_Publication_ID", 44 | "+patents-public-data:uspto_oce_cancer.publications|Patent_or_Publication_ID" 45 | ], 46 | "OCE Claims pat_no": [ 47 | "patents-public-data:uspto_oce_claims.match|pat_no", 48 | "+patents-public-data:uspto_oce_claims.patent_document_stats|pat_no" 49 | ], 50 | "OCE Claims pub_no": [ 51 | "patents-public-data:uspto_oce_claims.match|pub_no", 52 | "+patents-public-data:uspto_oce_claims.pgpub_document_stats|pub_no" 53 | ], 54 | "OCE OA app_id": [ 55 | "patents-public-data:uspto_oce_office_actions.match_app|app_id", 56 | "+patents-public-data:uspto_oce_office_actions.citations|app_id", 57 | "+patents-public-data:uspto_oce_office_actions.office_actions|app_id", 58 | "+patents-public-data:uspto_oce_office_actions.rejections|app_id" 59 | ], 60 | "OCE OA pub_id": [ 61 | "+patents-public-data:uspto_oce_office_actions.citations|parsed", 62 | "patents-public-data:uspto_oce_office_actions.match_pub|parsed" 63 | ], 64 | "OCE PAIR app_num": [ 65 | "patents-public-data:uspto_oce_pair.match|application_number_pair", 66 | "+patents-public-data:uspto_oce_pair.application_data|application_number" 67 | ], 68 | "PEDS app_num": [ 69 | "patents-public-data:uspto_peds.match|applicationNumberText", 70 | "+patents-public-data:uspto_peds.applications|patentCaseMetadata.applicationNumberText.electronicText" 71 | ], 72 | "PTAB app_num": [ 73 | "patents-public-data:uspto_ptab.match|ApplicationNumber", 74 | "+patents-public-data:uspto_ptab.trials|ApplicationNumber" 75 | ], 76 | "PatentsView patent_id": [ 77 | "+patents-public-data:patentsview.patent|id", 78 | "patents-public-data:patentsview.*|patent_id" 79 | ] 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /tools/generate_dataset_docs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Generate a report on the database tables. 16 | # 17 | # $ python3 dataset_report.pysh --project_id= --configs dataset_public.json dataset_uspto.json ... 
--output_dir=../tables --formats=pdf 18 | import sh 19 | import sys 20 | import re 21 | import os 22 | import json 23 | import collections 24 | import datetime 25 | import jinja2 26 | import argparse 27 | 28 | parser = argparse.ArgumentParser(description="Generate a set of documentation pages for BigQuery tables.") 29 | parser.add_argument("--project_id", help="Project ID used to query tables.") 30 | parser.add_argument("--configs", nargs="+", help="List of JSON configuration files.") 31 | parser.add_argument("--output_dir", help="Output directory for files.") 32 | args = parser.parse_args() 33 | 34 | if not args.output_dir: 35 | print("--output_dir is required") 36 | sys.exit(1) 37 | if not args.project_id: 38 | print("--project_id is required") 39 | sys.exit(1) 40 | 41 | output_dir = os.path.expanduser(args.output_dir) 42 | 43 | bq = sh.Command("bq") 44 | 45 | # Read config files. 46 | table_config = {} 47 | group_config = {} 48 | join_config = {} 49 | 50 | for name in args.configs: 51 | print("Reading config %s" % name) 52 | with open(os.path.expanduser(name), "r") as f: 53 | try: 54 | c = json.loads(f.read()) 55 | except Exception as e: 56 | print("Error parsing JSON (this is usually caused by a trailing comma)") 57 | raise e 58 | for k, v in c.get("tables", {}).items(): 59 | if k in table_config: 60 | table_config[k].extend(v) 61 | else: 62 | table_config[k] = v 63 | group_config.update(c.get("groups", {})) 64 | for k, v in c.get("joins", {}).items(): 65 | if k in join_config: 66 | join_config[k].extend(v) 67 | else: 68 | join_config[k] = v 69 | 70 | print(table_config) 71 | print(group_config) 72 | print(join_config) 73 | 74 | # Keep track of printed objects from __repr__. 75 | __repr_recursion_set = None 76 | 77 | def namedtuple(name, field_list): 78 | fields = field_list.split(" ") 79 | def init(self, **kwargs): 80 | for k, v in kwargs.items(): 81 | if not k in fields: 82 | raise AttributeError("%s not in %s" % (k, fields)) 83 | setattr(self, k, v) 84 | def repr(self): 85 | global __repr_recursion_set 86 | top = False 87 | if not __repr_recursion_set: 88 | top = True 89 | __repr_recursion_set = set() 90 | if self in __repr_recursion_set: 91 | result = "%s<...>" % name 92 | else: 93 | __repr_recursion_set.add(self) 94 | result = "%s<%s>" % (name, ", ".join(["%s=%s" % (k, getattr(self, k)) for k in fields])) 95 | if top: 96 | __repr_recursion_set = None 97 | return result 98 | return type(name, (), dict({k: None for k in fields}, __init__=init, __repr__=repr)) 99 | 100 | Dataset = namedtuple("Dataset", "name last_updated tables") 101 | 102 | Table = namedtuple("Table", "name version dataset_description description dataset fields last_updated num_rows from_joins num_bytes old_version") 103 | 104 | Field = namedtuple("Field", "name table description type mode from_joins to_joins") 105 | 106 | Join = namedtuple("Join", "name from_field to_field percent num_rows join_stats sql") 107 | 108 | JoinStat = namedtuple("JoinStat", "percent num_rows key sample_value") 109 | 110 | datasets = collections.OrderedDict() 111 | 112 | def find_field(table_name, column): 113 | for dataset in datasets.values(): 114 | for t in dataset.tables: 115 | if t.name == table_name: 116 | for f in t.fields: 117 | if column == f.name: 118 | return f 119 | return None 120 | 121 | def ts_to_string(unix): 122 | return datetime.datetime.utcfromtimestamp(unix).strftime("%Y-%m-%d") 123 | 124 | def tsql(table): 125 | return table.replace(":", ".") 126 | 127 | # Fetch a list of all tables and schemas for those tables. 
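# Each config entry has the form "<project>:<dataset>.<table pattern>", e.g.
# "patents-public-data:patentsview.*": the part before the "." is passed to
# "bq show"/"bq ls" as the dataset, and the remainder is matched against table IDs
# with "*" treated as a wildcard.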
128 | for nice_name, table_fmts in table_config.items(): 129 | for table_fmt in table_fmts: 130 | dataset_name, table_name = table_fmt.split(".") 131 | if nice_name not in datasets: 132 | dataset = Dataset(name=nice_name) 133 | datasets[nice_name] = dataset 134 | else: 135 | dataset = datasets[nice_name] 136 | show_info = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "show", dataset_name).stdout.decode('utf-8')) 137 | 138 | if not dataset.tables: 139 | dataset.tables = [] 140 | 141 | print("Loading dataset %s" % dataset_name) 142 | tables = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "ls", "-n", "100000", dataset_name).stdout.decode('utf-8')) 143 | for table_data in tables: 144 | name = table_data["tableReference"]["tableId"] 145 | if re.match(table_name.replace("*", ".*"), name): 146 | table = Table(name=dataset_name + "." + name, dataset=dataset, dataset_description=show_info.get("description", "")) 147 | dataset.tables.append(table) 148 | print(table.name) 149 | 150 | # Detect table and dataset versions, mark older versions. 151 | latest_table_base = {} # map[base name]latest name 152 | no_version_tables = {} 153 | for dataset in datasets.values(): 154 | for table in dataset.tables: 155 | def sub_fn(m): 156 | return m.group(1) 157 | m = re.match("^(.+)_([0-9]+[0-9a-zA-Z]*)", table.name) 158 | if not m: 159 | no_version_tables[table.name] = True 160 | latest_table_base[table.name] = table.name 161 | else: 162 | base = m.group(1) 163 | table.version = m.group(2) 164 | if not base in latest_table_base: 165 | latest_table_base[base] = table.name 166 | elif latest_table_base[base] < table.name and not no_version_tables.get(base, ""): 167 | latest_table_base[base] = table.name 168 | 169 | 170 | latest_tables = {} 171 | for latest in latest_table_base.values(): 172 | latest_tables[latest] = True 173 | 174 | for dataset in datasets.values(): 175 | for table in dataset.tables: 176 | if table.name not in latest_tables: 177 | table.old_version = True 178 | 179 | for dataset in datasets.values(): 180 | for table in dataset.tables: 181 | if table.old_version: 182 | print("Skipping old table %s" % table.name) 183 | continue 184 | print("Loading table %s" % table.name) 185 | table_info = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "show", table.name).stdout.decode('utf-8')) 186 | table_fields = [] 187 | def add_fields(parent, fields): 188 | for field in fields: 189 | name = field["name"] 190 | if parent: 191 | name = parent + "." + name 192 | table_fields.append(Field( 193 | name=name, 194 | table=table, 195 | description=field.get("description", ""), 196 | type=field.get("type", ""), 197 | mode=field.get("mode", ""), 198 | )) 199 | if "fields" in field: 200 | add_fields(name, field["fields"]) 201 | 202 | add_fields("", table_info["schema"]["fields"]) 203 | table.fields = table_fields 204 | table.description = table_info.get("description", "") 205 | table.last_updated = ts_to_string(int(table_info["lastModifiedTime"]) / 1000) 206 | if not dataset.last_updated or dataset.last_updated < table.last_updated: 207 | dataset.last_updated = table.last_updated 208 | table.num_rows = table_info["numRows"] 209 | table.num_bytes = table_info["numBytes"] 210 | # Possibly calculate group-by stats. 
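# The configs group patents-public-data:patents.publications by country_code, so this
# issues, for example:
#   SELECT COUNT(*) AS cnt, country_code AS grouped
#   FROM `patents-public-data.patents.publications` GROUP BY 2 ORDER BY 1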
211 | if table.name in group_config: 212 | column = group_config[table.name] 213 | query = "SELECT COUNT(*) AS cnt, {column} AS grouped FROM `{table}` GROUP BY 2 ORDER BY 1".format(table=tsql(table.name), column=column) 214 | result = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "query", "--use_legacy_sql=false", query).stdout.decode('utf-8')) 215 | table.stats = {} 216 | for row in result: 217 | js = JoinStat(key=row["grouped"], num_rows=int(row["cnt"])) 218 | table.stats[js.key] = js 219 | 220 | 221 | # Support wildcards in join groups: dataset:*|molregno 222 | for join_group in join_config.values(): 223 | i = 0 224 | while i < len(join_group): 225 | if not "*" in join_group[i]: 226 | i += 1 227 | continue 228 | table_fmt, column_fmt = join_group[i].split("|") 229 | # Loop over all tables and columns and look for matches. 230 | matches = [] 231 | for dataset in datasets.values(): 232 | for table in dataset.tables: 233 | if not re.match(table_fmt.replace("*", ".*"), table.name) or table.old_version: 234 | continue 235 | for field in table.fields: 236 | if re.match(column_fmt.replace("*", ".*"), field.name): 237 | matches.append("%s|%s" % (table.name, field.name)) 238 | # Replace join_group[i] with the matched values. 239 | join_group.pop(i) 240 | for v in matches: 241 | join_group.insert(i, v) 242 | i += 1 243 | 244 | join_done = set() 245 | 246 | # Enumerate all possible joins inside each group of matching columns. 247 | for join_name, join_group in join_config.items(): 248 | for i in range(len(join_group)): 249 | self = join_group[i] 250 | for j in range(len(join_group)): 251 | if j == i: 252 | continue 253 | first_table, first_column = join_group[i].split("|") 254 | second_table, second_column = join_group[j].split("|") 255 | # Only join tables if one or more has a + as the prefix. 
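# A "+" prefix in the config (e.g. "+patents-public-data:patents.publications|publication_number")
# marks the anchor column of its join group: every other column in the group is measured
# against an anchored column, rather than joining every pair of columns against each other.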
256 | if not first_table.startswith("+") and not second_table.startswith("+"): 257 | continue 258 | first_table = first_table.lstrip("+") 259 | second_table = second_table.lstrip("+") 260 | key = first_table + first_column + second_table + second_column 261 | if key in join_done or (first_table == second_table and first_column == second_column): 262 | continue 263 | join_done.add(key) 264 | print("Running join between %s and %s" % (join_group[i], join_group[j])) 265 | from_field = find_field(first_table, first_column) 266 | to_field = find_field(second_table, second_column) 267 | if not from_field or not to_field: 268 | raise TypeError("fields not found: %s:%s %s:%s" % (join_group[i], from_field is not None, join_group[j], to_field is not None)) 269 | group_by = group_config.get(first_table, None) 270 | if not group_by: 271 | query = """#standardSQL 272 | SELECT 273 | COUNT(*) AS cnt, 274 | COUNT(second.second_column) AS second_cnt, 275 | ARRAY_AGG(first.{first_column} IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 276 | FROM `{first_table}`AS first 277 | LEFT JOIN ( 278 | SELECT {second_column} AS second_column, COUNT(*) AS cnt 279 | FROM `{second_table}` 280 | GROUP BY 1 281 | ) AS second ON first.{first_column} = second.second_column""".format(first_table=tsql(first_table), first_column=first_column, second_table=tsql(second_table), second_column=second_column) 282 | else: 283 | query = """#standardSQL 284 | SELECT 285 | COUNT(*) AS cnt, 286 | COUNT(second.second_column) AS second_cnt, 287 | first.{group_by} AS grouped, 288 | ARRAY_AGG(first.{first_column} IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 289 | FROM `{first_table}`AS first 290 | LEFT JOIN ( 291 | SELECT {second_column} AS second_column, COUNT(*) AS cnt 292 | FROM `{second_table}` 293 | GROUP BY 1 294 | ) AS second ON first.{first_column} = second.second_column 295 | GROUP BY 3""".format(first_table=tsql(first_table), first_column=first_column, second_table=tsql(second_table), second_column=second_column, group_by=group_by) 296 | 297 | result = json.loads(bq("--format=prettyjson", "query", "--use_legacy_sql=false", query).stdout.decode('utf-8')) 298 | total_rows = 0 299 | joined_rows = 0 300 | 301 | join_stats = {} 302 | join = Join(name=join_name, from_field=from_field, to_field=to_field, join_stats=join_stats, sql=query) 303 | if not from_field.from_joins: 304 | from_field.from_joins = [] 305 | from_field.from_joins.append(join) 306 | if not to_field.to_joins: 307 | to_field.to_joins = [] 308 | to_field.to_joins.append(join) 309 | if not from_field.table.from_joins: 310 | from_field.table.from_joins = [] 311 | from_field.table.from_joins.append(join) 312 | for row in result: 313 | cnt = int(row["cnt"]) 314 | second_cnt = int(row["second_cnt"]) 315 | total_rows += cnt 316 | joined_rows += second_cnt 317 | if not group_by: 318 | join_stats[""] = JoinStat(percent=second_cnt / cnt, num_rows=second_cnt, key="all", sample_value=row["sample_value"]) 319 | else: 320 | join_stats[row["grouped"]] = JoinStat(percent=second_cnt / cnt, num_rows=second_cnt, key=row["grouped"], sample_value=row["sample_value"]) 321 | join.percent = joined_rows / total_rows 322 | join.num_rows = joined_rows 323 | 324 | def other_formats(name): 325 | if not getattr(args, "formats", None):  # --formats is not defined for this script; skip pandoc conversion when unset 326 | return 327 | for fmt in args.formats.split(","): 328 | sh.pandoc(name, "--from", "markdown", "-s", "-o", "%s.%s" % (name, fmt)) 329 | 330 | # "index.md" 331 | # Links to every dataset and description of each dataset 332 | # DOT graph of links between tables 333 | # 
Link statistics: % of rows that link together 334 | main_page_template = jinja2.Template(""" 335 | --- 336 | geometry: margin=0.6in 337 | --- 338 | 339 | # Datasets 340 | 341 | {% for dataset in datasets.values() %} 342 | ## [{{dataset.name}}](dataset_{{dataset.name}}.md) 343 | 344 | | Name | Last updated | Rows | Joins | 345 | |-------------------------------------------|-------|--------|-----------------| 346 | {% for table in dataset.tables -%} 347 | | [{{table.name}}](https://bigquery.cloud.google.com/table/{{table.name}}) | {% if table.last_updated %}{{table.last_updated }}{% endif %} | {% if table.num_rows %}{{"{0:,}".format(table.num_rows|int)}}{% endif %} | 348 | {%- if table.from_joins %}{% for group in table.from_joins|groupby("name") -%} 349 | {{group.grouper}} {% endfor %}{% endif %} | 350 | {% endfor %} 351 | {% endfor %} 352 | """) 353 | 354 | index_output = os.path.join(output_dir, "index.md") 355 | with open(index_output, "w") as f: 356 | f.write(main_page_template.render(datasets=datasets)) 357 | other_formats(index_output) 358 | 359 | # "dataset_.md" 360 | # Description of dataset 361 | # List of all tables in dataset 362 | # Sample rows in each table 363 | # Links to other datasets 364 | # Inner-dataset links 365 | # DOT graph of links 366 | dataset_page_template = jinja2.Template(""" 367 | --- 368 | geometry: margin=0.6in 369 | --- 370 | 371 | # {{dataset.name}} 372 | 373 | {% for table in dataset.tables %} 374 | ***** 375 | ## {{table.name}} 376 | 377 | {% if table.old_version %} 378 | Old table version `{{ table.version }}`, schema skipped. 379 | {% else %} 380 | {% if table.dataset_description %} 381 | > {{table.dataset_description|replace("\n", "\n> ")}} 382 | {% endif %} 383 | {% if table.description %} 384 | > {{table.description|replace("\n", "\n> ")}} 385 | {% endif %} 386 | {% endif %} 387 | 388 | {% if table.fields %} 389 | | Stat | Value | 390 | |----------|----------| 391 | | Last updated | {{table.last_updated}} | 392 | | Rows | {{"{0:,}".format(table.num_rows|int)}} | 393 | | Size | {{table.num_bytes|filesizeformat}} | 394 | 395 | ### Schema 396 | [View in BigQuery](https://bigquery.cloud.google.com/table/{{table.name}}) 397 | 398 | {% for field in table.fields -%} 399 | * `{{field.name}}` {{field.type}} {{field.mode}} {% if field.from_joins %} joins on **{{ field.from_joins[0].name }}**{% endif %} 400 | {% if field.description %} 401 | > {{field.description|replace("\n", "\n> ")}} 402 | {% endif %} 403 | {% endfor %} 404 | 405 | {% if table.from_joins %}### Join columns{% endif %} 406 | {% for field in table.fields %} 407 | {% if field.from_joins %} 408 | #### {{field.name}} 409 | {% for join in field.from_joins %} 410 | joins to `{{ join.to_field.table.name }}::{{ join.to_field.name }}` on **{{ join.name }}** ({{"%.2f" % (100 * join.percent)}}%, {{"{0:,}".format(join.num_rows|int)}} rows) 411 | 412 | | Key | Percent | Rows | Sample values | 413 | |------|-----|--------|--------------------------------------------------------| 414 | {% for stat in join.join_stats.values() -%} 415 | | `{{stat.key}}` | {% if stat.percent > 0.0 %}{{"%.2f" % (100 * stat.percent)}}%{% else %}*none*{% endif %} | {{"{0:,}".format(stat.num_rows|int)}} | `{{stat.sample_value}}` | 416 | {% endfor %} 417 | 418 | {{join.sql|indent}} 419 | 420 | {% endfor %} 421 | {% for join in field.to_joins %} 422 | joins from `{{ join.from_field.table.name }}::{{ join.from_field.name }}` on **{{ join.name }}** ({{"%.2f" % (100 * join.percent)}}%, {{"{0:,}".format(join.num_rows|int)}} rows) 423 | 
{% endfor %} 424 | {% endif %} 425 | {% endfor %} 426 | {% endif %} 427 | 428 | {% endfor %} 429 | """) 430 | 431 | for dataset in datasets.values(): 432 | output = os.path.join(output_dir, "dataset_%s.md" % dataset.name) 433 | with open(output, "w") as f: 434 | f.write(dataset_page_template.render(dataset=dataset)) 435 | other_formats(output) 436 | -------------------------------------------------------------------------------- /tools/sqlite_dump.pysh: -------------------------------------------------------------------------------- 1 | # python sqlite_dump.pysh '/usr/local/google/home/wetherbeei/Downloads/chembl/chembl_24.db' '/usr/local/google/home/wetherbeei/Downloads/chembl/' 2 | import sh 3 | import sys 4 | import os.path 5 | 6 | db = sys.argv[1] 7 | dir = sys.argv[2] 8 | 9 | tables = sh.sqlite3("-csv", db, "SELECT name FROM sqlite_master WHERE type='table';").stdout.decode('utf-8').split("\n") 10 | 11 | for table in tables: 12 | if table == "": 13 | continue 14 | path = os.path.join(dir, table + ".csv") 15 | print("Dumping %s" % path) 16 | sh.sqlite3("-csv", "-header", db, "SELECT * FROM %s;" % table, _out=path) 17 | --------------------------------------------------------------------------------
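The dataset_*.json configs above all follow one small convention: table specs are written as "<project>:<dataset>.<table or * pattern>", join entries as "<table spec>|<column>", and a leading "+" marks the anchor column that the rest of its (merged) join group is measured against. Below is a minimal standalone sketch of a checker for that convention; it is not part of this repository, its filename and specific checks are illustrative assumptions, and it uses only the Python standard library (run it from the tools/ directory).

# check_dataset_configs.py -- illustrative helper, not part of this repository.
# Sanity-checks the convention used by tools/dataset_*.json:
#   table specs:  "<project>:<dataset>.<table or * pattern>"
#   join entries: "<table spec>|<column>", with a leading "+" marking the anchor side
#                 that the rest of the (merged) join group is measured against.
import glob
import json
import re
import sys

TABLE_SPEC = re.compile(r"^[\w-]+:\w+\.[\w.*]+$")

def main():
    errors = []
    merged_joins = {}  # join groups are merged across config files, as in the report scripts
    for path in sorted(glob.glob("dataset_*.json")):
        with open(path) as f:
            config = json.load(f)
        for specs in config.get("tables", {}).values():
            for spec in specs:
                if not TABLE_SPEC.match(spec):
                    errors.append("%s: bad table spec %r" % (path, spec))
        for name, group in config.get("joins", {}).items():
            merged_joins.setdefault(name, []).extend(group)
            for entry in group:
                spec, _, column = entry.partition("|")
                if not column or not TABLE_SPEC.match(spec.lstrip("+")):
                    errors.append("%s: bad join entry %r in group %r" % (path, entry, name))
    # After merging, a multi-entry group with no "+" anchor never produces a join query.
    for name, group in merged_joins.items():
        if len(group) > 1 and not any(entry.startswith("+") for entry in group):
            errors.append("join group %r has multiple entries but no '+' anchor" % name)
    for err in errors:
        print(err)
    return 1 if errors else 0

if __name__ == "__main__":
    sys.exit(main())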