├── .gitignore ├── CONTRIBUTING.md ├── Creating a new BigQuery dataset.pdf ├── LICENSE ├── Querying a BigQuery dataset.pdf ├── README.md ├── examples ├── BERT_For_Patents.ipynb ├── Document_representation_from_BERT.ipynb ├── claim-text │ ├── claim_text_extraction.ipynb │ └── data │ │ └── 20k_G_and_H_publication_numbers.csv └── patent_set_expansion.ipynb ├── models ├── BERT for Patents.md ├── claim_breadth │ ├── README.md │ ├── batch_inference.py │ ├── batch_inference_test.py │ ├── generate_embedding_vocab.sql │ ├── hptuning_config.yaml │ ├── preprocess.py │ ├── preprocess_test.py │ ├── requirements.txt │ ├── testdata │ │ └── example-output-from-preprocess-step.tfrecord.gz │ └── trainer │ │ ├── __init__.py │ │ ├── model.py │ │ └── task.py └── landscaping │ ├── AutomatedPatentLandscaping.pdf │ ├── AutomatedPatentLandscaping_2018Update.pdf │ ├── LandscapeNotebook.ipynb │ ├── README.md │ ├── __init__.py │ ├── expansion.py │ ├── figs │ ├── flow.png │ └── project-id.png │ ├── keras_metrics.py │ ├── model.py │ ├── seeds │ ├── README.md │ ├── hair_dryer.seed.csv │ ├── hair_dryer_large.seed.csv │ └── video_codec.seed.csv │ ├── tokenizer.py │ ├── train_data.py │ └── word2vec.py ├── tables ├── dataset_Berkeley Fung.md ├── dataset_Berkeley Fung.md.pdf ├── dataset_CPA Global.md ├── dataset_CPA Global.md.pdf ├── dataset_European Bioinformatics Institute.md ├── dataset_European Bioinformatics Institute.md.pdf ├── dataset_Google Patents Public Datasets.md ├── dataset_Google Patents Public Datasets.md.pdf ├── dataset_Other.md ├── dataset_Other.md.pdf ├── dataset_USPTO.md ├── dataset_USPTO.md.pdf ├── index.md └── index.md.pdf └── tools ├── bigquery-indexer ├── README.md ├── beam-rdkit-runner │ └── Dockerfile └── main.py ├── bq_bulk_cp.pysh ├── bq_ls.pysh ├── csv_upload.pysh ├── dataset_berkeley_fung.json ├── dataset_ebi.json ├── dataset_ifi.json ├── dataset_innography.json ├── dataset_other.json ├── dataset_public.json ├── dataset_report.pysh ├── dataset_uspto.json ├── generate_dataset_docs.py └── sqlite_dump.pysh /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | abstracts/ 3 | checkpoints/ 4 | models/landscaping/bigquery_credentials.dat 5 | models/landscaping/models/ 6 | .ipynb_checkpoints/ 7 | __pycache__/ 8 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution, 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 
24 | 25 | -------------------------------------------------------------------------------- /Creating a new BigQuery dataset.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/Creating a new BigQuery dataset.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 204 | -------------------------------------------------------------------------------- /Querying a BigQuery dataset.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/Querying a BigQuery dataset.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Patent analysis using the Google Patents Public Datasets on BigQuery 2 | 3 | The contents of this repository are not an official Google product. 4 | 5 | [Google Patents Public Datasets](https://console.cloud.google.com/launcher/browse?q=google%20patents%20public%20datasets&filter=solution-type:dataset) is a collection of compatible BigQuery database tables from government, research and private companies for conducting statistical analysis of patent data. The data is available to be queried with SQL through BigQuery, joined with private datasets you upload, and exported and processed using many other compatible analysis tools. This repository is a centralized source for examples which use the data. 6 | 7 | Currently the repo contains three examples: 8 | 9 | 1. [Patent Landscaping](https://github.com/google/patents-public-data/blob/master/models/landscaping/README.md): A demo of an automated process of finding patents related to a particular topic given an initial seed set of patents. Based on the paper by Dave Feltenberger and Aaron Abood, [Automated Patent Landscaping](models/landscaping/AutomatedPatentLandscaping.pdf). 10 | 11 | 2. [Claim Text Extraction](https://github.com/google/patents-public-data/blob/master/examples/claim-text/claim_text_extraction.ipynb): A demo of interacting with patent claim text data using BigQuery and python. 12 | 13 | 3. [Claim Breadth Model](https://github.com/google/patents-public-data/blob/master/models/claim_breadth/README.md): A machine learning method for estimating patent claim breadth using data from BigQuery. 14 | 15 | Other helpful resources from the community: 16 | 17 | 1. 
[Replicable Patent Indicators](https://www.kaggle.com/code/georgeabiyounes/replicable-patent-indicators/notebook) ([paper](https://onlinelibrary.wiley.com/doi/10.1111/1467-8462.12545)) 18 | -------------------------------------------------------------------------------- /examples/Document_representation_from_BERT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Document representation from BERT", 7 | "provenance": [ 8 | { 9 | "file_id": "1hccaqNncyxDG32f5U1Qncz6ipWiLV0TQ", 10 | "timestamp": 1614125265907 11 | }, 12 | { 13 | "file_id": "1d9KurXhXvrV-jo-x2f7DkZ40qx75YAh_", 14 | "timestamp": 1604694308174 15 | } 16 | ], 17 | "collapsed_sections": [], 18 | "last_runtime": { 19 | "build_target": "//corp/legal/patents/colab:dst_colab_notebook", 20 | "kind": "shared" 21 | }, 22 | "toc_visible": true 23 | }, 24 | "kernelspec": { 25 | "name": "python3", 26 | "display_name": "Python 3" 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "ED6tBdZtOjlU" 34 | }, 35 | "source": [ 36 | "# Document representation from BERT" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "CqNm7ioGOgSm" 43 | }, 44 | "source": [ 45 | "Copyright 2021 Google Inc.\n", 46 | "\n", 47 | "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n", 48 | "\n", 49 | "http://www.apache.org/licenses/LICENSE-2.0\n", 50 | "\n", 51 | "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." 
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "id": "c1vLcDJINTGg" 58 | }, 59 | "source": [ 60 | "import collections\n", 61 | "import math\n", 62 | "import random\n", 63 | "import sys\n", 64 | "import time\n", 65 | "from typing import Dict, List, Tuple\n", 66 | "from sklearn.metrics import pairwise\n", 67 | "# Use Tensorflow 2.0\n", 68 | "import tensorflow as tf\n", 69 | "import numpy as np" 70 | ], 71 | "execution_count": null, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "metadata": { 77 | "id": "vfSIZaeaPHpZ", 78 | "colab": { 79 | "height": 53 80 | }, 81 | "executionInfo": { 82 | "status": "ok", 83 | "timestamp": 1614125346371, 84 | "user_tz": 300, 85 | "elapsed": 155, 86 | "user": { 87 | "displayName": "Rob Srebrovic", 88 | "photoUrl": "", 89 | "userId": "06004353344935214283" 90 | } 91 | }, 92 | "outputId": "c0bca557-2962-4f3b-a8f9-71be6d820897" 93 | }, 94 | "source": [ 95 | "# Set BigQuery application credentials\n", 96 | "from google.cloud import bigquery\n", 97 | "import os\n", 98 | "os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"path/to/file.json\"\n", 99 | "\n", 100 | "project_id = \"your_bq_project_id\"\n", 101 | "bq_client = bigquery.Client(project=project_id)" 102 | ], 103 | "execution_count": 2, 104 | "outputs": [ 105 | { 106 | "output_type": "execute_result", 107 | "data": { 108 | "application/vnd.google.colaboratory.intrinsic+json": { 109 | "type": "string" 110 | }, 111 | "text/plain": [ 112 | "'# Set BigQuery application credentials\\nfrom google.cloud import bigquery\\nimport os\\nos.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"path/to/file.json\"\\n\\nproject_id = \"your_bq_project_id\"\\nbq_client = bigquery.Client(project=project_id)'" 113 | ] 114 | }, 115 | "metadata": { 116 | "tags": [] 117 | }, 118 | "execution_count": 2 119 | } 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "7BojUHDYrESY" 126 | }, 127 | "source": [ 128 | "# You will have to clone the BERT repo\n", 129 | "!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo\n", 130 | "if not 'bert_repo' in sys.path:\n", 131 | " sys.path += ['bert_repo']" 132 | ], 133 | "execution_count": null, 134 | "outputs": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "id": "QeoX7LfgPLGP" 140 | }, 141 | "source": [ 142 | "The BERT repo uses Tensorflow 1 and thus a few of the functions have been moved/changed/renamed in Tensorflow 2. In order for the BERT tokenizer to be used, one of the lines in the repo that was just cloned needs to be modified to comply with Tensorflow 2. Line 125 in the BERT tokenization.py file must be changed as follows:\n", 143 | "\n", 144 | "From => `with tf.gfile.GFile(vocab_file, \"r\") as reader:`\n", 145 | "\n", 146 | "To => `with tf.io.gfile.GFile(vocab_file, \"r\") as reader:`\n", 147 | "\n", 148 | "Once that is complete and the file is saved, the tokenization library can be imported." 
149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "metadata": { 154 | "id": "HsSJXKPDPLXn" 155 | }, 156 | "source": [ 157 | "import tokenization" 158 | ], 159 | "execution_count": null, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "id": "JBqRRfigQxxK" 166 | }, 167 | "source": [ 168 | "# Load BERT" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "kp2fx508lWBG" 175 | }, 176 | "source": [ 177 | "MAX_SEQ_LENGTH = 512\n", 178 | "MODEL_DIR = 'path/to/model'\n", 179 | "VOCAB = 'path/to/vocab'\n", 180 | "\n", 181 | "tokenizer = tokenization.FullTokenizer(VOCAB, do_lower_case=True)" 182 | ], 183 | "execution_count": null, 184 | "outputs": [] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "metadata": { 189 | "id": "sNf96pSxxXg2" 190 | }, 191 | "source": [ 192 | "model = tf.compat.v2.saved_model.load(export_dir=MODEL_DIR, tags=['serve'])\n", 193 | "model = model.signatures['serving_default']" 194 | ], 195 | "execution_count": null, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "metadata": { 201 | "id": "-BWnaHqoT7db" 202 | }, 203 | "source": [ 204 | "# Mean pooling layer for combining\n", 205 | "pooling = tf.keras.layers.GlobalAveragePooling1D()" 206 | ], 207 | "execution_count": null, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "id": "rtzZg5LESCxF" 214 | }, 215 | "source": [ 216 | "# Get a couple of Patents\n", 217 | "\n", 218 | "Here we do a simple query from the BigQuery patents data to collect the claims for a sample set of patents." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "metadata": { 224 | "id": "u3iTTJQ5SFba" 225 | }, 226 | "source": [ 227 | "# Put your publications here.\n", 228 | "test_pubs = (\n", 229 | " 'US-8000000-B2', 'US-2007186831-A1', 'US-2009030261-A1', 'US-10722718-B2'\n", 230 | ")\n", 231 | "\n", 232 | "js = r\"\"\"\n", 233 | " // Regex to find the separations of the claims data\n", 234 | " var pattern = new RegExp(/[.][\\\\s]+[0-9]+[\\\\s]*[.]/, 'g');\n", 235 | " if (pattern.test(text)) {\n", 236 | " return text.split(pattern);\n", 237 | " }\n", 238 | "\"\"\"\n", 239 | "\n", 240 | "query = r'''\n", 241 | " #standardSQL\n", 242 | " CREATE TEMPORARY FUNCTION breakout_claims(text STRING) RETURNS ARRAY \n", 243 | " LANGUAGE js AS \"\"\"\n", 244 | " {}\n", 245 | " \"\"\"; \n", 246 | "\n", 247 | " SELECT \n", 248 | " pubs.publication_number, \n", 249 | " title.text as title, \n", 250 | " breakout_claims(claims.text) as claims\n", 251 | " FROM `patents-public-data.patents.publications` as pubs,\n", 252 | " UNNEST(claims_localized) as claims,\n", 253 | " UNNEST(title_localized) as title\n", 254 | " WHERE\n", 255 | " publication_number in {}\n", 256 | "'''.format(js, test_pubs)\n", 257 | "\n", 258 | "df = bq_client.query(query).to_dataframe()" 259 | ], 260 | "execution_count": null, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "metadata": { 266 | "colab": { 267 | "height": 241 268 | }, 269 | "id": "ORcVOefPsT0U", 270 | "executionInfo": { 271 | "status": "ok", 272 | "timestamp": 1614011849900, 273 | "user_tz": 300, 274 | "elapsed": 309, 275 | "user": { 276 | "displayName": "Jay Yonamine", 277 | "photoUrl": "", 278 | "userId": "01949405773282057831" 279 | } 280 | }, 281 | "outputId": "5299f3c1-b64e-4cbd-9206-273d1fb1d300" 282 | }, 283 | "source": [ 284 | "df.head()" 285 | ], 286 | "execution_count": null, 287 | "outputs": [ 288 | { 289 | "output_type": "execute_result", 290 | 
"data": { 291 | "text/html": [ 292 | "
\n", 293 | "\n", 306 | "\n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | "
publication_numbertitleclaims
0US-2009030261-A1Drug delivery system[1 . A drug delivery system comprising:\\n a ca...
1US-2007186831-A1Sewing machine[1 . A sewing machine comprising:\\n a needle b...
2US-8000000-B2Visual prosthesis[1. A visual prosthesis apparatus comprising:\\...
3US-10722718-B2Systems and methods for treatment of dry eye[What is claimed is: \\n \\n 1. A meth...
\n", 342 | "
" 343 | ], 344 | "text/plain": [ 345 | " publication_number ... claims\n", 346 | "0 US-2009030261-A1 ... [1 . A drug delivery system comprising:\\n a ca...\n", 347 | "1 US-2007186831-A1 ... [1 . A sewing machine comprising:\\n a needle b...\n", 348 | "2 US-8000000-B2 ... [1. A visual prosthesis apparatus comprising:\\...\n", 349 | "3 US-10722718-B2 ... [What is claimed is: \\n \\n 1. A meth...\n", 350 | "\n", 351 | "[4 rows x 3 columns]" 352 | ] 353 | }, 354 | "metadata": { 355 | "tags": [] 356 | }, 357 | "execution_count": 8 358 | } 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "metadata": { 364 | "id": "NeFzKlMw1DQd" 365 | }, 366 | "source": [ 367 | "def get_bert_token_input(texts):\n", 368 | " input_ids = []\n", 369 | " input_mask = []\n", 370 | " segment_ids = []\n", 371 | "\n", 372 | " for text in texts:\n", 373 | " tokens = tokenizer.tokenize(text)\n", 374 | " if len(tokens) > MAX_SEQ_LENGTH - 2:\n", 375 | " tokens = tokens[0:(MAX_SEQ_LENGTH - 2)]\n", 376 | " tokens = ['[CLS]'] + tokens + ['[SEP]']\n", 377 | "\n", 378 | "\n", 379 | " ids = tokenizer.convert_tokens_to_ids(tokens)\n", 380 | " token_pad = MAX_SEQ_LENGTH - len(ids)\n", 381 | " input_mask.append([1] * len(ids) + [0] * token_pad)\n", 382 | " input_ids.append(ids + [0] * token_pad)\n", 383 | " segment_ids.append([0] * MAX_SEQ_LENGTH)\n", 384 | " \n", 385 | " return {\n", 386 | " 'segment_ids': tf.convert_to_tensor(segment_ids, dtype=tf.int64),\n", 387 | " 'input_mask': tf.convert_to_tensor(input_mask, dtype=tf.int64),\n", 388 | " 'input_ids': tf.convert_to_tensor(input_ids, dtype=tf.int64),\n", 389 | " 'mlm_positions': tf.convert_to_tensor([], dtype=tf.int64)\n", 390 | " }" 391 | ], 392 | "execution_count": null, 393 | "outputs": [] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "metadata": { 398 | "id": "MlrVU10IOlSZ" 399 | }, 400 | "source": [ 401 | "docs_embeddings = []\n", 402 | "for _, row in df.iterrows():\n", 403 | " inputs = get_bert_token_input(row['claims'])\n", 404 | " response = model(**inputs)\n", 405 | " avg_embeddings = pooling(\n", 406 | " tf.reshape(response['encoder_layer'], shape=[1, -1, 1024]))\n", 407 | " docs_embeddings.append(avg_embeddings.numpy()[0])" 408 | ], 409 | "execution_count": null, 410 | "outputs": [] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "metadata": { 415 | "id": "DhF2-w2yU52U", 416 | "executionInfo": { 417 | "status": "ok", 418 | "timestamp": 1614012215102, 419 | "user_tz": 300, 420 | "elapsed": 240, 421 | "user": { 422 | "displayName": "Jay Yonamine", 423 | "photoUrl": "", 424 | "userId": "01949405773282057831" 425 | } 426 | }, 427 | "outputId": "c6148de6-f1c2-40c3-d75d-90cc0f4e0469" 428 | }, 429 | "source": [ 430 | "pairwise.cosine_similarity(docs_embeddings)" 431 | ], 432 | "execution_count": null, 433 | "outputs": [ 434 | { 435 | "output_type": "execute_result", 436 | "data": { 437 | "text/plain": [ 438 | "array([[0.9999988 , 0.68387157, 0.83200616, 0.86913264],\n", 439 | " [0.68387157, 1.0000013 , 0.7299322 , 0.73105675],\n", 440 | " [0.83200616, 0.7299322 , 0.99999964, 0.9027555 ],\n", 441 | " [0.86913264, 0.73105675, 0.9027555 , 0.9999996 ]], dtype=float32)" 442 | ] 443 | }, 444 | "metadata": { 445 | "tags": [] 446 | }, 447 | "execution_count": 13 448 | } 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "metadata": { 454 | "id": "TFWxL-IGU9-6", 455 | "executionInfo": { 456 | "status": "ok", 457 | "timestamp": 1614012321633, 458 | "user_tz": 300, 459 | "elapsed": 227, 460 | "user": { 461 | "displayName": "Jay Yonamine", 462 | "photoUrl": "", 463 | 
"userId": "01949405773282057831" 464 | } 465 | }, 466 | "outputId": "9fffcf1d-0c2c-4d84-eb8e-847d6054f125" 467 | }, 468 | "source": [ 469 | "docs_embeddings[0].shape" 470 | ], 471 | "execution_count": null, 472 | "outputs": [ 473 | { 474 | "output_type": "execute_result", 475 | "data": { 476 | "text/plain": [ 477 | "(1024,)" 478 | ] 479 | }, 480 | "metadata": { 481 | "tags": [] 482 | }, 483 | "execution_count": 23 484 | } 485 | ] 486 | } 487 | ] 488 | } 489 | -------------------------------------------------------------------------------- /models/BERT for Patents.md: -------------------------------------------------------------------------------- 1 | Copyright 2020 Google Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 8 | 9 | # BERT for Patents 10 | 11 | The BERT exported here has been trained on >100 million patent documents and was trained on all parts of a patent (abstract, claims, description). 12 | 13 | The BERT model exported here comes in two formats: 14 | 15 | * [SavedModel](https://storage.googleapis.com/patents-public-data-github/saved_model.zip) 16 | 17 | * [Checkpoint](https://storage.googleapis.com/patents-public-data-github/checkpoint.zip) 18 | 19 | The models can also be loaded and saved in another format or just the weights can be saved. 20 | 21 | The BERT model has been trained on >100 million patent documents and was trained on all parts of a patent (abstract, claims, description). It has a similar configuration to the BERT-Large model, with a couple of important notes: 22 | 23 | * The maximum input sequence length is 512 tokens and maximum masked words for a sequence is 45. 24 | * The vocabulary has approximately 8000 added words from the standard BERT vocabulary. These represent frequently used patent terms. 25 | * The vocabulary includes "context" tokens indicating what part of a patent the text is from (abstract, claims, summary, invention). Providing context tokens in the examples is optional. 26 | 27 | The full BERT vocabulary can be downloaded [here](https://storage.googleapis.com/patents-public-data-github/bert_for_patents_vocab_39k.txt). The vocabulary also contains 1000 unused tokens so that more tokens can be added. 28 | 29 | The exact configuration for the BERT model is as follows (and downloaded [here](https://storage.googleapis.com/patents-public-data-github/bert_for_patents_large_config.json)): 30 | 31 | * attention_probs_dropout_prob: 0.1 32 | * hidden_act: gelu 33 | * hidden_dropout_prob: 0.1 34 | * hidden_size: 1024 35 | * initializer_range: 0.02 36 | * intermediate_size: 4096 37 | * max_position_embeddings: 512 38 | * num_attention_heads: 16 39 | * num_hidden_layers: 24 40 | * vocab_size: 39859 41 | 42 | The model has requires the following input signatures: 43 | 1. `input_ids` 44 | 2. `input_mask` 45 | 3. `segment_ids` 46 | 4. `mlm_ids` 47 | 48 | And the BERT model contains output signatures for: 49 | 1. `cls_token` 50 | 2. `encoder_layer` is the contextualized word embeddings from the last encoder layer. 51 | 3. 
52 | -------------------------------------------------------------------------------- /models/claim_breadth/README.md: -------------------------------------------------------------------------------- 1 | # Measuring Patent Claim Breadth Using Google Patents Public Data on BigQuery 2 | 3 | The code in this repository is one approach to measuring patent claim breadth 4 | using semi-supervised learning. For more details and background, please see 5 | [this post on the Google Cloud Big Data 6 | Blog.](https://cloud.google.com/blog/big-data/2018/07/measuring-patent-claim-breadth-using-google-patents-public-datasets) 7 | 8 | ## Prerequisites 9 | 10 | This guide assumes access to a Linux-based operating system. Windows users may run into compatibility issues when running the commands below and should consider using a virtual machine. 11 | 12 | #### Set up a Google Cloud project and install the gcloud SDK 13 | 14 | Much of the code in this repository requires access to a Google Cloud Project. 15 | Please set up an account before proceeding. To install gcloud, follow the guide 16 | [here](https://cloud.google.com/sdk/docs/quickstarts). Once it's installed, 17 | set up your SDK to reference your account: 18 | 19 | `gcloud init` 20 | 21 | #### Create a bucket where you'll store relevant data for this project and set some environment variables. 22 | 23 | Each of the steps below relies on Google Cloud Storage for various tasks like 24 | writing logs or output files. You'll need your own bucket to run the steps below. 25 | If you'd like to run only a portion of the steps, you can use our public 26 | bucket gs://patent-claims-data, which includes all the relevant input and output 27 | files. 28 | 29 | ``` 30 | export GCP_PROJECT=`gcloud config get-value project` 31 | export BUCKET=gs://[YOUR BUCKET NAME] 32 | gsutil mb $BUCKET 33 | ``` 34 | 35 | #### Enable the relevant APIs in the GCP console. 36 | 37 | Dataflow and Cloud ML require several APIs to be enabled on your account. 38 | Before running the examples below, you'll need the following two APIs enabled: 39 | 40 | 1. https://console.cloud.google.com/apis/library/dataflow.googleapis.com 41 | 2. https://console.cloud.google.com/apis/library/ml.googleapis.com 42 | 43 | #### Create a service account, download your keys, and set a local environment variable. 44 | 45 | To do this, follow [this 46 | guide](https://cloud.google.com/docs/authentication/getting-started) to set 47 | the GOOGLE_APPLICATION_CREDENTIALS environment variable. 48 | 49 | `export GOOGLE_APPLICATION_CREDENTIALS="[PATH TO DOWNLOADED JSON FILE]"` 50 | 51 | #### Set up a virtual environment and install Python dependencies. 52 | 53 | You'll likely want to work inside a Python virtual environment. 54 | You can set one up with the following commands: 55 | 56 | ``` 57 | virtualenv myenv 58 | source myenv/bin/activate 59 | pip install -r requirements.txt 60 | ``` 61 | 62 | ## A few sample commands 63 | 64 | Below are a handful of sample commands that can be used as a reference on how to 65 | run the scripts in this repository. For more information, see the blog post mentioned 66 | above. Please note that all of the commands below will incur charges on your GCP 67 | account. Most of the commands can be run for less than a dollar at current 68 | prices, but hyperparameter tuning can easily become very expensive if you run 69 | many trials.
Consider setting [billing alerts and 70 | limits](https://cloud.google.com/billing/docs/how-to/budgets) before running any 71 | of the commands below. 72 | 73 | ### To run preprocessing pipeline and produce 1.4m training examples. 74 | 75 | ``` 76 | export OUTPUT_PATH="$BUCKET/training-data/" 77 | python preprocess.py \ 78 | --output_path=$OUTPUT_PATH \ 79 | --project=$GCP_PROJECT \ 80 | --runner=DataflowRunner \ 81 | --pipeline_mode=train \ 82 | --query_kep_pct=0.6 \ 83 | --cpc_code_list='D,E,F,G,H' 84 | ``` 85 | 86 | ### To run a local training job for a few steps to ensure your model trains. 87 | 88 | #### First, set up a vocab file for an embedding column in the model 89 | 90 | The model has an embedding column which is designed to embed CPC codes at the 4 91 | digit level which allows the model to learn differences in feature impact across 92 | technologies (i.e. a claim of the same length might be narrower in one subspace 93 | than in another.) 94 | 95 | To generate a vocab file, the simplest way is to run a query against the Google 96 | Patents Public Data on BigQuery and save the output to a text file which we put 97 | on GCP storage. To do this follow the commands below: 98 | 99 | ``` 100 | # Execute a query from the command line and pipe output to text file. 101 | bq --project=$GCP_PROJECT query --max_rows=100000 --format=csv "$(cat generate_embedding_vocab.sql)" > ./cpc_embedding_vocab.txt 102 | # Strip header and blank lines. 103 | sed -i '2 d' cpc_embedding_vocab.txt 104 | sed -i '/^\s*$/d' cpc_embedding_vocab.txt 105 | # Copy to GCS for use in training and remove local copy. 106 | gsutil cp ./cpc_embedding_vocab.txt $BUCKET 107 | rm ./cpc_embedding_vocab.txt 108 | ``` 109 | 110 | #### Launch the local training job. 111 | 112 | ``` 113 | export CPC_EMBEDDING_VOCAB_FILE="$BUCKET/cpc_embedding_vocab.txt" 114 | export GCS_TRAIN_FILES="$BUCKET/training-data/claim-data-train*.tfrecord.gz" 115 | export GCS_EVAL_FILES="$BUCKET/training-data/claim-data-eval*.tfrecord.gz" 116 | gcloud ml-engine local train \ 117 | --package-path trainer \ 118 | --module-name trainer.task \ 119 | --job-dir './test' \ 120 | -- --train-files $GCS_TRAIN_FILES \ 121 | --eval-files $GCS_EVAL_FILES \ 122 | --cpc-embedding-vocab-file $CPC_EMBEDDING_VOCAB_FILE \ 123 | --train-steps 100 \ 124 | --train-batch-size=10 \ 125 | --eval-batch-size=10 126 | ``` 127 | 128 | ### To run Hyperparameter Tuning and select the best model parameters (CAN BE EXPENSIVE). 129 | 130 | Note - running this command can incur significant charges due to the number of 131 | trials running. Make sure you have billing alerts and budgets set up to avoid 132 | unexpected charges. 133 | 134 | ``` 135 | export JOB_NAME=tuning_$(date +"%s") 136 | export GCS_JOB_DIR="$BUCKET/hptuning/$JOB_NAME" 137 | 138 | gcloud ml-engine jobs submit training $JOB_NAME \ 139 | --config hptuning_config.yaml \ 140 | --runtime-version 1.6 \ 141 | --job-dir $GCS_JOB_DIR \ 142 | --module-name trainer.task \ 143 | --package-path trainer/ \ 144 | --region us-central1 \ 145 | -- --train-steps 50000 \ 146 | --train-files $GCS_TRAIN_FILES \ 147 | --eval-files $GCS_EVAL_FILES \ 148 | --cpc-embedding-vocab-file $CPC_EMBEDDING_VOCAB_FILE 149 | ``` 150 | 151 | ### To run a cloud training job for 30000 steps with the default Hparams. 
152 | 153 | ``` 154 | export JOB_NAME=patent_claims_$(date +"%s") 155 | export GCS_JOB_DIR="$BUCKET/models/$JOB_NAME" 156 | 157 | gcloud ml-engine jobs submit training $JOB_NAME \ 158 | --scale-tier STANDARD_1 \ 159 | --runtime-version 1.6 \ 160 | --job-dir $GCS_JOB_DIR \ 161 | --module-name trainer.task \ 162 | --package-path trainer/ \ 163 | --region us-central1 \ 164 | -- --train-steps 30000 \ 165 | --train-files $GCS_TRAIN_FILES \ 166 | --eval-files $GCS_EVAL_FILES \ 167 | --cpc-embedding-vocab-file $CPC_EMBEDDING_VOCAB_FILE 168 | ``` 169 | 170 | While your training job is running, logs will be written to GCS and you can 171 | monitor progress with tensorboard using the command below. Note, because you're 172 | fetching logs from GCS - there is some latency between starting tensorboard and 173 | seeing results. 174 | 175 | `tensorboard --logdir $GCS_JOB_DIR` 176 | 177 | ### To run preprocessing pipeline and produce input data to run inference on all pubs after 1995 in a D, E, F, G, or H class code: 178 | 179 | ``` 180 | export OUTPUT_PATH="$BUCKET/inference-data" 181 | python preprocess.py \ 182 | --output_path=$OUTPUT_PATH \ 183 | --project=$GCP_PROJECT \ 184 | --runner=DataflowRunner \ 185 | --pipeline_mode=inference \ 186 | --cpc_code_list='D,E,F,G,H' 187 | ``` 188 | 189 | ### Set up Your Model on Cloud ML 190 | 191 | In a previous step, we trained a model and saved the final model to GCS. In the 192 | next step, we'll use this model for batch inference by leveraging GCP's Cloud 193 | ML. To use this service, we need to configure a model for online inference. To 194 | read more about this, see [this 195 | doc](https://cloud.google.com/ml-engine/docs/tensorflow/prediction-overview). 196 | 197 | If you've been following along so far, the following commands will grab the 198 | trained model files from GCP and set up a model version on cloud ML: 199 | 200 | ``` 201 | export MODEL_NAME=patent_claims 202 | export VERSION='v1' 203 | export SAVED_MODEL=`gsutil ls -d "$GCS_JOB_DIR/export/model/[0-9]*/"` 204 | gcloud ml-engine models create $MODEL_NAME 205 | gcloud ml-engine versions create $VERSION --model $MODEL_NAME --origin $SAVED_MODEL --runtime-version=1.4 206 | export MODEL_VERSION_STR="$MODEL_NAME/versions/$VERSION" 207 | ``` 208 | 209 | ### Run batch inference against all US Pubs in a D, E, F, G, or H class code. 210 | 211 | Now that we have a model ready for predictions, we can run batch inference. Note 212 | that the number of workers will affect how many requests are made against your 213 | model's API. 214 | 215 | ``` 216 | export OUTPUT_PATH="$BUCKET/scored" 217 | export INPUT_FILE_PATTERN="$BUCKET/inference-data/*.tfrecord.gz" 218 | python ./batch_inference.py \ 219 | --model_version_str=$MODEL_VERSION_STR \ 220 | --input_file_pattern=$INPUT_FILE_PATTERN \ 221 | --output_path=$OUTPUT_PATH \ 222 | --num_workers=5 \ 223 | --project=$GCP_PROJECT \ 224 | --write_to_bigquery=True \ 225 | --output_dataset='sandbox' \ 226 | --output_table='claim_scores' \ 227 | --runner=DataflowRunner 228 | ``` 229 | 230 | ## Useful Links 231 | 232 | The following links are helpful resources for understanding concepts covered in 233 | this repository. 
234 | 235 | - [Apache Beam programming 236 | guide](https://beam.apache.org/documentation/programming-guide/) 237 | - [Detailed overview of using estimators to train a model locally and on 238 | GCP.](https://github.com/amygdala/code-snippets/blob/master/ml/census_train_and_eval/using_tf.estimator.train_and_evaluate.ipynb) 239 | - [Overview of hyperparameter 240 | tuning](https://cloud.google.com/ml-engine/docs/tensorflow/hyperparameter-tuning-overview) 241 | -------------------------------------------------------------------------------- /models/claim_breadth/batch_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 11 | """A batch inference script to score a set of Patent publications.""" 12 | import argparse 13 | import datetime 14 | import logging 15 | import os 16 | import sys 17 | import apache_beam as beam 18 | from apache_beam.metrics import Metrics 19 | from apache_beam.options.pipeline_options import PipelineOptions 20 | from googleapiclient.discovery import build 21 | import tensorflow as tf 22 | 23 | NOW = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') 24 | FEATURE_NAMES = [ 25 | 'word_cnt', 'word_cnt_unique', 'char_cnt', 'char_cnt_unique', 26 | 'limiting_words_cnt', 'digits_or_decimal_cnt', 'atleastoneofand_cnt', 27 | 'atleastoneofor_cnt', 'counting_cnt', 'excluding_words_cnt', 28 | 'groupconsistingof_cnt', 'element_cnt', 'adding_words_cnt', 29 | ] 30 | 31 | 32 | def default_args(argv): 33 | """Provides default values for Workflow flags.""" 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument( 36 | '--model_version_str', 37 | required=True, 38 | type=str, 39 | help='Path to ML Engine model like `MODEL_NAME/versions/VERSION`') 40 | parser.add_argument( 41 | '--input_file_pattern', 42 | required=True, 43 | type=str, 44 | help='Glob style file pattern to use for selecting input files.') 45 | parser.add_argument( 46 | '--output_path', 47 | required=True, 48 | help='Output directory to write results to if write_to_bigquery is false.' 49 | 'for DataflowRunner use a GCS bucket.') 50 | parser.add_argument( 51 | '--output_prefix', 52 | default='us_patent_claim_scores', 53 | help='Prefix to use on sharded output files.') 54 | parser.add_argument( 55 | '--write_to_bigquery', 56 | default=False, 57 | help='If `True` output will be written directly to a bigquery table as' 58 | 'specified by args.output_dataset args.output_table.' 59 | ) 60 | parser.add_argument( 61 | '--output_dataset', 62 | help='Bigquery Dataset where output should be written if' 63 | 'write_to_bigquery is true. Will be ignored otherwise.' 64 | ) 65 | parser.add_argument( 66 | '--output_table', 67 | help='Bigquery Table name where output should be written if' 68 | 'write_to_bigquery is true. Will be ignored otherwise.' 
69 | ) 70 | parser.add_argument( 71 | '--output_shards', 72 | default=10, 73 | help='Number of shards to write in output_path.') 74 | parser.add_argument( 75 | '--job_name', 76 | type=str, 77 | default='patent-claims-inference' + NOW, 78 | help='A unique job identifier.') 79 | parser.add_argument( 80 | '--num_workers', 81 | default=5, 82 | type=int, 83 | help='The max number of workers to use.') 84 | parser.add_argument( 85 | '--autoscaling_algorithm', 86 | default='NONE', 87 | help='Options are `NONE` or `THROUGHPUT_BASED`. Use None to prevent GCP' 88 | 'from scaling down to 1 worker due to API throughput.' 89 | ) 90 | parser.add_argument( 91 | '--runner', 92 | default='DirectRunner', 93 | choices=['DataflowRunner', 'DirectRunner'], 94 | help='Option to run locally or on GCP, for other options see Beam docs.') 95 | parser.add_argument( 96 | '--project', 97 | type=str, 98 | help='The cloud project name to be used for running this pipeline with' 99 | 'the DataflowRunner option') 100 | 101 | parsed_args, _ = parser.parse_known_args(argv) 102 | 103 | if parsed_args.runner == 'DataflowRunner': 104 | if not parsed_args.project: 105 | msg = 'If running with DataflowRunner please provide a GCP project.' 106 | raise argparse.ArgumentTypeError(msg) 107 | 108 | # Check the output flags when writing to BigQuery. 109 | if parsed_args.write_to_bigquery: 110 | if not parsed_args.output_dataset: 111 | msg = ('When writing to Bigquery, you must specify --output_dataset and ' 112 | '--output_table flags.') 113 | raise argparse.ArgumentTypeError(msg) 114 | 115 | # Setup some additional flags required with DataflowRunner. 116 | # These can be overridden via the command line. 117 | default_cloud_values = { 118 | 'temp_location': os.path.join(parsed_args.output_path, 'tmp'), 119 | 'staging_location': os.path.join(parsed_args.output_path, 'stg'), 120 | 'save_main_session': True, 121 | } 122 | 123 | for kk, vv in default_cloud_values.iteritems(): 124 | if kk not in parsed_args or not vars(parsed_args)[kk]: 125 | vars(parsed_args)[kk] = vv 126 | 127 | return parsed_args 128 | 129 | 130 | def get_tf_feature(proto, feature_name, feature_type='float_list'): 131 | """Helper method to retrieve named features from a TF example proto.""" 132 | return getattr(proto.features.feature[feature_name], feature_type).value[0] 133 | 134 | 135 | class RunInference(beam.DoFn): 136 | """Loads saved model and scores inputs.""" 137 | 138 | def __init__(self, model_endpoint): 139 | self.success_cnt = Metrics.counter('main', 'inference_success') 140 | self.model_endpoint = model_endpoint 141 | self.ml_service = build('ml', 'v1') 142 | 143 | def process(self, element): 144 | """Scores the model using the TF Example input.""" 145 | ex = tf.train.Example.FromString(element) 146 | instance = {ftr: get_tf_feature(ex, ftr) for ftr in FEATURE_NAMES} 147 | instance['cpc4'] = get_tf_feature(ex, 'cpc4', 'bytes_list') 148 | 149 | response = self.ml_service.projects().predict( 150 | name=self.model_endpoint, 151 | body={'instances': [instance]} 152 | ).execute() 153 | 154 | broad_score = response['predictions'][0]['probabilities'][1] 155 | 156 | # Pull the publication number from the TF Example proto. 
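# (This is a bytes feature; values look like b'US-1234567-B2'.)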
157 | pub_number = ex.features.feature['publication_number'].bytes_list.value[0] 158 | self.success_cnt.inc() 159 | yield {'publication_number': pub_number, 'broad_score': float(broad_score)} 160 | 161 | 162 | def format_output(element): 163 | """Converts dictionary element into a CSV style output.""" 164 | pub_number = element.get('publication_number') 165 | broad_score = element.get('broad_score') 166 | return '{0},{1:05f}'.format(pub_number, broad_score) 167 | 168 | 169 | def main(argv, await_completion=False): 170 | """Runs the batch inference pipeline.""" 171 | opt = default_args(argv) 172 | logging.info('Starting pipeline with args: %s', vars(opt)) 173 | pipeline_options = PipelineOptions().from_dictionary(vars(opt)) 174 | p = beam.Pipeline(options=pipeline_options) 175 | output_base = os.path.join(opt.output_path, opt.output_prefix) 176 | model_endpoint = 'projects/{}/models/{}'.format( 177 | opt.project, opt.model_version_str) 178 | data = (p 179 | | 'ReadTFRecords' >> beam.io.ReadFromTFRecord(opt.input_file_pattern) 180 | | 'RunInference' >> beam.ParDo(RunInference(model_endpoint)) 181 | ) 182 | 183 | if opt.write_to_bigquery: 184 | _ = data | 'WriteToBigquery' >> beam.io.gcp.bigquery.WriteToBigQuery( 185 | table=opt.output_table, 186 | dataset=opt.output_dataset, 187 | project=opt.project, 188 | write_disposition=beam.io.gcp.bigquery.BigQueryDisposition.WRITE_APPEND, 189 | schema='publication_number:STRING,broad_score:FLOAT' 190 | ) 191 | else: 192 | # Format a CSV style output and write to text. 193 | formatted = data | 'FormatTextOutput' >> beam.Map(format_output) 194 | _ = formatted | 'WriteToText' >> beam.io.WriteToText( 195 | file_path_prefix=output_base, 196 | num_shards=int(opt.output_shards) 197 | ) 198 | 199 | result = p.run() 200 | print('Pipeline running. visit https://console.cloud.google.com/dataflow to ' 201 | 'monitor progress.') 202 | if await_completion: 203 | result.wait_until_finish() 204 | return result 205 | 206 | 207 | if __name__ == '__main__': 208 | main(sys.argv[1:]) 209 | -------------------------------------------------------------------------------- /models/claim_breadth/batch_inference_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 11 | """End-to-end test for the patent claim model batch inference script.""" 12 | import logging 13 | import os 14 | import shutil 15 | import time 16 | import unittest 17 | from apache_beam.metrics.metric import MetricsFilter 18 | from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher 19 | from apache_beam.testing.test_pipeline import TestPipeline 20 | import batch_inference 21 | from hamcrest.core.core.allof import all_of 22 | from nose.plugins.attrib import attr 23 | 24 | # Assumes your project and model versions are set as ENV variables. See README. 
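# For example (placeholder values):
#   export GCP_PROJECT=my-gcp-project
#   export MODEL_VERSION_STR=patent_claims/versions/v1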
25 | PROJECT = os.environ['GCP_PROJECT'] 26 | MODEL_VERSION_STR = os.environ['MODEL_VERSION_STR'] 27 | 28 | 29 | def get_pipeline_metric(pipeline_results, metric_name, index=0, 30 | result_type='counters'): 31 | """Attempts to return a metrics from an Apache Beam PipelineResults.""" 32 | metrics_filter = MetricsFilter().with_name(metric_name) 33 | query_result = pipeline_results.metrics().query(metrics_filter) 34 | try: 35 | return query_result[result_type][index].committed 36 | except IndexError: 37 | logging.info( 38 | 'No key in metrics for %s at index %s, returning 0', metric_name, index) 39 | return 0 40 | 41 | 42 | class BatchInferenceE2E(unittest.TestCase): 43 | _multiprocess_can_split_ = True 44 | OUTPUT_DIR = os.getcwd() 45 | TEST_DATA_GLOB = os.path.join(OUTPUT_DIR, 'testdata', '*.tfrecord.gz') 46 | TOTAL_RECORDS_IN_TEST_DATA = 17 47 | 48 | @attr('IT') 49 | def test_text_file_output(self): 50 | test_pipeline = TestPipeline() 51 | # Checks that pipeline reaches state "Done" 52 | pipeline_verifiers = [PipelineStateMatcher()] 53 | 54 | # Set extra options to the pipeline for test purpose 55 | test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time()))) 56 | extra_opts = { 57 | 'project': PROJECT, 58 | 'model_version_str': MODEL_VERSION_STR, 59 | 'input_file_pattern': self.TEST_DATA_GLOB, 60 | 'output_path': test_dir, 61 | 'runner': 'DirectRunner', 62 | 'output_shards': 1, 63 | 'on_success_matcher': all_of(*pipeline_verifiers), 64 | } 65 | 66 | # Add cleanup for testdir 67 | self.addCleanup(shutil.rmtree, test_dir) 68 | 69 | result = batch_inference.main( 70 | test_pipeline.get_full_options_as_args(**extra_opts), 71 | await_completion=True 72 | ) 73 | 74 | records_scored = get_pipeline_metric(result, 'inference_success') 75 | self.assertEqual(records_scored, self.TOTAL_RECORDS_IN_TEST_DATA) 76 | 77 | 78 | if __name__ == '__main__': 79 | logging.getLogger().setLevel(logging.DEBUG) 80 | logging.info('Running with MODEL: %s', MODEL_VERSION_STR) 81 | unittest.main() 82 | -------------------------------------------------------------------------------- /models/claim_breadth/generate_embedding_vocab.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | SUBSTR(cpc.code, 1, 4) cpc4 # Trim CPC Code to first 4 digits. 
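# e.g. SUBSTR('G06F17/30', 1, 4) = 'G06F'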
4 | FROM 5 | `patents-public-data.patents.publications`, 6 | UNNEST(cpc) AS cpc 7 | WHERE 8 | country_code = 'US' 9 | AND SUBSTR(cpc.code, 1, 1) IN ('D', 'E', 'G', 'H') 10 | AND FLOOR(priority_date / 10000) > 1995 11 | GROUP BY 12 | 1 13 | HAVING 14 | COUNT(publication_number) > 3000 15 | -------------------------------------------------------------------------------- /models/claim_breadth/hptuning_config.yaml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | scaleTier: BASIC_GPU 3 | hyperparameters: 4 | enableTrialEarlyStopping: True 5 | hyperparameterMetricTag: auc 6 | maxTrials: 60 7 | maxParallelTrials: 2 8 | params: 9 | - parameterName: dropout 10 | type: DOUBLE 11 | minValue: 0.2 12 | maxValue: 0.6 13 | scaleType: UNIT_LINEAR_SCALE 14 | - parameterName: learning-rate 15 | type: DOUBLE 16 | minValue: 0.0001 17 | maxValue: 0.01 18 | scaleType: UNIT_REVERSE_LOG_SCALE 19 | - parameterName: first-layer-size 20 | type: INTEGER 21 | minValue: 256 22 | maxValue: 8192 23 | scaleType: UNIT_LINEAR_SCALE 24 | - parameterName: num-layers 25 | type: INTEGER 26 | minValue: 1 27 | maxValue: 10 28 | scaleType: UNIT_LINEAR_SCALE 29 | - parameterName: scale-factor 30 | type: DOUBLE 31 | minValue: 0.3 32 | maxValue: 0.99 33 | scaleType: UNIT_LINEAR_SCALE 34 | - parameterName: cpc-embedding-dim 35 | type: INTEGER 36 | minValue: 5 37 | maxValue: 100 38 | scaleType: UNIT_LINEAR_SCALE 39 | -------------------------------------------------------------------------------- /models/claim_breadth/preprocess_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 11 | 12 | """End-to-end test for the patent claim breadth model preprocessing code.""" 13 | import logging 14 | import os 15 | import shutil 16 | import time 17 | import unittest 18 | from apache_beam.metrics.metric import MetricsFilter 19 | from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher 20 | from apache_beam.testing.test_pipeline import TestPipeline 21 | from hamcrest.core.core.allof import all_of 22 | from nose.plugins.attrib import attr 23 | import preprocess 24 | import tensorflow as tf 25 | 26 | # Assumes you've set an environmental variable for your GCP project. See README. 
27 | PROJECT = os.environ['GCP_PROJECT'] 28 | 29 | 30 | def read_example_proto(test_dir): 31 | filenames = tf.gfile.Glob(os.path.join(test_dir, '*.tfrecord.gz')) 32 | tf_opt = tf.python_io.TFRecordOptions( 33 | tf.python_io.TFRecordCompressionType.GZIP) 34 | record = next(tf.python_io.tf_record_iterator(filenames[0], options=tf_opt)) 35 | example = tf.train.Example() 36 | example.ParseFromString(record) 37 | return example 38 | 39 | 40 | def get_pipeline_metric(results, metric_name, index=0, result_type='counters'): 41 | metric_filter = MetricsFilter().with_name(metric_name) 42 | query_result = results.metrics().query(metric_filter) 43 | try: 44 | return query_result[result_type][index].committed 45 | except IndexError: 46 | logging.info( 47 | 'No key in metrics for %s at index %s, returning 0', metric_name, index) 48 | return 0 49 | 50 | 51 | def get_tf_feature(proto, feature_name, feature_type='float_list'): 52 | """Helper method to retrieve named features from a TF example proto.""" 53 | return getattr(proto.features.feature[feature_name], feature_type).value[0] 54 | 55 | 56 | def get_test_query(max_records): 57 | return ''' 58 | #standardSQL 59 | with fake_applications as ( 60 | SELECT 61 | 'US-1234567-A1' as publication_number, 62 | substr(claims.text, 0, 2000) as fullclaim, 63 | 2000 as priority_yr, 64 | 'C08F' as cpc4, 65 | 2003 as median_priority_yr 66 | FROM `patents-public-data.patents.publications` p 67 | ,UNNEST(claims_localized) claims 68 | WHERE claims.language = 'en' 69 | AND country_code = 'US' 70 | AND claims.text is not null 71 | AND FLOOR(priority_date / 10000) > 2005 72 | limit {half_max} 73 | ) 74 | 75 | , fake_issued as ( 76 | SELECT 77 | 'US-1234567-B2' as publication_number, 78 | substr(claims.text, 0, 2000) as fullclaim, 79 | 2012 as priority_yr, 80 | 'C08F' as cpc4, 81 | 2003 as median_priority_yr 82 | FROM `patents-public-data.patents.publications` p 83 | ,UNNEST(claims_localized) claims 84 | WHERE claims.language = 'en' 85 | AND country_code = 'US' 86 | AND claims.text is not null 87 | AND FLOOR(priority_date / 10000) > 2005 88 | limit {half_max} 89 | ) 90 | 91 | select * from fake_applications 92 | union all 93 | select * from fake_issued 94 | '''.format(half_max=(max_records // 2)) 95 | 96 | 97 | class PreProcessE2E(unittest.TestCase): 98 | # Enable nose tests running in parallel 99 | _multiprocess_can_split_ = True 100 | OUTPUT_DIR = os.getcwd() 101 | TOTAL_RECORDS = 500 102 | TEST_QUERY = get_test_query(TOTAL_RECORDS) 103 | 104 | @attr('IT') 105 | def test_train_mode(self): 106 | """Runs pipeline in train mode outputting train, test and eval filesets.""" 107 | test_pipeline = TestPipeline() 108 | # Set extra options to the pipeline for test purpose 109 | test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time()))) 110 | self.addCleanup(shutil.rmtree, test_dir) 111 | 112 | # Checks that pipeline reaches state "Done" 113 | pipeline_verifiers = [PipelineStateMatcher()] 114 | extra_opts = { 115 | 'project': PROJECT, 116 | 'output_path': test_dir, 117 | 'on_success_matcher': all_of(*pipeline_verifiers), 118 | 'runner': 'DirectRunner', 119 | } 120 | 121 | res = preprocess.main( 122 | test_pipeline.get_full_options_as_args(**extra_opts), 123 | query=self.TEST_QUERY, 124 | await_completion=True 125 | ) 126 | 127 | # Check counts coming out of GetFirstClaim step. 
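# The pipeline bumps the 'parse_firstclaim_success' counter once per record whose first claim is extracted, so in train mode it should equal the number of input rows.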
128 | parse_first_claim_cnt = get_pipeline_metric(res, 'parse_firstclaim_success') 129 | self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt) 130 | 131 | # Check counts coming out of AddFeatures step. 132 | add_features_cnt = get_pipeline_metric(res, 'create_features_success') 133 | self.assertEqual(self.TOTAL_RECORDS, add_features_cnt) 134 | 135 | # Check counts coming out of AddLabel step. 136 | broad_cnt = get_pipeline_metric(res, 'add_label_broad') 137 | narrow_cnt = get_pipeline_metric(res, 'add_label_narrow') 138 | self.assertEqual(self.TOTAL_RECORDS, broad_cnt + narrow_cnt) 139 | 140 | # Check if the number of records coming out of Train/Test = limit step. 141 | splits = ['train_cnt', 'eval_cnt', 'test_cnt'] 142 | train_test_split_cnt = sum( 143 | [get_pipeline_metric(res, m) for m in splits] 144 | ) 145 | self.assertEqual(self.TOTAL_RECORDS, train_test_split_cnt) 146 | 147 | # Check if number of protos created matched output of train/test split. 148 | create_proto_success = sum( 149 | [get_pipeline_metric(res, 'create_proto_success', index=i) 150 | for i in range(3)] 151 | ) 152 | self.assertEqual(self.TOTAL_RECORDS, create_proto_success) 153 | 154 | # Open a tf Example and check fields. 155 | example = read_example_proto(test_dir) 156 | for feature_name in preprocess.FEATURE_NAMES: 157 | self.assertGreaterEqual(get_tf_feature(example, feature_name), 0) 158 | # Make sure label feature is present. 159 | labels = ['broad', 'narrow'] 160 | self.assertIn(get_tf_feature(example, 'label', 'bytes_list'), labels) 161 | 162 | @attr('IT') 163 | def test_inference_mode(self): 164 | """Runs a pipeline in inference mode which should output one fileset.""" 165 | test_pipeline = TestPipeline() 166 | # Set extra options to the pipeline for test purpose 167 | test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time()))) 168 | self.addCleanup(shutil.rmtree, test_dir) 169 | 170 | # Checks that pipeline reaches state "Done" 171 | pipeline_verifiers = [PipelineStateMatcher()] 172 | extra_opts = { 173 | 'project': PROJECT, 174 | 'output_path': test_dir, 175 | 'on_success_matcher': all_of(*pipeline_verifiers), 176 | 'runner': 'DirectRunner', 177 | 'pipeline_mode': 'inference', 178 | } 179 | 180 | res = preprocess.main( 181 | test_pipeline.get_full_options_as_args(**extra_opts), 182 | query=self.TEST_QUERY, 183 | await_completion=True 184 | ) 185 | 186 | # Check counts coming out of GetFirstClaim step. 187 | parse_first_claim_cnt = get_pipeline_metric(res, 'parse_firstclaim_success') 188 | self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt) 189 | 190 | # Ensure a proto is created for all input records 191 | create_proto_success = get_pipeline_metric(res, 'create_proto_success') 192 | self.assertEqual(self.TOTAL_RECORDS, create_proto_success) 193 | 194 | # Open a tf Example and check fields. 195 | example = read_example_proto(test_dir) 196 | for feature_name in preprocess.FEATURE_NAMES: 197 | self.assertGreaterEqual(get_tf_feature(example, feature_name), 0) 198 | 199 | # Make sure label feature is not present since we are in inference. 
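# get_tf_feature reads value[0] of the named feature, so a feature that was never written surfaces as an IndexError rather than an empty value.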
200 | with self.assertRaises(IndexError): 201 | get_tf_feature(example, 'label', 'bytes_list') 202 | 203 | 204 | if __name__ == '__main__': 205 | logging.getLogger().setLevel(logging.DEBUG) 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /models/claim_breadth/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.1.13 2 | apache-beam==2.4.0 3 | astor==0.6.2 4 | avro==1.8.2 5 | backports.weakref==1.0.post1 6 | bleach==1.5.0 7 | cachetools==2.0.1 8 | certifi==2018.1.18 9 | chardet==3.0.4 10 | crcmod==1.7 11 | dill==0.2.6 12 | docopt==0.6.2 13 | enum34==1.1.6 14 | fasteners==0.14.1 15 | funcsigs==1.0.2 16 | future==0.16.0 17 | futures==3.2.0 18 | gapic-google-cloud-pubsub-v1==0.15.4 19 | gast==0.2.0 20 | google-apitools==0.5.20 21 | google-api-python-client==1.6.7 22 | google-auth==1.4.1 23 | google-auth-httplib2==0.0.3 24 | google-cloud-bigquery==0.25.0 25 | google-cloud-core==0.25.0 26 | google-cloud-pubsub==0.26.0 27 | google-gax==0.15.16 28 | googleapis-common-protos==1.5.3 29 | googledatastore==7.0.1 30 | grpc-google-iam-v1==0.11.4 31 | grpcio==1.10.1 32 | hdfs==2.1.0 33 | html5lib==0.9999999 34 | httplib2==0.9.2 35 | idna==2.6 36 | Markdown==2.6.11 37 | mock==2.0.0 38 | monotonic==1.4 39 | nose==1.3.7 40 | numpy==1.14.2 41 | oauth2client==4.1.2 42 | pbr==4.0.2 43 | ply==3.8 44 | proto-google-cloud-datastore-v1==0.90.4 45 | proto-google-cloud-pubsub-v1==0.15.4 46 | protobuf==3.5.2.post1 47 | pyasn1==0.4.2 48 | pyasn1-modules==0.2.1 49 | PyHamcrest==1.9.0 50 | PyVCF==0.6.8 51 | PyYAML==3.12 52 | requests==2.18.4 53 | rsa==3.4.2 54 | six==1.11.0 55 | tensorboard==1.7.0 56 | tensorflow==1.7.0 57 | termcolor==1.1.0 58 | typing==3.6.4 59 | uritemplate==3.0.0 60 | urllib3==1.22 61 | Werkzeug==0.14.1 62 | -------------------------------------------------------------------------------- /models/claim_breadth/testdata/example-output-from-preprocess-step.tfrecord.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/claim_breadth/testdata/example-output-from-preprocess-step.tfrecord.gz -------------------------------------------------------------------------------- /models/claim_breadth/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/claim_breadth/trainer/__init__.py -------------------------------------------------------------------------------- /models/claim_breadth/trainer/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 
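# The helpers below are wired together by trainer/task.py: input_fn feeds the TrainSpec/EvalSpec pair and build_serving_fn is exported for online prediction (see the FinalExporter in task.py).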
11 | """Model definition for the patent claim breadth model.""" 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | import tensorflow as tf 16 | 17 | # Count features created in ../preprocess.py 18 | FEATURE_NAMES = [ 19 | 'word_cnt', 'word_cnt_unique', 'char_cnt', 'char_cnt_unique', 20 | 'limiting_words_cnt', 'digits_or_decimal_cnt', 'atleastoneofand_cnt', 21 | 'atleastoneofor_cnt', 'counting_cnt', 'excluding_words_cnt', 22 | 'groupconsistingof_cnt', 'element_cnt', 'adding_words_cnt', 23 | ] 24 | 25 | 26 | def build_input_columns(embedding_dim, embedding_vocab_file): 27 | """Builds input columns for use with Tensorflow Estimator.""" 28 | categorical = tf.feature_column.categorical_column_with_vocabulary_file( 29 | key='cpc4', 30 | vocabulary_file=embedding_vocab_file, 31 | num_oov_buckets=1, 32 | ) 33 | cpc_embedding = tf.feature_column.embedding_column( 34 | categorical_column=categorical, 35 | dimension=embedding_dim 36 | ) 37 | numeric_columns = [tf.feature_column.numeric_column(k) for k in FEATURE_NAMES] 38 | return [cpc_embedding] + numeric_columns 39 | 40 | 41 | def build_estimator(config, hidden_units=None, learning_rate=0.001, dropout=0.1, 42 | embedding_vocab_file=None, embedding_dim=25): 43 | """Builds an estimator for predicting patent claim complex.""" 44 | input_columns = build_input_columns(embedding_dim, embedding_vocab_file) 45 | return tf.estimator.DNNClassifier( 46 | config=config, 47 | feature_columns=input_columns, 48 | hidden_units=hidden_units or [512, 256, 128], 49 | optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=learning_rate), 50 | dropout=dropout 51 | ) 52 | 53 | 54 | def build_serving_fn(): 55 | """Builds serving function based on Hparams.""" 56 | def _json_serving_input_fn(): 57 | inputs = {} 58 | for feat in FEATURE_NAMES: 59 | inputs[feat] = tf.placeholder(shape=[None], dtype=tf.float32) 60 | inputs['cpc4'] = tf.placeholder(shape=[None], dtype=tf.string) 61 | return tf.estimator.export.ServingInputReceiver(inputs, inputs) 62 | 63 | return _json_serving_input_fn 64 | 65 | 66 | def input_fn(filespec, batch_size, num_epochs=None, shuffle=True): 67 | """Builds a TensorFlow input function for use with our model.""" 68 | def _parse_example(example): 69 | """Parses a TF example protobuffer.""" 70 | feature_spec = { 71 | 'label': tf.FixedLenFeature([], tf.string), 72 | 'cpc4': tf.FixedLenFeature([], tf.string), 73 | 'publication_number': tf.FixedLenFeature([], tf.string), 74 | } 75 | for f in FEATURE_NAMES: 76 | feature_spec[f] = tf.FixedLenFeature([], tf.float32) 77 | features = tf.parse_single_example(example, feature_spec) 78 | labels = tf.to_int32(tf.equal(features.pop('label'), 'broad')) 79 | 80 | return features, labels 81 | 82 | filenames = tf.gfile.Glob(filespec) 83 | dataset = tf.data.TFRecordDataset(filenames, compression_type='GZIP') 84 | dataset = dataset.map(_parse_example) 85 | dataset = dataset.prefetch(batch_size * 5) 86 | dataset = dataset.batch(batch_size).repeat(num_epochs) 87 | if shuffle: 88 | dataset = dataset.shuffle(batch_size * 5) 89 | 90 | iterator = dataset.make_one_shot_iterator() 91 | batch_features, batch_labels = iterator.get_next() 92 | return batch_features, batch_labels 93 | -------------------------------------------------------------------------------- /models/claim_breadth/trainer/task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 
Licensed under the Apache 2 | # License, Version 2.0 (the "License"); you may not use this file except in 3 | # compliance with the License. You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 8 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 9 | # License for the specific language governing permissions and limitations under 10 | # the License. 11 | """Experiment definition for the patent claim breadth model.""" 12 | import argparse 13 | import tensorflow as tf 14 | from tensorflow.contrib.training.python.training import hparam 15 | import trainer.model as model 16 | 17 | 18 | def parse_args(): 19 | """Parses command line arguments.""" 20 | parser = argparse.ArgumentParser() 21 | # Input Arguments 22 | parser.add_argument( 23 | '--train-files', 24 | help='GCS or local paths to training data', 25 | nargs='+', 26 | required=True 27 | ) 28 | parser.add_argument( 29 | '--eval-files', 30 | help='GCS or local paths to evaluation data', 31 | nargs='+', 32 | required=True 33 | ) 34 | parser.add_argument( 35 | '--job-dir', 36 | help='GCS location to write checkpoints and export models', 37 | required=True, 38 | ) 39 | 40 | # Training arguments - hparams which can be tuned. 41 | parser.add_argument( 42 | '--dropout', 43 | help='Dropout between layers in DNN.', 44 | default=0.35, 45 | type=float 46 | ) 47 | parser.add_argument( 48 | '--learning-rate', 49 | help='Learning rate for the optimizer.', 50 | default=0.01, 51 | type=float 52 | ) 53 | parser.add_argument( 54 | '--first-layer-size', 55 | help='Number of nodes in the first layer of the DNN', 56 | default=7500, 57 | type=int 58 | ) 59 | parser.add_argument( 60 | '--num-layers', 61 | help='Number of layers in the DNN', 62 | default=1, 63 | type=int 64 | ) 65 | parser.add_argument( 66 | '--scale-factor', 67 | help='How quickly should the size of the layers in the DNN decay', 68 | default=0.8, 69 | type=float 70 | ) 71 | parser.add_argument( 72 | '--cpc-embedding-vocab-file', 73 | help='GCS path to a text file with one CPC4 per line. Any CPC4 codes not ' 74 | 'included will be mapped to a single UNK bucket.
See README.', 75 | required=True, 76 | type=str 77 | ) 78 | parser.add_argument( 79 | '--cpc-embedding-dim', 80 | help='Size of the learned embedding column to represent CPC codes.', 81 | default=85, 82 | type=int 83 | ) 84 | 85 | # Experiment arguments 86 | parser.add_argument( 87 | '--train-steps', 88 | help='Steps to run the training job before exiting.', 89 | type=int, 90 | default=30000 91 | ) 92 | parser.add_argument( 93 | '--train-batch-size', 94 | help='Batch size for training steps', 95 | type=int, 96 | default=512 97 | ) 98 | parser.add_argument( 99 | '--eval-batch-size', 100 | help='Batch size for evaluation steps', 101 | type=int, 102 | default=512 103 | ) 104 | parser.add_argument( 105 | '--eval-secs', 106 | help='Time between evaluations.', 107 | type=int, 108 | default=120 109 | ) 110 | parser.add_argument( 111 | '--eval-steps', 112 | help='Number of steps to run evaluation for at each checkpoint', 113 | default=100, 114 | type=int 115 | ) 116 | parser.add_argument( 117 | '--verbosity', 118 | choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'], 119 | default='INFO', 120 | ) 121 | return parser.parse_args() 122 | 123 | 124 | def main(hparams): 125 | """Run training and evaluation using the high level API.""" 126 | 127 | trn_input = lambda: model.input_fn( 128 | hparams.train_files, 129 | batch_size=hparams.train_batch_size 130 | ) 131 | train_spec = tf.estimator.TrainSpec(trn_input, max_steps=hparams.train_steps) 132 | 133 | eval_input = lambda: model.input_fn( 134 | hparams.eval_files, 135 | batch_size=hparams.eval_batch_size, 136 | ) 137 | 138 | # Construct our JSON serving function for Online Predictions using GCP. 139 | exporter = tf.estimator.FinalExporter('model', model.build_serving_fn()) 140 | eval_spec = tf.estimator.EvalSpec( 141 | eval_input, 142 | throttle_secs=hparams.eval_secs, 143 | steps=hparams.eval_steps, 144 | exporters=[exporter], 145 | ) 146 | 147 | run_config = tf.estimator.RunConfig() 148 | run_config = run_config.replace(model_dir=hparams.job_dir) 149 | # Construct layer sizes with exponential decay 150 | hidden_units = [ 151 | max(2, int(hparams.first_layer_size * hparams.scale_factor**i)) 152 | for i in range(hparams.num_layers) 153 | ] 154 | estimator = model.build_estimator( 155 | config=run_config, 156 | hidden_units=hidden_units, 157 | learning_rate=hparams.learning_rate, 158 | dropout=hparams.dropout, 159 | embedding_vocab_file=hparams.cpc_embedding_vocab_file, 160 | embedding_dim=hparams.cpc_embedding_dim, 161 | ) 162 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) 163 | 164 | 165 | if __name__ == '__main__': 166 | args = parse_args() 167 | tf.logging.set_verbosity(args.verbosity) 168 | hyperparams = hparam.HParams(**args.__dict__) 169 | main(hyperparams) 170 | -------------------------------------------------------------------------------- /models/landscaping/AutomatedPatentLandscaping.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/landscaping/AutomatedPatentLandscaping.pdf -------------------------------------------------------------------------------- /models/landscaping/AutomatedPatentLandscaping_2018Update.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/landscaping/AutomatedPatentLandscaping_2018Update.pdf
-------------------------------------------------------------------------------- /models/landscaping/README.md: -------------------------------------------------------------------------------- 1 | # Purpose 2 | Patent landscaping is the process of finding patents related to a particular topic. It is important for companies, investors, governments, and academics seeking to gauge innovation and assess risk. However, there is no broadly recognized best approach to landscaping. Frequently, patent landscaping is a bespoke human-driven process that relies heavily on complex queries over bibliographic patent databases. In this paper (and repository), we present Automated Patent Landscaping, an approach that jointly leverages human domain expertise, heuristics based on patent metadata, and machine learning to generate high-quality patent landscapes with minimal effort. 3 | 4 | # Creating a Patent Landscape 5 | 6 | The figure 1 shows the high level flow to create a patent landscape. We'll walk through each of these in turn in the accompanying Jupyter Notebook. 7 | 8 | ![Fig 1. High Level Flow of Automated Patent Landscaping](figs/flow.png) 9 | 10 | ## Requirements 11 | Before we get started, you should install some requirements for running this notebook. We rely on TensorFlow, Keras, and Google's Cloud infrastructure such as BigQuery, where we pull public patent data, and that needs to be installed and authorized. You need a few basics before continuing: 12 | * Anaconda 13 | * Jupyter Notebooks 14 | * TensorFlow and Keras 15 | * Google Cloud SDK 16 | * BigQuery Python Client 17 | * A few Python utilities 18 | 19 | ### Platform Support 20 | 21 | Note that this has primarily been tested using Linux (Ubuntu) and Windows 10 22 | with both CPU-based TensorFlow and with GPUs. There's no reason this shouldn't 23 | work just fine with MacOS, but it's not been thoroughly tested. If you encounter 24 | issues with the instructions, please feel free to reach out to 25 | [Dave Feltenberger](https://github.com/seinberg) or send a pull request with a fix. 26 | 27 | ### Anaconda 28 | I strongly recommend using Anaconda for this - it helps manage environments for Python, and these instructions will assume you're using it. Download Anaconda from [https://www.anaconda.com/download/](https://www.anaconda.com/download/). Install the Python 3.6 version, *not* 2.7. 29 | 30 | Once Anaconda is installed, create an environment: 31 | ``` 32 | conda create -n patent-landscape python=3.5 33 | source activate patent-landscape (or just: activate patent-landscape if you're in Windows) 34 | ``` 35 | 36 | ### Jupyter Notebooks 37 | 38 | To run the code in this notebook, you'll also need to install Jupyter. The following installs Jupyter and some utilities that let you toggle between different conda environments while inside a notebook. 39 | 40 | ``` 41 | conda config --add channels conda-forge 42 | conda install jupyter ipython nb_conda=2.2.0 43 | ``` 44 | 45 | ### Installing pip 46 | 47 | Some packages we depend on use `pip` for package management. If you're in Windows or Linux, installing Anaconda should take care of this for you. If you're on a Mac, there's a chance this isn't installed and that you need to install it yourself. You can install it with this command: 48 | ``` 49 | conda install pip 50 | ``` 51 | 52 | ### TensorFlow and Keras 53 | 54 | TensorFlow will work 'out of the box' with just your CPU. 
Since we're going to be building a model using neural networks, however, I highly recommend using GPU acceleration - training will be at least an order of magnitude faster with a modern GPU. You'll need to follow the TensorFlow instructions found [here](https://www.tensorflow.org/install/) for your platform. There are several steps to getting your GPU working for Deep Learning, so pay careful attention to the instructions. Note that only Nvidia chipset-based GPUs will work with TensorFlow. 55 | 56 | To skip all the GPU acceleration and just get started, you can just run this command within your active conda environment: 57 | ``` 58 | pip install tensorflow 59 | ``` 60 | 61 | Keras is an excellent high level Deep Learning library that we'll use to build our models: 62 | ``` 63 | conda install keras 64 | ``` 65 | 66 | Also install tflearn, the high-level library on top of TensorFlow, if you'd like to experiment with another high-level library like Keras (though our example doesn't directly use tflearn): 67 | ``` 68 | pip install tflearn 69 | ``` 70 | 71 | ### Google Cloud SDK 72 | ***Download and install Google Cloud SDK.*** You can download and install using [these](https://cloud.google.com/sdk/docs) instructions. On Linux, you'll need to source the `.sh` file and source the relevant include files to make sure the binaries are in your path. You should also install the python utilities for your Anaconda project: 73 | 74 | ``` 75 | pip install google-cloud 76 | pip install google-cloud-storage 77 | ``` 78 | 79 | Once you have the `gcloud` client installed, you need to authorize it to access Google's Cloud on your behalf. ***Don't forget this, or you'll get difficult to debug errors while running the code!*** From your active conda environment, run this command and follow the prompts: 80 | ``` 81 | gcloud auth application-default login 82 | ``` 83 | 84 | Finally, you'll also need to install the Google API Python Client and BigQuery extension to Pandas: 85 | ``` 86 | pip install google-api-python-client pandas-gbq 87 | ``` 88 | 89 | ### Python Utilities 90 | 91 | ``` 92 | conda install numpy pandas h5py scipy scikit-learn matplotlib seaborn 93 | ``` 94 | 95 | ## Google Cloud Tools Client Authorization 96 | 97 | For this code to run properly, you need to authorize Google Cloud to run. This is important or you'll get weird errors that are hard to debug :) See above, the `gcloud` command. 98 | 99 | # Running the Landscaping Code 100 | 101 | There are two primary steps: 1) cloning the Github repo, and 2) starting 102 | Jupyter. 103 | 104 | ## 1) Clone the Landscaping Github Repo 105 | 106 | ``` 107 | git clone https://github.com/google/patents-public-data 108 | ``` 109 | 110 | 111 | ## 2) Start Jupyter 112 | 113 | Now that you have the Repo, from a command line, change into the root of the 114 | repository you just cloned. The Jupyter notebook we care about is in 115 | `models/landscaping/LandscapeNotebook.ipynb`. Finally, set an environment 116 | variable and start Jupyter: 117 | 118 | ``` 119 | export KERAS_BACKEND=tensorflow; jupyter notebook 120 | ``` 121 | 122 | *Note*: don't forget to change Conda environments to the patent-landscape one. 
123 | You can do this before starting `jupyter` with the following command: 124 | ``` 125 | source activate patent-landscape (or just: activate patent-landscape if you're in Windows) 126 | ``` 127 | 128 | or, once you're in the Jupyter environment, and assuming you've installed all 129 | the packages mentioned above, by choosing the Environment menu option and 130 | selecting the `patent-landscape` environment. 131 | -------------------------------------------------------------------------------- /models/landscaping/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /models/landscaping/figs/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/landscaping/figs/flow.png -------------------------------------------------------------------------------- /models/landscaping/figs/project-id.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/models/landscaping/figs/project-id.png -------------------------------------------------------------------------------- /models/landscaping/keras_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import keras.backend as K 16 | 17 | def precision(y_true, y_pred): 18 | """Precision metric. 19 | Only computes a batch-wise average of precision. 20 | Computes the precision, a metric for multi-label classification of 21 | how many selected items are relevant. 22 | """ 23 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 24 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 25 | precision = true_positives / (predicted_positives + K.epsilon()) 26 | return precision 27 | 28 | 29 | def recall(y_true, y_pred): 30 | """Recall metric. 31 | Only computes a batch-wise average of recall. 32 | Computes the recall, a metric for multi-label classification of 33 | how many relevant items are selected. 
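Predictions are clipped to [0, 1] and rounded before counting, so scores are effectively thresholded at 0.5.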
34 | """ 35 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 36 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 37 | recall = true_positives / (possible_positives + K.epsilon()) 38 | return recall 39 | 40 | def fbeta_score(y_true, y_pred, beta=1): 41 | """Computes the F score. 42 | The F score is the weighted harmonic mean of precision and recall. 43 | Here it is only computed as a batch-wise average, not globally. 44 | This is useful for multi-label classification, where input samples can be 45 | classified as sets of labels. By only using accuracy (precision) a model 46 | would achieve a perfect score by simply assigning every class to every 47 | input. In order to avoid this, a metric should penalize incorrect class 48 | assignments as well (recall). The F-beta score (ranged from 0.0 to 1.0) 49 | computes this, as a weighted mean of the proportion of correct class 50 | assignments vs. the proportion of incorrect class assignments. 51 | With beta = 1, this is equivalent to a F-measure. With beta < 1, assigning 52 | correct classes becomes more important, and with beta > 1 the metric is 53 | instead weighted towards penalizing incorrect class assignments. 54 | """ 55 | if beta < 0: 56 | raise ValueError('The lowest choosable beta is zero (only precision).') 57 | 58 | # If there are no true positives, fix the F score at 0 like sklearn. 59 | if K.sum(K.round(K.clip(y_true, 0, 1))) == 0: 60 | return 0 61 | 62 | p = precision(y_true, y_pred) 63 | r = recall(y_true, y_pred) 64 | bb = beta ** 2 65 | fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon()) 66 | return fbeta_score 67 | 68 | 69 | def fmeasure(y_true, y_pred): 70 | """Computes the f-measure, the harmonic mean of precision and recall. 71 | Here it is only computed as a batch-wise average, not globally. 72 | """ 73 | return fbeta_score(y_true, y_pred, beta=1) 74 | 75 | fscore = f1score = fmeasure 76 | -------------------------------------------------------------------------------- /models/landscaping/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from sklearn.metrics import classification_report 16 | from sklearn.metrics import confusion_matrix 17 | 18 | import keras 19 | from keras.models import Sequential, Model 20 | from keras.layers import Dense, Input, Embedding, BatchNormalization, ELU, Concatenate 21 | from keras.layers import LSTM, Conv1D, MaxPooling1D 22 | from keras.layers.merge import concatenate 23 | from keras.layers.core import Dropout 24 | from keras_metrics import precision, recall, f1score 25 | 26 | import matplotlib.pyplot as plt 27 | import os 28 | import pandas as pd 29 | import seaborn as sn 30 | 31 | class LandscapeModel: 32 | target_names = ['seed', 'antiseed'] 33 | tf_model = None 34 | td = None 35 | data_path = None 36 | seed_name = None 37 | 38 | def __init__(self, training_data, data_path, seed_name): 39 | self.tf_model = None 40 | self.td = training_data 41 | self.data_path = data_path 42 | self.seed_name = seed_name 43 | 44 | def wire_model_functional(self, lstm_size, dropout_pct, sequence_len): 45 | print('Building Functional model.') 46 | 47 | refs_input = Input(shape=(self.td.trainRefsOneHotX.shape[1],), name='refs_input') 48 | refs = Dense( 49 | 256, 50 | input_dim=self.td.trainRefsOneHotX.shape[1], 51 | activation=None)(refs_input) 52 | refs = Dropout(dropout_pct)(refs) 53 | refs = BatchNormalization()(refs) 54 | refs = ELU()(refs) 55 | refs = Dense(64, activation=None)(refs) 56 | refs = Dropout(dropout_pct)(refs) 57 | refs = BatchNormalization()(refs) 58 | refs = ELU()(refs) 59 | 60 | cpcs_input = Input(shape=(self.td.trainCpcOneHotX.shape[1],), name='cpcs_input') 61 | cpcs = Dense( 62 | 32, 63 | input_dim=self.td.trainCpcOneHotX.shape[1], 64 | activation=None)(cpcs_input) 65 | cpcs = Dropout(dropout_pct)(cpcs) 66 | cpcs = BatchNormalization()(cpcs) 67 | cpcs = ELU()(cpcs) 68 | 69 | # Use pre-trained Word2Vec embeddings 70 | embedding_layer_input = Input(shape=(sequence_len,), name='embed_input') 71 | embedding_layer = Embedding(self.td.w2v_runtime.embedding_weights.shape[0], 72 | self.td.w2v_runtime.embedding_weights.shape[1], 73 | weights=[self.td.w2v_runtime.embedding_weights], 74 | input_length=sequence_len, 75 | trainable=False)(embedding_layer_input) 76 | deep = LSTM( 77 | lstm_size, 78 | dropout=dropout_pct, 79 | recurrent_dropout=dropout_pct, 80 | return_sequences=False, 81 | name='LSTM_1')(embedding_layer) 82 | deep = Dense(300, activation=None)(deep) 83 | deep = Dropout(dropout_pct)(deep) 84 | deep = BatchNormalization()(deep) 85 | deep = ELU()(deep) 86 | 87 | #model_inputs_to_concat = [cpcs, refs, deep] 88 | model_inputs_to_concat = [refs, deep] 89 | 90 | final_layer = Concatenate(name='concatenated_layer')(model_inputs_to_concat) 91 | output = Dense(64, activation=None)(final_layer) 92 | output = Dropout(dropout_pct)(output) 93 | output = BatchNormalization()(output) 94 | output = ELU()(output) 95 | output = Dense(1, activation='sigmoid')(output) 96 | 97 | #model = Model(inputs=[cpcs_input, refs_input, embedding_layer_input], outputs=output, name='model') 98 | model = Model(inputs=[refs_input, embedding_layer_input], outputs=output, name='model') 99 | model.compile(loss='binary_crossentropy', 100 | optimizer='adam', 101 | metrics=['accuracy', precision, recall, f1score]) 102 | 103 | self.tf_model = model 104 | print('Done building graph.') 105 | print(self.tf_model.summary()) 106 | 107 | def train_model(self, model, batch_size, num_epochs=5): 108 | print('Training model.') 109 | model.fit(x={ 110 | 'refs_input': self.td.trainRefsOneHotX, 111 | 'embed_input': 
self.td.padded_train_embed_x, 112 | 'cpcs_input': self.td.trainCpcOneHotX}, 113 | y=self.td.trainY, 114 | batch_size=batch_size, 115 | epochs=num_epochs, 116 | validation_data=( 117 | { 118 | 'refs_input': self.td.testRefsOneHotX, 119 | 'cpcs_input': self.td.testCpcOneHotX, 120 | 'embed_input': self.td.padded_test_embed_x}, 121 | self.td.testY)) 122 | return model 123 | 124 | def train_or_load_model(self, batch_size, num_epochs=5): 125 | model_dir = os.path.join(self.data_path, self.seed_name) 126 | model_path = os.path.join(model_dir, 'model.pb') 127 | 128 | if os.path.exists(model_path): 129 | print('Model exists at {}; loading existing trained model.'.format(model_path)) 130 | self.tf_model = keras.models.load_model( 131 | model_path, 132 | custom_objects={'precision': precision, 'recall': recall, 'fmeasure': f1score}) 133 | else: 134 | print('Model has not been trained yet.') 135 | tf_model = self.train_model(self.tf_model, batch_size, num_epochs) 136 | print('Saving model to {}'.format(model_path)) 137 | if not os.path.exists(model_dir): 138 | os.makedirs(model_dir) 139 | 140 | tf_model.save(model_path) 141 | print('Model persisted and ready for inference!') 142 | 143 | def evaluate_model(self, batch_size): 144 | score, acc, p, r, f1 = self.tf_model.evaluate( 145 | x={ 146 | 'refs_input': self.td.testRefsOneHotX, 147 | 'cpcs_input': self.td.testCpcOneHotX, 148 | 'embed_input': self.td.padded_test_embed_x 149 | }, 150 | y=self.td.testY, 151 | batch_size=batch_size) 152 | 153 | print('') 154 | print('Test score: {:.4f}'.format(score)) 155 | print('Test accuracy: {:.4f}'.format(acc)) 156 | print('Test p/r (f1): {:.2f}/{:.2f} ({:.2f})'.format(p, r, f1)) 157 | 158 | return (score, acc, p, r, f1) 159 | 160 | def batch_predict(self, padded_text_embeddings, refs_one_hot, cpcs_one_hot): 161 | return self.tf_model.predict( 162 | { 163 | 'embed_input': padded_text_embeddings, 164 | 'cpcs_input': cpcs_one_hot, 165 | 'refs_input': refs_one_hot 166 | }) 167 | 168 | def predict(self, train_data_util, text, refs, cpcs): 169 | ''' 170 | ''' 171 | 172 | adhoc_text = pd.Series([text]) 173 | adhoc_refs = pd.Series([refs]) 174 | adhoc_cpcs = pd.Series([cpcs]) 175 | 176 | padded_text_embeddings, refs_one_hot, cpcs_one_hot = \ 177 | train_data_util.prep_for_inference(adhoc_text, adhoc_refs, adhoc_cpcs) 178 | 179 | return self.batch_predict(padded_text_embeddings, refs_one_hot, cpcs_one_hot) 180 | 181 | def binary_prediction_idx(self, score): 182 | if score < .5: 183 | return 0 184 | return 1 185 | 186 | def label_to_idx(self, label): 187 | label = label.lower() 188 | for i in range(0, len(self.target_names)): 189 | if label == self.target_names[i]: 190 | return i 191 | raise ValueError('Label {} has no target name from [{}]'.format(label, self.target_names)) 192 | 193 | def reports(self, prediction_df): 194 | binary_predictions_x = prediction_df.score.apply(self.binary_prediction_idx) 195 | actual_labels_y = prediction_df.label.apply(self.label_to_idx) 196 | 197 | cr = classification_report(binary_predictions_x, actual_labels_y, target_names=self.target_names) 198 | cm = confusion_matrix(binary_predictions_x, actual_labels_y) 199 | 200 | return cr, cm 201 | 202 | def show_confusion_matrix(self, confusion_matrix): 203 | cm_df = pd.DataFrame(confusion_matrix) 204 | plt.figure(figsize = (10,7)) 205 | sn.heatmap(cm_df, xticklabels=self.target_names, yticklabels=self.target_names) 206 | -------------------------------------------------------------------------------- /models/landscaping/seeds/README.md: 
-------------------------------------------------------------------------------- 1 | # Example Patent Seed Sets 2 | This directory contains example seed sets for use in automated patent landscaping as described in the corresponding paper. 3 | 4 | ## Hair Dryer seed set 5 | The first, [hair_dryer_large.seed.csv](hair_dryer_large.seed.csv), contains seed patents related to hair dryers. Use this seed set to build a patent landscaping model and apply it to any text snippet to see whether it is likely to be a hair dryer patent. 6 | 7 | ## Video Codec seed set 8 | The second, [video_codec.seed.csv](video_codec.seed.csv), we have used publicly available data to provide a seed set of patents related to video codecs. 9 | 10 | **DISCLAIMER AND SOURCING FOR video_codec.seed.csv** 11 | 12 | The file video_codec.seed.csv provides a list of patents for a video codec seed set. The list is composed of US patents identified by MPEG LA for HEVC, MPEG2, MPEG4, and AVC in the PDFs linked below. The authors have not reviewed the contents of the patents contained in the list and some of the listed patents may not actually relate to video codecs. The inclusion of this file should not be construed as an acknowledgement that any of the listed patents actually cover any video codec standard. 13 | 14 | [http://www.mpegla.com/main/programs/M4v/Documents/m4v-att1.pdf](http://www.mpegla.com/main/programs/M4v/Documents/m4v-att1.pdf) 15 | [http://www.mpegla.com/main/programs/HEVC/Documents/hevc-att1.pdf](http://www.mpegla.com/main/programs/HEVC/Documents/hevc-att1.pdf) 16 | [http://www.mpegla.com/main/programs/M2/Documents/m2-att1.pdf](http://www.mpegla.com/main/programs/M2/Documents/m2-att1.pdf) 17 | [http://www.mpegla.com/main/programs/AVC/Documents/avc-att1.pdf](http://www.mpegla.com/main/programs/AVC/Documents/avc-att1.pdf) 18 | -------------------------------------------------------------------------------- /models/landscaping/seeds/hair_dryer.seed.csv: -------------------------------------------------------------------------------- 1 | 8407913 2 | 6907678 3 | 9144286 4 | 5956863 5 | 8893400 6 | 8782920 7 | 6798982 8 | 8256132 9 | 8307948 10 | 8459273 11 | 6305325 12 | 6739071 13 | 20150366316 14 | 5725159 15 | 6718651 16 | 6935046 17 | 7784750 18 | 8904663 19 | 5841943 20 | 20170079401 21 | 6199295 22 | 20140290087 23 | 6285828 24 | 20130326898 25 | 6671460 26 | 6011903 27 | 6269549 28 | 6885810 29 | 6038782 30 | 5875562 31 | 5647007 32 | 6314236 33 | 8517318 34 | 20080235980 35 | 20040093756 36 | 6986212 37 | 20060196075 38 | 6191930 39 | 7096597 40 | 5488783 41 | 6491267 42 | 4596921 43 | 7380347 44 | 4659907 45 | 20110203128 46 | 6199805 47 | 20040047620 48 | 20050069303 49 | 6889445 50 | 5701681 51 | 4827105 52 | 6725562 53 | 6354016 54 | 6449870 55 | 4493975 56 | 6026590 57 | 8720078 58 | 20030177657 59 | 20090100698 60 | 9675157 61 | 5790749 62 | 4955145 63 | 6067724 64 | 6732450 65 | 4524263 66 | 20120024620 67 | 7913416 68 | 7584759 69 | 20090065661 70 | 4711988 71 | 5434946 72 | 5842286 73 | 4453695 74 | 5351417 75 | 7350317 76 | 4316077 77 | 5651190 78 | 7047660 79 | 20100282810 80 | 5884008 81 | 4254324 82 | 4683369 83 | 5689896 84 | 6393718 85 | 6901936 86 | 6591516 87 | 20050108889 88 | 5606640 89 | 20110079239 90 | 4918289 91 | 20040231180 92 | 4700049 93 | 4972065 94 | 5448677 95 | 20060254073 96 | 5612849 97 | 4197448 98 | 20100014844 99 | 6188837 100 | 4687906 101 | 4759135 102 | 9675158 103 | 20060201016 104 | 3955065 105 | 4904847 106 | 5216822 107 | 5649370 108 | 4263500 109 | 6367162 110 | 
20040163274 111 | 4323761 112 | 4538362 113 | 20080216339 114 | 5613305 115 | 20160022004 116 | 4260875 117 | 20020189128 118 | 4602146 119 | 5572800 120 | 20040088878 121 | 3691646 122 | 20130276321 123 | 5729907 124 | 6408533 125 | 20090188126 126 | 8103155 127 | 20020112362 128 | 5148512 129 | 20130111777 130 | 4198558 131 | 4309595 132 | 4391047 133 | 4635382 134 | 5243682 135 | 4395619 136 | 4936027 137 | 4794225 138 | 6910281 139 | 20070294909 140 | 6026821 141 | 4114022 142 | 4977306 143 | 4767914 144 | 7801423 145 | 8732976 146 | 20010020668 147 | 4295283 148 | 5784800 149 | 7676952 150 | 20130277517 151 | 20040172847 152 | 4327278 153 | 20050284495 154 | 20060064892 155 | 20070137060 156 | 7264209 157 | 20080116753 158 | 4712313 159 | 4800654 160 | 3943329 161 | 4225106 162 | 6085435 163 | 4615347 164 | 5598640 165 | 8230615 166 | 3946498 167 | 3947659 168 | 3981314 169 | 4196343 170 | 8081873 171 | 6029364 172 | 4910385 173 | 5195253 174 | 4132360 175 | 4382174 176 | 5243683 177 | 4430808 178 | 20110177711 179 | 20060006294 180 | 4848007 181 | 4967060 182 | 20040159002 183 | 5235759 184 | 20060075654 185 | 4691451 186 | 5155925 187 | 4658511 188 | 5996243 189 | 20150335128 190 | 6049994 191 | 7204038 192 | 20040020070 193 | 4634836 194 | 5036601 195 | 5157757 196 | 4003388 197 | 20070245590 198 | 5013891 199 | 4706153 200 | 4701595 201 | 20120266483 202 | 20150037015 203 | 6041514 204 | 20130153461 205 | 4039774 206 | 4525623 207 | 20090188125 208 | 20100212177 209 | 7123823 210 | 20070119070 211 | 4490602 212 | 5473824 213 | 20140345156 214 | 4890395 215 | 4308670 216 | 4287673 217 | 20050229424 218 | 6089239 219 | 3775861 220 | 5317815 221 | 3846047 222 | 4195217 223 | 7086176 224 | 20040001707 225 | 7165341 226 | 6130991 227 | 20110162225 228 | 20050091867 229 | 6691429 230 | 4218608 231 | 4365141 232 | 9072358 233 | 20030196344 234 | 4647757 235 | 20090320873 236 | 8020827 237 | 4424437 238 | 20020108264 239 | 20070062058 240 | 4321456 241 | 6370326 242 | 20160213122 243 | 9498039 244 | 4297564 245 | 4924602 246 | 4471213 247 | 20160242524 248 | 4603246 249 | 20010025430 250 | 6108934 251 | 3818600 252 | 3717936 253 | 4556782 254 | 5172880 255 | 9578945 256 | 4939345 257 | 6377749 258 | 20170127804 259 | 5765792 260 | 4088869 261 | 20130104415 262 | 20130263464 263 | 4667086 264 | 20100162585 265 | 4118874 266 | 20070114219 267 | 20110073735 268 | 3132232 269 | 4896020 270 | 3202797 271 | 20050204577 272 | 3303325 273 | 9149105 274 | 20050139226 275 | 3109912 276 | 4225775 277 | 20160353854 278 | 2514528 279 | 20020174559 280 | 4896021 281 | 20160220005 282 | 3832789 283 | 20070169369 284 | 3550285 285 | 4514618 286 | 5325809 287 | 7308899 288 | 4365426 289 | 20010005943 290 | 20150289623 291 | 20160367005 292 | 20160367007 293 | 20170105502 294 | 3836750 295 | 3095496 296 | 3889693 297 | 20160262520 298 | 3978314 299 | 4320283 300 | 4370544 301 | 4300280 302 | 3981313 303 | 3594916 304 | 20120317829 305 | 4214149 306 | 3849902 307 | 20160367006 308 | 20050011534 309 | 3284611 310 | 3308268 311 | 20010051042 312 | 6266893 313 | 20060098962 314 | 20160120286 315 | 20060213074 316 | 3304625 317 | 3949487 318 | 20170105503 319 | 20080263887 320 | 3953710 321 | 994259 322 | 5271160 323 | 3937231 324 | 3555699 325 | 20170006991 326 | 4173231 327 | 4641014 328 | 5610990 329 | 3702031 330 | 3348020 331 | 3596371 332 | 4021930 333 | 20130091724 334 | 20160166036 335 | 20160367003 336 | 20140202020 337 | 20150189967 338 | 20070274696 339 | 20110198421 340 | 3782002 341 | 20170156471 
342 | 5531032 343 | 20080032543 344 | 3872336 345 | 3872607 346 | 20120291301 347 | 3731396 348 | 3362086 349 | 9185958 350 | 20160367014 351 | 20150201730 352 | 3831000 353 | 20020073573 354 | 6301800 355 | 20160309873 356 | 3836749 357 | 20140326713 358 | 8578623 359 | 20060026858 360 | 3992785 361 | 20150296954 362 | 3791045 363 | 5149209 364 | 9603430 365 | 3769718 366 | 20160051026 367 | 20050204578 368 | 3777406 369 | 20140047727 370 | 20130014402 371 | 20050204576 372 | 3763573 373 | 3358383 374 | 20150216283 375 | -------------------------------------------------------------------------------- /models/landscaping/seeds/hair_dryer_large.seed.csv: -------------------------------------------------------------------------------- 1 | 8407913 2 | 6907678 3 | 9144286 4 | 5956863 5 | 8893400 6 | 8782920 7 | 6798982 8 | 8256132 9 | 8307948 10 | 8459273 11 | 6305325 12 | 6739071 13 | 20150366316 14 | 5725159 15 | 6718651 16 | 6935046 17 | 7784750 18 | 8904663 19 | 5841943 20 | 20170079401 21 | 6199295 22 | 20140290087 23 | 6285828 24 | 20130326898 25 | 6671460 26 | 6011903 27 | 6269549 28 | 6885810 29 | 6038782 30 | 5875562 31 | 5647007 32 | 6314236 33 | 8517318 34 | 20080235980 35 | 20040093756 36 | 6986212 37 | 20060196075 38 | 6191930 39 | 7096597 40 | 5488783 41 | 6491267 42 | 4596921 43 | 7380347 44 | 4659907 45 | 20110203128 46 | 6199805 47 | 20040047620 48 | 20050069303 49 | 6889445 50 | 5701681 51 | 4827105 52 | 6725562 53 | 6354016 54 | 6449870 55 | 4493975 56 | 6026590 57 | 8720078 58 | 20030177657 59 | 20090100698 60 | 9675157 61 | 5790749 62 | 4955145 63 | 6067724 64 | 6732450 65 | 4524263 66 | 20120024620 67 | 7913416 68 | 7584759 69 | 20090065661 70 | 4711988 71 | 5434946 72 | 5842286 73 | 4453695 74 | 5351417 75 | 7350317 76 | 4316077 77 | 5651190 78 | 7047660 79 | 20100282810 80 | 5884008 81 | 4254324 82 | 4683369 83 | 5689896 84 | 6393718 85 | 6901936 86 | 6591516 87 | 20050108889 88 | 5606640 89 | 20110079239 90 | 4918289 91 | 20040231180 92 | 4700049 93 | 4972065 94 | 5448677 95 | 20060254073 96 | 5612849 97 | 4197448 98 | 20100014844 99 | 6188837 100 | 4687906 101 | 4759135 102 | 9675158 103 | 20060201016 104 | 3955065 105 | 4904847 106 | 5216822 107 | 5649370 108 | 4263500 109 | 6367162 110 | 20040163274 111 | 4323761 112 | 4538362 113 | 20080216339 114 | 5613305 115 | 20160022004 116 | 4260875 117 | 20020189128 118 | 4602146 119 | 5572800 120 | 20040088878 121 | 3691646 122 | 20130276321 123 | 5729907 124 | 6408533 125 | 20090188126 126 | 8103155 127 | 20020112362 128 | 5148512 129 | 20130111777 130 | 4198558 131 | 4309595 132 | 4391047 133 | 4635382 134 | 5243682 135 | 4395619 136 | 4936027 137 | 4794225 138 | 6910281 139 | 20070294909 140 | 6026821 141 | 4114022 142 | 4977306 143 | 4767914 144 | 7801423 145 | 8732976 146 | 20010020668 147 | 4295283 148 | 5784800 149 | 7676952 150 | 20130277517 151 | 20040172847 152 | 4327278 153 | 20050284495 154 | 20060064892 155 | 20070137060 156 | 7264209 157 | 20080116753 158 | 4712313 159 | 4800654 160 | 3943329 161 | 4225106 162 | 6085435 163 | 4615347 164 | 5598640 165 | 8230615 166 | 3946498 167 | 3947659 168 | 3981314 169 | 4196343 170 | 8081873 171 | 6029364 172 | 4910385 173 | 5195253 174 | 4132360 175 | 4382174 176 | 5243683 177 | 4430808 178 | 20110177711 179 | 20060006294 180 | 4848007 181 | 4967060 182 | 20040159002 183 | 5235759 184 | 20060075654 185 | 4691451 186 | 5155925 187 | 4658511 188 | 5996243 189 | 20150335128 190 | 6049994 191 | 7204038 192 | 20040020070 193 | 4634836 194 | 5036601 195 | 5157757 196 | 4003388 
197 | 20070245590 198 | 5013891 199 | 4706153 200 | 4701595 201 | 20120266483 202 | 20150037015 203 | 6041514 204 | 20130153461 205 | 4039774 206 | 4525623 207 | 20090188125 208 | 20100212177 209 | 7123823 210 | 20070119070 211 | 4490602 212 | 5473824 213 | 20140345156 214 | 4890395 215 | 4308670 216 | 4287673 217 | 20050229424 218 | 6089239 219 | 3775861 220 | 5317815 221 | 3846047 222 | 4195217 223 | 7086176 224 | 20040001707 225 | 7165341 226 | 6130991 227 | 20110162225 228 | 20050091867 229 | 6691429 230 | 4218608 231 | 4365141 232 | 9072358 233 | 20030196344 234 | 4647757 235 | 20090320873 236 | 8020827 237 | 4424437 238 | 20020108264 239 | 20070062058 240 | 4321456 241 | 6370326 242 | 20160213122 243 | 9498039 244 | 4297564 245 | 4924602 246 | 4471213 247 | 20160242524 248 | 4603246 249 | 20010025430 250 | 6108934 251 | 3818600 252 | 3717936 253 | 4556782 254 | 5172880 255 | 9578945 256 | 4939345 257 | 6377749 258 | 20170127804 259 | 5765792 260 | 4088869 261 | 20130104415 262 | 20130263464 263 | 4667086 264 | 20100162585 265 | 4118874 266 | 20070114219 267 | 20110073735 268 | 3132232 269 | 4896020 270 | 3202797 271 | 20050204577 272 | 3303325 273 | 9149105 274 | 20050139226 275 | 3109912 276 | 4225775 277 | 20160353854 278 | 2514528 279 | 20020174559 280 | 4896021 281 | 20160220005 282 | 3832789 283 | 20070169369 284 | 3550285 285 | 4514618 286 | 5325809 287 | 7308899 288 | 4365426 289 | 20010005943 290 | 20150289623 291 | 20160367005 292 | 20160367007 293 | 20170105502 294 | 3836750 295 | 3095496 296 | 3889693 297 | 20160262520 298 | 3978314 299 | 4320283 300 | 4370544 301 | 4300280 302 | 3981313 303 | 3594916 304 | 20120317829 305 | 4214149 306 | 3849902 307 | 20160367006 308 | 20050011534 309 | 3284611 310 | 3308268 311 | 20010051042 312 | 6266893 313 | 20060098962 314 | 20160120286 315 | 20060213074 316 | 3304625 317 | 3949487 318 | 20170105503 319 | 20080263887 320 | 3953710 321 | 994259 322 | 5271160 323 | 3937231 324 | 3555699 325 | 20170006991 326 | 4173231 327 | 4641014 328 | 5610990 329 | 3702031 330 | 3348020 331 | 3596371 332 | 4021930 333 | 20130091724 334 | 20160166036 335 | 20160367003 336 | 20140202020 337 | 20150189967 338 | 20070274696 339 | 20110198421 340 | 3782002 341 | 20170156471 342 | 5531032 343 | 20080032543 344 | 3872336 345 | 3872607 346 | 20120291301 347 | 3731396 348 | 3362086 349 | 9185958 350 | 20160367014 351 | 20150201730 352 | 3831000 353 | 20020073573 354 | 6301800 355 | 20160309873 356 | 3836749 357 | 20140326713 358 | 8578623 359 | 20060026858 360 | 3992785 361 | 20150296954 362 | 3791045 363 | 5149209 364 | 9603430 365 | 3769718 366 | 20160051026 367 | 20050204578 368 | 3777406 369 | 20140047727 370 | 20130014402 371 | 20050204576 372 | 3763573 373 | 3358383 374 | 20150216283 375 | 20110099832 376 | 4524263 377 | 5606640 378 | 5485931 379 | 9127885 380 | 5404419 381 | 20140190033 382 | 5031778 383 | 4195416 384 | 5107603 385 | 20090173286 386 | 4934855 387 | 5996249 388 | 5388344 389 | 5655257 390 | 5675907 391 | 4634836 392 | 5642572 393 | 5829162 394 | 6130991 395 | 8371246 396 | 4742199 397 | 4199873 398 | 4406071 399 | 20010055540 400 | 4035927 401 | 5765792 402 | 3985102 403 | RE30266 404 | 20160348301 405 | 20170055807 406 | 5170038 407 | 20080032543 408 | 3872336 409 | 4044474 410 | 20060281664 411 | -------------------------------------------------------------------------------- /models/landscaping/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from keras.preprocessing import text 16 | 17 | import re 18 | import string 19 | 20 | class TextTokenizer: 21 | punct_regex = re.compile('([%s])' % (string.punctuation + '‘’')) 22 | spaces_regex = re.compile(r'\s{2,}') 23 | number_regex = re.compile(r'\d+') 24 | keras_tokenizer = None 25 | 26 | def __init__( 27 | self): 28 | ''' 29 | ''' 30 | 31 | 32 | 33 | def tokenize_to_onehot_matrix(self, text_series, vocab_size, keras_tokenizer=None): 34 | ''' 35 | ''' 36 | if keras_tokenizer is None: 37 | print('No Keras tokenizer supplied so using vocab size ({}) and series to build new one'.format(vocab_size)) 38 | 39 | keras_tokenizer = text.Tokenizer( 40 | num_words=vocab_size, 41 | split=",", 42 | # filter should be same as default, minus the '-' 43 | filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n', 44 | lower=False) 45 | keras_tokenizer.fit_on_texts(text_series) 46 | keras_tokenizer.index_word = {idx: word for word, idx in keras_tokenizer.word_index.items()} 47 | 48 | text_one_hot = keras_tokenizer.texts_to_matrix(text_series) 49 | 50 | return keras_tokenizer, text_one_hot 51 | 52 | 53 | def tokenize( 54 | self, 55 | text, 56 | normalize_numbers=True, 57 | lowercase=True, 58 | remove_punct=True, 59 | lemmatize=False): 60 | ''' 61 | ''' 62 | 63 | #plain_text = html2text.html2text(text) 64 | plain_text = text 65 | if not isinstance(plain_text, str): 66 | raise Exception(plain_text, type(plain_text)) 67 | 68 | preprocessed = plain_text.replace('\'', '') 69 | if lowercase: 70 | preprocessed = preprocessed.lower() 71 | 72 | # Replace punctuation with spaces which handles cases like "searching/filter", 73 | # "nothing:)" and "writing.like.this" very well. 74 | # The double spaces that often result are then collased by the next method 75 | if remove_punct: 76 | preprocessed = self.punct_regex.sub(' ', preprocessed) 77 | else: 78 | preprocessed = self.punct_regex.sub(r' \1 ', preprocessed) 79 | 80 | preprocessed = self.spaces_regex.sub(' ', preprocessed) 81 | if normalize_numbers: 82 | preprocessed = self.number_regex.sub('_NUMBER_', preprocessed) 83 | 84 | if lemmatize: 85 | preprocessed = shared_funcs.NltkLemmatize( 86 | preprocessed, stem_post_lemmatize=False 87 | ) 88 | 89 | return preprocessed.split() 90 | 91 | 92 | def tokenize_series( 93 | self, 94 | text_series, 95 | normalize_numbers=True, 96 | lowercase=True, 97 | remove_punct=True, 98 | lemmatize=False): 99 | ''' 100 | ''' 101 | 102 | return text_series.apply(self.tokenize) 103 | 104 | -------------------------------------------------------------------------------- /models/landscaping/train_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import tokenizer 16 | import importlib 17 | import random 18 | import numpy as np 19 | 20 | from keras.preprocessing import sequence 21 | 22 | importlib.reload(tokenizer) 23 | 24 | class LandscapeTrainingDataUtil: 25 | RAND_SEED=314159 26 | refs_vocab_size = 50000 27 | training_df = None 28 | series_text_to_embed = None 29 | prepped_embedding_train = None 30 | prepped_refs = None 31 | prepped_labels = None 32 | w2v_runtime = None 33 | ref_to_id = None 34 | id_to_ref = None 35 | tokenizer = None 36 | sequence_len = None 37 | 38 | def __init__( 39 | self, training_df, w2v_runtime): 40 | ''' 41 | ''' 42 | 43 | self.w2v_runtime = w2v_runtime 44 | self.training_df = training_df 45 | 46 | self.tokenizer = tokenizer.TextTokenizer() 47 | 48 | def label_text_to_id(self, label_name): 49 | if label_name == 'antiseed': 50 | return 1 51 | else: 52 | return 0 53 | 54 | def label_id_to_text(self, label_idx): 55 | if label_idx == 1: 56 | return 'antiseed' 57 | else: 58 | return 'seed' 59 | 60 | def label_series_to_index(self, labels_series): 61 | labels_indexed = [] 62 | for idx in range(0, len(labels_series)): 63 | label = labels_series[idx] 64 | # 'tokenize' on the label is basically normalizing it 65 | tokenized_label = self.tokenizer.tokenize(label)[0] 66 | label_idx = self.label_text_to_id(tokenized_label) 67 | labels_indexed.append(label_idx) 68 | 69 | return labels_indexed 70 | 71 | def text_series_to_embeddings(self, raw_series_text): 72 | ''' 73 | Takes as input a series of text and associated labels 74 | ''' 75 | 76 | tokenized_text = self.tokenizer.tokenize_series(raw_series_text) 77 | word_to_index_dict = self.w2v_runtime.word_to_index 78 | tokenized_indexed_text = [] 79 | 80 | for idx in range(0, len(tokenized_text)): 81 | text = tokenized_text[idx] 82 | text_word_indexes = [] 83 | for word in text: 84 | if word in word_to_index_dict: 85 | word_idx = word_to_index_dict[word] 86 | else: 87 | word_idx = word_to_index_dict['UNK'] 88 | # this skips 'the' so it can be used for dynamic rnn 89 | if word_idx > 0: 90 | text_word_indexes.append(word_idx) 91 | 92 | tokenized_indexed_text.append(text_word_indexes) 93 | 94 | return tokenized_indexed_text 95 | 96 | def to_text(self, integerized): 97 | words = [] 98 | for word_int in integerized: 99 | words.append(self.w2v_runtime.index_to_word[word_int]) 100 | return ' '.join(words) 101 | 102 | def randomize_and_split(self, percent_train): 103 | training_data_to_shuffle = list( 104 | zip( 105 | self.prepped_embedding_train, 106 | self.refs_one_hot, 107 | self.cpc_one_hot, 108 | self.prepped_labels)) 109 | 110 | print('Randomizing training data') 111 | random.seed(self.RAND_SEED) 112 | random.shuffle(training_data_to_shuffle) 113 | 114 | train_embed_arr, refs_one_hot, cpc_one_hot, label_arr = zip(*training_data_to_shuffle) 115 | 116 | train_idx = int(len(train_embed_arr) * percent_train) 117 | 118 | print('Creating NumPy arrays for train/test set out of randomized training data.') 119 | self.trainEmbedX = np.array(train_embed_arr[:train_idx]) 120 | self.trainRefsOneHotX = np.array(refs_one_hot[:train_idx]) 
121 | self.trainCpcOneHotX = np.array(cpc_one_hot[:train_idx]) 122 | 123 | self.testEmbedX = np.array(train_embed_arr[train_idx:]) 124 | self.testRefsOneHotX = np.array(refs_one_hot[train_idx:]) 125 | self.testCpcOneHotX = np.array(cpc_one_hot[train_idx:]) 126 | 127 | self.trainY = np.array(label_arr[:train_idx]) 128 | self.testY = np.array(label_arr[train_idx:]) 129 | 130 | def prepare_training_data( 131 | self, labels_series, series_text_to_embed, refs_series, cpc_series, percent_train, refs_vocab_size, cpc_vocab_size): 132 | 133 | self.series_text_to_embed = series_text_to_embed 134 | self.prepped_embedding_train = self.text_series_to_embeddings(self.series_text_to_embed) 135 | self.prepped_labels = self.label_series_to_index(labels_series) 136 | self.refs_tokenizer, self.refs_one_hot = \ 137 | self.tokenizer.tokenize_to_onehot_matrix(refs_series, refs_vocab_size) 138 | self.cpc_tokenizer, self.cpc_one_hot = \ 139 | self.tokenizer.tokenize_to_onehot_matrix(cpc_series, cpc_vocab_size) 140 | 141 | self.randomize_and_split(percent_train) 142 | 143 | print('Train (embed) data shapes: train: {}, train labels shape: {}'.format( 144 | self.trainEmbedX.shape, self.trainY.shape)) 145 | print('Test (embed) data shape: {}, test labels shape: {}'.format( 146 | self.testEmbedX.shape, self.testY.shape)) 147 | 148 | doc_lengths = list(map(len, self.trainEmbedX)) 149 | median_doc_length = int(np.median(doc_lengths)) 150 | max_doc_length = np.max(doc_lengths) 151 | print('doc lengths for embedding layer: median: {}, mean: {}, max: {}'.format( 152 | median_doc_length, np.mean(doc_lengths), max_doc_length)) 153 | 154 | sequence_len = max_doc_length 155 | self.sequence_len = sequence_len 156 | 157 | print('Using sequence length of {} to pad LSTM sequences.'.format(sequence_len)) 158 | self.padded_train_embed_x = sequence.pad_sequences( 159 | self.trainEmbedX, maxlen=sequence_len, padding='pre', truncating='post') 160 | self.padded_test_embed_x = sequence.pad_sequences( 161 | self.testEmbedX, maxlen=sequence_len, padding='pre', truncating='post') 162 | 163 | print('Training data ready.') 164 | 165 | return self 166 | 167 | def prep_for_inference( 168 | self, series_text_to_embed, refs_series, cpc_series): 169 | 170 | prepped_embedding = self.text_series_to_embeddings(series_text_to_embed) 171 | 172 | _, refs_one_hot = \ 173 | self.tokenizer.tokenize_to_onehot_matrix(refs_series, None, self.refs_tokenizer) 174 | _, cpc_one_hot = \ 175 | self.tokenizer.tokenize_to_onehot_matrix(cpc_series, None, self.cpc_tokenizer) 176 | 177 | prepped_embedding = np.array(prepped_embedding) 178 | refs_one_hot = np.array(refs_one_hot) 179 | cpc_one_hot = np.array(cpc_one_hot) 180 | 181 | doc_lengths = list(map(len, self.trainEmbedX)) 182 | sequence_len = np.max(doc_lengths) 183 | 184 | padded_embed = sequence.pad_sequences( 185 | prepped_embedding, maxlen=sequence_len, padding='pre', truncating='post') 186 | 187 | return padded_embed, refs_one_hot, cpc_one_hot 188 | 189 | def show_instance_details(self, train_instance_idx): 190 | print('\nOriginal: {}\nTokenized: {}\nIntegerized: {}\nLabelIntegerized: {}'.format( 191 | self.series_text_to_embed[train_instance_idx], 192 | self.to_text(self.prepped_embedding_train[train_instance_idx]), 193 | self.prepped_embedding_train[train_instance_idx], 194 | self.prepped_labels[train_instance_idx])) 195 | -------------------------------------------------------------------------------- /tables/dataset_Berkeley Fung.md: 
-------------------------------------------------------------------------------- 1 | 2 | --- 3 | geometry: margin=0.6in 4 | --- 5 | 6 | # Berkeley Fung 7 | 8 | 9 | ***** 10 | ## erudite-marker-539:JEMS16.assignee_disambiguation 11 | 12 | 13 | 14 | > Accompanying materials to 15 | > 16 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 17 | > 18 | > Additional links: 19 | > 20 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 21 | > 22 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 23 | > 24 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 25 | 26 | 27 | 28 | 29 | 30 | | Stat | Value | 31 | |----------|----------| 32 | | Last updated | 2018-02-15 | 33 | | Rows | 5,272,283 | 34 | | Size | 239.7 MB | 35 | 36 | ### Schema 37 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.assignee_disambiguation) 38 | 39 | * `PatentNo` STRING NULLABLE 40 | 41 | > Patent number 42 | 43 | * `pdpass` STRING NULLABLE 44 | 45 | > Pdpass (unique identifier of assignees) 46 | 47 | * `assignee_disambiguated` STRING NULLABLE 48 | 49 | > Standardized assignee name 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | ***** 65 | ## erudite-marker-539:JEMS16.assignee_raw 66 | 67 | 68 | 69 | > Accompanying materials to 70 | > 71 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 72 | > 73 | > Additional links: 74 | > 75 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 76 | > 77 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 78 | > 79 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 80 | 81 | 82 | 83 | 84 | 85 | | Stat | Value | 86 | |----------|----------| 87 | | Last updated | 2018-02-15 | 88 | | Rows | 8,579,322 | 89 | | Size | 767.6 MB | 90 | 91 | ### Schema 92 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.assignee_raw) 93 | 94 | * `id` INTEGER NULLABLE 95 | 96 | > System generated 97 | 98 | * `PatentNo` STRING NULLABLE 99 | 100 | > Patent number 101 | 102 | * `Company` STRING NULLABLE 103 | 104 | > Assignee name (can be companies, universities, government agencies, or simply person name) 105 | 106 | * `Geography` STRING NULLABLE 107 | 108 | > Raw (city, state, country) tuple of assginee 109 | 110 | * `Country` STRING NULLABLE 111 | 112 | > Country Code derived from field 'Geography' 113 | 114 | * `State` STRING NULLABLE 115 | 116 | > State Code derived from field 'Geography' (if in U.S.) 
117 | 118 | * `City` STRING NULLABLE 119 | 120 | > City Name derived from field 'Geography' 121 | 122 | * `Sequence` STRING NULLABLE 123 | 124 | > Order of appearance (0 means the first assignee, 1 means the second assignee, ..., etc) 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | ***** 150 | ## erudite-marker-539:JEMS16.citation 151 | 152 | 153 | 154 | > Accompanying materials to 155 | > 156 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 157 | > 158 | > Additional links: 159 | > 160 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 161 | > 162 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 163 | > 164 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 165 | 166 | 167 | 168 | 169 | 170 | | Stat | Value | 171 | |----------|----------| 172 | | Last updated | 2018-02-15 | 173 | | Rows | 174,205,746 | 174 | | Size | 10.4 GB | 175 | 176 | ### Schema 177 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.citation) 178 | 179 | * `id` INTEGER NULLABLE 180 | 181 | > System generated 182 | 183 | * `PatentNo_citing` STRING NULLABLE 184 | 185 | > Patent number (citing) 186 | 187 | * `CountryCodeOrNPL_cited` STRING NULLABLE 188 | 189 | > U.S. or a foreign country code or NPL (non-patent literature) of the cited art 190 | 191 | * `PatentNoOrNPL_cited` STRING NULLABLE 192 | 193 | > Patent number of non-patent literature (cited) 194 | 195 | * `sequence` STRING NULLABLE 196 | 197 | > Order of appearance (0 means the first cited art, 1 means the second cited art, ..., etc) 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | ***** 217 | ## erudite-marker-539:JEMS16.citation_self 218 | 219 | 220 | 221 | > Accompanying materials to 222 | > 223 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 224 | > 225 | > Additional links: 226 | > 227 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 228 | > 229 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 230 | > 231 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 
232 | 233 | 234 | 235 | 236 | 237 | | Stat | Value | 238 | |----------|----------| 239 | | Last updated | 2018-02-15 | 240 | | Rows | 1,667,637 | 241 | | Size | 20.1 MB | 242 | 243 | ### Schema 244 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.citation_self) 245 | 246 | * `PatentNo` STRING NULLABLE 247 | 248 | > Patent number 249 | 250 | * `Self_Citation_Flag` STRING NULLABLE 251 | 252 | > Backward prior art cites to the same pdpass 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | ***** 266 | ## erudite-marker-539:JEMS16.cpc 267 | 268 | 269 | 270 | > Accompanying materials to 271 | > 272 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 273 | > 274 | > Additional links: 275 | > 276 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 277 | > 278 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 279 | > 280 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 281 | 282 | 283 | 284 | 285 | 286 | | Stat | Value | 287 | |----------|----------| 288 | | Last updated | 2018-02-15 | 289 | | Rows | 65,896,459 | 290 | | Size | 3.7 GB | 291 | 292 | ### Schema 293 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.cpc) 294 | 295 | * `id` INTEGER NULLABLE 296 | 297 | > System generated 298 | 299 | * `PatentNo` STRING NULLABLE 300 | 301 | > Patent number 302 | 303 | * `Type` STRING NULLABLE 304 | 305 | > CPC 306 | 307 | * `CPC_Full` STRING NULLABLE 308 | 309 | > Full CPC 310 | 311 | * `CPC_Layer_1` STRING NULLABLE 312 | 313 | > CPC top layer 1: before {space} 314 | 315 | * `CPC_Layer_2` STRING NULLABLE 316 | 317 | > CPC top layer 2: before {slash} 318 | 319 | * `Sequence` STRING NULLABLE 320 | 321 | > Order of appearance (0 means the first CPC, 1 means the second CPC, ..., etc) 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | ***** 345 | ## erudite-marker-539:JEMS16.inventor_disambiguated_2 346 | 347 | 348 | Old table version `2`, schema skipped. 349 | 350 | 351 | 352 | 353 | 354 | ***** 355 | ## erudite-marker-539:JEMS16.inventor_disambiguated_3 356 | 357 | 358 | 359 | > Accompanying materials to 360 | > 361 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 362 | > 363 | > Additional links: 364 | > 365 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 366 | > 367 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 368 | > 369 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 
370 | 371 | 372 | > same table to inventor_disambiguated_2, except for data type differences easier for table joining 373 | 374 | 375 | 376 | 377 | | Stat | Value | 378 | |----------|----------| 379 | | Last updated | 2018-02-23 | 380 | | Rows | 13,345,776 | 381 | | Size | 492.6 MB | 382 | 383 | ### Schema 384 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.inventor_disambiguated_3) 385 | 386 | * `PatentNo` STRING NULLABLE 387 | 388 | * `InventorFullname` STRING NULLABLE 389 | 390 | * `InventorID` STRING NULLABLE 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | ***** 406 | ## erudite-marker-539:JEMS16.inventor_raw 407 | 408 | 409 | 410 | > Accompanying materials to 411 | > 412 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 413 | > 414 | > Additional links: 415 | > 416 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 417 | > 418 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 419 | > 420 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 421 | 422 | 423 | 424 | 425 | 426 | | Stat | Value | 427 | |----------|----------| 428 | | Last updated | 2018-02-15 | 429 | | Rows | 14,745,325 | 430 | | Size | 1.2 GB | 431 | 432 | ### Schema 433 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.inventor_raw) 434 | 435 | * `PatentNo` STRING NULLABLE 436 | 437 | > Patent number 438 | 439 | * `Sequence` STRING NULLABLE 440 | 441 | > Order of appearance (0 means the first inventor, 1 means the second inventor, ..., etc) 442 | 443 | * `FullName` STRING NULLABLE 444 | 445 | > Full name (in form of Last Name {semicolon} First Name {single space} Middle Name) 446 | 447 | * `LastName` STRING NULLABLE 448 | 449 | > Last Name 450 | 451 | * `FirstMiddleName` STRING NULLABLE 452 | 453 | > First Name {single space} Middle Name 454 | 455 | * `Geography` STRING NULLABLE 456 | 457 | > Raw (city, state, country) tuple of assginee 458 | 459 | * `Country` STRING NULLABLE 460 | 461 | > Country Code derived from field 'Geography' 462 | 463 | * `State` STRING NULLABLE 464 | 465 | > State Code derived from field 'Geography' (if in U.S.) 466 | 467 | * `City` STRING NULLABLE 468 | 469 | > City Code derived from field 'Geography' 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | ***** 497 | ## erudite-marker-539:JEMS16.patent_metadata_2 498 | 499 | 500 | 501 | > Accompanying materials to 502 | > 503 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 
504 | > 505 | > Additional links: 506 | > 507 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 508 | > 509 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 510 | > 511 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 512 | 513 | 514 | 515 | 516 | 517 | | Stat | Value | 518 | |----------|----------| 519 | | Last updated | 2018-02-15 | 520 | | Rows | 6,492,363 | 521 | | Size | 5.4 GB | 522 | 523 | ### Schema 524 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.patent_metadata_2) 525 | 526 | * `ApplNo` STRING NULLABLE 527 | 528 | > Application number 529 | 530 | * `ApplDate` STRING NULLABLE 531 | 532 | > Application date 533 | 534 | * `PatentNo` STRING NULLABLE 535 | 536 | > Patent number 537 | 538 | * `IssueDate` STRING NULLABLE 539 | 540 | > Patent issue date or grant date 541 | 542 | * `FamilyID` STRING NULLABLE joins on **family_id** 543 | 544 | > Patent Family ID derived from USPTO HTML page of the focal patent 545 | 546 | * `LawFirm` STRING NULLABLE 547 | 548 | > Agent / Law Firm / Correspondent 549 | 550 | * `AssistExaminer` STRING NULLABLE 551 | 552 | > Assistant examiner 553 | 554 | * `PrimaryExaminer` STRING NULLABLE 555 | 556 | > Primary examiner 557 | 558 | * `Title` STRING NULLABLE 559 | 560 | > Patent title 561 | 562 | * `Abstract` STRING NULLABLE 563 | 564 | > Patent abstract 565 | 566 | * `GovernmentInterests` STRING NULLABLE 567 | 568 | > Full text statement acknowledging U.S. government supports (if any) 569 | 570 | 571 | 572 | ### Join columns 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | #### FamilyID 584 | 585 | joins to `patents-public-data:patents.publications::family_id` on **family_id** (87.45%, 5,677,428 rows) 586 | 587 | | Key | Percent | Rows | Sample values | 588 | |------|-----|--------|--------------------------------------------------------| 589 | | `all` | 87.45% | 5,677,428 | `['41164314', '45348360', '46277349', '25524495', '44708394']` | 590 | 591 | 592 | #standardSQL 593 | SELECT 594 | COUNT(*) AS cnt, 595 | COUNT(second.second_column) AS second_cnt, 596 | ARRAY_AGG(first.FamilyID IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 597 | FROM `erudite-marker-539.JEMS16.patent_metadata_2`AS first 598 | LEFT JOIN ( 599 | SELECT family_id AS second_column, COUNT(*) AS cnt 600 | FROM `patents-public-data.patents.publications` 601 | GROUP BY 1 602 | ) AS second ON first.FamilyID = second.second_column 603 | 604 | 605 | 606 | joins from `patents-public-data:patents.publications::family_id` on **family_id** (25.67%, 25,206,642 rows) 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | ***** 626 | ## erudite-marker-539:JEMS16.patent_novelty 627 | 628 | 629 | 630 | > Accompanying materials to 631 | > 632 | > Balsmeier, B., Assaf, M., Chesebro, T., Fierro, G., Johnson, K., Johnson, S., Li, G., W.S. Lueck, O’Reagan, D., Yeh, W., Zang, G., Fleming, L. “Machine learning and natural language processing applied to the patent corpus.” Forthcoming at Journal of Economics and Management Strategy. 
633 | > 634 | > Additional links: 635 | > 636 | > o Inventor disambiguation golden file: http://fung-storage.coe.berkeley.edu/disambig.golden.list.txt 637 | > 638 | > o Inventor social network: http://fung-storage.coe.berkeley.edu/inventors/ 639 | > 640 | > “UCB Fung Institute Patent Data” by the University of California: Berkeley is licensed under a Creative Commons Attribution 4.0 International license. 641 | 642 | 643 | 644 | 645 | 646 | | Stat | Value | 647 | |----------|----------| 648 | | Last updated | 2018-02-15 | 649 | | Rows | 2,816,425 | 650 | | Size | 90.7 MB | 651 | 652 | ### Schema 653 | [View in BigQuery](https://bigquery.cloud.google.com/table/erudite-marker-539:JEMS16.patent_novelty) 654 | 655 | * `PatentNo` STRING NULLABLE 656 | 657 | > Patent number 658 | 659 | * `Word` STRING NULLABLE 660 | 661 | > New word (unigram) 662 | 663 | * `CurrentUse` STRING NULLABLE 664 | 665 | > Number of occurrence of the new word in the focal patent 666 | 667 | * `FutureUse` STRING NULLABLE 668 | 669 | > Number of appearances of the new word in subsequent patents (up until Dec 31, 2014) 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | -------------------------------------------------------------------------------- /tables/dataset_Berkeley Fung.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_Berkeley Fung.md.pdf -------------------------------------------------------------------------------- /tables/dataset_CPA Global.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | geometry: margin=0.6in 4 | --- 5 | 6 | # CPA Global 7 | 8 | 9 | ***** 10 | ## innography-174118:technical_standards.etsi 11 | 12 | 13 | 14 | > European Telecommunications Standards Institute (ETSI) IPR dataset for technical standards. 15 | > These are the US assets disclosed by companies as related to technical standards in ETSI. The two major ones included are 3GPP and LTE. 16 | 17 | 18 | > “Innography ETSI Data” by Innography (through ETSI IPR) is licensed under a Creative Commons Attribution 4.0 International License. 
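For a quick look at the data, the standards table can be joined to the main publications table directly from Python. The snippet below is an illustrative sketch only (it is not part of this dataset): it assumes the `google-cloud-bigquery` client library and default application credentials, and uses the `PublicationNumber` and `TechnicalStandard` columns shown in the schema below.

    from google.cloud import bigquery

    client = bigquery.Client()  # uses your default project and credentials
    query = """
    #standardSQL
    SELECT e.TechnicalStandard, COUNT(*) AS assets
    FROM `innography-174118.technical_standards.etsi` AS e
    JOIN `patents-public-data.patents.publications` AS p
      ON e.PublicationNumber = p.publication_number
    GROUP BY 1
    ORDER BY assets DESC
    """
    for row in client.query(query).result():
        print(row.TechnicalStandard, row.assets)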
19 | 20 | 21 | 22 | 23 | | Stat | Value | 24 | |----------|----------| 25 | | Last updated | 2017-07-25 | 26 | | Rows | 34,465 | 27 | | Size | 1.0 MB | 28 | 29 | ### Schema 30 | [View in BigQuery](https://bigquery.cloud.google.com/table/innography-174118:technical_standards.etsi) 31 | 32 | * `PublicationNumber` STRING REQUIRED joins on **publication_number** 33 | 34 | * `StandardBody` STRING REQUIRED 35 | 36 | * `TechnicalStandard` STRING REQUIRED 37 | 38 | 39 | 40 | ### Join columns 41 | 42 | 43 | #### PublicationNumber 44 | 45 | joins to `patents-public-data:patents.publications::publication_number` on **publication_number** (99.98%, 34,458 rows) 46 | 47 | | Key | Percent | Rows | Sample values | 48 | |------|-----|--------|--------------------------------------------------------| 49 | | `3GPP, LTE` | 99.99% | 8,381 | `['US-2009141670-A1', 'US-9673942-B2', 'US-2003185390-A1', 'US-7489672-B2', 'US-8347177-B2']` | 50 | | `LTE` | 99.99% | 7,071 | `['US-2011064120-A1', 'US-2009325504-A1', 'US-2014094175-A1', 'US-6163533-A', 'US-8009661-B2']` | 51 | | `3GPP` | 99.97% | 19,006 | `['US-8594035-B2', 'US-2012014344-A1', 'US-2017012727-A1', 'US-9648048-B2', 'US-2005065801-A1']` | 52 | 53 | 54 | #standardSQL 55 | SELECT 56 | COUNT(*) AS cnt, 57 | COUNT(second.second_column) AS second_cnt, 58 | first.TechnicalStandard AS grouped, 59 | ARRAY_AGG(first.PublicationNumber IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 60 | FROM `innography-174118.technical_standards.etsi`AS first 61 | LEFT JOIN ( 62 | SELECT publication_number AS second_column, COUNT(*) AS cnt 63 | FROM `patents-public-data.patents.publications` 64 | GROUP BY 1 65 | ) AS second ON first.PublicationNumber = second.second_column 66 | GROUP BY 3 67 | 68 | 69 | 70 | joins from `patents-public-data:patents.publications::publication_number` on **publication_number** (0.04%, 34,458 rows) 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /tables/dataset_CPA Global.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_CPA Global.md.pdf -------------------------------------------------------------------------------- /tables/dataset_European Bioinformatics Institute.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_European Bioinformatics Institute.md.pdf -------------------------------------------------------------------------------- /tables/dataset_Google Patents Public Datasets.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_Google Patents Public Datasets.md.pdf -------------------------------------------------------------------------------- /tables/dataset_Other.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_Other.md.pdf -------------------------------------------------------------------------------- /tables/dataset_USPTO.md.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/dataset_USPTO.md.pdf -------------------------------------------------------------------------------- /tables/index.md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/patents-public-data/d3d08a00e41e411be11b8a1394dd6d84f1312463/tables/index.md.pdf -------------------------------------------------------------------------------- /tools/bigquery-indexer/README.md: -------------------------------------------------------------------------------- 1 | # BigQuery column indexer 2 | 3 | This tool supports indexing and normalizing various columns in BigQuery tables. 4 | It reads an input BigQuery SQL statement to select the columns and outputs a new 5 | BigQuery table with the indexed columns. 6 | 7 | 8 | # Running locally (development) 9 | 10 | Build the runner container image with RDKit and Beam dependencies installed. Install the gcloud SDK and authenticate. 11 | 12 | ``` 13 | $ podman --cgroup-manager=cgroupfs build ./beam-rdkit-runner --format docker 14 | ... 15 | STEP 14: COMMIT 16 | --> 49b365fef6f 17 | $ podman run -it --entrypoint "/bin/bash" -v .:/opt/bigquery-indexer 49b365fef6f 18 | (beam-env) root@94bb44368d14$ wget https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-352.0.0-linux-x86_64.tar.gz 19 | (beam-env) root@94bb44368d14$ tar -xzf google-cloud-sdk-352.0.0-linux-x86_64.tar.gz 20 | (beam-env) root@94bb44368d14$ google-cloud-sdk/install.sh 21 | (beam-env) root@94bb44368d14$ gcloud init 22 | (beam-env) root@94bb44368d14$ gcloud auth application-default login 23 | (beam-env) root@94bb44368d14$ cd /opt/bigquery-indexer && python3 -m main --input_sql "SELECT * FROM nih-nci-cbiit-chem-prod.savi.all LIMIT 100" --output_table :savi.fingerprints --project --temp_location gs:///tmp/ --skip_fingerprint_columns reaction_smiles 24 | ``` 25 | 26 | # Running in Dataflow on GCP 27 | 28 | ``` 29 | $ pip install 'apache-beam[gcp]=2.31.0' 30 | ``` 31 | 32 | See https://cloud.google.com/dataflow/docs/quickstarts/quickstart-python, 33 | specifically setting GOOGLE_APPLICATION_CREDENTIALS is required. 34 | 35 | Use your local GCP account credentials by executing: 36 | 37 | ``` 38 | $ gcloud init 39 | $ gcloud auth application-default login 40 | ``` 41 | 42 | Build the runner container. 43 | 44 | ``` 45 | patents-public-data$ cd tools/bigquery-indexer/beam-rdkit-runner 46 | beam-rdkit-runner$ gcloud builds submit --tag gcr.io//beam-rdkit-runner:latest 47 | ``` 48 | 49 | This example indexes a column containing SMILES (the computer representation of a chemical). 50 | 51 | ```$ python3 -m main --input_sql "SELECT * FROM nih-nci-cbiit-chem-prod.savi.all LIMIT 100" --output_table :savi.fingerprints --project --temp_location gs:///tmp/ --skip_fingerprint_columns reaction_smiles --runner DataflowRunner --max_num_workers=20 --region us-central1 --machine_type=n2-highcpu-16 --disk_size_gb=50 --experiment=use_runner_v2 --sdk_container_image=gcr.io//beam-rdkit-runner:latest --save_main_session``` 52 | 53 | See more configuration flags at https://cloud.google.com/dataflow/docs/guides/flexrs and regions at https://cloud.google.com/dataflow/docs/resources/locations. 
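
# Checking fingerprints locally

The pipeline stores each fingerprint as the base64 form of an RDKit bit vector. As a rough local sanity check (this is an illustration, not part of the pipeline), the snippet below rebuilds the same four fingerprint types that `main.py` generates and compares two example molecules with Tanimoto similarity. It assumes an environment with RDKit installed, such as the `beam-rdkit-runner` image.

```
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator

def fingerprints(smiles):
    # Same generators as used by index_row()/generate_fingeprints() in main.py.
    mol = Chem.MolFromSmiles(smiles)
    return {
        'morgan_fp': rdFingerprintGenerator.GetMorganGenerator().GetFingerprint(mol),
        'rdkit_fp': rdFingerprintGenerator.GetRDKitFPGenerator().GetFingerprint(mol),
        'atompair_fp': rdFingerprintGenerator.GetAtomPairGenerator().GetFingerprint(mol),
        'tt_fp': rdFingerprintGenerator.GetTopologicalTorsionGenerator().GetFingerprint(mol),
    }

a = fingerprints('CC(=O)Oc1ccccc1C(=O)O')  # aspirin
b = fingerprints('OC(=O)c1ccccc1O')        # salicylic acid
for name, fp in a.items():
    print(name, round(DataStructs.TanimotoSimilarity(fp, b[name]), 3))
```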
54 | -------------------------------------------------------------------------------- /tools/bigquery-indexer/beam-rdkit-runner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM index.docker.io/library/debian:stable-slim 2 | 3 | RUN apt-get update 4 | 5 | RUN apt-get install -y wget build-essential 6 | 7 | RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 8 | 9 | RUN bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local 10 | 11 | RUN conda create -n beam-env python=3.8 12 | 13 | # Run the next commands inside the conda environment. 14 | RUN conda init bash 15 | RUN echo "conda activate beam-env" >> ~/.bashrc 16 | SHELL ["/bin/bash", "--login", "-c"] 17 | 18 | RUN conda install -q -y -c conda-forge rdkit pip 19 | 20 | RUN pip install --no-cache-dir apache-beam[gcp]==2.31.0 21 | 22 | # Copy files from official SDK image, including script/dependencies 23 | COPY --from=registry.hub.docker.com/apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam 24 | 25 | ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "beam-env", "/opt/apache/beam/boot"] 26 | -------------------------------------------------------------------------------- /tools/bigquery-indexer/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | """ 18 | Index/normalize/match various columns in BigQuery tables. 19 | 20 | Supports: 21 | * chemistry (SMILES) column fingerprinting for similarity search 22 | * (future) patent publication and application number normalization 23 | * (future) OCID matching 24 | """ 25 | 26 | import sys 27 | 28 | import argparse 29 | 30 | import apache_beam as beam 31 | 32 | from google.cloud import bigquery 33 | 34 | # The additional column suffixes added to each input row containing 'smiles'. 35 | fingerprint_columns = set(['morgan_fp', 'rdkit_fp', 'atompair_fp', 'tt_fp']) 36 | 37 | def index_row(row, skip_cols): 38 | orig_keys = list(row.keys()) 39 | for key in orig_keys: 40 | if 'smiles' in key and key not in skip_cols: 41 | fingerprints = generate_fingeprints(row[key]) 42 | for col, fp in fingerprints.items(): 43 | if col not in fingerprint_columns: 44 | raise RuntimeError(f'fingerprints generated column {col} not in {fingerprint_columns}') 45 | row[f'{key}_{col}'] = fp 46 | return row 47 | 48 | def generate_fingeprints(smiles): 49 | # Load these here so they're only needed on the worker machines. 
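    # RDKit is only installed in the custom beam-rdkit-runner image, so the
    # imports are deferred to call time: the pipeline can be constructed on a
    # machine without RDKit, and only the Dataflow workers pay the import cost.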
50 | from rdkit import Chem 51 | from rdkit.Chem import rdFingerprintGenerator 52 | 53 | morgan_fp = '' 54 | rdkit_fp = '' 55 | atompair_fp = '' 56 | tt_fp = '' 57 | 58 | try: 59 | mol = Chem.MolFromSmiles(smiles) 60 | 61 | # Morgan 62 | morgan_fp = rdFingerprintGenerator.GetMorganGenerator().GetFingerprint(mol).ToBase64() 63 | 64 | # Feature Morgan 65 | # TODO 66 | 67 | # RDKit 68 | rdkit_fp = rdFingerprintGenerator.GetRDKitFPGenerator().GetFingerprint(mol).ToBase64() 69 | 70 | # Layered 71 | # TODO 72 | 73 | # Atom pairs 74 | atompair_fp = rdFingerprintGenerator.GetAtomPairGenerator().GetFingerprint(mol).ToBase64() 75 | 76 | # MACCS 77 | # TODO 78 | 79 | # Topological Torsion 80 | tt_fp = rdFingerprintGenerator.GetTopologicalTorsionGenerator().GetFingerprint(mol).ToBase64() 81 | 82 | # Pattern 83 | # TODO 84 | 85 | # E-state 86 | # TODO 87 | 88 | except Exception as e: 89 | print(f'Exception {e} processing {smiles}') 90 | return {} 91 | # NOTE: add any new fingerprints to fingerprint_columns. 92 | return {'morgan_fp': morgan_fp, 'rdkit_fp': rdkit_fp, 'atompair_fp': atompair_fp, 'tt_fp': tt_fp} 93 | 94 | def get_query_output_schema(bq_client, query): 95 | try: 96 | # TODO: add support for accessing the schema to bq_client.query(). 97 | result = bq_client._connection.api_request( 98 | method="POST", 99 | path="/projects/jefferson-1790/queries", 100 | data={ 101 | "query": query, 102 | "dryRun": True, 103 | "useLegacySql": False, 104 | }) 105 | except Exception as exc: 106 | raise ValueError(f'Error testing SQL query "{query}"') from exc 107 | return result['schema'] 108 | 109 | def add_fingerprint_schema(orig_schema, skip_cols): 110 | to_add = [] 111 | for field in orig_schema['fields']: 112 | key = field['name'] 113 | if 'smiles' in key and key not in skip_cols: 114 | for fp_col in fingerprint_columns: 115 | to_add.append({ 116 | 'name': f'{key}_{fp_col}', 117 | 'type': 'BYTES', 118 | 'mode': 'NULLABLE', 119 | }) 120 | 121 | return {'fields': orig_schema['fields'] + to_add} 122 | 123 | def run(argv=None): # pylint: disable=missing-docstring 124 | parser = argparse.ArgumentParser() 125 | 126 | parser.add_argument( 127 | '--input_sql', 128 | dest='input_sql', 129 | default='', 130 | help='SQL statement to extract SMILES from. Fields containing `smiles` ' 131 | 'will generate fingerprints, and any additional fields will be ' 132 | 'passed through to the output row.') 133 | parser.add_argument( 134 | '--output_table', 135 | dest='output_table', 136 | required=True, 137 | help='Output BigQuery table with indexed chemistry.') 138 | parser.add_argument( 139 | '--skip_fingerprint_columns', 140 | dest='skip_fingerprint_columns', 141 | default=[], 142 | help='Column names to skip fingerprinting.') 143 | known_args, pipeline_args = parser.parse_known_args(argv) 144 | 145 | skip_cols = set(known_args.skip_fingerprint_columns.split(',')) 146 | 147 | # Query the output schema first so we know the schema to set. 148 | bq_client = bigquery.Client() 149 | 150 | # Get the output schema. 151 | orig_schema = get_query_output_schema(bq_client, known_args.input_sql) 152 | # Add the new fingerprint columns to the schema. 
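    # The dry-run schema describes only the pass-through columns from
    # --input_sql; add_fingerprint_schema() appends a NULLABLE BYTES column per
    # fingerprint type for every column whose name contains 'smiles' (except
    # --skip_fingerprint_columns), matching the keys emitted by index_row().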
153 | schema = add_fingerprint_schema(orig_schema, skip_cols) 154 | 155 | print(f'Output schema: {schema}') 156 | 157 | with beam.Pipeline(argv=pipeline_args) as p: 158 | input_rows = (p | 'Read' >> beam.io.Read(beam.io.ReadFromBigQuery( 159 | query=known_args.input_sql, 160 | use_standard_sql=True))) 161 | 162 | # Each row is a dictionary where the keys are the BigQuery columns 163 | fingerprints = input_rows | beam.Map( 164 | lambda row: index_row(row, skip_cols)) 165 | 166 | (fingerprints | 'Write' >> beam.io.WriteToBigQuery( 167 | known_args.output_table, 168 | schema=schema, 169 | write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, 170 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)) 171 | 172 | 173 | if __name__ == "__main__": 174 | run(sys.argv) 175 | -------------------------------------------------------------------------------- /tools/bq_bulk_cp.pysh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copy a bunch of bigquery tables matching a pattern to a new pattern. 16 | # bq_bulk_cp.pysh jefferson-1790:ebi_chembl.*_23 patents-public-data.ebi_chembl.* 17 | import sh 18 | import sys 19 | import re 20 | import argparse 21 | 22 | parser = argparse.ArgumentParser(description="Copy a set of BigQuery tables") 23 | parser.add_argument("--dry_run", default=False, action="store_true", help="do not copy") 24 | parser.add_argument("source", help="source table pattern, 'jefferson-1790:ebi_chembl.*_23'") 25 | parser.add_argument("target", help="target table pattern, 'patents-public-data:ebi_chembl.*_23'") 26 | args = parser.parse_args() 27 | 28 | source_dataset, source_pattern = args.source.split(".") 29 | if "*" in source_dataset: 30 | print("Wildcards are only supported on tables, not datasets.") 31 | sys.exit(1) 32 | 33 | # List all tables in a dataset. 34 | bq = sh.Command("bq") 35 | 36 | if "*" not in source_pattern: 37 | bq("cp", args.source, args.target) 38 | sys.exit(0) 39 | 40 | tables = bq("ls", "-n", "100000", source_dataset).stdout.split("\n")[2:] 41 | 42 | source_re = source_pattern.replace("*", "(.*)") 43 | 44 | for row in tables: 45 | if row == "": 46 | continue 47 | table = row.split()[0] 48 | 49 | match = re.match(source_re, table) 50 | if match: 51 | src = source_dataset + "." + table 52 | dest = args.target.replace("*", match.group(1)) 53 | print("bq cp %s %s" % (src, dest)) 54 | if not args.dry_run: 55 | bq("--debug_mode=true", "--headless=true", "cp", "--force", src, dest, _fg=True) 56 | else: 57 | print("Skipping %s" % source_dataset + "." + table) 58 | -------------------------------------------------------------------------------- /tools/bq_ls.pysh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # List datasets and tables matching a pattern. Collapse versions. 16 | # bq_ls.pysh jefferson-1790:* 17 | import sh 18 | import sys 19 | import re 20 | 21 | source_project = sys.argv[1] 22 | 23 | #source_project_dataset, source_pattern = source.split(".") 24 | #source_project, source_dataset = source_project_dataset.split(":") 25 | 26 | # List all tables in a dataset. 27 | bq = sh.Command("bq") 28 | 29 | datasets = bq("ls", source_project + ":").stdout.split("\n")[2:] 30 | 31 | for row in datasets: 32 | if row == "": 33 | continue 34 | dataset = row.split()[0] 35 | print("-"*50) 36 | print("Dataset: %s" % dataset) 37 | tables = bq("ls", "-n", "100000", source_project + ":" + dataset).stdout.split("\n")[2:] 38 | for x in tables: 39 | if x == "": 40 | continue 41 | table = x.split()[0] 42 | print("\t%s" % table) 43 | -------------------------------------------------------------------------------- /tools/csv_upload.pysh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Uploads one or more CSV files into one or more BigQuery tables. 16 | # 17 | # Single file, single table: 18 | # python3 csv_upload.pysh --source '~/Downloads/table.csv' --tables=jefferson-1790:dataset.table 19 | # 20 | # Multiple files, single table: 21 | # python3 csv_upload.pysh --source '~/Downloads/table_*.csv' --tables=jefferson-1790:dataset.table 22 | # 23 | # Multiple files per table, multiple tables: 24 | # python3 csv_upload.pysh --source '~/Downloads/patstat/Data/{}_part*.txt' --tables=jefferson-1790:epo_patstat.{} 25 | # table1_part00.txt, table1_part01.txt, ... 
-> jefferson-1790:epo_patstat.table1 26 | # table2_part00.txt -> jefferson-1790:epo_patstat.table2 27 | # etc 28 | import sys 29 | try: 30 | import sh 31 | except: 32 | print("Missing 'sh' library, run 'pip3 install sh'") 33 | sys.exit(1) 34 | import re 35 | import os 36 | import argparse 37 | import glob 38 | import hashlib 39 | import queue 40 | import io 41 | import csv 42 | 43 | parser = argparse.ArgumentParser(description="Upload a CSV file to a BigQuery table") 44 | parser.add_argument("--dry_run", default=False, action="store_true", help="Do not upload.") 45 | parser.add_argument("--bq_bin", default="bq", help="Path to the BigQuery CLI") 46 | parser.add_argument("--gsutil_bin", default="gsutil", help="Path to the GSUtil CLI") 47 | parser.add_argument("--project_id", default="", help="Google Cloud Project ID to store temporary Google Cloud Storage files in. If empty, uses the project from the table name.") 48 | parser.add_argument("--storage_bucket", default="", help="Google Cloud Storage bucket name. This bucket must be in the same region as --location. If empty, creates a new bucket under this project_id.") 49 | parser.add_argument("--overwrite", default=False, action="store_true", help="Overwrite the table if it exists.") 50 | parser.add_argument("--field_delimiter", default=",", help="Field delimiter between data.") 51 | parser.add_argument("--read_header", default=True, action="store_true", help="Set the schema from the first row of the first CSV input, else --header must be set.") 52 | parser.add_argument("--header", default="", help="Comma-separated header names for each column. Only for single tables.") 53 | parser.add_argument("--column_types", default="", help="Comma-separated types for each column. Only for single tables.") 54 | parser.add_argument("--location", default="US", help="Geographical location for the dataset, either US or EU. US is preferred, since JOINs must be between tables in the same region.") 55 | parser.add_argument("--tables", help="BigQuery destination tables. Use '{}' as a placeholder for a matching name in --sources ('project-id:dataset.{}').") 56 | parser.add_argument("--sources", help="CSV source file pattern. Use '{}' to generate multiple table names in --tables ('reg{}_part*.txt', '**/{}.csv').") 57 | args = parser.parse_args() 58 | 59 | # Argument checking. 60 | if not args.location in ["US", "EU"]: 61 | print("--location must be US or EU") 62 | ost.exit(1) 63 | 64 | # Find the source files and destinatination tables. 65 | sources = os.path.expanduser(args.sources) 66 | source_files = glob.glob(sources.replace("{}", "*")) 67 | 68 | table_files = {} 69 | 70 | file_re = sources.replace("*", ".*").replace("{}", "(.*)") + "$" 71 | for file in source_files: 72 | matches = re.search(file_re, file) 73 | if not matches: 74 | continue 75 | table_name = args.tables 76 | if "{}" in args.sources: 77 | table_part = matches.group(1) 78 | else: 79 | table_part = os.path.basename(file).replace('.', '_') 80 | table_name = args.tables.replace('{}', table_part) 81 | 82 | if table_name not in table_files: 83 | table_files[table_name] = [] 84 | table_files[table_name].append(file) 85 | 86 | for table in sorted(table_files.keys()): 87 | print(table) 88 | for v in table_files[table]: 89 | print(" " + v) 90 | 91 | if args.header and len(table_files) > 1: 92 | print("--header can only be set for a single table upload") 93 | os.exit(1) 94 | 95 | # Upload to bucket. 
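# BigQuery loads from Cloud Storage, so the files are first staged in a
# reusable bucket named "<project_id>-bq-uploads-tool" unless --storage_bucket
# is given; the bucket is created in a region matching --location so the later
# `bq load` does not read across regions, and staged objects are deleted once
# the table has been created.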
96 | # Clear bucket space 97 | gsutil = sh.Command(args.gsutil_bin) 98 | 99 | project_id = args.project_id 100 | if not project_id: 101 | if not ":" in args.tables: 102 | print("--tables must use project-id:dataset_name.table_name format") 103 | os.exit(1) 104 | project_id = args.tables.split(":")[0] 105 | print("Using --project_id=%s" % project_id) 106 | 107 | 108 | bucket = args.storage_bucket 109 | if not bucket: 110 | bucket = "%s-bq-uploads-tool" % project_id 111 | 112 | bucket = "gs://" + bucket 113 | 114 | try: 115 | gsutil("ls", bucket) 116 | print("Bucket %s exists" % bucket) 117 | except: 118 | if args.location == "EU": 119 | bucket_location = "europe-west1" 120 | else: 121 | bucket_location = "us-east1" 122 | 123 | mb_args = ["mb", "-c", "regional", "-l", bucket_location, "-p", project_id, bucket] 124 | print("gsutil %s" % mb_args) 125 | if not args.dry_run: 126 | gsutil(*mb_args) 127 | print("Created new bucket") 128 | 129 | # Split to 4G, gzip and upload CSV files. Skip the header lines. 130 | bq = sh.Command(args.bq_bin) 131 | 132 | buf = 8 * 2 ** 20 133 | 134 | class Splitter: 135 | def __init__(self, max_size, path): 136 | self.max_size = max_size 137 | self.path = path 138 | self.size = 0 139 | self.parts = 0 140 | self.upload_paths = [] 141 | self.upload_pipe = None 142 | self.upload_proc = None 143 | self.done_pipe = queue.Queue() 144 | 145 | def data(self, data_chunk): 146 | # Pipe this through to the gzip and upload commands. 147 | if not data_chunk: 148 | self.done() 149 | return 150 | chunk_size = len(data_chunk) 151 | if self.size + chunk_size > self.max_size: 152 | self.flush() 153 | self.size = 0 154 | self.upload_proc = None 155 | if self.upload_proc is None: 156 | self.upload_pipe = queue.Queue(maxsize=1) 157 | gzip_pipe = sh.gzip("-f", _in=self.upload_pipe, _in_bufsize=buf, _out_bufsize=buf, _bg=True) 158 | path_split = self.path + "_chunk%09d.gz" % self.parts 159 | self.upload_paths.append(path_split) 160 | self.parts += 1 161 | print("Uploading %s" % path_split) 162 | self.upload_proc = gsutil(gzip_pipe, "cp", "-", path_split, _in_bufsize=buf, _bg=True, _internal_bufsize=16 * 2 ** 20) 163 | print("Upload proc: %s" % self.upload_proc.pid) 164 | 165 | self.size += chunk_size 166 | print("%.4f GB" % (self.size / (2 ** 30))) 167 | self.upload_pipe.put(data_chunk) 168 | 169 | def done(self, *args): 170 | print("Splitter parent done") 171 | self.flush() 172 | self.done_pipe.put(True) 173 | 174 | def done_wait(self): 175 | # Block until done() is finished. 176 | self.done_pipe.get() 177 | 178 | def flush(self): 179 | print("Closing upload pipe") 180 | self.upload_pipe.put(None) 181 | print("Waiting for upload to finish") 182 | self.upload_proc.wait() 183 | print("Upload finished") 184 | 185 | 186 | for table in sorted(table_files.keys()): 187 | files = table_files[table] 188 | print("Uploading files for table %s" % table) 189 | uploaded_paths = [] 190 | for file in files: 191 | dest = bucket + "/%s_%s_%s" % (re.sub('[^a-zA-Z0-9_]', '', table), hashlib.sha1(file.encode('utf-8')).hexdigest(), os.path.basename(file)) 192 | # Split into 4G chunks, gzip and upload. 193 | print("Copying %s to %s..." % (file, dest)) 194 | if not args.dry_run: 195 | splitter = Splitter(4 * 2 ** 30, dest) # 4G 196 | # Read each file and forward the stream to splitter.data(chunk). 
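            # Splitter gzips the byte stream on the fly and pipes it into
            # `gsutil cp -`, starting a new "_chunkNNNNNNNNN.gz" object once
            # 4 GB of uncompressed input has been consumed, so nothing is
            # written to local disk.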
197 | with open(file, 'rb') as f: 198 | while True: 199 | chunk = f.read(buf) 200 | splitter.data(chunk) 201 | if not chunk: 202 | break 203 | splitter.done_wait() 204 | uploaded_paths.extend(splitter.upload_paths) 205 | else: 206 | uploaded_paths.append(dest + "...[dry run]") 207 | 208 | # Get the header. 209 | 210 | 211 | # build header and column datatypes 212 | header = args.header 213 | skip_leading_rows = 0 214 | if not header: 215 | if args.field_delimiter == "\\t": 216 | col_sep = "\t" 217 | elif args.field_delimiter == "\\s": 218 | col_sep = "\s" 219 | else: 220 | col_sep = args.field_delimiter 221 | with open(files[0], 'r') as f: 222 | header_list = next(csv.reader(f, delimiter=col_sep)) 223 | print(header_list) 224 | skip_leading_rows = 1 225 | else: 226 | header_list = header.split(",") 227 | # A column name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_) and 228 | # start with a letter. 229 | clean_headers = [] 230 | for header in header_list: 231 | h = re.sub('[^a-zA-Z0-9_]', '', header) 232 | if re.match('^[0-9].*$', h): 233 | h = "f" + h 234 | clean_headers.append(h) 235 | header_list = clean_headers 236 | 237 | print("Headers: %s" % header_list) 238 | 239 | column_types = args.column_types 240 | if not column_types: 241 | column_types_list = ['STRING'] * len(header_list) 242 | else: 243 | column_types_list = column_types.split(",") 244 | 245 | if len(header_list) != len(column_types_list): 246 | print("Number of header fields and column types must be equal.") 247 | os.exit(1) 248 | 249 | schema = ",".join([header_name + ":" + header_type.upper() for header_name, header_type in zip(header_list, column_types_list)]) 250 | 251 | # bq create table uploaded_paths 252 | bq_args = [ 253 | "--location", args.location, 254 | "--project_id", project_id, 255 | "load", 256 | "--source_format", "CSV", 257 | "--replace", 258 | "--field_delimiter", args.field_delimiter, 259 | "--schema", schema, 260 | "--allow_quoted_newlines", 261 | "--skip_leading_rows", "%d" % skip_leading_rows, 262 | table, 263 | ",".join(uploaded_paths), 264 | ] 265 | print("Creating table %s" % table) 266 | try: 267 | dataset = table.split(".")[0] 268 | bq("show", dataset) 269 | except: 270 | print("Creating dataset %s" % dataset) 271 | bq_mk_args = ["--location", args.location, "mk", "--project_id", project_id, dataset] 272 | print("bq %s" % bq_mk_args) 273 | if not args.dry_run: 274 | bq(*bq_mk_args) 275 | 276 | print("bq %s" % bq_args) 277 | if not args.dry_run: 278 | bq(*bq_args) 279 | print("Removing uploaded files %s" % uploaded_paths) 280 | gsutil("rm", *uploaded_paths) 281 | print("Done creating %s" % table) 282 | -------------------------------------------------------------------------------- /tools/dataset_berkeley_fung.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "Berkeley Fung": [ 4 | "erudite-marker-539:JEMS16.*" 5 | ] 6 | }, 7 | "groups": {}, 8 | "joins": { 9 | "family_id": [ 10 | "erudite-marker-539:JEMS16.patent_metadata_2|FamilyID" 11 | ] 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tools/dataset_ebi.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "European Bioinformatics Institute": [ 4 | "patents-public-data:ebi_chembl.*", 5 | "patents-public-data:ebi_surechembl.*" 6 | ] 7 | }, 8 | "groups": {}, 9 | "joins": { 10 | "publication_number": [ 11 | 
"patents-public-data:ebi_surechembl.match|publication_number", 12 | "patents-public-data:ebi_chembl.match_24|publication_number" 13 | ], 14 | "SureChEMBL patent_id": [ 15 | "patents-public-data:ebi_surechembl.match|patent_id", 16 | "+patents-public-data:ebi_surechembl.map|patent_id" 17 | ], 18 | "ChEMBL patent_no": [ 19 | "patents-public-data:ebi_chembl.match_24|patent_no", 20 | "+patents-public-data:ebi_chembl.product_patents_24|patent_no" 21 | ], 22 | "ChEMBL molregno": [ 23 | "+patents-public-data:ebi_chembl.compound_properties_24|molregno", 24 | "patents-public-data:ebi_chembl.*|molregno" 25 | ] 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tools/dataset_ifi.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "IFI Claims": [ 4 | "jefferson-1790:ifi_claims.xml", 5 | "jefferson-1790:ifi_claims.publications" 6 | ] 7 | }, 8 | "groups": { 9 | "jefferson-1790:ifi_claims.xml": "country" 10 | }, 11 | "joins": { 12 | "publication_number": [ 13 | "+jefferson-1790:ifi_claims.xml|publication_number" 14 | ] 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /tools/dataset_innography.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "CPA Global": [ 4 | "innography-174118:technical_standards.etsi" 5 | ] 6 | }, 7 | "groups": { 8 | "innography-174118:technical_standards.etsi": "TechnicalStandard" 9 | }, 10 | "joins": { 11 | "publication_number": [ 12 | "innography-174118:technical_standards.etsi|PublicationNumber" 13 | ] 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tools/dataset_other.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "Other": [ 4 | "patents-public-data:cpc.*", 5 | "patents-public-data:dsep.*", 6 | "patents-public-data:marec.*", 7 | "patents-public-data:usitc_investigations.*", 8 | "patents-public-data:worldbank_wdi.*" 9 | ] 10 | }, 11 | "groups": {}, 12 | "joins": { 13 | "publication_number": [ 14 | "patents-public-data:usitc_investigations.match|publication_number", 15 | "patents-public-data:marec.publications|publication_number" 16 | ], 17 | "family_id": [ 18 | "patents-public-data:dsep.disclosures_13|family_id" 19 | ] 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tools/dataset_public.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "Google Patents Public Datasets": [ 4 | "patents-public-data:patents.*", 5 | "patents-public-data:google_patents_research.*" 6 | ] 7 | }, 8 | "groups": { 9 | "patents-public-data:patents.publications": "country_code" 10 | }, 11 | "joins": { 12 | "publication_number": [ 13 | "+patents-public-data:patents.publications|publication_number", 14 | "patents-public-data:google_patents_research.publications|publication_number" 15 | ], 16 | "family_id": [ 17 | "+patents-public-data:patents.publications|family_id" 18 | ], 19 | "application_number": [ 20 | "+patents-public-data:patents.publications|application_number" 21 | ] 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /tools/dataset_report.pysh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Generate a report on the database tables. 16 | # 17 | # $ python3 dataset_report.pysh --project_id= --configs dataset_public.json dataset_uspto.json ... --output_dir=../tables --formats=pdf 18 | import sh 19 | import sys 20 | import re 21 | import os 22 | import json 23 | import collections 24 | import datetime 25 | import jinja2 26 | import argparse 27 | 28 | parser = argparse.ArgumentParser(description="Generate a set of documentation pages for BigQuery tables.") 29 | parser.add_argument("--project_id", help="Project ID used to query tables.") 30 | parser.add_argument("--configs", nargs="+", help="List of JSON configuration files.") 31 | parser.add_argument("--output_dir", help="Output directory for files.") 32 | parser.add_argument("--formats", help="Comma-separated list of output formats (pandoc-supported extensions)") 33 | args = parser.parse_args() 34 | 35 | if not args.output_dir: 36 | print("--output_dir is required") 37 | sys.exit(1) 38 | if not args.project_id: 39 | print("--project_id is required") 40 | sys.exit(1) 41 | 42 | output_dir = os.path.expanduser(args.output_dir) 43 | 44 | bq = sh.Command("bq") 45 | 46 | # Read config files. 47 | table_config = {} 48 | group_config = {} 49 | join_config = {} 50 | 51 | for name in args.configs: 52 | print("Reading config %s" % name) 53 | with open(os.path.expanduser(name), "r") as f: 54 | try: 55 | c = json.loads(f.read()) 56 | except Exception as e: 57 | print("Error parsing JSON (this is usually caused by a trailing comma)") 58 | raise e 59 | for k, v in c.get("tables", {}).items(): 60 | if k in table_config: 61 | table_config[k].extend(v) 62 | else: 63 | table_config[k] = v 64 | group_config.update(c.get("groups", {})) 65 | for k, v in c.get("joins", {}).items(): 66 | if k in join_config: 67 | join_config[k].extend(v) 68 | else: 69 | join_config[k] = v 70 | 71 | print(table_config) 72 | print(group_config) 73 | print(join_config) 74 | 75 | # Keep track of printed objects from __repr__. 
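# namedtuple() below builds small mutable record classes (Dataset, Table,
# Field, ...); because those objects reference each other, __repr__ uses this
# module-level set to avoid infinite recursion when printing.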
76 | __repr_recursion_set = None 77 | 78 | def namedtuple(name, field_list): 79 | fields = field_list.split(" ") 80 | def init(self, **kwargs): 81 | for k, v in kwargs.items(): 82 | if not k in fields: 83 | raise AttributeError("%s not in %s" % (k, fields)) 84 | setattr(self, k, v) 85 | def repr(self): 86 | global __repr_recursion_set 87 | top = False 88 | if not __repr_recursion_set: 89 | top = True 90 | __repr_recursion_set = set() 91 | if self in __repr_recursion_set: 92 | result = "%s<...>" % name 93 | else: 94 | __repr_recursion_set.add(self) 95 | result = "%s<%s>" % (name, ", ".join(["%s=%s" % (k, getattr(self, k)) for k in fields])) 96 | if top: 97 | __repr_recursion_set = None 98 | return result 99 | return type(name, (), dict({k: None for k in fields}, __init__=init, __repr__=repr)) 100 | 101 | Dataset = namedtuple("Dataset", "name last_updated tables") 102 | 103 | Table = namedtuple("Table", "name version dataset_description description dataset fields last_updated num_rows from_joins num_bytes old_version") 104 | 105 | Field = namedtuple("Field", "name table description type mode from_joins to_joins") 106 | 107 | Join = namedtuple("Join", "name from_field to_field percent num_rows join_stats sql") 108 | 109 | JoinStat = namedtuple("JoinStat", "percent num_rows key sample_value") 110 | 111 | datasets = collections.OrderedDict() 112 | 113 | def find_field(table_name, column): 114 | for dataset in datasets.values(): 115 | for t in dataset.tables: 116 | if t.name == table_name: 117 | for f in t.fields: 118 | if column == f.name: 119 | return f 120 | return None 121 | 122 | def ts_to_string(unix): 123 | return datetime.datetime.utcfromtimestamp(unix).strftime("%Y-%m-%d") 124 | 125 | def tsql(table): 126 | return table.replace(":", ".") 127 | 128 | # Fetch a list of all tables and schemas for those tables. 129 | for nice_name, table_fmts in table_config.items(): 130 | for table_fmt in table_fmts: 131 | dataset_name, table_name = table_fmt.split(".") 132 | if nice_name not in datasets: 133 | dataset = Dataset(name=nice_name) 134 | datasets[nice_name] = dataset 135 | else: 136 | dataset = datasets[nice_name] 137 | show_info = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "show", dataset_name).stdout.decode('utf-8')) 138 | 139 | if not dataset.tables: 140 | dataset.tables = [] 141 | 142 | print("Loading dataset %s" % dataset_name) 143 | tables = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "ls", "-n", "100000", dataset_name).stdout.decode('utf-8')) 144 | for table_data in tables: 145 | name = table_data["tableReference"]["tableId"] 146 | if re.match(table_name.replace("*", ".*"), name): 147 | table = Table(name=dataset_name + "." + name, dataset=dataset, dataset_description=show_info.get("description", "")) 148 | dataset.tables.append(table) 149 | print(table.name) 150 | 151 | # Detect table and dataset versions, mark older versions. 
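# A trailing "_<version>" suffix marks a versioned table: for example,
# "patents-public-data:ebi_chembl.compound_properties_24" has base name
# "...compound_properties" and version "24". Only the latest version of each base name
# (by string comparison) keeps its schema in the report; older versions are flagged
# below and skipped.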
152 | latest_table_base = {} # map[base name]latest name 153 | no_version_tables = {} 154 | for dataset in datasets.values(): 155 | for table in dataset.tables: 156 | def sub_fn(m): 157 | return m.group(1) 158 | m = re.match("^(.+)_([0-9]+[0-9a-zA-Z]*)", table.name) 159 | if not m: 160 | no_version_tables[table.name] = True 161 | latest_table_base[table.name] = table.name 162 | else: 163 | base = m.group(1) 164 | table.version = m.group(2) 165 | if not base in latest_table_base: 166 | latest_table_base[base] = table.name 167 | elif latest_table_base[base] < table.name and not no_version_tables.get(base, ""): 168 | latest_table_base[base] = table.name 169 | 170 | 171 | latest_tables = {} 172 | for latest in latest_table_base.values(): 173 | latest_tables[latest] = True 174 | 175 | for dataset in datasets.values(): 176 | for table in dataset.tables: 177 | if table.name not in latest_tables: 178 | table.old_version = True 179 | 180 | for dataset in datasets.values(): 181 | for table in dataset.tables: 182 | if table.old_version: 183 | print("Skipping old table %s" % table.name) 184 | continue 185 | print("Loading table %s" % table.name) 186 | table_info = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "show", table.name).stdout.decode('utf-8')) 187 | table_fields = [] 188 | def add_fields(parent, fields): 189 | for field in fields: 190 | name = field["name"] 191 | if parent: 192 | name = parent + "." + name 193 | table_fields.append(Field( 194 | name=name, 195 | table=table, 196 | description=field.get("description", ""), 197 | type=field.get("type", ""), 198 | mode=field.get("mode", ""), 199 | )) 200 | if "fields" in field: 201 | add_fields(name, field["fields"]) 202 | 203 | add_fields("", table_info["schema"]["fields"]) 204 | table.fields = table_fields 205 | table.description = table_info.get("description", "") 206 | table.last_updated = ts_to_string(int(table_info["lastModifiedTime"]) / 1000) 207 | if not dataset.last_updated or dataset.last_updated < table.last_updated: 208 | dataset.last_updated = table.last_updated 209 | table.num_rows = table_info["numRows"] 210 | table.num_bytes = table_info["numBytes"] 211 | # Possibly calculate group-by stats. 212 | if table.name in group_config: 213 | column = group_config[table.name] 214 | query = "SELECT COUNT(*) AS cnt, {column} AS grouped FROM `{table}` GROUP BY 2 ORDER BY 1".format(table=tsql(table.name), column=column) 215 | result = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "query", "--use_legacy_sql=false", query).stdout.decode('utf-8')) 216 | table.stats = {} 217 | for row in result: 218 | js = JoinStat(key=row["grouped"], num_rows=int(row["cnt"])) 219 | table.stats[js.key] = js 220 | 221 | 222 | # Support wildcards in join groups: dataset:*|molregno 223 | for join_group in join_config.values(): 224 | i = 0 225 | while i < len(join_group): 226 | if not "*" in join_group[i]: 227 | i += 1 228 | continue 229 | table_fmt, column_fmt = join_group[i].split("|") 230 | # Loop over all tables and columns and look for matches. 231 | matches = [] 232 | for dataset in datasets.values(): 233 | for table in dataset.tables: 234 | if not re.match(table_fmt.replace("*", ".*"), table.name) or table.old_version: 235 | continue 236 | for field in table.fields: 237 | if re.match(column_fmt.replace("*", ".*"), field.name): 238 | matches.append("%s|%s" % (table.name, field.name)) 239 | # Replace join_group[i] with the matched values. 
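# For example, the USPTO config's "patents-public-data:patentsview.*|patent_id" entry
# expands here into one "table|column" entry for every current patentsview table that
# has a patent_id field.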
240 | join_group.pop(i) 241 | for v in matches: 242 | join_group.insert(i, v) 243 | i += 1 244 | 245 | join_done = set() 246 | 247 | # Enumerate all possible joins inside each group of matching columns. 248 | for join_name, join_group in join_config.items(): 249 | for i in range(len(join_group)): 250 | self = join_group[i] 251 | for j in range(len(join_group)): 252 | if j == i: 253 | continue 254 | first_table, first_column = join_group[i].split("|") 255 | second_table, second_column = join_group[j].split("|") 256 | # Only join tables if one or more has a + as the prefix. 257 | if not first_table.startswith("+") and not second_table.startswith("+"): 258 | continue 259 | first_table = first_table.lstrip("+") 260 | second_table = second_table.lstrip("+") 261 | key = first_table + first_column + second_table + second_column 262 | if key in join_done or (first_table == second_table and first_column == second_column): 263 | continue 264 | join_done.add(key) 265 | print("Running join between %s and %s" % (join_group[i], join_group[j])) 266 | from_field = find_field(first_table, first_column) 267 | to_field = find_field(second_table, second_column) 268 | if not from_field or not to_field: 269 | raise TypeError("fields not found: %s:%s %s:%s" % (join_group[i], from_field is not None, join_group[j], to_field is not None)) 270 | group_by = group_config.get(first_table, None) 271 | if not group_by: 272 | query = """#standardSQL 273 | SELECT 274 | COUNT(*) AS cnt, 275 | COUNT(second.second_column) AS second_cnt, 276 | ARRAY_AGG(first.{first_column} IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 277 | FROM `{first_table}`AS first 278 | LEFT JOIN ( 279 | SELECT {second_column} AS second_column, COUNT(*) AS cnt 280 | FROM `{second_table}` 281 | GROUP BY 1 282 | ) AS second ON first.{first_column} = second.second_column""".format(first_table=tsql(first_table), first_column=first_column, second_table=tsql(second_table), second_column=second_column) 283 | else: 284 | query = """#standardSQL 285 | SELECT 286 | COUNT(*) AS cnt, 287 | COUNT(second.second_column) AS second_cnt, 288 | first.{group_by} AS grouped, 289 | ARRAY_AGG(first.{first_column} IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 290 | FROM `{first_table}`AS first 291 | LEFT JOIN ( 292 | SELECT {second_column} AS second_column, COUNT(*) AS cnt 293 | FROM `{second_table}` 294 | GROUP BY 1 295 | ) AS second ON first.{first_column} = second.second_column 296 | GROUP BY 3""".format(first_table=tsql(first_table), first_column=first_column, second_table=tsql(second_table), second_column=second_column, group_by=group_by) 297 | 298 | result = json.loads(bq("--format=prettyjson", "query", "--use_legacy_sql=false", query).stdout.decode('utf-8')) 299 | total_rows = 0 300 | joined_rows = 0 301 | 302 | join_stats = {} 303 | join = Join(name=join_name, from_field=from_field, to_field=to_field, join_stats=join_stats, sql=query) 304 | if not from_field.from_joins: 305 | from_field.from_joins = [] 306 | from_field.from_joins.append(join) 307 | if not to_field.to_joins: 308 | to_field.to_joins = [] 309 | to_field.to_joins.append(join) 310 | if not from_field.table.from_joins: 311 | from_field.table.from_joins = [] 312 | from_field.table.from_joins.append(join) 313 | for row in result: 314 | cnt = int(row["cnt"]) 315 | second_cnt = int(row["second_cnt"]) 316 | total_rows += cnt 317 | joined_rows += second_cnt 318 | if not group_by: 319 | join_stats[""] = JoinStat(percent=second_cnt / cnt, num_rows=second_cnt, key="all", 
sample_value=row["sample_value"]) 320 | else: 321 | join_stats[row["grouped"]] = JoinStat(percent=second_cnt / cnt, num_rows=second_cnt, key=row["grouped"], sample_value=row["sample_value"]) 322 | join.percent = joined_rows / total_rows 323 | join.num_rows = joined_rows 324 | 325 | def other_formats(name): 326 | if not args.formats: 327 | return 328 | for fmt in args.formats.split(","): 329 | sh.pandoc(name, "--from", "markdown", "-s", "-o", "%s.%s" % (name, fmt)) 330 | 331 | # "index.md" 332 | # Links to every dataset and description of each dataset 333 | # DOT graph of links between tables 334 | # Link statistics: % of rows that link together 335 | main_page_template = jinja2.Template(""" 336 | --- 337 | geometry: margin=0.6in 338 | --- 339 | 340 | # Datasets 341 | 342 | {% for dataset in datasets.values() %} 343 | ## [{{dataset.name}}](dataset_{{dataset.name}}.md) 344 | 345 | | Name | Last updated | Rows | Joins | 346 | |-------------------------------------------|-------|--------|-----------------| 347 | {% for table in dataset.tables -%} 348 | | [{{table.name}}](https://bigquery.cloud.google.com/table/{{table.name}}) | {% if table.last_updated %}{{table.last_updated }}{% endif %} | {% if table.num_rows %}{{"{0:,}".format(table.num_rows|int)}}{% endif %} | 349 | {%- if table.from_joins %}{% for group in table.from_joins|groupby("name") -%} 350 | {{group.grouper}} {% endfor %}{% endif %} | 351 | {% endfor %} 352 | {% endfor %} 353 | """) 354 | 355 | index_output = os.path.join(output_dir, "index.md") 356 | with open(index_output, "w") as f: 357 | f.write(main_page_template.render(datasets=datasets)) 358 | other_formats(index_output) 359 | 360 | # "dataset_.md" 361 | # Description of dataset 362 | # List of all tables in dataset 363 | # Sample rows in each table 364 | # Links to other datasets 365 | # Inner-dataset links 366 | # DOT graph of links 367 | dataset_page_template = jinja2.Template(""" 368 | --- 369 | geometry: margin=0.6in 370 | --- 371 | 372 | # {{dataset.name}} 373 | 374 | {% for table in dataset.tables %} 375 | ***** 376 | ## {{table.name}} 377 | 378 | {% if table.old_version %} 379 | Old table version `{{ table.version }}`, schema skipped. 
380 | {% else %} 381 | {% if table.dataset_description %} 382 | > {{table.dataset_description|replace("\n", "\n> ")}} 383 | {% endif %} 384 | {% if table.description %} 385 | > {{table.description|replace("\n", "\n> ")}} 386 | {% endif %} 387 | {% endif %} 388 | 389 | {% if table.fields %} 390 | | Stat | Value | 391 | |----------|----------| 392 | | Last updated | {{table.last_updated}} | 393 | | Rows | {{"{0:,}".format(table.num_rows|int)}} | 394 | | Size | {{table.num_bytes|filesizeformat}} | 395 | 396 | ### Schema 397 | [View in BigQuery](https://bigquery.cloud.google.com/table/{{table.name}}) 398 | 399 | {% for field in table.fields -%} 400 | * `{{field.name}}` {{field.type}} {{field.mode}} {% if field.from_joins %} joins on **{{ field.from_joins[0].name }}**{% endif %} 401 | {% if field.description %} 402 | > {{field.description|replace("\n", "\n> ")}} 403 | {% endif %} 404 | {% endfor %} 405 | 406 | {% if table.from_joins %}### Join columns{% endif %} 407 | {% for field in table.fields %} 408 | {% if field.from_joins %} 409 | #### {{field.name}} 410 | {% for join in field.from_joins %} 411 | joins to `{{ join.to_field.table.name }}::{{ join.to_field.name }}` on **{{ join.name }}** ({{"%.2f" % (100 * join.percent)}}%, {{"{0:,}".format(join.num_rows|int)}} rows) 412 | 413 | | Key | Percent | Rows | Sample values | 414 | |------|-----|--------|--------------------------------------------------------| 415 | {% for stat in join.join_stats.values() -%} 416 | | `{{stat.key}}` | {% if stat.percent > 0.0 %}{{"%.2f" % (100 * stat.percent)}}%{% else %}*none*{% endif %} | {{"{0:,}".format(stat.num_rows|int)}} | `{{stat.sample_value}}` | 417 | {% endfor %} 418 | 419 | {{join.sql|indent}} 420 | 421 | {% endfor %} 422 | {% for join in field.to_joins %} 423 | joins from `{{ join.from_field.table.name }}::{{ join.from_field.name }}` on **{{ join.name }}** ({{"%.2f" % (100 * join.percent)}}%, {{"{0:,}".format(join.num_rows|int)}} rows) 424 | {% endfor %} 425 | {% endif %} 426 | {% endfor %} 427 | {% endif %} 428 | 429 | {% endfor %} 430 | """) 431 | 432 | for dataset in datasets.values(): 433 | output = os.path.join(output_dir, "dataset_%s.md" % dataset.name) 434 | with open(output, "w") as f: 435 | f.write(dataset_page_template.render(dataset=dataset)) 436 | other_formats(output) 437 | -------------------------------------------------------------------------------- /tools/dataset_uspto.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "USPTO": [ 4 | "patents-public-data:patentsview.*", 5 | "patents-public-data:uspto_oce_assignment.*", 6 | "patents-public-data:uspto_oce_cancer.*", 7 | "patents-public-data:uspto_oce_claims.*", 8 | "patents-public-data:uspto_oce_litigation.*", 9 | "patents-public-data:uspto_oce_office_actions.*", 10 | "patents-public-data:uspto_oce_pair.*", 11 | "patents-public-data:uspto_peds.*", 12 | "patents-public-data:uspto_ptab.*" 13 | ] 14 | }, 15 | "groups": { 16 | "patents-public-data:patents.publications": "country_code" 17 | }, 18 | "joins": { 19 | "publication_number": [ 20 | "patents-public-data:patentsview.match|publication_number", 21 | "patents-public-data:uspto_oce_assignment.match|publication_number", 22 | "patents-public-data:uspto_oce_cancer.match|publication_number", 23 | "patents-public-data:uspto_oce_claims.match|publication_number" 24 | ], 25 | "family_id": [ 26 | "patents-public-data:uspto_oce_cancer.publications|Family_ID" 27 | ], 28 | "application_number": [ 29 | 
"patents-public-data:uspto_oce_office_actions.match_app|application_number", 30 | "patents-public-data:uspto_oce_pair.match|application_number", 31 | "patents-public-data:uspto_peds.match|application_number", 32 | "patents-public-data:uspto_ptab.match|application_number" 33 | ], 34 | "OCE Assignment pgpub_doc_num": [ 35 | "patents-public-data:uspto_oce_assignment.match|pgpub_doc_num", 36 | "+patents-public-data:uspto_oce_assignment.documentid|pgpub_doc_num" 37 | ], 38 | "OCE Assignment grant_doc_num": [ 39 | "patents-public-data:uspto_oce_assignment.match|grant_doc_num", 40 | "+patents-public-data:uspto_oce_assignment.documentid|grant_doc_num" 41 | ], 42 | "OCE Cancer id": [ 43 | "patents-public-data:uspto_oce_cancer.match|Patent_or_Publication_ID", 44 | "+patents-public-data:uspto_oce_cancer.publications|Patent_or_Publication_ID" 45 | ], 46 | "OCE Claims pat_no": [ 47 | "patents-public-data:uspto_oce_claims.match|pat_no", 48 | "+patents-public-data:uspto_oce_claims.patent_document_stats|pat_no" 49 | ], 50 | "OCE Claims pub_no": [ 51 | "patents-public-data:uspto_oce_claims.match|pub_no", 52 | "+patents-public-data:uspto_oce_claims.pgpub_document_stats|pub_no" 53 | ], 54 | "OCE OA app_id": [ 55 | "patents-public-data:uspto_oce_office_actions.match_app|app_id", 56 | "+patents-public-data:uspto_oce_office_actions.citations|app_id", 57 | "+patents-public-data:uspto_oce_office_actions.office_actions|app_id", 58 | "+patents-public-data:uspto_oce_office_actions.rejections|app_id" 59 | ], 60 | "OCE OA pub_id": [ 61 | "+patents-public-data:uspto_oce_office_actions.citations|parsed", 62 | "patents-public-data:uspto_oce_office_actions.match_pub|parsed" 63 | ], 64 | "OCE PAIR app_num": [ 65 | "patents-public-data:uspto_oce_pair.match|application_number_pair", 66 | "+patents-public-data:uspto_oce_pair.application_data|application_number" 67 | ], 68 | "PEDS app_num": [ 69 | "patents-public-data:uspto_peds.match|applicationNumberText", 70 | "+patents-public-data:uspto_peds.applications|patentCaseMetadata.applicationNumberText.electronicText" 71 | ], 72 | "PTAB app_num": [ 73 | "patents-public-data:uspto_ptab.match|ApplicationNumber", 74 | "+patents-public-data:uspto_ptab.trials|ApplicationNumber" 75 | ], 76 | "PatentsView patent_id": [ 77 | "+patents-public-data:patentsview.patent|id", 78 | "patents-public-data:patentsview.*|patent_id" 79 | ] 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /tools/generate_dataset_docs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Generate a report on the database tables. 16 | # 17 | # $ python3 dataset_report.pysh --project_id= --configs dataset_public.json dataset_uspto.json ... 
--output_dir=../tables --formats=pdf 18 | import sh 19 | import sys 20 | import re 21 | import os 22 | import json 23 | import collections 24 | import datetime 25 | import jinja2 26 | import argparse 27 | 28 | parser = argparse.ArgumentParser(description="Generate a set of documentation pages for BigQuery tables.") 29 | parser.add_argument("--project_id", help="Project ID used to query tables.") 30 | parser.add_argument("--configs", nargs="+", help="List of JSON configuration files.") 31 | parser.add_argument("--output_dir", help="Output directory for files.") 32 | args = parser.parse_args() 33 | 34 | if not args.output_dir: 35 | print("--output_dir is required") 36 | sys.exit(1) 37 | if not args.project_id: 38 | print("--project_id is required") 39 | sys.exit(1) 40 | 41 | output_dir = os.path.expanduser(args.output_dir) 42 | 43 | bq = sh.Command("bq") 44 | 45 | # Read config files. 46 | table_config = {} 47 | group_config = {} 48 | join_config = {} 49 | 50 | for name in args.configs: 51 | print("Reading config %s" % name) 52 | with open(os.path.expanduser(name), "r") as f: 53 | try: 54 | c = json.loads(f.read()) 55 | except Exception as e: 56 | print("Error parsing JSON (this is usually caused by a trailing comma)") 57 | raise e 58 | for k, v in c.get("tables", {}).items(): 59 | if k in table_config: 60 | table_config[k].extend(v) 61 | else: 62 | table_config[k] = v 63 | group_config.update(c.get("groups", {})) 64 | for k, v in c.get("joins", {}).items(): 65 | if k in join_config: 66 | join_config[k].extend(v) 67 | else: 68 | join_config[k] = v 69 | 70 | print(table_config) 71 | print(group_config) 72 | print(join_config) 73 | 74 | # Keep track of printed objects from __repr__. 75 | __repr_recursion_set = None 76 | 77 | def namedtuple(name, field_list): 78 | fields = field_list.split(" ") 79 | def init(self, **kwargs): 80 | for k, v in kwargs.items(): 81 | if not k in fields: 82 | raise AttributeError("%s not in %s" % (k, fields)) 83 | setattr(self, k, v) 84 | def repr(self): 85 | global __repr_recursion_set 86 | top = False 87 | if not __repr_recursion_set: 88 | top = True 89 | __repr_recursion_set = set() 90 | if self in __repr_recursion_set: 91 | result = "%s<...>" % name 92 | else: 93 | __repr_recursion_set.add(self) 94 | result = "%s<%s>" % (name, ", ".join(["%s=%s" % (k, getattr(self, k)) for k in fields])) 95 | if top: 96 | __repr_recursion_set = None 97 | return result 98 | return type(name, (), dict({k: None for k in fields}, __init__=init, __repr__=repr)) 99 | 100 | Dataset = namedtuple("Dataset", "name last_updated tables") 101 | 102 | Table = namedtuple("Table", "name version dataset_description description dataset fields last_updated num_rows from_joins num_bytes old_version") 103 | 104 | Field = namedtuple("Field", "name table description type mode from_joins to_joins") 105 | 106 | Join = namedtuple("Join", "name from_field to_field percent num_rows join_stats sql") 107 | 108 | JoinStat = namedtuple("JoinStat", "percent num_rows key sample_value") 109 | 110 | datasets = collections.OrderedDict() 111 | 112 | def find_field(table_name, column): 113 | for dataset in datasets.values(): 114 | for t in dataset.tables: 115 | if t.name == table_name: 116 | for f in t.fields: 117 | if column == f.name: 118 | return f 119 | return None 120 | 121 | def ts_to_string(unix): 122 | return datetime.datetime.utcfromtimestamp(unix).strftime("%Y-%m-%d") 123 | 124 | def tsql(table): 125 | return table.replace(":", ".") 126 | 127 | # Fetch a list of all tables and schemas for those tables. 
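# Each config entry has the form "<project>:<dataset>.<table pattern>", e.g.
# "patents-public-data:patentsview.*": the part before the "." is passed to
# "bq show"/"bq ls" as the dataset, and the remainder is matched against table IDs
# with "*" treated as a wildcard.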
128 | for nice_name, table_fmts in table_config.items(): 129 | for table_fmt in table_fmts: 130 | dataset_name, table_name = table_fmt.split(".") 131 | if nice_name not in datasets: 132 | dataset = Dataset(name=nice_name) 133 | datasets[nice_name] = dataset 134 | else: 135 | dataset = datasets[nice_name] 136 | show_info = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "show", dataset_name).stdout.decode('utf-8')) 137 | 138 | if not dataset.tables: 139 | dataset.tables = [] 140 | 141 | print("Loading dataset %s" % dataset_name) 142 | tables = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "ls", "-n", "100000", dataset_name).stdout.decode('utf-8')) 143 | for table_data in tables: 144 | name = table_data["tableReference"]["tableId"] 145 | if re.match(table_name.replace("*", ".*"), name): 146 | table = Table(name=dataset_name + "." + name, dataset=dataset, dataset_description=show_info.get("description", "")) 147 | dataset.tables.append(table) 148 | print(table.name) 149 | 150 | # Detect table and dataset versions, mark older versions. 151 | latest_table_base = {} # map[base name]latest name 152 | no_version_tables = {} 153 | for dataset in datasets.values(): 154 | for table in dataset.tables: 155 | def sub_fn(m): 156 | return m.group(1) 157 | m = re.match("^(.+)_([0-9]+[0-9a-zA-Z]*)", table.name) 158 | if not m: 159 | no_version_tables[table.name] = True 160 | latest_table_base[table.name] = table.name 161 | else: 162 | base = m.group(1) 163 | table.version = m.group(2) 164 | if not base in latest_table_base: 165 | latest_table_base[base] = table.name 166 | elif latest_table_base[base] < table.name and not no_version_tables.get(base, ""): 167 | latest_table_base[base] = table.name 168 | 169 | 170 | latest_tables = {} 171 | for latest in latest_table_base.values(): 172 | latest_tables[latest] = True 173 | 174 | for dataset in datasets.values(): 175 | for table in dataset.tables: 176 | if table.name not in latest_tables: 177 | table.old_version = True 178 | 179 | for dataset in datasets.values(): 180 | for table in dataset.tables: 181 | if table.old_version: 182 | print("Skipping old table %s" % table.name) 183 | continue 184 | print("Loading table %s" % table.name) 185 | table_info = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "show", table.name).stdout.decode('utf-8')) 186 | table_fields = [] 187 | def add_fields(parent, fields): 188 | for field in fields: 189 | name = field["name"] 190 | if parent: 191 | name = parent + "." + name 192 | table_fields.append(Field( 193 | name=name, 194 | table=table, 195 | description=field.get("description", ""), 196 | type=field.get("type", ""), 197 | mode=field.get("mode", ""), 198 | )) 199 | if "fields" in field: 200 | add_fields(name, field["fields"]) 201 | 202 | add_fields("", table_info["schema"]["fields"]) 203 | table.fields = table_fields 204 | table.description = table_info.get("description", "") 205 | table.last_updated = ts_to_string(int(table_info["lastModifiedTime"]) / 1000) 206 | if not dataset.last_updated or dataset.last_updated < table.last_updated: 207 | dataset.last_updated = table.last_updated 208 | table.num_rows = table_info["numRows"] 209 | table.num_bytes = table_info["numBytes"] 210 | # Possibly calculate group-by stats. 
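# The configs group patents-public-data:patents.publications by country_code, so this
# issues, for example:
#   SELECT COUNT(*) AS cnt, country_code AS grouped
#   FROM `patents-public-data.patents.publications` GROUP BY 2 ORDER BY 1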
211 | if table.name in group_config: 212 | column = group_config[table.name] 213 | query = "SELECT COUNT(*) AS cnt, {column} AS grouped FROM `{table}` GROUP BY 2 ORDER BY 1".format(table=tsql(table.name), column=column) 214 | result = json.loads(bq("--format=prettyjson", "--project_id", args.project_id, "query", "--use_legacy_sql=false", query).stdout.decode('utf-8')) 215 | table.stats = {} 216 | for row in result: 217 | js = JoinStat(key=row["grouped"], num_rows=int(row["cnt"])) 218 | table.stats[js.key] = js 219 | 220 | 221 | # Support wildcards in join groups: dataset:*|molregno 222 | for join_group in join_config.values(): 223 | i = 0 224 | while i < len(join_group): 225 | if not "*" in join_group[i]: 226 | i += 1 227 | continue 228 | table_fmt, column_fmt = join_group[i].split("|") 229 | # Loop over all tables and columns and look for matches. 230 | matches = [] 231 | for dataset in datasets.values(): 232 | for table in dataset.tables: 233 | if not re.match(table_fmt.replace("*", ".*"), table.name) or table.old_version: 234 | continue 235 | for field in table.fields: 236 | if re.match(column_fmt.replace("*", ".*"), field.name): 237 | matches.append("%s|%s" % (table.name, field.name)) 238 | # Replace join_group[i] with the matched values. 239 | join_group.pop(i) 240 | for v in matches: 241 | join_group.insert(i, v) 242 | i += 1 243 | 244 | join_done = set() 245 | 246 | # Enumerate all possible joins inside each group of matching columns. 247 | for join_name, join_group in join_config.items(): 248 | for i in range(len(join_group)): 249 | self = join_group[i] 250 | for j in range(len(join_group)): 251 | if j == i: 252 | continue 253 | first_table, first_column = join_group[i].split("|") 254 | second_table, second_column = join_group[j].split("|") 255 | # Only join tables if one or more has a + as the prefix. 
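# A "+" prefix in the config (e.g. "+patents-public-data:patents.publications|publication_number")
# marks the anchor column of its join group: every other column in the group is measured
# against an anchored column, rather than joining every pair of columns against each other.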
256 | if not first_table.startswith("+") and not second_table.startswith("+"): 257 | continue 258 | first_table = first_table.lstrip("+") 259 | second_table = second_table.lstrip("+") 260 | key = first_table + first_column + second_table + second_column 261 | if key in join_done or (first_table == second_table and first_column == second_column): 262 | continue 263 | join_done.add(key) 264 | print("Running join between %s and %s" % (join_group[i], join_group[j])) 265 | from_field = find_field(first_table, first_column) 266 | to_field = find_field(second_table, second_column) 267 | if not from_field or not to_field: 268 | raise TypeError("fields not found: %s:%s %s:%s" % (join_group[i], from_field is not None, join_group[j], to_field is not None)) 269 | group_by = group_config.get(first_table, None) 270 | if not group_by: 271 | query = """#standardSQL 272 | SELECT 273 | COUNT(*) AS cnt, 274 | COUNT(second.second_column) AS second_cnt, 275 | ARRAY_AGG(first.{first_column} IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 276 | FROM `{first_table}`AS first 277 | LEFT JOIN ( 278 | SELECT {second_column} AS second_column, COUNT(*) AS cnt 279 | FROM `{second_table}` 280 | GROUP BY 1 281 | ) AS second ON first.{first_column} = second.second_column""".format(first_table=tsql(first_table), first_column=first_column, second_table=tsql(second_table), second_column=second_column) 282 | else: 283 | query = """#standardSQL 284 | SELECT 285 | COUNT(*) AS cnt, 286 | COUNT(second.second_column) AS second_cnt, 287 | first.{group_by} AS grouped, 288 | ARRAY_AGG(first.{first_column} IGNORE NULLS ORDER BY RAND() LIMIT 5) AS sample_value 289 | FROM `{first_table}`AS first 290 | LEFT JOIN ( 291 | SELECT {second_column} AS second_column, COUNT(*) AS cnt 292 | FROM `{second_table}` 293 | GROUP BY 1 294 | ) AS second ON first.{first_column} = second.second_column 295 | GROUP BY 3""".format(first_table=tsql(first_table), first_column=first_column, second_table=tsql(second_table), second_column=second_column, group_by=group_by) 296 | 297 | result = json.loads(bq("--format=prettyjson", "query", "--use_legacy_sql=false", query).stdout.decode('utf-8')) 298 | total_rows = 0 299 | joined_rows = 0 300 | 301 | join_stats = {} 302 | join = Join(name=join_name, from_field=from_field, to_field=to_field, join_stats=join_stats, sql=query) 303 | if not from_field.from_joins: 304 | from_field.from_joins = [] 305 | from_field.from_joins.append(join) 306 | if not to_field.to_joins: 307 | to_field.to_joins = [] 308 | to_field.to_joins.append(join) 309 | if not from_field.table.from_joins: 310 | from_field.table.from_joins = [] 311 | from_field.table.from_joins.append(join) 312 | for row in result: 313 | cnt = int(row["cnt"]) 314 | second_cnt = int(row["second_cnt"]) 315 | total_rows += cnt 316 | joined_rows += second_cnt 317 | if not group_by: 318 | join_stats[""] = JoinStat(percent=second_cnt / cnt, num_rows=second_cnt, key="all", sample_value=row["sample_value"]) 319 | else: 320 | join_stats[row["grouped"]] = JoinStat(percent=second_cnt / cnt, num_rows=second_cnt, key=row["grouped"], sample_value=row["sample_value"]) 321 | join.percent = joined_rows / total_rows 322 | join.num_rows = joined_rows 323 | 324 | def other_formats(name): 325 | if not getattr(args, "formats", None):  # --formats is not defined for this script; skip pandoc conversion when unset 326 | return 327 | for fmt in args.formats.split(","): 328 | sh.pandoc(name, "--from", "markdown", "-s", "-o", "%s.%s" % (name, fmt)) 329 | 330 | # "index.md" 331 | # Links to every dataset and description of each dataset 332 | # DOT graph of links between tables 333 | # 
Link statistics: % of rows that link together 334 | main_page_template = jinja2.Template(""" 335 | --- 336 | geometry: margin=0.6in 337 | --- 338 | 339 | # Datasets 340 | 341 | {% for dataset in datasets.values() %} 342 | ## [{{dataset.name}}](dataset_{{dataset.name}}.md) 343 | 344 | | Name | Last updated | Rows | Joins | 345 | |-------------------------------------------|-------|--------|-----------------| 346 | {% for table in dataset.tables -%} 347 | | [{{table.name}}](https://bigquery.cloud.google.com/table/{{table.name}}) | {% if table.last_updated %}{{table.last_updated }}{% endif %} | {% if table.num_rows %}{{"{0:,}".format(table.num_rows|int)}}{% endif %} | 348 | {%- if table.from_joins %}{% for group in table.from_joins|groupby("name") -%} 349 | {{group.grouper}} {% endfor %}{% endif %} | 350 | {% endfor %} 351 | {% endfor %} 352 | """) 353 | 354 | index_output = os.path.join(output_dir, "index.md") 355 | with open(index_output, "w") as f: 356 | f.write(main_page_template.render(datasets=datasets)) 357 | other_formats(index_output) 358 | 359 | # "dataset_.md" 360 | # Description of dataset 361 | # List of all tables in dataset 362 | # Sample rows in each table 363 | # Links to other datasets 364 | # Inner-dataset links 365 | # DOT graph of links 366 | dataset_page_template = jinja2.Template(""" 367 | --- 368 | geometry: margin=0.6in 369 | --- 370 | 371 | # {{dataset.name}} 372 | 373 | {% for table in dataset.tables %} 374 | ***** 375 | ## {{table.name}} 376 | 377 | {% if table.old_version %} 378 | Old table version `{{ table.version }}`, schema skipped. 379 | {% else %} 380 | {% if table.dataset_description %} 381 | > {{table.dataset_description|replace("\n", "\n> ")}} 382 | {% endif %} 383 | {% if table.description %} 384 | > {{table.description|replace("\n", "\n> ")}} 385 | {% endif %} 386 | {% endif %} 387 | 388 | {% if table.fields %} 389 | | Stat | Value | 390 | |----------|----------| 391 | | Last updated | {{table.last_updated}} | 392 | | Rows | {{"{0:,}".format(table.num_rows|int)}} | 393 | | Size | {{table.num_bytes|filesizeformat}} | 394 | 395 | ### Schema 396 | [View in BigQuery](https://bigquery.cloud.google.com/table/{{table.name}}) 397 | 398 | {% for field in table.fields -%} 399 | * `{{field.name}}` {{field.type}} {{field.mode}} {% if field.from_joins %} joins on **{{ field.from_joins[0].name }}**{% endif %} 400 | {% if field.description %} 401 | > {{field.description|replace("\n", "\n> ")}} 402 | {% endif %} 403 | {% endfor %} 404 | 405 | {% if table.from_joins %}### Join columns{% endif %} 406 | {% for field in table.fields %} 407 | {% if field.from_joins %} 408 | #### {{field.name}} 409 | {% for join in field.from_joins %} 410 | joins to `{{ join.to_field.table.name }}::{{ join.to_field.name }}` on **{{ join.name }}** ({{"%.2f" % (100 * join.percent)}}%, {{"{0:,}".format(join.num_rows|int)}} rows) 411 | 412 | | Key | Percent | Rows | Sample values | 413 | |------|-----|--------|--------------------------------------------------------| 414 | {% for stat in join.join_stats.values() -%} 415 | | `{{stat.key}}` | {% if stat.percent > 0.0 %}{{"%.2f" % (100 * stat.percent)}}%{% else %}*none*{% endif %} | {{"{0:,}".format(stat.num_rows|int)}} | `{{stat.sample_value}}` | 416 | {% endfor %} 417 | 418 | {{join.sql|indent}} 419 | 420 | {% endfor %} 421 | {% for join in field.to_joins %} 422 | joins from `{{ join.from_field.table.name }}::{{ join.from_field.name }}` on **{{ join.name }}** ({{"%.2f" % (100 * join.percent)}}%, {{"{0:,}".format(join.num_rows|int)}} rows) 423 | 
{% endfor %} 424 | {% endif %} 425 | {% endfor %} 426 | {% endif %} 427 | 428 | {% endfor %} 429 | """) 430 | 431 | for dataset in datasets.values(): 432 | output = os.path.join(output_dir, "dataset_%s.md" % dataset.name) 433 | with open(output, "w") as f: 434 | f.write(dataset_page_template.render(dataset=dataset)) 435 | other_formats(output) 436 | -------------------------------------------------------------------------------- /tools/sqlite_dump.pysh: -------------------------------------------------------------------------------- 1 | # python sqlite_dump.pysh '/usr/local/google/home/wetherbeei/Downloads/chembl/chembl_24.db' '/usr/local/google/home/wetherbeei/Downloads/chembl/' 2 | import sh 3 | import sys 4 | import os.path 5 | 6 | db = sys.argv[1] 7 | dir = sys.argv[2] 8 | 9 | tables = sh.sqlite3("-csv", db, "SELECT name FROM sqlite_master WHERE type='table';").stdout.decode('utf-8').split("\n") 10 | 11 | for table in tables: 12 | if table == "": 13 | continue 14 | path = os.path.join(dir, table + ".csv") 15 | print("Dumping %s" % path) 16 | sh.sqlite3("-csv", "-header", db, "SELECT * FROM %s;" % table, _out=path) 17 | --------------------------------------------------------------------------------
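The dataset_*.json configs above all follow one small convention: table specs are written as "<project>:<dataset>.<table or * pattern>", join entries as "<table spec>|<column>", and a leading "+" marks the anchor column that the rest of its (merged) join group is measured against. Below is a minimal standalone sketch of a checker for that convention; it is not part of this repository, its filename and specific checks are illustrative assumptions, and it uses only the Python standard library (run it from the tools/ directory).

# check_dataset_configs.py -- illustrative helper, not part of this repository.
# Sanity-checks the convention used by tools/dataset_*.json:
#   table specs:  "<project>:<dataset>.<table or * pattern>"
#   join entries: "<table spec>|<column>", with a leading "+" marking the anchor side
#                 that the rest of the (merged) join group is measured against.
import glob
import json
import re
import sys

TABLE_SPEC = re.compile(r"^[\w-]+:\w+\.[\w.*]+$")

def main():
    errors = []
    merged_joins = {}  # join groups are merged across config files, as in the report scripts
    for path in sorted(glob.glob("dataset_*.json")):
        with open(path) as f:
            config = json.load(f)
        for specs in config.get("tables", {}).values():
            for spec in specs:
                if not TABLE_SPEC.match(spec):
                    errors.append("%s: bad table spec %r" % (path, spec))
        for name, group in config.get("joins", {}).items():
            merged_joins.setdefault(name, []).extend(group)
            for entry in group:
                spec, _, column = entry.partition("|")
                if not column or not TABLE_SPEC.match(spec.lstrip("+")):
                    errors.append("%s: bad join entry %r in group %r" % (path, entry, name))
    # After merging, a multi-entry group with no "+" anchor never produces a join query.
    for name, group in merged_joins.items():
        if len(group) > 1 and not any(entry.startswith("+") for entry in group):
            errors.append("join group %r has multiple entries but no '+' anchor" % name)
    for err in errors:
        print(err)
    return 1 if errors else 0

if __name__ == "__main__":
    sys.exit(main())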