├── .gitignore ├── LICENSE ├── Makefile ├── consolidate.py ├── get_invpat.py ├── integrate.py ├── integration ├── clean_integration.sh ├── consolidate_integration.sh ├── parse_integration.sh └── run_integration_tests.sh ├── lib ├── .gitignore ├── LICENSE_WC.txt ├── __init__.py ├── alchemy │ ├── README.md │ ├── __init__.py │ ├── config.ini │ ├── match.py │ ├── schema.py │ └── schema_func.py ├── argconfig_parse.py ├── assignee_disambiguation.py ├── config_parser.py ├── geoalchemy.py ├── geoalchemy_util.py ├── handlers │ ├── README.md │ ├── __init__.py │ ├── application_handler_v41.py │ ├── application_handler_v42.py │ ├── application_handler_v43.py │ ├── grant_handler_v42.py │ ├── grant_handler_v44.py │ ├── handler.py │ ├── xml_driver.py │ └── xml_util.py ├── lawyer_disambiguation.py ├── manual_replacement_library.txt ├── nber_substitutions.json ├── state_abbreviations.txt ├── tasks.py └── util │ ├── __init__.py │ ├── csv_reader.py │ └── getpatent.py ├── nber_substitutions.json ├── notebooks ├── MySQL.ipynb ├── Patent Database Stat Report.ipynb ├── README.md ├── buildpdf └── latex_nocode.tplx ├── parse.py ├── process.cfg ├── readme.md ├── requirements.txt ├── run_clean.sh ├── run_consolidation.sh ├── starcluster ├── README.md ├── built_tsv.py ├── config.ini ├── fetch_xml.py ├── load.sql ├── load_drop.sql ├── load_pre.sh └── urls.pickle ├── start.py ├── test ├── .gitignore ├── Makefile ├── __init__.py ├── colortest.py ├── colortest.rb ├── config.ini ├── fixtures │ ├── GNS │ │ └── geonames_10.txt │ ├── SAS │ │ └── national_file_head_20120204.txt │ ├── alchemy │ │ ├── alchemy.raw │ │ ├── ipg120103.xml │ │ └── ipg130416.xml │ ├── csv │ │ ├── .gitignore │ │ ├── gen_sample.csv │ │ ├── test.v0.csv │ │ ├── test.v10.csv │ │ ├── test.v2.csv │ │ ├── test.v3.csv │ │ ├── test.v4.csv │ │ ├── test.v5.csv │ │ ├── test.v6.csv │ │ ├── test.v6_alex.csv │ │ ├── test.v7.csv │ │ ├── test.v9.csv │ │ ├── test.v9_UC.csv │ │ └── typos.csv │ ├── goldstandard │ │ ├── .gitignore │ │ ├── 
benchmark.csv │ │ ├── benchmark.sh │ │ ├── benchmark_confirm.py │ │ ├── berkeleyinventors.csv │ │ ├── gs2011.sh │ │ ├── jamesrhunt.csv │ │ └── readme.md │ ├── ipgxml │ │ ├── ipg050104.small.xml │ │ ├── ipg060103.small.xml │ │ ├── ipg070102.small.xml │ │ ├── ipg080101.small.xml │ │ ├── ipg090106.small.xml │ │ ├── ipg100105.small.xml │ │ ├── ipg110104.small.xml │ │ ├── ipg120103.small.xml │ │ └── ipg130416.small.xml │ ├── sqlite3 │ │ └── combined.sqlite3 │ ├── text │ │ └── accented_characters.txt │ ├── unittest │ │ ├── .gitignore │ │ └── patent_two_parsed_general.xml │ └── xml │ │ ├── .gitignore │ │ ├── 2009_1.xml │ │ ├── 2009_10.xml │ │ ├── 2009_2.xml │ │ ├── 2009_3.xml │ │ ├── 2009_4.xml │ │ ├── 2009_5.xml │ │ ├── 2009_6.xml │ │ ├── 2009_7.xml │ │ ├── 2009_8.xml │ │ ├── 2009_9.xml │ │ ├── 2010_1.xml │ │ ├── 2010_10.xml │ │ ├── 2010_2.xml │ │ ├── 2010_3.xml │ │ ├── 2010_4.xml │ │ ├── 2010_5.xml │ │ ├── 2010_6.xml │ │ ├── 2010_7.xml │ │ ├── 2010_8.xml │ │ ├── 2010_9.xml │ │ ├── 2011_1.xml │ │ ├── 2011_10.xml │ │ ├── 2011_2.xml │ │ ├── 2011_3.xml │ │ ├── 2011_4.xml │ │ ├── 2011_5.xml │ │ ├── 2011_6.xml │ │ ├── 2011_7.xml │ │ ├── 2011_8.xml │ │ ├── 2011_9.xml │ │ ├── 2012_1.xml │ │ ├── 2012_10.xml │ │ ├── 2012_2.xml │ │ ├── 2012_3.xml │ │ ├── 2012_4.xml │ │ ├── 2012_5.xml │ │ ├── 2012_6.xml │ │ ├── 2012_7.xml │ │ ├── 2012_8.xml │ │ ├── 2012_9.xml │ │ ├── basic.xml │ │ ├── ipa061228.one.xml │ │ ├── ipa130117.one.xml │ │ ├── ipg100824-hyphenated.xml │ │ ├── ipg120327.18.xml │ │ ├── ipg120327.196.xml │ │ ├── ipg120327.one.xml │ │ ├── ipg120327.two.xml │ │ ├── pa040101.two.xml │ │ ├── patent_eight_parsed_.xml │ │ ├── patent_eight_unparsed_dna.xml │ │ ├── patent_five_parsed_.xml │ │ ├── patent_five_unparsed_dna.xml │ │ ├── patent_four_parsed_.xml │ │ ├── patent_four_unparsed_.xml │ │ ├── patent_nine_parsed_.xml │ │ ├── patent_nine_unparsed_dna.xml │ │ ├── patent_one_parsed_.xml │ │ ├── patent_one_unparsed_dna.xml │ │ ├── patent_seven_parsed_.xml │ │ ├── 
patent_seven_unparsed_dna.xml │ │ ├── patent_six_parsed_.xml │ │ ├── patent_six_unparsed_.xml │ │ ├── patent_ten_parsed_.xml │ │ ├── patent_ten_unparsed_dna.xml │ │ ├── patent_three_parsed_.xml │ │ ├── patent_three_unparsed_.xml │ │ └── patent_two_unparsed_.xml ├── integration │ ├── .gitignore │ ├── clean │ │ ├── ipg120327.18 │ │ │ ├── assignee.csv │ │ │ ├── lawyer.csv │ │ │ └── location.csv │ │ ├── ipg120327.one │ │ │ ├── assignee.csv │ │ │ ├── lawyer.csv │ │ │ └── location.csv │ │ └── ipg120327.two │ │ │ ├── assignee.csv │ │ │ ├── lawyer.csv │ │ │ └── location.csv │ ├── consolidate │ │ ├── ipg120327.18 │ │ │ └── disambiguator.csv │ │ └── ipg120327.two │ │ │ └── disambiguator.csv │ ├── parse │ │ ├── ipa061228.one │ │ │ ├── application.csv │ │ │ ├── claim.csv │ │ │ ├── ipcr.csv │ │ │ ├── mainclass.csv │ │ │ ├── rawassignee.csv │ │ │ ├── rawinventor.csv │ │ │ ├── rawlocation.csv │ │ │ ├── subclass.csv │ │ │ ├── uspc.csv │ │ │ └── usreldoc.csv │ │ ├── ipa130117.one │ │ │ ├── application.csv │ │ │ ├── claim.csv │ │ │ ├── ipcr.csv │ │ │ ├── mainclass.csv │ │ │ ├── rawassignee.csv │ │ │ ├── rawinventor.csv │ │ │ ├── rawlocation.csv │ │ │ ├── subclass.csv │ │ │ ├── uspc.csv │ │ │ └── usreldoc.csv │ │ ├── ipg120327.18 │ │ │ ├── application.csv │ │ │ ├── citation.csv │ │ │ ├── claim.csv │ │ │ ├── foreigncitation.csv │ │ │ ├── ipcr.csv │ │ │ ├── mainclass.csv │ │ │ ├── otherreference.csv │ │ │ ├── patent.csv │ │ │ ├── rawassignee.csv │ │ │ ├── rawinventor.csv │ │ │ ├── rawlawyer.csv │ │ │ ├── rawlocation.csv │ │ │ ├── subclass.csv │ │ │ ├── usapplicationcitation.csv │ │ │ ├── uspatentcitation.csv │ │ │ ├── uspc.csv │ │ │ └── usreldoc.csv │ │ ├── ipg120327.one │ │ │ ├── application.csv │ │ │ ├── citation.csv │ │ │ ├── claim.csv │ │ │ ├── foreigncitation.csv │ │ │ ├── ipcr.csv │ │ │ ├── mainclass.csv │ │ │ ├── otherreference.csv │ │ │ ├── patent.csv │ │ │ ├── rawassignee.csv │ │ │ ├── rawinventor.csv │ │ │ ├── rawlawyer.csv │ │ │ ├── rawlocation.csv │ │ │ ├── subclass.csv │ │ 
│ ├── usapplicationcitation.csv │ │ │ ├── uspatentcitation.csv │ │ │ ├── uspc.csv │ │ │ └── usreldoc.csv │ │ ├── ipg120327.two │ │ │ ├── application.csv │ │ │ ├── citation.csv │ │ │ ├── claim.csv │ │ │ ├── foreigncitation.csv │ │ │ ├── ipcr.csv │ │ │ ├── mainclass.csv │ │ │ ├── otherreference.csv │ │ │ ├── patent.csv │ │ │ ├── rawassignee.csv │ │ │ ├── rawinventor.csv │ │ │ ├── rawlawyer.csv │ │ │ ├── rawlocation.csv │ │ │ ├── subclass.csv │ │ │ ├── usapplicationcitation.csv │ │ │ ├── uspatentcitation.csv │ │ │ ├── uspc.csv │ │ │ └── usreldoc.csv │ │ └── pa040101.two │ │ │ ├── application.csv │ │ │ ├── claim.csv │ │ │ ├── ipcr.csv │ │ │ ├── mainclass.csv │ │ │ ├── rawassignee.csv │ │ │ ├── rawinventor.csv │ │ │ ├── rawlocation.csv │ │ │ ├── subclass.csv │ │ │ ├── uspc.csv │ │ │ └── usreldoc.csv │ └── readme.md ├── make_test_databases.py ├── patenttest.sh ├── process.cfg ├── readme.md ├── sqlitetest.py ├── test_alchemy.py ├── test_ascit.py ├── test_configuration.py ├── test_fwork.py ├── test_keylist.py ├── test_parse_file.py ├── test_separate_row_geocode.py ├── test_sqlite.py ├── test_sqlite_index.py ├── test_sqlite_merge.py ├── test_xml_driver.py └── test_xml_util.py └── vm ├── Vagrantfile └── manifests └── default.pp /.gitignore: -------------------------------------------------------------------------------- 1 | *-journal 2 | bmschema.txt 3 | dump.sql 4 | tmp* 5 | results*.txt 6 | .sass* 7 | .DS_* 8 | *~ 9 | *.swp 10 | *.csv 11 | tmp/ 12 | *.log 13 | invpat*.csv 14 | *.pyc 15 | *.sqlite3 16 | data/ 17 | fibo.py 18 | NBER_asg 19 | loctbl 20 | latex/ 21 | html/ 22 | *.project 23 | .pydevproject 24 | hashTbl 25 | test1/ 26 | test_loc.sql 27 | patentroot/ 28 | .settings/ 29 | XML/* 30 | *.s3 31 | *.db 32 | lib/alchemy/config.ini 33 | *.pickle 34 | build/ 35 | tags 36 | *.cmd 37 | 38 | *.pid # pid files 39 | dump.rdb # redis dump 40 | 41 | *.aux 42 | *.idx 43 | *.out 44 | *.pdf 45 | *.tex 46 | *_files 47 | *.ipynb_checkpoints 48 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 The Regents of the University of California, AMERICAN INSTITUTES FOR RESEARCH 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | clean: 3 | rm -rf *~ *.pyc *.log 4 | 5 | spotless: clean 6 | rm -rf *.sqlite3 tmp grant.db application.db *-journal disambiguator.csv 7 | 8 | -------------------------------------------------------------------------------- /get_invpat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2013 The Regents of the University of California, AMERICAN INSTITUTES FOR RESEARCH 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | """ 26 | """ 27 | @author Gabe Fierro gt.fierro@berkeley.edu github.com/gtfierro 28 | """ 29 | """ 30 | Creates the invpat file as seen and constructed in the Harvard DVN project 31 | """ 32 | from lib import alchemy 33 | import pandas as pd 34 | 35 | session_generator = alchemy.session_generator 36 | session = session_generator() 37 | 38 | #res = session.execute('select rawinventor.name_first, rawinventor.name_last, rawlocation.city, rawlocation.state, \ 39 | # rawlocation.country, rawinventor.sequence, patent.id, \ 40 | # year(application.date), year(patent.date), rawassignee.organization, uspc.mainclass_id, inventor.id \ 41 | # from rawinventor left join patent on patent.id = rawinventor.patent_id \ 42 | # left join application on application.patent_id = patent.id \ 43 | # left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \ 44 | # left join rawassignee on rawassignee.patent_id = patent.id \ 45 | # left join uspc on uspc.patent_id = patent.id \ 46 | # left join inventor on inventor.id = rawinventor.inventor_id \ 47 | # where uspc.sequence = 0;') 48 | res = session.execute('select rawinventor.name_first, rawinventor.name_last, location.city, location.state, \ 49 | location.country, rawinventor.sequence, patent.id, year(application.date), \ 50 | year(patent.date), rawassignee.organization, uspc.mainclass_id, inventor.id \ 51 | from rawinventor, rawlocation, patent, application, rawassignee, uspc, inventor,location \ 52 | where rawinventor.patent_id = patent.id and \ 53 | application.patent_id = patent.id and \ 54 | rawlocation.id = rawinventor.rawlocation_id and \ 55 | location.id = rawlocation.location_id and \ 56 | rawassignee.patent_id = patent.id and \ 57 | uspc.patent_id = patent.id and \ 58 | inventor.id = rawinventor.inventor_id;') 59 | data = pd.DataFrame.from_records(res.fetchall()) 60 | data = data.drop_duplicates((6,11)) 61 | data.columns = ['first_name', 'last_name', 'city', 'state', 'country', 'sequence', 'patent', 'app_year', 
'grant_year', 'assignee', 'mainclass', 'inventorid'] 62 | data.to_csv('invpat.csv',index=False,encoding='utf8') 63 | -------------------------------------------------------------------------------- /integration/clean_integration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Integration testing for the clean.py script 4 | 5 | cd .. 6 | 7 | echo 'Testing test/fixtures/xml/ipg120327.one.xml' 8 | make spotless > /dev/null 9 | mkdir -p tmp/integration/ipg120327.one 10 | ./parse.py -p test/fixtures/xml/ -x ipg120327.one.xml -o . 11 | ./run_clean.sh grant 12 | 13 | for table in assignee lawyer location 14 | do 15 | echo $table 'diffs...' 16 | sqlite3 -csv grant.db "select * from ${table}" > tmp/integration/ipg120327.one/${table}.csv 17 | # remove UUIDs from database dump because these change each time 18 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12},//' tmp/integration/ipg120327.one/${table}.csv 19 | diff test/integration/clean/ipg120327.one/${table}.csv tmp/integration/ipg120327.one/${table}.csv 20 | done 21 | 22 | echo 'Testing test/fixtures/xml/ipg120327.two.xml' 23 | make spotless > /dev/null 24 | mkdir -p tmp/integration/ipg120327.two 25 | ./parse.py -p test/fixtures/xml/ -x ipg120327.two.xml -o . 26 | ./run_clean.sh grant 27 | 28 | for table in assignee lawyer location 29 | do 30 | echo $table 'diffs...' 31 | sqlite3 -csv grant.db "select * from ${table}" > tmp/integration/ipg120327.two/${table}.csv 32 | # remove UUIDs from database dump because these change each time 33 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12},//' tmp/integration/ipg120327.two/${table}.csv 34 | diff test/integration/clean/ipg120327.two/${table}.csv tmp/integration/ipg120327.two/${table}.csv 35 | done 36 | 37 | echo 'Testing test/fixtures/xml/ipg120327.18.xml' 38 | make spotless > /dev/null 39 | mkdir -p tmp/integration/ipg120327.18 40 | ./parse.py -p test/fixtures/xml/ -x ipg120327.18.xml -o . 
41 | ./run_clean.sh grant 42 | 43 | for table in assignee lawyer location 44 | do 45 | echo $table 'diffs...' 46 | sqlite3 -csv grant.db "select * from ${table}" > tmp/integration/ipg120327.18/${table}.csv 47 | # remove UUIDs from database dump because these change each time 48 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12},//' tmp/integration/ipg120327.18/${table}.csv 49 | diff test/integration/clean/ipg120327.18/${table}.csv tmp/integration/ipg120327.18/${table}.csv 50 | done 51 | 52 | # clean up after we're done 53 | rm -rf tmp 54 | make spotless > /dev/null 55 | -------------------------------------------------------------------------------- /integration/consolidate_integration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Integration testing for the consolidate.py script 4 | 5 | cd .. 6 | 7 | ##### Two rows 8 | 9 | make spotless > /dev/null 10 | ./parse.py -p test/fixtures/xml/ -x ipg120327.two.xml -o . 11 | mkdir -p tmp/integration/ipg120327.two 12 | 13 | echo Starting clean... 14 | ./run_clean.sh grant 15 | 16 | echo Starting consolidate... 17 | python consolidate.py 18 | 19 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12}\t//' disambiguator.csv 20 | diff test/integration/consolidate/ipg120327.two/disambiguator.csv disambiguator.csv 21 | 22 | ### 18 rows 23 | 24 | make spotless > /dev/null 25 | ./parse.py -p test/fixtures/xml/ -x ipg120327.18.xml -o . 26 | mkdir -p tmp/integration/ipg120327.18 27 | 28 | echo Starting clean... 29 | ./run_clean.sh grant 30 | 31 | echo Starting consolidate... 
32 | python consolidate.py 33 | 34 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12}\t//' disambiguator.csv 35 | diff test/integration/consolidate/ipg120327.18/disambiguator.csv disambiguator.csv 36 | 37 | ## clean up after we're done 38 | rm -rf tmp 39 | make spotless > /dev/null 40 | -------------------------------------------------------------------------------- /integration/parse_integration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd .. 4 | 5 | echo 'Testing test/fixtures/xml/ipg120327.one.xml' 6 | make spotless > /dev/null 7 | mkdir -p tmp/integration/ipg120327.one 8 | ./parse.py -p test/fixtures/xml/ -x ipg120327.one.xml -o . 9 | 10 | for table in application uspatentcitation usapplicationcitation foreigncitation ipcr mainclass otherreference patent rawassignee rawinventor rawlawyer rawlocation subclass uspc usreldoc claim 11 | do 12 | echo $table 'diffs...' 13 | sqlite3 -csv grant.db "select * from ${table}" > tmp/integration/ipg120327.one/${table}.csv 14 | # remove UUIDs from database dump because these change each time 15 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12},//' tmp/integration/ipg120327.one/${table}.csv 16 | diff test/integration/parse/ipg120327.one/${table}.csv tmp/integration/ipg120327.one/${table}.csv 17 | done 18 | 19 | echo 'Testing test/fixtures/xml/ipg120327.two.xml' 20 | make spotless > /dev/null 21 | mkdir -p tmp/integration/ipg120327.two 22 | ./parse.py -p test/fixtures/xml/ -x ipg120327.two.xml -o . 23 | 24 | for table in application uspatentcitation usapplicationcitation foreigncitation ipcr mainclass otherreference patent rawassignee rawinventor rawlawyer rawlocation subclass uspc usreldoc claim 25 | do 26 | echo $table 'diffs...' 
27 | sqlite3 -csv grant.db "select * from ${table}" > tmp/integration/ipg120327.two/${table}.csv 28 | # remove UUIDs from database dump because these change each time 29 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12},//' tmp/integration/ipg120327.two/${table}.csv 30 | diff test/integration/parse/ipg120327.two/${table}.csv tmp/integration/ipg120327.two/${table}.csv 31 | done 32 | 33 | echo 'Testing test/fixtures/xml/ipg120327.18.xml' 34 | make spotless > /dev/null 35 | mkdir -p tmp/integration/ipg120327.18 36 | ./parse.py -p test/fixtures/xml/ -x ipg120327.18.xml -o . 37 | 38 | for table in application uspatentcitation usapplicationcitation foreigncitation ipcr mainclass otherreference patent rawassignee rawinventor rawlawyer rawlocation subclass uspc usreldoc claim 39 | do 40 | echo $table 'diffs...' 41 | sqlite3 -csv grant.db "select * from ${table}" > tmp/integration/ipg120327.18/${table}.csv 42 | # remove UUIDs from database dump because these change each time 43 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12},//' tmp/integration/ipg120327.18/${table}.csv 44 | diff test/integration/parse/ipg120327.18/${table}.csv tmp/integration/ipg120327.18/${table}.csv 45 | done 46 | 47 | echo 'Testing test/fixtures/xml/pa040101.two.xml' 48 | make spotless > /dev/null 49 | mkdir -p tmp/integration/pa040101.two 50 | ./parse.py -p test/fixtures/xml/ -x pa040101.two.xml -d application -o . 51 | 52 | for table in application mainclass subclass ipcr uspc claim usreldoc rawlocation rawinventor rawassignee 53 | do 54 | echo $table 'diffs...' 
55 | sqlite3 -csv -header application.db "select * from ${table}" > tmp/integration/pa040101.two/${table}.csv 56 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12},//' tmp/integration/pa040101.two/${table}.csv 57 | diff test/integration/parse/pa040101.two/${table}.csv tmp/integration/pa040101.two/${table}.csv 58 | done 59 | 60 | echo 'Testing test/fixtures/xml/ipa061228.one.xml' 61 | make spotless > /dev/null 62 | mkdir -p tmp/integration/ipa061228.one 63 | ./parse.py -p test/fixtures/xml/ -x ipa061228.one.xml -d application -o . 64 | 65 | for table in application mainclass subclass ipcr uspc claim usreldoc rawlocation rawinventor rawassignee 66 | do 67 | echo $table 'diffs...' 68 | sqlite3 -csv -header application.db "select * from ${table}" > tmp/integration/ipa061228.one/${table}.csv 69 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12},//' tmp/integration/ipa061228.one/${table}.csv 70 | diff test/integration/parse/ipa061228.one/${table}.csv tmp/integration/ipa061228.one/${table}.csv 71 | done 72 | 73 | echo 'Testing test/fixtures/xml/ipa130117.one.xml' 74 | make spotless > /dev/null 75 | mkdir -p tmp/integration/ipa130117.one 76 | ./parse.py -p test/fixtures/xml/ -x ipa130117.one.xml -d application -o . 77 | 78 | for table in application mainclass subclass ipcr uspc claim usreldoc rawlocation rawinventor rawassignee 79 | do 80 | echo $table 'diffs...' 
81 | sqlite3 -csv -header application.db "select * from ${table}" > tmp/integration/ipa130117.one/${table}.csv 82 | perl -pi -e 's/^[a-z0-9]{8}-([a-z0-9]{4}-){3}[a-z0-9]{12},//' tmp/integration/ipa130117.one/${table}.csv 83 | diff test/integration/parse/ipa130117.one/${table}.csv tmp/integration/ipa130117.one/${table}.csv 84 | done 85 | 86 | # clean up after we're done 87 | rm -rf tmp 88 | make spotless > /dev/null 89 | -------------------------------------------------------------------------------- /integration/run_integration_tests.sh: -------------------------------------------------------------------------------- 1 | bash parse_integration.sh 2 | bash clean_integration.sh 3 | bash consolidate_integration.sh 4 | -------------------------------------------------------------------------------- /lib/.gitignore: -------------------------------------------------------------------------------- 1 | patXML.py 2 | -------------------------------------------------------------------------------- /lib/LICENSE_WC.txt: -------------------------------------------------------------------------------- 1 | OPEN DATA LICENSE for MaxMind WorldCities and Postal Code Databases 2 | 3 | Copyright (c) 2008 MaxMind Inc. All Rights Reserved. 4 | 5 | The database uses toponymic information, based on the Geographic Names Data Base, containing official standard names approved by 6 | the United States 7 | Board on Geographic Names and maintained by the National Geospatial-Intelligence Agency. More information is available at the Maps and 8 | Geodata link at www.nga.mil. The National Geospatial-Intelligence Agency name, initials, and seal 9 | are protected by 10 United States 10 | Code Section 445. 11 | 12 | It also uses free population data from Stefan Helders www.world-gazetteer.com. 13 | Visit his website to download the free population data. Our database 14 | combines Stefan's population data with the list of all cities in the world. 
15 | 16 | All advertising materials and documentation mentioning features or use of 17 | this database must display the following acknowledgment: 18 | "This product includes data created by MaxMind, available from 19 | http://www.maxmind.com/" 20 | 21 | Redistribution and use with or without modification, are permitted provided 22 | that the following conditions are met: 23 | 1. Redistributions must retain the above copyright notice, this list of 24 | conditions and the following disclaimer in the documentation and/or other 25 | materials provided with the distribution. 26 | 2. All advertising materials and documentation mentioning features or use of 27 | this database must display the following acknowledgement: 28 | "This product includes data created by MaxMind, available from 29 | http://www.maxmind.com/" 30 | 3. "MaxMind" may not be used to endorse or promote products derived from this 31 | database without specific prior written permission. 32 | 33 | THIS DATABASE IS PROVIDED BY MAXMIND.COM ``AS IS'' AND ANY 34 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 35 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 36 | DISCLAIMED. IN NO EVENT SHALL MAXMIND.COM BE LIABLE FOR ANY 37 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 39 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 40 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 41 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 42 | DATABASE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
43 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/lib/__init__.py -------------------------------------------------------------------------------- /lib/alchemy/README.md: -------------------------------------------------------------------------------- 1 | README 2 | ====== 3 | 4 | #### Installation: 5 | 6 | If using an Ubuntu 12/13 environment, several packages are required (or beneficial) and should be installed before cloning the repository. 7 | 8 | ``` 9 | sudo apt-get install -y git 10 | sudo apt-get install -y redis-server 11 | sudo apt-get install -y python-pip 12 | sudo apt-get install -y python-zmq 13 | sudo apt-get install -y p7zip-full 14 | sudo apt-get install -y python-mysqldb 15 | sudo apt-get install -y python-Levenshtein 16 | ``` 17 | 18 | #### Installing the repository 19 | 20 | ``` 21 | git clone git@github.com:funginstitute/patentprocessor 22 | ``` 23 | 24 | After cloning, install the Python packages via pip 25 | 26 | ``` 27 | cd patentprocessor 28 | sudo pip install -r requirements.txt 29 | ``` 30 | 31 | Download: 32 | 33 | * [Location Table](https://s3.amazonaws.com/funginstitute/geolocation_data.sqlite3). Place this file in the `lib` directory 34 | 35 | #### Contributing to the repository 36 | 37 | Rather than cloning the repository directly, fork it and issue pull requests. To keep your personal fork up to date, set up `.git/config` to include upstream as follows: 38 | 39 | ``` 40 | ... 41 | 42 | [remote "upstream"] 43 | url = https://github.com/funginstitute/patentprocessor.git 44 | fetch = +refs/heads/*:refs/remotes/upstream/* 45 | [remote "origin"] 46 | fetch = +refs/heads/*:refs/remotes/origin/* 47 | url = git@github.com:[your_username]/patentprocessor.git 48 | 49 | ... 
50 | ``` 51 | 52 | Once that is complete, we can fetch and merge. 53 | 54 | ``` 55 | git fetch upstream 56 | git merge upstream/[branch] 57 | ``` 58 | 59 | Issue pull requests to the [FungInstitute GitHub](https://github.com/funginstitute/patentprocessor) repository and the originators will take a look at the code being modified. 60 | 61 | #### Some MySQL recipes specific to AWS: 62 | 63 | Export files into CSV (note that there is no space between `-p` and the password) 64 | 65 | ``` 66 | mysql -u [user] -p[passwd] --database=[db] --host=[host] --batch -e "select * from [table] limit 10" | sed 's/\t/","/g;s/^/"/;s/$/"/;s/\n//g' > [table].csv 67 | ``` 68 | 69 | Allow local file reading (`local-infile` is disabled by default for security reasons, so it must be set to 1 explicitly) 70 | 71 | ``` 72 | mysql -u [user] -p --local-infile=1 -h [host] [db] 73 | ``` 74 | 75 | #### Other notes 76 | 77 | * [Adding Indices to SQLAlchemy](http://stackoverflow.com/questions/6626810/multiple-columns-index-when-using-the-declarative-orm-extension-of-sqlalchemy) 78 | * [Ignoring Files in GIT](https://help.github.com/articles/ignoring-files) 79 | * [Permanently removing files in GIT](http://dalibornasevic.com/posts/2-permanently-remove-files-and-folders-from-a-git-repository) -------------------------------------------------------------------------------- /lib/alchemy/config.ini: -------------------------------------------------------------------------------- 1 | [global] 2 | database = sqlite 3 | echo = False 4 | 5 | [mysql] 6 | host = 7 | user = 8 | password = 9 | grant-database = 10 | application-database = 11 | 12 | [sqlite] 13 | grant-database = grant.db 14 | application-database = application.db 15 | path = . 
16 | refresh = True 17 | 18 | [assignee] 19 | threshold = 0.90 20 | 21 | [location] 22 | database = geolocation_data.sqlite3 23 | path = lib 24 | commit_frequency = 10000 25 | 26 | [lawyer] 27 | threshold = 0.9 28 | 29 | [parse] 30 | # if not specified, defaults to 0 (commits after all rows added) 31 | commit_frequency = 1000 32 | -------------------------------------------------------------------------------- /lib/alchemy/match.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2013 The Regents of the University of California, AMERICAN INSTITUTES FOR RESEARCH 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def _chunk_bounds(total, commit_frequency):
    """
    Works out how `total` records are split into batches.

    Returns a tuple (numgroups, size): the number of full batches and the
    number of records in each. A missing, zero or negative `commit_frequency`
    gives (0, 0), which makes the callers send every record in one final batch
    instead of dividing by zero.
    """
    if not commit_frequency or commit_frequency < 0:
        return 0, 0
    return total // commit_frequency, commit_frequency


def commit_inserts(session, insert_statements, table, is_mysql, commit_frequency = 1000):
    """
    Executes bulk inserts for a given table. This is typically much faster than going through
    the SQLAlchemy ORM. The insert_statement list of dictionaries may fall victim to SQLAlchemy
    complaining that certain columns are null, if you did not specify a value for every single
    column for a table.

    Args:
    session -- alchemy session object
    insert_statements -- list of dictionaries where each dictionary contains key-value pairs of the object
    table -- SQLAlchemy table object. If you have a table reference, you can use TableName.__table__
    is_mysql -- adjusts syntax based on if we are committing to MySQL or SQLite. You can use alchemy.is_mysql() to get this
    commit_frequency -- tune this for speed. Runs "session.commit" every `commit_frequency` items.
                        Zero (or a negative value) commits once, after all rows were added
    """
    if is_mysql:
        ignore_prefix = ("IGNORE",)
        # NOTE: the checks are switched off here and are not switched back on by this function
        session.execute("set foreign_key_checks = 0; set unique_checks = 0;")
        session.commit()
    else:
        ignore_prefix = ("OR IGNORE",)
    numgroups, size = _chunk_bounds(len(insert_statements), commit_frequency)
    for ng in range(numgroups):
        chunk = insert_statements[ng*size:(ng+1)*size]
        session.connection().execute(table.insert(prefixes=ignore_prefix), chunk)
        print("committing chunk {0} of {1} with length {2} at {3}".format(ng+1, numgroups, len(chunk), datetime.now()))
        session.commit()
    # whatever did not fill a complete batch (or everything, when batching is disabled)
    last_chunk = insert_statements[numgroups*size:]
    if last_chunk:
        print("committing last {0} records at {1}".format(len(last_chunk), datetime.now()))
        session.connection().execute(table.insert(prefixes=ignore_prefix), last_chunk)
        session.commit()


def commit_updates(session, update_key, update_statements, table, commit_frequency = 1000):
    """
    Executes bulk updates for a given table. This is typically much faster than going through
    the SQLAlchemy ORM. In order to be flexible, the update statements must be set up in a specific
    way. You can only update one column at a time. The dictionaries in the list `update_statements`
    must have two keys: `pk`, which is the primary_key for the record to be updated, and `update`
    which is the new value for the column you want to change. The column you want to change
    is specified as a string by the argument `update_key`.

    This method will work regardless if you run it over MySQL or SQLite, but with MySQL, it is
    usually faster to use the bulk_commit_updates method (see lib/tasks.py), because it uses
    a table join to do the updates instead of executing individual statements.

    Args:
    session -- alchemy session object
    update_key -- the name of the column we want to update
    update_statements -- list of dictionaries of updates. See above description
    table -- SQLAlchemy table object. If you have a table reference, you can use TableName.__table__
    commit_frequency -- tune this for speed. Runs "session.commit" every `commit_frequency` items.
                        Zero (or a negative value) commits once, after all rows were updated
    """
    # only the first primary key column is matched against `pk`
    primary_key = list(table.primary_key.columns.values())[0]
    update_key = table.columns[update_key]
    u = table.update().where(primary_key==bindparam('pk')).values({update_key: bindparam('update')})
    numgroups, size = _chunk_bounds(len(update_statements), commit_frequency)
    for ng in range(numgroups):
        chunk = update_statements[ng*size:(ng+1)*size]
        # each dictionary is handed over as its own parameter set
        session.connection().execute(u, *chunk)
        print("committing chunk {0} of {1} with length {2} at {3}".format(ng+1, numgroups, len(chunk), datetime.now()))
        session.commit()
    last_chunk = update_statements[numgroups*size:]
    if last_chunk:
        print("committing last {0} records at {1}".format(len(last_chunk), datetime.now()))
        session.connection().execute(u, *last_chunk)
        session.commit()
def fetch(clean, matchSet, session, default):
    """
    Looks up an existing `clean` record that matches the values in `default`.

    Each entry of `matchSet` is a collection of attribute names. The entries
    are tried in order; an entry is only considered when `default` holds a
    value for every name in it. The first record whose attributes equal the
    corresponding values in `default` is returned.

    Args:
    clean -- mapped class to query
    matchSet -- iterable of collections of attribute names of `clean`
    session -- alchemy session object
    default -- dictionary of attribute name -> value to match against

    Returns the first matching instance, or None when no entry of `matchSet`
    produces a match.
    """
    for keys in matchSet:
        # an entry cannot be tested unless every one of its keys has a value
        if any(k not in default for k in keys):
            continue
        query = session.query(clean)
        for k in keys:
            # Query.filter returns a new query instead of modifying the
            # receiver, so the result has to be kept or the criterion is lost
            query = query.filter(getattr(clean, k) == default[k])
        match = query.first()
        if match is not None:
            return match
    return None
class ArgHandler(object):
    """
    Parses the command line arguments of parse.py and exposes the results
    through the get_* accessors.
    """

    def __init__(self, arglist):
        """
        Builds the argument parser and parses `arglist`.

        Args:
        arglist -- list of command line argument strings (without the program name)
        """
        self.arglist = arglist

        # setup argparse
        self.parser = argparse.ArgumentParser(
                description='Specify source directory/directories for xml files to be parsed')
        self.parser.add_argument('--patentroot','-p', type=str, nargs='?',
                default=os.environ.get('PATENTROOT', '.'),
                help='root directory of all patent files')
        self.parser.add_argument('--xmlregex','-x', type=str,
                nargs='?',
                help=r'regex used to match xml files in the PATENTROOT directory. '
                     r'Defaults to ipg\d{6}.xml')
        self.parser.add_argument('--verbosity', '-v', type = int,
                nargs='?', default=0,
                help='Set the level of verbosity for the computation. The higher the '
                     'verbosity level, the less restrictive the print policy. 0 (default) '
                     '= error, 1 = warning, 2 = info, 3 = debug')
        self.parser.add_argument('--output-directory', '-o', type=str, nargs='?',
                default=os.environ.get('PATENTOUTPUTDIR', '.'),
                help='Set the output directory for the resulting sqlite3 files. Defaults '
                     'to the current directory "."')
        self.parser.add_argument('--document-type', '-d', type=str, nargs='?',
                default='grant',
                help='Set the type of patent document to be parsed: grant (default) '
                     'or application')

        # parse arguments and assign values
        args = self.parser.parse_args(self.arglist)
        self.xmlregex = args.xmlregex
        self.patentroot = args.patentroot
        self.output_directory = args.output_directory
        self.document_type = args.document_type
        if self.xmlregex is None: # set defaults for xmlregex here depending on doctype
            if self.document_type == 'grant':
                self.xmlregex = r"ipg\d{6}.xml"
            else:
                self.xmlregex = r"i?pa\d{6}.xml"

        # adjust verbosity levels based on specified input
        logging_levels = {0: logging.ERROR,
                          1: logging.WARNING,
                          2: logging.INFO,
                          3: logging.DEBUG}
        level = args.verbosity
        if level is None:
            # "-v" without a value: argparse stores None for nargs='?'
            level = 0
        # clamp so that out-of-range levels select the nearest defined one
        level = max(0, min(level, 3))
        self.verbosity = logging_levels[level]

    def get_xmlregex(self):
        """Returns the regex used to select xml files."""
        return self.xmlregex

    def get_patentroot(self):
        """Returns the root directory of the patent files."""
        return self.patentroot

    def get_verbosity(self):
        """Returns the `logging` level that corresponds to the requested verbosity."""
        return self.verbosity

    def get_output_directory(self):
        """Returns the directory the output files are written to."""
        return self.output_directory

    def get_document_type(self):
        """Returns the requested document type ('grant' unless specified otherwise)."""
        return self.document_type

    def get_help(self):
        """Prints the usage message and terminates the process with exit status 1."""
        self.parser.print_help()
        sys.exit(1)
Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | """ 26 | """ 27 | @author Gabe Fierro gt.fierro@berkeley.edu github.com/gtfierro 28 | """ 29 | """ 30 | Parses the process.cfg file 31 | """ 32 | import importlib 33 | from ConfigParser import ConfigParser 34 | 35 | defaults = {'parse': 'defaultparse', 36 | 'clean': 'True', 37 | 'consolidate': 'True', 38 | 'datadir': '/data/patentdata/patents/2013', 39 | 'grantregex': 'ipg\d{6}.xml', 40 | 'applicationregex': 'ipa\d{6}.xml', 41 | 'years': None, 42 | 'downloaddir' : None} 43 | 44 | def extract_process_options(handler, config_section): 45 | """ 46 | Extracts the high level options from the [process] section 47 | of the configuration file. 
Returns a dictionary of the options 48 | """ 49 | result = {} 50 | result['parse'] = handler.get('process','parse') 51 | result['clean'] = handler.get('process','clean') == 'True' 52 | result['consolidate'] = handler.get('process','consolidate') == 'True' 53 | result['doctype'] = handler.get(config_section,'doctype') 54 | return result 55 | 56 | def extract_parse_options(handler, config_section): 57 | """ 58 | Extracts the specific parsing options from the parse section 59 | as given by the [parse] config option in the [process] section 60 | """ 61 | options = {} 62 | options['datadir'] = handler.get(config_section,'datadir') 63 | options['grantregex'] = handler.get(config_section,'grantregex') 64 | options['applicationregex'] = handler.get(config_section, 'applicationregex') 65 | options['years'] = handler.get(config_section,'years') 66 | options['downloaddir'] = handler.get(config_section,'downloaddir') 67 | if options['years'] and options['downloaddir']: 68 | options['datadir'] = options['downloaddir'] 69 | return options 70 | 71 | def get_config_options(configfile): 72 | """ 73 | Takes in a filepath to a configuration file, returns 74 | two dicts representing the process and parse configuration options. 
75 | See `process.cfg` for explanation of the optiosn 76 | """ 77 | handler = ConfigParser(defaults) 78 | try: 79 | handler.read(configfile) 80 | except IOError: 81 | print('Error reading config file ' + configfile) 82 | exit() 83 | process_config = extract_process_options(handler, 'process') 84 | parse_config = extract_parse_options(handler, process_config['parse']) 85 | return process_config, parse_config 86 | 87 | def get_dates(yearstring): 88 | """ 89 | Given a [yearstring] of forms 90 | year1 91 | year1-year2 92 | year1,year2,year3 93 | year1-year2,year3-year4 94 | Creates tuples of dates 95 | """ 96 | years = [] 97 | for subset in yearstring.split(','): 98 | if subset == 'default': 99 | years.append('default') 100 | continue 101 | sublist = subset.split('-') 102 | # left-justify the strings with 0s to add support 103 | # for days and weeks in the date 104 | start = int(sublist[0].ljust(8,'0')) 105 | end = int(sublist[1].ljust(8,'0')) if len(sublist) > 1 else float('inf') 106 | years.append((start,end)) 107 | return years 108 | 109 | 110 | def get_xml_handlers(configfile, document_type='grant'): 111 | """ 112 | Called by parse.py to generate a lookup dictionary for which parser should 113 | be used for a given file 114 | """ 115 | handler = ConfigParser() 116 | handler.read(configfile) 117 | xmlhandlers = {} 118 | config_item = 'grant-xml-handlers' if document_type == 'grant' \ 119 | else 'application-xml-handlers' 120 | for yearrange, handler in handler.items(config_item): 121 | for year in get_dates(yearrange): 122 | try: 123 | xmlhandlers[year] = importlib.import_module(handler) 124 | except: 125 | importlib.sys.path.append('..') 126 | xmlhandlers[year] = importlib.import_module(handler) 127 | return xmlhandlers 128 | -------------------------------------------------------------------------------- /lib/handlers/README.md: -------------------------------------------------------------------------------- 1 | # XML Handlers 2 | 3 | A handler for parsing USPTO 
XML files must provide the following interface in 4 | order to be immediately compatible with the rest of the toolchain. 5 | 6 | DOCUMENTATION COMING 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /lib/handlers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/lib/handlers/__init__.py -------------------------------------------------------------------------------- /lib/handlers/handler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2013 The Regents of the University of California, AMERICAN INSTITUTES FOR RESEARCH 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. 
class Patobj(object):
    """Empty container; PatentHandler.get_patobj fills in its attributes."""
    pass

class PatentHandler(object):
    """
    Base class for handlers that list the names of their exported
    attributes in `self.attributes`.
    """
    def get_patobj(self):
        """
        Returns a new Patobj that carries a copy of every attribute named
        in `self.attributes`, read from this handler.
        """
        snapshot = Patobj()
        vars(snapshot).update((name, getattr(self, name)) for name in self.attributes)
        return snapshot
# Patterns used by escape_html_nosub, compiled once.
# "<" that does not open <sub> or </sub>
_LT_NOT_SUB = re.compile(r'<(?!/?sub>)', flags=re.I)
# ">" that does not close <sub> or </sub>
_GT_NOT_SUB = re.compile(r'(?<!sub)>', flags=re.I)
# "&" that does not start one of the entities &amp; &lt; &gt;
_BARE_AMP = re.compile(r'&(?!(amp;|lt;|gt;))', flags=re.I)


def _escape_markup(string):
    """
    Replaces &, < and > with &amp;, &lt; and &gt; (quotes are left alone).
    Same result as the legacy cgi.escape(string), which newer Python
    versions no longer provide.
    """
    # "&" has to go first so the entities produced below are not escaped again
    return string.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def flatten(ls_of_ls):
    """
    Takes in a list of lists, returns a new list of lists
    where list `i` contains the `i`th element from the original
    set of lists. The result is as long as the shortest input list.
    """
    return [list(group) for group in zip(*ls_of_ls)]

def extend_padding(ls_of_ls, padding=''):
    """
    Takes in a lists of lists, returns a new list of lists
    where each list is padded up to the length of the longest
    list by [padding] (defaults to the empty string).

    The inner lists are extended in place; the returned outer list holds
    the same list objects. An empty input gives an empty list.
    """
    if not ls_of_ls:
        return []
    maxlen = max(len(ls) for ls in ls_of_ls)
    newls = []
    for ls in ls_of_ls:
        ls.extend([padding]*(maxlen - len(ls)))
        newls.append(ls)
    return newls

def escape_html_nosub(string):
    """
    Escapes html sequences (e.g. <b></b>) that are not the known idiom
    for subscript: <sub>...</sub>

    "&" is turned into "&amp;" unless it already starts &amp;, &lt; or &gt;.
    """
    # NOTE(review): the ">" pattern and the three replacement entities were
    # garbled in the reviewed copy of this file and have been reconstructed
    # from the surviving patterns -- confirm against the upstream source
    string = _BARE_AMP.sub('&amp;', string)
    string = _LT_NOT_SUB.sub('&lt;', string)
    string = _GT_NOT_SUB.sub('&gt;', string)
    return string

def has_content(l):
    """
    Returns true if list [l] contains any non-null objects
    """
    return any(l)

def normalize_utf8(string):
    """
    Returns [string] as NFC-normalized unicode text. Accepts both unicode
    and byte strings; byte strings are decoded as UTF-8 first.
    """
    if isinstance(string, bytes):
        string = string.decode('utf-8')
    return normalize('NFC', string)

def remove_escape_sequences(string):
    """
    Replaces every contiguous run of the characters "\\r\\n\\t\\v\\b\\f\\a "
    (whitespace, backspace, bell and space) with a single space.
    """
    escape_seqs = r'[\r\n\t\v\b\f\a ]+'
    return re.sub(escape_seqs,' ', string)

def translate_underscore(string, lower=False):
    """
    Replaces the underscore HTML idiom <sub>&#x2014;</sub> with the literal
    underscore character _. If [lower] is set, the string is lowercased first.
    """
    if lower:
        string = string.lower()
    # NOTE(review): the markup inside these three literals was stripped from
    # the reviewed copy of this file (it read '', '-' and u'\u2014'); the
    # <sub> wrappers are a reconstruction -- confirm against the upstream source
    return string.replace('<sub>&#x2014;</sub>','_').replace('<sub>-</sub>','_').replace(u'<sub>\u2014</sub>','_')


def escape_html(string):
    """
    Escapes &, < and > in the string after applying translate_underscore
    """
    s = translate_underscore(string)
    return _escape_markup(s)

def normalize_document_identifier(identifier):
    """
    [identifier] is a string representing the document-id field from an XML document.

    Drops a single zero that directly follows the leading run of capital
    letters (which may be empty). Returns '' for an empty or missing identifier.
    """
    if not identifier: return ''
    return re.sub(r'([A-Z]*)0?',r'\g<1>',identifier,count=1)

def associate_prefix(firstname, lastname):
    """
    Prepends everything after the first space-delineated word in [firstname] to
    [lastname]. Returns the tuple (first word of firstname, new lastname).
    """
    if ' ' in firstname:
        name, prefix = firstname.split(' ',1) # split on first space
    else:
        name, prefix = firstname, ''
    # only separate with a space when there is something to prepend
    last = prefix + ' ' + lastname if prefix else lastname
    return name, last

def clean(string, upper=True):
    """
    Applies a subset of the above functions in the correct order
    and returns the string in all uppercase (unless [upper] is false).

    After escaping, &amp; is changed back to &.
    """
    string = normalize_utf8(string)
    string = remove_escape_sequences(string)
    string = translate_underscore(string)
    string = escape_html(string)
    # NOTE(review): both entity literals were decoded in the reviewed copy of
    # this file (it read replace(" ", " ").replace("&", "&")); "&nbsp;" and
    # "&amp;" are a reconstruction -- confirm against the upstream source
    string = string.replace("&nbsp;", " ").replace("&amp;", "&")
    if upper:
        return string.upper()
    else:
        return string

def augment_class(string):
    """
    Given a [string] representing the contents of a classification tag
    (see USPTO XML Documentation 4.2 or later), realize the semantic meaning
    of the string and return a string of form recognized by USPTO:
    mainclass/subclass, or mainclass/subclass.subclass2 when [string] is
    longer than six characters.
    """
    mainclass = string[:3]
    subclass1 = string[3:6]
    subclass2 = string[6:]
    if subclass2:
        return "{0}/{1}.{2}".format(mainclass, subclass1, subclass2)
    return "{0}/{1}".format(mainclass, subclass1)
80 | {grave over (r)}|r 81 | # s 82 | ś|ś 83 | ŝ|ŝ 84 | {haeck over (S)}|Š 85 | {hacek over (S)}|Š 86 | {hacek over (s)}|š 87 | {haeck over (s)}|š 88 | {umlaut over (S)}|S 89 | /s/ | sous 90 | /s/|-sur- 91 | /S/|-sur- 92 | {dot over (s)}|s 93 | # t 94 | {dot over (T)}okyo|Tokyo 95 | # u 96 | ũ|ũ 97 | /U/ |Ü 98 | /u/ |ü 99 | /u/|ü 100 | {overscore (u)}|ü 101 | {hacek over (u)}|ǔ 102 | {dot over (u)}|u 103 | # v 104 | # w 105 | # x 106 | # y 107 | # z 108 | Ź|Ź 109 | ź|ź 110 | {hacek over (Z)}|Ž 111 | # Misspellings 112 | a/d/|aan den 113 | {hacek over (A)}lta|Älta 114 | {acute over (B)}uehl|Bühl 115 | {umlaut over (C)}ernilov|Černilov 116 | O/ d.ANG.kra|Ödåkra 117 | {haeck over (u)}ttenre{haeck over (u)}th|Uttenreuth 118 | D{haeck over (u)}sseldorf|Düsseldorf 119 | P{haeck over (u)}choen|Pocheon 120 | Gro{burgwedel|Großburgwedel 121 | B{umlaut over (j)}rringbro|Bjerringbro 122 | B{acute over (j)}árred|Bjärred 123 | Defreggerstra{e|Defreggerstraße 124 | Vaster{s|Västerås 125 | # other 126 | .cent.| 127 | #∘|∘ 128 | =|= 129 | #|# 130 | +|+ 131 | & Engraving;|& Engraving 132 | {umlaut over ( )}| 133 | {acute over (æ)}|æ 134 | “A”-Cdad. 
de| 135 | all of| 136 | all Of| 137 | All of| 138 | al of | 139 | Both of| 140 | BOTH OF| 141 | both of| 142 | bot of | 143 | both ot| 144 | late of | 145 | LATE OF | 146 | # greek 147 | &agr;|α 148 | &bgr;|β 149 | [|[ 150 | &mgr;|μ 151 | &phgr;|φ 152 | &pgr;|π 153 | ]|] 154 | &thgr;|θ 155 | # URL custom characters 156 | |Ç 157 | |Ç 158 | |i 159 | |i 160 | |Ł 161 | |ș 162 | |ș 163 | |ș 164 | |ș 165 | |ș 166 | |ș 167 | |ș 168 | #Note: should be º, but this is not a valid character so ignore it 169 | | -------------------------------------------------------------------------------- /lib/state_abbreviations.txt: -------------------------------------------------------------------------------- 1 | Alabama|AL 2 | Alaska|AK 3 | Arizona|AZ 4 | Arkansas|AR 5 | California|CA 6 | Colorado|CO 7 | Connecticut|CT 8 | Delaware|DE 9 | Florida|FL 10 | Georgia|GA 11 | Hawaii|HI 12 | Idaho|ID 13 | Illinois|IL 14 | Indiana|IN 15 | Iowa|IA 16 | Kansas|KS 17 | Kentucky|KY 18 | Louisiana|LA 19 | Maine|ME 20 | Maryland|MD 21 | Massachusetts|MA 22 | Michigan|MI 23 | Minnesota|MN 24 | Mississippi|MS 25 | Missouri|MO 26 | Montana|MT 27 | Nebraska|NE 28 | Nevada|NV 29 | New Hampshire|NH 30 | New Jersey|NJ 31 | New Mexico|NM 32 | New York|NY 33 | North Carolina|NC 34 | North Dakota|ND 35 | Ohio|OH 36 | Oklahoma|OK 37 | Oregon|OR 38 | Pennsylvania|PA 39 | Rhode Island|RI 40 | South Carolina|SC 41 | South Dakota|SD 42 | Tennessee|TN 43 | Texas|TX 44 | Utah|UT 45 | Vermont|VT 46 | Virginia|VA 47 | Washington|WA 48 | West Virginia|WV 49 | Wisconsin|WI 50 | Wyoming|WY 51 | -------------------------------------------------------------------------------- /lib/tasks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2013 The Regents of the University of California, AMERICAN INSTITUTES FOR RESEARCH 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | """ 26 | """ 27 | @author Gabe Fierro gt.fierro@berkeley.edu github.com/gtfierro 28 | """ 29 | """ 30 | Functions for doing bulk inserts and bulk updates 31 | """ 32 | from alchemy.match import commit_inserts, commit_updates 33 | from alchemy import session_generator 34 | from alchemy.schema import temporary_update, app_temporary_update 35 | from sqlalchemy import create_engine, MetaData, Table, inspect, VARCHAR, Column 36 | from sqlalchemy.orm import sessionmaker 37 | 38 | # fetch reference to temporary_update table. 
def bulk_commit_inserts(insert_statements, table, is_mysql, commit_frequency=1000, dbtype='grant'):
    """
    Executes bulk inserts for a given table. This is typically much faster than going through
    the SQLAlchemy ORM. The insert_statements list of dictionaries may fall victim to SQLAlchemy
    complaining that certain columns are null, if you did not specify a value for every single
    column for a table.

    A session is generated through `session_generator`, and then the actual
    lib.alchemy.match.commit_inserts task is dispatched.

    Args:
    insert_statements -- list of dictionaries where each dictionary contains key-value pairs of the object
    table -- SQLAlchemy table object. If you have a table reference, you can use TableName.__table__
    is_mysql -- adjusts syntax based on if we are committing to MySQL or SQLite. You can use alchemy.is_mysql() to get this
    commit_frequency -- tune this for speed. Runs "session.commit" every `commit_frequency` items
    dbtype -- which base schema to use. Either 'grant' or 'application'
    """
    session = session_generator(dbtype=dbtype)
    commit_inserts(session, insert_statements, table, is_mysql, commit_frequency)


def bulk_commit_updates(update_key, update_statements, table, is_mysql, commit_frequency=1000, dbtype='grant'):
    """
    Executes bulk updates for a given table. This is typically much faster than going through
    the SQLAlchemy ORM. In order to be flexible, the update statements must be set up in a specific
    way. You can only update one column at a time. The dictionaries in the list `update_statements`
    must have two keys: `pk`, which is the primary_key for the record to be updated, and `update`
    which is the new value for the column you want to change. The column you want to change
    is specified as a string by the argument `update_key`.

    If is_mysql is True, then the update will be performed by inserting the record updates
    into the table temporary_update and then executing an UPDATE/JOIN. If is_mysql is False,
    then SQLite is assumed, and traditional updates are used (lib.alchemy.match.commit_updates)

    A session is generated through `session_generator`, and then the actual task is dispatched.

    Args:
    update_key -- the name of the column we want to update
    update_statements -- list of dictionaries of updates. See above description
    table -- SQLAlchemy table object. If you have a table reference, you can use TableName.__table__
    is_mysql -- adjusts syntax based on if we are committing to MySQL or SQLite. You can use alchemy.is_mysql() to get this
    commit_frequency -- tune this for speed. Runs "session.commit" every `commit_frequency` items.
                        Only honoured on the SQLite path; the MySQL path stages rows with a
                        fixed commit frequency of 10000.
    dbtype -- which base schema to use. Either 'grant' or 'application'
    """
    session = session_generator(dbtype=dbtype)
    if not is_mysql:
        # SQLite: no UPDATE/JOIN, fall back to row-by-row updates.
        commit_updates(session, update_key, update_statements, table, commit_frequency)
        return

    # Everything below runs for MySQL only, so the staging table is always
    # emptied with TRUNCATE (the former 'delete from' branches were unreachable).
    # NOTE(review): the rollback presumably clears transaction state left on the
    # session before staging starts -- confirm against session_generator.
    session.rollback()
    session.execute('truncate temporary_update;')

    # Stage the (pk, update) pairs in the temporary table of the matching schema.
    if dbtype == 'grant':
        staging_table = temporary_update
    else:
        staging_table = app_temporary_update
    commit_inserts(session, update_statements, staging_table, is_mysql, 10000)

    # now update using the join
    primary_key = table.primary_key.columns.values()[0]
    update_column = table.columns[update_key]
    # Target columns are qualified with the table name so that a target column
    # called `pk` or `update` cannot be confused with the staging table's columns.
    session.execute(
        "UPDATE {0} join temporary_update ON temporary_update.pk = {0}.{1} "
        "SET {0}.{2} = temporary_update.update;".format(
            table.name, primary_key.name, update_column.name))
    session.commit()

    # Leave the staging table empty for the next caller.
    session.execute('truncate temporary_update;')
    session.commit()
"""
@author Gabe Fierro gt.fierro@berkeley.edu github.com/gtfierro
"""

"""
Simplifies the process for reading in unicode CSV files
"""

import csv
from unicodedata import normalize
import codecs
import sys

# The Python 2 csv module only handles byte strings, so rows must be encoded
# going in and decoded coming out. On Python 3 csv works on text directly.
_PY2 = sys.version_info[0] == 2


def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """
    Creates a unicode CSV reader

    Args:
    unicode_csv_data -- iterable of unicode lines (e.g. a file opened with codecs.open)
    dialect -- csv dialect passed through to csv.reader
    kwargs -- extra formatting parameters passed through to csv.reader

    Yields one list of unicode cells per CSV row.
    """
    if _PY2:
        # Round-trip through UTF-8 because Python 2's csv.reader cannot parse unicode.
        csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), dialect=dialect, **kwargs)
        for row in csv_reader:
            yield [cell.decode('utf-8') for cell in row]
    else:
        # Python 3: csv.reader consumes and produces text; the `unicode`
        # builtin used previously does not exist here.
        for row in csv.reader(unicode_csv_data, dialect=dialect, **kwargs):
            yield row


def utf_8_encoder(unicode_csv_data):
    """
    Encodes data in utf-8

    Yields each line of `unicode_csv_data` encoded as a UTF-8 byte string.
    """
    for line in unicode_csv_data:
        yield line.encode('utf-8')


def read_file(filename):
    """
    Given a string [filename], returns an iterator of the rows in the CSV file.
    The file is decoded as UTF-8 and stays open only while rows are consumed.
    """
    with codecs.open(filename, encoding='utf-8') as csvfile:
        for row in unicode_csv_reader(csvfile):
            yield row
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | """ 27 | """ 28 | @author Gabe Fierro gt.fierro@berkeley.edu github.com/gtfierro 29 | """ 30 | 31 | import sys 32 | import re 33 | import time 34 | import mechanize 35 | from BeautifulSoup import BeautifulSoup 36 | 37 | if len(sys.argv) < 2: 38 | print "Given a patent id number, will download the relevant zipfile" 39 | print "Usage: ./getpatent.py " 40 | print "Example: ./getpatent.py 7783348" 41 | sys.exit(0) 42 | 43 | patent_name = sys.argv[1] 44 | 45 | if patent_name[:2].upper() != 'US': 46 | patent_name = 'US'+patent_name 47 | 48 | BASE_URL = 'http://www.google.com/patents/' 49 | ZIP_BASE_URL = 'http://commondatastorage.googleapis.com/patents/grant_full_text/' 50 | br = mechanize.Browser() 51 | br.addheaders = [('User-agent', 'Feedfetcher-Google-iGoogleGadgets;\ 52 | (+http://www.google.com/feedfetcher.html)')] 53 | br.set_handle_robots(False) 54 | html = br.open(BASE_URL+patent_name).read() 55 | 56 | print 'Got HTML for patent page' 57 | 58 | soup = BeautifulSoup(html) 59 | sidebar = soup.find('div', {'class': 'patent_bibdata'}) 60 | text = str(sidebar.text) 61 | date = re.search(r'(?<=Issue date: )[A-Za-z]{3} [0-9]{1,2}, [0-9]{4}', text).group() 62 | date_struct = time.strptime(date, '%b %d, %Y') 63 | year = str(date_struct.tm_year)[2:] 64 | month = str(date_struct.tm_mon).zfill(2) 65 | day = str(date_struct.tm_mday).zfill(2) 66 | 67 | zipfile = 'ipg{0}{1}{2}.zip'.format(year,month,day) 68 | 69 | zipurl = '{0}{1}/{2}'.format(ZIP_BASE_URL,date_struct.tm_year,zipfile) 70 | 71 | print 'Downloading ZIP file: ',zipurl 72 | 73 | res = br.retrieve(zipurl, zipfile) 74 | print res 75 | 76 | print 'Finished downloading' 77 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # IPython Notebooks 2 | 3 | To run a notebook, you need 4 | [iPython](http://ipython.org/ipython-doc/dev/index.html) as well as 5 | 
[matplotlib](http://matplotlib.org/). Copy the notebook to the directory 6 | containing the sqlite3 files and open using 7 | 8 | ``` 9 | ipython notebook 10 | ``` 11 | 12 | It should open in your browser. 13 | -------------------------------------------------------------------------------- /notebooks/buildpdf: -------------------------------------------------------------------------------- 1 | ipython nbconvert --to=latex --template=latex_nocode.tplx --SphinxTransformer.author='Fung Institute' MySQL.ipynb 2 | sed -e 's/\\vspace{-0\.5\\baselineskip}//g' MySQL.tex > tmp 3 | sed -e 's/\\title{MySQL}/\\title{Patent Database Report}/' tmp > tmp2 4 | mv tmp2 MySQL.tex 5 | pdflatex MySQL.tex 6 | rm tmp tmp2 7 | -------------------------------------------------------------------------------- /notebooks/latex_nocode.tplx: -------------------------------------------------------------------------------- 1 | % Disable input cells 2 | ((* extends 'latex_article.tplx' *)) 3 | ((* block input *)) 4 | ((* endblock input *)) 5 | ((* block output_group *)) 6 | % Add remainder of the document contents below. 7 | ((* for output in cell.outputs *)) 8 | ((( render_output(output) ))) 9 | ((* endfor *)) 10 | ((* endblock *)) 11 | -------------------------------------------------------------------------------- /process.cfg: -------------------------------------------------------------------------------- 1 | # This is a sample file that configures the environment for the preprocessing 2 | # steps of parsing, cleaning, and consolidation 3 | 4 | # [process] defines which configured steps will be run in the current run of 5 | # the preprocessor. Accepts 4 options: 6 | # parse: defines which parsing configuration will be run 7 | # clean: if True, runs the cleaning step on the output of parse 8 | # consolidate: if True, runs the consolidation step on the output of clean 9 | # doctype: can be grant, application, or all, and processing will proceed accordingly.
10 | # Note: make sure that the value for grantregex and/or applicationregex 11 | # is defined if you wish to use a value other than the default for either 12 | 13 | [process] 14 | parse=download 15 | clean=True 16 | consolidate=True 17 | doctype=all 18 | 19 | #[defaultparse] 20 | ## 'datadir' specifies the path to the directory containing the XML files that 21 | ## we want to parse. This path will be evaluated relative to the main directory 22 | ## of preprocessor. Defaults to '/data/patentdata/patents/2013' 23 | # 24 | # datadir=/path/to/patent/data 25 | 26 | ## 'grantregex' and 'applicationregex' specify the regular expression that 27 | ## matches the XML files that we want to parse. If you are downloading data 28 | ## from the USPTO, then the default value should be fine. Defaults to 29 | ## 'ipg\d{6}.xml', the format found for most USPTO files since 2005 30 | # 31 | # grantregex=ipg\d{6}.xml 32 | # applicationregex=ipa\d{6}.xml 33 | 34 | ## 'years' specifies the range of years for which you want to download and 35 | ## parse. If the current year is specified, the script will download all 36 | ## possible files. Specifying the 'years' option will ignore the 'datadir' 37 | ## option and just download the relevant files to 'downloaddir' (see below) 38 | ## Specify years as: 39 | ## year1 40 | ## year1-year2 41 | ## year1,year2,year3 42 | ## year1-year2,year3-year4 43 | ## latest (downloads the most recent week's data) 44 | ## If this option is NOT specified, the parse will run on the contents of 'datadir' 45 | # 46 | # years=2010-2013 47 | 48 | ## 'downloaddir' specifies the target base directory into which the weekly 49 | ## patent files will be downloaded. 
Note that the preprocessor will create 50 | ## directories named for each year inside 'downloaddir', and if they already 51 | ## exist, will look inside for previously downloaded files 52 | ## If this option is NOT specified, the parse will run on the contents of 'datadir' 53 | # 54 | # downloaddir=/path/to/base/directory/for/downloads 55 | 56 | # example configuration for a parse of 2012 data. Note that the 'grantregex' 57 | # option is not specified because the default value is sufficient 58 | [2012parse] 59 | datadir=/data/patentdata/patents/2012 60 | 61 | # example configuration to test the parsing 62 | [test] 63 | datadir=test/fixtures/xml 64 | grantregex=\d{4}_\d.xml 65 | applicationregex=ipa\d{6}.*.xml 66 | 67 | # example configuration for a parse of the latest data. Note that the 68 | # regexes for grants and applications will be used if 'all' is specified 69 | # for doctype in [process], and otherwise only the appropriate one will be used. 70 | [download] 71 | years=latest 72 | downloaddir=./data 73 | grantregex=i?pg\d{6}.xml 74 | applicationregex=i?pa\d{6}.xml 75 | 76 | # This section specifies which grant_handler is to be used for each date of the 77 | # released patent. This section should only have to be touched when a new parser is 78 | # introduced. In the case where a year cannot be parsed from the filename (the 79 | # format `ipgYYMMDD` is assumed), then the default parser is used. 80 | # The dates in the ranges are either YYYY or YYYYMMDD. 
If only one date is provided, 81 | # then the corresponding handler is assumed for all subsequent patents 82 | [grant-xml-handlers] 83 | 2005-20130108=lib.handlers.grant_handler_v42 84 | 20130115=lib.handlers.grant_handler_v44 85 | default=lib.handlers.grant_handler_v42 86 | 87 | [application-xml-handlers] 88 | 2001-20060822=lib.handlers.application_handler_v41 89 | 20060823-20130116=lib.handlers.application_handler_v42 90 | 20130117=lib.handlers.application_handler_v43 91 | default=lib.handlers.application_handler_v42 92 | 93 | # schema changes were in 20010131 (v15), 20020101 (v16), 94 | # 20050825 (v41), 20060823 (v42), 20130121 (v43) 95 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Python scripts for processing USPTO inventor and patent data 2 | 3 | The following collection of scripts performs pre- and post-processing on patent 4 | data as part of the patent inventor disambiguation process. Raw patent data is 5 | obtained from [Google Bulk Patent 6 | Download](http://www.google.com/googlebooks/uspto-patents-grants-text.html). 7 | 8 | For a high-level overview of the patentprocessor toolchain, please see [our 9 | technical 10 | report](https://github.com/funginstitute/publications/raw/master/patentprocessor/patentprocessor.pdf). 11 | 12 | For a description of configuration of the patentprocessor toolchain, please see 13 | [this technical 14 | report](https://github.com/funginstitute/publications/raw/master/weeklyupdate/weeklyupdate.pdf). 15 | 16 | To follow development, subscribe to 17 | [RSS feed](https://github.com/funginstitute/patentprocessor/commits/master.atom). 18 | 19 | ## Patentprocessor Overview 20 | 21 | There are several steps in the patentprocessor toolchain: 22 | 23 | 1. Retrieve/locate parsing target 24 | 2. Execute parsing phase 25 | 3. 
Run preliminary disambiguations: 26 | * assignee disambiguation 27 | * location disambiguation 28 | 4. Prepare input for inventor disambiguation 29 | 5. Disambiguate inventors (external process) 30 | 6. Ingest disambiguated inventors into database 31 | 32 | For the preliminary disambiguations, you need the [location 33 | database](https://s3.amazonaws.com/fungpatdownloads/geolocation_data.7z). File 34 | requires [7zip](http://www.7-zip.org/) to unpack. 35 | 36 | ## Installation and Configuration of the Preprocessing Environment 37 | 38 | The python-based preprocessor is tested on Ubuntu 12.04 and MacOSX 10.6. Any 39 | flavor of Unix with the following installed should work, though it is possible 40 | to get the toolchain running on Windows. 41 | 42 | If you have [`pip`](http://www.pip-installer.org/en/latest/index.html) 43 | installed, you can simplify the installation process by just running `sudo pip 44 | install -r requirements.txt` from within the patentprocessor directory. 45 | 46 | Please [file an issue](https://github.com/funginstitute/patentprocessor/issues) if you find another dependency. 
47 | 48 | ### Ubuntu 49 | 50 | ``` 51 | sudo apt-get update 52 | sudo apt-get install python-dev 53 | sudo apt-get install python-setuptools 54 | sudo easy_install -U distribute 55 | sudo apt-get install -y python-Levenshtein make libmysqlclient-dev python-mysqldb python-pip python-zmq python-numpy gfortran libopenblas-dev liblapack-dev g++ sqlite3 libsqlite3-dev python-sqlite redis-server 56 | sudo pip install -r requirements.txt 57 | ``` 58 | 59 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | MySQL-python==1.2.4 2 | SQLAlchemy==0.8.3 3 | Unidecode==0.04.14 4 | beautifulsoup4==4.3.2 5 | ipython==1.1.0 6 | numpy==1.8.0 7 | python-Levenshtein==0.10.2 8 | pyzmq==14.0.0 9 | requests==2.0.1 10 | wsgiref==0.1.2 11 | pandas==0.12.0 12 | -------------------------------------------------------------------------------- /run_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo 'Running assignee disambiguation' 4 | python lib/assignee_disambiguation.py 5 | 6 | # TODO: fixup lawyer disambiguation 7 | #echo 'Running lawyer disambiguation' 8 | #python lib/lawyer_disambiguation.py 'grant' 9 | 10 | echo 'Running geo disambiguation' 11 | python lib/geoalchemy.py 12 | -------------------------------------------------------------------------------- /run_consolidation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm disambiguator.csv 4 | echo 'Running consolidation for disambiguator' 5 | python consolidate.py $1 6 | -------------------------------------------------------------------------------- /starcluster/README.md: -------------------------------------------------------------------------------- 1 | README 2 | ====== 3 | 4 | #### StarCluster and Batch Parsing 5 | 6 | We take advantage of the fantastic 
[StarCluster](http://star.mit.edu/cluster/) for batch processing of the Patent XML files. Defaults (such as sgeadmin) are assumed. 7 | 8 | We use StarCluster mostly for distributed jobs. 9 | The StarCluster machines are configured to replicate the environment necessary to parse the patent XML files. 10 | 11 | 1. Run `python fetch_xml.py` to fetch the XML files from USPTO. For now this fetches the files specified in `urls.pickle`. As of 8/1/2013, `urls.pickle` contains files from 2005 through mid-2013. 12 | 2. Log in to the StarCluster as root. 13 | 3. Edit the `config.ini` file so it points to the MySQL database. 14 | 4. Run `cd /home/sgeadmin/patentprocessor/starcluster; sh load_pre.sh > ../tar/[num].log` to execute the shell script. 15 | 5. Transfer the tar files to a separate location (or server) to begin the MySQL ingestion process. 16 | 6. Execute `built_tsv.py` and specify the location of the `tar.gz` files. This builds several text files which can be ingested later. 17 | 7. Modify the `config.ini` file and set the proper credentials for the desired database. Run `from lib import alchemy` so the schema is fully updated. 18 | 8. Log in to MySQL. If it is a remote server, such as on Amazon RDS, run `mysql -u [user] -p --local-infile=1 -h [db] [tbl]` and execute `source load.sql`. The default database is assumed to be `uspto_new`, so if this should be something else, please make the appropriate adjustments.
19 | -------------------------------------------------------------------------------- /starcluster/built_tsv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import sys 4 | 5 | # specify the input directory 6 | # something like 20* is fine if we want to take 7 | # care of the directories that begin in the 2000s 8 | 9 | search = sys.argv[1] 10 | 11 | os.system("rm *.txt") 12 | for f in glob.glob("{0}/*.tar.gz".format(search)): 13 | s = f.split("/")[-1].split(".")[0] 14 | y = f.split("/")[0] 15 | os.system("tar -xzf {0}".format(f)) 16 | for t in glob.glob("*.txt"): 17 | os.system("cat {0} >> new/{0}".format(t, y)) 18 | os.system("rm *.txt") 19 | -------------------------------------------------------------------------------- /starcluster/config.ini: -------------------------------------------------------------------------------- 1 | [directory] 2 | home = /home/sgeadmin/patentprocessor 3 | sqlalchemy = /home/sgeadmin/patentprocessor/lib/alchemy 4 | xml = /home/sgeadmin/patentprocessor/XML 5 | local = /mnt/sgeadmin -------------------------------------------------------------------------------- /starcluster/fetch_xml.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from datetime import datetime 4 | from ConfigParser import ConfigParser 5 | from IPython.parallel import Client 6 | 7 | config = ConfigParser() 8 | config.read('{0}/config.ini'.format(os.path.dirname(os.path.realpath(__file__)))) 9 | 10 | rc = Client(packer="pickle") 11 | dview = rc[:] 12 | print rc.ids 13 | 14 | 15 | @dview.remote(block=True) 16 | def fetch(): 17 | import os 18 | os.chdir(node) 19 | for i, f in enumerate(files): 20 | fname = f.split("/")[-1].split(".")[0] 21 | if not os.path.exists("{0}.xml".format(fname)): 22 | os.system("wget {0}".format(f)) 23 | os.system("unzip {0}.zip".format(fname)) 24 | 25 | fname = open("urls.pickle", "rb") 26 | urls = 
pickle.load(fname) 27 | 28 | master = config.get('directory', 'home') 29 | node = config.get('directory', 'local') 30 | if not os.path.exists("{0}/tar".format(master)): 31 | os.makedirs("{0}/tar".format(master)) 32 | 33 | dview["master"] = master 34 | dview["node"] = node 35 | full = [] 36 | for year in urls.keys(): 37 | full.extend(urls[year]) 38 | dview.scatter("files", full) 39 | fetch() 40 | -------------------------------------------------------------------------------- /starcluster/load.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Merge together two disctinct MySQL tables 3 | mysqldump [options] uspto -T /var/lib/mysql/uspto 4 | mysql -u [user] -p --local-infile=1 -h [db] [tbl] 5 | 6 | READ THIS: http://dev.mysql.com/doc/refman/5.5/en/optimizing-innodb-bulk-data-loading.html 7 | */ 8 | 9 | 10 | SELECT "new base", NOW(); 11 | SET FOREIGN_KEY_CHECKS = 0; 12 | SET UNIQUE_CHECKS = 0; 13 | SET SESSION tx_isolation='READ-UNCOMMITTED'; 14 | SET innodb_lock_wait_timeout = 500; 15 | SET autocommit=0; 16 | 17 | SELECT "patent"; 18 | LOAD DATA LOCAL INFILE 'new/patent.txt' INTO TABLE uspto_new.patent FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 19 | SELECT "rawlocation"; 20 | LOAD DATA LOCAL INFILE 'new/rawlocation.txt' IGNORE INTO TABLE uspto_new.rawlocation FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 21 | SELECT "subclass"; 22 | LOAD DATA LOCAL INFILE 'new/subclass.txt' IGNORE INTO TABLE uspto_new.subclass FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 23 | SELECT "mainclass"; 24 | LOAD DATA LOCAL INFILE 'new/mainclass.txt' IGNORE INTO TABLE uspto_new.mainclass FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 25 | SELECT "application"; 26 | LOAD DATA LOCAL INFILE 'new/application.txt' INTO TABLE uspto_new.application FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 27 | SELECT "rawassignee"; 28 | LOAD DATA LOCAL INFILE 'new/rawassignee.txt' INTO TABLE uspto_new.rawassignee FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 29 | SELECT 
"rawinventor"; 30 | LOAD DATA LOCAL INFILE 'new/rawinventor.txt' INTO TABLE uspto_new.rawinventor FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 31 | SELECT "ipcr"; 32 | LOAD DATA LOCAL INFILE 'new/ipcr.txt' INTO TABLE uspto_new.ipcr FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 33 | SELECT "rawlawyer"; 34 | LOAD DATA LOCAL INFILE 'new/rawlawyer.txt' INTO TABLE uspto_new.rawlawyer FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 35 | SELECT "usreldoc"; 36 | LOAD DATA LOCAL INFILE 'new/usreldoc.txt' INTO TABLE uspto_new.usreldoc FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 37 | SELECT "uspc"; 38 | LOAD DATA LOCAL INFILE 'new/uspc.txt' INTO TABLE uspto_new.uspc FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 39 | 40 | COMMIT; 41 | SET autocommit=1; 42 | SET innodb_lock_wait_timeout = 50; 43 | SET UNIQUE_CHECKS = 1; 44 | SET FOREIGN_KEY_CHECKS = 1; 45 | SET SESSION tx_isolation='REPEATABLE-READ'; 46 | SELECT NOW(); 47 | 48 | /* ------------------------------- */ 49 | 50 | SELECT "new citatons", NOW(); 51 | SET FOREIGN_KEY_CHECKS = 0; 52 | SET UNIQUE_CHECKS = 0; 53 | SET SESSION tx_isolation='READ-UNCOMMITTED'; 54 | SET innodb_lock_wait_timeout = 500; 55 | SET autocommit=0; 56 | 57 | SELECT "foreigncitation"; 58 | LOAD DATA LOCAL INFILE 'new/foreigncitation.txt' INTO TABLE uspto_new.foreigncitation FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 59 | SELECT "otherreference"; 60 | LOAD DATA LOCAL INFILE 'new/otherreference.txt' INTO TABLE uspto_new.otherreference FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 61 | SELECT "usapplicationcitation"; 62 | LOAD DATA LOCAL INFILE 'new/usapplicationcitation.txt' INTO TABLE uspto_new.usapplicationcitation FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 63 | SELECT "uspatentcitation"; 64 | LOAD DATA LOCAL INFILE 'new/uspatentcitation.txt' INTO TABLE uspto_new.uspatentcitation FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 65 | 66 | COMMIT; 67 | SET autocommit=1; 68 | SET innodb_lock_wait_timeout = 50; 69 | SET UNIQUE_CHECKS = 1; 70 | SET FOREIGN_KEY_CHECKS = 
1; 71 | SET SESSION tx_isolation='REPEATABLE-READ'; 72 | SELECT NOW(); 73 | 74 | /* ------------------------------- */ 75 | 76 | SELECT "new claims", NOW(); 77 | SET FOREIGN_KEY_CHECKS = 0; 78 | SET UNIQUE_CHECKS = 0; 79 | SET SESSION tx_isolation='READ-UNCOMMITTED'; 80 | SET innodb_lock_wait_timeout = 500; 81 | SET autocommit=0; 82 | 83 | SELECT "citation"; 84 | LOAD DATA LOCAL INFILE 'new/claim.txt' INTO TABLE uspto_new.claim FIELDS TERMINATED by '\t' ENCLOSED BY '\"'; 85 | 86 | COMMIT; 87 | SET autocommit=1; 88 | SET innodb_lock_wait_timeout = 50; 89 | SET UNIQUE_CHECKS = 1; 90 | SET FOREIGN_KEY_CHECKS = 1; 91 | SET SESSION tx_isolation='REPEATABLE-READ'; 92 | SELECT NOW(); 93 | -------------------------------------------------------------------------------- /starcluster/load_drop.sql: -------------------------------------------------------------------------------- 1 | drop table ipcr; 2 | drop table patent_assignee; 3 | drop table patent_inventor; 4 | drop table patent_lawyer; 5 | drop table location_assignee; 6 | drop table location_inventor; 7 | drop table rawassignee; 8 | drop table rawinventor; 9 | drop table rawlawyer; 10 | drop table rawlocation; 11 | drop table application; 12 | drop table assignee; 13 | drop table inventor; 14 | drop table lawyer; 15 | drop table otherreference; 16 | drop table foreigncitation; 17 | drop table uspatentcitation; 18 | drop table usapplicationcitation; 19 | drop table claim; 20 | drop table uspc; 21 | drop table usreldoc; 22 | drop table patent; 23 | drop table location; 24 | drop table mainclass; 25 | drop table subclass; 26 | -------------------------------------------------------------------------------- /starcluster/load_pre.sh: -------------------------------------------------------------------------------- 1 | #cd /home/sgeadmin/patentprocessor/starcluster; sh load_pre.sh > ../tar/2.log 2 | 3 | cd /mnt/sgeadmin 4 | for i in `ls *.xml` 5 | do echo $i 6 | cd /var/lib/mysql/uspto 7 | echo " - remove txt" 8 | rm *.txt 9 
"""ANSI 256-colour foreground escape sequences for terminal test output.

Defines the module-level constants ``COLOR16`` .. ``COLOR255``; each one is
the escape sequence ``"\\33[38;5;<n>m"`` for palette entry ``<n>``.
"""

# Build the constant table from its pattern instead of spelling out all 240
# assignments.  The generator's loop variable does not leak into the module
# namespace, so only the COLORn names are added.
globals().update(
    ("COLOR%d" % code, "\33[38;5;%dm" % code) for code in range(16, 256)
)

# NOTE(review): this is an empty string, so writing it to a terminal emits no
# escape sequence at all -- confirm whether "\33[0m" was intended.
RESET_COLOR = ""
-695900 19PCP9315983885 ND19-14 P PPLL AA 00 N PAVIA Pavía Pavia PAVIA Pavía Pavia 1993-12-21 3 | 1 -1307889 -1891862 12.566667 -70.033333 123400 -700200 19PCP8774789436 ND19-14 P PPL AA 00 V SANTAANNA Santa Anna Santa Anna SANTAANNA Santa Anna Santa Anna 1993-12-21 4 | 1 -1307889 -1891878 12.566667 -70.033333 123400 -700200 19PCP8774789436 ND19-14 P PPL AA 00 N SINTANNA Sint Anna Sint Anna SINTANNA Sint Anna Sint Anna 1993-12-21 5 | 1 -1307793 -1891762 12.483333 -69.95 122900 -695700 19PCP9676780186 ND19-14 T HLL AA 00 N KLEINEJAMANOTA Kleine Jamanota Kleine Jamanota KLEINEJAMANOTA Kleine Jamanota Kleine Jamanota 1993-12-21 6 | 1 -1307696 -1891642 12.6 -70.05 123600 -700300 19PCP8595193130 ND19-14 H COVE AA 00 N ARASHI Arashi Arashi ARASHI Arashi Arashi 1993-12-21 7 | 1 -1307696 -1891879 12.6 -70.05 123600 -700300 19PCP8595193130 ND19-14 H COVE AA 00 V SINTARASJI Sint Arasji Sint Arasji SINTARASJI Sint Arasji Sint Arasji 1993-12-21 8 | 1 -1307696 -1891643 12.6 -70.05 123600 -700300 19PCP8595193130 ND19-14 H COVE AA 00 V ARASJI Arasji Arasji ARASJI Arasji Arasji 1993-12-21 9 | 1 -1307748 -1891711 12.55 -69.983333 123300 -695900 19PCP9317287572 ND19-14 T HLL AA 00 V CERUCRISTAL Ceru Cristal Ceru Cristal CRISTAL CERU Cristal, Ceru Cristal, Ceru 1993-12-21 10 | 1 -1307748 -1891712 12.55 -69.983333 123300 -695900 19PCP9317287572 ND19-14 T HLL AA 00 N SEROCRISTAL Sero Cristal Sero Cristal CRISTAL SERO Cristal, Sero Cristal, Sero 1993-12-21 11 | -------------------------------------------------------------------------------- /test/fixtures/SAS/national_file_head_20120204.txt: -------------------------------------------------------------------------------- 1 | FEATURE_ID|FEATURE_NAME|FEATURE_CLASS|STATE_ALPHA|STATE_NUMERIC|COUNTY_NAME|COUNTY_NUMERIC|PRIMARY_LAT_DMS|PRIM_LONG_DMS|PRIM_LAT_DEC|PRIM_LONG_DEC|SOURCE_LAT_DMS|SOURCE_LONG_DMS|SOURCE_LAT_DEC|SOURCE_LONG_DEC|ELEV_IN_M|ELEV_IN_FT|MAP_NAME|DATE_CREATED|DATE_EDITED 2 | 399|Agua Sal 
Creek|Stream|AZ|04|Apache|001|362740N|1092842W|36.4611122|-109.4784394|362053N|1090915W|36.3480582|-109.1542662|1645|5397|Fire Dance Mesa|02/08/1980| 3 | 400|Agua Sal Wash|Valley|AZ|04|Apache|001|363246N|1093103W|36.546112|-109.5176069|362740N|1092842W|36.4611122|-109.4784394|1597|5239|Little Round Rock|02/08/1980| 4 | 401|Aguaje Draw|Valley|AZ|04|Apache|001|343417N|1091313W|34.5714281|-109.2203696|344308N|1085826W|34.7188|-108.9739|1750|5741|Kearn Lake|02/08/1980|01/14/2008 5 | 402|Arlington State Wildlife Area|Park|AZ|04|Maricopa|013|331455N|1124625W|33.2486547|-112.7735045|||||231|758|Spring Mountain|02/08/1980| 6 | 403|Bar X Wash|Stream|AZ|04|Graham|009|322815N|1095610W|32.4709038|-109.9361853|323048N|1095233W|32.5134024|-109.8759075|1339|4393|West of Greasewood Mountain|02/08/1980| 7 | 404|Bis Ii Ah Wash|Stream|AZ|04|Apache|001|355230N|1093239W|35.8750096|-109.5442721|354903N|1093001W|35.8175|-109.5002778|1799|5902|Beautiful Valley Well|02/08/1980| 8 | 405|Brawley Wash|Stream|AZ|04|Pima|019|322540N|1111726W|32.4278489|-111.2906617|315820N|1112329W|31.972302|-111.3914941|591|1939|West of Marana|02/08/1980| 9 | 406|Cement Trough Canyon|Valley|AZ|04|Navajo|017|335942N|1103045W|33.9950482|-110.5126118|340437N|1103304W|34.0769908|-110.5512265|1494|4902|Blue House Mountain|02/08/1980| 10 | 407|Corn Creek Wash|Stream|AZ|04|Coconino|005|351621N|1105537W|35.2725114|-110.9268068|351958N|1105231W|35.3327883|-110.8751392|1435|4708|Old Leupp|02/08/1980| 11 | -------------------------------------------------------------------------------- /test/fixtures/alchemy/alchemy.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/fixtures/alchemy/alchemy.raw -------------------------------------------------------------------------------- /test/fixtures/csv/.gitignore: 
"""Check a gold-standard patent list against the ``invpat`` SQLite table.

Each input line has the form ``<patent>, <lastname>, <firstname>``.  For every
line the ``invpat`` table is queried for rows with that exact Lastname and
Firstname, and the line counts as a success when one of the returned Patent
values equals the (zero-padded) patent number from the line.  Misses are
written to the log file, followed by summary totals.
"""
import io
import sqlite3 as sql
import os
import sys
import logging
from contextlib import closing

# bmVerify(['final_r7', 'final_r8'], filepath="/home/ysun/disambig/newcode/all/", outdir = "/home/ayu/results_v2/")

# Text Files
txt_file = 'patentlist.txt'
log_file = 'benchmark_results.log'

# SQLite database holding the invpat table
db_file = '/test/goldstandard/invnum_N_zardoz_with_invpat.sqlite3'


def normalize_patent(patent):
    """Strip surrounding whitespace and left-pad a 7-character number with "0".

    NOTE(review): a 7-character design number such as "D329481" is padded to
    "0D329481" as well -- confirm that this matches how invpat stores them.
    """
    patent = patent.strip(' \t\n\r')
    if len(patent) == 7:
        patent = "0" + patent
    return patent


def verify_benchmark(con, lines):
    """Look up every benchmark line in ``invpat`` and log the ones not found.

    Args:
        con: open sqlite3 connection to a database with an ``invpat`` table
            that has Invnum, Patent, Lastname and Firstname columns.
        lines: iterable of text lines, ``<patent>, <lastname>, <firstname>``.

    Returns:
        A ``(count, success, errors)`` tuple: lines processed, lines whose
        patent was found, and lines that were missing or malformed.  Blank
        lines are ignored, so ``success + errors == count`` always holds.
    """
    count = 0
    errors = 0
    success = 0

    # "with con" commits the index creation (or rolls back on error).
    with con:
        con_cur = con.cursor()
        logging.info("Beginning to query database")
        con_cur.execute("CREATE INDEX IF NOT EXISTS index_invnum ON invpat (Invnum)")
        con_cur.execute("CREATE INDEX IF NOT EXISTS index_lastname ON invpat (Lastname)")
        con_cur.execute("CREATE INDEX IF NOT EXISTS index_firstname ON invpat (Firstname)")

        for line_read in lines:
            # A blank (e.g. trailing) line carries no record; skip it rather
            # than failing on the missing fields.
            if not line_read.strip():
                continue

            count = count + 1
            if count % 100 == 0:
                print("starting patent %d" % count)

            split_lines = line_read.split(', ')
            if len(split_lines) < 3:
                logging.error("Malformed line %d: %r" % (count, line_read))
                errors = errors + 1
                continue

            # Strip out weird characters/formatting
            patent_to_match = normalize_patent(split_lines[0])
            last_name = split_lines[1].strip(' \t\n\r')
            first_name = split_lines[2].strip(' \t\n\r')

            # Bound parameters: names come from a data file and may contain
            # quote characters that would corrupt a string-built query.
            con_cur.execute(
                "SELECT Patent FROM invpat WHERE (Lastname = ? and Firstname = ?);",
                (last_name, first_name))

            # Count each input line at most once, even when invpat holds
            # several rows for the same inventor/patent pair.
            match_found = any(row[0] == patent_to_match
                              for row in con_cur.fetchall())

            if match_found:
                success = success + 1
            else:
                logging.error("Did not find a match for %s, %s, %s" % (first_name, last_name, patent_to_match))
                errors = errors + 1

    print("EXITING")

    logging.info("Total Patents: %d" % count)
    logging.info("Patents ran successfully: %d" % success)
    logging.info("Patents FAILED: %d" % errors)
    return count, success, errors


def main():
    """Run the benchmark with the module-level file locations."""
    # filemode="w" starts a fresh log on every run without opening (and
    # leaking) a second handle on the log file.
    logging.basicConfig(filename=log_file, filemode="w", level=logging.DEBUG)

    # io.open reads with universal newlines on both Python 2 and Python 3.
    with closing(sql.connect(db_file)) as con, io.open(txt_file, "r") as opened_file:
        verify_benchmark(con, opened_file)


if __name__ == "__main__":
    main()
56 | 26,7013303,CAMARDA,THOMAS J 57 | 26,7171415,CAMARDA,THOMAS J 58 | 27,7099871,WATERHOUSE,STEVE 59 | 27,7013303,WATERHOUSE,STEVE 60 | 27,7171415,WATERHOUSE,STEVE 61 | 28,7013303,BEATTY,JOHN 62 | 29,5136185,FLEMING,LEE 63 | 29,5029133,FLEMING,LEE 64 | 30,5029133,LA FETRA,ROSS V 65 | 31,6453319,HAINES,MATTHEW 66 | 31,6209003,HAINES,MATTHEW 67 | 31,6128627,HAINES,MATTHEW 68 | 31,6128623,HAINES,MATTHEW 69 | 31,6289358,HAINES,MATTHEW 70 | 31,6292880,HAINES,MATTHEW 71 | 31,6913307,HAINES,MATTHEW 72 | 32,6209003,GOURLEY,DAVID 73 | 32,6128627,GOURLEY,DAVID 74 | 32,6128623,GOURLEY,DAVID 75 | 32,6289358,GOURLEY,DAVID 76 | 32,6292880,GOURLEY,DAVID 77 | 32,6453319,GOURLEY,DAVID 78 | 32,6913307,GOURLEY,DAVID 79 | 33,6209003,TOTTY,BRIAN 80 | 33,6128627,TOTTY,BRIAN 81 | 33,6128623,TOTTY,BRIAN 82 | 33,6289358,TOTTY,BRIAN 83 | 33,6292880,TOTTY,BRIAN 84 | 33,6453319,TOTTY,BRIAN 85 | 34,6913307,TOTTY,BRIAN 86 | 35,6209003,BEGUELIN,ADAM 87 | 35,6128627,BEGUELIN,ADAM 88 | 35,6128623,BEGUELIN,ADAM 89 | 35,6289358,BEGUELIN,ADAM 90 | 35,6292880,BEGUELIN,ADAM 91 | 35,6453319,BEGUELIN,ADAM 92 | 35,6913307,BEGUELIN,ADAM 93 | 36,6913307,PLEVYAK,JOHN 94 | 36,6292880,PLEVYAK,JOHN 95 | 36,6209003,PLEVYAK,JOHN 96 | 36,6128627,PLEVYAK,JOHN 97 | 36,6128623,PLEVYAK,JOHN 98 | 36,6289358,PLEVYAK,JOHN 99 | 36,6453319,PLEVYAK,JOHN 100 | 37,6128627,MATTHIS,PETER 101 | 37,6128623,MATTHIS,PETER 102 | 37,6209003,MATTHIS,PETER 103 | 37,6292880,MATTHIS,PETER 104 | 37,6289358,MATTHIS,PETER 105 | 37,6453319,MATTHIS,PETER 106 | 37,6913307,MATTHIS,PETER 107 | 38,6158781,AARON III,JOHN W 108 | 39,7625697,SHALON,TIDHAR D 109 | 39,7378236,SHALON,TIDHAR D 110 | 39,7442499,SHALON,TIDHAR D 111 | 39,7323298,SHALON,TIDHAR D 112 | 39,6110426,SHALON,TIDHAR D 113 | 39,5807522,SHALON,TIDHAR D 114 | 40,7442499,BROWN,PATRICK O 115 | 40,7625697,BROWN,PATRICK O 116 | 40,7378236,BROWN,PATRICK O 117 | 40,7323298,BROWN,PATRICK O 118 | 40,6110426,BROWN,PATRICK O 119 | 40,5807522,BROWN,PATRICK O 120 | 41,7573873,SIDHU,IKHLAQ S 121 
| 41,7453815,SIDHU,IKHLAQ S 122 | 41,7032242,SIDHU,IKHLAQ S 123 | 41,7016675,SIDHU,IKHLAQ S 124 | 41,6954454,SIDHU,IKHLAQ S 125 | 41,6937699,SIDHU,IKHLAQ S 126 | 41,6937610,SIDHU,IKHLAQ S 127 | 41,6914897,SIDHU,IKHLAQ S 128 | 41,6870830,SIDHU,IKHLAQ S 129 | 41,6857072,SIDHU,IKHLAQ S 130 | 41,6857021,SIDHU,IKHLAQ S 131 | 41,6856616,SIDHU,IKHLAQ S 132 | 41,6822957,SIDHU,IKHLAQ S 133 | 41,6804224,SIDHU,IKHLAQ S 134 | 41,6795429,SIDHU,IKHLAQ S 135 | 41,6785261,SIDHU,IKHLAQ S 136 | 41,6771674,SIDHU,IKHLAQ S 137 | 41,6744759,SIDHU,IKHLAQ S 138 | 41,6741586,SIDHU,IKHLAQ S 139 | 41,6732314,SIDHU,IKHLAQ S 140 | 41,6731642,SIDHU,IKHLAQ S 141 | 41,6731630,SIDHU,IKHLAQ S 142 | 41,6697354,SIDHU,IKHLAQ S 143 | 41,6681252,SIDHU,IKHLAQ S 144 | 41,6678250,SIDHU,IKHLAQ S 145 | 41,6675218,SIDHU,IKHLAQ S 146 | 41,6674745,SIDHU,IKHLAQ S 147 | 41,6650901,SIDHU,IKHLAQ S 148 | 41,6650619,SIDHU,IKHLAQ S 149 | 41,6625119,SIDHU,IKHLAQ S 150 | 41,6587433,SIDHU,IKHLAQ S 151 | 41,6584490,SIDHU,IKHLAQ S 152 | 41,6577622,SIDHU,IKHLAQ S 153 | 41,6570606,SIDHU,IKHLAQ S 154 | 41,6567405,SIDHU,IKHLAQ S 155 | 41,6567399,SIDHU,IKHLAQ S 156 | 41,6542504,SIDHU,IKHLAQ S 157 | 41,6512761,SIDHU,IKHLAQ S 158 | 41,6487690,SIDHU,IKHLAQ S 159 | 41,6487603,SIDHU,IKHLAQ S 160 | 41,6446127,SIDHU,IKHLAQ S 161 | 41,7012141,SIDHU,IKHLAQ S 162 | 41,6434606,SIDHU,IKHLAQ S 163 | 41,6381638,SIDHU,IKHLAQ S 164 | 41,6366959,SIDHU,IKHLAQ S 165 | 41,6363053,SIDHU,IKHLAQ S 166 | 41,6360271,SIDHU,IKHLAQ S 167 | 41,6353614,SIDHU,IKHLAQ S 168 | 41,6351524,SIDHU,IKHLAQ S 169 | 41,6269099,SIDHU,IKHLAQ S 170 | 41,6243846,SIDHU,IKHLAQ S 171 | 41,6226769,SIDHU,IKHLAQ S 172 | 41,6182125,SIDHU,IKHLAQ S 173 | 41,6175871,SIDHU,IKHLAQ S 174 | 41,6170075,SIDHU,IKHLAQ S 175 | 41,6169744,SIDHU,IKHLAQ S 176 | 41,6151636,SIDHU,IKHLAQ S 177 | 41,6145109,SIDHU,IKHLAQ S 178 | 41,6055236,SIDHU,IKHLAQ S 179 | 41,6006271,SIDHU,IKHLAQ S 180 | 41,5870412,SIDHU,IKHLAQ S 181 | 42,5018576,UDELL,KENT S 182 | 43,5018576,STEWARD JR,LLOYD D 183 | 
-------------------------------------------------------------------------------- /test/fixtures/goldstandard/gs2011.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Queries the sqlite3 database with 2011 gold standard inventor data" 4 | 5 | 6 | -------------------------------------------------------------------------------- /test/fixtures/goldstandard/jamesrhunt.csv: -------------------------------------------------------------------------------- 1 | 1,5362759,HUNT,JAMES R 2 | 1,5860770,HUNT,JAMES R 3 | 1,7195753,HUNT,JAMES R 4 | 1,7482021,HUNT,JAMES R 5 | 1,7195753,HUNT,JAMES R 6 | 1,6783767,HUNT,JAMES R 7 | 1,6770268,HUNT,JAMES R 8 | 1,6579514,HUNT,JAMES R 9 | 1,6071043,HUNT,JAMES R 10 | 1,5951947,HUNT,JAMES R 11 | 1,5353449,HUNT,JAMES R 12 | 1,5578700,HUNT,JAMES R 13 | 1,5675882,HUNT,JAMES R 14 | 1,5613221,HUNT,JAMES R 15 | 1,5611492,HUNT,JAMES R 16 | 1,D329481,HUNT,JAMES R 17 | 1,D322932,HUNT,JAMES R 18 | 1,D322931,HUNT,JAMES R 19 | 1,D322215,HUNT,JAMES R 20 | 1,D321253,HUNT,JAMES R 21 | 1,4977893,HUNT,JAMES R 22 | 1,4571850,HUNT,JAMES R 23 | 1,D271911,HUNT,JAMES R 24 | 1,4040424,HUNT,JAMES R 25 | -------------------------------------------------------------------------------- /test/fixtures/goldstandard/readme.md: -------------------------------------------------------------------------------- 1 | # "Gold Standard" processing for verified patent data 2 | 3 | ## Benchmark files 4 | 5 | * `benchmark.csv` is the csv export from the benchmark.xlsx spreadsheet 6 | file, converted from Windows `crlf` to unix line convention. 7 | * `benchmark.sh` is a wrapper around some `gawk` which processes the 8 | csv file to acquire relevant data. 9 | 10 | ## Gold standard files 11 | 12 | * `gs2011.sh` wraps various operations 13 | * `goldstandard.csv` is an input file in a format acceptable to the 14 | disambiguator. 
15 | 16 | -------------------------------------------------------------------------------- /test/fixtures/sqlite3/combined.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/fixtures/sqlite3/combined.sqlite3 -------------------------------------------------------------------------------- /test/fixtures/text/accented_characters.txt: -------------------------------------------------------------------------------- 1 | réâ∑œ®\üñµ 2 | -------------------------------------------------------------------------------- /test/fixtures/unittest/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | -------------------------------------------------------------------------------- /test/fixtures/xml/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | -------------------------------------------------------------------------------- /test/fixtures/xml/basic.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | hello 4 | world 5 | 6 | 7 | 1 8 | 2 9 | 3 10 | 11 | 12 | -------------------------------------------------------------------------------- /test/integration/.gitignore: -------------------------------------------------------------------------------- 1 | !*.csv 2 | -------------------------------------------------------------------------------- /test/integration/clean/ipg120327.18/assignee.csv: -------------------------------------------------------------------------------- 1 | a1e27fa698dd2383ac0e0c5a85bf8e07,2,"","","Acushnet Company","","" 2 | 5729cdc83217097b2de5fd057f6c01d4,2,"","","BBC International LLC","","" 3 | 32fecc3fbbbb2058c5efd7746f8c92ce,2,"","","Devi Kroell Inc.","","" 4 | 917c17490595190472a0c880d468d1f8,3,"","","Design Sportswears","","" 5 | 
6b7f48f21749641c4f67f138da7983bf,2,"","","Frito-Lay North America, Inc.","","" 6 | 8e480b469958f8a97a4d98fecc53aa46,3,"","","Hermes Sellier (Societe Par Actions Simplifiee)","","" 7 | 2067d04477dd28ed9fa8dd2c8a338352,2,"","","Icon IP, Inc.","","" 8 | 2e06355f463a868c3830ada75c071757,3,"","","Kybun AG","","" 9 | 70d61f29771e8463211d94090414205e,2,"","","OrthoCor Medical, Inc.","","" 10 | a020612580b3d66a7a11f16cd3edc6a3,2,"","","Times Three Clothier, LLC","","" 11 | de9c78a8eae461883b1bcf5ebdd57612,2,"","","Thompson-Weiler Enterprises, LLC","","" 12 | 769084ad4f0ed13b7c09ed62767e4789,2,"","","The Vermont Teddy Bear Company, Inc.","","" 13 | 6ee34b4c1a39003733e2f3959e9afb10,3,"","","Zero1.tv GmbH","","" 14 | -------------------------------------------------------------------------------- /test/integration/clean/ipg120327.18/lawyer.csv: -------------------------------------------------------------------------------- 1 | 9577d947a8cdebd0b2958ceccb68ffdf,Andrew,"F. Young, Esq.","",UNKNOWN 2 | a338d00e25226144b10c43ad50091feb,"","","Abelman, Frayne & Schwab",UNKNOWN 3 | 50f3e5a0ef8631495862f78ed762e466,Barry,"G. Magidoff","",UNKNOWN 4 | 197f4d96eccda10e78fdd24c5da8bcd4,Celina,"M. Corr","",UNKNOWN 5 | 953a70cad377ba2df2236a1e6815dcd1,"","","Carston & Cahoon, LLP",UNKNOWN 6 | 8b7ce792ee91fb499f7da344adbb7549,Colin,"P. Cahon","",UNKNOWN 7 | f46fac8a1c76017b4e20ffaafbc97832,D.,"Michael Burns","",UNKNOWN 8 | e3412e962e62f82664c4d92fedcf75d3,"","","Foley & Lardner LLP",UNKNOWN 9 | 1c33aced610a38673c47bdc668e30c4c,"","","GrayRobinson, P.A.",UNKNOWN 10 | 81fcca8ea1a38185db2abfb2ab016e36,H.,"Jay Spiegel","",UNKNOWN 11 | 02efc1400155615d9d936adee32d8892,Jerome,"V. Sartain","",UNKNOWN 12 | 84404f72b783c58c5f3efbb10511961c,Justin,"G. 
Sanders","",UNKNOWN 13 | fe79f99fb866c9015456cbd7bb4dfcdb,"","","Lackenbach Siegel, LLP",UNKNOWN 14 | 4f546f39addcde5bb509f1708717ca61,"","","Minn Law Firm",UNKNOWN 15 | 5b244d5a5242cbce914c9917814efdb9,"","","Perkins Coie LLP",UNKNOWN 16 | a79f4f0e80347f7e349acb351dfd04c5,Richard,"K. C. Chang, II","",UNKNOWN 17 | b3bb5efb58647ceabe00d01fbb8331db,"","","Schwegman, Lunberg & Woessner, P.A.",UNKNOWN 18 | c1864223e8ed094d1cd8365973cbe3ec,"","","Shoemaker and Mattare",UNKNOWN 19 | 60f4caadf31bf7d7dfcf14ac3da9674e,"","","Stroock & Stroock & Lavan LLP",UNKNOWN 20 | 8bbc28f3b70f7c0d6bca1e459a018e0a,"","","Thomas & Karceski, P.C.",UNKNOWN 21 | ce2ffb0caf580bc53370e856ea4ae87a,Veronica-Adele,"R. Cao","",UNKNOWN 22 | 59817b1e6a20c836b1fa7507497b2f02,Walter,"A. Hackler","",UNKNOWN 23 | 0f029da8bb9147c7b3980b3c219d2ee7,"","","Weiss & Moy, P.C.",UNKNOWN 24 | -------------------------------------------------------------------------------- /test/integration/clean/ipg120327.18/location.csv: -------------------------------------------------------------------------------- 1 | 26.3586885|-80.0830984,"Boca Raton",FL,US,26.3586885,-80.0830984 2 | 33.0198431|-96.6988856,Plano,TX,US,33.0198431,-96.6988856 3 | 33.1506744|-96.8236116,Frisco,TX,US,33.1506744,-96.8236116 4 | 33.599722|-117.699444,"Laguna Hills",CA,US,33.599722,-117.699444 5 | 33.660297|-117.9992265,"Huntington Beach",CA,US,33.660297,-117.9992265 6 | 36.1988592|-115.1175013,"North Las Vegas",NV,US,36.1988592,-115.1175013 7 | 37.5407246|-77.4360481,Richmond,VA,US,37.5407246,-77.4360481 8 | 39.070388|-76.5452409,"Severna Park",MD,US,39.070388,-76.5452409 9 | 40.2968979|-111.6946475,Orem,UT,US,40.2968979,-111.6946475 10 | 40.7143528|-74.0059731,NYC,NY,US,40.7143528,-74.0059731 11 | 41.1628731|-73.8615246,Ossining,NY,US,41.1628731,-73.8615246 12 | 41.2042616|-73.7270761,"Mount Kisco",NY,US,41.2042616,-73.7270761 13 | 41.6376043|-70.9036487,Fairhaven,MA,US,41.6376043,-70.9036487 14 | 
41.7369803|-111.8338359,Logan,UT,US,41.7369803,-111.8338359 15 | 41.7|-70.7633333,Marion,MA,US,41.7,-70.7633333 16 | 42.0714925|-70.8092,Pembroke,MA,US,42.0714925,-70.8092 17 | 42.2495321|-71.0661653,Milton,MA,US,42.2495321,-71.0661653 18 | 42.5792583|-71.4378411,Westford,MA,US,42.5792583,-71.4378411 19 | 44.3806065|-73.227626,Shelburne,VT,US,44.3806065,-73.227626 20 | 44.983334|-93.26667,Minneapolis,MN,US,44.983334,-93.26667 21 | 45.0791325|-93.1471667,Shoreview,MN,US,45.0791325,-93.1471667 22 | 45.775491|12.0439904,Montebelluna,Veneto,IT,45.775491,12.0439904 23 | 47.240075|7.822812,Roggwil,Bern,CH,47.240075,7.822812 24 | 48.856614|2.3522219,Paris,"Île-de-France",FR,48.856614,2.3522219 25 | 52.519171|13.4060912,Berlin,Berlin,DE,52.519171,13.4060912 26 | 53.5510846|9.9936818,Hamburg,Hamburg,DE,53.5510846,9.9936818 27 | -------------------------------------------------------------------------------- /test/integration/clean/ipg120327.one/assignee.csv: -------------------------------------------------------------------------------- 1 | 6b7f48f21749641c4f67f138da7983bf,2,"","","Frito-Lay North America, Inc.","","" 2 | -------------------------------------------------------------------------------- /test/integration/clean/ipg120327.one/lawyer.csv: -------------------------------------------------------------------------------- 1 | 197f4d96eccda10e78fdd24c5da8bcd4,Celina,"M. Corr","",UNKNOWN 2 | 953a70cad377ba2df2236a1e6815dcd1,"","","Carston & Cahoon, LLP",UNKNOWN 3 | 8b7ce792ee91fb499f7da344adbb7549,Colin,"P. 
Cahon","",UNKNOWN 4 | -------------------------------------------------------------------------------- /test/integration/clean/ipg120327.one/location.csv: -------------------------------------------------------------------------------- 1 | 33.0198431|-96.6988856,Plano,TX,US,33.0198431,-96.6988856 2 | 33.1506744|-96.8236116,Frisco,TX,US,33.1506744,-96.8236116 3 | -------------------------------------------------------------------------------- /test/integration/clean/ipg120327.two/assignee.csv: -------------------------------------------------------------------------------- 1 | 70d61f29771e8463211d94090414205e,2,"","","OrthoCor Medical, Inc.","","" 2 | 6ee34b4c1a39003733e2f3959e9afb10,3,"","","Zero1.tv GmbH","","" 3 | -------------------------------------------------------------------------------- /test/integration/clean/ipg120327.two/lawyer.csv: -------------------------------------------------------------------------------- 1 | 5b244d5a5242cbce914c9917814efdb9,"","","Perkins Coie LLP",UNKNOWN 2 | b3bb5efb58647ceabe00d01fbb8331db,"","","Schwegman, Lunberg & Woessner, P.A.",UNKNOWN 3 | -------------------------------------------------------------------------------- /test/integration/clean/ipg120327.two/location.csv: -------------------------------------------------------------------------------- 1 | 44.983334|-93.26667,Minneapolis,MN,US,44.983334,-93.26667 2 | 45.0791325|-93.1471667,Shoreview,MN,US,45.0791325,-93.1471667 3 | 52.519171|13.4060912,Berlin,Berlin,DE,52.519171,13.4060912 4 | 53.5510846|9.9936818,Hamburg,Hamburg,DE,53.5510846,9.9936818 5 | -------------------------------------------------------------------------------- /test/integration/consolidate/ipg120327.18/disambiguator.csv: -------------------------------------------------------------------------------- 1 | Nicole Cavin D656296 D1 D1/128 Frisco TX US Frito-Lay North America, Inc. Frito-Lay North America, Inc. 2 | Divya Paruchuri D656296 D1 D1/128 Frisco TX US Frito-Lay North America, Inc. 
Frito-Lay North America, Inc. 3 | Michael Zbuchalski D656296 D1 D1/128 Frisco TX US Frito-Lay North America, Inc. Frito-Lay North America, Inc. 4 | Michaela M. Christian D656297 D2 D2/627 North Las Vegas NV US 5 | Heather Thomson Schindler D656298 D2 D2/703 New York NY US Times Three Clothier, LLC Times Three Clothier, LLC 6 | Brian Jeffery Peters D656299 D2 D2/742 Huntington Beach CA US 7 | Debi Purcell D656300 D2 D2/742 Laguna Hills CA US 8 | Michael Scott Randall D656301 D2 D2/858 Shelburne VT US The Vermont Teddy Bear Company, Inc. The Vermont Teddy Bear Company, Inc. 9 | Wade Driggers D656302 D2 D2/946 Richmond VA US 10 | Karl Muller D656303 D2 D2/947 Roggwil CH Kybun AG Kybun AG 11 | Claudio Franco D656303 D2 D2/947 Roggwil CH Kybun AG Kybun AG 12 | Gilberto Debiase D656304 D2 D2/960 Boca Raton FL US BBC International LLC BBC International LLC 13 | Kenneth Golden Harper D656305 D2 D2/960 Orem UT US Icon IP, Inc. Icon IP, Inc. 14 | Jonathan G. Bacon D656306 D2 D2/969 Westford MA US Acushnet Company Acushnet Company 15 | James M. Feeney D656307 D2 D2/969 Marion MA US Acushnet Company Acushnet Company 16 | Richard A. Mochen D656307 D2 D2/969 Marion MA US Acushnet Company Acushnet Company 17 | Paul O. Teeter D656307 D2 D2/969 Marion MA US Acushnet Company Acushnet Company 18 | Kin-Joe Sham D656308 D3 D3/2031 Shoreview MN US OrthoCor Medical, Inc. OrthoCor Medical, Inc. 19 | Oliver Renelt D656309 D3 D3/218 Hamburg DE Zero1.tv GmbH Zero1.tv GmbH 20 | Alexander Gruber D656309 D3 D3/218 Hamburg DE Zero1.tv GmbH Zero1.tv GmbH 21 | Valerie M. Ciptak D656310 D3 D3/226 Ossining NY US Thompson-Weiler Enterprises, LLC Thompson-Weiler Enterprises, LLC 22 | Justin S. Werner D656310 D3 D3/226 Ossining NY US Thompson-Weiler Enterprises, LLC Thompson-Weiler Enterprises, LLC 23 | Gracelia Chiurazzi D656311 D3 D3/232 New York NY US Devi Kroell Inc. Devi Kroell Inc. 
24 | Valérie Gerbi D656312 D3 D3/232 Paris FR Design Sportswears Design Sportswears 25 | Jean-Louis Dumas D656313 D3 D3/243 Paris FR Hermes Sellier (Societe Par Actions Simplifiee) Hermes Sellier (Societe Par Actions Simplifiee) 26 | Pierre-Alexis Dumas, legal representative D656313 D3 D3/243 Paris FR Hermes Sellier (Societe Par Actions Simplifiee) Hermes Sellier (Societe Par Actions Simplifiee) 27 | Sandrine Brekke-Dumas, legal representative D656313 D3 D3/243 Paris FR Hermes Sellier (Societe Par Actions Simplifiee) Hermes Sellier (Societe Par Actions Simplifiee) 28 | Couli Jobert D656313 D3 D3/243 Paris FR Hermes Sellier (Societe Par Actions Simplifiee) Hermes Sellier (Societe Par Actions Simplifiee) 29 | -------------------------------------------------------------------------------- /test/integration/consolidate/ipg120327.two/disambiguator.csv: -------------------------------------------------------------------------------- 1 | Kin-Joe Sham D656308 D3 D3/2031 Shoreview MN US OrthoCor Medical, Inc. OrthoCor Medical, Inc. 2 | Oliver Renelt D656309 D3 D3/218 Hamburg DE Zero1.tv GmbH Zero1.tv GmbH 3 | Alexander Gruber D656309 D3 D3/218 Hamburg DE Zero1.tv GmbH Zero1.tv GmbH 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/application.csv: -------------------------------------------------------------------------------- 1 | id,type,number,country,date,abstract,title,granted,num_claims 2 | 2006/20060288462,utility,20060288462,US,2006-12-28,"A plurality of substantially flexible skeletal members disposed in the form of a portion of a garment fabric panel is disclosed. A plurality of connecting members secure the skeletal member in a configuration corresponding to the shape of the portion of the fabric panel. A plurality of candy members is disposed on the skeletal member. 
A plurality of knots may be employed in the skeletal members at least at some points between the candy members, whereby limited removal of the candy members may be implemented. A plurality of skeletal lengths may be implemented to provide limited removal of the candy members. ","Garments composed of fabric panels incorporating edible cells",,10 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/claim.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,text,dependent,sequence 2 | 2006/20060288462,"A garment, comprising: ",,1 3 | 2006/20060288462,"A garment as in , wherein said candy members comprise hard candy. ",1,2 4 | 2006/20060288462,"A garment as in , wherein said candy members are disposed in a configuration to allow ventilation between said hard candy members. ",1,3 5 | 2006/20060288462,"A garment as in , wherein knots in the skeletal members are provided at least at said some points between said candy members whereby limited removal of said candy members may be implemented. ",1,4 6 | 2006/20060288462,"A garment as in , wherein skeletal members are of various lengths, whereby limited removal of said candy members may be implemented. ",1,5 7 | 2006/20060288462,"A garment as in , further comprising a coating disposed over said candy members whereby the action of moisture is limited. ",1,6 8 | 2006/20060288462,"A garment as in , wherein said candy members comprise an anti-stick filler material. ",1,7 9 | 2006/20060288462,"A garment as in , wherein said candy members comprise a compacted material. ",1,8 10 | 2006/20060288462,"A garment as in , wherein said candy members comprise a compacted xylitol. 
",1,9 11 | 2006/20060288462,"A garment as in , wherein said skeletal members are cross-linked.",1,10 12 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/ipcr.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,classification_level,section,subclass,main_group,subgroup,symbol_position,classification_value,classification_status,classification_data_source,action_date,ipc_version_indicator,sequence 2 | 2006/20060288462,A,A,D,31,00,F,I,B,H,2006-12-28,2007-01-01,0 3 | 2006/20060288462,A,A,G,1,50,L,I,B,H,2006-12-28,2007-01-01,1 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/mainclass.csv: -------------------------------------------------------------------------------- 1 | id,title,text 2 | 002,, 3 | 426,, 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/rawassignee.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,assignee_id,rawlocation_id,type,name_first,name_last,organization,residence,nationality,sequence 2 | 2006/20060288462,,"new york|ny|us",,"","","AMERICAN EXPRESS TRAVEL RELATED SERVICES COMPANY, INC.","","",0 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/rawinventor.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,inventor_id,rawlocation_id,name_first,name_last,nationality,sequence 2 | 2006/20060288462,,koln||de,Yvonne,Schroeder,DE,0 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/rawlocation.csv: -------------------------------------------------------------------------------- 1 | id,location_id,city,state,country 2 | 
"new york|ny|us",,"New York",NY,US 3 | koln||de,,Koln,"",DE 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/subclass.csv: -------------------------------------------------------------------------------- 1 | id,title,text 2 | 002/001000,, 3 | 426/104000,, 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/uspc.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,mainclass_id,subclass_id,sequence 2 | 2006/20060288462,002,002/001000,0 3 | 2006/20060288462,426,426/104000,1 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa061228.one/usreldoc.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,rel_id,doctype,status,date,number,kind,country,relationship,sequence 2 | 2006/20060288462,29232669,continuation_in_part,PENDING,2005-06-22,29232669,"",US,parent_doc,0 3 | 2006/20060288462,11197279,continuation_in_part,"",2005-08-04,11197279,A1,US,child_doc,1 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/application.csv: -------------------------------------------------------------------------------- 1 | id,type,number,country,date,abstract,title,granted,num_claims 2 | 2013/20130014308,utility,20130014308,US,2013-01-17,"A finger nail tip cover which acts as a typing aid for users when typing with long finger nails. 
It is made of rubber material and has one hard end, to prevent the nail tip from passing thru when typing and a semi-oval stretchy cavity at the other end where the finger nail tip is placed in.",TYPING-MATE,,12 3 | 2013/20130014306,utility,20130014306,US,2013-01-17,"A glove employable for mixed martial arts, including striking and grappling, employs a dorsal pad having distal and lateral extensions with an integral bend for covering both the dorsal and lateral sides of the user's metacarpal/proximal phalange joints. The integral bend conforms with the user's metacarpal/proximal phalange joints with the user's hand in the clinched fist position. The glove affords protection to the user's metacarpal/proximal phalange joints while striking. On the other hand, the integral bend is capable of easily unbending to an open position for grappling. The integral bend is unbent by unclenching the user's fingers by flexion from the clinched fist position to the open position. The glove also includes a wrist pad for protecting the wrist against heavy strikes.","TACTICAL MIXED MARTIAL ARTS GLOVE",,14 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/claim.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,text,dependent,sequence 2 | 2013/20130014308,"A finger nail tip cover appliance that has a first hard end and a second more soften end with a stretchy cavity to be placed on the finger nail tips of long nails when typing on any keyboard.",,1 3 | 2013/20130014308,"The appliance of wherein a shape that encircles 3/16 of an inches depth of a long finger nail tip.",1,2 4 | 2013/20130014308,"The appliance of wherein facilitates data entry with long natural, acrylic, gel or artificial finger nails.",1,3 5 | 2013/20130014308,"The appliance of wherein has a semi-oval cavity in one end where the finger nail tip is place in.",1,4 6 | 2013/20130014308,"The 
appliance of wherein has a stretch cavity to fit all sizes of the different long finger nail tips including natural, acrylic, gel or artificial finger nail tips.",1,5 7 | 2013/20130014308,"The appliance of wherein said is designed to be worn on the tips of all long finger nails of both hands, not including the thumbs.",1,6 8 | 2013/20130014308,"The appliance of wherein said a hard end comprising a symmetrical shape that simulates the actual finger tip typing action.",1,7 9 | 2013/20130014308,"The appliance of wherein has a hard end that prevents the finger nail tip from passing thru when typing.",7,8 10 | 2013/20130014308,"The appliance of wherein said is made of rubber material that allows the appliance to be hard on one end and stretchy on the other end.",1,9 11 | 2013/20130014308,"The appliance of wherein compromises a high friction end.",9,10 12 | 2013/20130014308,"The appliance on wherein said to be use for typing on any keyboard with long finger nails that can be natural, acrylic, gel or artificial nails.",1,11 13 | 2013/20130014308,"The appliance of wherein said makes typing with long finger nails faster, comfortable and accurate.",1,12 14 | 2013/20130014306,"An improved glove employable by a user for mixed martial arts, the glove being an open fingered type and of a type having a dorsal pad for protecting the dorsal side of the metacarpals of a user's hand against shock and of a type being capable of assuming an open position for grappling and a clinched fist position for striking, wherein the improvement includes:",,1 15 | 2013/20130014306,"The improved glove of wherein the improvement further includes the dorsal pad having a composition including molded foam, the integral bend being formed by the molded foam.",1,2 16 | 2013/20130014306,"The improved glove of wherein the improvement further includes the dorsal pad having a composition including a first layer of low density foam and a second layer of high density foam, the first and second layers being 
glued to one another for forming the integral bend.",1,3 17 | 2013/20130014306,"The improved glove of wherein the improvement further includes the distal extension extending sufficiently for covering and protecting the user's Intermediate phalanges.",1,4 18 | 2013/20130014306,"The improved glove of wherein the improvement further includes the dorsal pad extending distally for covering and protecting the user's distal phalanges.",1,5 19 | 2013/20130014306,"The improved glove of wherein the improvement further includes the dorsal pad and the integral bend having an extension extending laterally beyond the metacarpal/proximal phalange joint of the user's index finger for affording impact protection to the lateral side of the metacarpal/proximal phalanges joint of the user's index finger against a sloop hook punch.",1,6 20 | 2013/20130014306,"The improved glove of wherein the improvement further comprises:",1,7 21 | 2013/20130014306,"An improved glove employable by a user for mixed martial arts, the glove being of an opened fingered type and of a type having a dorsal pad for protecting the dorsal side of the metacarpals of a user's hand against shock, wherein the improvement comprising:",,8 22 | 2013/20130014306,"The improved glove of wherein the improvement further including the lateral strike pad having a composition including foam.",8,9 23 | 2013/20130014306,"An improved glove employable by a user for mixed martial arts, the glove being of an open fingered type having a dorsal pad for protecting the dorsal side of the metacarpals of a user's hand against shock, the glove being of a type capable of assuming an open position for grappling and a clinched fist position for striking, wherein the improvement including:",,10 24 | 2013/20130014306,"The improved glove of wherein the improvement further including the dorsal pad, include the integral bend, having a composition selected from the group consisting of molded foam and layered foam.",10,11 25 | 2013/20130014306,"The 
improved glove of wherein the improvement further including the dorsal pad extending distally for covering and protecting the user's distal phalanges against shock.",10,12 26 | 2013/20130014306,"The improved glove of , the improvement further compromising:",10,13 27 | 2013/20130014306,"An improved method for manufacturing a glove employable by a user for mixed martial arts, wherein the improvement comprises the step of making a dorsal pad having an integral bend by means of a molding process, according to . ",2,14 28 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/ipcr.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,classification_level,section,subclass,main_group,subgroup,symbol_position,classification_value,classification_status,classification_data_source,action_date,ipc_version_indicator,sequence 2 | 2013/20130014308,A,A,D,13,08,F,I,B,H,2013-01-17,2006-01-01,0 3 | 2013/20130014306,A,A,D,13,08,F,I,B,H,2013-01-17,2006-01-01,0 4 | 2013/20130014306,A,A,D,19,02,L,I,B,H,2013-01-17,2006-01-01,1 5 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/mainclass.csv: -------------------------------------------------------------------------------- 1 | id,title,text 2 | 2,, 3 | 21,, 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/rawassignee.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,assignee_id,rawlocation_id,type,name_first,name_last,organization,residence,nationality,sequence 2 | 2013/20130014308,,kissimmee|fl|us,,Yennifer,Feliciano,"",US,US,0 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/rawinventor.csv: 
-------------------------------------------------------------------------------- 1 | uuid,application_id,inventor_id,rawlocation_id,name_first,name_last,nationality,sequence 2 | 2013/20130014308,,kissimmee|fl|us,Yennifer,Feliciano,US,0 3 | 2013/20130014306,,"san diego|ca|us",Christopher,Mechling,US,0 4 | 2013/20130014306,,"san diego|ca|us",Nicholas,Mechling,US,1 5 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/rawlocation.csv: -------------------------------------------------------------------------------- 1 | id,location_id,city,state,country 2 | kissimmee|fl|us,,Kissimmee,FL,US 3 | "san diego|ca|us",,"San Diego",CA,US 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/subclass.csv: -------------------------------------------------------------------------------- 1 | id,title,text 2 | 2/21,, 3 | 2/20,, 4 | 21/69,, 5 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/uspc.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,mainclass_id,subclass_id,sequence 2 | 2013/20130014308,2,2/21,0 3 | 2013/20130014306,2,2/20,0 4 | 2013/20130014306,21,21/69,1 5 | -------------------------------------------------------------------------------- /test/integration/parse/ipa130117.one/usreldoc.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,rel_id,doctype,status,date,number,kind,country,relationship,sequence 2 | 2013/20130014306,61433841,us_provisional_application,,2011-01-18,61433841,"",US,,0 3 | 2013/20130014306,61473378,us_provisional_application,,2011-04-08,61473378,"",US,,1 4 | 2013/20130014306,61526999,us_provisional_application,,2011-08-24,61526999,"",US,,2 5 | -------------------------------------------------------------------------------- 
/test/integration/parse/ipg120327.18/application.csv: -------------------------------------------------------------------------------- 1 | 2010/29381217,D656296,29,29381217,US,2010-12-16,, 2 | 2009/29335910,D656297,29,29335910,US,2009-04-24,, 3 | 2009/29352440,D656298,29,29352440,US,2009-12-21,, 4 | 2011/29383856,D656299,29,29383856,US,2011-01-24,, 5 | 2011/29383977,D656300,29,29383977,US,2011-01-25,, 6 | 2011/29371812,D656301,29,29371812,US,2011-10-12,, 7 | 2010/29367468,D656302,29,29367468,US,2010-08-09,, 8 | 2009/29350925,D656303,29,29350925,US,2009-11-25,, 9 | 2011/29385973,D656304,29,29385973,US,2011-02-23,, 10 | 2011/29392441,D656305,29,29392441,US,2011-05-20,, 11 | 2010/29378661,D656306,29,29378661,US,2010-11-08,, 12 | 2010/29378662,D656307,29,29378662,US,2010-11-08,, 13 | 2010/29379369,D656308,29,29379369,US,2010-11-18,, 14 | 2011/29391097,D656309,29,29391097,US,2011-05-03,, 15 | 2011/29389305,D656310,29,29389305,US,2011-04-08,, 16 | 2010/29370848,D656311,29,29370848,US,2010-09-14,, 17 | 2011/29391518,D656312,29,29391518,US,2011-05-10,, 18 | 2010/29381881,D656313,29,29381881,US,2010-12-23,, 19 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/claim.csv: -------------------------------------------------------------------------------- 1 | D656296,"The ornamental design for a ready to eat snack piece, as shown and described.",,1 2 | D656297,"The ornamental design for a fashion belt, as shown and described.",,1 3 | D656298,"The ornamental design for a garment, as shown and described.",,1 4 | D656299,"The ornamental design for pants, as shown and described.",,1 5 | D656300,"The ornamental design for a pants, as shown and described.",,1 6 | D656301,"The ornamental design for apparel sleeve ends, as shown and described.",,1 7 | D656302,"The ornamental design for footwear, as shown and described.",,1 8 | D656303,"The ornamental design for a shoe sole, as shown and described.",,1 9 | D656304,"The 
ornamental design for a footwear outsole, as shown and described.",,1 10 | D656305,"The ornamental design for a shoe tread, as shown and described.",,1 11 | D656306,"The ornamental design for a golf shoe upper, as shown and described.",,1 12 | D656307,"The ornamental design for a golf shoe upper, as shown and described.",,1 13 | D656308,"The ornamental design for the replaceable cartridge for a pain management system, as shown and described.",,1 14 | D656309,"The ornamental design for a universal remote control accessory for a mobile device, as shown and described.",,1 15 | D656310,"The ornamental design for a wrist pouch, as shown and described.",,1 16 | D656311,"The ornamental design for a handbag with clasp, as shown and described.",,1 17 | D656312,"The ornamental design for a handbag, as shown and described.",,1 18 | D656313,"The ornamental design for a handbag, as shown and described.",,1 19 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/foreigncitation.csv: -------------------------------------------------------------------------------- 1 | D656296,2008-03-01,"","",8290305.5,EP,"cited by examiner",35 2 | D656298,1985-08-01,"","",G8513103.2,DE,"cited by other",123 3 | D656298,2005-04-01,"",U1,202005000716,DE,"cited by other",124 4 | D656298,2007-10-01,"","",00798897-0021,EM,"cited by examiner",125 5 | D656298,1986-03-01,"","",174179,EP,"cited by other",126 6 | D656298,1997-05-01,"","",774241,EP,"cited by other",127 7 | D656298,2001-03-01,"","",1082951,EP,"cited by other",128 8 | D656298,2001-08-01,"","",1125566,EP,"cited by other",129 9 | D656298,2005-07-01,"","",00385562-0106,EP,"cited by other",130 10 | D656298,2005-12-01,"","",00454202-0027,EP,"cited by other",131 11 | D656298,2005-12-01,"","",00454202-0121,EP,"cited by other",132 12 | D656298,2006-12-01,"","",00633755-0027,EP,"cited by other",133 13 | D656298,2007-05-01,"","",00730403-0023,EP,"cited by other",134 14 | 
D656298,1918-08-01,"","",116526,GB,"cited by other",135 15 | D656298,1924-03-01,"","",212307,GB,"cited by other",136 16 | D656298,1981-11-01,"","",1603600,GB,"cited by other",137 17 | D656298,2005-07-01,"","",3020687,GB,"cited by other",138 18 | D656298,2006-11-01,"","",4000420,GB,"cited by examiner",139 19 | D656298,2007-10-01,"","",4004524,GB,"cited by other",140 20 | D656298,1990-06-01,"","",2-82707,JP,"cited by other",141 21 | D656298,2001-06-01,"","",2001-172806,JP,"cited by other",142 22 | D656298,2002-05-01,"","",2002-138302,JP,"cited by other",143 23 | D656298,2003-05-01,"",A,2003-129303,JP,"cited by other",144 24 | D656298,2005-10-01,"",A,2005-281893,JP,"cited by other",145 25 | D656298,2006-11-01,"","",2006-316359,JP,"cited by other",146 26 | D656298,2006-11-01,"",A,2006-316359,JP,"cited by other",147 27 | D656298,2007-06-01,"","",2007-146337,JP,"cited by other",148 28 | D656298,2007-06-01,"",A,2007-146337,JP,"cited by other",149 29 | D656298,2007-11-01,"","",2007-303002,JP,"cited by other",150 30 | D656298,2008-07-01,"",A,2008-156812,JP,"cited by other",151 31 | D656298,1996-03-01,"","",96/08217,WO,"cited by other",152 32 | D656298,2001-10-01,"","",1/75201,WO,"cited by other",153 33 | D656300,2007-06-01,"",A,2007154394,JP,"cited by examiner",9 34 | D656300,2008-05-01,"",A,2008106395,JP,"cited by examiner",10 35 | D656303,1999-06-01,"","",99/29203,WO,"cited by other",48 36 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/ipcr.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/integration/parse/ipg120327.18/ipcr.csv -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/mainclass.csv: -------------------------------------------------------------------------------- 1 | D1,, 2 | D2,, 
3 | D3,, 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/otherreference.csv: -------------------------------------------------------------------------------- 1 | D656296,"Football Shaped Cookies, posted Sep. 3, 2008 [online], [retrieved Jul. 26, 2011]. Retrieved from Internet, http://throwingwaffles.com/2008/09.",0 2 | D656296,"Super Bowel Sugar Cookies, posted Feb. 1, 2009 [online], [retrieved Jul. 26, 2011]. Retrieved from Internet, http://fodder-and-libations.blogspot.com/2009/02/super-bowl-sugar-cookies-with-royal.html.",1 3 | D656296,"Football Oreos, posted Sep. 18, 2010 [online], [retrieved Jul. 26, 2011]. Retrieved from Internet, http://puertabella.blogspot.com/2010/09/football-oreos.html.",2 4 | D656296,"Football Shaped Tortilla Crisps, posted Sep. 23, 2010 [online], [retrieved Jul. 26, 2011]. Retrieved from Internet, http://www.landolakes.com.",3 5 | D656296,"Football with Laces Cookie Cutter; www.karenscookies.net/Football-with-Laces-Cookie-Cutter_p_1068.html.",4 6 | D656296,"Football Cookie Cutter Set; www.michaels.com/Football-Cookie-Cutter-Set/bk0176,default,pd.html.",5 7 | D656298,"Email from Christine Conforte to Heather Schindler “FW: Tummy Tee,” sent Dec. 1, 2006.",0 8 | D656298,"Email from Christine Conforte to Heather Schindler “Liz Lange,” sent Dec. 13, 2006.",1 9 | D656298,"Email from Heather Schindler to Chistine Conforte “Liz Lange,” sent Dec. 13, 2006.",2 10 | D656298,"Email from Christine Conforte “Liz Lange,” sent Dec. 11, 2006.",3 11 | D656298,"Email from Heather Schindler to Christine Conforte “Liz Lange,” sent Dec. 9, 2006.",4 12 | D656298,"Email from Christine Conforte to Heather Schindler “Liz Lange,” sent Dec. 7, 2006.",5 13 | D656298,"Email from Heather Schindler to Christine Conforte “Liz Lange,” sent Dec. 7, 2006.",6 14 | D656298,"Email from Christine Conforte to Heather Schindler, “Liz Lange,” sent Dec. 
6, 2006.",7 15 | D656298,"Email from Heather Schindler to Liz Lange copying Michelle Mooring and Christine Conforte, “Follow Up,” sent Jan. 24, 2007.",8 16 | D656298,"Email from Liz Lange to Heather Schindler copying Christine Conforte and Michelle Mooring, “Follow Up,” sent Jan. 23, 2007.",9 17 | D656298,"Email from Heather Schindler to Liz Lange copying Christine Conforte and Michelle Mooring, “Follow Up,” sent Jan. 23, 2007.",10 18 | D656298,"Email from Liz Lange to Heather Schindler copying Christine Conforte “Follow Up,” sent Jan. 19, 2007.",11 19 | D656298,"Email from Heather Schindler to Christine Conforte copying Michelle Mooring “FW: TummyTube,” sent Jan. 31, 2007.",12 20 | D656298,"Email from Christine Conforte to Heather Schindler copying Liz Lange “Tummy Tube,” sent Jan. 30, 2011.",13 21 | D656298,"Email from Heather Schindler to Liz Lange copying Michelle Mooring and Christine Conforte “TummyTube,” sent Feb. 6, 2007.",14 22 | D656298,"Email from Liz Lange to Heather Schindler and Christine Conforte “TummyTube,” sent Feb. 2, 2007.",15 23 | D656298,"Email from Heather Schindler to Christine Conforte copying Liz Lange “TummyTube,” sent Feb. 2, 2007.",16 24 | D656298,"Email from Christine Conforte to Heather Schindler copying Liz Lange “TummyTube,” sent Jan. 30, 2007.",17 25 | D656298,"“Style Spy,” Lucky Magazine, p. 26, Jan. 2002 issue.",18 26 | D656298,"LeCove swimwear catalog, p. 26.",19 27 | D656298,"JC Penney maternity catalog, p. 165.",20 28 | D656298,"“Fashion Q & A,” Shape Magazine, p. 68. Jan. 2008 issue.",21 29 | D656298,"“Self Selects,” Self Magazine, p. 162. Jun. 2000 issue.",22 30 | D656298,"“Do good while you shop,” Lucky Magazine, p. 286. Oct. 2006 issue.",23 31 | D656298,"“Photo Finish,” WWD Intimates catalog. 2006.",24 32 | D656298,"“Shapewear Report,” In Style Magazine, pp. 338-346. Sep. 2007 issue.",25 33 | D656298,"Self Magazine, p. 33. Dec. 2007 issue.",26 34 | D656298,"“My best stress bust is . . . ,” Self Magazine, p. 200. Apr. 
2002 issue.",27 35 | D656298,"Times Three Clothier, LLC Sales Order No. 1000, Sales Order Date Apr. 30, 2007.",28 36 | D656298,"Email from Ivan A. Saperstein to Heather Schindler copying Michelle Mooring and J. Schindler “Gatsby's” sent Jul. 20, 2007.",29 37 | D656298,"Email from Heather Schindler to Ivan A. Saperstein copying Michelle Mooring and J. Schindler “Gatsby's” Jul. 20, 2007.",30 38 | D656298,"Email from Ivan A. Saperstein to Michelle Mooring, Heather Schindler and J. Schindler “Gatsby's” Jul. 20, 2007.",31 39 | D656298,"Invoice billed and shipped to Seams Beutiful-Carolyn Weaver, Invoice # 00006138, Aug. 11, 2006.",32 40 | D656298,"Invoice billed and shipped to Seams Beutiful-Carolyn Weaver, Invoice # 00006155, Aug. 25, 2006.",33 41 | D656298,"Packing Slip to Seams Beautiful, Aug. 25, 2006.",34 42 | D656298,"Maidenform v. Times Three Clothier LLC d/b/a Yummie Tummie, Case No. 10-cv-1661 (GBD)—Skinny Cami Declarations. pp. 1-118 (submitted in 9 parts).",35 43 | D656298,"Specifications. pp. 1-3 (submitted in 1 part).",36 44 | D656298,"Maidenform's Initial Disclosures—Jul. 9, 2010. pp. 1-13 (submitted in 1 part).",37 45 | D656298,"Maidenform's Responses and Objections to TTC's First Set of Interrogatories (Nos. 1-18)—Aug. 24, 2010. pp. 1-12 (submitted in 1 part).",38 46 | D656298,"Yummie Tummie and Maidenform Settle Patent Infringement and Invalidity Lawsuits, Fox News 44, PR Newswire (2011). http://www.fox44now.com/story/15242333/yummie-tummie-and-maidenform-settle-patent-infringement-and-invalidity-lawsuits?clienttype=printable—printed Aug. 12, 2011.",39 47 | D656298,"Maidenform Brands, Inc. Reports Second Quarter 2011 Results and Provides Guidance for Full Year 2011, News Releases—General News, Iselin, NJ—/PRNewswire via COMTEX/ Aug. 10, 2011.",40 48 | D656298,"Associated Press, “Maidenform 2Q net income drops on settlement,” Forbes.com (Aug. 10, 2011). 
http://www.forbes.com/feeds/ap/2011/08/10/business-specialized-consumer-services-us-earns-maidenform_8611962.html?partner=email—printed Aug. 12, 2011.",41 49 | D656298,"Maidenform's Responses and Objections to TTC's First Set of Requests for Production of Documents (Nos. 1-144)—Aug. 24, 2010.",42 50 | D656298,"Maidenform's Second Set of Requests for the Production of Documents and Things (Nos. 78-123)—Sep. 3, 2010.",43 51 | D656298,"Maidenform's Second Set of Interrogatories (Nos. 16-17)—Sep. 3, 2010.",44 52 | D656298,"Maidenform's First Supplemental Responses and Objections to TTC's First Set of Interrogatories —Sep. 10, 2010.",45 53 | D656298,"TTC's Responses to Maidenform's Second Set of Requests for Production (Nos. 78-123)—Oct. 12, 2010.",46 54 | D656298,"TTC's Written Responses to Maidenform's Second Set of Interrogatories (Nos. 16-17). San Francisco Chronicle article—MF0001240-1242. Women's Wear Daily article—MF0001087-1088. Oct. 12, 2010.",47 55 | D656298,"Maidenform's Responses and Objections to TTC's Supplemental First Set of Interrogatories (Nos. 1A-18A) —Nov. 26, 2010.",48 56 | D656298,"I. Donner's Objections to Maidenform's Request for Production of Documents Included in the Subpoena to I. Donner—Feb. 
22, 2011.",49 57 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/patent.csv: -------------------------------------------------------------------------------- 1 | D656296,design,D656296,US,2012-03-27,"","Ready to eat snack piece",S1,1 2 | D656297,design,D656297,US,2012-03-27,"","Fashion belt",S1,1 3 | D656298,design,D656298,US,2012-03-27,"",Garment,S1,1 4 | D656299,design,D656299,US,2012-03-27,"",Pants,S1,1 5 | D656300,design,D656300,US,2012-03-27,"",Pants,S1,1 6 | D656301,design,D656301,US,2012-03-27,"","Apparel sleeve ends",S1,1 7 | D656302,design,D656302,US,2012-03-27,"",Footwear,S1,1 8 | D656303,design,D656303,US,2012-03-27,"","Shoe soles",S1,1 9 | D656304,design,D656304,US,2012-03-27,"","Footwear outsole",S1,1 10 | D656305,design,D656305,US,2012-03-27,"","Shoe tread",S1,1 11 | D656306,design,D656306,US,2012-03-27,"","Golf shoe upper",S1,1 12 | D656307,design,D656307,US,2012-03-27,"","Golf shoe upper",S1,1 13 | D656308,design,D656308,US,2012-03-27,"","Replaceable cartridge for a pain management system",S1,1 14 | D656309,design,D656309,US,2012-03-27,"","Universal remote control accessory for a mobile device",S1,1 15 | D656310,design,D656310,US,2012-03-27,"","Wrist pouch",S1,1 16 | D656311,design,D656311,US,2012-03-27,"","Handbag with clasp",S1,1 17 | D656312,design,D656312,US,2012-03-27,"",Handbag,S1,1 18 | D656313,design,D656313,US,2012-03-27,"",Handbag,S1,1 19 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/rawassignee.csv: -------------------------------------------------------------------------------- 1 | D656296,,plano|tx|us,2,"","","Frito-Lay North America, Inc.","","",0 2 | D656298,,"new york|ny|us",2,"","","Times Three Clothier, LLC","","",0 3 | D656301,,||,2,"","","The Vermont Teddy Bear Company, Inc.","","",0 4 | D656303,,roggwil||ch,3,"","","Kybun AG","","",0 5 | D656304,,"boca raton|fl|us",2,"","","BBC 
International LLC","","",0 6 | D656305,,logan|ut|us,2,"","","Icon IP, Inc.","","",0 7 | D656306,,fairhaven|ma|us,2,"","","Acushnet Company","","",0 8 | D656307,,fairhaven|ma|us,2,"","","Acushnet Company","","",0 9 | D656308,,minneapolis|mn|us,2,"","","OrthoCor Medical, Inc.","","",0 10 | D656309,,berlin||de,3,"","","Zero1.tv GmbH","","",0 11 | D656310,,"mt. kisco|ny|us",2,"","","Thompson-Weiler Enterprises, LLC","","",0 12 | D656311,,"new york|ny|us",2,"","","Devi Kroell Inc.","","",0 13 | D656312,,paris||fr,3,"","","Design Sportswears","","",0 14 | D656313,,paris||fr,3,"","","Hermes Sellier (Societe Par Actions Simplifiee)","","",0 15 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/rawinventor.csv: -------------------------------------------------------------------------------- 1 | D656296,,frisco|tx|us,Nicole,Cavin,OMITTED,0 2 | D656296,,plano|tx|us,Divya,Paruchuri,OMITTED,1 3 | D656296,,plano|tx|us,Michael,Zbuchalski,OMITTED,2 4 | D656297,,"north las vegas|nv|us",Michaela,"M. Christian",OMITTED,0 5 | D656298,,"new york|ny|us",Heather,"Thomson Schindler",OMITTED,0 6 | D656299,,"huntington beach|ca|us",Brian,"Jeffery Peters",OMITTED,0 7 | D656300,,"laguna hills|ca|us",Debi,Purcell,OMITTED,0 8 | D656301,,shelburne|vt|us,Michael,"Scott Randall",OMITTED,0 9 | D656302,,richmond|va|us,Wade,Driggers,OMITTED,0 10 | D656303,,roggwil||ch,Karl,Muller,OMITTED,0 11 | D656303,,montebelluna||it,Claudio,Franco,OMITTED,1 12 | D656304,,"boca raton|fl|us",Gilberto,Debiase,OMITTED,0 13 | D656305,,orem|ut|us,Kenneth,"Golden Harper",OMITTED,0 14 | D656306,,westford|ma|us,Jonathan,"G. Bacon",OMITTED,0 15 | D656307,,marion|ma|us,James,"M. Feeney",OMITTED,0 16 | D656307,,milton|ma|us,Richard,"A. Mochen",OMITTED,1 17 | D656307,,pembroke|ma|us,Paul,"O. 
Teeter",OMITTED,2 18 | D656308,,shoreview|mn|us,Kin-Joe,Sham,OMITTED,0 19 | D656309,,hamburg||de,Oliver,Renelt,OMITTED,0 20 | D656309,,berlin||de,Alexander,Gruber,OMITTED,1 21 | D656310,,ossining|ny|us,Valerie,"M. Ciptak",OMITTED,0 22 | D656310,,"severna park|md|us",Justin,"S. Werner",OMITTED,1 23 | D656311,,"new york|ny|us",Gracelia,Chiurazzi,OMITTED,0 24 | D656312,,paris||fr,"Valérie",Gerbi,OMITTED,0 25 | D656313,,paris||fr,Jean-Louis,Dumas,OMITTED,0 26 | D656313,,paris||fr,Pierre-Alexis,"Dumas, legal representative",OMITTED,1 27 | D656313,,paris||fr,Sandrine,"Brekke-Dumas, legal representative",OMITTED,2 28 | D656313,,paris||fr,Couli,Jobert,OMITTED,3 29 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/rawlawyer.csv: -------------------------------------------------------------------------------- 1 | ,D656296,Colin,"P. Cahon","",UNKNOWN, 2 | ,D656296,Celina,"M. Corr","",UNKNOWN, 3 | ,D656296,"","","Carston & Cahoon, LLP",UNKNOWN, 4 | ,D656297,Veronica-Adele,"R. Cao","",UNKNOWN, 5 | ,D656297,"","","Weiss & Moy, P.C.",UNKNOWN, 6 | ,D656298,"","","Stroock & Stroock & Lavan LLP",UNKNOWN, 7 | ,D656299,"","","Minn Law Firm",UNKNOWN, 8 | ,D656299,Jerome,"V. Sartain","",UNKNOWN, 9 | ,D656299,Justin,"G. Sanders","",UNKNOWN, 10 | ,D656300,Walter,"A. Hackler","",UNKNOWN, 11 | ,D656301,H.,"Jay Spiegel","",UNKNOWN, 12 | ,D656302,"","","Thomas & Karceski, P.C.",UNKNOWN, 13 | ,D656303,"","","Shoemaker and Mattare",UNKNOWN, 14 | ,D656304,"","","GrayRobinson, P.A.",UNKNOWN, 15 | ,D656305,Richard,"K. C. Chang, II","",UNKNOWN, 16 | ,D656306,D.,"Michael Burns","",UNKNOWN, 17 | ,D656307,D.,"Michael Burns","",UNKNOWN, 18 | ,D656308,"","","Schwegman, Lunberg & Woessner, P.A.",UNKNOWN, 19 | ,D656309,"","","Perkins Coie LLP",UNKNOWN, 20 | ,D656310,Andrew,"F. Young, Esq.","",UNKNOWN, 21 | ,D656310,"","","Lackenbach Siegel, LLP",UNKNOWN, 22 | ,D656311,Barry,"G. 
Magidoff","",UNKNOWN, 23 | ,D656312,"","","Abelman, Frayne & Schwab",UNKNOWN, 24 | ,D656313,"","","Foley & Lardner LLP",UNKNOWN, 25 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/rawlocation.csv: -------------------------------------------------------------------------------- 1 | plano|tx|us,,Plano,TX,US 2 | frisco|tx|us,,Frisco,TX,US 3 | "north las vegas|nv|us",,"North Las Vegas",NV,US 4 | "new york|ny|us",,"New York",NY,US 5 | "huntington beach|ca|us",,"Huntington Beach",CA,US 6 | "laguna hills|ca|us",,"Laguna Hills",CA,US 7 | ||,,"","","" 8 | shelburne|vt|us,,Shelburne,VT,US 9 | richmond|va|us,,Richmond,VA,US 10 | roggwil||ch,,Roggwil,"",CH 11 | montebelluna||it,,Montebelluna,"",IT 12 | "boca raton|fl|us",,"Boca Raton",FL,US 13 | logan|ut|us,,Logan,UT,US 14 | orem|ut|us,,Orem,UT,US 15 | fairhaven|ma|us,,Fairhaven,MA,US 16 | westford|ma|us,,Westford,MA,US 17 | marion|ma|us,,Marion,MA,US 18 | milton|ma|us,,Milton,MA,US 19 | pembroke|ma|us,,Pembroke,MA,US 20 | minneapolis|mn|us,,Minneapolis,MN,US 21 | shoreview|mn|us,,Shoreview,MN,US 22 | berlin||de,,Berlin,"",DE 23 | hamburg||de,,Hamburg,"",DE 24 | "mt. kisco|ny|us",,"Mt. 
Kisco",NY,US 25 | ossining|ny|us,,Ossining,NY,US 26 | "severna park|md|us",,"Severna Park",MD,US 27 | paris||fr,,Paris,"",FR 28 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/subclass.csv: -------------------------------------------------------------------------------- 1 | D1/128,, 2 | D1/106,, 3 | D2/627,, 4 | D2/703,, 5 | D2/742,, 6 | D2/858,, 7 | D2/946,, 8 | D2/947,, 9 | D2/956,, 10 | D2/960,, 11 | D2/951,, 12 | D2/958,, 13 | D2/954,, 14 | D2/969,, 15 | D3/2031,, 16 | D3/218,, 17 | D3/226,, 18 | D3/232,, 19 | D3/243,, 20 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/usapplicationcitation.csv: -------------------------------------------------------------------------------- 1 | D656296,2002/0043158,2002-04-01,"Stevenson et al.",A1,2002/0043158,US,"cited by examiner",30 2 | D656296,2004/0258806,2004-12-01,Khazaal,A1,2004/0258806,US,"cited by examiner",31 3 | D656296,2006/0073240,2006-04-01,"David et al.",A1,2006/0073240,US,"cited by examiner",32 4 | D656296,2009/0035433,2009-02-01,"France et al.",A1,2009/0035433,US,"cited by examiner",33 5 | D656296,2011/0111105,2011-05-01,"Besse et al.",A1,2011/0111105,US,"cited by examiner",34 6 | D656298,2004/0045325,2004-03-01,"Rabinowicz et al.",A1,2004/0045325,US,"cited by examiner",116 7 | D656298,2006/0166600,2006-07-01,"Ravoiu et al.",A1,2006/0166600,US,"cited by examiner",117 8 | D656298,2006/0242748,2006-11-01,Martz,A1,2006/0242748,US,"cited by other",118 9 | D656298,2007/0050881,2007-03-01,Kasprzak,A1,2007/0050881,US,"cited by examiner",119 10 | D656298,2007/0094765,2007-05-01,"Summers et al.",A1,2007/0094765,US,"cited by other",120 11 | D656298,2008/0134409,2008-06-01,Karasina,A1,2008/0134409,US,"cited by other",121 12 | D656298,2008/0244805,2008-10-01,Griffin,A1,2008/0244805,US,"cited by other",122 13 | D656299,2005/0246819,2005-11-01,Tucker,A1,2005/0246819,US,"cited by 
other",31 14 | D656299,2007/0022510,2007-02-01,"Chapuis et al.",A1,2007/0022510,US,"cited by examiner",32 15 | D656303,2004/0003513,2004-01-01,"Crane et al.",A1,2004/0003513,US,"cited by examiner",40 16 | D656303,2007/0028485,2007-02-01,"Crane et al.",A1,2007/0028485,US,"cited by examiner",41 17 | D656303,2010/0122472,2010-05-01,"Wilson et al.",A1,2010/0122472,US,"cited by examiner",42 18 | D656303,2010/0251565,2010-10-01,"Litchfield et al.",A1,2010/0251565,US,"cited by examiner",43 19 | D656303,2011/0023215,2011-02-01,Obradovic,A1,2011/0023215,US,"cited by examiner",44 20 | D656303,2011/0072684,2011-03-01,Stubblefield,A1,2011/0072684,US,"cited by examiner",45 21 | D656303,2011/0113646,2011-05-01,"Merritt et al.",A1,2011/0113646,US,"cited by examiner",46 22 | D656303,2011/0113649,2011-05-01,"Merritt et al.",A1,2011/0113649,US,"cited by examiner",47 23 | D656304,2008/0148598,2008-06-01,Schoenborn,A1,2008/0148598,US,"cited by examiner",21 24 | D656304,2011/0192054,2011-08-01,"Wojnar et al.",A1,2011/0192054,US,"cited by examiner",22 25 | D656305,2010/0293811,2010-11-01,Truelsen,A1,2010/0293811,US,"cited by examiner",28 26 | D656305,2010/0307025,2010-12-01,"Truelsen et al.",A1,2010/0307025,US,"cited by examiner",29 27 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/uspc.csv: -------------------------------------------------------------------------------- 1 | D656296,D1,D1/128,0 2 | D656296,D1,D1/106,1 3 | D656297,D2,D2/627,0 4 | D656298,D2,D2/703,0 5 | D656299,D2,D2/742,0 6 | D656300,D2,D2/742,0 7 | D656301,D2,D2/858,0 8 | D656302,D2,D2/946,0 9 | D656303,D2,D2/947,0 10 | D656303,D2,D2/956,1 11 | D656304,D2,D2/960,0 12 | D656304,D2,D2/951,1 13 | D656304,D2,D2/958,2 14 | D656305,D2,D2/960,0 15 | D656305,D2,D2/951,1 16 | D656305,D2,D2/954,2 17 | D656305,D2,D2/958,3 18 | D656306,D2,D2/969,0 19 | D656307,D2,D2/969,0 20 | D656308,D3,D3/2031,0 21 | D656309,D3,D3/218,0 22 | D656310,D3,D3/226,0 23 | 
D656311,D3,D3/232,0 24 | D656312,D3,D3/232,0 25 | D656313,D3,D3/243,0 26 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.18/usreldoc.csv: -------------------------------------------------------------------------------- 1 | D656297,29285560,continuation_in_part,ABANDONED,2007-04-02,29285560,"",US,parent_doc,0 2 | D656297,29335910,continuation_in_part,"",,29335910,"",US,child_doc,1 3 | D656298,29350198,continuation_in_part,"",2009-11-12,29350198,"",US,parent_doc,0 4 | D656298,D616627,continuation_in_part,"",,D616627,"",US,parent_grant_document,1 5 | D656298,29352440,continuation_in_part,"",,29352440,"",US,child_doc,2 6 | D656298,29302500,continuation,"",2008-01-17,29302500,"",US,parent_doc,3 7 | D656298,D606285,continuation,"",,D606285,"",US,parent_grant_document,4 8 | D656298,29350198,continuation,"",,29350198,"",US,child_doc,5 9 | D656299,29324767,continuation_in_part,"",2008-09-19,29324767,"",US,parent_doc,0 10 | D656299,D633280,continuation_in_part,"",,D633280,"",US,parent_grant_document,1 11 | D656299,29383856,continuation_in_part,"",,29383856,"",US,child_doc,2 12 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/application.csv: -------------------------------------------------------------------------------- 1 | 2010/29381217,D656296,29,29381217,US,2010-12-16,, 2 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/citation.csv: -------------------------------------------------------------------------------- 1 | D656296,,1906-08-01,Allen,S,D38206,US,"cited by examiner",0 2 | D656296,,1920-10-01,Shores,S,D56478,US,"cited by examiner",1 3 | D656296,,1959-02-01,Bedenk,A,2874052,US,"cited by examiner",2 4 | D656296,,1959-09-01,"Anderson et al.",A,2905559,US,"cited by examiner",3 5 | D656296,,1971-01-01,"Holtz et al.",S,D219637,US,"cited by examiner",4 6 | 
D656296,,1972-04-01,"Ball et al.",A,3656966,US,"cited by examiner",5 7 | D656296,,1983-07-01,Zonnenberg,S,D269729,US,"cited by examiner",6 8 | D656296,,1987-12-01,"Gagliardi, Jr.",S,D293040,US,"cited by examiner",7 9 | D656296,,1989-12-01,"Willard et al.",A,4889737,US,"cited by examiner",8 10 | D656296,,1991-05-01,Saks,S,D317171,US,"cited by examiner",9 11 | D656296,,1991-08-01,"Brewer et al.",A,5038201,US,"cited by examiner",10 12 | D656296,,1995-03-01,"Patoskie et al.",S,D355975,US,"cited by examiner",11 13 | D656296,,1995-04-01,"Novak et al.",S,D357710,US,"cited by examiner",12 14 | D656296,,1996-05-01,Tashiro,A,5518391,US,"cited by examiner",13 15 | D656296,,1996-11-01,"Song et al.",A,5571543,US,"cited by examiner",14 16 | D656296,,1997-12-01,"Wilson et al.",S,D388235,US,"cited by examiner",15 17 | D656296,,2001-03-01,Renda,B1,6197334,US,"cited by examiner",16 18 | D656296,,2001-12-01,Teras,S,D452360,US,"cited by examiner",17 19 | D656296,,2003-06-01,"Bell et al.",S,D475451,US,"cited by examiner",18 20 | D656296,,2003-11-01,Manville,S,D482736,US,"cited by examiner",19 21 | D656296,,2004-04-01,Childress,S,D488611,US,"cited by examiner",20 22 | D656296,,2004-07-01,"Bhaskar et al.",S,D493271,US,"cited by examiner",21 23 | D656296,,2004-11-01,"Mihalos et al.",S,D497702,US,"cited by examiner",22 24 | D656296,,2005-05-01,Childress,S,D505531,US,"cited by examiner",23 25 | D656296,,2005-06-01,Childress,S,D506051,US,"cited by examiner",24 26 | D656296,,2007-04-01,"Aleman et al.",S,D540507,US,"cited by examiner",25 27 | D656296,,2007-05-01,"Cocco et al.",S,D543006,US,"cited by examiner",26 28 | D656296,,2007-10-01,"Aleman et al.",S,D552327,US,"cited by examiner",27 29 | D656296,,2008-01-01,"Yuengling et al.",S,D560538,US,"cited by examiner",28 30 | D656296,,2009-10-01,Hodges,S,D601690,US,"cited by examiner",29 31 | D656296,,2002-04-01,"Stevenson et al.",A1,2002/0043158,US,"cited by examiner",30 32 | D656296,,2004-12-01,Khazaal,A1,2004/0258806,US,"cited by examiner",31 33 
| D656296,,2006-04-01,"David et al.",A1,2006/0073240,US,"cited by examiner",32 34 | D656296,,2009-02-01,"France et al.",A1,2009/0035433,US,"cited by examiner",33 35 | D656296,,2011-05-01,"Besse et al.",A1,2011/0111105,US,"cited by examiner",34 36 | D656296,,2008-03-01,"","",8290305.5,EP,"cited by examiner",35 37 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/claim.csv: -------------------------------------------------------------------------------- 1 | D656296,"The ornamental design for a ready to eat snack piece, as shown and described.",,1 2 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/foreigncitation.csv: -------------------------------------------------------------------------------- 1 | D656296,2008-03-01,"","",8290305.5,EP,"cited by examiner",35 2 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/ipcr.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/integration/parse/ipg120327.one/ipcr.csv -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/mainclass.csv: -------------------------------------------------------------------------------- 1 | D1,, 2 | D11,, 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/otherreference.csv: -------------------------------------------------------------------------------- 1 | D656296,"Football Shaped Cookies, posted Sep. 3, 2008 [online], [retrieved Jul. 26, 2011]. Retrieved from Internet, http://throwingwaffles.com/2008/09.",0 2 | D656296,"Super Bowel Sugar Cookies, posted Feb. 1, 2009 [online], [retrieved Jul. 26, 2011]. 
Retrieved from Internet, http://fodder-and-libations.blogspot.com/2009/02/super-bowl-sugar-cookies-with-royal.html.",1 3 | D656296,"Football Oreos, posted Sep. 18, 2010 [online], [retrieved Jul. 26, 2011]. Retrieved from Internet, http://puertabella.blogspot.com/2010/09/football-oreos.html.",2 4 | D656296,"Football Shaped Tortilla Crisps, posted Sep. 23, 2010 [online], [retrieved Jul. 26, 2011]. Retrieved from Internet, http://www.landolakes.com.",3 5 | D656296,"Football with Laces Cookie Cutter; www.karenscookies.net/Football-with-Laces-Cookie-Cutter _p _1068.html.",4 6 | D656296,"Football Cookie Cutter Set; www.michaels.com/Football-Cookie-Cutter-Set/bk0176,default,pd.html.",5 7 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/patent.csv: -------------------------------------------------------------------------------- 1 | D656296,design,D656296,US,2012-03-27,"","Ready to eat snack piece",S1,1 2 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/rawassignee.csv: -------------------------------------------------------------------------------- 1 | D656296,,plano|tx|us,2,"","","Frito-Lay North America, Inc.","","",0 2 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/rawinventor.csv: -------------------------------------------------------------------------------- 1 | D656296,,frisco|tx|us,Nicole,Cavin,OMITTED,0 2 | D656296,,plano|tx|us,Divya,Paruchuri,OMITTED,1 3 | D656296,,plano|tx|us,Michael,Zbuchalski,OMITTED,2 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/rawlawyer.csv: -------------------------------------------------------------------------------- 1 | ,D656296,Colin,"P. Cahon","",UNKNOWN, 2 | ,D656296,Celina,"M. 
Corr","",UNKNOWN, 3 | ,D656296,"","","Carston & Cahoon, LLP",UNKNOWN, 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/rawlocation.csv: -------------------------------------------------------------------------------- 1 | plano|tx|us,,Plano,TX,US 2 | frisco|tx|us,,Frisco,TX,US 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/subclass.csv: -------------------------------------------------------------------------------- 1 | D1/128,, 2 | D11/06,, 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/usapplicationcitation.csv: -------------------------------------------------------------------------------- 1 | D656296,2002/0043158,2002-04-01,"Stevenson et al.",A1,2002/0043158,US,"cited by examiner",30 2 | D656296,2004/0258806,2004-12-01,Khazaal,A1,2004/0258806,US,"cited by examiner",31 3 | D656296,2006/0073240,2006-04-01,"David et al.",A1,2006/0073240,US,"cited by examiner",32 4 | D656296,2009/0035433,2009-02-01,"France et al.",A1,2009/0035433,US,"cited by examiner",33 5 | D656296,2011/0111105,2011-05-01,"Besse et al.",A1,2011/0111105,US,"cited by examiner",34 6 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/uspatentcitation.csv: -------------------------------------------------------------------------------- 1 | D656296,D38206,1906-08-01,Allen,S,D38206,US,"cited by examiner",0 2 | D656296,D56478,1920-10-01,Shores,S,D56478,US,"cited by examiner",1 3 | D656296,2874052,1959-02-01,Bedenk,A,2874052,US,"cited by examiner",2 4 | D656296,2905559,1959-09-01,"Anderson et al.",A,2905559,US,"cited by examiner",3 5 | D656296,D219637,1971-01-01,"Holtz et al.",S,D219637,US,"cited by examiner",4 6 | D656296,3656966,1972-04-01,"Ball et al.",A,3656966,US,"cited by examiner",5 7 | 
D656296,D269729,1983-07-01,Zonnenberg,S,D269729,US,"cited by examiner",6 8 | D656296,D293040,1987-12-01,"Gagliardi, Jr.",S,D293040,US,"cited by examiner",7 9 | D656296,4889737,1989-12-01,"Willard et al.",A,4889737,US,"cited by examiner",8 10 | D656296,D317171,1991-05-01,Saks,S,D317171,US,"cited by examiner",9 11 | D656296,5038201,1991-08-01,"Brewer et al.",A,5038201,US,"cited by examiner",10 12 | D656296,D355975,1995-03-01,"Patoskie et al.",S,D355975,US,"cited by examiner",11 13 | D656296,D357710,1995-04-01,"Novak et al.",S,D357710,US,"cited by examiner",12 14 | D656296,5518391,1996-05-01,Tashiro,A,5518391,US,"cited by examiner",13 15 | D656296,5571543,1996-11-01,"Song et al.",A,5571543,US,"cited by examiner",14 16 | D656296,D388235,1997-12-01,"Wilson et al.",S,D388235,US,"cited by examiner",15 17 | D656296,6197334,2001-03-01,Renda,B1,6197334,US,"cited by examiner",16 18 | D656296,D452360,2001-12-01,Teras,S,D452360,US,"cited by examiner",17 19 | D656296,D475451,2003-06-01,"Bell et al.",S,D475451,US,"cited by examiner",18 20 | D656296,D482736,2003-11-01,Manville,S,D482736,US,"cited by examiner",19 21 | D656296,D488611,2004-04-01,Childress,S,D488611,US,"cited by examiner",20 22 | D656296,D493271,2004-07-01,"Bhaskar et al.",S,D493271,US,"cited by examiner",21 23 | D656296,D497702,2004-11-01,"Mihalos et al.",S,D497702,US,"cited by examiner",22 24 | D656296,D505531,2005-05-01,Childress,S,D505531,US,"cited by examiner",23 25 | D656296,D506051,2005-06-01,Childress,S,D506051,US,"cited by examiner",24 26 | D656296,D540507,2007-04-01,"Aleman et al.",S,D540507,US,"cited by examiner",25 27 | D656296,D543006,2007-05-01,"Cocco et al.",S,D543006,US,"cited by examiner",26 28 | D656296,D552327,2007-10-01,"Aleman et al.",S,D552327,US,"cited by examiner",27 29 | D656296,D560538,2008-01-01,"Yuengling et al.",S,D560538,US,"cited by examiner",28 30 | D656296,D601690,2009-10-01,Hodges,S,D601690,US,"cited by examiner",29 31 | 
-------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/uspc.csv: -------------------------------------------------------------------------------- 1 | D656296,D1,D1/128,0 2 | D656296,D11,D11/06,1 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.one/usreldoc.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/integration/parse/ipg120327.one/usreldoc.csv -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/application.csv: -------------------------------------------------------------------------------- 1 | 2010/29379369,D656308,29,29379369,US,2010-11-18,, 2 | 2011/29391097,D656309,29,29391097,US,2011-05-03,, 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/citation.csv: -------------------------------------------------------------------------------- 1 | D656308,,2010-08-01,Traylor,S,D621603,US,"cited by examiner",0 2 | D656308,,2011-01-01,Barrass,S,D630009,US,"cited by examiner",1 3 | D656308,,2011-07-01,"Xu et al.",S,D641970,US,"cited by examiner",2 4 | D656308,,2011-09-01,Portney,S,D645657,US,"cited by examiner",3 5 | D656309,,1999-03-01,"Phillips et al.",S,D406191,US,"cited by examiner",0 6 | D656309,,1999-11-01,"Phillips et al.",A,5988577,US,"cited by examiner",1 7 | D656309,,2001-01-01,Lim,B1,6176401,US,"cited by examiner",2 8 | D656309,,2004-11-01,To,S,D497714,US,"cited by examiner",3 9 | D656309,,2005-05-01,"Robertson et al.",S,D504564,US,"cited by examiner",4 10 | D656309,,2006-09-01,McClaude,S,D528539,US,"cited by examiner",5 11 | D656309,,2010-02-01,Bullen,S,D609464,US,"cited by examiner",6 12 | D656309,,2011-07-01,Stampfli,S,D641974,US,"cited by 
examiner",7 13 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/claim.csv: -------------------------------------------------------------------------------- 1 | D656308,"The ornamental design for the replaceable cartridge for a pain management system, as shown and described.",,1 2 | D656309,"The ornamental design for a universal remote control accessory for a mobile device, as shown and described.",,1 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/foreigncitation.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/integration/parse/ipg120327.two/foreigncitation.csv -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/ipcr.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/integration/parse/ipg120327.two/ipcr.csv -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/mainclass.csv: -------------------------------------------------------------------------------- 1 | D3,, 2 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/otherreference.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/integration/parse/ipg120327.two/otherreference.csv -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/patent.csv: 
-------------------------------------------------------------------------------- 1 | D656308,design,D656308,US,2012-03-27,"","Replaceable cartridge for a pain management system",S1,1 2 | D656309,design,D656309,US,2012-03-27,"","Universal remote control accessory for a mobile device",S1,1 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/rawassignee.csv: -------------------------------------------------------------------------------- 1 | D656308,,minneapolis|mn|us,2,"","","OrthoCor Medical, Inc.","","",0 2 | D656309,,berlin||de,3,"","","Zero1.tv GmbH","","",0 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/rawinventor.csv: -------------------------------------------------------------------------------- 1 | D656308,,shoreview|mn|us,Kin-Joe,Sham,OMITTED,0 2 | D656309,,hamburg||de,Oliver,Renelt,OMITTED,0 3 | D656309,,berlin||de,Alexander,Gruber,OMITTED,1 4 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/rawlawyer.csv: -------------------------------------------------------------------------------- 1 | ,D656308,"","","Schwegman, Lunberg & Woessner, P.A.",UNKNOWN, 2 | ,D656309,"","","Perkins Coie LLP",UNKNOWN, 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/rawlocation.csv: -------------------------------------------------------------------------------- 1 | minneapolis|mn|us,,Minneapolis,MN,US 2 | shoreview|mn|us,,Shoreview,MN,US 3 | berlin||de,,Berlin,"",DE 4 | hamburg||de,,Hamburg,"",DE 5 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/subclass.csv: -------------------------------------------------------------------------------- 1 | D3/2031,, 2 | D3/218,, 3 | 
-------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/usapplicationcitation.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/integration/parse/ipg120327.two/usapplicationcitation.csv -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/uspatentcitation.csv: -------------------------------------------------------------------------------- 1 | D656308,D621603,2010-08-01,Traylor,S,D621603,US,"cited by examiner",0 2 | D656308,D630009,2011-01-01,Barrass,S,D630009,US,"cited by examiner",1 3 | D656308,D641970,2011-07-01,"Xu et al.",S,D641970,US,"cited by examiner",2 4 | D656308,D645657,2011-09-01,Portney,S,D645657,US,"cited by examiner",3 5 | D656309,D406191,1999-03-01,"Phillips et al.",S,D406191,US,"cited by examiner",0 6 | D656309,5988577,1999-11-01,"Phillips et al.",A,5988577,US,"cited by examiner",1 7 | D656309,6176401,2001-01-01,Lim,B1,6176401,US,"cited by examiner",2 8 | D656309,D497714,2004-11-01,To,S,D497714,US,"cited by examiner",3 9 | D656309,D504564,2005-05-01,"Robertson et al.",S,D504564,US,"cited by examiner",4 10 | D656309,D528539,2006-09-01,McClaude,S,D528539,US,"cited by examiner",5 11 | D656309,D609464,2010-02-01,Bullen,S,D609464,US,"cited by examiner",6 12 | D656309,D641974,2011-07-01,Stampfli,S,D641974,US,"cited by examiner",7 13 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/uspc.csv: -------------------------------------------------------------------------------- 1 | D656308,D3,D3/2031,0 2 | D656309,D3,D3/218,0 3 | -------------------------------------------------------------------------------- /test/integration/parse/ipg120327.two/usreldoc.csv: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/patentprocessor/416a40ce348904af2463bc97144cd4d89dc2ca6d/test/integration/parse/ipg120327.two/usreldoc.csv -------------------------------------------------------------------------------- /test/integration/parse/pa040101.two/application.csv: -------------------------------------------------------------------------------- 1 | id,type,number,country,date,abstract,title,granted,num_claims 2 | 2004/20040000002,,20040000002,US,2004-01-01,"A garment adapted for wear by persons desiring to maintain their arms in a fixed position along their upper body. The garment includes an upper portion, such as a shirt, adapted for wear on the upper body, and at least one hand portion, such as a glove. The shirt is preferably made of a loop material and the glove includes a region of hook material. Utilizing the loop and hook materials, the glove can be releasably attached to the shirt to maintain the user's hand in a fixed position along the user's upper body. The garment is primarily intended for maintaining one or both of the user's arms close against the body to facilitate relaxation while in cramped or tight positions, such as in an airplane seat. However, the garment may also be used in a wide variety of applications wherein it is desired to support one or both of the user's arms in a releasably fixed position. ","Garment for preventing muscle strain",,1 3 | -------------------------------------------------------------------------------- /test/integration/parse/pa040101.two/claim.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,text,dependent,sequence 2 | 2004/20040000002,". 
A garment for releasably securing a user's hands at a fixed location along the user's body, comprising: ",,1 3 | -------------------------------------------------------------------------------- /test/integration/parse/pa040101.two/ipcr.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,classification_level,section,subclass,main_group,subgroup,symbol_position,classification_value,classification_status,classification_data_source,action_date,ipc_version_indicator,sequence 2 | 2004/20040000002,A,,B,001,00,,,,,,,0 3 | -------------------------------------------------------------------------------- /test/integration/parse/pa040101.two/mainclass.csv: -------------------------------------------------------------------------------- 1 | id,title,text 2 | 002,, 3 | -------------------------------------------------------------------------------- /test/integration/parse/pa040101.two/rawassignee.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,assignee_id,rawlocation_id,type,name_first,name_last,organization,residence,nationality,sequence 2 | 2004/20040000002,,"",,"","","Hill-Rom Services, Inc.",,,0 3 | -------------------------------------------------------------------------------- /test/integration/parse/pa040101.two/rawinventor.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,inventor_id,rawlocation_id,name_first,name_last,nationality,sequence 2 | 2004/20040000002,,tarzana|ca|us,Scott,Hollander,US,0 3 | -------------------------------------------------------------------------------- /test/integration/parse/pa040101.two/rawlocation.csv: -------------------------------------------------------------------------------- 1 | id,location_id,city,state,country 2 | "",,"","", 3 | tarzana|ca|us,,Tarzana,CA,US 4 | -------------------------------------------------------------------------------- 
/test/integration/parse/pa040101.two/subclass.csv: -------------------------------------------------------------------------------- 1 | id,title,text 2 | 002/069000,, 3 | -------------------------------------------------------------------------------- /test/integration/parse/pa040101.two/uspc.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,mainclass_id,subclass_id,sequence 2 | 2004/20040000002,002,002/069000,0 3 | -------------------------------------------------------------------------------- /test/integration/parse/pa040101.two/usreldoc.csv: -------------------------------------------------------------------------------- 1 | uuid,application_id,rel_id,doctype,status,date,number,kind,country,relationship,sequence 2 | 2004/20040000002,9919271,relation,GRANTED,2001-07-30,9919271,"",US,parent,0 3 | 2004/20040000002,10430371,relation,GRANTED,2003-05-05,10430371,A1,"",child,1 4 | -------------------------------------------------------------------------------- /test/integration/readme.md: -------------------------------------------------------------------------------- 1 | # Integration testing for patent processing 2 | 3 | This integration directory stores known, good outputs 4 | from scripts running end to end. 5 | 6 | ### General procedure for generating a test 7 | 8 | 1. Run `preprocess.sh` on a limited, known input. 9 | 2. Export results from 1 or more inputs from 1 into csv. 10 | 3. Commit appropriate known, correct results into repo. 11 | 4. Write a wrapper script for running the integration test 12 | and checking output with (say) diff, automatically. This script 13 | could be written in sh, ruby or python, but preferably not sh. 
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
"""Build the small SQLite fixture databases used by the test suite.

All paths are relative, so the functions must be called with the ``test``
directory as the current working directory.
"""

import csv
import os
import sqlite3


def my_sane_remove_wrapper(filename):
    """Delete ``filename``, ignoring any OS-level failure.

    This is deliberately best-effort: a missing file (or any other
    ``OSError`` raised by ``os.remove``) is swallowed so callers can use it
    to clear leftovers without checking for existence first.
    """
    try:
        os.remove(filename)
    except OSError:
        pass


def remove_existing_databases():
    """Remove fixture databases left in the current directory by earlier runs."""
    my_sane_remove_wrapper("assignee.sqlite3")
    my_sane_remove_wrapper("inventor.sqlite3")
    # NOTE(review): ".sqlite4" looks like a typo for ".sqlite3"; kept as-is
    # until the real file name is confirmed.
    my_sane_remove_wrapper("hashTbl.sqlite4")


def _load_csv_fixture(db_path, schema_path, csv_path, insert_sql,
                      text_factory=None):
    """Create ``db_path`` from a schema script and fill it from a CSV file.

    Args:
        db_path: SQLite database file to create or open.
        schema_path: file holding the SQL script executed before loading.
        csv_path: CSV file whose rows are inserted; every row is inserted,
            including the first one (no header is skipped).
        insert_sql: parameterized ``INSERT`` statement run once per row.
        text_factory: optional ``text_factory`` set on the connection after
            the schema script has run.

    The schema file, the CSV file and the connection are always closed,
    even when reading or inserting fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        with open(schema_path, 'r') as schema_file:
            schema = schema_file.read()
        cursor = conn.cursor()
        cursor.executescript(schema)
        if text_factory is not None:
            conn.text_factory = text_factory
        with open(csv_path, 'r') as csvfile:
            cursor.executemany(insert_sql, csv.reader(csvfile))
        conn.commit()
    finally:
        conn.close()


def make_assignee_db():
    """Create ``assignee.sqlite3`` from the assignee schema and CSV fixture."""
    _load_csv_fixture(
        "assignee.sqlite3",
        '../schemas/assignee.sql',
        "./integration/parse/ipg120327.18/assignee.csv",
        'INSERT INTO assignee VALUES (?,?,?,?,?,?,?,?,?)',
    )


def make_inventor_db():
    """Create ``inventor.sqlite3`` from the inventor schema and CSV fixture."""
    _load_csv_fixture(
        "inventor.sqlite3",
        '../schemas/inventor.sql',
        "./integration/parse/ipg120327.18/inventor.csv",
        'INSERT INTO inventor VALUES (?,?,?,?,?,?,?,?,?,?)',
        text_factory=str,
    )
== 0 ]] ; 13 | then printf "\e[32m" ; 14 | else printf "\e[31m" ; 15 | fi 16 | cat /tmp/res 17 | done 18 | 19 | if [ -f /tmp/res ] ; 20 | then rm /tmp/res ; 21 | fi 22 | printf "\e[0m" 23 | 24 | rm -rf *.sqlite3 *.pyc 25 | rm -rf ../*.sqlite3 *.pyc 26 | -------------------------------------------------------------------------------- /test/process.cfg: -------------------------------------------------------------------------------- 1 | ## NOTE: this is a truncated version of the main 'process.cfg' file used by parse.py and start.py in the 2 | ## above directory. The following section(s) are needed for unittests 3 | # This section specifies which grant_handler is to be used for each year of the 4 | # parser. This section should only have to be touched when a new parser is 5 | # introduced. In the case where a year cannot be parsed from the filename (the 6 | # format `ipgYYMMDD` is assumed), then the default parser is used. 7 | [xml-handlers] 8 | 2005-2012=lib.handlers.grant_handler_v42 9 | 2013=lib.handlers.grant_handler_v44 10 | default=lib.handlers.grant_handler_v42 11 | -------------------------------------------------------------------------------- /test/readme.md: -------------------------------------------------------------------------------- 1 | # Test suite for patent preprocessing 2 | 3 | #### tl;dr 4 | 5 | * `./patenttest.sh` 6 | * `./parse_integration.sh` 7 | * `./clean_integration.sh` 8 | * `./consolidate_integration.sh` 9 | 10 | ---- 11 | 12 | 13 | We intend to conform to PEP guidelines; 14 | please note where the implementation does not meet 15 | a relevant PEP. 16 | 17 | Currently (January 30, 2013), we're running unit tests and integration 18 | tests. We do not have full coverage for unit tests. Unit tests are being 19 | constructed as part of the refactoring process, and all new code should 20 | be covered in unit tests. 
21 | 22 | Integration tests will run end-to-end on the parsing, cleaning and consolidation 23 | phases, but the current data sets used in the integration tests are 24 | incomplete. Further, the location handling does not work correctly, so 25 | the integration test covering geocoding is broken by design. 26 | 27 | ## Running unit tests 28 | 29 | Unit tests are constructed for two specific reasons: 30 | 31 | 1. Prevent regression as code base is refactored, and 32 | 2. Ensure extensions to the current code work correctly. 33 | 34 | A general explanation of either refactoring or unit testing new code is 35 | beyond the scope of this readme. File an enhancement request with 36 | specific questions you would like to have answered in this readme. 37 | 38 | The unit tests are invoked automatically in the `./patenttest.sh` 39 | script. 40 | 41 | 42 | ### PATENTROOT 43 | 44 | Not having `PATENTROOT` set will produce this warning notice: 45 | 46 | ```sh 47 | Processing test_parse_config.py file.. 48 | Cannot find PATENTROOT environment variable. Setting PATENTROOT to the 49 | patentprocessor directory for the scope of this test. Use `export 50 | PATENTROOT=/path/to/directory` to change 51 | ``` 52 | 53 | This is easy to silence: `$ export PATENTROOT=.` 54 | 55 | You may want to export `PATENTROOT` in your shell initialization script 56 | for convenience. 57 | 58 | 59 | ## Running integration tests 60 | 61 | Integration testing for the patent preprocessor simulates running both 62 | preprocessor components and the entire preprocessor on a limited set of 63 | patent data. The goal is ensuring that for a given input, the output 64 | doesn't change from run to run as the code continues development. 65 | 66 | The integration tests require two types of databases: 67 | 68 | 1. A set of sqlite databases located in the test directory as a result 69 | of a successful parse, and 70 | 2. 
Databases `loctbl` and `NBER_asg` linked from elsewhere like so: 71 | * `ln -s /data/patentdata/NBER/NBER_asg .` 72 | * `ln -s /data/patentdata/location/loctbl.sqlite3 loctbl` 73 | (Your links may be different.) 74 | 75 | The databases mentioned in item 1 are constructed during the 76 | preprocessing, and require no initial setup. 77 | 78 | The databases mentioned in item 2 are used in the cleaning phase of the 79 | preprocessor. 80 | 81 | Fung Institute developers have access to both `loctbl` and `NBER_asg` on 82 | the server. These are read-only on the server, and should be copied into 83 | user's home areas with the soft links adjusted appropriately. 84 | 85 | External developers and other interested parties can download: 86 | 87 | * [loctbl](https://s3-us-west-1.amazonaws.com/fidownloads/loctbl.sqlite3) 88 | * [NBER_asg](https://s3-us-west-1.amazonaws.com/fidownloads/NBER_asg) 89 | 90 | Note: the integration tests pass, that is, run correctly, for data we 91 | know is not 100% correct. However, these tests allow evolving the code 92 | to correctness incrementally. 93 | 94 | 95 | 96 | #### Test speed 97 | 98 | The integration tests require correctly indexed tables to operate 99 | efficiently. The run time difference is roughly 5 minutes for each test 100 | over the geocoding with unindexed tables, versus about 6 seconds for 101 | correctly indexed tables. 
#!/usr/bin/env python
"""Unit tests for constructing and minimally using the ``SQLite`` wrapper."""

import unittest
import sys
import sqlite3
sys.path.append('../')
sys.path.append('../lib')
import SQLite

# TODO: Get a database connection for testing merge


def create_connections():
    """Open two independent in-memory SQLite connections.

    Returns:
        A ``(conn1, conn2)`` tuple of ``sqlite3.Connection`` objects.
    """
    return sqlite3.connect(':memory:'), sqlite3.connect(':memory:')


def close_connections(*connections):
    """Close every connection passed in; calling with no arguments is a no-op."""
    for connection in connections:
        connection.close()


def create_assignee_schema(cursor):
    """Create the ``assignee`` table and its unique index through ``cursor``.

    The script is idempotent (``IF NOT EXISTS`` / ``IF EXISTS``) and also
    drops a fixed list of secondary indexes if they are present.
    """
    cursor.executescript("""
        CREATE TABLE IF NOT EXISTS assignee (
            Patent VARCHAR(8), AsgType INTEGER, Assignee VARCHAR(30),
            City VARCHAR(10), State VARCHAR(2), Country VARCHAR(2),
            Nationality VARCHAR(2), Residence VARCHAR(2), AsgSeq INTEGER);
        CREATE UNIQUE INDEX IF NOT EXISTS uqAsg ON assignee (Patent, AsgSeq);
        DROP INDEX IF EXISTS idx_pata;
        DROP INDEX IF EXISTS idx_patent;
        DROP INDEX IF EXISTS idx_asgtyp;
        DROP INDEX IF EXISTS idx_stt;
        DROP INDEX IF EXISTS idx_cty;
        """)


def initialize_assignees(conn):
    """Insert one sample assignee row through ``conn`` and commit.

    ``INSERT OR IGNORE`` makes the call repeatable: a second insert of the
    same ``(Patent, AsgSeq)`` pair is dropped by the unique index.
    """
    q = ('D0656296',2,'Frito-Lay North America, Inc.','Plano','TX','US','','',0)
    conn.cursor().execute("""INSERT OR IGNORE INTO assignee VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", q)
    conn.commit()


class TestSQLite(unittest.TestCase):
    """Checks the ``db``/``tbl`` attributes set by the ``SQLite`` constructor."""

    def setUp(self):
        # Per-test fixtures belong on the instance, not on the class.
        self.conn1, self.conn2 = create_connections()

    def tearDown(self):
        close_connections(self.conn1, self.conn2)

    def test_constructor_empty(self):
        s = SQLite.SQLite()
        self.assertEqual(':memory:', s.db)
        self.assertEqual('main', s.tbl)

    def test_constructor_dbname(self):
        s = SQLite.SQLite(db='foobar.sqlite3')
        self.assertEqual('foobar.sqlite3', s.db)
        self.assertEqual('main', s.tbl)

    def test_constructor_dbname_tbl(self):
        s = SQLite.SQLite(db='foobar.sqlite3', tbl='tbl_foo')
        self.assertEqual('foobar.sqlite3', s.db)
        self.assertEqual('tbl_foo', s.tbl)

    def test_constructor_dbname_tbl_table(self):
        # When both are given, ``tbl`` is expected to win over ``table``.
        s = SQLite.SQLite(db='foobar.sqlite3', tbl='tbl_foo', table='table_foo')
        self.assertEqual('foobar.sqlite3', s.db)
        self.assertEqual('tbl_foo', s.tbl)

    def test_constructor_dbname_table(self):
        s = SQLite.SQLite(db='foobar.sqlite3', table='table_foo')
        self.assertEqual('foobar.sqlite3', s.db)
        self.assertEqual('table_foo', s.tbl)

#    def test_merge(self):
#        s = SQLite.SQLite()
#        s.merge(key=[['AsgNum', 'pdpass']], on=[['assigneeAsc', 'assignee']],
#                keyType=['INTEGER'], tableFrom='main', db='db')
#        assert(1 == 1)

    def test_index(self):
        # Smoke test only: passes when creating the schema and inserting the
        # sample row raise no exception.
        s = SQLite.SQLite('test.sqlite3')
        create_assignee_schema(s.c)
        initialize_assignees(s.conn)


if __name__ == '__main__':
    unittest.main()
# -*- coding: utf-8 -*-
# Mac OS keystrokes for the accented characters exercised below
# (reference: http://kb.iu.edu/data/anhf.html):
#
#   Keystroke                      Character
#   Option-e [letter]              acute (e.g., á)
#   Option-` [letter]              grave (e.g., è)
#   Option-i [letter]              circumflex (e.g., ô )
#   Option-u [letter]              umlaut or dieresis (e.g., ï )
#   Option-n [letter]              tilde (e.g., ñ )
#   Option-q                       oe ligature ( œ )
#   Option-c                       cedilla ( ç )
#   Option-Shift-/ (forward slash) upside-down question mark ( ¿ )
#   Option-1 (the number 1)        upside-down exclamation point ( ¡ )

import unittest
import sys
sys.path.append( '.' )
sys.path.append( '../lib/' )
from fwork import ascit
from fwork import remspace


class TestAscit(unittest.TestCase):
    """Expected outputs of ``ascit`` (and ``remspace``) for sample inputs.

    ``assertEqual`` is used instead of bare ``assert`` so failures report
    both values and the checks still run under ``python -O``.
    """

    def setUp(self):
        self.foo = 'bar'

    def test_toupper(self):
        self.assertEqual('FOO', ascit('FOO'))

    def test_retain_acute_verite(self):
        self.assertEqual('verité', ascit('verité'))

    def test_retain_acute(self):
        self.assertEqual('é', ascit('é'))

    def test_retain_grave(self):
        self.assertEqual('è', ascit('è'))

    def test_retain_circumflex(self):
        self.assertEqual('ô', ascit('ô'))

    def test_retain_umlaut(self):
        self.assertEqual('ü', ascit('ü'))

    def test_retain_tilde(self):
        self.assertEqual('ñ', ascit('ñ'))

    def test_retain_oeligature(self):
        self.assertEqual('œ', ascit('œ'))

    def test_retain_cedilla(self):
        self.assertEqual('ç', ascit('ç'))

    def test_retain_usdq(self):
        self.assertEqual('¿', ascit('¿'))

    def test_int(self):
        self.assertEqual('1', ascit('1'))

    def test_float(self):
        # Default strict=True removes periods.
        result = ascit('1.0', strict=False)
        self.assertEqual('1.0', result)

    def test_remove_period(self):
        self.assertEqual('10', ascit('1.0', strict=True))

    def test_remove_ampersand(self):
        self.assertEqual('foobar', ascit('foo&bar', strict=True))

    def test_remove_punctuation(self):
        self.assertEqual('foobar', ascit('f+=_oo@b!#$%^&*(){}ar', strict=True))

    def test_remove_space_plus(self):
        self.assertEqual('', ascit(' +', strict=True))

    def test_remove_spaces(self):
        self.assertEqual('foobar', ascit('foobar'))

    def test_remove_duplicates(self):
        self.assertEqual('foo bar', ascit('foo, |||,,, ,, |,,, bar'))

    def test_remove_braces(self):
        self.assertEqual('', ascit('{foo bar}', strict=True))

    def test_remspace(self):
        self.assertEqual('foobar', remspace('foo bar'))

    def test_remove_parentheses(self):
        self.assertEqual('', ascit('(foo bar)', strict=True))

    def test_remove_period_between_words(self):
        # Previously also named ``test_remove_period``; the duplicate name
        # replaced the earlier method, so that earlier test never ran.
        self.assertEqual('hello there', ascit('hello. there'))
        self.assertEqual('hello there', ascit('hello. there',strict =True))

    def test_remove_comma(self):
        self.assertEqual('hello there', ascit('hello, there'))
        self.assertEqual('hello there', ascit('hello, there',strict =True))


if __name__ == '__main__':
    unittest.main()
#!/usr/bin/env python

# `fwork.py` will probably be renamed to something a little
# more suggestive as to its purpose.

import unittest
import sys

sys.path.append( '.' )
sys.path.append( '../lib/' )

#import imp
#from yaml import load, dump

from fwork import *


class TestFWork(unittest.TestCase):
    """Tests for helpers star-imported from ``fwork``.

    Covers ``ascit``, ``get_ctypes``, ``is_real``, ``text_type``,
    ``have_schema_type`` and ``quickSQL2``.
    """

    def removeFile(self, file):
        """Delete ``file`` if it can be removed; otherwise do nothing.

        Uses ``os.remove`` rather than shelling out to ``rm`` so file names
        with spaces or shell metacharacters are handled safely.
        """
        # Imported locally because this module never imports ``os`` itself;
        # it would otherwise only be available if ``from fwork import *``
        # happens to export it.
        import os
        try:
            os.remove(file)
        except OSError:
            # Missing file (or not a regular file): nothing to clean up.
            pass

    def setUp(self):
        self.foo = 'bar'

    def test_dummy(self):
        self.assertEqual(1, 1)

    def test_int(self):
        self.assertEqual('1', ascit('1'))

    def test_float(self):
        # Default strict=True removes periods.
        result = ascit('1.0', strict=False)
        self.assertEqual('1.0', result)

    def test_remove_period(self):
        self.assertEqual('10', ascit('1.0', strict=True))

    def test_retain_hyphen(self):
        self.assertEqual('KIN-JOE', ascit('KIN-JOE', strict=True))

    def test_get_ctypes(self):
        self.assertEqual('VARCHAR', get_ctypes("FOO"))
        self.assertEqual('REAL', get_ctypes(4.2))
        self.assertEqual('INTEGER', get_ctypes(42))

    def get_quicksql_data(self):
        """Return a header row followed by two sample data rows."""
        return [
            [u'UniqueID', u'Patent', u'Lastname', u'Firstname'],
            [u'1', u'0.8194655', u'PISTER', u'KRISTOPHER S J'],
            [u'1', u'0.8190055', u'PISTER', u'KRISTOPHER S J']
        ]

    def test_is_real(self):
        data = u'0.1234'
        self.assertEqual(1, is_real(data))
        data = u'01234'
        self.assertEqual(0, is_real(data))

    def get_typelist(self):
        """Return the explicit column-type overrides used by the schema tests."""
        return [u'Lastname INTEGER']

    def test_text_type(self):
        data = 'foo'
        self.assertEqual(True, text_type(data))
        data = 123
        self.assertEqual(False, text_type(data))
        data = 1.23
        self.assertEqual(False, text_type(data))
        data = u'123'
        self.assertEqual(True, text_type(data))
        data = '1.23'
        self.assertEqual(True, text_type(data))

    def test_create_column_labels(self):
        self.assertEqual(1, 1)

    def test_have_schema_type(self):
        tl = self.get_typelist()
        self.assertEqual(-1, have_schema_type(tl, 'UNIQUEID'))
        self.assertEqual(-1, have_schema_type(tl, 'FIRSTNAME'))

    def test_quickSQL2(self):
        # Smoke test: passes when ``quickSQL2`` loads the sample rows
        # without raising.
        import sqlite3
        dbfilename = "fwork.sqlite3"
        self.removeFile(dbfilename)
        self.conn = sqlite3.connect(dbfilename)
        try:
            self.cursor = self.conn.cursor()
            data = self.get_quicksql_data()
            typelist = self.get_typelist()
            quickSQL2(self.cursor, data, table="test", header=True, typeList=typelist)
            self.conn.commit()
            self.cursor.close()
        finally:
            # The connection used to be left open after the test.
            self.conn.close()


if __name__ == '__main__':
    unittest.main()
= conn.cursor() 22 | c.executescript(""" 23 | CREATE TABLE test (a, B, c); 24 | CREATE TABLE main (d, E, f); 25 | INSERT INTO test VALUES ({data}); 26 | INSERT INTO main VALUES ({data}); 27 | CREATE INDEX idx ON test (a); 28 | CREATE INDEX idy ON test (a, b); 29 | """.format(data=data)) #""" 30 | conn.commit() 31 | c.close() 32 | conn = sqlite3.connect(fname) 33 | elif fname.split(".")[-1] == "csv" or ftype == "csv": 34 | os.system("echo '{data}' >> {fname}".\ 35 | format(data=data, fname=fname)) 36 | 37 | def setUp(self): 38 | self.removeFile("test.db") 39 | self.removeFile("test.csv") 40 | self.removeFile("test2.db") 41 | self.removeFile("test2.csv") 42 | # create a really basic dataset 43 | self.createFile(fname="test.db") 44 | self.s = SQLite.SQLite(db="test.db", tbl="test") 45 | self.createFile("test2.db") 46 | s = SQLite.SQLite("test2.db", tbl="test") 47 | self.s.attach(s) 48 | 49 | def tearDown(self): 50 | self.s.close() 51 | self.removeFile("test.db") 52 | self.removeFile("test.csv") 53 | self.removeFile("test2.db") 54 | self.removeFile("test2.csv") 55 | self.removeFile("errlog") 56 | 57 | def test_keyList(self): 58 | #key = self.s._keyList('foo', kwargs={'tbl': 'main'}) 59 | #print "key from test: ", key 60 | #key = self.s._keyList('foo', kwargs={"keys": ['bar', 'baz'], 'tbl': 'main'}) 61 | #print "key from test: ", key 62 | #key = self.s._keyList('foo', kwargs={"keys": 'bar', 'tbl': 'main'}) 63 | #print "key from test: ", key 64 | #key = self.s._keyList('foo', keys={"bar": 'baz'}) 65 | #print "key from test: ", key 66 | key = self.s._keyList('foo', keys={"bar",'baz'}) 67 | print "key from test: ", key 68 | print "key[0] from test: ", key[0] 69 | 70 | self.assertEquals(1,1) 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | 76 | -------------------------------------------------------------------------------- /test/test_parse_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 
import os
import sys
import unittest
import logging
import re
try:
    from collections.abc import Iterable
except ImportError:  # Python 2: the ABCs live directly in ``collections``
    from collections import Iterable

sys.path.append('../')
import parse
import lib.handlers.grant_handler_v42 as grant_handler_v42

basedir = os.path.dirname(__file__)
testdir = os.path.join(basedir, './fixtures/xml/')
testfileone = 'ipg120327.one.xml'
testfiletwo = 'ipg120327.two.xml'
# Matches one complete XML document: declaration, DOCTYPE, and everything up
# to the closing tag named after the DOCTYPE root element.
regex = re.compile(r"""([<][?]xml version.*?[>]\s*[<][!]DOCTYPE\s+([A-Za-z-]+)\s+.*?/\2[>])""", re.S+re.I)


class TestParseFile(unittest.TestCase):
    """Tests for ``parse.extract_xml_strings``, ``parse_files`` and ``list_files``."""

    def _assert_xml_documents(self, documents, expected_count):
        """Check ``documents`` holds ``expected_count`` valid entries.

        Each entry must be a tuple whose second item is an XML string
        matching ``regex``.
        """
        self.assertEqual(expected_count, len(documents))
        for document in documents:
            self.assertIsInstance(document, tuple)
            self.assertIsInstance(document[1], str)
            self.assertTrue(regex.match(document[1]))

    def test_extract_xml_strings_one(self):
        parsed_output = parse.extract_xml_strings(testdir+testfileone)
        self.assertIsInstance(parsed_output, list)
        self._assert_xml_documents(parsed_output, 1)

    def test_parse_files_one(self):
        filelist = [testdir+testfileone]
        parsed_output = parse.parse_files(filelist)
        self.assertIsInstance(parsed_output, Iterable)
        self._assert_xml_documents(list(parsed_output), 1)

    def test_extract_xml_strings_two(self):
        parsed_output = parse.extract_xml_strings(testdir+testfiletwo)
        self.assertIsInstance(parsed_output, Iterable)
        self._assert_xml_documents(list(parsed_output), 2)

    def test_parse_files_two(self):
        filelist = [testdir+testfiletwo]
        parsed_output = parse.parse_files(filelist)
        self.assertIsInstance(parsed_output, Iterable)
        self._assert_xml_documents(list(parsed_output), 2)

    def test_use_parse_files_one(self):
        filelist = [testdir+testfileone]
        parsed_output = list(parse.parse_files(filelist))
        patobj = grant_handler_v42.PatentGrant(parsed_output[0][1], True)
        self.assertTrue(patobj)

    def test_use_parse_files_two(self):
        filelist = [testdir+testfiletwo]
        parsed_output = parse.parse_files(filelist)
        for us_patent_grant in parsed_output:
            self.assertIsInstance(us_patent_grant, tuple)
            self.assertIsInstance(us_patent_grant[1], str)
            patobj = grant_handler_v42.PatentGrant(us_patent_grant[1], True)
            self.assertTrue(patobj)

    def test_list_files(self):
        testdir = os.path.join(basedir, './fixtures/xml')
        xmlregex = r'ipg120327.one.xml'
        files = parse.list_files(testdir, xmlregex)
        self.assertIsInstance(files, list)
        self.assertEqual(1, len(files))
        # The old ``all(filter(isinstance...))`` check dropped non-string
        # entries before testing them, so it could never fail on one.
        self.assertTrue(all(isinstance(name, str) for name in files))
        self.assertTrue(all(os.path.exists(name) for name in files))


if __name__ == '__main__':
    unittest.main()
#!/usr/bin/env python

import unittest
import os
import sqlite3
import sys
sys.path.append( '../lib/' )
import SQLite


class TestSQLite(unittest.TestCase):
    """Tests for the index helpers of the ``SQLite`` wrapper.

    Covers ``indexes``, ``_baseIndex`` and ``index``.
    """

    def removeFile(self, fname):
        """Delete ``fname`` if it exists; a missing file is not an error."""
        try:
            os.remove(fname)
        except OSError:
            pass

    def createFile(self, file, type=None, data="1,2,3"):
        """Create a small fixture file.

        Args:
            file: path to create; the extension selects the format unless
                ``type`` overrides it.
            type: ``"db"`` or ``"csv"`` to force the format.
            data: comma-separated values, used as the single row of both
                tables (db) or as the appended line (csv).
        """
        if file.split(".")[-1] == "db" or type == "db":
            connection = sqlite3.connect(file)
            try:
                cursor = connection.cursor()
                cursor.executescript("""
                    CREATE TABLE test (a, B, cursor);
                    CREATE TABLE main (d, E, f);
                    INSERT INTO test VALUES ({data});
                    INSERT INTO main VALUES ({data});
                    CREATE INDEX idx ON test (a);
                    CREATE INDEX idy ON test (a, b);
                    """.format(data=data))
                connection.commit()
                cursor.close()
            finally:
                # Previously the connection was never closed and a second,
                # unused one was opened and leaked.
                connection.close()
        elif file.split(".")[-1] == "csv" or type == "csv":
            # Append one line, as ``echo '<data>' >> <file>`` did, without
            # going through a shell.
            with open(file, "a") as handle:
                handle.write(data + "\n")

    def setUp(self):
        self.removeFile("test.db")
        self.removeFile("test.csv")
        self.removeFile("test2.db")
        self.removeFile("test2.csv")
        # create a really basic dataset
        self.createFile(file="test.db")
        self.s = SQLite.SQLite(db="test.db", tbl="test")
        self.createFile("test2.db")
        # NOTE(review): this second wrapper is never closed explicitly;
        # confirm whether ``attach`` takes ownership of it.
        s = SQLite.SQLite("test2.db", tbl="test")
        self.s.attach(s)

    def tearDown(self):
        self.s.close()
        self.removeFile("test.db")
        self.removeFile("test.csv")
        self.removeFile("test2.db")
        self.removeFile("test2.csv")
        self.removeFile("errlog")

    def test_indexes(self):
        self.assertIn('idx', self.s.indexes())
        self.assertTrue(self.s.indexes(lookup="idx"))
        self.assertFalse(self.s.indexes(lookup="xdi"))
        self.assertEqual([0,0], self.s.indexes(seq="xdi"))
        self.assertEqual([1,1], self.s.indexes(seq="idx"))
        self.s.c.executescript("""
            CREATE INDEX idx1 ON test (b);
            CREATE INDEX idx2 ON test (cursor);
            CREATE INDEX idx5x3 ON test (a);
            CREATE INDEX idx10x ON test (a);
            """)
        self.assertEqual([1,3], self.s.indexes(seq="idx"))

    def test__baseIndex(self):
        # Order-insensitive comparison that works on Python 2 and 3
        # (``assertItemsEqual`` exists only on Python 2).
        self.assertEqual(sorted(['test (a)', 'test (a,b)']),
            sorted(self.s._baseIndex(db="db")))
        self.assertEqual('test (a)',
            self.s._baseIndex(idx="idx"))
        self.assertEqual('foo (bar,foo)',
            self.s._baseIndex(idx="create index x on foo (foo, bar)"))
        self.assertEqual('unique foo (foo)',
            self.s._baseIndex(idx="create unique index x on foo (foo)"))

    def test_index(self):
        self.s.index([['a','cursor']])
        self.assertIn('test (a,cursor)', self.s._baseIndex())

        self.s.index('a', unique=True)
        self.assertIn('test (a)', self.s._baseIndex())
        self.assertFalse(self.s.index(['a','cursor']))

        self.s.index('f', tbl="main")
        self.assertIn('main (f)', self.s._baseIndex())
        self.assertFalse(self.s.index('a', tbl="main"))

        # TODO: re-enable the disabled checks for ``combo=True`` on table
        # "main" and for indexing through the attached database (db="db").


if __name__ == '__main__':
    unittest.main()
self.removeFile("test.db") 41 | self.removeFile("test.csv") 42 | self.removeFile("test2.db") 43 | self.removeFile("test2.csv") 44 | # create a really basic dataset 45 | self.createFile(file="test.db") 46 | self.s = SQLite.SQLite(db="test.db", tbl="test") 47 | self.createFile("test2.db") 48 | s = SQLite.SQLite("test2.db", tbl="test") 49 | self.s.attach(s) 50 | 51 | def tearDown(self): 52 | self.s.close() 53 | self.removeFile("test.db") 54 | self.removeFile("test.csv") 55 | self.removeFile("test2.db") 56 | self.removeFile("test2.csv") 57 | self.removeFile("errlog") 58 | 59 | 60 | def test_indexes(self): 61 | self.assertIn('idx', self.s.indexes()) 62 | self.assertTrue(self.s.indexes(lookup="idx")) 63 | self.assertFalse(self.s.indexes(lookup="xdi")) 64 | self.assertEquals([0,0], self.s.indexes(seq="xdi")) 65 | self.assertEquals([1,1], self.s.indexes(seq="idx")) 66 | self.s.c.executescript(""" 67 | CREATE INDEX idx1 ON test (b); 68 | CREATE INDEX idx2 ON test (cursor); 69 | CREATE INDEX idx5x3 ON test (a); 70 | CREATE INDEX idx10x ON test (a); 71 | """) 72 | self.assertEquals([1,3], self.s.indexes(seq="idx")) 73 | 74 | def test__baseIndex(self): 75 | self.assertItemsEqual(['test (a)', 'test (a,b)'], 76 | self.s._baseIndex(db="db")) 77 | self.assertEqual('test (a)', 78 | self.s._baseIndex(idx="idx")) 79 | self.assertEqual('foo (bar,foo)', 80 | self.s._baseIndex(idx="create index x on foo (foo, bar)")) 81 | self.assertEqual('unique foo (foo)', 82 | self.s._baseIndex(idx="create unique index x on foo (foo)")) 83 | 84 | 85 | def test_index(self): 86 | self.s.index([['a','cursor']]) 87 | self.assertIn('test (a,cursor)', self.s._baseIndex()) 88 | self.s.index('a', unique=True) 89 | self.assertIn('test (a)', self.s._baseIndex()) 90 | self.assertFalse(self.s.index(['a','cursor'])) 91 | self.s.index('f', tbl="main") 92 | self.assertIn('main (f)', self.s._baseIndex()) 93 | self.assertFalse(self.s.index('a', tbl="main")) 94 | #self.s.index(['e', 'f'], combo=True, tbl="main") 95 | 
#self.assertIn('main (e)', self.s._baseIndex(tbl="main")) 96 | #self.assertIn('main (e,f)', self.s._baseIndex(tbl="main")) 97 | 98 | self.s.index([['a','cursor']], db="db") 99 | self.assertIn('test (a,cursor)', self.s._baseIndex(db="db")) 100 | 101 | # def test_merge(self): 102 | # s = SQLite.SQLite() 103 | # s.merge(key=[['AsgNum', 'pdpass']], on=[['assigneeAsc', 'assignee']], 104 | # keyType=['INTEGER'], tableFrom='main', db='db') 105 | # assert(1 == 1) 106 | 107 | 108 | if __name__ == '__main__': 109 | unittest.main() 110 | 111 | -------------------------------------------------------------------------------- /test/test_xml_driver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import re 5 | import sys 6 | import unittest 7 | from xml.sax import make_parser, handler 8 | from cgi import escape as html_escape 9 | 10 | sys.path.append('../lib/handlers/') 11 | from xml_driver import XMLElement, XMLHandler 12 | 13 | # Directory of test files 14 | basedir = os.curdir 15 | testdir = os.path.join(basedir, 'fixtures/xml/') 16 | 17 | class Test_XMLElement_Basic(unittest.TestCase): 18 | # Tests attribute-style access on the root that XMLHandler builds from fixtures/xml/basic.xml 19 | def setUp(self): 20 | # setup basic.xml parser/handler 21 | xmlhandler = XMLHandler() 22 | parser = make_parser() 23 | parser.setContentHandler(xmlhandler) 24 | parser.setFeature(handler.feature_external_ges, False) 25 | parser.parse(testdir+'basic.xml') 26 | self.assertTrue(xmlhandler.root) 27 | self.root = xmlhandler.root 28 | # Expected len() for each tag path in basic.xml; assertEqual reports the actual count on failure 29 | def test_basic_xml_tag_counts(self): 30 | self.assertEqual(len(self.root.a), 1) 31 | self.assertEqual(len(self.root.a.b), 2) 32 | self.assertEqual(len(self.root.a.b.c), 3) 33 | self.assertEqual(len(self.root.a.b.d), 2) 34 | self.assertEqual(len(self.root.a.c), 3) 35 | # get_content() of individual elements; each failure message shows actual vs expected text 36 | def test_basic_xml_tag_contents(self): 37 | self.assertTrue(self.root.a.b.c[0].get_content() == 'HELLO', \ 38 | "{0} should be {1}".format(self.root.a.b.c[0].get_content(), 'HELLO')) 39 | 
self.assertTrue(self.root.a.b.c[1].get_content() == 'WORLD', \ 40 | "{0} should be {1}".format(self.root.a.b.c[1].get_content(), 'WORLD')) 41 | self.assertTrue(self.root.a.b.c[2].get_content() == '3', \ 42 | "{0} should be {1}".format(self.root.a.b.c[2].get_content(), '3')) 43 | self.assertTrue(self.root.a.b.d[0].get_content() == '1', \ 44 | "{0} should be {1}".format(self.root.a.b.c[0].get_content(), '1')) 45 | self.assertTrue(self.root.a.b.d[1].get_content() == '2', \ 46 | "{0} should be {1}".format(self.root.a.b.c[1].get_content(), '2')) 47 | 48 | def test_basic_xml_contents_of(self): 49 | self.assertTrue(self.root.a.b.contents_of('c') == ['HELLO','WORLD','3']) 50 | self.assertTrue(self.root.a.b[0].contents_of('c') == ['HELLO','WORLD']) 51 | 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /vm/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | $script = <