├── .gitignore ├── discardedDomains.txt ├── lib ├── edu.mit.jwi_2.4.0.jar ├── jep-3.6.3.jar ├── jwi-license └── libjep.jnilib ├── license.txt ├── project ├── assembly.sbt └── build.properties ├── readme.md ├── readme_classifier1.md ├── readme_inferGrammar.md ├── readme_run_spark_on_amazon_ec2.md └── src ├── main ├── mysql │ └── classifier1 │ │ ├── pages_postprocess.sql │ │ └── prepare_db.sql ├── python │ ├── example_mdr_extract.py │ ├── mdr │ │ ├── __init__.py │ │ ├── __javascript__ │ │ │ └── __init__.mod.js │ │ ├── _tree.py │ │ ├── mdr.py │ │ ├── tree.py │ │ └── utils.py │ ├── mdr_extract.py │ ├── test.py │ └── utils.py ├── resources │ └── logback.xml └── scala │ ├── iproduct │ ├── CrawlerFilterConfig.scala │ ├── ExtractPatentNumbersFromWarc.scala │ ├── FilterArchivePatents.scala │ ├── classifier1 │ │ ├── BuildDomainStats.scala │ │ ├── BuildPages.scala │ │ ├── ImportNewUrlsToTrainingCorpus.scala │ │ ├── ImportUrlRawLabelsToMysql.scala │ │ ├── Page.scala │ │ ├── TrainClassifierAndPredict.scala │ │ └── Utils.scala │ ├── tools │ │ └── ExportPagesFromWarc.scala │ └── utils │ │ ├── CacheUtils.scala │ │ ├── DatabaseUtils.scala │ │ ├── EnvUtils.scala │ │ ├── FileUtils.scala │ │ ├── FilterArchive.scala │ │ ├── HttpUtils.scala │ │ ├── NLPUtils.scala │ │ ├── SparkUtils.scala │ │ ├── URLUtils.scala │ │ ├── Utils.scala │ │ ├── WarcReader.scala │ │ ├── WarcReaderExample.scala │ │ └── WordNetUtils.scala │ └── playground │ └── inferGrammar │ ├── TestMDR.scala │ └── utils │ └── XmlUtils.scala └── test └── scala └── iproduct ├── HttpUtilsTest.scala └── PatentNumberRegexTest.scala /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/.gitignore -------------------------------------------------------------------------------- /discardedDomains.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/discardedDomains.txt -------------------------------------------------------------------------------- /lib/edu.mit.jwi_2.4.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/lib/edu.mit.jwi_2.4.0.jar -------------------------------------------------------------------------------- /lib/jep-3.6.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/lib/jep-3.6.3.jar -------------------------------------------------------------------------------- /lib/jwi-license: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/lib/jwi-license -------------------------------------------------------------------------------- /lib/libjep.jnilib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/lib/libjep.jnilib -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/license.txt -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/project/assembly.sbt -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.15 2 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/readme.md -------------------------------------------------------------------------------- /readme_classifier1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/readme_classifier1.md -------------------------------------------------------------------------------- /readme_inferGrammar.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/readme_inferGrammar.md -------------------------------------------------------------------------------- /readme_run_spark_on_amazon_ec2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/readme_run_spark_on_amazon_ec2.md -------------------------------------------------------------------------------- /src/main/mysql/classifier1/pages_postprocess.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/mysql/classifier1/pages_postprocess.sql -------------------------------------------------------------------------------- /src/main/mysql/classifier1/prepare_db.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/mysql/classifier1/prepare_db.sql -------------------------------------------------------------------------------- /src/main/python/example_mdr_extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/example_mdr_extract.py -------------------------------------------------------------------------------- /src/main/python/mdr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/mdr/__init__.py -------------------------------------------------------------------------------- /src/main/python/mdr/__javascript__/__init__.mod.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/mdr/__javascript__/__init__.mod.js -------------------------------------------------------------------------------- /src/main/python/mdr/_tree.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/mdr/_tree.py -------------------------------------------------------------------------------- /src/main/python/mdr/mdr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/mdr/mdr.py -------------------------------------------------------------------------------- /src/main/python/mdr/tree.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/mdr/tree.py -------------------------------------------------------------------------------- /src/main/python/mdr/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/mdr/utils.py -------------------------------------------------------------------------------- /src/main/python/mdr_extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/mdr_extract.py -------------------------------------------------------------------------------- /src/main/python/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/test.py -------------------------------------------------------------------------------- /src/main/python/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/python/utils.py -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/resources/logback.xml -------------------------------------------------------------------------------- /src/main/scala/iproduct/CrawlerFilterConfig.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/CrawlerFilterConfig.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/ExtractPatentNumbersFromWarc.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/ExtractPatentNumbersFromWarc.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/FilterArchivePatents.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/FilterArchivePatents.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/classifier1/BuildDomainStats.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/classifier1/BuildDomainStats.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/classifier1/BuildPages.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/classifier1/BuildPages.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/classifier1/ImportNewUrlsToTrainingCorpus.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/classifier1/ImportNewUrlsToTrainingCorpus.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/classifier1/ImportUrlRawLabelsToMysql.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/classifier1/ImportUrlRawLabelsToMysql.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/classifier1/Page.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/classifier1/Page.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/classifier1/TrainClassifierAndPredict.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/classifier1/TrainClassifierAndPredict.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/classifier1/Utils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/classifier1/Utils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/tools/ExportPagesFromWarc.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/tools/ExportPagesFromWarc.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/CacheUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/CacheUtils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/DatabaseUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/DatabaseUtils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/EnvUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/EnvUtils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/FileUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/FileUtils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/FilterArchive.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/FilterArchive.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/HttpUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/HttpUtils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/NLPUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/NLPUtils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/SparkUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/SparkUtils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/URLUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/URLUtils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/Utils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/Utils.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/WarcReader.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/WarcReader.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/WarcReaderExample.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/WarcReaderExample.scala -------------------------------------------------------------------------------- /src/main/scala/iproduct/utils/WordNetUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/iproduct/utils/WordNetUtils.scala -------------------------------------------------------------------------------- /src/main/scala/playground/inferGrammar/TestMDR.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/playground/inferGrammar/TestMDR.scala -------------------------------------------------------------------------------- /src/main/scala/playground/inferGrammar/utils/XmlUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/main/scala/playground/inferGrammar/utils/XmlUtils.scala -------------------------------------------------------------------------------- /src/test/scala/iproduct/HttpUtilsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/test/scala/iproduct/HttpUtilsTest.scala -------------------------------------------------------------------------------- /src/test/scala/iproduct/PatentNumberRegexTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iproduct-database/vpm-filter-spark/HEAD/src/test/scala/iproduct/PatentNumberRegexTest.scala --------------------------------------------------------------------------------