├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── README.md ├── pom.xml ├── vis ├── crawl-sites │ ├── README.md │ ├── data.csv │ ├── index.html │ ├── process.py │ └── raw.txt ├── link-vis │ ├── README.md │ ├── assets │ │ ├── css │ │ │ ├── app.css │ │ │ └── lib │ │ │ │ ├── nouislider.min.css │ │ │ │ └── nouislider.pips.css │ │ └── js │ │ │ ├── app.js │ │ │ ├── lib │ │ │ ├── d3.tip.v0.6.3.js │ │ │ ├── jquery.isloading.min.js │ │ │ └── nouislider.min.js │ │ │ ├── variables.js │ │ │ └── variables.temp │ ├── data │ │ └── graph.json │ ├── index.html │ └── startServer.py └── ner │ ├── URI.js │ ├── d3.layout.cloud.js │ └── index.html ├── warcbase-core ├── pom.xml └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── warcbase │ │ │ ├── data │ │ │ ├── ArcRecordUtils.java │ │ │ └── WarcRecordUtils.java │ │ │ ├── demo │ │ │ └── WacMapReduceArcDemo.java │ │ │ ├── io │ │ │ ├── ArcRecordWritable.java │ │ │ ├── GenericArchiveRecordWritable.java │ │ │ └── WarcRecordWritable.java │ │ │ ├── mapreduce │ │ │ ├── WacArcInputFormat.java │ │ │ ├── WacGenericInputFormat.java │ │ │ └── WacWarcInputFormat.java │ │ │ └── wayback │ │ │ ├── WarcbaseResourceIndex.java │ │ │ └── WarcbaseResourceStore.java │ ├── python │ │ ├── break-into-date-scrapes.py │ │ ├── combine-entity-results-split-by-date.py │ │ ├── combine-entity-results.py │ │ └── pig2gdf.py │ ├── resources │ │ ├── BDBCollection.xml │ │ └── log4j.properties │ ├── scala │ │ └── org │ │ │ └── warcbase │ │ │ └── spark │ │ │ ├── archive │ │ │ └── io │ │ │ │ ├── ArcRecord.scala │ │ │ │ ├── ArchiveRecord.scala │ │ │ │ ├── GenericArchiveRecord.scala │ │ │ │ └── WarcRecord.scala │ │ │ ├── matchbox │ │ │ ├── ComputeImageSize.scala │ │ │ ├── ComputeMD5.scala │ │ │ ├── DetectLanguage.scala │ │ │ ├── DetectMimeTypeTika.scala │ │ │ ├── ExtractAtMentions.scala │ │ │ ├── ExtractBoilerpipeText.scala │ │ │ ├── ExtractDate.scala │ │ │ ├── ExtractDomain.scala │ │ │ ├── ExtractEntities.scala │ │ │ ├── ExtractGraph.scala │ │ │ ├── ExtractHashtags.scala │ │ │ ├── ExtractImageLinks.scala │ │ │ ├── ExtractLinks.scala │ │ │ ├── ExtractPopularImages.scala │ │ │ ├── ExtractTextFromPDFs.scala │ │ │ ├── ExtractUrls.scala │ │ │ ├── NER3Classifier.scala │ │ │ ├── NERCombinedJson.scala │ │ │ ├── RecordLoader.scala │ │ │ ├── RemoveHTML.scala │ │ │ ├── RemoveHttpHeader.scala │ │ │ ├── StringUtils.scala │ │ │ ├── TupleFormatter.scala │ │ │ ├── TweetUtils.scala │ │ │ └── WriteGDF.scala │ │ │ ├── pythonconverters │ │ │ └── ArcRecordConverter.scala │ │ │ ├── rdd │ │ │ └── RecordRDD.scala │ │ │ ├── scripts │ │ │ ├── CrawlStatistics.scala │ │ │ ├── Filter.scala │ │ │ └── SocialMediaLinks.scala │ │ │ └── utils │ │ │ └── JsonUtil.scala │ └── webapp │ │ └── WEB-INF │ │ └── web.xml │ └── test │ ├── java │ └── org │ │ └── warcbase │ │ ├── ingest │ │ ├── WacArcLoaderTest.java │ │ └── WacWarcLoaderTest.java │ │ ├── io │ │ ├── ArcRecordWritableTest.java │ │ ├── GenericArchiveRecordWritableTest.java │ │ └── WarcRecordWritableTest.java │ │ └── mapreduce │ │ ├── WacArcInputFormatTest.java │ │ ├── WacGenericInputFormatTest.java │ │ └── WacWarcInputFormatTest.java │ ├── resources │ ├── arc │ │ └── example.arc.gz │ ├── ner │ │ └── example.txt │ └── warc │ │ └── example.warc.gz │ └── scala │ └── org │ └── warcbase │ └── spark │ ├── ArcTest.scala │ ├── GenericArchiveRecordTest.scala │ ├── WarcTest.scala │ ├── matchbox │ ├── ExtractAtMentionsTest.scala │ ├── ExtractDateTest.scala │ ├── ExtractDomainTest.scala │ ├── ExtractEntitiesTest.scala │ ├── ExtractHashtagsTest.scala │ ├── ExtractImageLinksTest.scala │ ├── ExtractLinksTest.scala │ ├── ExtractUrlsTest.scala │ ├── StringUtilsTest.scala │ └── TupleFormatterTest.scala │ └── rdd │ └── CountableRDDTest.scala └── warcbase-hbase ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── warcbase │ │ ├── WarcbaseAdmin.java │ │ ├── analysis │ │ ├── FindArcUrls.java │ │ ├── FindWarcUrls.java │ │ └── graph │ │ │ ├── ExtractLinksWac.java │ │ │ ├── ExtractSiteLinks.java │ │ │ ├── InvertAnchorText.java │ │ │ └── PrefixMapping.java │ │ ├── browser │ │ ├── SeleniumBrowser.java │ │ ├── WarcBrowser.java │ │ └── WarcBrowserServlet.java │ │ ├── data │ │ ├── HBaseTableManager.java │ │ ├── UrlMapping.java │ │ ├── UrlMappingBuilder.java │ │ ├── UrlMappingMapReduceBuilder.java │ │ └── UrlUtils.java │ │ ├── demo │ │ ├── WacMapReduceHBaseDemo.java │ │ └── WacMapReduceHBaseWrapperDemo.java │ │ ├── index │ │ ├── IndexerMapper.java │ │ ├── IndexerReducer.java │ │ └── IndexerRunner.java │ │ ├── ingest │ │ ├── IngestFiles.java │ │ └── SearchForUrl.java │ │ └── mapreduce │ │ └── lib │ │ ├── Chain.java │ │ ├── ChainMapContextImpl.java │ │ ├── HBaseRowToArcRecordWritableMapper.java │ │ └── TableChainMapper.java └── solr │ ├── README.txt │ ├── WARCIndexer.conf │ ├── discovery │ ├── conf │ │ ├── currency.xml │ │ ├── elevate.xml │ │ ├── lang │ │ │ ├── contractions_ca.txt │ │ │ ├── contractions_fr.txt │ │ │ ├── contractions_ga.txt │ │ │ ├── contractions_it.txt │ │ │ ├── hyphenations_ga.txt │ │ │ ├── stemdict_nl.txt │ │ │ ├── stoptags_ja.txt │ │ │ ├── stopwords_ar.txt │ │ │ ├── stopwords_bg.txt │ │ │ ├── stopwords_ca.txt │ │ │ ├── stopwords_cz.txt │ │ │ ├── stopwords_da.txt │ │ │ ├── stopwords_de.txt │ │ │ ├── stopwords_el.txt │ │ │ ├── stopwords_en.txt │ │ │ ├── stopwords_es.txt │ │ │ ├── stopwords_eu.txt │ │ │ ├── stopwords_fa.txt │ │ │ ├── stopwords_fi.txt │ │ │ ├── stopwords_fr.txt │ │ │ ├── stopwords_ga.txt │ │ │ ├── stopwords_gl.txt │ │ │ ├── stopwords_hi.txt │ │ │ ├── stopwords_hu.txt │ │ │ ├── stopwords_hy.txt │ │ │ ├── stopwords_id.txt │ │ │ ├── stopwords_it.txt │ │ │ ├── stopwords_ja.txt │ │ │ ├── stopwords_lv.txt │ │ │ ├── stopwords_nl.txt │ │ │ ├── stopwords_no.txt │ │ │ ├── stopwords_pt.txt │ │ │ ├── stopwords_ro.txt │ │ │ ├── stopwords_ru.txt │ │ │ ├── stopwords_sv.txt │ │ │ ├── stopwords_th.txt │ │ │ ├── stopwords_tr.txt │ │ │ └── userdict_ja.txt │ │ ├── protwords.txt │ │ ├── schema.xml │ │ ├── solrconfig-production.xml │ │ ├── solrconfig-server-4.10.4.xml │ │ ├── solrconfig.xml │ │ ├── solrcore.properties │ │ ├── solrcore.properties-production │ │ ├── stopwords.txt │ │ └── synonyms.txt │ └── core.properties │ ├── solr.xml │ └── zoo.cfg └── test └── java └── org └── warcbase └── data ├── UrlMappingTest.java └── UrlUtilsTest.java /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/.gitignore -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/.travis.yml -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/README.md -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/pom.xml -------------------------------------------------------------------------------- /vis/crawl-sites/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/crawl-sites/README.md -------------------------------------------------------------------------------- /vis/crawl-sites/data.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/crawl-sites/data.csv -------------------------------------------------------------------------------- /vis/crawl-sites/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/crawl-sites/index.html -------------------------------------------------------------------------------- /vis/crawl-sites/process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/crawl-sites/process.py -------------------------------------------------------------------------------- /vis/crawl-sites/raw.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/crawl-sites/raw.txt -------------------------------------------------------------------------------- /vis/link-vis/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/README.md -------------------------------------------------------------------------------- /vis/link-vis/assets/css/app.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/assets/css/app.css -------------------------------------------------------------------------------- /vis/link-vis/assets/css/lib/nouislider.min.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/assets/css/lib/nouislider.min.css -------------------------------------------------------------------------------- /vis/link-vis/assets/css/lib/nouislider.pips.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/assets/css/lib/nouislider.pips.css -------------------------------------------------------------------------------- /vis/link-vis/assets/js/app.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/assets/js/app.js -------------------------------------------------------------------------------- /vis/link-vis/assets/js/lib/d3.tip.v0.6.3.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/assets/js/lib/d3.tip.v0.6.3.js -------------------------------------------------------------------------------- /vis/link-vis/assets/js/lib/jquery.isloading.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/assets/js/lib/jquery.isloading.min.js -------------------------------------------------------------------------------- /vis/link-vis/assets/js/lib/nouislider.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/assets/js/lib/nouislider.min.js -------------------------------------------------------------------------------- /vis/link-vis/assets/js/variables.js: -------------------------------------------------------------------------------- 1 | var listOfAvailableDataFiles = ["graph.json"]; 2 | -------------------------------------------------------------------------------- /vis/link-vis/assets/js/variables.temp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/assets/js/variables.temp -------------------------------------------------------------------------------- /vis/link-vis/data/graph.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/data/graph.json -------------------------------------------------------------------------------- /vis/link-vis/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/index.html -------------------------------------------------------------------------------- /vis/link-vis/startServer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/link-vis/startServer.py -------------------------------------------------------------------------------- /vis/ner/URI.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/ner/URI.js -------------------------------------------------------------------------------- /vis/ner/d3.layout.cloud.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/ner/d3.layout.cloud.js -------------------------------------------------------------------------------- /vis/ner/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/vis/ner/index.html -------------------------------------------------------------------------------- /warcbase-core/pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/pom.xml -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/data/ArcRecordUtils.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/data/ArcRecordUtils.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/data/WarcRecordUtils.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/data/WarcRecordUtils.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/demo/WacMapReduceArcDemo.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/demo/WacMapReduceArcDemo.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/io/ArcRecordWritable.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/io/ArcRecordWritable.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/io/GenericArchiveRecordWritable.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/io/GenericArchiveRecordWritable.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/io/WarcRecordWritable.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/io/WarcRecordWritable.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/mapreduce/WacArcInputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/mapreduce/WacArcInputFormat.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/mapreduce/WacGenericInputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/mapreduce/WacGenericInputFormat.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/mapreduce/WacWarcInputFormat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/mapreduce/WacWarcInputFormat.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/wayback/WarcbaseResourceIndex.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/wayback/WarcbaseResourceIndex.java -------------------------------------------------------------------------------- /warcbase-core/src/main/java/org/warcbase/wayback/WarcbaseResourceStore.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/java/org/warcbase/wayback/WarcbaseResourceStore.java -------------------------------------------------------------------------------- /warcbase-core/src/main/python/break-into-date-scrapes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/python/break-into-date-scrapes.py -------------------------------------------------------------------------------- /warcbase-core/src/main/python/combine-entity-results-split-by-date.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/python/combine-entity-results-split-by-date.py -------------------------------------------------------------------------------- /warcbase-core/src/main/python/combine-entity-results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/python/combine-entity-results.py -------------------------------------------------------------------------------- /warcbase-core/src/main/python/pig2gdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/python/pig2gdf.py -------------------------------------------------------------------------------- /warcbase-core/src/main/resources/BDBCollection.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/resources/BDBCollection.xml -------------------------------------------------------------------------------- /warcbase-core/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/resources/log4j.properties -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/archive/io/ArcRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/archive/io/ArcRecord.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/archive/io/ArchiveRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/archive/io/ArchiveRecord.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/archive/io/GenericArchiveRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/archive/io/GenericArchiveRecord.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/archive/io/WarcRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/archive/io/WarcRecord.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ComputeImageSize.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ComputeImageSize.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ComputeMD5.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ComputeMD5.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/DetectLanguage.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/DetectLanguage.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/DetectMimeTypeTika.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/DetectMimeTypeTika.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractAtMentions.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractAtMentions.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractBoilerpipeText.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractBoilerpipeText.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractDate.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractDate.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractDomain.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractDomain.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractEntities.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractEntities.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractGraph.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractGraph.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractHashtags.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractHashtags.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractImageLinks.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractImageLinks.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractLinks.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractLinks.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractPopularImages.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractPopularImages.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractTextFromPDFs.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractTextFromPDFs.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractUrls.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractUrls.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/NER3Classifier.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/NER3Classifier.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/NERCombinedJson.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/NERCombinedJson.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/RemoveHTML.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/RemoveHTML.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/RemoveHttpHeader.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/RemoveHttpHeader.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/StringUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/StringUtils.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/TupleFormatter.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/TupleFormatter.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/TweetUtils.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/TweetUtils.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/matchbox/WriteGDF.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/matchbox/WriteGDF.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/pythonconverters/ArcRecordConverter.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/pythonconverters/ArcRecordConverter.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/rdd/RecordRDD.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/rdd/RecordRDD.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/scripts/CrawlStatistics.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/scripts/CrawlStatistics.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/scripts/Filter.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/scripts/Filter.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/scripts/SocialMediaLinks.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/scripts/SocialMediaLinks.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/scala/org/warcbase/spark/utils/JsonUtil.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/scala/org/warcbase/spark/utils/JsonUtil.scala -------------------------------------------------------------------------------- /warcbase-core/src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/main/webapp/WEB-INF/web.xml -------------------------------------------------------------------------------- /warcbase-core/src/test/java/org/warcbase/ingest/WacArcLoaderTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/java/org/warcbase/ingest/WacArcLoaderTest.java -------------------------------------------------------------------------------- /warcbase-core/src/test/java/org/warcbase/ingest/WacWarcLoaderTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/java/org/warcbase/ingest/WacWarcLoaderTest.java -------------------------------------------------------------------------------- /warcbase-core/src/test/java/org/warcbase/io/ArcRecordWritableTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/java/org/warcbase/io/ArcRecordWritableTest.java -------------------------------------------------------------------------------- /warcbase-core/src/test/java/org/warcbase/io/GenericArchiveRecordWritableTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/java/org/warcbase/io/GenericArchiveRecordWritableTest.java -------------------------------------------------------------------------------- /warcbase-core/src/test/java/org/warcbase/io/WarcRecordWritableTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/java/org/warcbase/io/WarcRecordWritableTest.java -------------------------------------------------------------------------------- /warcbase-core/src/test/java/org/warcbase/mapreduce/WacArcInputFormatTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/java/org/warcbase/mapreduce/WacArcInputFormatTest.java -------------------------------------------------------------------------------- /warcbase-core/src/test/java/org/warcbase/mapreduce/WacGenericInputFormatTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/java/org/warcbase/mapreduce/WacGenericInputFormatTest.java -------------------------------------------------------------------------------- /warcbase-core/src/test/java/org/warcbase/mapreduce/WacWarcInputFormatTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/java/org/warcbase/mapreduce/WacWarcInputFormatTest.java -------------------------------------------------------------------------------- /warcbase-core/src/test/resources/arc/example.arc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/resources/arc/example.arc.gz -------------------------------------------------------------------------------- /warcbase-core/src/test/resources/ner/example.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/resources/ner/example.txt -------------------------------------------------------------------------------- /warcbase-core/src/test/resources/warc/example.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/resources/warc/example.warc.gz -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/ArcTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/ArcTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/GenericArchiveRecordTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/GenericArchiveRecordTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/WarcTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/WarcTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractAtMentionsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractAtMentionsTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractDateTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractDateTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractDomainTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractDomainTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractEntitiesTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractEntitiesTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractHashtagsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractHashtagsTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractImageLinksTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractImageLinksTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractLinksTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractLinksTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractUrlsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/ExtractUrlsTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/StringUtilsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/StringUtilsTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/matchbox/TupleFormatterTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/matchbox/TupleFormatterTest.scala -------------------------------------------------------------------------------- /warcbase-core/src/test/scala/org/warcbase/spark/rdd/CountableRDDTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-core/src/test/scala/org/warcbase/spark/rdd/CountableRDDTest.scala -------------------------------------------------------------------------------- /warcbase-hbase/pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/pom.xml -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/WarcbaseAdmin.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/WarcbaseAdmin.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/analysis/FindArcUrls.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/analysis/FindArcUrls.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/analysis/FindWarcUrls.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/analysis/FindWarcUrls.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/analysis/graph/ExtractLinksWac.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/analysis/graph/ExtractLinksWac.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/analysis/graph/ExtractSiteLinks.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/analysis/graph/ExtractSiteLinks.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/analysis/graph/InvertAnchorText.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/analysis/graph/InvertAnchorText.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/analysis/graph/PrefixMapping.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/analysis/graph/PrefixMapping.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/browser/SeleniumBrowser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/browser/SeleniumBrowser.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/browser/WarcBrowser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/browser/WarcBrowser.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/browser/WarcBrowserServlet.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/browser/WarcBrowserServlet.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/data/HBaseTableManager.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/data/HBaseTableManager.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/data/UrlMapping.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/data/UrlMapping.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/data/UrlMappingBuilder.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/data/UrlMappingBuilder.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/data/UrlMappingMapReduceBuilder.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/data/UrlMappingMapReduceBuilder.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/data/UrlUtils.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/data/UrlUtils.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/demo/WacMapReduceHBaseDemo.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/demo/WacMapReduceHBaseDemo.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/demo/WacMapReduceHBaseWrapperDemo.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/demo/WacMapReduceHBaseWrapperDemo.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/index/IndexerMapper.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/index/IndexerMapper.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/index/IndexerReducer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/index/IndexerReducer.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/index/IndexerRunner.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/index/IndexerRunner.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/ingest/IngestFiles.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/ingest/IngestFiles.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/ingest/SearchForUrl.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/ingest/SearchForUrl.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/mapreduce/lib/Chain.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/mapreduce/lib/Chain.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/mapreduce/lib/ChainMapContextImpl.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/mapreduce/lib/ChainMapContextImpl.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/mapreduce/lib/HBaseRowToArcRecordWritableMapper.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/mapreduce/lib/HBaseRowToArcRecordWritableMapper.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/java/org/warcbase/mapreduce/lib/TableChainMapper.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/java/org/warcbase/mapreduce/lib/TableChainMapper.java -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/README.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/WARCIndexer.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/WARCIndexer.conf -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/currency.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/currency.xml -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/elevate.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/elevate.xml -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/contractions_ca.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/contractions_ca.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/contractions_fr.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/contractions_fr.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/contractions_ga.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/contractions_ga.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/contractions_it.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/contractions_it.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/hyphenations_ga.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/hyphenations_ga.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stemdict_nl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stemdict_nl.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stoptags_ja.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stoptags_ja.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ar.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ar.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_bg.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_bg.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ca.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ca.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_cz.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_cz.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_da.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_da.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_de.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_de.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_el.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_el.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_en.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_es.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_es.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_eu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_eu.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_fa.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_fa.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_fi.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_fi.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_fr.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_fr.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ga.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ga.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_gl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_gl.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_hi.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_hi.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_hu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_hu.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_hy.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_hy.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_id.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_id.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_it.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_it.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ja.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ja.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_lv.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_lv.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_nl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_nl.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_no.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_no.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_pt.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_pt.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ro.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ro.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ru.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_ru.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_sv.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_sv.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_th.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_th.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_tr.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/stopwords_tr.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/lang/userdict_ja.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/lang/userdict_ja.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/protwords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/protwords.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/schema.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/schema.xml -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/solrconfig-production.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/solrconfig-production.xml -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/solrconfig-server-4.10.4.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/solrconfig-server-4.10.4.xml -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/solrconfig.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/solrconfig.xml -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/solrcore.properties: -------------------------------------------------------------------------------- 1 | #solr.lock.type=hdfs 2 | -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/solrcore.properties-production: -------------------------------------------------------------------------------- 1 | dataDir=/var/local/solr/ukdomain/data 2 | -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/stopwords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/stopwords.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/conf/synonyms.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/discovery/conf/synonyms.txt -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/discovery/core.properties: -------------------------------------------------------------------------------- 1 | name=discovery 2 | -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/solr.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/solr.xml -------------------------------------------------------------------------------- /warcbase-hbase/src/main/solr/zoo.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/main/solr/zoo.cfg -------------------------------------------------------------------------------- /warcbase-hbase/src/test/java/org/warcbase/data/UrlMappingTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/test/java/org/warcbase/data/UrlMappingTest.java -------------------------------------------------------------------------------- /warcbase-hbase/src/test/java/org/warcbase/data/UrlUtilsTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lintool/warcbase/HEAD/warcbase-hbase/src/test/java/org/warcbase/data/UrlUtilsTest.java --------------------------------------------------------------------------------