├── src ├── bundle │ ├── solr_config │ │ ├── conf │ │ │ ├── solrcore.properties │ │ │ ├── lang │ │ │ │ ├── hyphenations_ga.txt │ │ │ │ ├── contractions_ga.txt │ │ │ │ ├── contractions_ca.txt │ │ │ │ ├── contractions_fr.txt │ │ │ │ ├── stemdict_nl.txt │ │ │ │ ├── contractions_it.txt │ │ │ │ ├── stopwords_hy.txt │ │ │ │ ├── stopwords_el.txt │ │ │ │ ├── stopwords_ga.txt │ │ │ │ ├── stopwords_eu.txt │ │ │ │ ├── userdict_ja.txt │ │ │ │ ├── stopwords_en.txt │ │ │ │ ├── stopwords_th.txt │ │ │ │ ├── stopwords_ar.txt │ │ │ │ ├── stopwords_gl.txt │ │ │ │ └── stopwords_cz.txt │ │ │ ├── stopwords_path.txt │ │ │ ├── stopwords.txt │ │ │ ├── protwords.txt │ │ │ ├── synonyms.txt │ │ │ └── elevate.xml │ │ └── README.md │ ├── indexing │ │ ├── show_warc_config.sh │ │ ├── batch_warcs1_folder.bat │ │ ├── batch_warcs2_folder.bat │ │ ├── batch_warcs1_folder.sh │ │ └── batch_warcs2_folder.sh │ ├── solrwayback_bundle.sh │ └── solrwayback_bundle.bat ├── main │ ├── java │ │ └── dk │ │ │ └── kb │ │ │ └── netarchivesuite │ │ │ └── solrwayback │ │ │ ├── solr │ │ │ ├── SolrGenericStreaming.java │ │ │ └── SolrStreamingLineBasedExportClientInterface.java │ │ │ ├── parsers │ │ │ ├── json │ │ │ │ ├── TweetEntity.java │ │ │ │ ├── TweetVideoInfo.java │ │ │ │ ├── TweetVideoVariant.java │ │ │ │ ├── TweetMedia.java │ │ │ │ ├── TweetHashtag.java │ │ │ │ ├── TweetURL.java │ │ │ │ ├── TweetEntities.java │ │ │ │ └── TweetMention.java │ │ │ ├── LineAndByteCount.java │ │ │ ├── ArcWarcFileParserAbstract.java │ │ │ └── ArcFileParserFactory.java │ │ │ ├── service │ │ │ ├── dto │ │ │ │ ├── UrlWrapper.java │ │ │ │ ├── PagePreviewYearsInfo.java │ │ │ │ ├── graph │ │ │ │ │ ├── D3Graph.java │ │ │ │ │ ├── Link.java │ │ │ │ │ └── Node.java │ │ │ │ ├── WordCloudWordAndCount.java │ │ │ │ ├── smurf │ │ │ │ │ ├── DateCount.java │ │ │ │ │ └── SmurfBuckets.java │ │ │ │ ├── SearchResult.java │ │ │ │ ├── PagePreview.java │ │ │ │ ├── HarvestDates.java │ │ │ │ ├── statistics │ │ │ │ │ ├── QueryPercentilesStatistics.java │ │ │ │ │ 
├── DomainYearStatistics.java │ │ │ │ │ └── DomainStatistics.java │ │ │ │ ├── FacetCount.java │ │ │ │ ├── PageResource.java │ │ │ │ └── IndexDocShort.java │ │ │ ├── SolrWaybackApplication.java │ │ │ └── exception │ │ │ │ ├── NotFoundServiceException.java │ │ │ │ ├── ServiceExceptionMapper.java │ │ │ │ ├── InternalServiceException.java │ │ │ │ ├── InvalidArgumentServiceException.java │ │ │ │ └── SolrWaybackServiceException.java │ │ │ ├── interfaces │ │ │ ├── IdentityArcFileResolver.java │ │ │ ├── ArcHTTPResolver.java │ │ │ └── ArcFileLocationResolverInterface.java │ │ │ ├── playback │ │ │ ├── PlaybackHandler.java │ │ │ ├── CssPlayback.java │ │ │ ├── JavascriptPlayback.java │ │ │ ├── HtmlPlayback.java │ │ │ └── JodelPlayback.java │ │ │ ├── smurf │ │ │ └── NetarchiveYearCountCache.java │ │ │ ├── listeners │ │ │ └── SolrWaybackAsciiLogo.java │ │ │ ├── util │ │ │ ├── NamedConsumer.java │ │ │ ├── CountingMap.java │ │ │ ├── LimitedReader.java │ │ │ └── StatusInputStream.java │ │ │ ├── normalise │ │ │ └── NormalisationMinimal.java │ │ │ └── encoders │ │ │ └── Sha1Hash.java │ ├── webapp │ │ ├── images │ │ │ ├── twitter_sprite.png │ │ │ ├── today-24px.svg │ │ │ ├── schedule-24dp.svg │ │ │ ├── text_snippet-24px.svg │ │ │ └── preview-24px.svg │ │ ├── leakingForward.jsp │ │ ├── WEB-INF │ │ │ ├── rewrite.config │ │ │ └── web.xml │ │ └── META-INF │ │ │ └── context.xml │ └── resources │ │ ├── dk │ │ └── kb │ │ │ └── netarchivesuite │ │ │ └── webservices │ │ │ └── configuration │ │ │ └── build.properties │ │ ├── build.properties │ │ ├── about_this_archive_kb.txt │ │ ├── about_this_archive.txt │ │ └── about_collection.txt ├── test │ ├── resources │ │ ├── solr_9 │ │ │ ├── netarchivebuilder │ │ │ │ ├── conf │ │ │ │ │ ├── solrcore.properties │ │ │ │ │ ├── lang │ │ │ │ │ │ ├── hyphenations_ga.txt │ │ │ │ │ │ ├── contractions_ga.txt │ │ │ │ │ │ ├── contractions_ca.txt │ │ │ │ │ │ ├── contractions_fr.txt │ │ │ │ │ │ ├── stemdict_nl.txt │ │ │ │ │ │ ├── contractions_it.txt │ │ │ │ │ │ ├── 
stopwords_hy.txt │ │ │ │ │ │ ├── stopwords_el.txt │ │ │ │ │ │ ├── stopwords_ga.txt │ │ │ │ │ │ ├── stopwords_eu.txt │ │ │ │ │ │ ├── userdict_ja.txt │ │ │ │ │ │ ├── stopwords_en.txt │ │ │ │ │ │ ├── stopwords_th.txt │ │ │ │ │ │ ├── stopwords_ar.txt │ │ │ │ │ │ ├── stopwords_gl.txt │ │ │ │ │ │ └── stopwords_cz.txt │ │ │ │ │ ├── stopwords_path.txt │ │ │ │ │ ├── stopwords.txt │ │ │ │ │ ├── protwords.txt │ │ │ │ │ ├── synonyms.txt │ │ │ │ │ └── elevate.xml │ │ │ │ └── core.properties │ │ │ ├── README.md │ │ │ └── zoo.cfg │ │ ├── properties │ │ │ └── solrwaybackweb_unittest.properties │ │ ├── compressions_warc │ │ │ ├── transfer_compression_gzip.warc │ │ │ ├── transfer_compression_brotli.warc │ │ │ ├── transfer_compression_gzip.warc.gz │ │ │ ├── transfer_compression_none.warc.gz │ │ │ ├── transfer_compression_brotli.warc.gz │ │ │ ├── transfer_compression_gzip_chunked.warc.gz │ │ │ ├── transfer_compression_none_truncated.warc.gz │ │ │ └── README.md │ │ ├── example_arc │ │ │ ├── IAH-20080430204825-00000-blackbook.arc │ │ │ └── IAH-20080430204825-00000-blackbook.arc.gz │ │ ├── example_warc │ │ │ ├── IAH-20080430204825-00000-blackbook.warc │ │ │ ├── IAH-20080430204825-00000-blackbook.warc.gz │ │ │ └── Evil-Warc-Headers.warc │ │ ├── arc_resolvers │ │ │ └── FileMovedMappingTest.txt │ │ ├── example_rewrite │ │ │ ├── encoding.html │ │ │ ├── url_escape.html │ │ │ ├── encoding_expected.html │ │ │ ├── inline_css.html │ │ │ ├── css_import.html │ │ │ ├── url_escape_expected.html │ │ │ ├── cdata.html │ │ │ ├── cdata_expected.html │ │ │ ├── style_element.html │ │ │ ├── css_import_expected.html │ │ │ ├── script_escape_expected.html │ │ │ ├── script_escape.html │ │ │ ├── css2.html │ │ │ ├── script2.html │ │ │ ├── css2_expected.html │ │ │ ├── script2_expected.html │ │ │ ├── style_element_expected.html │ │ │ ├── simple.html │ │ │ └── multisource.html │ │ └── logback-test.xml │ └── java │ │ ├── dk │ │ └── kb │ │ │ └── netarchivesuite │ │ │ └── solrwayback │ │ │ ├── parsers │ │ │ ├── 
CssParserReplacerTest.java │ │ │ ├── HtmlParserUrlRewriterFromWarcTest.java │ │ │ ├── TestExportArc.java │ │ │ └── warc │ │ │ │ └── ArcGzParserTest.java │ │ │ ├── util │ │ │ └── SolrUtilsTest.java │ │ │ ├── solr │ │ │ ├── NetarchiveSolrTestClient.java │ │ │ └── IndexWatcherTest.java │ │ │ ├── normalize │ │ │ └── FilenameNormalizeTest.java │ │ │ ├── export │ │ │ ├── TestGenerateCSV.java │ │ │ └── TestGenerateLinkGraphCSV.java │ │ │ └── interfaces │ │ │ └── FileMovedMappingResolverTest.java │ │ └── README.txt └── js │ ├── public │ └── favicon.ico │ ├── src │ ├── assets │ │ ├── logo.png │ │ ├── styles │ │ │ ├── main.scss │ │ │ ├── styleVariables.scss │ │ │ └── pwid.scss │ │ └── icons │ │ │ ├── chart.svg │ │ │ ├── image.svg │ │ │ ├── Icons8_flat_checkmark.svg │ │ │ ├── tools.svg │ │ │ ├── video.svg │ │ │ ├── default.svg │ │ │ ├── audio.svg │ │ │ ├── web.svg │ │ │ ├── location.svg │ │ │ ├── warc.svg │ │ │ ├── Font_Awesome_5_regular_clipboard.svg │ │ │ └── twitter.svg │ ├── views │ │ └── About.vue │ ├── mixins │ │ ├── SearchboxUtils.js │ │ ├── StringManipulationUtils.js │ │ └── ImageSearchUtils.js │ ├── components │ │ ├── ngrams │ │ │ ├── netarchive │ │ │ │ └── configs.js │ │ │ ├── chartsCore │ │ │ │ └── chartHelpers │ │ │ │ │ └── index.js │ │ │ └── searchHelper.js │ │ ├── harvestCalendar │ │ │ ├── ColorLegend.vue │ │ │ ├── util.js │ │ │ ├── plugins │ │ │ │ ├── iterators.js │ │ │ │ └── tranformationHelpers.js │ │ │ ├── HarvestsDay.vue │ │ │ ├── AllYearsGraph.vue │ │ │ └── harvestDateHelper.js │ │ ├── ToolboxComponents │ │ │ └── NgramNetarchive.vue │ │ ├── modalComponents │ │ │ ├── CollectionInfo.vue │ │ │ ├── SearchGuidelines.vue │ │ │ └── PrimaryModal.vue │ │ ├── harvestTimeResources │ │ │ ├── HarvestResourcesMissing.vue │ │ │ ├── HarvestMaxTimeDifference.vue │ │ │ └── HarvestPagePreview.vue │ │ ├── searchSingleItemComponents │ │ │ ├── SearchSingleItemFocusImage.vue │ │ │ └── searchSingleItemTypes │ │ │ │ ├── SearchSingleItemImage.vue │ │ │ │ ├── 
SearchSingleItemTweet.vue │ │ │ │ ├── SearchSingleItemWeb.vue │ │ │ │ └── SearchSingleItemDefault.vue │ │ ├── AboutComponent.vue │ │ ├── LoadingOverlay.vue │ │ ├── notifications │ │ │ └── Notifications.vue │ │ └── AppliedSearchFacets.vue │ ├── App.vue │ ├── store │ │ ├── modal.store.js │ │ └── notifier.store.js │ ├── configs │ │ └── index.js │ ├── main.js │ └── services │ │ └── dataTransformationHelper.js │ ├── README.md │ ├── eslint.config.js │ ├── solrwayback_index_page.html │ ├── package.json │ └── vite.config.js ├── doc ├── image_search.png ├── gps_exif_search.png ├── solrwayback_ngram.png ├── solrwayback_search.png ├── multiple_pagepreviews.png ├── solrwayback_crawltimes.png ├── solrwayback_linkgraph.png ├── solrwayback_playback.png ├── solrwayback_wordcloud.png └── domain_result_visualization.png ├── .github └── workflows │ └── test.yml ├── .vscode └── settings.json └── .gitignore /src/bundle/solr_config/conf/solrcore.properties: -------------------------------------------------------------------------------- 1 | #solr.lock.type=hdfs 2 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrGenericStreaming.java: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/solrcore.properties: -------------------------------------------------------------------------------- 1 | #solr.lock.type=hdfs 2 | -------------------------------------------------------------------------------- /doc/image_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/image_search.png -------------------------------------------------------------------------------- /doc/gps_exif_search.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/gps_exif_search.png -------------------------------------------------------------------------------- /doc/solrwayback_ngram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/solrwayback_ngram.png -------------------------------------------------------------------------------- /src/js/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/js/public/favicon.ico -------------------------------------------------------------------------------- /doc/solrwayback_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/solrwayback_search.png -------------------------------------------------------------------------------- /src/js/src/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/js/src/assets/logo.png -------------------------------------------------------------------------------- /doc/multiple_pagepreviews.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/multiple_pagepreviews.png -------------------------------------------------------------------------------- /doc/solrwayback_crawltimes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/solrwayback_crawltimes.png -------------------------------------------------------------------------------- 
/doc/solrwayback_linkgraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/solrwayback_linkgraph.png -------------------------------------------------------------------------------- /doc/solrwayback_playback.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/solrwayback_playback.png -------------------------------------------------------------------------------- /doc/solrwayback_wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/solrwayback_wordcloud.png -------------------------------------------------------------------------------- /doc/domain_result_visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/doc/domain_result_visualization.png -------------------------------------------------------------------------------- /src/main/webapp/images/twitter_sprite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/main/webapp/images/twitter_sprite.png -------------------------------------------------------------------------------- /src/js/src/assets/styles/main.scss: -------------------------------------------------------------------------------- 1 | @use 'global'; 2 | @use 'styleVariables'; 3 | @use 'search'; 4 | @use 'results'; 5 | //More imports to come as project grows -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/core.properties: -------------------------------------------------------------------------------- 1 | name=netarchivebuilder 2 | 
config=solrconfig.xml 3 | schema=schema.xml 4 | dataDir=netarchivebuilder_data 5 | -------------------------------------------------------------------------------- /src/test/resources/properties/solrwaybackweb_unittest.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/properties/solrwaybackweb_unittest.properties -------------------------------------------------------------------------------- /src/bundle/indexing/show_warc_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd ${BASH_SOURCE%/*} > /dev/null 4 | 5 | java -cp warc-indexer-3.3.1-jar-with-dependencies.jar uk.bl.wa.util.ConfigPrinter 6 | 7 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/hyphenations_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish hyphenations for StopFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | h 4 | n 5 | t 6 | -------------------------------------------------------------------------------- /src/test/resources/compressions_warc/transfer_compression_gzip.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_gzip.warc -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/contractions_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | m 5 | b 6 | -------------------------------------------------------------------------------- 
/src/js/src/views/About.vue: -------------------------------------------------------------------------------- 1 | 7 | -------------------------------------------------------------------------------- /src/test/resources/compressions_warc/transfer_compression_brotli.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_brotli.warc -------------------------------------------------------------------------------- /src/test/resources/compressions_warc/transfer_compression_gzip.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_gzip.warc.gz -------------------------------------------------------------------------------- /src/test/resources/compressions_warc/transfer_compression_none.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_none.warc.gz -------------------------------------------------------------------------------- /src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc -------------------------------------------------------------------------------- /src/test/resources/compressions_warc/transfer_compression_brotli.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_brotli.warc.gz 
-------------------------------------------------------------------------------- /src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc.gz -------------------------------------------------------------------------------- /src/test/resources/example_warc/IAH-20080430204825-00000-blackbook.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/example_warc/IAH-20080430204825-00000-blackbook.warc -------------------------------------------------------------------------------- /src/test/resources/example_warc/IAH-20080430204825-00000-blackbook.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/example_warc/IAH-20080430204825-00000-blackbook.warc.gz -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/hyphenations_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish hyphenations for StopFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | h 4 | n 5 | t 6 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/contractions_ca.txt: -------------------------------------------------------------------------------- 1 | # Set of Catalan contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | l 5 | m 6 | n 7 | s 8 | t 9 | -------------------------------------------------------------------------------- 
/src/js/src/assets/icons/chart.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/main/resources/dk/kb/netarchivesuite/webservices/configuration/build.properties: -------------------------------------------------------------------------------- 1 | # Build Time Information 2 | APPLICATION.NAME=${pom.name} 3 | APPLICATION.VERSION=${pom.version} 4 | APPLICATION.BUILDTIME=${build.time} -------------------------------------------------------------------------------- /src/test/resources/compressions_warc/transfer_compression_gzip_chunked.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_gzip_chunked.warc.gz -------------------------------------------------------------------------------- /src/test/resources/compressions_warc/transfer_compression_none_truncated.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_none_truncated.warc.gz -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/contractions_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | m 5 | b 6 | -------------------------------------------------------------------------------- /src/main/resources/build.properties: -------------------------------------------------------------------------------- 1 | #$Id: build.properties 1 2011-10-21 09:33:25Z teg $ 2 | # Build Time Information 3 | APPLICATION.NAME=${pom.name} 
4 | APPLICATION.VERSION=${pom.version} 5 | APPLICATION.BUILDTIME=${build.time} -------------------------------------------------------------------------------- /src/test/resources/arc_resolvers/FileMovedMappingTest.txt: -------------------------------------------------------------------------------- 1 | /home/xxx/solrwayback_package_4.2.1/indexing/warcs1/356548-347-20210201093000132-00000-sb-prod-har-001.statsbiblioteket.dk.warc.gz 2 | /mount/netarchive/test-00000.warc.gz -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/contractions_fr.txt: -------------------------------------------------------------------------------- 1 | # Set of French contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | l 4 | m 5 | t 6 | qu 7 | n 8 | s 9 | j 10 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/stemdict_nl.txt: -------------------------------------------------------------------------------- 1 | # Set of overrides for the dutch stemmer 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | fiets fiets 4 | bromfiets bromfiets 5 | ei eier 6 | kind kinder 7 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/contractions_ca.txt: -------------------------------------------------------------------------------- 1 | # Set of Catalan contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | l 5 | m 6 | n 7 | s 8 | t 9 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/image.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/src/js/src/mixins/SearchboxUtils.js: -------------------------------------------------------------------------------- 1 | export default { 2 | methods: { 3 | $_getSizeOfTextArea(id) { 4 | this.$refs[id].style.height = '1px' 5 | this.$refs[id].style.height = this.$refs[id].scrollHeight + 'px' 6 | }, 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/main/resources/about_this_archive_kb.txt: -------------------------------------------------------------------------------- 1 | About us
Welcome to the Danish Netarchive.
2 | For more information see Danish Netarchive .
3 |
4 | The Royal Danish Library. -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/contractions_fr.txt: -------------------------------------------------------------------------------- 1 | # Set of French contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | l 4 | m 5 | t 6 | qu 7 | n 8 | s 9 | j 10 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/stemdict_nl.txt: -------------------------------------------------------------------------------- 1 | # Set of overrides for the dutch stemmer 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | fiets fiets 4 | bromfiets bromfiets 5 | ei eier 6 | kind kinder 7 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/Icons8_flat_checkmark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/tools.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/video.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bundle/indexing/batch_warcs1_folder.bat: -------------------------------------------------------------------------------- 1 | cd /D "%~dp0" 2 | 3 | FOR /R warcs1 %%G IN (*.*) do java -Dfile.encoding=UTF-8 -Xmx2048M -Djava.io.tmpdir=tika_tmp -jar warc-indexer-3.3.1-jar-with-dependencies.jar -c config3.conf -s "http://localhost:8983/solr/netarchivebuilder" "%%G" 4 | 5 | 
-------------------------------------------------------------------------------- /src/bundle/indexing/batch_warcs2_folder.bat: -------------------------------------------------------------------------------- 1 | cd /D "%~dp0" 2 | 3 | FOR /R warcs2 %%G IN (*.*) do java -Dfile.encoding=UTF-8 -Xmx2048M -Djava.io.tmpdir=tika_tmp -jar warc-indexer-3.3.1-jar-with-dependencies.jar -c config3.conf -s "http://localhost:8983/solr/netarchivebuilder" "%%G" 4 | 5 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/default.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bundle/solr_config/README.md: -------------------------------------------------------------------------------- 1 | # Solr configuration 2 | 3 | This folder contains a copy of the Solr configuration and can be used to upload a new Solr configuration to Solr. Only for experienced Solr users who know what they are doing. 
4 | See the 'Update Solr cloud configuration' section in the project README.md -------------------------------------------------------------------------------- /src/main/webapp/images/today-24px.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/js/src/assets/styles/styleVariables.scss: -------------------------------------------------------------------------------- 1 | :root { 2 | --main-bg-color: #CAF0FE; 3 | --secondary-bg-color: #002E70; 4 | --main-text-color: #303030; 5 | --secondary-text-color: #002E70; 6 | --main-highlight-color: #002E70; 7 | --secondary-highlight-color: #fff6c4; 8 | --seethrough-black: rgba(30,30,30,0.6); 9 | } -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrStreamingLineBasedExportClientInterface.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.solr; 2 | 3 | public interface SolrStreamingLineBasedExportClientInterface { 4 | 5 | public String next() throws Exception; 6 | 7 | public int getPageSize(); 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetEntity.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers.json; 2 | 3 | import org.apache.commons.lang3.tuple.Pair; 4 | 5 | public interface TweetEntity { 6 | Pair getIndices(); 7 | void setIndices(Pair newIndices); 8 | } 9 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/encoding.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with tricky encoding 4 | 5 | 6 | 11 | 12 | 13 | 
-------------------------------------------------------------------------------- /src/test/resources/example_rewrite/url_escape.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with tricky URL 4 | 5 | 6 | 7 |

Ampersand

8 | 9 | 10 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/stopwords_path.txt: -------------------------------------------------------------------------------- 1 | # URL & path elements that should not be indexed (to save space) 2 | 3 | # www is removed by the webarchive-discovery normaliser if it is leading. If it is part of the path we want to keep it, so it is not a stopword 4 | 5 | # All URLs starts with http or https, so definitely remove those 6 | http 7 | https 8 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: "Test" 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | build-docker-image: 8 | name: Build Docker image 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout repository 12 | uses: actions/checkout@v3 13 | - name: Build SolrWayback Docker image 14 | run: docker build --tag solrwayback . 15 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/audio.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/stopwords_path.txt: -------------------------------------------------------------------------------- 1 | # URL & path elements that should not be indexed (to save space) 2 | 3 | # www is removed by the webarchive-discovery normaliser if it is leading. 
If it is part of the path we want to keep it, so it is not a stopword 4 | 5 | # All URLs starts with http or https, so definitely remove those 6 | http 7 | https 8 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/contractions_it.txt: -------------------------------------------------------------------------------- 1 | # Set of Italian contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | c 4 | l 5 | all 6 | dall 7 | dell 8 | nell 9 | sull 10 | coll 11 | pell 12 | gl 13 | agl 14 | dagl 15 | degl 16 | negl 17 | sugl 18 | un 19 | m 20 | t 21 | s 22 | v 23 | d 24 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/encoding_expected.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with tricky encoding 4 | 5 | 6 | 11 | 12 | -------------------------------------------------------------------------------- /src/main/resources/about_this_archive.txt: -------------------------------------------------------------------------------- 1 |

This is the default SolrWayback about us text.

2 | 3 | The text is intended to have information about the archive. 4 | 5 | Change the property about.text.file in solrwaybackweb.properties to point to a local file with the 6 | full absolute file path. Use HTML markup but without the html/body start and end tags. 7 | 8 | -------------------------------------------------------------------------------- /src/main/webapp/images/schedule-24dp.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/inline_css.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Inline CSS page 4 | 5 | 10 | 11 |

Inline CSS

12 | 13 | 14 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "files.exclude": { 4 | "**/.classpath": true, 5 | "**/.project": true, 6 | "**/.settings": true, 7 | "**/.factorypath": true 8 | }, 9 | "editor.codeActionsOnSave": { 10 | "source.fixAll.eslint": true 11 | }, 12 | "eslint.validate": ["javascript", "vue"], 13 | "eslint.workingDirectories": [ 14 | "./src/js" 15 | ] 16 | } -------------------------------------------------------------------------------- /src/js/src/assets/icons/web.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/contractions_it.txt: -------------------------------------------------------------------------------- 1 | # Set of Italian contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | c 4 | l 5 | all 6 | dall 7 | dell 8 | nell 9 | sull 10 | coll 11 | pell 12 | gl 13 | agl 14 | dagl 15 | degl 16 | negl 17 | sugl 18 | un 19 | m 20 | t 21 | s 22 | v 23 | d 24 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/css_import.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CSS includes 4 | 9 | 10 |

Multiple CSS includes

11 | 12 | 13 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/url_escape_expected.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with tricky URL 4 | 5 | 6 | 7 |

Ampersand

8 | 9 | -------------------------------------------------------------------------------- /src/main/webapp/images/text_snippet-24px.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/js/src/components/ngrams/netarchive/configs.js: -------------------------------------------------------------------------------- 1 | import APP_CONFIGS from '../../../configs' 2 | export default { 3 | SERVICE_URL : 'services/search/', 4 | END_YEAR: (new Date().getFullYear() + 1).toString(), 5 | BASE_SEARCH_URL: () => { 6 | let searchPrefix = window.location.pathname.split('/')[1] === 'search' ? '' : 'search' 7 | return `${APP_CONFIGS.playbackConfig.solrwaybackBaseURL}${searchPrefix}` 8 | } 9 | } -------------------------------------------------------------------------------- /src/main/webapp/leakingForward.jsp: -------------------------------------------------------------------------------- 1 | <%@page import="dk.kb.netarchivesuite.solrwayback.properties.PropertiesLoader"%> 2 | 3 | <% 4 | 5 | 6 | String orgUrl = (String) request.getAttribute("javax.servlet.error.request_uri"); 7 | 8 | //http://localhost:8080/solrwayback/ 9 | String redirectURL = PropertiesLoader.WAYBACK_BASEURL+"services/resolveLeak?url="+orgUrl; 10 | response.sendRedirect(redirectURL); 11 | %> 12 | 13 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/README.md: -------------------------------------------------------------------------------- 1 | # Solr 9 files 2 | 3 | The Solr config files are copied from the [webarchive-discovery](https://github.com/ukwa/webarchive-discovery) project. 
4 | 5 | 6 | The `solr_9` folder contains a Solr 9 setup that can be used for 7 | 8 | * Unit testing with embedded Solr 9 9 | * Copying to bundle for Solr 9 Standalone deployment 10 | * Uploading to ZooKeeper for Solr 9 Cloud deployment 11 | -------------------------------------------------------------------------------- /src/js/src/components/harvestCalendar/ColorLegend.vue: -------------------------------------------------------------------------------- 1 | 10 | 11 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/cdata.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with JavaScript in CDATA 4 | 5 | 6 |

CDATA is not needed in HTML script-elements, but browsers are browsers...

7 |

Processing this page should not result in any changes.

8 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/location.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/cdata_expected.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with JavaScript in CDATA 4 | 5 | 6 |

CDATA is not needed in HTML script-elements, but browsers are browsers...

7 |

Processing this page should not result in any changes.

8 | 13 | 14 | -------------------------------------------------------------------------------- /src/js/README.md: -------------------------------------------------------------------------------- 1 | # solrwayback_vue 2 | 3 | ## Project setup 4 | ``` 5 | npm install 6 | ``` 7 | 8 | ### Compiles and hot-reloads for development 9 | ``` 10 | npm run serve 11 | ``` 12 | 13 | ### Compiles and minifies for production 14 | ``` 15 | npm run build 16 | ``` 17 | 18 | ### Lints and fixes files 19 | ``` 20 | npm run lint 21 | ``` 22 | 23 | ### Customize configuration 24 | See [Configuration Reference](https://cli.vuejs.org/config/). 25 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/stopwords_hy.txt: -------------------------------------------------------------------------------- 1 | # example set of Armenian stopwords. 2 | այդ 3 | այլ 4 | այն 5 | այս 6 | դու 7 | դուք 8 | եմ 9 | են 10 | ենք 11 | ես 12 | եք 13 | է 14 | էի 15 | էին 16 | էինք 17 | էիր 18 | էիք 19 | էր 20 | ըստ 21 | թ 22 | ի 23 | ին 24 | իսկ 25 | իր 26 | կամ 27 | համար 28 | հետ 29 | հետո 30 | մենք 31 | մեջ 32 | մի 33 | ն 34 | նա 35 | նաև 36 | նրա 37 | նրանք 38 | որ 39 | որը 40 | որոնք 41 | որպես 42 | ու 43 | ում 44 | պիտի 45 | վրա 46 | և 47 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/warc.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/js/src/App.vue: -------------------------------------------------------------------------------- 1 | 7 | 19 | 20 | -------------------------------------------------------------------------------- /src/test/java/dk/kb/netarchivesuite/solrwayback/parsers/CssParserReplacerTest.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers; 2 | 3 | public 
class CssParserReplacerTest { 4 | 5 | 6 | //private static String cssExample= 7 | 8 | 9 | //CSS på følgende side: http://belinda.statsbiblioteket.dk:9721/solrwayback/services/view?source_file_path=/netarkiv-stage/0001/PLIGT/filedir/5065-215-20131114083855-00000-kb-test-har-003.kb.dk.warc.gz&offset=310685392 10 | 11 | 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_hy.txt: -------------------------------------------------------------------------------- 1 | # example set of Armenian stopwords. 2 | այդ 3 | այլ 4 | այն 5 | այս 6 | դու 7 | դուք 8 | եմ 9 | են 10 | ենք 11 | ես 12 | եք 13 | է 14 | էի 15 | էին 16 | էինք 17 | էիր 18 | էիք 19 | էր 20 | ըստ 21 | թ 22 | ի 23 | ին 24 | իսկ 25 | իր 26 | կամ 27 | համար 28 | հետ 29 | հետո 30 | մենք 31 | մեջ 32 | մի 33 | ն 34 | նա 35 | նաև 36 | նրա 37 | նրանք 38 | որ 39 | որը 40 | որոնք 41 | որպես 42 | ու 43 | ում 44 | պիտի 45 | վրա 46 | և 47 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/UrlWrapper.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto; 2 | 3 | import javax.xml.bind.annotation.XmlRootElement; 4 | 5 | @XmlRootElement 6 | public class UrlWrapper { 7 | 8 | private String url; 9 | 10 | public UrlWrapper(){ 11 | 12 | } 13 | 14 | public String getUrl() { 15 | return url; 16 | } 17 | 18 | public void setUrl(String url) { 19 | this.url = url; 20 | } 21 | 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/js/src/store/modal.store.js: -------------------------------------------------------------------------------- 1 | // Global modal state 2 | import { defineStore } from 'pinia' 3 | 4 | export const useModalStore = defineStore('modal', { 5 | 6 | state: () => ({ 7 | showModal: 
false, 8 | currentModal: '' 9 | }), 10 | 11 | actions: { 12 | updateShowModal(shown) { 13 | this.showModal = shown 14 | }, 15 | updateCurrentModal(modal){ 16 | this.currentModal = modal 17 | }, 18 | resetState(){ 19 | this.$reset() 20 | } 21 | } 22 | }) 23 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/LineAndByteCount.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers; 2 | 3 | public class LineAndByteCount { 4 | private String line; 5 | private int byteCount; 6 | 7 | public String getLine() { 8 | return line; 9 | } 10 | public void setLine(String line) { 11 | this.line = line; 12 | } 13 | public int getByteCount() { 14 | return byteCount; 15 | } 16 | public void setByteCount(int byteCount) { 17 | this.byteCount = byteCount; 18 | } 19 | 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/style_element.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Style page 4 | 5 | 6 |

Some styling

7 | 8 |
Hello
9 |
World
10 |

11 |

paragraph

12 | 13 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/css_import_expected.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CSS includes 4 | 9 | 10 | 11 |

Multiple CSS includes

12 | 13 | -------------------------------------------------------------------------------- /src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/test/java/README.txt: -------------------------------------------------------------------------------- 1 | Information about unit tests. 2 | 3 | Property loading. 4 | For unit tests that require the properties to be initialised, load the properties this way: 5 | PropertiesLoader.initProperties(UnitTestUtils.getFile("properties/solrwayback_unittest.properties").getPath()); 6 | 7 | This will use the property files under test/resources/properties. 8 | 9 | If you need a unit test with quite different properties, you can create a new property file and load that. Just be sure 10 | to include unittest in the name of the property file.
11 | 12 | TODO: more documentation -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/ArcWarcFileParserAbstract.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | public class ArcWarcFileParserAbstract { 7 | private static final Logger log = LoggerFactory.getLogger(ArcWarcFileParserAbstract.class); 8 | 9 | public static int getStatusCode(String line){//HTTP/1.1 302 Object moved 10 | String[] tokens = line.split(" "); 11 | String status = tokens[1]; 12 | return Integer.parseInt(status); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/bundle/indexing/batch_warcs1_folder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd ${BASH_SOURCE%/*} > /dev/null 4 | 5 | FILES=warcs1/* 6 | for f in $FILES 7 | do 8 | echo "Processing $f file..." 9 | java -Dfile.encoding=UTF-8 -Xmx1024M -Djava.io.tmpdir=tika_tmp -jar warc-indexer-3.3.1-jar-with-dependencies.jar -c config3.conf -s "http://localhost:8983/solr/netarchivebuilder" $f 10 | done 11 | 12 | echo "Flushing Solr. Documents will be visible after flush" 13 | curl -s "http://localhost:8983/solr/netarchivebuilder/update?commit=true&openSearcher=true" > /dev/null 14 | 15 | popd > /dev/null 16 | -------------------------------------------------------------------------------- /src/bundle/indexing/batch_warcs2_folder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd ${BASH_SOURCE%/*} > /dev/null 4 | 5 | FILES=warcs2/* 6 | for f in $FILES 7 | do 8 | echo "Processing $f file..." 
9 | java -Dfile.encoding=UTF-8 -Xmx1024M -Djava.io.tmpdir=tika_tmp -jar warc-indexer-3.3.1-jar-with-dependencies.jar -c config3.conf -s "http://localhost:8983/solr/netarchivebuilder" $f 10 | done 11 | 12 | echo "Flushing Solr. Documents will be visible after flush" 13 | curl -s "http://localhost:8983/solr/netarchivebuilder/update?commit=true&openSearcher=true" > /dev/null 14 | 15 | popd > /dev/null 16 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/zoo.cfg: -------------------------------------------------------------------------------- 1 | # The number of milliseconds of each tick 2 | tickTime=2000 3 | # The number of ticks that the initial 4 | # synchronization phase can take 5 | initLimit=10 6 | # The number of ticks that can pass between 7 | # sending a request and getting an acknowledgement 8 | syncLimit=5 9 | 10 | # the directory where the snapshot is stored. 11 | # dataDir=/opt/zookeeper/data 12 | # NOTE: Solr defaults the dataDir to /zoo_data 13 | 14 | # the port at which the clients will connect 15 | # clientPort=2181 16 | # NOTE: Solr sets this based on zkRun / zkHost params 17 | 18 | -------------------------------------------------------------------------------- /src/main/webapp/images/preview-24px.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/js/eslint.config.js: -------------------------------------------------------------------------------- 1 | import pluginVue from 'eslint-plugin-vue' 2 | import globals from 'globals' 3 | 4 | export default [ 5 | // add more generic rulesets here, such as: 6 | // js.configs.recommended, 7 | ...pluginVue.configs['flat/recommended'], 8 | { 9 | rules: { 10 | // override/add rules settings here, such as: 11 | 'vue/no-unused-vars': 'error' 12 | // 'vue/multi-word-component-names': 'off', 13 | }, 14 | languageOptions: { 15 | 
sourceType: 'module', 16 | globals: { 17 | ...globals.browser 18 | } 19 | } 20 | } 21 | ] 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/PagePreviewYearsInfo.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto; 2 | 3 | public class PagePreviewYearsInfo { 4 | 5 | 6 | int year; 7 | int count; 8 | 9 | public PagePreviewYearsInfo() { 10 | 11 | } 12 | 13 | 14 | public int getYear() { 15 | return year; 16 | } 17 | public void setYear(int year) { 18 | this.year = year; 19 | } 20 | public int getCount() { 21 | return count; 22 | } 23 | public void setCount(int count) { 24 | this.count = count; 25 | } 26 | 27 | 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/script_escape_expected.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with JavaScript 4 | 5 | 6 |

Less han/greater than problems

7 | 19 |

Click me!

20 | 21 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/script_escape.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with JavaScript 4 | 5 | 6 |

Less han/greater than problems

7 | 8 | 20 |

Click me!

21 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/Font_Awesome_5_regular_clipboard.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/js/src/components/ToolboxComponents/NgramNetarchive.vue: -------------------------------------------------------------------------------- 1 | 12 | 13 | 29 | -------------------------------------------------------------------------------- /src/js/src/store/notifier.store.js: -------------------------------------------------------------------------------- 1 | // Global notifier state 2 | import { defineStore } from 'pinia' 3 | import { useSearchStore } from '../store/search.store' 4 | 5 | export const useNotifierStore = defineStore('notifier', { 6 | 7 | state: () => ({ 8 | notifications: [] 9 | }), 10 | 11 | 12 | actions: { 13 | setNotification( notification) { 14 | this.notifications.push(notification) 15 | }, 16 | dismissNotification ( notification) { 17 | this.notifications.pop() 18 | const search = useSearchStore() 19 | search.setLoadingStatus(false) 20 | }, 21 | resetState(){ 22 | this.$reset() 23 | } 24 | } 25 | 26 | }) 27 | 28 | -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/rewrite.config: -------------------------------------------------------------------------------- 1 | # Add this line to tomcat/conf/logging.properties to get debug log for the rewritevalve 2 | # org.apache.catalina.core.ContainerBase.[Catalina].[localhost].level = FINE 3 | 4 | RewriteCond %{REQUEST_URI} !^/solrwayback/static/.* 5 | RewriteCond %{REQUEST_URI} !^/solrwayback/services/.* 6 | RewriteRule ^. 
/solrwayback_index_page.html [L] 7 | RewriteRule ^.calendar*$ /solrwayback_index_page.html [L] 8 | RewriteRule ^.pageharvestdata*$ /solrwayback_index_page.html [L] 9 | RewriteRule ^.pwid*$ /solrwayback_index_page.html [L] 10 | RewriteRule ^.search*$ /solrwayback_index_page.html [L] 11 | RewriteRule ^.linkgraph*$ /solrwayback_index_page.html [L] 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/test/java/dk/kb/netarchivesuite/solrwayback/util/SolrUtilsTest.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.util; 2 | 3 | import org.junit.Test; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | public class SolrUtilsTest { 8 | 9 | @Test 10 | public void combineFilterQueriesTest(){ 11 | String[] filtersFromFrontend = new String[]{"filter1:value1", "filter2:value2", "foo:bar OR bar:zoo"}; 12 | String filterquery = SolrUtils.combineFilterQueries("content_type", "text/html", filtersFromFrontend); 13 | 14 | assertEquals("(content_type:text/html) AND (filter1:value1) AND (filter2:value2) AND (foo:bar OR bar:zoo)", filterquery); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/test/resources/compressions_warc/README.md: -------------------------------------------------------------------------------- 1 | # Brotli and GZip compression sample WARCs 2 | 3 | The WARCs in this folder are used to test compression support. 4 | Each WARC contains a single HTML-page with the text 5 | _"Extremely simple webpage used for testing GZip and Brotli transmission compression."_. 6 | 7 | The WARCs has been harvested using wget, specifying either none, GZip or Brotli as transmission compression. For completeness this was done with and without WARC-GZip compression. 
A sample call is 8 | ``` 9 | ./wget_latest --delete-after --no-warc-keep-log --header="accept-encoding: br" --warc-file="transfer_compression_brotli" 'http://tokemon.sb.statsbiblioteket.dk' 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /src/main/webapp/META-INF/context.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/js/solrwayback_index_page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | SolrWayback 10 | 11 | 12 | 13 | 14 | 17 |
18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /src/js/src/components/modalComponents/CollectionInfo.vue: -------------------------------------------------------------------------------- 1 | 6 | 7 | -------------------------------------------------------------------------------- /src/js/src/configs/index.js: -------------------------------------------------------------------------------- 1 | export default { 2 | playbackConfig: { 3 | alternativePlaybackBaseURL: '', 4 | solrwaybackBaseURL:'', 5 | playbackDisabled:false 6 | }, 7 | exportOptions: { 8 | warcAllowed:false, 9 | csvAllowed:false, 10 | csvFields:'' 11 | }, 12 | 13 | visualizations:{ 14 | ngram:{ 15 | startYear:'' 16 | } 17 | }, 18 | 19 | logo:{ 20 | url: '' 21 | }, 22 | 23 | collection:{ 24 | playback: new Map() 25 | }, 26 | 27 | search:{ 28 | uploadedFileDisabled:false, 29 | pagination:20 30 | }, 31 | 32 | leaflet: { 33 | attribution:'', 34 | source:'', 35 | map: { 36 | latitude:'', 37 | longitude:'', 38 | radius:'' 39 | } 40 | }, 41 | } -------------------------------------------------------------------------------- /src/js/src/components/modalComponents/SearchGuidelines.vue: -------------------------------------------------------------------------------- 1 | 6 | 7 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/graph/D3Graph.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto.graph; 2 | 3 | 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import javax.xml.bind.annotation.XmlRootElement; 8 | 9 | @XmlRootElement 10 | public class D3Graph { 11 | 12 | public List nodes = new ArrayList(); 13 | public List links = new ArrayList(); 14 | 15 | public List getNodes() { 16 | return nodes; 17 | } 18 | public void setNodes(List nodes) { 19 
| this.nodes = nodes; 20 | } 21 | public List getLinks() { 22 | return links; 23 | } 24 | public void setLinks(List links) { 25 | this.links = links; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/test/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrTestClient.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.solr; 2 | 3 | import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | public class NetarchiveSolrTestClient extends NetarchiveSolrClient{ 8 | 9 | private static final Logger log = LoggerFactory.getLogger(NetarchiveSolrTestClient.class); 10 | /* 11 | * Called from unittest 12 | */ 13 | public static void initializeOverLoadUnitTest(EmbeddedSolrServer server) { 14 | solrServer=server; 15 | noCacheSolrServer=server; 16 | instance = new NetarchiveSolrTestClient(); 17 | log.info("SolrClient initialized with embedded solr for unittest"); 18 | } 19 | 20 | 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/WordCloudWordAndCount.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto; 2 | 3 | import javax.xml.bind.annotation.XmlRootElement; 4 | 5 | @XmlRootElement 6 | public class WordCloudWordAndCount { 7 | 8 | private int count; 9 | private String word; 10 | 11 | public WordCloudWordAndCount() { 12 | } 13 | 14 | public WordCloudWordAndCount(String word, int count) { 15 | this.word=word; 16 | this.count=count; 17 | } 18 | 19 | 20 | public int getCount() { 21 | return count; 22 | } 23 | 24 | public void setCount(int count) { 25 | this.count = count; 26 | } 27 | 28 | public String getWord() { 29 | return word; 30 | } 31 | 32 | public void 
setWord(String word) { 33 | this.word = word; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetVideoInfo.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers.json; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * Object that contains a tweet's video variants - i.e. the objects holding the actual links to the video formats. 7 | * Annoyingly, this useless intermediary object had to be made in order to parse the child objects without doing it in 8 | * an ugly and hacky way... 9 | */ 10 | public class TweetVideoInfo { 11 | private List variants; 12 | 13 | public TweetVideoInfo() { 14 | } 15 | 16 | public List getVariants() { 17 | return variants; 18 | } 19 | 20 | public void setVariants(List variants) { 21 | this.variants = variants; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/css2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Specific problem 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |

Nothing here

14 | 15 | " -------------------------------------------------------------------------------- /src/bundle/solrwayback_bundle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Check if an argument was provided 3 | if [ -z "$1" ]; then 4 | echo "Usage: $0 {start|stop}" 5 | exit 1 6 | fi 7 | 8 | case "$1" in 9 | start) 10 | echo "Starting solr..." 11 | ./solr-9/bin/solr start -c -m 4g 12 | 13 | echo "Starting SolrWayback in tomcat..." 14 | ./tomcat-9/bin/startup.sh 15 | 16 | printf "\n\nStarted SolrWayback\n" 17 | ;; 18 | stop) 19 | echo "Stopping SolrWayback in tomcat..." 20 | ./tomcat-9/bin/shutdown.sh 21 | 22 | echo "Stopping solr..." 23 | ./solr-9/bin/solr stop 24 | 25 | printf "\n\nStopped SolrWayback\n" 26 | ;; 27 | *) 28 | echo "Invalid option: $1" 29 | echo "Usage: $0 {start|stop}" 30 | exit 2 31 | ;; 32 | esac 33 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/smurf/DateCount.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto.smurf; 2 | 3 | import javax.xml.bind.annotation.XmlRootElement; 4 | 5 | @XmlRootElement 6 | public class DateCount { 7 | 8 | private String date; 9 | private long count; 10 | private long total; 11 | 12 | public DateCount(){ 13 | } 14 | 15 | public long getCount() { 16 | return count; 17 | } 18 | 19 | public void setCount(long count) { 20 | this.count = count; 21 | } 22 | 23 | public long getTotal() { 24 | return total; 25 | } 26 | 27 | public void setTotal(long total) { 28 | this.total = total; 29 | } 30 | 31 | public String getDate() { 32 | return date; 33 | } 34 | 35 | public void setDate(String date) { 36 | this.date = date; 37 | } 38 | 39 | } -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | solrwayback.iml 4 | /solrwayback.properties 5 | target 6 | /.classpath 7 | /deployLocalTomcat.sh 8 | /.settings/ 9 | /.project 10 | /deployBelindaTomcat.sh 11 | /deployTeg.sh 12 | /solrwayback.log 13 | /temp/ 14 | /bugs.txt 15 | /deployWindows.bat 16 | 17 | 18 | # Webapp 19 | 20 | src/js/.DS_Store 21 | src/js/node_modules 22 | src/js/dist 23 | 24 | # local env files 25 | src/js/.env.local 26 | src/js/.env.*.local 27 | 28 | # Log files 29 | src/js/npm-debug.log* 30 | src/js/yarn-debug.log* 31 | src/js/yarn-error.log* 32 | 33 | 34 | #deployToTest 35 | src/js/deployToTest.sh 36 | 37 | # Editor directories and files 38 | src/js/.idea 39 | src/js/.vscode 40 | src/js/*.suo 41 | src/js/*.ntvs* 42 | src/js/*.njsproj 43 | src/js/*.sln 44 | src/js/*.sw? 45 | 46 | 47 | /deployTegLocal.sh 48 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/stopwords.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/script2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with JavaScript 4 | 5 | 6 |

Modified from a specific page in the Danish net Archive

7 | 8 | 17 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/SolrWaybackApplication.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service; 2 | 3 | import java.util.Arrays; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import javax.ws.rs.core.Application; 8 | 9 | import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider; 10 | 11 | import dk.kb.netarchivesuite.solrwayback.service.exception.ServiceExceptionMapper; 12 | 13 | 14 | public class SolrWaybackApplication extends Application { 15 | 16 | public Set> getClasses() { 17 | return new HashSet<>(Arrays.asList( 18 | JacksonJsonProvider.class, 19 | SolrWaybackResource.class, 20 | SolrWaybackResourceWeb.class, 21 | SolrWaybackMementoAPI.class, 22 | ServiceExceptionMapper.class 23 | )); 24 | } 25 | 26 | 27 | } -------------------------------------------------------------------------------- /src/js/src/assets/styles/pwid.scss: -------------------------------------------------------------------------------- 1 | .pwidContainer{ 2 | padding:2rem; 3 | } 4 | 5 | .copyToClipboard { 6 | display: block; 7 | height: 30px; /*height of icon */ 8 | width: 30px; /*width of icon */ 9 | cursor: pointer; 10 | float:left; 11 | 12 | } 13 | 14 | .copyToClipboardText{ 15 | font-size:150%; 16 | margin: 0 0 2rem 0; 17 | cursor: pointer; 18 | color:var(--main-highlight-color); 19 | text-decoration: underline; 20 | } 21 | 22 | .copyContainer{ 23 | margin: 0 0 2rem 0; 24 | padding: 0 0 2rem 0; 25 | border-bottom: 1px dashed black 26 | } 27 | 28 | .clipBoardIcon { 29 | background: url(../assets/icons/Font_Awesome_5_regular_clipboard.svg) no-repeat 0px 0px; 30 | } 31 | 32 | .checkmarkIcon { 33 | background: url(../assets/icons/Icons8_flat_checkmark.svg) no-repeat 0px 0px; 34 | } 35 | 36 | .PWIDRawData { 37 | white-space: pre-line; 38 | } 
-------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/SearchResult.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import javax.xml.bind.annotation.XmlRootElement; 7 | 8 | @XmlRootElement 9 | public class SearchResult { 10 | 11 | private long numberOfResults=0; 12 | private List results = new ArrayList(); 13 | public long getNumberOfResults() { 14 | return numberOfResults; 15 | } 16 | public void setNumberOfResults(long numberOfResults) { 17 | this.numberOfResults = numberOfResults; 18 | } 19 | public List getResults() { 20 | return results; 21 | } 22 | public void setResults(List results) { 23 | this.results = results; 24 | } 25 | 26 | 27 | 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetVideoVariant.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers.json; 2 | 3 | public class TweetVideoVariant { 4 | private String url; 5 | 6 | private int bitrate; 7 | 8 | private String contentType; 9 | 10 | public TweetVideoVariant() { 11 | } 12 | 13 | public String getUrl() { 14 | return url; 15 | } 16 | 17 | public void setUrl(String url) { 18 | this.url = url; 19 | } 20 | 21 | public int getBitrate() { 22 | return bitrate; 23 | } 24 | 25 | public void setBitrate(int bitrate) { 26 | this.bitrate = bitrate; 27 | } 28 | 29 | public String getContentType() { 30 | return contentType; 31 | } 32 | 33 | public void setContentType(String contentType) { 34 | this.contentType = contentType; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- 
/src/test/resources/solr_9/netarchivebuilder/conf/stopwords.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /src/js/src/components/harvestTimeResources/HarvestResourcesMissing.vue: -------------------------------------------------------------------------------- 1 | 20 | 21 | 35 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetMedia.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers.json; 2 | 3 | public class TweetMedia { 4 | private String mediaUrl; 5 | 6 | private String type; 7 | 8 | private TweetVideoInfo videoInfo; 9 | 10 | public TweetMedia() { 11 | } 12 | 13 | public String getMediaUrl() { 14 | return mediaUrl; 15 | } 16 | 17 | public void setMediaUrl(String mediaUrl) { 18 | this.mediaUrl = mediaUrl; 19 | } 20 | 21 | public String getType() { 22 | return type; 23 | } 24 | 25 | public void setType(String type) { 26 | this.type = 
type; 27 | } 28 | 29 | public TweetVideoInfo getVideoInfo() { 30 | return videoInfo; 31 | } 32 | 33 | public void setVideoInfo(TweetVideoInfo videoInfo) { 34 | this.videoInfo = videoInfo; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/stopwords_el.txt: -------------------------------------------------------------------------------- 1 | # Lucene Greek Stopwords list 2 | # Note: by default this file is used after GreekLowerCaseFilter, 3 | # so when modifying this file use 'σ' instead of 'ς' 4 | ο 5 | η 6 | το 7 | οι 8 | τα 9 | του 10 | τησ 11 | των 12 | τον 13 | την 14 | και 15 | κι 16 | κ 17 | ειμαι 18 | εισαι 19 | ειναι 20 | ειμαστε 21 | ειστε 22 | στο 23 | στον 24 | στη 25 | στην 26 | μα 27 | αλλα 28 | απο 29 | για 30 | προσ 31 | με 32 | σε 33 | ωσ 34 | παρα 35 | αντι 36 | κατα 37 | μετα 38 | θα 39 | να 40 | δε 41 | δεν 42 | μη 43 | μην 44 | επι 45 | ενω 46 | εαν 47 | αν 48 | τοτε 49 | που 50 | πωσ 51 | ποιοσ 52 | ποια 53 | ποιο 54 | ποιοι 55 | ποιεσ 56 | ποιων 57 | ποιουσ 58 | αυτοσ 59 | αυτη 60 | αυτο 61 | αυτοι 62 | αυτων 63 | αυτουσ 64 | αυτεσ 65 | αυτα 66 | εκεινοσ 67 | εκεινη 68 | εκεινο 69 | εκεινοι 70 | εκεινεσ 71 | εκεινα 72 | εκεινων 73 | εκεινουσ 74 | οπωσ 75 | ομωσ 76 | ισωσ 77 | οσο 78 | οτι 79 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/exception/NotFoundServiceException.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.exception; 2 | 3 | import javax.ws.rs.core.Response; 4 | 5 | public class NotFoundServiceException extends SolrWaybackServiceException { 6 | 7 | private static final long serialVersionUID = 27182818L; 8 | private static final Response.Status responseStatus = Response.Status.NOT_FOUND; 9 | 10 | public NotFoundServiceException() { 11 | super(responseStatus); 12 | } 13 
| 14 | public NotFoundServiceException(String message) { 15 | super(message, responseStatus); 16 | } 17 | 18 | public NotFoundServiceException(String message, Throwable cause) { 19 | super(message, cause, responseStatus); 20 | } 21 | 22 | public NotFoundServiceException(Throwable cause) { 23 | super(cause, responseStatus); 24 | } 25 | } 26 | 27 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_el.txt: -------------------------------------------------------------------------------- 1 | # Lucene Greek Stopwords list 2 | # Note: by default this file is used after GreekLowerCaseFilter, 3 | # so when modifying this file use 'σ' instead of 'ς' 4 | ο 5 | η 6 | το 7 | οι 8 | τα 9 | του 10 | τησ 11 | των 12 | τον 13 | την 14 | και 15 | κι 16 | κ 17 | ειμαι 18 | εισαι 19 | ειναι 20 | ειμαστε 21 | ειστε 22 | στο 23 | στον 24 | στη 25 | στην 26 | μα 27 | αλλα 28 | απο 29 | για 30 | προσ 31 | με 32 | σε 33 | ωσ 34 | παρα 35 | αντι 36 | κατα 37 | μετα 38 | θα 39 | να 40 | δε 41 | δεν 42 | μη 43 | μην 44 | επι 45 | ενω 46 | εαν 47 | αν 48 | τοτε 49 | που 50 | πωσ 51 | ποιοσ 52 | ποια 53 | ποιο 54 | ποιοι 55 | ποιεσ 56 | ποιων 57 | ποιουσ 58 | αυτοσ 59 | αυτη 60 | αυτο 61 | αυτοι 62 | αυτων 63 | αυτουσ 64 | αυτεσ 65 | αυτα 66 | εκεινοσ 67 | εκεινη 68 | εκεινο 69 | εκεινοι 70 | εκεινεσ 71 | εκεινα 72 | εκεινων 73 | εκεινουσ 74 | οπωσ 75 | ομωσ 76 | ισωσ 77 | οσο 78 | οτι 79 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/PagePreview.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto; 2 | 3 | public class PagePreview { 4 | 5 | private String pagePreviewUrl; 6 | private Long crawlDate; 7 | private String solrWaybackUrl; 8 | 9 | public PagePreview(){ 10 | 11 | } 12 | 13 | public String 
getPagePreviewUrl() { 14 | return pagePreviewUrl; 15 | } 16 | 17 | public void setPagePreviewUrl(String pagePreviewUrl) { 18 | this.pagePreviewUrl = pagePreviewUrl; 19 | } 20 | 21 | public Long getCrawlDate() { 22 | return crawlDate; 23 | } 24 | 25 | public void setCrawlDate(Long crawlDate) { 26 | this.crawlDate = crawlDate; 27 | } 28 | 29 | public String getSolrWaybackUrl() { 30 | return solrWaybackUrl; 31 | } 32 | 33 | public void setSolrWaybackUrl(String solrWaybackUrl) { 34 | this.solrWaybackUrl = solrWaybackUrl; 35 | } 36 | 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/js/src/components/ngrams/chartsCore/chartHelpers/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Helpers for charting. 3 | * 4 | */ 5 | 6 | import ChartOptionsCore from '../../chartsCore/chartConfigs/chartOptionsCore' 7 | 8 | export default { 9 | 10 | /** 11 | * Generate labels for chart. 12 | * - Override with own label config as you see fit - 13 | */ 14 | getChartLabels: (labels, scale) => { 15 | return ChartOptionsCore.getChartLabels(labels, scale) 16 | }, 17 | 18 | /** 19 | * Generate options for chart. 20 | * - Override with own options config as you see fit - 21 | */ 22 | getChartOptions: (searchType, scale) => { 23 | return ChartOptionsCore.getChartOptions(searchType, scale) 24 | }, 25 | 26 | /** 27 | * Generate dataset for chart. 
28 | * - Override with own data set config as you see fit - 29 | */ 30 | getChartDataSet(rawData) { 31 | return ChartOptionsCore.getChartDataSet(rawData) 32 | } 33 | 34 | } 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/interfaces/IdentityArcFileResolver.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.interfaces; 2 | 3 | 4 | import java.util.Map; 5 | 6 | /** 7 | * @deprecated use {@link RewriteLocationResolver} instead. Its default behaviour works 100% as IdentityArcFileResolver. 8 | */ 9 | public class IdentityArcFileResolver implements ArcFileLocationResolverInterface { 10 | /* 11 | * This implementation just returns the same file location as output. Can be used if path to the arc-files is the same as 12 | * the index field: source_file_path 13 | */ 14 | @Override 15 | public ArcSource resolveArcFileLocation(String source_file_path) { 16 | return ArcSource.fromFile(source_file_path); 17 | } 18 | @Override 19 | public void setParameters(Map parameters) { 20 | //Does not use parameters 21 | } 22 | 23 | @Override 24 | public void initialize() { 25 | // do noting 26 | } 27 | 28 | } -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/exception/ServiceExceptionMapper.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.exception; 2 | 3 | import javax.ws.rs.core.Response; 4 | import javax.ws.rs.ext.ExceptionMapper; 5 | import javax.ws.rs.ext.Provider; 6 | 7 | @Provider 8 | public class ServiceExceptionMapper implements ExceptionMapper { 9 | @Override 10 | public Response toResponse(SolrWaybackServiceException exception) { 11 | 12 | Response.Status responseStatus = exception.getResponseStatus(); 13 | String message 
= exception.getMessage(); 14 | 15 | return (message != null) 16 | ? Response.status(responseStatus) 17 | .entity(message) 18 | .type("text/plain") 19 | .build() 20 | : Response.status(responseStatus).build(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/exception/InternalServiceException.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.exception; 2 | 3 | 4 | import javax.ws.rs.core.Response; 5 | 6 | public class InternalServiceException extends SolrWaybackServiceException { 7 | 8 | private static final long serialVersionUID = 27182818L; 9 | private static final Response.Status responseStatus = Response.Status.INTERNAL_SERVER_ERROR; 10 | 11 | public InternalServiceException() { 12 | super(responseStatus); 13 | } 14 | 15 | public InternalServiceException(String message) { 16 | super(message, responseStatus); 17 | } 18 | 19 | public InternalServiceException(String message, Throwable cause) { 20 | super(message, cause, responseStatus); 21 | } 22 | 23 | public InternalServiceException(Throwable cause) { 24 | super(cause, responseStatus); 25 | } 26 | 27 | 28 | } 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/protwords.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | # Use a protected word file to protect against the stemmer reducing two 15 | # unrelated words to the same base word. 16 | 17 | # Some non-words that normally won't be encountered, 18 | # just to test that they won't be stemmed. 19 | dontstems 20 | zwhacky 21 | 22 | -------------------------------------------------------------------------------- /src/js/src/components/harvestTimeResources/HarvestMaxTimeDifference.vue: -------------------------------------------------------------------------------- 1 | 18 | 19 | 32 | -------------------------------------------------------------------------------- /src/js/src/components/searchSingleItemComponents/SearchSingleItemFocusImage.vue: -------------------------------------------------------------------------------- 1 | 13 | 14 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/HarvestDates.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import javax.xml.bind.annotation.XmlRootElement; 7 | 8 | @XmlRootElement 9 | public class HarvestDates { 10 | 11 | private String url; 12 | private long numberOfHarvests=0; 13 | private List dates = new ArrayList(); 14 | public long getNumberOfHarvests() { 15 | return numberOfHarvests; 16 | } 17 | public void setNumberOfHarvests(long numberOfHarvests) { 18 | this.numberOfHarvests = numberOfHarvests; 19 | } 20 | public List getDates() { 21 | return dates; 22 | } 23 | public void setDates(List dates) { 24 | this.dates = dates; 25 | } 26 | public String getUrl() { 27 | return url; 28 | } 29 | public void setUrl(String url) { 
30 | this.url = url; 31 | } 32 | 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/exception/InvalidArgumentServiceException.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.exception; 2 | 3 | import javax.ws.rs.core.Response; 4 | 5 | public class InvalidArgumentServiceException extends SolrWaybackServiceException { 6 | 7 | private static final long serialVersionUID = 27182818L; 8 | private static final Response.Status responseStatus = Response.Status.BAD_REQUEST; 9 | 10 | public InvalidArgumentServiceException() { 11 | super(responseStatus); 12 | } 13 | 14 | public InvalidArgumentServiceException(String message) { 15 | super(message, responseStatus); 16 | } 17 | 18 | public InvalidArgumentServiceException(String message, Throwable cause) { 19 | super(message, cause, responseStatus); 20 | } 21 | 22 | public InvalidArgumentServiceException(Throwable cause) { 23 | super(cause, responseStatus); 24 | } 25 | 26 | 27 | } 28 | 29 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/playback/PlaybackHandler.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.playback; 2 | 3 | import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry; 4 | import dk.kb.netarchivesuite.solrwayback.service.dto.IndexDoc; 5 | 6 | public abstract class PlaybackHandler { 7 | 8 | protected ArcEntry arc; 9 | protected IndexDoc doc; 10 | protected boolean showToolbar; 11 | 12 | public PlaybackHandler(ArcEntry arc,IndexDoc doc, boolean showToolbar){ 13 | this.arc=arc; 14 | this.doc=doc; 15 | this.showToolbar=showToolbar; 16 | } 17 | 18 | /** 19 | * Deliver a webpage for playback. 
20 | * @param lenient if true, lenient resource URL resolving is used. 21 | * If false, only {@code url_norm:"normURL"} is used. 22 | * @return a webpage for playback. 23 | * @throws Exception if the webpage could not be rendered. 24 | */ 25 | public abstract ArcEntry playback(boolean lenient) throws Exception; 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/css2_expected.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Specific problem 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |

Nothing here

" 14 | 15 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/protwords.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | # Use a protected word file to protect against the stemmer reducing two 15 | # unrelated words to the same base word. 16 | 17 | # Some non-words that normally won't be encountered, 18 | # just to test that they won't be stemmed. 
19 | dontstems 20 | zwhacky 21 | 22 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetHashtag.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers.json; 2 | 3 | import com.fasterxml.jackson.annotation.JsonProperty; 4 | import org.apache.commons.lang3.tuple.Pair; 5 | 6 | public class TweetHashtag implements TweetEntity { 7 | private Pair indices; 8 | 9 | private String text; 10 | 11 | 12 | public TweetHashtag() { 13 | } 14 | 15 | @JsonProperty("indices") 16 | private void unpackIndices(int[] indices) { 17 | this.indices = Pair.of(indices[0], indices[1]); 18 | } 19 | 20 | public Pair getIndices() { 21 | return indices; 22 | } 23 | 24 | @Override 25 | public void setIndices(Pair newIndices) { 26 | this.indices = newIndices; 27 | } 28 | 29 | public String getText() { 30 | return text; 31 | } 32 | 33 | public void setText(String text) { 34 | this.text = text; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/script2_expected.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page with JavaScript 4 | 5 | 6 |

Modified from a specific page in the Danish net Archive

7 | 16 | 17 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/graph/Link.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto.graph; 2 | 3 | import javax.xml.bind.annotation.XmlRootElement; 4 | 5 | @XmlRootElement 6 | public class Link { 7 | 8 | private int source; 9 | private int target; 10 | private int weight = 1; 11 | 12 | public Link(){ 13 | } 14 | 15 | public Link(int source, int target, int weight){ 16 | this.source=source; 17 | this.target=target; 18 | this.weight=weight; 19 | } 20 | 21 | public int getSource() { 22 | return source; 23 | } 24 | 25 | public void setSource(int source) { 26 | this.source = source; 27 | } 28 | 29 | public int getTarget() { 30 | return target; 31 | } 32 | 33 | public void setTarget(int target) { 34 | this.target = target; 35 | } 36 | 37 | public int getWeight() { 38 | return weight; 39 | } 40 | 41 | public void setWeight(int weight) { 42 | this.weight = weight; 43 | } 44 | 45 | 46 | 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/js/src/main.js: -------------------------------------------------------------------------------- 1 | import { createApp } from 'vue' 2 | import { createPinia } from 'pinia' 3 | import App from './App.vue' 4 | import router from './router' 5 | import Axios from 'axios' 6 | import VTooltip from 'v-tooltip' 7 | import { setServerConfigInApp } from './configs/configHelper' 8 | 9 | import './assets/styles/main.scss' 10 | 11 | const baseURL = import.meta.env.DEV 12 | ? 
'/' 13 | : import.meta.env.BASE_URL; 14 | 15 | if (import.meta.env.DEV) { 16 | Axios.defaults.baseURL = '/'; 17 | } 18 | 19 | Axios.get(baseURL + 'services/frontend/properties/solrwaybackweb/') 20 | .then(response => { 21 | setServerConfigInApp(response.data) 22 | 23 | const pinia = createPinia() 24 | const app = createApp(App) 25 | 26 | app.use(pinia) 27 | app.use(router) 28 | app.use(VTooltip) 29 | 30 | app.mount('#app') 31 | }) 32 | .catch(error => { 33 | // TODO - unsure what best to do here (Ben) 34 | console.error("Failed to load server config", error) 35 | }) 36 | 37 | -------------------------------------------------------------------------------- /src/js/src/assets/icons/twitter.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 14 | 15 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/statistics/QueryPercentilesStatistics.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto.statistics; 2 | 3 | import org.apache.solr.client.solrj.response.FieldStatsInfo; 4 | 5 | import javax.xml.bind.annotation.XmlRootElement; 6 | import java.util.Map; 7 | 8 | @XmlRootElement 9 | public class QueryPercentilesStatistics { 10 | private String name; 11 | private Map percentiles; 12 | 13 | public QueryPercentilesStatistics(FieldStatsInfo fieldStatsInfo){ 14 | this.name= fieldStatsInfo.getName(); 15 | this.percentiles= fieldStatsInfo.getPercentiles(); 16 | } 17 | 18 | // Getters 19 | public String getName(){ 20 | return name; 21 | } 22 | 23 | public Map getPercentiles(){ 24 | return percentiles; 25 | } 26 | 27 | // Setters 28 | public void setName(String name) { 29 | this.name = name; 30 | } 31 | 32 | public void setPercentiles(Map percentiles) { 33 | this.percentiles = percentiles; 34 | } 35 | } 36 | 
-------------------------------------------------------------------------------- /src/js/src/components/AboutComponent.vue: -------------------------------------------------------------------------------- 1 | 9 | 10 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /src/bundle/solrwayback_bundle.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | REM Check if an argument was provided 3 | IF "%~1"=="" ( 4 | echo Usage: %~nx0 ^{start^|stop^} 5 | exit /b 1 6 | ) 7 | 8 | REM Main case handling 9 | IF /I "%~1"=="start" ( 10 | echo Starting solr... 11 | call solr-9\bin\solr.cmd start -c -m 4g 12 | 13 | echo Starting SolrWayback in tomcat... 14 | 15 | REM Set CATALINA_HOME to the "tomcat-9" folder inside the current directory 16 | set "CATALINA_HOME=%cd%\tomcat-9" 17 | 18 | call tomcat-9\bin\startup.bat 19 | 20 | echo. 21 | echo Started SolrWayback 22 | GOTO :eof 23 | ) 24 | 25 | IF /I "%~1"=="stop" ( 26 | echo Stopping SolrWayback in tomcat... 27 | 28 | REM Set CATALINA_HOME to the "tomcat-9" folder inside the current directory 29 | set "CATALINA_HOME=%cd%\tomcat-9" 30 | call tomcat-9\bin\shutdown.bat 31 | 32 | echo Stopping solr... 33 | call solr-9\bin\solr.cmd stop -all 34 | 35 | echo. 36 | echo Stopped SolrWayback 37 | GOTO :eof 38 | ) 39 | 40 | REM Invalid option 41 | echo Invalid option: %~1 42 | echo Usage: %~nx0 ^{start^|stop^} 43 | exit /b 2 44 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/interfaces/ArcHTTPResolver.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed under the Apache License, Version 2.0 (the "License"); 3 | * you may not use this file except in compliance with the License. 
4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | * 14 | */ 15 | package dk.kb.netarchivesuite.solrwayback.interfaces; 16 | 17 | import org.slf4j.Logger; 18 | import org.slf4j.LoggerFactory; 19 | 20 | /** 21 | * This class is only kept for backwards compatibility. 22 | * @deprecated use {@link RewriteLocationResolver} instead. 23 | */ 24 | public class ArcHTTPResolver extends RewriteLocationResolver { 25 | private static final Logger log = LoggerFactory.getLogger(ArcHTTPResolver.class); 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/js/src/mixins/StringManipulationUtils.js: -------------------------------------------------------------------------------- 1 | export default { 2 | methods: { 3 | 4 | $_displayFacetName(facet) { 5 | return facet.replace('&fq=','').split(':')[0] + ': ' 6 | }, 7 | $_displayFacetValue(facet) { 8 | let s = facet.split(':') 9 | return s.slice(1, s.length).join(' ').replace(/"/g,'') 10 | }, 11 | $_checkDomain(domain) { 12 | // Matches at least 1 dot in the string, and no spaces. 
13 | return domain.match(/^[^\s]+\.[^\s]+$/) 14 | }, 15 | $_checkDate(date) { 16 | // Matches format YYYY-MM-DD 17 | return date.match(/^([12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01]))$/) 18 | }, 19 | $_displayDate(date, timeScale){ 20 | // Display only the scale 21 | let end = 0 22 | if (date.length > 0) { 23 | switch (timeScale) { 24 | case 'YEAR': 25 | case 'null': 26 | end = 4 27 | break 28 | case 'MONTH': 29 | end = 7 30 | break 31 | case 'WEEK': 32 | case 'DAY': 33 | default: 34 | end = 10 35 | }} 36 | return date.slice(0, end) 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/style_element_expected.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Style page 4 | 5 | 6 | 7 |

Some styling

8 |
9 | Hello 10 |
11 |
12 | World 13 |
14 |

15 |

paragraph

16 | 17 | -------------------------------------------------------------------------------- /src/js/src/mixins/ImageSearchUtils.js: -------------------------------------------------------------------------------- 1 | import HistoryRoutingUtils from './HistoryRoutingUtils' 2 | import { mapStores, mapActions } from 'pinia' 3 | import { useSearchStore } from '../store/search.store' 4 | 5 | export default { 6 | mixins: [HistoryRoutingUtils], 7 | computed: { 8 | // ...mapState({ 9 | // searchAppliedFacets: state => state.Search.searchAppliedFacets, 10 | // solrSettings: state => state.Search.solrSettings, 11 | // }), 12 | ...mapStores(useSearchStore) 13 | }, 14 | methods: { 15 | ...mapActions(useSearchStore, { 16 | updateSolrSettingImgSearch:'updateSolrSettingImgSearch', 17 | }), 18 | $_startPageSearchFromImage(searchItem) { 19 | return '/search?query=' + 'links_images:"' + encodeURIComponent(searchItem) + '"' + '&offset=0&grouping=' + this.searchStore.solrSettings.grouping + '&imgSearch=false&urlSearch=false&facets=' 20 | }, 21 | $_startImageSearchFromImage(searchItem) { 22 | return '/search?query=' + 'hash:"' + encodeURIComponent(searchItem) + '"' + '&offset=0&grouping=' + this.searchStore.solrSettings.grouping + '&imgSearch=false&urlSearch=false&facets=' 23 | }, 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/stopwords_ga.txt: -------------------------------------------------------------------------------- 1 | 2 | a 3 | ach 4 | ag 5 | agus 6 | an 7 | aon 8 | ar 9 | arna 10 | as 11 | b' 12 | ba 13 | beirt 14 | bhúr 15 | caoga 16 | ceathair 17 | ceathrar 18 | chomh 19 | chtó 20 | chuig 21 | chun 22 | cois 23 | céad 24 | cúig 25 | cúigear 26 | d' 27 | daichead 28 | dar 29 | de 30 | deich 31 | deichniúr 32 | den 33 | dhá 34 | do 35 | don 36 | dtí 37 | dá 38 | dár 39 | dó 40 | faoi 41 | faoin 42 | faoina 43 | faoinár 44 | fara 45 | fiche 46 | gach 47 | gan 48 | go 49 | gur 50 | haon 51 | 
hocht 52 | i 53 | iad 54 | idir 55 | in 56 | ina 57 | ins 58 | inár 59 | is 60 | le 61 | leis 62 | lena 63 | lenár 64 | m' 65 | mar 66 | mo 67 | mé 68 | na 69 | nach 70 | naoi 71 | naonúr 72 | ná 73 | ní 74 | níor 75 | nó 76 | nócha 77 | ocht 78 | ochtar 79 | os 80 | roimh 81 | sa 82 | seacht 83 | seachtar 84 | seachtó 85 | seasca 86 | seisear 87 | siad 88 | sibh 89 | sinn 90 | sna 91 | sé 92 | sí 93 | tar 94 | thar 95 | thú 96 | triúr 97 | trí 98 | trína 99 | trínár 100 | tríocha 101 | tú 102 | um 103 | ár 104 | é 105 | éis 106 | í 107 | ó 108 | ón 109 | óna 110 | ónár 111 | -------------------------------------------------------------------------------- /src/main/resources/about_collection.txt: -------------------------------------------------------------------------------- 1 |

About the collection

2 |

General Information

3 |

4 | This is the default example template for the 'About The Collection' text.
5 | Change the property collection.text.file in solrwaybackweb.properties to point to a local file, using the 6 | full absolute file path. Use HTML markup, but without the html/body start and end tags. 7 | 8 |

9 | 10 |

11 | Our collection has been built using four primary harvesting strategies. 12 |

13 |
14 |

Harvest strategy example

15 |
    16 |
  • Cross-sectional collection which takes a snapshot of all Danish domains up to four times a year
  • 17 |
  • Selective collection from the following types of websites: all Danish news media (ranging from 12 times daily to weekly), political parties, organisations and associations, ministries and agencies, selected profiles from social media, YouTube videos (for example weekly)
  • 18 |
  • Event collection of two or three events annually (for example parliamentary elections or the Corona pandemic)
  • 19 |
  • Special collections for example based on research requests
  • 20 |
21 |
package dk.kb.netarchivesuite.solrwayback.service.dto.smurf;


import java.util.ArrayList;
import java.util.List;

import javax.xml.bind.annotation.XmlRootElement;

/**
 * JAXB root-element DTO carrying two bucket lists (percentages and totals)
 * together with a flag telling whether the result is empty.
 *
 * A freshly constructed instance has two empty lists and {@code emptyResult == true}.
 * The getters return the internal list instances directly, so callers that
 * modify a returned list modify this object.
 *
 * NOTE(review): the lists are declared without type arguments here; they
 * presumably hold the DateCount entries from this package - confirm.
 */
@XmlRootElement
public class SmurfBuckets {


    // Per-bucket percentage values; starts out empty.
    private List countPercent = new ArrayList<>();
    // Per-bucket total values; starts out empty.
    private List countsTotal= new ArrayList<>();
    // True until a caller explicitly marks the result as non-empty.
    private boolean emptyResult=true;

    /** No-argument constructor; all fields keep their initial values. */
    public SmurfBuckets(){
    }


    /** @return the percentage bucket list (the internal instance, not a copy) */
    public List getCountPercent() {
        return countPercent;
    }


    /** @param countPercent list that replaces the current percentage buckets */
    public void setCountPercent(List countPercent) {
        this.countPercent = countPercent;
    }


    /** @return the totals bucket list (the internal instance, not a copy) */
    public List getCountsTotal() {
        return countsTotal;
    }


    /** @param countsTotal list that replaces the current totals buckets */
    public void setCountsTotal(List countsTotal) {
        this.countsTotal = countsTotal;
    }


    /** @return {@code true} if this result is flagged as empty (the initial state) */
    public boolean isEmptyResult() {
        return emptyResult;
    }


    /** @param emptyResult new value of the empty-result flag */
    public void setEmptyResult(boolean emptyResult) {
        this.emptyResult = emptyResult;
    }

}
53 | iad 54 | idir 55 | in 56 | ina 57 | ins 58 | inár 59 | is 60 | le 61 | leis 62 | lena 63 | lenár 64 | m' 65 | mar 66 | mo 67 | mé 68 | na 69 | nach 70 | naoi 71 | naonúr 72 | ná 73 | ní 74 | níor 75 | nó 76 | nócha 77 | ocht 78 | ochtar 79 | os 80 | roimh 81 | sa 82 | seacht 83 | seachtar 84 | seachtó 85 | seasca 86 | seisear 87 | siad 88 | sibh 89 | sinn 90 | sna 91 | sé 92 | sí 93 | tar 94 | thar 95 | thú 96 | triúr 97 | trí 98 | trína 99 | trínár 100 | tríocha 101 | tú 102 | um 103 | ár 104 | é 105 | éis 106 | í 107 | ó 108 | ón 109 | óna 110 | ónár 111 | -------------------------------------------------------------------------------- /src/js/src/components/harvestTimeResources/HarvestPagePreview.vue: -------------------------------------------------------------------------------- 1 | 18 | 44 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/stopwords_eu.txt: -------------------------------------------------------------------------------- 1 | # example set of basque stopwords 2 | al 3 | anitz 4 | arabera 5 | asko 6 | baina 7 | bat 8 | batean 9 | batek 10 | bati 11 | batzuei 12 | batzuek 13 | batzuetan 14 | batzuk 15 | bera 16 | beraiek 17 | berau 18 | berauek 19 | bere 20 | berori 21 | beroriek 22 | beste 23 | bezala 24 | da 25 | dago 26 | dira 27 | ditu 28 | du 29 | dute 30 | edo 31 | egin 32 | ere 33 | eta 34 | eurak 35 | ez 36 | gainera 37 | gu 38 | gutxi 39 | guzti 40 | haiei 41 | haiek 42 | haietan 43 | hainbeste 44 | hala 45 | han 46 | handik 47 | hango 48 | hara 49 | hari 50 | hark 51 | hartan 52 | hau 53 | hauei 54 | hauek 55 | hauetan 56 | hemen 57 | hemendik 58 | hemengo 59 | hi 60 | hona 61 | honek 62 | honela 63 | honetan 64 | honi 65 | hor 66 | hori 67 | horiei 68 | horiek 69 | horietan 70 | horko 71 | horra 72 | horrek 73 | horrela 74 | horretan 75 | horri 76 | hortik 77 | hura 78 | izan 79 | ni 80 | noiz 81 | nola 82 | non 83 | nondik 84 | nongo 85 | nor 86 | nora 87 | ze 88 | 
/**
 * Sorts the given dates in place by time, ascending (oldest first).
 *
 * Note: despite the function name, the comparator orders the oldest date
 * first. The name is kept so existing callers keep working.
 * Note: mutates the input array.
 *
 * @param {Date[]} dateArray Dates to sort.
 * @returns {Date[]} The same array instance, now sorted oldest first.
 */
export function sortDatesDescending(dateArray) {
  return dateArray.sort((dateA, dateB) => dateA.getTime() - dateB.getTime())
}

const WEEKDAY_NAMES = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
const MONTH_NAMES = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

/**
 * Converts a date to human readable output, e.g. "March 7, 2021",
 * "Sunday, March 7, 2021" or "March 7, 2021 - 9:05".
 * All parts are taken from the date's local-time getters.
 *
 * @param {Date} date Date to format. Any value that is not a Date is returned unchanged.
 * @param {Boolean} showWeekday Prefix the result with the weekday name.
 * @param {Boolean} showTime Append " - H:mm" (24-hour clock).
 * @returns {String|*} The formatted string, or the input itself when it is not a Date.
 */
export function toHumanDate(date, showWeekday = false, showTime = false) {
  if (!(date instanceof Date)) {
    return date
  }
  let dateString = `${MONTH_NAMES[date.getMonth()]} ${date.getDate()}, ${date.getFullYear()}`
  if (showTime) {
    // Zero-pad the minutes so that five past nine reads "9:05" rather than "9:5".
    const minutes = String(date.getMinutes()).padStart(2, '0')
    dateString += ` - ${date.getHours()}:${minutes}`
  }
  return showWeekday ? `${WEEKDAY_NAMES[date.getDay()]}, ${dateString}` : dateString
}
super(message, cause); 27 | this.responseStatus = responseStatus; 28 | } 29 | 30 | public SolrWaybackServiceException(Throwable cause, Response.Status responseStatus) { 31 | super(cause); 32 | this.responseStatus = responseStatus; 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_eu.txt: -------------------------------------------------------------------------------- 1 | # example set of basque stopwords 2 | al 3 | anitz 4 | arabera 5 | asko 6 | baina 7 | bat 8 | batean 9 | batek 10 | bati 11 | batzuei 12 | batzuek 13 | batzuetan 14 | batzuk 15 | bera 16 | beraiek 17 | berau 18 | berauek 19 | bere 20 | berori 21 | beroriek 22 | beste 23 | bezala 24 | da 25 | dago 26 | dira 27 | ditu 28 | du 29 | dute 30 | edo 31 | egin 32 | ere 33 | eta 34 | eurak 35 | ez 36 | gainera 37 | gu 38 | gutxi 39 | guzti 40 | haiei 41 | haiek 42 | haietan 43 | hainbeste 44 | hala 45 | han 46 | handik 47 | hango 48 | hara 49 | hari 50 | hark 51 | hartan 52 | hau 53 | hauei 54 | hauek 55 | hauetan 56 | hemen 57 | hemendik 58 | hemengo 59 | hi 60 | hona 61 | honek 62 | honela 63 | honetan 64 | honi 65 | hor 66 | hori 67 | horiei 68 | horiek 69 | horietan 70 | horko 71 | horra 72 | horrek 73 | horrela 74 | horretan 75 | horri 76 | hortik 77 | hura 78 | izan 79 | ni 80 | noiz 81 | nola 82 | non 83 | nondik 84 | nongo 85 | nor 86 | nora 87 | ze 88 | zein 89 | zen 90 | zenbait 91 | zenbat 92 | zer 93 | zergatik 94 | ziren 95 | zituen 96 | zu 97 | zuek 98 | zuen 99 | zuten 100 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/userdict_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer) 3 | # 4 | # Add entries to this file in order to override the statistical model in terms 5 | # 
import Configs from './netarchive/configs'

// Search-page flags that are appended unchanged to every generated URL.
const FIXED_SEARCH_PARAMS = '&grouping=false&imgSearch=false&offset=0&urlSearch=false'

export default {
  /**
   * Opens a new browser tab with a search for the clicked n-gram data point,
   * restricted to HTML documents and to the clicked time bucket.
   *
   * @param {String} queryFromClick Query text (or tag name when searchType is 'tags').
   * @param {String} dateFromClick Start date of the clicked bucket; its first four
   *                               characters are used as the year for the YEAR scale.
   * @param {String} searchType 'tags' wraps the query as elements_used:"..."; any other
   *                            value sends the query as-is.
   * @param {String} scale Bucket size: 'YEAR' filters on crawl_year, 'WEEK' uses a
   *                       +7DAYS range, anything else a '+1' + scale range on crawl_date.
   */
  handleSearch(queryFromClick, dateFromClick, searchType, scale) {
    // Loose equality is kept on purpose so non-string scale values behave as before.
    const isYearScale = scale == 'YEAR'

    let timeFilter
    if (isYearScale) {
      timeFilter = `&fq=crawl_year:${dateFromClick.slice(0, 4)}`
    } else {
      const rangeGap = scale == 'WEEK' ? '+7DAYS' : '+1' + scale
      timeFilter = `&fq=crawl_date:[${dateFromClick}T00:00:00Z TO ${dateFromClick}T00:00:00Z${rangeGap}]`
    }

    const facetFilters = timeFilter + '&fq=content_type_norm:"html"'
    const queryTerm = searchType === 'tags'
      ? 'elements_used:"' + queryFromClick + '"'
      : queryFromClick

    const targetUrl = `${Configs.BASE_SEARCH_URL()}?query=${encodeURIComponent(queryTerm)}${FIXED_SEARCH_PARAMS}&facets=${encodeURIComponent(facetFilters)}`
    window.open(targetUrl, '_blank')
  }
}
26 | 27 | ## Synonym mappings can be used for spelling correction too 28 | #pixima => pixma 29 | 30 | -------------------------------------------------------------------------------- /src/js/src/components/notifications/Notifications.vue: -------------------------------------------------------------------------------- 1 | 10 | 11 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/FacetCount.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto; 2 | 3 | public class FacetCount { 4 | private long count; 5 | private String value; 6 | 7 | 8 | public FacetCount(){ 9 | } 10 | 11 | 12 | public long getCount() { 13 | return count; 14 | } 15 | 16 | 17 | public void setCount(long count) { 18 | this.count = count; 19 | } 20 | 21 | 22 | public String getValue() { 23 | return value; 24 | } 25 | 26 | 27 | public void setValue(String value) { 28 | this.value = value; 29 | } 30 | 31 | 32 | @Override 33 | public int hashCode() { 34 | final int prime = 31; 35 | int result = 1; 36 | result = prime * result + ((value == null) ? 
0 : value.hashCode()); 37 | return result; 38 | } 39 | 40 | 41 | @Override 42 | public boolean equals(Object obj) { 43 | if (this == obj) 44 | return true; 45 | if (obj == null) 46 | return false; 47 | if (getClass() != obj.getClass()) 48 | return false; 49 | FacetCount other = (FacetCount) obj; 50 | if (value == null) { 51 | if (other.value != null) 52 | return false; 53 | } else if (!value.equals(other.value)) 54 | return false; 55 | return true; 56 | } 57 | 58 | 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetURL.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers.json; 2 | 3 | import com.fasterxml.jackson.annotation.JsonProperty; 4 | import org.apache.commons.lang3.tuple.Pair; 5 | 6 | public class TweetURL implements TweetEntity { 7 | private Pair indices; 8 | 9 | private String expandedUrl; 10 | 11 | private String displayUrl; 12 | 13 | 14 | public TweetURL() { 15 | } 16 | 17 | @JsonProperty("indices") 18 | private void unpackIndices(int[] indices) { 19 | this.indices = Pair.of(indices[0], indices[1]); 20 | } 21 | 22 | public Pair getIndices() { 23 | return indices; 24 | } 25 | 26 | @Override 27 | public void setIndices(Pair newIndices) { 28 | this.indices = newIndices; 29 | } 30 | 31 | public String getExpandedUrl() { 32 | return expandedUrl; 33 | } 34 | 35 | public void setExpandedUrl(String expandedUrl) { 36 | this.expandedUrl = expandedUrl; 37 | } 38 | 39 | public String getDisplayUrl() { 40 | return displayUrl; 41 | } 42 | 43 | public void setDisplayUrl(String displayUrl) { 44 | this.displayUrl = displayUrl; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/smurf/NetarchiveYearCountCache.java: 
package dk.kb.netarchivesuite.solrwayback.smurf;

import java.util.HashMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.kb.netarchivesuite.solrwayback.solr.NetarchiveSolrClient;

/**
 * Static, time-based cache for the map returned by
 * {@code NetarchiveSolrClient.getInstance().getYearFacetsHtmlAll()}.
 *
 * The map is fetched on first use and fetched again once the previous
 * successful fetch is more than four hours old. Access goes through the
 * synchronized {@link #getYearFacetsAllQuery()}.
 *
 * NOTE(review): the map is declared without type arguments here; confirm the
 * key/value types against {@code getYearFacetsHtmlAll()}.
 */
public class NetarchiveYearCountCache {

    /** Maximum age of the cached map before it is fetched again: 4 hours, in milliseconds. */
    private static final long RELOAD_INTERVAL_MS = 4 * 60 * 60 * 1000L;
    private static final Logger log = LoggerFactory.getLogger(NetarchiveYearCountCache.class);

    /** Time (epoch millis) of the last successful reload; 0 means nothing has been loaded yet. */
    private static long lastReloadTime = 0;
    private static HashMap yearFacetsAll = null;

    /**
     * Fetches a fresh map and publishes it.
     *
     * The timestamp is updated only after the fetch has succeeded. If the fetch
     * throws, the cache is still considered expired and the next call retries,
     * instead of serving {@code null} or stale data for a full interval.
     *
     * @throws Exception whatever the Solr client throws while fetching
     */
    private static void reload() throws Exception {
        log.info("Reloading netarchive year count cache");
        HashMap yearFacetsAllTemp = NetarchiveSolrClient.getInstance().getYearFacetsHtmlAll();
        yearFacetsAll = yearFacetsAllTemp;
        lastReloadTime = System.currentTimeMillis();
    }

    /**
     * Returns the cached map, reloading it first when it has never been loaded
     * or the last successful load is older than the reload interval.
     *
     * The returned object is the shared cached instance, not a copy.
     *
     * @return the cached year facet map
     * @throws Exception if a required reload fails
     */
    public static synchronized HashMap getYearFacetsAllQuery() throws Exception {
        if ((System.currentTimeMillis() - RELOAD_INTERVAL_MS) > lastReloadTime) {
            reload();
        }
        return yearFacetsAll;
    }

}
package dk.kb.netarchivesuite.solrwayback.parsers.json;

import com.fasterxml.jackson.annotation.JsonProperty;

import java.util.List;

/**
 * Even though 'entities' also includes 'media'-objects this is parsed outside this pojo, as media will always
 * be contained inside 'extended_entities' that is on the same level as 'entities'.
 *
 * Plain holder for three lists: mentions, urls and hashtags. All three are
 * {@code null} until set; getters return the stored list instance directly.
 *
 * NOTE(review): the lists are declared without type arguments here; they
 * presumably hold TweetMention, TweetURL and TweetHashtag objects from this
 * package - confirm.
 */
public class TweetEntities {
    // Bound to the JSON property "user_mentions" rather than the field name.
    @JsonProperty("user_mentions")
    private List mentions;

    private List urls;

    private List hashtags;


    /** No-argument constructor; all lists start out as {@code null}. */
    public TweetEntities() {
    }

    /** @return the mentions list (JSON property "user_mentions"), or {@code null} if unset */
    public List getMentions() {
        return mentions;
    }

    /** @param mentions list that replaces the current mentions */
    public void setMentions(List mentions) {
        this.mentions = mentions;
    }

    /** @return the urls list, or {@code null} if unset */
    public List getUrls() {
        return urls;
    }

    /** @param urls list that replaces the current urls */
    public void setUrls(List urls) {
        this.urls = urls;
    }

    /** @return the hashtags list, or {@code null} if unset */
    public List getHashtags() {
        return hashtags;
    }

    /** @param hashtags list that replaces the current hashtags */
    public void setHashtags(List hashtags) {
        this.hashtags = hashtags;
    }
}
setColor(String color) { 57 | this.color = color; 58 | } 59 | 60 | 61 | 62 | } -------------------------------------------------------------------------------- /src/js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "solrwayback", 3 | "version": "0.0.1", 4 | "private": true, 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "vite build", 9 | "build-preview": "vite build --mode preview", 10 | "serve": "vite preview", 11 | "lint": "eslint . --ext .vue,.js --ignore-pattern .gitignore" 12 | }, 13 | "dependencies": { 14 | "@sideway/formula": "^3.0.1", 15 | "axios": "^1.10.0", 16 | "chart.js": "^2.9.4", 17 | "core-js": "^3.44.0", 18 | "d3": "3.5.17", 19 | "date-fns": "^4.1.0", 20 | "leaflet": "^1.9.4", 21 | "leaflet.markercluster": "^1.5.3", 22 | "pinia": "^3.0.2", 23 | "video.js": "8.23.3", 24 | "vue": "^3.5.13", 25 | "vue-3-slider-component": "^1.0.2", 26 | "vue-chartjs": "^3.5.1", 27 | "vue-router": "^4.5.1" 28 | }, 29 | "devDependencies": { 30 | "@vitejs/plugin-vue": "^5.0.5", 31 | "@vitejs/plugin-vue-jsx": "^4.1.2", 32 | "@vue/compiler-sfc": "^3.5.17", 33 | "@vue/eslint-config-prettier": "^10.2.0", 34 | "cross-env": "^7.0.3", 35 | "eslint": "^9.32.0", 36 | "eslint-plugin-vue": "^10.4.0", 37 | "prettier": "^3.5.3", 38 | "sass": "^1.55.0", 39 | "unplugin-vue-components": "^28.5.0", 40 | "v-tooltip": "^1.1.6", 41 | "vite": "^6.3.5", 42 | "vitest": "^3.1.3" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/listeners/SolrWaybackAsciiLogo.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.listeners; 2 | 3 | public class SolrWaybackAsciiLogo { 4 | 5 | //Some characters are escaped, looks fine when printed. 6 | public final static String SOLRWAYBACK_LOGO= 7 | "\n" 8 | + " _______. 
______ __ .______ ____ __ ____ ___ ____ ____ .______ ___ ______ __ ___ \n" 9 | + " / | / __ \\ | | | _ \\ \\ \\ / \\ / / / \\ \\ \\ / / | _ \\ / \\ / || |/ / \n" 10 | + " | (----`| | | | | | | |_) | \\ \\/ \\/ / / ^ \\ \\ \\/ / | |_) | / ^ \\ | ,----'| ' / \n" 11 | + " \\ \\ | | | | | | | / \\ / / /_\\ \\ \\_ _/ | _ < / /_\\ \\ | | | < \n" 12 | + " .----) | | `--' | | `----.| |\\ \\----. \\ /\\ / / _____ \\ | | | |_) | / _____ \\ | `----.| . \\ \n" 13 | + " |_______/ \\______/ |_______|| _| `._____| \\__/ \\__/ /__/ \\__\\ |__| |______/ /__/ \\__\\ \\______||__|\\__\\" 14 | + "\n"; 15 | 16 | 17 | 18 | public static void main(String[] args) { 19 | System.out.println(SOLRWAYBACK_LOGO); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/stopwords_en.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
package dk.kb.netarchivesuite.solrwayback.parsers.json;

import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.commons.lang3.tuple.Pair;

/**
 * A user mention inside a tweet: an id, a screen name and a pair of indices.
 *
 * NOTE(review): the indices presumably mark the start/end position of the
 * mention in the tweet text - confirm against TweetEntity and its callers.
 */
public class TweetMention implements TweetEntity {
    // Bound to the JSON property "id_str" rather than the field name.
    @JsonProperty("id_str")
    private String id;

    // Built from the first two elements of the JSON "indices" array, see unpackIndices.
    private Pair indices;

    private String screenName;


    /** No-argument constructor; all fields start out as {@code null}. */
    public TweetMention() {
    }

    /**
     * Creates a mention with only the screen name set.
     *
     * @param screenName screen name of the mentioned user
     */
    public TweetMention(String screenName) {
        this.screenName = screenName;
    }

    /**
     * Setter bound to the JSON property "indices": stores the first two array
     * elements as a pair. Further elements are ignored.
     *
     * @param indices array with at least two elements; a shorter array fails
     *                with an ArrayIndexOutOfBoundsException and {@code null}
     *                with a NullPointerException
     */
    @JsonProperty("indices")
    private void unpackIndices(int[] indices) {
        this.indices = Pair.of(indices[0], indices[1]);
    }

    /** @return the id (JSON property "id_str"), or {@code null} if unset */
    public String getId() {
        return id;
    }

    /** @param id the id of the mentioned user */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the stored index pair, or {@code null} if unset */
    public Pair getIndices() {
        return indices;
    }

    /** @param newIndices pair that replaces the current indices */
    @Override
    public void setIndices(Pair newIndices) {
        this.indices = newIndices;
    }

    /** @return the screen name, or {@code null} if unset */
    public String getScreenName() {
        return screenName;
    }

    /** @param screenName screen name of the mentioned user */
    public void setScreenName(String screenName) {
        this.screenName = screenName;
    }
}
/**
 * Higher-order function that visits every month of every year in the dates object.
 *
 * @param {Object} datesObject Object keyed by year; each year holds a 'months' object keyed by month.
 * @param {Function} actionFunction Callback invoked as actionFunction(year, month) for every month key.
 */
export function doForEachMonthInDatesObject(datesObject, actionFunction) {
  Object.keys(datesObject).forEach((year) => {
    Object.keys(datesObject[year]['months']).forEach((month) => {
      actionFunction(year, month)
    })
  })
}

/**
 * Higher-order function that visits every day of every week of every year in the dates object,
 * skipping days whose entry is null.
 *
 * @param {Object} datesObject Object keyed by year; each year holds a 'weeks' object keyed by week,
 *                             and each week is an object keyed by day.
 * @param {Function} actionFunction Callback invoked as actionFunction(year, week, day) for every
 *                                  day whose entry is not null.
 */
export function doForEachWeekAndDayInDatesObject(datesObject, actionFunction) {
  Object.keys(datesObject).forEach((year) => {
    Object.keys(datesObject[year]['weeks']).forEach((week) => {
      Object.keys(datesObject[year]['weeks'][week]).forEach((day) => {
        // The entry is looked up at visit time, so changes made by earlier callbacks are seen.
        if (datesObject[year]['weeks'][week][day] === null) {
          return
        }
        actionFunction(year, week, day)
      })
    })
  })
}
{ 17 | open: 'solrwayback_index_page.html', 18 | proxy: { 19 | '^/solrwayback/services': { 20 | target: 'http://localhost:8080', 21 | changeOrigin: true, 22 | rewrite: (path) => path.replace(/^\/solrwayback\/services/, '/solrwayback/services'), 23 | }, 24 | '/services': { 25 | target: 'http://localhost:8080', 26 | changeOrigin: true, 27 | rewrite: (path) => { 28 | const newPath = path.replace(/^\/?services/, '/solrwayback/services'); 29 | return newPath; 30 | }, 31 | }, 32 | }, 33 | } 34 | : undefined, 35 | preview: { 36 | open: 'solrwayback_index_page.html', 37 | }, 38 | plugins: [vue()], 39 | resolve: { 40 | alias: { 41 | extensions: [".mjs", ".js", ".ts", ".jsx", ".tsx", ".json", ".vue"], 42 | "@": fileURLToPath(new URL("./src", import.meta.url)), 43 | }, 44 | }, 45 | })); 46 | -------------------------------------------------------------------------------- /src/test/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriterFromWarcTest.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers; 2 | 3 | import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry; 4 | 5 | public class HtmlParserUrlRewriterFromWarcTest { 6 | 7 | 8 | /* 9 | * Integration test class to parse HTML from a warc-file. 10 | * Warc-files can not be in reposity so change path to a local warc-file 11 | * 12 | */ 13 | public static void main(String []args) { 14 | try { 15 | String warcFile="/media/teg/1TB_SSD/solrwayback_package_3.2/indexing/warcs/denoffentlige-00000.warc"; 16 | long offset=2691693; 17 | 18 | ArcEntry arc=ArcParserFileResolver.getArcEntry(warcFile, offset); 19 | String html = arc.getStringContentAsStringSafe(); 20 | 21 | 22 | ParseResult rewritten = HtmlParserUrlRewriter.replaceLinks( 23 | html, "http://example.com/somefolder/", "2020-04-30T13:07:00", 24 | RewriteTestHelper.createOXResolver(true)); 25 | 26 | 27 | //See the replaced HTML. 
See all urls are replaced with 'notfound' 28 | System.out.println(rewritten.getReplaced()); 29 | 30 | } 31 | catch(Exception e) { 32 | e.printStackTrace(); 33 | } 34 | 35 | 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/PageResource.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto; 2 | 3 | import java.util.Date; 4 | 5 | import javax.xml.bind.annotation.XmlRootElement; 6 | 7 | @XmlRootElement 8 | public class PageResource { 9 | 10 | private String url; 11 | private String downloadUrl; 12 | private String contentType; 13 | private Date crawlTime; 14 | private String timeDifference; 15 | 16 | public PageResource(){ 17 | 18 | } 19 | 20 | public String getUrl() { 21 | return url; 22 | } 23 | 24 | public void setUrl(String url) { 25 | this.url = url; 26 | } 27 | 28 | public String getDownloadUrl() { 29 | return downloadUrl; 30 | } 31 | 32 | public void setDownloadUrl(String downloadUrl) { 33 | this.downloadUrl = downloadUrl; 34 | } 35 | 36 | public String getContentType() { 37 | return contentType; 38 | } 39 | 40 | public void setContentType(String contentType) { 41 | this.contentType = contentType; 42 | } 43 | 44 | public Date getCrawlTime() { 45 | return crawlTime; 46 | } 47 | 48 | public void setCrawlTime(Date crawlTime) { 49 | this.crawlTime = crawlTime; 50 | } 51 | 52 | public String getTimeDifference() { 53 | return timeDifference; 54 | } 55 | 56 | public void setTimeDifference(String timeDifference) { 57 | this.timeDifference = timeDifference; 58 | } 59 | 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/statistics/DomainYearStatistics.java: -------------------------------------------------------------------------------- 1 | package 
dk.kb.netarchivesuite.solrwayback.service.dto.statistics; 2 | 3 | import javax.xml.bind.annotation.XmlRootElement; 4 | 5 | @XmlRootElement 6 | public class DomainYearStatistics { 7 | 8 | private int year; 9 | private int ingoingLinks; 10 | private int sizeInKb; 11 | private int uniquePages; 12 | private String domain; 13 | 14 | public DomainYearStatistics(){ 15 | } 16 | 17 | 18 | public int getYear() { 19 | return year; 20 | } 21 | 22 | 23 | public void setYear(int year) { 24 | this.year = year; 25 | } 26 | 27 | 28 | public int getIngoingLinks() { 29 | return ingoingLinks; 30 | } 31 | 32 | 33 | public void setIngoingLinks(int ingoingLinks) { 34 | this.ingoingLinks = ingoingLinks; 35 | } 36 | 37 | 38 | public int getSizeInKb() { 39 | return sizeInKb; 40 | } 41 | 42 | 43 | public void setSizeInKb(int sizeInKb) { 44 | this.sizeInKb = sizeInKb; 45 | } 46 | 47 | 48 | public int getTotalPages() { 49 | return uniquePages; 50 | } 51 | 52 | 53 | public void setTotalPages(int totalPages) { 54 | this.uniquePages = totalPages; 55 | } 56 | 57 | 58 | public String getDomain() { 59 | return domain; 60 | } 61 | 62 | 63 | public void setDomain(String domain) { 64 | this.domain = domain; 65 | } 66 | 67 | 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/test/resources/example_rewrite/simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Some page 4 | 5 | 6 | 7 | 8 |

A very simple page

9 | 10 |

An absolute link leading to example.com

11 |

A relative link leading to subfoldersubfolder

12 |

A root-based link leading to a parallel page otherpage.html.

13 | 14 |

An absolute referenced image:

15 |

A relatively referenced image:

16 |

A root referenced image (same as the absolute one):

17 | 18 |

An image with delayed loading as per JQuery convention:

19 | 20 | 21 | 22 | 27 | 28 | 29 | 30 |
Single cell
public interface ArcFileLocationResolverInterface {

  /**
   * Resolves the given (W)ARC source file path to an {@link ArcSource}.
   *
   * NOTE(review): earlier documentation described the return value as a Supplier delivering an
   * InputStream positioned at the beginning of the source file, callable multiple times, with the
   * caller being responsible for closing each InputStream after use. Presumably ArcSource
   * fulfils that contract - confirm against ArcSource.
   *
   * This level of indirection allows for handling of moved files, WARCs delivered over HTTP or similar.
   * A simple situation is just a string manipulation of the path; a more complicated situation can be
   * using a lookup service given the filename.
   *
   * Implementing classes must have a default constructor.
   * Parameters are given through the {@link #setParameters} method.
   *
   * @param source_file_path the complete file path at the time the arc file was indexed.
   *                         Example: /mountA/0211/filedir/12345.warc.gz
   * @return the ArcSource for the (w)arc file.
   */
  ArcSource resolveArcFileLocation(String source_file_path);

  /**
   * Hands the configuration parameters to the implementation.
   *
   * @param parameters the parameters for the resolver.
   */
  void setParameters(Map parameters);

  /**
   * Initializes the resolver.
   * NOTE(review): presumably called after {@link #setParameters} - confirm against the caller.
   */
  void initialize();

}
testBadCharacters(){ 21 | String test = "test%file&name.with.extra.punctuation&what.ext"; 22 | String result = normalizeFilename(test); 23 | assertEquals("testfilenamewithextrapunctuationwhat.ext", result); 24 | } 25 | 26 | 27 | @Test 28 | public void testLongName(){ 29 | String test = "1234567890_thisfilenameiswaytolongforaproperfilename_howdoesthemethodhandleme_iwonderificandestroysomesystemsbybeingsoboringlyuglylong" + 30 | "omgthisisonlyhalfthelengthofwhatineedtobetodestroysomesystemshowonearthdoibecomeaslongasneededmaybeitwillhelpifijustrambleonandonandonforsomewords.ext"; 31 | String result = normalizeFilename(test); 32 | assertEquals(255, result.length()); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_th.txt: -------------------------------------------------------------------------------- 1 | # Thai stopwords from: 2 | # "Opinion Detection in Thai Political News Columns 3 | # Based on Subjectivity Analysis" 4 | # Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak 5 | ไว้ 6 | ไม่ 7 | ไป 8 | ได้ 9 | ให้ 10 | ใน 11 | โดย 12 | แห่ง 13 | แล้ว 14 | และ 15 | แรก 16 | แบบ 17 | แต่ 18 | เอง 19 | เห็น 20 | เลย 21 | เริ่ม 22 | เรา 23 | เมื่อ 24 | เพื่อ 25 | เพราะ 26 | เป็นการ 27 | เป็น 28 | เปิดเผย 29 | เปิด 30 | เนื่องจาก 31 | เดียวกัน 32 | เดียว 33 | เช่น 34 | เฉพาะ 35 | เคย 36 | เข้า 37 | เขา 38 | อีก 39 | อาจ 40 | อะไร 41 | ออก 42 | อย่าง 43 | อยู่ 44 | อยาก 45 | หาก 46 | หลาย 47 | หลังจาก 48 | หลัง 49 | หรือ 50 | หนึ่ง 51 | ส่วน 52 | ส่ง 53 | สุด 54 | สําหรับ 55 | ว่า 56 | วัน 57 | ลง 58 | ร่วม 59 | ราย 60 | รับ 61 | ระหว่าง 62 | รวม 63 | ยัง 64 | มี 65 | มาก 66 | มา 67 | พร้อม 68 | พบ 69 | ผ่าน 70 | ผล 71 | บาง 72 | น่า 73 | นี้ 74 | นํา 75 | นั้น 76 | นัก 77 | นอกจาก 78 | ทุก 79 | ที่สุด 80 | ที่ 81 | ทําให้ 82 | ทํา 83 | ทาง 84 | ทั้งนี้ 85 | ทั้ง 86 | ถ้า 87 | ถูก 88 | ถึง 89 | ต้อง 90 | ต่างๆ 91 | ต่าง 92 | ต่อ 93 | ตาม 94 | ตั้งแต่ 
95 | ตั้ง 96 | ด้าน 97 | ด้วย 98 | ดัง 99 | ซึ่ง 100 | ช่วง 101 | จึง 102 | จาก 103 | จัด 104 | จะ 105 | คือ 106 | ความ 107 | ครั้ง 108 | คง 109 | ขึ้น 110 | ของ 111 | ขอ 112 | ขณะ 113 | ก่อน 114 | ก็ 115 | การ 116 | กับ 117 | กัน 118 | กว่า 119 | กล่าว 120 | -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | dk.kb.netarchivesuite.solrwayback.service 8 | 9 | 10 | dk.kb.netarchivesuite.solrwayback.listeners.InitializationContextListener 11 | 12 | 13 | 14 | SolrWayback Service 15 | org.apache.cxf.jaxrs.servlet.CXFNonSpringJaxrsServlet 16 | 17 | javax.ws.rs.Application 18 | dk.kb.netarchivesuite.solrwayback.service.SolrWaybackApplication 19 | 20 | 1 21 | 22 | 23 | 24 | SolrWayback Service 25 | /services/* 26 | 27 | 28 | 29 | 30 | index.html 31 | 32 | 33 | 34 | 404 35 | /leakingForward.jsp 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /src/test/resources/example_warc/Evil-Warc-Headers.warc: -------------------------------------------------------------------------------- 1 | WARC/0.17 2 | WARC-Type: response 3 | WARC-Target-URI: http://www.archive.org/robots.txt 4 | WARC-Date: 2008-04-30T20:48:25Z 5 | WARC-Payload-Digest: sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U 6 | WARC-IP-Address: 207.241.229.39 7 | WARC-Record-ID: 8 | Content-Type: application/http; msgtype=response 9 | Content-Length: 6000 10 | Server: BigIP 11 | Content-Type: text/html; charset=utf-8 12 | Accept-Ranges: bytes 13 | Connection: close 14 | 15 | HTTP/1.1 200 OK 16 | Date: Wed, 30 Apr 2008 20:48:24 GMT 17 | Server: Apache/2.0.54 (Ubuntu) PHP/5.0.5-2ubuntu1.4 mod_ssl/2.0.54 OpenSSL/0.9.7g 18 | Last-Modified: Sat, 02 Feb 2008 19:40:44 GMT 19 | ETag: "47c3-1d3-11134700" 20 | Accept-Ranges: bytes 21 | Date: Tue, 01 Mar 2022 14:48:02 GMT 22 | Age: 866 23 | Content-Length: 5155 24 | Connection: 
close 25 | Content-Type: text/plain; charset=UTF-8 26 | 27 | ############################################## 28 | # 29 | # Welcome to the Archive! 30 | # 31 | ############################################## 32 | # Please crawl our files. 33 | # We appreciate if you can crawl responsibly. 34 | # Stay open! 35 | ############################################## 36 | User-agent: * 37 | Disallow: /nothing---please-crawl-us-- 38 | 39 | # slow down the ask jeeves crawler which was hitting our SE a little too fast 40 | # via collection pages. --Feb2008 tracey-- 41 | User-agent: Teoma 42 | Crawl-Delay: 10 -------------------------------------------------------------------------------- /src/js/src/components/searchSingleItemComponents/searchSingleItemTypes/SearchSingleItemImage.vue: -------------------------------------------------------------------------------- 1 | 14 | 15 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/IndexDocShort.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto; 2 | 3 | 4 | import javax.xml.bind.annotation.XmlRootElement; 5 | 6 | 7 | @XmlRootElement 8 | public class IndexDocShort { 9 | 10 | private long offset; 11 | private String source_file_path; 12 | private String crawlDate; // format 2009-12-09T05:32:50Z 13 | private String url; 14 | private String url_norm; 15 | 16 | public IndexDocShort(){ 17 | } 18 | 19 | public long getOffset() { 20 | return offset; 21 | } 22 | 23 | public void setOffset(long offset) { 24 | this.offset = offset; 25 | } 26 | 27 | public String getSource_file_path() { 28 | return source_file_path; 29 | } 30 | 31 | public void setSource_file_path(String source_file_path) { 32 | this.source_file_path = source_file_path; 33 | } 34 | 35 | public String getUrl() { 36 | return url; 37 | } 38 | 39 | public void setUrl(String url) { 40 | 
this.url = url; 41 | } 42 | 43 | public String getUrl_norm() { 44 | return url_norm; 45 | } 46 | 47 | public void setUrl_norm(String url_norm) { 48 | this.url_norm = url_norm; 49 | } 50 | 51 | public String getCrawlDate() { 52 | return crawlDate; 53 | } 54 | 55 | public void setCrawlDate(String crawlDate) { 56 | this.crawlDate = crawlDate; 57 | } 58 | 59 | 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/playback/JavascriptPlayback.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.playback; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import dk.kb.netarchivesuite.solrwayback.parsers.HtmlParserUrlRewriter; 7 | import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry; 8 | import dk.kb.netarchivesuite.solrwayback.service.dto.IndexDoc; 9 | 10 | public class JavascriptPlayback extends PlaybackHandler{ 11 | 12 | private static final Logger log = LoggerFactory.getLogger(CssPlayback.class); 13 | 14 | public JavascriptPlayback(ArcEntry arc, IndexDoc doc, boolean showToolbar){ 15 | super(arc,doc,showToolbar); 16 | } 17 | 18 | // TODO: Enable propagation of lenient through HtmlParserUrlRewriter.replaceLinksCss 19 | @Override 20 | public ArcEntry playback(boolean lenient) throws Exception{ 21 | //Never show the toolbar. 22 | // TODO: What was the purpose of this round trip? 
If re-enabled, please state why in a comment 23 | //arc.setBinary(IOUtils.toByteArray(arc.getStringContentAsStringSafe())); //TODO charset; 24 | //log.debug("javascript playback"); 25 | 26 | 27 | String textReplaced = HtmlParserUrlRewriter.replaceLinksCss(arc); 28 | // TODO: This logic was wrong. Content Encoding states compression and is independent of Content Charset 29 | // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding 30 | arc.setStringContent(textReplaced); 31 | return arc; 32 | } 33 | 34 | } -------------------------------------------------------------------------------- /src/js/src/services/dataTransformationHelper.js: -------------------------------------------------------------------------------- 1 | export default { 2 | transformSearchResponse(data) { 3 | data.response.searchType = 'post' 4 | for(let i = 0; i < data.response.docs.length; i++) { 5 | data.response.docs[i].highlight = data.highlighting[data.response.docs[i].id] 6 | } 7 | return data 8 | }, 9 | transformGroupedSearchResponse(data) { 10 | data.response = {} 11 | data.response.docs = [] 12 | data.response.numFound = data.grouped.url.doclist.numFound 13 | data.response.maxScore = data.grouped.url.doclist.maxScore 14 | data.response.start = data.grouped.url.doclist.start 15 | data.response.cardinality = data.stats.stats_fields.url.cardinality 16 | data.response.searchType = 'post' 17 | for(let i = 0; i < data.grouped.url.doclist.docs.length; i++) { 18 | data.response.docs[i] = data.grouped.url.doclist.docs[i] 19 | data.response.docs[i].highlight = data.highlighting[data.grouped.url.doclist.docs[i].id] 20 | } 21 | return data 22 | }, 23 | transformImageResponse(data, type) { 24 | let obj = {} 25 | obj.response = { 26 | searchType:type, 27 | images:data, 28 | } 29 | // THIS IS FOR TEST PURPOSES ONLY - IF THE IMAGES DONT HAVE LAT/LNG DATA. 
30 | /*if(type === 'geoImage') { 31 | for(let y = 0; y < obj.response.images.length; y++) { 32 | obj.response.images[y].latitude = 56 + Math.random() 33 | obj.response.images[y].longitude = 10 + Math.random() 34 | } 35 | } */ 36 | return obj 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/util/NamedConsumer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed under the Apache License, Version 2.0 (the "License"); 3 | * you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | * 14 | */ 15 | package dk.kb.netarchivesuite.solrwayback.util; 16 | 17 | import java.util.function.Consumer; 18 | 19 | /** 20 | * Wrapper for {@link Consumer} that takes a name and uses it in {@link #toString()}. 21 | * Intended for log messages and debugging. 
/**
 * Wrapper for {@link Consumer} that takes a name and uses it in {@link #toString()}.
 * Intended for log messages and debugging.
 *
 * @param <C> the type of element accepted by the consumer.
 */
public class NamedConsumer<C> implements Consumer<C> {
    private final String name;
    private final Consumer<C> inner;

    /**
     * @param inner the consumer that all {@link #accept} calls are delegated to.
     * @param name  the name reported by {@link #toString()}.
     */
    public NamedConsumer(Consumer<C> inner, String name) {
        this.name = name;
        this.inner = inner;
    }

    /** Delegates to the wrapped consumer. */
    @Override
    public void accept(C c) {
        inner.accept(c);
    }

    /** Uses the default {@link Consumer#andThen} composition; the result is not named. */
    @Override
    public Consumer<C> andThen(Consumer<? super C> consumer) {
        return Consumer.super.andThen(consumer);
    }

    /** @return {@code NamedConsumer(<name>)}. */
    @Override
    public String toString() {
        return "NamedConsumer(" + name + ")";
    }
}

srcset testing

7 | 8 |

An image with absolute srcset:

9 |

An image with mixed casing and spacing:

10 |

An image with absolute srcset and post-space:

11 |

An image with absolute srcset and substring trickery:

12 |

An image with relative srcset:

13 |

An image with relative srcset and substring trickery:

14 |

An image with root srcset:

15 |

Reproduction of specific problem:

16 | 17 | 18 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/statistics/DomainStatistics.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.service.dto.statistics; 2 | 3 | import javax.xml.bind.annotation.XmlRootElement; 4 | 5 | @XmlRootElement 6 | public class DomainStatistics { 7 | 8 | private String date; 9 | private int ingoingLinks; 10 | private int sizeInKb; 11 | private int uniquePages; 12 | private String domain; 13 | private int contentTextLength; 14 | 15 | public DomainStatistics(){ 16 | } 17 | 18 | public String getDate() { 19 | return date; 20 | } 21 | 22 | public void setDate(String date) { 23 | this.date = date; 24 | } 25 | 26 | public int getIngoingLinks() { 27 | return ingoingLinks; 28 | } 29 | 30 | public void setIngoingLinks(int ingoingLinks) { 31 | this.ingoingLinks = ingoingLinks; 32 | } 33 | 34 | public int getSizeInKb() { 35 | return sizeInKb; 36 | } 37 | 38 | public void setSizeInKb(int sizeInKb) { 39 | this.sizeInKb = sizeInKb; 40 | } 41 | 42 | public int getTotalPages() { 43 | return uniquePages; 44 | } 45 | 46 | public void setTotalPages(int totalPages) { 47 | this.uniquePages = totalPages; 48 | } 49 | 50 | public String getDomain() { 51 | return domain; 52 | } 53 | 54 | public void setDomain(String domain) { 55 | this.domain = domain; 56 | } 57 | 58 | public int getContentTextLength() { 59 | return contentTextLength; 60 | } 61 | 62 | public void setContentTextLength(int contentTextLength) { 63 | this.contentTextLength = contentTextLength; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/js/src/components/searchSingleItemComponents/searchSingleItemTypes/SearchSingleItemTweet.vue: -------------------------------------------------------------------------------- 1 | 14 | 15 | 52 | 53 | 54 | 
/**
 * Calculate activity level linearly between 0 and 4.
 * 0 is no activity level at all, 4 is the max level.
 *
 * The level is the quarter of maximumHarvests that harvestsInMonth falls into
 * (upper bounds inclusive). A count above maximumHarvests is clamped to level 4
 * instead of being reported as "no activity".
 *
 * @param {number} harvestsInMonth The number of harvests to rate.
 * @param {number} maximumHarvests The number of harvests that corresponds to the max level.
 * @returns {number} An integer from 0 to 4.
 */
export function calculateLinearActivityLevel(harvestsInMonth, maximumHarvests) {
  // Written as negated ">" so that NaN/undefined input also ends up at level 0.
  if (!(harvestsInMonth > 0) || !(maximumHarvests > 0)) {
    return 0
  }

  if (harvestsInMonth > maximumHarvests * 0.75) {
    return 4
  }
  if (harvestsInMonth > maximumHarvests * 0.50) {
    return 3
  }
  if (harvestsInMonth > maximumHarvests * 0.25) {
    return 2
  }

  return 1
}
/**
 * Calculate activity level logarithmically between 0 and 4.
 * 0 is no activity level at all, 4 is the max level.
 *
 * The level is derived from log(harvestsInMonth) / log(maximumHarvests), split into quarters
 * (upper bounds inclusive). Edge cases handled explicitly:
 *  - a count at or above maximumHarvests is level 4 (this also covers maximumHarvests === 1,
 *    where the logarithm base would be 1 and the quotient NaN),
 *  - a single harvest (quotient 0) is level 1, since there is activity.
 *
 * @param {number} harvestsInMonth The number of harvests to rate.
 * @param {number} maximumHarvests The number of harvests that corresponds to the max level.
 * @returns {number} An integer from 0 to 4.
 */
export function calculateLogarithmicActivityLevel(harvestsInMonth, maximumHarvests) {
  // Written as negated ">" so that NaN/undefined input also ends up at level 0.
  if (!(harvestsInMonth > 0) || !(maximumHarvests > 0)) {
    return 0
  }
  if (harvestsInMonth >= maximumHarvests) {
    return 4
  }

  const logarithmicResult = getBaseLog(maximumHarvests, harvestsInMonth)

  if (logarithmicResult > 0.75) {
    return 4
  } else if (logarithmicResult > 0.50) {
    return 3
  } else if (logarithmicResult > 0.25) {
    return 2
  } else if (logarithmicResult >= 0) {
    return 1
  }

  // Negative or NaN quotient (e.g. fractional input): treated as no activity.
  return 0
}


/**
 * The following function returns the logarithm of y with base x, ie. logx(y):
 */
function getBaseLog(x, y) {
  return Math.log(y) / Math.log(x)
}
/**
 * Map that keeps track of the number of successful and unsuccessful {@link #get(Object)} calls.
 * A call counts as unsuccessful when it returns null, which includes keys explicitly mapped to null.
 * Only {@link #get(Object)} is counted; the counters are plain ints and not synchronized.
 *
 * @param <K> the type of keys.
 * @param <V> the type of values.
 */
public class CountingMap<K, V> extends HashMap<K, V> {

    private int found = 0;
    private int fail = 0;

    /**
     * Looks up the value as {@link HashMap#get(Object)} does and updates the counters.
     *
     * @return the mapped value or null.
     */
    @Override
    public V get(Object o) {
        V value = super.get(o);
        if (value == null) {
            fail++;
        } else {
            found++;
        }
        return value;
    }

    /** @return the number of {@link #get(Object)} calls that returned a non-null value. */
    public int getFoundCount() {
        return found;
    }

    /** @return the number of {@link #get(Object)} calls that returned null. */
    public int getFailCount() {
        return fail;
    }

}
14 | * @param offset offset in the warc file 15 | */ 16 | public static ArcEntry getArcEntry(ArcSource arcSource, long offset) throws Exception{ 17 | 18 | if (arcSource == null ){ 19 | throw new IllegalArgumentException("No arcSupplier provided"); 20 | } 21 | 22 | ArcEntry arcEntry = null; 23 | String sourceLowercase = arcSource.getSource().toLowerCase(Locale.ROOT); 24 | 25 | 26 | if (sourceLowercase.endsWith(".warc") || sourceLowercase.endsWith(".warc.gz") ) { 27 | arcEntry = WarcParser.getWarcEntry(arcSource, offset); 28 | } 29 | 30 | else if (sourceLowercase.endsWith(".arc") || sourceLowercase.endsWith("arc.gz")){ 31 | arcEntry = ArcParser.getArcEntry(arcSource, offset); 32 | } 33 | else{ 34 | throw new IllegalArgumentException( 35 | "Expected (W)ARC source not arc or warc: '"+ arcSource.getSource() + "'"); 36 | } 37 | 38 | return arcEntry; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/stopwords_ar.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization) 5 | # This means that when modifying this list, you might need to add some 6 | # redundant entries, for example containing forms with both أ and ا 7 | من 8 | ومن 9 | منها 10 | منه 11 | في 12 | وفي 13 | فيها 14 | فيه 15 | و 16 | ف 17 | ثم 18 | او 19 | أو 20 | ب 21 | بها 22 | به 23 | ا 24 | أ 25 | اى 26 | اي 27 | أي 28 | أى 29 | لا 30 | ولا 31 | الا 32 | ألا 33 | إلا 34 | لكن 35 | ما 36 | وما 37 | كما 38 | فما 39 | عن 40 | مع 41 | اذا 42 | إذا 43 | ان 44 | أن 45 | إن 46 | انها 47 | أنها 48 | إنها 49 | انه 50 | أنه 51 | إنه 52 | بان 53 | بأن 54 | فان 55 | فأن 56 | وان 57 | وأن 58 | وإن 59 | التى 60 | التي 61 | الذى 62 | الذي 63 | الذين 64 | الى 65 | الي 66 | إلى 67 | إلي 68 | على 69 | عليها 70 | عليه 71 | اما 72 | أما 73 | إما 74 | ايضا 75 | أيضا 76 | كل 77 | وكل 78 | لم 79 | ولم 80 | لن 81 | ولن 82 | هى 83 | هي 84 | هو 85 | وهى 86 | وهي 87 | وهو 88 | فهى 89 | فهي 90 | فهو 91 | انت 92 | أنت 93 | لك 94 | لها 95 | له 96 | هذه 97 | هذا 98 | تلك 99 | ذلك 100 | هناك 101 | كانت 102 | كان 103 | يكون 104 | تكون 105 | وكانت 106 | وكان 107 | غير 108 | بعض 109 | قد 110 | نحو 111 | بين 112 | بينما 113 | منذ 114 | ضمن 115 | حيث 116 | الان 117 | الآن 118 | خلال 119 | بعد 120 | قبل 121 | حتى 122 | عند 123 | عندما 124 | لدى 125 | جميع 126 | -------------------------------------------------------------------------------- /src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_ar.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization) 5 | # This means that when modifying this list, you might need to add some 6 | # redundant entries, for example containing forms with both أ and ا 7 | من 8 | ومن 9 | منها 10 | منه 11 | في 12 | وفي 13 | فيها 14 | فيه 15 | و 16 | ف 17 | ثم 18 | او 19 | أو 20 | ب 21 | بها 22 | به 23 | ا 24 | أ 25 | اى 26 | اي 27 | أي 28 | أى 29 | لا 30 | ولا 31 | الا 32 | ألا 33 | إلا 34 | لكن 35 | ما 36 | وما 37 | كما 38 | فما 39 | عن 40 | مع 41 | اذا 42 | إذا 43 | ان 44 | أن 45 | إن 46 | انها 47 | أنها 48 | إنها 49 | انه 50 | أنه 51 | إنه 52 | بان 53 | بأن 54 | فان 55 | فأن 56 | وان 57 | وأن 58 | وإن 59 | التى 60 | التي 61 | الذى 62 | الذي 63 | الذين 64 | الى 65 | الي 66 | إلى 67 | إلي 68 | على 69 | عليها 70 | عليه 71 | اما 72 | أما 73 | إما 74 | ايضا 75 | أيضا 76 | كل 77 | وكل 78 | لم 79 | ولم 80 | لن 81 | ولن 82 | هى 83 | هي 84 | هو 85 | وهى 86 | وهي 87 | وهو 88 | فهى 89 | فهي 90 | فهو 91 | انت 92 | أنت 93 | لك 94 | لها 95 | له 96 | هذه 97 | هذا 98 | تلك 99 | ذلك 100 | هناك 101 | كانت 102 | كان 103 | يكون 104 | تكون 105 | وكانت 106 | وكان 107 | غير 108 | بعض 109 | قد 110 | نحو 111 | بين 112 | بينما 113 | منذ 114 | ضمن 115 | حيث 116 | الان 117 | الآن 118 | خلال 119 | بعد 120 | قبل 121 | حتى 122 | عند 123 | عندما 124 | لدى 125 | جميع 126 | -------------------------------------------------------------------------------- /src/test/java/dk/kb/netarchivesuite/solrwayback/export/TestGenerateCSV.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.export; 2 | 3 | import java.io.PrintWriter; 4 | 5 | import dk.kb.netarchivesuite.solrwayback.UnitTestUtils; 6 | import dk.kb.netarchivesuite.solrwayback.properties.PropertiesLoader; 7 | import dk.kb.netarchivesuite.solrwayback.solr.SolrStreamingExportClient; 8 | import org.apache.solr.client.solrj.SolrClient; 9 | import 
org.apache.solr.client.solrj.impl.HttpJdkSolrClient; 10 | 11 | public class TestGenerateCSV { 12 | 13 | private static final String SOLR = "http://localhost:8983/solr/netarchivebuilder"; 14 | 15 | public static void main(String[] args) throws Exception{ 16 | 17 | PropertiesLoader.initProperties(UnitTestUtils.getFile("properties/solrwayback_unittest.properties").getPath()); 18 | 19 | String query = "thomas egense"; 20 | String filter = null; 21 | 22 | String fields = "id, domain, hash , links_images "; 23 | SolrClient solrClient = new HttpJdkSolrClient.Builder(SOLR).build(); 24 | SolrStreamingExportClient solr = SolrStreamingExportClient.createCvsExporter(solrClient, query,fields, filter); 25 | 26 | StreamingSolrExportBufferedInputStream streamExport = new StreamingSolrExportBufferedInputStream(solr,100); 27 | 28 | PrintWriter writer = new PrintWriter("export.txt", "UTF-8"); 29 | 30 | int read = streamExport.read(); 31 | while (read != -1){ 32 | // System.out.print(Character.toString((char) read)); 33 | writer.write(Character.toString((char) read)); 34 | read=streamExport.read(); 35 | } 36 | writer.close(); 37 | streamExport.close(); 38 | 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/java/dk/kb/netarchivesuite/solrwayback/export/TestGenerateLinkGraphCSV.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.export; 2 | 3 | import java.io.PrintWriter; 4 | 5 | import dk.kb.netarchivesuite.solrwayback.UnitTestUtils; 6 | import dk.kb.netarchivesuite.solrwayback.properties.PropertiesLoader; 7 | import dk.kb.netarchivesuite.solrwayback.solr.SolrStreamingLinkGraphCSVExportClient; 8 | import org.apache.solr.client.solrj.SolrClient; 9 | import org.apache.solr.client.solrj.impl.HttpJdkSolrClient; 10 | 11 | public class TestGenerateLinkGraphCSV { 12 | 13 | private static final String SOLR_SERVER = 
"http://belinda:8983/solr/netarchivebuilder"; 14 | 15 | public static void main(String[] args) throws Exception{ 16 | 17 | PropertiesLoader.initProperties(UnitTestUtils.getFile("properties/solrwayback_unittest.properties").getPath()); 18 | 19 | String query = "katte"; 20 | 21 | SolrClient solrClient = new HttpJdkSolrClient.Builder(PropertiesLoader.SOLR_SERVER).build(); 22 | SolrStreamingLinkGraphCSVExportClient solr = SolrStreamingLinkGraphCSVExportClient.createExporter(solrClient, query); 23 | 24 | //MAX 100.000 results 25 | StreamingSolrExportBufferedInputStream streamExport = new StreamingSolrExportBufferedInputStream(solr,100000); 26 | 27 | PrintWriter writer = new PrintWriter("target/linkgraph.csv", "UTF-8"); 28 | 29 | int read = streamExport.read(); 30 | while (read != -1){ 31 | // System.out.print(Character.toString((char) read)); 32 | writer.write(Character.toString((char) read)); 33 | read=streamExport.read(); 34 | } 35 | writer.close(); 36 | 37 | streamExport.close(); 38 | 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/stopwords_gl.txt: -------------------------------------------------------------------------------- 1 | # galican stopwords 2 | a 3 | aínda 4 | alí 5 | aquel 6 | aquela 7 | aquelas 8 | aqueles 9 | aquilo 10 | aquí 11 | ao 12 | aos 13 | as 14 | así 15 | á 16 | ben 17 | cando 18 | che 19 | co 20 | coa 21 | comigo 22 | con 23 | connosco 24 | contigo 25 | convosco 26 | coas 27 | cos 28 | cun 29 | cuns 30 | cunha 31 | cunhas 32 | da 33 | dalgunha 34 | dalgunhas 35 | dalgún 36 | dalgúns 37 | das 38 | de 39 | del 40 | dela 41 | delas 42 | deles 43 | desde 44 | deste 45 | do 46 | dos 47 | dun 48 | duns 49 | dunha 50 | dunhas 51 | e 52 | el 53 | ela 54 | elas 55 | eles 56 | en 57 | era 58 | eran 59 | esa 60 | esas 61 | ese 62 | eses 63 | esta 64 | estar 65 | estaba 66 | está 67 | están 68 | este 69 | estes 70 | estiven 71 | estou 72 | eu 73 | é 74 | facer 75 | 
foi 76 | foron 77 | fun 78 | había 79 | hai 80 | iso 81 | isto 82 | la 83 | las 84 | lle 85 | lles 86 | lo 87 | los 88 | mais 89 | me 90 | meu 91 | meus 92 | min 93 | miña 94 | miñas 95 | moi 96 | na 97 | nas 98 | neste 99 | nin 100 | no 101 | non 102 | nos 103 | nosa 104 | nosas 105 | noso 106 | nosos 107 | nós 108 | nun 109 | nunha 110 | nuns 111 | nunhas 112 | o 113 | os 114 | ou 115 | ó 116 | ós 117 | para 118 | pero 119 | pode 120 | pois 121 | pola 122 | polas 123 | polo 124 | polos 125 | por 126 | que 127 | se 128 | senón 129 | ser 130 | seu 131 | seus 132 | sexa 133 | sido 134 | sobre 135 | súa 136 | súas 137 | tamén 138 | tan 139 | te 140 | ten 141 | teñen 142 | teño 143 | ter 144 | teu 145 | teus 146 | ti 147 | tido 148 | tiña 149 | tiven 150 | túa 151 | túas 152 | un 153 | unha 154 | unhas 155 | uns 156 | vos 157 | vosa 158 | vosas 159 | voso 160 | vosos 161 | vós 162 | -------------------------------------------------------------------------------- /src/test/java/dk/kb/netarchivesuite/solrwayback/solr/IndexWatcherTest.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.solr; 2 | 3 | import org.apache.solr.client.solrj.SolrClient; 4 | import org.apache.solr.client.solrj.impl.HttpJdkSolrClient; 5 | 6 | /* 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * 19 | */ 20 | public class IndexWatcherTest { 21 | 22 | public static final String SOLR_SERVER = "http://localhost:8983/solr/netarchivebuilder"; 23 | 24 | /** 25 | * Not a unit test! 26 | * 27 | * This method requires a running Solr and only outputs state changes. 28 | * 29 | * Use this by starting the test, then start, stop or update the Solr collection {@code netarchivebuilder} on 30 | * {@code localhost:8983} (default for the SolrWayback bundle) while watching the output. 31 | */ 32 | public void disabledtestAgainstExistingIndex() throws InterruptedException { 33 | SolrClient solrClient = new HttpJdkSolrClient.Builder(SOLR_SERVER).build(); 34 | IndexWatcher watcher = new IndexWatcher( 35 | solrClient, 500, 36 | status -> System.out.println("New status: " + status)); 37 | Thread.sleep(100000000); 38 | } 39 | } -------------------------------------------------------------------------------- /src/js/src/components/harvestCalendar/AllYearsGraph.vue: -------------------------------------------------------------------------------- 1 | 23 | 24 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/util/LimitedReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed under the Apache License, Version 2.0 (the "License"); 3 | * you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
/**
 * Wrapper for a {@link Reader} that limits the amount of characters that are delivered.
 * Excess characters are ignored: once the limit has been reached, reads report end of stream
 * and the remaining content of the wrapped reader is never requested.
 */
public class LimitedReader extends Reader {
    private final Reader source;
    private long charactersLeft;

    /**
     * @param source        the reader to deliver characters from. Closed when this reader is closed.
     * @param maxCharacters the maximum number of characters to deliver.
     *                      Zero or a negative number means that nothing is delivered.
     */
    public LimitedReader(Reader source, long maxCharacters) {
        this.source = source;
        // A negative limit would otherwise slip past the end-of-stream check in read and be
        // handed to the wrapped reader as a negative length.
        charactersLeft = Math.max(0L, maxCharacters);
    }

    /**
     * Reads at most {@code len} characters, never exceeding the remaining character budget.
     *
     * @param cbuf destination buffer.
     * @param off  offset in {@code cbuf} to start storing characters at.
     * @param len  maximum number of characters to read.
     * @return the number of characters read or -1 if the limit or the end of the source has been reached.
     * @throws IOException if the wrapped reader fails.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (charactersLeft <= 0) {
            return -1;
        }
        // The result is at most len, so narrowing to int is safe.
        int newLen = (int) Math.min(charactersLeft, len);
        int read = source.read(cbuf, off, newLen);
        if (read != -1) {
            charactersLeft -= read;
        }
        return read;
    }

    /**
     * Closes the wrapped reader.
     *
     * @throws IOException if the wrapped reader fails to close.
     */
    @Override
    public void close() throws IOException {
        source.close();
    }
}
77 | fun 78 | había 79 | hai 80 | iso 81 | isto 82 | la 83 | las 84 | lle 85 | lles 86 | lo 87 | los 88 | mais 89 | me 90 | meu 91 | meus 92 | min 93 | miña 94 | miñas 95 | moi 96 | na 97 | nas 98 | neste 99 | nin 100 | no 101 | non 102 | nos 103 | nosa 104 | nosas 105 | noso 106 | nosos 107 | nós 108 | nun 109 | nunha 110 | nuns 111 | nunhas 112 | o 113 | os 114 | ou 115 | ó 116 | ós 117 | para 118 | pero 119 | pode 120 | pois 121 | pola 122 | polas 123 | polo 124 | polos 125 | por 126 | que 127 | se 128 | senón 129 | ser 130 | seu 131 | seus 132 | sexa 133 | sido 134 | sobre 135 | súa 136 | súas 137 | tamén 138 | tan 139 | te 140 | ten 141 | teñen 142 | teño 143 | ter 144 | teu 145 | teus 146 | ti 147 | tido 148 | tiña 149 | tiven 150 | túa 151 | túas 152 | un 153 | unha 154 | unhas 155 | uns 156 | vos 157 | vosa 158 | vosas 159 | voso 160 | vosos 161 | vós 162 | -------------------------------------------------------------------------------- /src/bundle/solr_config/conf/lang/stopwords_cz.txt: -------------------------------------------------------------------------------- 1 | a 2 | s 3 | k 4 | o 5 | i 6 | u 7 | v 8 | z 9 | dnes 10 | cz 11 | tímto 12 | budeš 13 | budem 14 | byli 15 | jseš 16 | můj 17 | svým 18 | ta 19 | tomto 20 | tohle 21 | tuto 22 | tyto 23 | jej 24 | zda 25 | proč 26 | máte 27 | tato 28 | kam 29 | tohoto 30 | kdo 31 | kteří 32 | mi 33 | nám 34 | tom 35 | tomuto 36 | mít 37 | nic 38 | proto 39 | kterou 40 | byla 41 | toho 42 | protože 43 | asi 44 | ho 45 | naši 46 | napište 47 | re 48 | což 49 | tím 50 | takže 51 | svých 52 | její 53 | svými 54 | jste 55 | aj 56 | tu 57 | tedy 58 | teto 59 | bylo 60 | kde 61 | ke 62 | pravé 63 | ji 64 | nad 65 | nejsou 66 | či 67 | pod 68 | téma 69 | mezi 70 | přes 71 | ty 72 | pak 73 | vám 74 | ani 75 | když 76 | však 77 | neg 78 | jsem 79 | tento 80 | článku 81 | články 82 | aby 83 | jsme 84 | před 85 | pta 86 | jejich 87 | byl 88 | ještě 89 | až 90 | bez 91 | také 92 | pouze 93 | první 94 | vaše 95 | která 
/**
 * Returns the month indexes of a year, 0 (January) to 11 (December).
 */
export function getArrayOfMonths() {
  return Array.from({ length: 12 }, (_, monthIndex) => monthIndex)
}


/**
 * Returns the harvests that fall in the given month of the given year.
 *
 * @param {number} year full year, compared against Date#getFullYear
 * @param {number} month month index, compared against Date#getMonth
 * @param {Array<Date>} parsedHarvestDates
 */
export function getHarvestsForMonth(year, month, parsedHarvestDates) {
  const isInMonth = (harvest) => {
    if (harvest.getMonth() !== month) {
      return false
    }
    return harvest.getFullYear() === year
  }
  return parsedHarvestDates.filter(isInMonth)
}


/**
 * Returns the harvests that fall on the same calendar day as the given date.
 *
 * @param {Date} day
 * @param {Array<Date>} parsedHarvestDates
 */
export function getHarvestsForDay(day, parsedHarvestDates) {
  return parsedHarvestDates.filter((harvest) => isSameDay(harvest, day))
}


/**
 * Helper function for determining if two dates are the same calendar day
 * (same year, month and day of month).
 *
 * @param {*} day1
 * @param {*} day2
 */
function isSameDay(day1, day2) {
  if (day1.getFullYear() !== day2.getFullYear()) {
    return false
  }
  if (day1.getMonth() !== day2.getMonth()) {
    return false
  }
  return day1.getDate() === day2.getDate()
}


/**
 * Given a Date object, return the number of days in its month.
 * Source: http://stackoverflow.com/questions/1184334/get-number-days-in-a-specified-month-using-javascript
 *
 * Day 0 of the following month is the last day of the month of the dateObject,
 * so its day of month equals the number of days.
 *
 * @param {Date} dateObject
 */
export function getDaysInMonth(dateObject) {
  const followingMonth = dateObject.getMonth() + 1
  const lastDayOfMonth = new Date(dateObject.getFullYear(), followingMonth, 0)
  return lastDayOfMonth.getDate()
}
jeho 103 | své 104 | jiné 105 | zprávy 106 | nové 107 | není 108 | vás 109 | jen 110 | podle 111 | zde 112 | už 113 | být 114 | více 115 | bude 116 | již 117 | než 118 | který 119 | by 120 | které 121 | co 122 | nebo 123 | ten 124 | tak 125 | má 126 | při 127 | od 128 | po 129 | jsou 130 | jak 131 | další 132 | ale 133 | si 134 | se 135 | ve 136 | to 137 | jako 138 | za 139 | zpět 140 | ze 141 | do 142 | pro 143 | je 144 | na 145 | atd 146 | atp 147 | jakmile 148 | přičemž 149 | já 150 | on 151 | ona 152 | ono 153 | oni 154 | ony 155 | my 156 | vy 157 | jí 158 | ji 159 | mě 160 | mne 161 | jemu 162 | tomu 163 | těm 164 | těmu 165 | němu 166 | němuž 167 | jehož 168 | jíž 169 | jelikož 170 | jež 171 | jakož 172 | načež 173 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/normalise/NormalisationMinimal.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.normalise; 2 | 3 | import java.net.URL; 4 | 5 | import org.apache.commons.httpclient.URIException; 6 | import org.apache.commons.logging.Log; 7 | import org.apache.commons.logging.LogFactory; 8 | import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; 9 | 10 | public class NormalisationMinimal { 11 | private static Log log = LogFactory.getLog( NormalisationLegacy.class ); 12 | 13 | private static AggressiveUrlCanonicalizer canon = new AggressiveUrlCanonicalizer(); 14 | 15 | 16 | public static String canonicaliseHost(String host) throws URIException { 17 | return canon.urlStringToKey(host.trim()).replace("/", ""); 18 | } 19 | 20 | public static String canonicaliseURL(String url) { 21 | return canonicaliseURL(url, true, true); 22 | } 23 | 24 | 25 | public static String resolveRelative(String url, String relative) throws IllegalArgumentException { 26 | return resolveRelative(url, relative, true); 27 | } 28 | 29 | public static String 
resolveRelative(String url, String relative, boolean normalise) throws IllegalArgumentException { 30 | try { 31 | URL rurl = new URL(url); 32 | String resolved = new URL(rurl, relative).toString(); 33 | return normalise ? canonicaliseURL(resolved) : resolved; 34 | } catch (Exception e) { 35 | throw new IllegalArgumentException(String.format( 36 | "Unable to resolve '%s' relative to '%s'", relative, url), e); 37 | } 38 | } 39 | 40 | public static String canonicaliseURL(String url, boolean allowHighOrder, boolean createUnambiguous) { 41 | //DO nothing 42 | return url; 43 | } 44 | } 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/util/StatusInputStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed under the Apache License, Version 2.0 (the "License"); 3 | * you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | * 14 | */ 15 | package dk.kb.netarchivesuite.solrwayback.util; 16 | 17 | import java.io.FilterInputStream; 18 | import java.io.InputStream; 19 | 20 | /** 21 | * Stream-wrapper with status for the content. 
/**
 * Stream-wrapper that carries a status, an optional exception and an expected size
 * alongside the wrapped content.
 */
public class StatusInputStream extends FilterInputStream {
    public enum STATUS {ok, exception, empty}

    private final STATUS status;
    private final Exception exception;
    private final long size;

    /**
     * Wraps a stream with an explicit status and no exception.
     *
     * @param in           the stream to wrap.
     * @param status       the status to report.
     * @param expectedSize the size to report from {@link #size()}.
     */
    public StatusInputStream(InputStream in, STATUS status, long expectedSize) {
        this(in, status, null, expectedSize);
    }

    /**
     * Wraps a stream with the status {@link STATUS#exception} and the given exception.
     *
     * @param in           the stream to wrap.
     * @param exception    the exception to report from {@link #getException()}.
     * @param expectedSize the size to report from {@link #size()}.
     */
    public StatusInputStream(InputStream in, Exception exception, long expectedSize) {
        this(in, STATUS.exception, exception, expectedSize);
    }

    // Shared initialisation for the two public constructors.
    private StatusInputStream(InputStream in, STATUS status, Exception exception, long expectedSize) {
        super(in);
        this.status = status;
        this.exception = exception;
        this.size = expectedSize;
    }

    /**
     * @return the status given at construction time.
     */
    public STATUS getStatus() {
        return status;
    }

    /**
     * @return the exception given at construction time or null if none was given.
     */
    public Exception getException() {
        return exception;
    }

    /**
     * @return the expected size given at construction time.
     */
    public long size() {
        return size;
    }
}
PropertiesLoader.initProperties(UnitTestUtils.getFile("properties/solrwayback_unittest.properties").getPath()); 18 | 19 | 20 | String arcFile="/media/teg/1200GB_SSD/netarkiv/0205/filedir/27119-33-20080401194737-00004-kb-prod-har-001.kb.dk.arc.gz"; 21 | long offset=10776284; 22 | 23 | 24 | 25 | ArcEntry arcEntry = ArcParser.getArcEntry(ArcSource.fromFile(arcFile), offset); 26 | 27 | String warcHeader = ArcHeader2WarcHeader.arcHeader2WarcHeader(arcEntry); 28 | 29 | 30 | Path exportPath = Paths.get("arc2warc-pdf-error.warc"); 31 | 32 | try{ 33 | Files.delete(exportPath); 34 | } 35 | catch(Exception e){ 36 | 37 | } 38 | Files.createFile(exportPath); 39 | System.out.println(arcEntry.getHeader()); 40 | System.out.println("-----"); 41 | System.out.println(warcHeader); 42 | 43 | Files.write(exportPath, warcHeader.getBytes(WarcParser.WARC_HEADER_ENCODING), StandardOpenOption.APPEND); 44 | Files.write(exportPath, arcEntry.getBinaryDecodedBytes(), StandardOpenOption.APPEND); 45 | Files.write(exportPath, "\r\n\r\n".getBytes(WarcParser.WARC_HEADER_ENCODING), StandardOpenOption.APPEND); // separator 46 | 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/js/src/components/modalComponents/PrimaryModal.vue: -------------------------------------------------------------------------------- 1 | 17 | 18 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/playback/HtmlPlayback.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.playback; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import dk.kb.netarchivesuite.solrwayback.parsers.ParseResult; 7 | import dk.kb.netarchivesuite.solrwayback.parsers.HtmlParserUrlRewriter; 8 | import dk.kb.netarchivesuite.solrwayback.parsers.WaybackToolbarInjecter; 9 | import 
dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry; 10 | import dk.kb.netarchivesuite.solrwayback.service.dto.IndexDoc; 11 | 12 | public class HtmlPlayback extends PlaybackHandler{ 13 | 14 | private static final Logger log = LoggerFactory.getLogger(HtmlPlayback.class); 15 | 16 | public HtmlPlayback(ArcEntry arc, IndexDoc doc, boolean showToolbar){ 17 | super(arc,doc,showToolbar); 18 | } 19 | 20 | @Override 21 | public ArcEntry playback(boolean lenient) throws Exception{ 22 | log.debug(" Generate webpage from FilePath:{} offset:{} content encoding:{} lenient:{}", 23 | doc.getSource_file_path(), doc.getOffset(), arc.getContentEncoding(), lenient); 24 | long start = System.currentTimeMillis(); 25 | 26 | 27 | ParseResult htmlReplaced = HtmlParserUrlRewriter.replaceLinks(arc, lenient); 28 | String textReplaced=htmlReplaced.getReplaced(); 29 | 30 | boolean xhtml = doc.getContentType().toLowerCase().contains("application/xhtml"); 31 | //Inject tooolbar 32 | if (showToolbar ){ //If true or null. 
33 | textReplaced = WaybackToolbarInjecter.injectWaybacktoolBar(doc.getSource_file_path(),doc.getOffset(),htmlReplaced , xhtml); 34 | } 35 | 36 | arc.setStringContent(textReplaced); 37 | 38 | log.info("Generating webpage total processing:"+(System.currentTimeMillis()-start) + " "+doc.getSource_file_path()+ " "+ doc.getOffset() +" "+arc.getUrl()); 39 | arc.setHasBeenDecompressed(true); 40 | return arc; 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/encoders/Sha1Hash.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.encoders; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.InputStream; 6 | import java.security.MessageDigest; 7 | public class Sha1Hash { 8 | 9 | public static String createSha1(File file) throws Exception { 10 | InputStream fis = null; 11 | try{ 12 | fis = new FileInputStream(file); 13 | MessageDigest digest = MessageDigest.getInstance("SHA-1"); 14 | int n = 0; 15 | byte[] buffer = new byte[8192]; 16 | while (n != -1) { 17 | n = fis.read(buffer); 18 | if (n > 0) { 19 | digest.update(buffer, 0, n); 20 | } 21 | } 22 | String hash = "sha1:" + Base32Encoder.encode( digest.digest()); 23 | return hash; 24 | } 25 | catch(Exception e){ 26 | throw e; 27 | } 28 | finally{ 29 | if (fis!=null){ 30 | fis.close(); 31 | } 32 | } 33 | } 34 | 35 | public static String createSha1(InputStream fis) throws Exception { 36 | try{ 37 | MessageDigest digest = MessageDigest.getInstance("SHA-1"); 38 | int n = 0; 39 | byte[] buffer = new byte[8192]; 40 | while (n != -1) { 41 | n = fis.read(buffer); 42 | if (n > 0) { 43 | digest.update(buffer, 0, n); 44 | } 45 | } 46 | String hash = "sha1:" + Base32Encoder.encode( digest.digest()); 47 | return hash; 48 | } 49 | catch(Exception e){ 50 | throw e; 51 | } 52 | finally{ 53 | if (fis!=null){ 54 | fis.close(); 55 | } 56 | 
} 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/java/dk/kb/netarchivesuite/solrwayback/parsers/warc/ArcGzParserTest.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.parsers.warc; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.io.File; 6 | 7 | import org.junit.Test; 8 | 9 | import dk.kb.netarchivesuite.solrwayback.UnitTestUtils; 10 | import dk.kb.netarchivesuite.solrwayback.facade.Facade; 11 | import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry; 12 | 13 | 14 | public class ArcGzParserTest extends UnitTestUtils { 15 | 16 | @Test 17 | public void testArcGzParserHtml() throws Exception { 18 | 19 | File file = getFile("src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc.gz"); 20 | 21 | ArcEntry arcEntry = Facade.getArcEntry(file.getCanonicalPath(), 1306); //HTML entry 22 | assertEquals("text/html", arcEntry.getContentType()); 23 | assertEquals("www.archive.org", arcEntry.getFileName()); 24 | assertEquals(366, arcEntry.getContentLength()); //From header 25 | assertEquals(366,arcEntry.getBinaryDecodedBytes().length); //Actually loaded in binary 26 | assertEquals(200,arcEntry.getStatus_code()); 27 | //System.out.println(new String(arcEntry.getBinary())); //from to 28 | } 29 | 30 | 31 | @Test 32 | public void testArcGzParserImage() throws Exception { 33 | 34 | File file = getFile("src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc.gz"); 35 | 36 | ArcEntry arcEntry = Facade.getArcEntry(file.getCanonicalPath(), 7733); //Image entry (or 9699) 37 | assertEquals("image/jpeg", arcEntry.getContentType()); 38 | assertEquals("logoc.jpg", arcEntry.getFileName()); 39 | assertEquals(1662, arcEntry.getContentLength()); //From header 40 | assertEquals(1662,arcEntry.getBinaryDecodedBytes().length); //Actually loaded in binary 41 | } 42 | } 43 | 
-------------------------------------------------------------------------------- /src/test/java/dk/kb/netarchivesuite/solrwayback/interfaces/FileMovedMappingResolverTest.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.interfaces; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.io.File; 6 | import org.junit.Test; 7 | 8 | import dk.kb.netarchivesuite.solrwayback.UnitTestUtils; 9 | 10 | /** Checks FileMovedMappingResolver against the mapping file FileMovedMappingTest.txt: paths that are not in the mapping resolve to themselves, mapped files resolve to their new location. */ public class FileMovedMappingResolverTest extends UnitTestUtils { 11 | 12 | /** Initializes a resolver from the test mapping file and compares the resolved source of two unmapped and two moved WARC paths with the expected values. */ @Test 13 | public void testFileMovedMappingResolver() throws Exception { 14 | File file = getFile("src/test/resources/arc_resolvers/FileMovedMappingTest.txt"); 15 | 16 | FileMovedMappingResolver resolver = new FileMovedMappingResolver(); 17 | resolver.setMappingFile(file.getCanonicalPath()); 18 | resolver.initialize(); 19 | 20 | //Some WARC files that are not defined in the moved list; they are expected to resolve to the same path 21 | String warc1="/abc/test/example.warc"; 22 | String warc2="/netarchivemount/warcs/111/example111.warc"; 23 | assertEquals(warc1, resolver.resolveArcFileLocation(warc1).getSource()); 24 | assertEquals(warc2, resolver.resolveArcFileLocation(warc2).getSource()); 25 | 26 | //These two have been moved 27 | String warc3="/home/xxx/solrwayback_package_4.2.1/indexing/warcs1/356548-347-20210201093000132-00000-sb-prod-har-001.statsbiblioteket.dk.warc.gz"; 28 | String warc4="/mount/netarchive/test-00000.warc.gz"; 29 | String warc3_moved= resolver.resolveArcFileLocation("/home/old/location/356548-347-20210201093000132-00000-sb-prod-har-001.statsbiblioteket.dk.warc.gz").getSource(); 30 | warc3_moved = warc3_moved.replace("\\", "/"); //Normalize path separators when the unit test runs on Windows 31 | assertEquals(warc3, warc3_moved); 32 | String warc4_moved= resolver.resolveArcFileLocation("/oldlocation/test-00000.warc.gz").getSource(); 33 | warc4_moved = warc4_moved.replace("\\", "/"); //Normalize path separators when the unit test runs on Windows 
34 | assertEquals(warc4,warc4_moved); 35 | 36 | 37 | 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/dk/kb/netarchivesuite/solrwayback/playback/JodelPlayback.java: -------------------------------------------------------------------------------- 1 | package dk.kb.netarchivesuite.solrwayback.playback; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | 7 | import dk.kb.netarchivesuite.solrwayback.parsers.ParseResult; 8 | import dk.kb.netarchivesuite.solrwayback.parsers.WaybackToolbarInjecter; 9 | import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry; 10 | import dk.kb.netarchivesuite.solrwayback.service.dto.IndexDoc; 11 | 12 | /** 13 | * Playback handler for JSON harvested from Jodel. (There is no HTML harvest for Jodel.) 14 | */ 15 | public class JodelPlayback extends PlaybackHandler{ 16 | 17 | private static final Logger log = LoggerFactory.getLogger(JodelPlayback.class); 18 | 19 | /** Passes the ARC entry, the index document and the toolbar flag on to the PlaybackHandler constructor. */ public JodelPlayback(ArcEntry arc, IndexDoc doc, boolean showToolbar){ 20 | super(arc,doc,showToolbar); 21 | } 22 | 23 | /** Replaces the string content of the ARC entry with generated HTML, optionally with the wayback toolbar injected, marks it as text/html in UTF-8 and returns the same entry. Rendering is not implemented yet: the HTML is the literal placeholder "TODO". The lenient argument is not used by this implementation. */ @Override 24 | public ArcEntry playback(boolean lenient) throws Exception{ 25 | log.debug(" Generate Jodel post from FilePath:" + doc.getSource_file_path() + " offset:" + doc.getOffset()); 26 | //Fake html into arc. 27 | 28 | String json = arc.getStringContentAsStringSafe(); //Currently only referenced by the commented-out render call on the next line 29 | String html = "TODO";//Jodel2Html.render(json, arc.getCrawlDate()); 30 | arc.setStringContent(html); 31 | arc.setContentType("text/html"); 32 | ParseResult htmlReplaced = new ParseResult(); //Do not parse. 33 | htmlReplaced.setReplaced(html); 34 | String textReplaced=htmlReplaced.getReplaced(); //TODO count links found, replaced 35 | 36 | //Inject toolbar 37 | if (showToolbar){ //If true or null. NOTE(review): the constructor takes a primitive boolean; confirm whether null can occur here 38 | textReplaced = WaybackToolbarInjecter.injectWaybacktoolBar(doc,htmlReplaced, false); 39 | } 40 | String encoding="UTF-8"; // hack, since the HTML was generated as UTF-8. 
41 | arc.setContentEncoding(encoding); 42 | arc.setStringContent(textReplaced); //can give error. uses UTF-8 (from index) instead of ISO-8859-1 43 | 44 | return arc; 45 | } 46 | 47 | } 48 | --------------------------------------------------------------------------------