4 | Her skal Zahra, Jesper og Jørn også lave en hel masse....
5 |
6 |
7 |
--------------------------------------------------------------------------------
/src/test/resources/compressions_warc/transfer_compression_brotli.warc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_brotli.warc
--------------------------------------------------------------------------------
/src/test/resources/compressions_warc/transfer_compression_gzip.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_gzip.warc.gz
--------------------------------------------------------------------------------
/src/test/resources/compressions_warc/transfer_compression_none.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_none.warc.gz
--------------------------------------------------------------------------------
/src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc
--------------------------------------------------------------------------------
/src/test/resources/compressions_warc/transfer_compression_brotli.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_brotli.warc.gz
--------------------------------------------------------------------------------
/src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/example_arc/IAH-20080430204825-00000-blackbook.arc.gz
--------------------------------------------------------------------------------
/src/test/resources/example_warc/IAH-20080430204825-00000-blackbook.warc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/example_warc/IAH-20080430204825-00000-blackbook.warc
--------------------------------------------------------------------------------
/src/test/resources/example_warc/IAH-20080430204825-00000-blackbook.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/example_warc/IAH-20080430204825-00000-blackbook.warc.gz
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/hyphenations_ga.txt:
--------------------------------------------------------------------------------
1 | # Set of Irish hyphenations for StopFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | h
4 | n
5 | t
6 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/contractions_ca.txt:
--------------------------------------------------------------------------------
1 | # Set of Catalan contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | d
4 | l
5 | m
6 | n
7 | s
8 | t
9 |
--------------------------------------------------------------------------------
/src/js/src/assets/icons/chart.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/main/resources/dk/kb/netarchivesuite/webservices/configuration/build.properties:
--------------------------------------------------------------------------------
1 | # Build Time Information
2 | APPLICATION.NAME=${pom.name}
3 | APPLICATION.VERSION=${pom.version}
4 | APPLICATION.BUILDTIME=${build.time}
--------------------------------------------------------------------------------
/src/test/resources/compressions_warc/transfer_compression_gzip_chunked.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_gzip_chunked.warc.gz
--------------------------------------------------------------------------------
/src/test/resources/compressions_warc/transfer_compression_none_truncated.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netarchivesuite/solrwayback/HEAD/src/test/resources/compressions_warc/transfer_compression_none_truncated.warc.gz
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/contractions_ga.txt:
--------------------------------------------------------------------------------
1 | # Set of Irish contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | d
4 | m
5 | b
6 |
--------------------------------------------------------------------------------
/src/main/resources/build.properties:
--------------------------------------------------------------------------------
1 | #$Id: build.properties 1 2011-10-21 09:33:25Z teg $
2 | # Build Time Information
3 | APPLICATION.NAME=${pom.name}
4 | APPLICATION.VERSION=${pom.version}
5 | APPLICATION.BUILDTIME=${build.time}
--------------------------------------------------------------------------------
/src/test/resources/arc_resolvers/FileMovedMappingTest.txt:
--------------------------------------------------------------------------------
1 | /home/xxx/solrwayback_package_4.2.1/indexing/warcs1/356548-347-20210201093000132-00000-sb-prod-har-001.statsbiblioteket.dk.warc.gz
2 | /mount/netarchive/test-00000.warc.gz
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/contractions_fr.txt:
--------------------------------------------------------------------------------
1 | # Set of French contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | l
4 | m
5 | t
6 | qu
7 | n
8 | s
9 | j
10 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/stemdict_nl.txt:
--------------------------------------------------------------------------------
1 | # Set of overrides for the dutch stemmer
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | fiets fiets
4 | bromfiets bromfiets
5 | ei eier
6 | kind kinder
7 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/contractions_ca.txt:
--------------------------------------------------------------------------------
1 | # Set of Catalan contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | d
4 | l
5 | m
6 | n
7 | s
8 | t
9 |
--------------------------------------------------------------------------------
/src/js/src/assets/icons/image.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/js/src/mixins/SearchboxUtils.js:
--------------------------------------------------------------------------------
1 | export default { // Vue mixin with a helper that sizes an element (e.g. a text area) to its content
2 | methods: {
3 | $_getSizeOfTextArea(id) { // id: key into this.$refs identifying the element to resize
4 | this.$refs[id].style.height = '1px' // shrink first; presumably so scrollHeight reflects the content rather than the previous height
5 | this.$refs[id].style.height = this.$refs[id].scrollHeight + 'px' // then set the height to the element's scrollHeight in px
6 | },
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/src/main/resources/about_this_archive_kb.txt:
--------------------------------------------------------------------------------
1 | About us Welcome to the Danish Netarchive.
2 | For more information see Danish Netarchive .
3 |
4 | The Royal Danish Library.
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/contractions_fr.txt:
--------------------------------------------------------------------------------
1 | # Set of French contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | l
4 | m
5 | t
6 | qu
7 | n
8 | s
9 | j
10 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/stemdict_nl.txt:
--------------------------------------------------------------------------------
1 | # Set of overrides for the dutch stemmer
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | fiets fiets
4 | bromfiets bromfiets
5 | ei eier
6 | kind kinder
7 |
--------------------------------------------------------------------------------
/src/js/src/assets/icons/Icons8_flat_checkmark.svg:
--------------------------------------------------------------------------------
1 |
4 |
--------------------------------------------------------------------------------
/src/js/src/assets/icons/tools.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/js/src/assets/icons/video.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/bundle/indexing/batch_warcs1_folder.bat:
--------------------------------------------------------------------------------
1 | cd /D "%~dp0"
2 | REM Line 1 switches to the folder containing this script. Line 3 runs the warc-indexer jar once for every file found recursively under warcs1, passing config3.conf and the local Solr URL for the netarchivebuilder collection.
3 | FOR /R warcs1 %%G IN (*.*) do java -Dfile.encoding=UTF-8 -Xmx2048M -Djava.io.tmpdir=tika_tmp -jar warc-indexer-3.3.1-jar-with-dependencies.jar -c config3.conf -s "http://localhost:8983/solr/netarchivebuilder" "%%G"
4 |
5 |
--------------------------------------------------------------------------------
/src/bundle/indexing/batch_warcs2_folder.bat:
--------------------------------------------------------------------------------
1 | cd /D "%~dp0"
2 | REM Line 1 switches to the folder containing this script. Line 3 runs the warc-indexer jar once for every file found recursively under warcs2, passing config3.conf and the local Solr URL for the netarchivebuilder collection.
3 | FOR /R warcs2 %%G IN (*.*) do java -Dfile.encoding=UTF-8 -Xmx2048M -Djava.io.tmpdir=tika_tmp -jar warc-indexer-3.3.1-jar-with-dependencies.jar -c config3.conf -s "http://localhost:8983/solr/netarchivebuilder" "%%G"
4 |
5 |
--------------------------------------------------------------------------------
/src/js/src/assets/icons/default.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/README.md:
--------------------------------------------------------------------------------
1 | # Solr configuration
2 |
3 | This folder contains a copy of the Solr configuration and can be used to upload a new Solr configuration to Solr. Only for experienced Solr users who know what they are doing.
4 | See 'Update Solr cloud configuration' in the project README.md
--------------------------------------------------------------------------------
/src/main/webapp/images/today-24px.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/js/src/assets/styles/styleVariables.scss:
--------------------------------------------------------------------------------
1 | :root {
2 | --main-bg-color: #CAF0FE;
3 | --secondary-bg-color: #002E70;
4 | --main-text-color: #303030;
5 | --secondary-text-color: #002E70;
6 | --main-highlight-color: #002E70;
7 | --secondary-highlight-color: #fff6c4;
8 | --seethrough-black: rgba(30,30,30,0.6);
9 | }
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrStreamingLineBasedExportClientInterface.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.solr;
2 | /** Client contract exposing a String-returning next() and a page size; judging by the name, implementations stream Solr export results line by line — TODO confirm. */
3 | public interface SolrStreamingLineBasedExportClientInterface {
4 | /** Returns the next String of the export; how the end of the export is signalled is not visible here — TODO confirm against implementations. */
5 | public String next() throws Exception;
6 | /** Returns the page size as an int; presumably the number of documents fetched per request — TODO confirm. */
7 | public int getPageSize();
8 |
9 | }
10 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetEntity.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.parsers.json;
2 |
3 | import org.apache.commons.lang3.tuple.Pair;
4 | /** Accessor contract for an object that carries a Pair of indices (implemented in this package by TweetHashtag). */
5 | public interface TweetEntity {
6 | Pair getIndices(); // returns the current index pair
7 | void setIndices(Pair newIndices); // replaces the index pair with newIndices
8 | }
9 |
--------------------------------------------------------------------------------
/src/test/resources/example_rewrite/encoding.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Some page with tricky encoding
4 |
5 |
6 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/test/resources/example_rewrite/url_escape.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Some page with tricky URL
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/stopwords_path.txt:
--------------------------------------------------------------------------------
1 | # URL & path elements that should not be indexed (to save space)
2 |
3 | # www is removed by the webarchive-discovery normaliser if it is leading. If it is part of the path we want to keep it, so it is not a stopword
4 |
5 | # All URLs start with http or https, so definitely remove those
6 | http
7 | https
8 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: "Test"
2 | # Builds the SolrWayback Docker image on every push.
3 | on:
4 | push:
5 |
6 | jobs:
7 | build-docker-image:
8 | name: Build Docker image
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Checkout repository
12 | uses: actions/checkout@v4 # v3 runs on the deprecated Node 16 runtime
13 | - name: Build SolrWayback Docker image
14 | run: docker build --tag solrwayback .
15 |
--------------------------------------------------------------------------------
/src/js/src/assets/icons/audio.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/stopwords_path.txt:
--------------------------------------------------------------------------------
1 | # URL & path elements that should not be indexed (to save space)
2 |
3 | # www is removed by the webarchive-discovery normaliser if it is leading. If it is part of the path we want to keep it, so it is not a stopword
4 |
5 | # All URLs start with http or https, so definitely remove those
6 | http
7 | https
8 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/contractions_it.txt:
--------------------------------------------------------------------------------
1 | # Set of Italian contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | c
4 | l
5 | all
6 | dall
7 | dell
8 | nell
9 | sull
10 | coll
11 | pell
12 | gl
13 | agl
14 | dagl
15 | degl
16 | negl
17 | sugl
18 | un
19 | m
20 | t
21 | s
22 | v
23 | d
24 |
--------------------------------------------------------------------------------
/src/test/resources/example_rewrite/encoding_expected.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Some page with tricky encoding
4 |
5 |
6 |
11 |
12 |
--------------------------------------------------------------------------------
/src/main/resources/about_this_archive.txt:
--------------------------------------------------------------------------------
1 |
This is the default SolrWayback about us text.
2 |
3 | The text is intended to have information about the archive.
4 |
5 | Change the property about.text.file in solrwaybackweb.properties to point to a local file with the
6 | full absolute file path. Use HTML markup but with the html/body start and end tag.
7 |
8 |
--------------------------------------------------------------------------------
/src/main/webapp/images/schedule-24dp.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/test/resources/example_rewrite/inline_css.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Inline CSS page
4 |
5 |
10 |
11 |
Inline CSS
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "files.exclude": {
4 | "**/.classpath": true,
5 | "**/.project": true,
6 | "**/.settings": true,
7 | "**/.factorypath": true
8 | },
9 | "editor.codeActionsOnSave": {
10 | "source.fixAll.eslint": true
11 | },
12 | "eslint.validate": ["javascript", "vue"],
13 | "eslint.workingDirectories": [
14 | "./src/js"
15 | ]
16 | }
--------------------------------------------------------------------------------
/src/js/src/assets/icons/web.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/contractions_it.txt:
--------------------------------------------------------------------------------
1 | # Set of Italian contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | c
4 | l
5 | all
6 | dall
7 | dell
8 | nell
9 | sull
10 | coll
11 | pell
12 | gl
13 | agl
14 | dagl
15 | degl
16 | negl
17 | sugl
18 | un
19 | m
20 | t
21 | s
22 | v
23 | d
24 |
--------------------------------------------------------------------------------
/src/test/resources/example_rewrite/css_import.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | CSS includes
4 |
9 |
10 |
"
14 |
15 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/protwords.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | # Use a protected word file to protect against the stemmer reducing two
15 | # unrelated words to the same base word.
16 |
17 | # Some non-words that normally won't be encountered,
18 | # just to test that they won't be stemmed.
19 | dontstems
20 | zwhacky
21 |
22 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetHashtag.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.parsers.json;
2 |
3 | import com.fasterxml.jackson.annotation.JsonProperty;
4 | import org.apache.commons.lang3.tuple.Pair;
5 | /** JSON-mapped hashtag entity: a text plus a Pair of indices, exposed through the TweetEntity contract. */
6 | public class TweetHashtag implements TweetEntity {
7 | private Pair indices; // pair built from the first two elements of the JSON "indices" array; null until set
8 |
9 | private String text; // value exposed by getText(); null until set
10 |
11 | /** Creates an empty instance; both fields remain null until populated. */
12 | public TweetHashtag() {
13 | }
14 | /** Bound to the JSON property "indices": stores elements 0 and 1 of the array as a Pair (throws if the array is null or has fewer than two elements). */
15 | @JsonProperty("indices")
16 | private void unpackIndices(int[] indices) {
17 | this.indices = Pair.of(indices[0], indices[1]);
18 | }
19 | @Override
20 | public Pair getIndices() { // TweetEntity accessor: returns the stored index pair
21 | return indices;
22 | }
23 | /** Replaces the stored index pair with the given one. */
24 | @Override
25 | public void setIndices(Pair newIndices) {
26 | this.indices = newIndices;
27 | }
28 | /** Returns the text of this hashtag. */
29 | public String getText() {
30 | return text;
31 | }
32 | /** Sets the text of this hashtag. */
33 | public void setText(String text) {
34 | this.text = text;
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/test/resources/example_rewrite/script2_expected.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Some page with JavaScript
4 |
5 |
6 |
Modified from a specific page in the Danish net Archive
8 |
9 |
10 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/src/bundle/solrwayback_bundle.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | REM Check if an argument was provided
3 | IF "%~1"=="" (
4 | echo Usage: %~nx0 ^{start^|stop^}
5 | exit /b 1
6 | )
7 | REM NOTE: solr-9 and tomcat-9 are resolved relative to the current directory, so run this script from the folder that contains them.
8 | REM Main case handling
9 | IF /I "%~1"=="start" (
10 | echo Starting solr...
11 | call solr-9\bin\solr.cmd start -c -m 4g
12 |
13 | echo Starting SolrWayback in tomcat...
14 |
15 | REM Set CATALINA_HOME to the "tomcat-9" folder inside the current directory
16 | set "CATALINA_HOME=%cd%\tomcat-9"
17 |
18 | call tomcat-9\bin\startup.bat
19 |
20 | echo.
21 | echo Started SolrWayback
22 | GOTO :eof
23 | )
24 |
25 | IF /I "%~1"=="stop" (
26 | echo Stopping SolrWayback in tomcat...
27 |
28 | REM Set CATALINA_HOME to the "tomcat-9" folder inside the current directory
29 | set "CATALINA_HOME=%cd%\tomcat-9"
30 | call tomcat-9\bin\shutdown.bat
31 |
32 | echo Stopping solr...
33 | call solr-9\bin\solr.cmd stop -all
34 |
35 | echo.
36 | echo Stopped SolrWayback
37 | GOTO :eof
38 | )
39 |
40 | REM Invalid option
41 | echo Invalid option: %~1
42 | echo Usage: %~nx0 ^{start^|stop^}
43 | exit /b 2
44 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/interfaces/ArcHTTPResolver.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed under the Apache License, Version 2.0 (the "License");
3 | * you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS,
10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | * See the License for the specific language governing permissions and
12 | * limitations under the License.
13 | *
14 | */
15 | package dk.kb.netarchivesuite.solrwayback.interfaces;
16 |
17 | import org.slf4j.Logger;
18 | import org.slf4j.LoggerFactory;
19 | /**
20 | * Empty subclass of {@link RewriteLocationResolver}; this class is only kept for backwards compatibility.
21 | * @deprecated use {@link RewriteLocationResolver} instead.
22 | */
23 | @Deprecated
24 | public class ArcHTTPResolver extends RewriteLocationResolver {
25 | private static final Logger log = LoggerFactory.getLogger(ArcHTTPResolver.class); // not referenced anywhere in this class
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/src/js/src/mixins/StringManipulationUtils.js:
--------------------------------------------------------------------------------
1 | export default {
2 | methods: {
3 | // Mixin helpers for formatting facet strings and for validating/shortening domain and date strings.
4 | $_displayFacetName(facet) { // drops the first '&fq=' and returns the text before the first ':' followed by ': '
5 | return facet.replace('&fq=','').split(':')[0] + ': '
6 | },
7 | $_displayFacetValue(facet) { // returns everything after the first ':'; any further ':' become spaces and all double quotes are removed
8 | let s = facet.split(':')
9 | return s.slice(1, s.length).join(' ').replace(/"/g,'')
10 | },
11 | $_checkDomain(domain) { // returns the match array when the pattern matches, otherwise null (not a strict boolean)
12 | // Matches at least 1 dot in the string, and no spaces.
13 | return domain.match(/^[^\s]+\.[^\s]+$/)
14 | },
15 | $_checkDate(date) { // returns the match array or null; only the shape is checked, so e.g. 2021-02-31 still matches
16 | // Matches format YYYY-MM-DD
17 | return date.match(/^([12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01]))$/)
18 | },
19 | $_displayDate(date, timeScale){ // truncates date to a prefix whose length depends on timeScale
20 | // Display only the scale
21 | let end = 0 // stays 0 for an empty date, so '' is returned
22 | if (date.length > 0) {
23 | switch (timeScale) {
24 | case 'YEAR':
25 | case 'null': // the literal string 'null', not the null value
26 | end = 4 // e.g. YYYY of a YYYY-MM-DD string
27 | break
28 | case 'MONTH':
29 | end = 7 // e.g. YYYY-MM
30 | break
31 | case 'WEEK':
32 | case 'DAY':
33 | default:
34 | end = 10 // e.g. YYYY-MM-DD; also used for any unrecognised timeScale
35 | }}
36 | return date.slice(0, end)
37 | }
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/test/resources/example_rewrite/style_element_expected.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Style page
4 |
5 |
6 |
7 |
Some styling
8 |
9 | Hello
10 |
11 |
12 | World
13 |
14 |
15 |
paragraph
16 |
17 |
--------------------------------------------------------------------------------
/src/js/src/mixins/ImageSearchUtils.js:
--------------------------------------------------------------------------------
1 | import HistoryRoutingUtils from './HistoryRoutingUtils'
2 | import { mapStores, mapActions } from 'pinia'
3 | import { useSearchStore } from '../store/search.store'
4 | // Mixin that builds /search URLs starting from an image: one querying links_images, one querying hash.
5 | export default {
6 | mixins: [HistoryRoutingUtils],
7 | computed: {
8 | // ...mapState({
9 | // searchAppliedFacets: state => state.Search.searchAppliedFacets,
10 | // solrSettings: state => state.Search.solrSettings,
11 | // }),
12 | ...mapStores(useSearchStore) // presumably what exposes the store as this.searchStore, which the methods below read
13 | },
14 | methods: {
15 | ...mapActions(useSearchStore, {
16 | updateSolrSettingImgSearch:'updateSolrSettingImgSearch',
17 | }),
18 | $_startPageSearchFromImage(searchItem) { // returns a /search URL querying links_images:"<URI-encoded searchItem>" with the store's current grouping
19 | return '/search?query=' + 'links_images:"' + encodeURIComponent(searchItem) + '"' + '&offset=0&grouping=' + this.searchStore.solrSettings.grouping + '&imgSearch=false&urlSearch=false&facets='
20 | },
21 | $_startImageSearchFromImage(searchItem) { // same URL shape but querying hash:"<URI-encoded searchItem>"; NOTE(review): imgSearch is false here as well — confirm this is intended
22 | return '/search?query=' + 'hash:"' + encodeURIComponent(searchItem) + '"' + '&offset=0&grouping=' + this.searchStore.solrSettings.grouping + '&imgSearch=false&urlSearch=false&facets='
23 | },
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/stopwords_ga.txt:
--------------------------------------------------------------------------------
1 |
2 | a
3 | ach
4 | ag
5 | agus
6 | an
7 | aon
8 | ar
9 | arna
10 | as
11 | b'
12 | ba
13 | beirt
14 | bhúr
15 | caoga
16 | ceathair
17 | ceathrar
18 | chomh
19 | chtó
20 | chuig
21 | chun
22 | cois
23 | céad
24 | cúig
25 | cúigear
26 | d'
27 | daichead
28 | dar
29 | de
30 | deich
31 | deichniúr
32 | den
33 | dhá
34 | do
35 | don
36 | dtí
37 | dá
38 | dár
39 | dó
40 | faoi
41 | faoin
42 | faoina
43 | faoinár
44 | fara
45 | fiche
46 | gach
47 | gan
48 | go
49 | gur
50 | haon
51 | hocht
52 | i
53 | iad
54 | idir
55 | in
56 | ina
57 | ins
58 | inár
59 | is
60 | le
61 | leis
62 | lena
63 | lenár
64 | m'
65 | mar
66 | mo
67 | mé
68 | na
69 | nach
70 | naoi
71 | naonúr
72 | ná
73 | ní
74 | níor
75 | nó
76 | nócha
77 | ocht
78 | ochtar
79 | os
80 | roimh
81 | sa
82 | seacht
83 | seachtar
84 | seachtó
85 | seasca
86 | seisear
87 | siad
88 | sibh
89 | sinn
90 | sna
91 | sé
92 | sí
93 | tar
94 | thar
95 | thú
96 | triúr
97 | trí
98 | trína
99 | trínár
100 | tríocha
101 | tú
102 | um
103 | ár
104 | é
105 | éis
106 | í
107 | ó
108 | ón
109 | óna
110 | ónár
111 |
--------------------------------------------------------------------------------
/src/main/resources/about_collection.txt:
--------------------------------------------------------------------------------
1 |
About the collection
2 |
General Information
3 |
4 | This is the default example template for the 'About The Collection' text.
5 | Change the property collection.text.file in solrwaybackweb.properties to point to a local file with the
6 | full absolute file path. Use HTML markup but with the html/body start and end tag.
7 |
8 |
9 |
10 |
11 | Our collection has been collected through four primary strategies for collecting.
12 |
13 |
14 |
Harvest strategy example
15 |
16 |
Cross-sectional collection which takes a snapshot of all Danish domains up to four times a year
17 |
Selective collection from the following types of websites: all Danish news media (ranging from 12 times daily to weekly), political parties, organisations and associations, ministries and agencies, selected profiles from social media, YouTube videos (for example weekly)
18 |
Event collection of two or three events annually (for example parliamentary elections or the Corona pandemic)
19 |
Special collections for example based on research requests
20 |
21 |
22 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/smurf/SmurfBuckets.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.service.dto.smurf;
2 |
3 |
4 | import java.util.ArrayList;
5 | import java.util.List;
6 |
7 | import javax.xml.bind.annotation.XmlRootElement;
8 | /** DTO annotated as an XML root element; holds two lists (countPercent, countsTotal) and an emptyResult flag. */
9 | @XmlRootElement
10 | public class SmurfBuckets {
11 |
12 |
13 | private List countPercent = new ArrayList<>(); // starts as an empty list; the element type is not visible in this view
14 | private List countsTotal= new ArrayList<>(); // starts as an empty list; the element type is not visible in this view
15 | private boolean emptyResult=true; // defaults to true until a caller sets it otherwise
16 | /** No-argument constructor; presumably required for XML binding — TODO confirm. */
17 | public SmurfBuckets(){
18 | }
19 |
20 | /** Returns the countPercent list itself (no defensive copy). */
21 | public List getCountPercent() {
22 | return countPercent;
23 | }
24 |
25 | /** Replaces the countPercent list with the given reference. */
26 | public void setCountPercent(List countPercent) {
27 | this.countPercent = countPercent;
28 | }
29 |
30 | /** Returns the countsTotal list itself (no defensive copy). */
31 | public List getCountsTotal() {
32 | return countsTotal;
33 | }
34 |
35 | /** Replaces the countsTotal list with the given reference. */
36 | public void setCountsTotal(List countsTotal) {
37 | this.countsTotal = countsTotal;
38 | }
39 |
40 | /** Returns the emptyResult flag (true unless changed through the setter). */
41 | public boolean isEmptyResult() {
42 | return emptyResult;
43 | }
44 |
45 | /** Sets the emptyResult flag. */
46 | public void setEmptyResult(boolean emptyResult) {
47 | this.emptyResult = emptyResult;
48 | }
49 |
50 | }
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_ga.txt:
--------------------------------------------------------------------------------
1 |
2 | a
3 | ach
4 | ag
5 | agus
6 | an
7 | aon
8 | ar
9 | arna
10 | as
11 | b'
12 | ba
13 | beirt
14 | bhúr
15 | caoga
16 | ceathair
17 | ceathrar
18 | chomh
19 | chtó
20 | chuig
21 | chun
22 | cois
23 | céad
24 | cúig
25 | cúigear
26 | d'
27 | daichead
28 | dar
29 | de
30 | deich
31 | deichniúr
32 | den
33 | dhá
34 | do
35 | don
36 | dtí
37 | dá
38 | dár
39 | dó
40 | faoi
41 | faoin
42 | faoina
43 | faoinár
44 | fara
45 | fiche
46 | gach
47 | gan
48 | go
49 | gur
50 | haon
51 | hocht
52 | i
53 | iad
54 | idir
55 | in
56 | ina
57 | ins
58 | inár
59 | is
60 | le
61 | leis
62 | lena
63 | lenár
64 | m'
65 | mar
66 | mo
67 | mé
68 | na
69 | nach
70 | naoi
71 | naonúr
72 | ná
73 | ní
74 | níor
75 | nó
76 | nócha
77 | ocht
78 | ochtar
79 | os
80 | roimh
81 | sa
82 | seacht
83 | seachtar
84 | seachtó
85 | seasca
86 | seisear
87 | siad
88 | sibh
89 | sinn
90 | sna
91 | sé
92 | sí
93 | tar
94 | thar
95 | thú
96 | triúr
97 | trí
98 | trína
99 | trínár
100 | tríocha
101 | tú
102 | um
103 | ár
104 | é
105 | éis
106 | í
107 | ó
108 | ón
109 | óna
110 | ónár
111 |
--------------------------------------------------------------------------------
/src/js/src/components/harvestTimeResources/HarvestPagePreview.vue:
--------------------------------------------------------------------------------
1 |
2 |
17 |
18 |
44 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/stopwords_eu.txt:
--------------------------------------------------------------------------------
1 | # example set of basque stopwords
2 | al
3 | anitz
4 | arabera
5 | asko
6 | baina
7 | bat
8 | batean
9 | batek
10 | bati
11 | batzuei
12 | batzuek
13 | batzuetan
14 | batzuk
15 | bera
16 | beraiek
17 | berau
18 | berauek
19 | bere
20 | berori
21 | beroriek
22 | beste
23 | bezala
24 | da
25 | dago
26 | dira
27 | ditu
28 | du
29 | dute
30 | edo
31 | egin
32 | ere
33 | eta
34 | eurak
35 | ez
36 | gainera
37 | gu
38 | gutxi
39 | guzti
40 | haiei
41 | haiek
42 | haietan
43 | hainbeste
44 | hala
45 | han
46 | handik
47 | hango
48 | hara
49 | hari
50 | hark
51 | hartan
52 | hau
53 | hauei
54 | hauek
55 | hauetan
56 | hemen
57 | hemendik
58 | hemengo
59 | hi
60 | hona
61 | honek
62 | honela
63 | honetan
64 | honi
65 | hor
66 | hori
67 | horiei
68 | horiek
69 | horietan
70 | horko
71 | horra
72 | horrek
73 | horrela
74 | horretan
75 | horri
76 | hortik
77 | hura
78 | izan
79 | ni
80 | noiz
81 | nola
82 | non
83 | nondik
84 | nongo
85 | nor
86 | nora
87 | ze
88 | zein
89 | zen
90 | zenbait
91 | zenbat
92 | zer
93 | zergatik
94 | ziren
95 | zituen
96 | zu
97 | zuek
98 | zuen
99 | zuten
100 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/userdict_ja.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
3 | #
4 | # Add entries to this file in order to override the statistical model in terms
5 | # of segmentation, readings and part-of-speech tags. Notice that entries do
6 | # not have weights since they are always used when found. This is by-design
7 | # in order to maximize ease-of-use.
8 | #
9 | # Entries are defined using the following CSV format:
10 | # <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
11 | #
12 | # Notice that a single half-width space separates tokens and readings, and
13 | # that the number of tokens and readings must match exactly.
14 | #
15 | # Also notice that the behavior of multiple entries with the same <text> is undefined.
16 | #
17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines.
18 | #
19 |
20 | # Custom segmentation for kanji compounds
21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
23 |
24 | # Custom segmentation for compound katakana
25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
27 |
28 | # Custom reading for former sumo wrestler
29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名
30 |
--------------------------------------------------------------------------------
/src/js/src/components/harvestCalendar/util.js:
--------------------------------------------------------------------------------
/**
 * Sorts an array of dates in place by their timestamp, smallest first,
 * i.e. the oldest date ends up at index 0.
 * Note: despite the function name, the resulting order is ascending in time.
 * Note: mutates the input array and returns that same array.
 *
 * @param {Date[]} dateArray The dates to sort.
 * @returns {Date[]} The same array instance, now sorted.
 */
export function sortDatesDescending(dateArray) {
  const oldestFirst = (earlier, later) => earlier.getTime() - later.getTime()
  return dateArray.sort(oldestFirst)
}
11 |
/**
 * Converts a date to human readable output, e.g. "Sunday, January 5, 2020 - 9:05".
 * All parts are taken from the local-time getters of the Date object.
 *
 * @param {Date} date The date to format. A value that is not a Date instance is returned unchanged.
 * @param {Boolean} showWeekday When true, the English weekday name is prepended.
 * @param {Boolean} showTime When true, " - hours:minutes" is appended.
 * @returns {String|*} The formatted string, or the input itself when it is not a Date.
 */
export function toHumanDate(date, showWeekday = false, showTime = false) {
  const days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
  const months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
  if (!(date instanceof Date)) {
    return date
  }
  let dateString = `${months[date.getMonth()]} ${date.getDate()}, ${date.getFullYear()}`
  if (showTime) {
    // Zero-pad the minutes; without padding five past nine would be rendered as "9:5".
    const minutes = String(date.getMinutes()).padStart(2, '0')
    dateString += ` - ${date.getHours()}:${minutes}`
  }
  return showWeekday ? days[date.getDay()] + ', ' + dateString : dateString
}
27 |
28 |
29 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/service/exception/SolrWaybackServiceException.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.service.exception;
2 |
3 | import javax.ws.rs.core.Response;
4 |
5 |
6 | public abstract class SolrWaybackServiceException extends Exception {
7 | private static final long serialVersionUID = 27182818L;
8 | private final Response.Status responseStatus;
9 |
10 | public Response.Status getResponseStatus() {
11 | return responseStatus;
12 | }
13 |
14 | public SolrWaybackServiceException(Response.Status responseStatus)
15 | {
16 | super();
17 | this.responseStatus = responseStatus;
18 | }
19 |
20 | public SolrWaybackServiceException(String message, Response.Status responseStatus) {
21 | super(message);
22 | this.responseStatus = responseStatus;
23 | }
24 |
25 | public SolrWaybackServiceException(String message, Throwable cause, Response.Status responseStatus) {
26 | super(message, cause);
27 | this.responseStatus = responseStatus;
28 | }
29 |
30 | public SolrWaybackServiceException(Throwable cause, Response.Status responseStatus) {
31 | super(cause);
32 | this.responseStatus = responseStatus;
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_eu.txt:
--------------------------------------------------------------------------------
1 | # example set of basque stopwords
2 | al
3 | anitz
4 | arabera
5 | asko
6 | baina
7 | bat
8 | batean
9 | batek
10 | bati
11 | batzuei
12 | batzuek
13 | batzuetan
14 | batzuk
15 | bera
16 | beraiek
17 | berau
18 | berauek
19 | bere
20 | berori
21 | beroriek
22 | beste
23 | bezala
24 | da
25 | dago
26 | dira
27 | ditu
28 | du
29 | dute
30 | edo
31 | egin
32 | ere
33 | eta
34 | eurak
35 | ez
36 | gainera
37 | gu
38 | gutxi
39 | guzti
40 | haiei
41 | haiek
42 | haietan
43 | hainbeste
44 | hala
45 | han
46 | handik
47 | hango
48 | hara
49 | hari
50 | hark
51 | hartan
52 | hau
53 | hauei
54 | hauek
55 | hauetan
56 | hemen
57 | hemendik
58 | hemengo
59 | hi
60 | hona
61 | honek
62 | honela
63 | honetan
64 | honi
65 | hor
66 | hori
67 | horiei
68 | horiek
69 | horietan
70 | horko
71 | horra
72 | horrek
73 | horrela
74 | horretan
75 | horri
76 | hortik
77 | hura
78 | izan
79 | ni
80 | noiz
81 | nola
82 | non
83 | nondik
84 | nongo
85 | nor
86 | nora
87 | ze
88 | zein
89 | zen
90 | zenbait
91 | zenbat
92 | zer
93 | zergatik
94 | ziren
95 | zituen
96 | zu
97 | zuek
98 | zuen
99 | zuten
100 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/userdict_ja.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
3 | #
4 | # Add entries to this file in order to override the statistical model in terms
5 | # of segmentation, readings and part-of-speech tags. Notice that entries do
6 | # not have weights since they are always used when found. This is by-design
7 | # in order to maximize ease-of-use.
8 | #
9 | # Entries are defined using the following CSV format:
10 | # <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
11 | #
12 | # Notice that a single half-width space separates tokens and readings, and
13 | # that the number of tokens and readings must match exactly.
14 | #
15 | # Also notice that the behavior of multiple entries with the same <text> is undefined.
16 | #
17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines.
18 | #
19 |
20 | # Custom segmentation for kanji compounds
21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
23 |
24 | # Custom segmentation for compound katakana
25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
27 |
28 | # Custom reading for former sumo wrestler
29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名
30 |
--------------------------------------------------------------------------------
/src/js/src/components/LoadingOverlay.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
10 |
11 |
12 |
13 |
14 |
41 |
--------------------------------------------------------------------------------
/src/js/src/components/ngrams/searchHelper.js:
--------------------------------------------------------------------------------
1 | import Configs from './netarchive/configs'
export default {
  /**
   * Builds a search URL from the given query, date and scale and opens it in a new browser tab.
   *
   * @param {String} queryFromClick The query text. For searchType 'tags' it is wrapped as elements_used:"...".
   * @param {String} dateFromClick Date string; its first four characters are used as the year,
   *                               and the whole string is used as the date part before 'T00:00:00Z'.
   * @param {String} searchType 'tags' selects the elements_used query; any other value uses the query as is.
   * @param {String} scale 'YEAR', 'WEEK', or another unit name that is appended after '+1' (presumably Solr date math - confirm).
   */
  handleSearch(queryFromClick, dateFromClick, searchType, scale) {
    let filter
    if (scale === 'YEAR') {
      // Search on crawl_year if scale is YEAR.
      filter = '&fq=crawl_year:' + dateFromClick.slice(0, 4)
    } else {
      // Search on a crawl_date range otherwise; a week is expressed as seven days.
      const gap = scale === 'WEEK' ? '+7DAYS' : '+1' + scale
      filter = '&fq=crawl_date:[' + dateFromClick + 'T00:00:00Z TO ' + dateFromClick + 'T00:00:00Z' + gap + ']'
    }
    const param = '&grouping=false&imgSearch=false&offset=0&urlSearch=false'
    const facets = filter + '&fq=content_type_norm:"html"'
    // Only the query text differs between the two search types; the rest of the URL is shared.
    const query = searchType === 'tags' ? 'elements_used:"' + queryFromClick + '"' : queryFromClick
    const url = `${Configs.BASE_SEARCH_URL()}?query=${encodeURIComponent(query)}${param}&facets=${encodeURIComponent(facets)}`
    window.open(url, '_blank')
  }
}
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/synonyms.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | ##some test synonym mappings unlikely to appear in real input text
15 | #aaafoo => aaabar
16 | #bbbfoo => bbbfoo bbbbar
17 | #cccfoo => cccbar cccbaz
18 | #fooaaa,baraaa,bazaaa
19 |
20 | ## Some synonym groups specific to this example
21 | #GB,gib,gigabyte,gigabytes
22 | #MB,mib,megabyte,megabytes
23 | #Television, Televisions, TV, TVs
24 | ##notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
25 | ##after us won't split it into two words.
26 |
27 | ## Synonym mappings can be used for spelling correction too
28 | #pixima => pixma
29 |
30 |
--------------------------------------------------------------------------------
/src/js/src/components/notifications/Notifications.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
8 |
9 |
10 |
11 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/FacetCount.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.service.dto;
2 |
3 | public class FacetCount {
4 | private long count;
5 | private String value;
6 |
7 |
8 | public FacetCount(){
9 | }
10 |
11 |
12 | public long getCount() {
13 | return count;
14 | }
15 |
16 |
17 | public void setCount(long count) {
18 | this.count = count;
19 | }
20 |
21 |
22 | public String getValue() {
23 | return value;
24 | }
25 |
26 |
27 | public void setValue(String value) {
28 | this.value = value;
29 | }
30 |
31 |
32 | @Override
33 | public int hashCode() {
34 | final int prime = 31;
35 | int result = 1;
36 | result = prime * result + ((value == null) ? 0 : value.hashCode());
37 | return result;
38 | }
39 |
40 |
41 | @Override
42 | public boolean equals(Object obj) {
43 | if (this == obj)
44 | return true;
45 | if (obj == null)
46 | return false;
47 | if (getClass() != obj.getClass())
48 | return false;
49 | FacetCount other = (FacetCount) obj;
50 | if (value == null) {
51 | if (other.value != null)
52 | return false;
53 | } else if (!value.equals(other.value))
54 | return false;
55 | return true;
56 | }
57 |
58 |
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetURL.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.parsers.json;
2 |
3 | import com.fasterxml.jackson.annotation.JsonProperty;
4 | import org.apache.commons.lang3.tuple.Pair;
5 |
6 | public class TweetURL implements TweetEntity {
7 | private Pair indices;
8 |
9 | private String expandedUrl;
10 |
11 | private String displayUrl;
12 |
13 |
14 | public TweetURL() {
15 | }
16 |
17 | @JsonProperty("indices")
18 | private void unpackIndices(int[] indices) {
19 | this.indices = Pair.of(indices[0], indices[1]);
20 | }
21 |
22 | public Pair getIndices() {
23 | return indices;
24 | }
25 |
26 | @Override
27 | public void setIndices(Pair newIndices) {
28 | this.indices = newIndices;
29 | }
30 |
31 | public String getExpandedUrl() {
32 | return expandedUrl;
33 | }
34 |
35 | public void setExpandedUrl(String expandedUrl) {
36 | this.expandedUrl = expandedUrl;
37 | }
38 |
39 | public String getDisplayUrl() {
40 | return displayUrl;
41 | }
42 |
43 | public void setDisplayUrl(String displayUrl) {
44 | this.displayUrl = displayUrl;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/smurf/NetarchiveYearCountCache.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.smurf;
2 |
3 | import java.util.HashMap;
4 |
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | import dk.kb.netarchivesuite.solrwayback.solr.NetarchiveSolrClient;
9 |
10 | public class NetarchiveYearCountCache {
11 |
12 | private static long lastReloadTime=0;
13 | private static long reloadInterval=4*60*60*1000L; //reload cache every 4 hours
14 | private static HashMap yearFacetsAll = null;
15 | private static final Logger log = LoggerFactory.getLogger(NetarchiveYearCountCache.class);
16 |
17 | private static void reload() throws Exception{
18 | log.info("Reloading netarchive year count cache");
19 | lastReloadTime=System.currentTimeMillis();
20 | HashMap yearFacetsAllTemp = NetarchiveSolrClient.getInstance().getYearFacetsHtmlAll();
21 | yearFacetsAll=yearFacetsAllTemp;
22 | }
23 |
24 | public static synchronized HashMap getYearFacetsAllQuery() throws Exception{
25 | if ( (System.currentTimeMillis() - reloadInterval) > lastReloadTime){
26 | reload();
27 | }
28 | return yearFacetsAll;
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/synonyms.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | ##some test synonym mappings unlikely to appear in real input text
15 | #aaafoo => aaabar
16 | #bbbfoo => bbbfoo bbbbar
17 | #cccfoo => cccbar cccbaz
18 | #fooaaa,baraaa,bazaaa
19 |
20 | ## Some synonym groups specific to this example
21 | #GB,gib,gigabyte,gigabytes
22 | #MB,mib,megabyte,megabytes
23 | #Television, Televisions, TV, TVs
24 | ##notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
25 | ##after us won't split it into two words.
26 |
27 | ## Synonym mappings can be used for spelling correction too
28 | #pixima => pixma
29 |
30 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetEntities.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.parsers.json;
2 |
3 | import com.fasterxml.jackson.annotation.JsonProperty;
4 |
5 | import java.util.List;
6 |
7 | /**
8 | * Even though 'entities' also includes 'media'-objects this is parsed outside this pojo, as media will always
9 | * be contained inside 'extended_entities' that is on the same level as 'entities'.
10 | */
11 | public class TweetEntities {
12 | @JsonProperty("user_mentions")
13 | private List mentions;
14 |
15 | private List urls;
16 |
17 | private List hashtags;
18 |
19 |
20 | public TweetEntities() {
21 | }
22 |
23 | public List getMentions() {
24 | return mentions;
25 | }
26 |
27 | public void setMentions(List mentions) {
28 | this.mentions = mentions;
29 | }
30 |
31 | public List getUrls() {
32 | return urls;
33 | }
34 |
35 | public void setUrls(List urls) {
36 | this.urls = urls;
37 | }
38 |
39 | public List getHashtags() {
40 | return hashtags;
41 | }
42 |
43 | public void setHashtags(List hashtags) {
44 | this.hashtags = hashtags;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/graph/Node.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.service.dto.graph;
2 |
3 | import javax.xml.bind.annotation.XmlRootElement;
4 |
5 | @XmlRootElement
6 | public class Node {
7 | private String name;
8 | private int group;
9 | private int size;
10 | private String color;
11 |
12 | public Node(){
13 | }
14 |
15 | public Node(String name, int group, int size){
16 | this.name=name;
17 | this.group=group;
18 | this.size=size;
19 | }
20 |
21 | public Node(String name, int group, int size, String color){
22 | this.name=name;
23 | this.group=group;
24 | this.size=size;
25 | this.color=color;
26 | }
27 |
28 | public String getName() {
29 | return name;
30 | }
31 |
32 | public void setName(String name) {
33 | this.name = name;
34 | }
35 |
36 | public int getGroup() {
37 | return group;
38 | }
39 |
40 | public void setGroup(int group) {
41 | this.group = group;
42 | }
43 |
44 | public int getSize() {
45 | return size;
46 | }
47 |
48 | public void setSize(int size) {
49 | this.size = size;
50 | }
51 |
52 | public String getColor() {
53 | return color;
54 | }
55 |
56 | public void setColor(String color) {
57 | this.color = color;
58 | }
59 |
60 |
61 |
62 | }
--------------------------------------------------------------------------------
/src/js/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "solrwayback",
3 | "version": "0.0.1",
4 | "private": true,
5 | "type": "module",
6 | "scripts": {
7 | "dev": "vite",
8 | "build": "vite build",
9 | "build-preview": "vite build --mode preview",
10 | "serve": "vite preview",
11 | "lint": "eslint . --ext .vue,.js --ignore-pattern .gitignore"
12 | },
13 | "dependencies": {
14 | "@sideway/formula": "^3.0.1",
15 | "axios": "^1.10.0",
16 | "chart.js": "^2.9.4",
17 | "core-js": "^3.44.0",
18 | "d3": "3.5.17",
19 | "date-fns": "^4.1.0",
20 | "leaflet": "^1.9.4",
21 | "leaflet.markercluster": "^1.5.3",
22 | "pinia": "^3.0.2",
23 | "video.js": "8.23.3",
24 | "vue": "^3.5.13",
25 | "vue-3-slider-component": "^1.0.2",
26 | "vue-chartjs": "^3.5.1",
27 | "vue-router": "^4.5.1"
28 | },
29 | "devDependencies": {
30 | "@vitejs/plugin-vue": "^5.0.5",
31 | "@vitejs/plugin-vue-jsx": "^4.1.2",
32 | "@vue/compiler-sfc": "^3.5.17",
33 | "@vue/eslint-config-prettier": "^10.2.0",
34 | "cross-env": "^7.0.3",
35 | "eslint": "^9.32.0",
36 | "eslint-plugin-vue": "^10.4.0",
37 | "prettier": "^3.5.3",
38 | "sass": "^1.55.0",
39 | "unplugin-vue-components": "^28.5.0",
40 | "v-tooltip": "^1.1.6",
41 | "vite": "^6.3.5",
42 | "vitest": "^3.1.3"
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/listeners/SolrWaybackAsciiLogo.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.listeners;
2 |
/**
 * Holds the SolrWayback ASCII-art logo as a string constant.
 */
public class SolrWaybackAsciiLogo {

    /**
     * The logo, starting and ending with a newline.
     * Backslashes are escaped in the source, so the literal looks skewed here but prints fine.
     */
    public static final String SOLRWAYBACK_LOGO =
            "\n"
            + " _______. ______ __ .______ ____ __ ____ ___ ____ ____ .______ ___ ______ __ ___ \n"
            + " / | / __ \\ | | | _ \\ \\ \\ / \\ / / / \\ \\ \\ / / | _ \\ / \\ / || |/ / \n"
            + " | (----`| | | | | | | |_) | \\ \\/ \\/ / / ^ \\ \\ \\/ / | |_) | / ^ \\ | ,----'| ' / \n"
            + " \\ \\ | | | | | | | / \\ / / /_\\ \\ \\_ _/ | _ < / /_\\ \\ | | | < \n"
            + " .----) | | `--' | | `----.| |\\ \\----. \\ /\\ / / _____ \\ | | | |_) | / _____ \\ | `----.| . \\ \n"
            + " |_______/ \\______/ |_______|| _| `._____| \\__/ \\__/ /__/ \\__\\ |__| |______/ /__/ \\__\\ \\______||__|\\__\\"
            + "\n";

    /**
     * Prints the logo to standard out, for a quick visual check.
     */
    public static void main(String[] args) {
        System.out.println(SOLRWAYBACK_LOGO);
    }
}
22 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/stopwords_en.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # a couple of test stopwords to test that the words are really being
17 | # configured from this file:
18 | stopworda
19 | stopwordb
20 |
21 | # Standard english stop words taken from Lucene's StopAnalyzer
22 | a
23 | an
24 | and
25 | are
26 | as
27 | at
28 | be
29 | but
30 | by
31 | for
32 | if
33 | in
34 | into
35 | is
36 | it
37 | no
38 | not
39 | of
40 | on
41 | or
42 | such
43 | that
44 | the
45 | their
46 | then
47 | there
48 | these
49 | they
50 | this
51 | to
52 | was
53 | will
54 | with
55 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/json/TweetMention.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.parsers.json;
2 |
3 | import com.fasterxml.jackson.annotation.JsonProperty;
4 | import org.apache.commons.lang3.tuple.Pair;
5 |
6 | public class TweetMention implements TweetEntity {
7 | @JsonProperty("id_str")
8 | private String id;
9 |
10 | private Pair indices;
11 |
12 | private String screenName;
13 |
14 |
15 | public TweetMention() {
16 | }
17 |
18 | public TweetMention(String screenName) {
19 | this.screenName = screenName;
20 | }
21 |
22 | @JsonProperty("indices")
23 | private void unpackIndices(int[] indices) {
24 | this.indices = Pair.of(indices[0], indices[1]);
25 | }
26 |
27 | public String getId() {
28 | return id;
29 | }
30 |
31 | public void setId(String id) {
32 | this.id = id;
33 | }
34 |
35 | public Pair getIndices() {
36 | return indices;
37 | }
38 |
39 | @Override
40 | public void setIndices(Pair newIndices) {
41 | this.indices = newIndices;
42 | }
43 |
44 | public String getScreenName() {
45 | return screenName;
46 | }
47 |
48 | public void setScreenName(String screenName) {
49 | this.screenName = screenName;
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_en.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # a couple of test stopwords to test that the words are really being
17 | # configured from this file:
18 | stopworda
19 | stopwordb
20 |
21 | # Standard english stop words taken from Lucene's StopAnalyzer
22 | a
23 | an
24 | and
25 | are
26 | as
27 | at
28 | be
29 | but
30 | by
31 | for
32 | if
33 | in
34 | into
35 | is
36 | it
37 | no
38 | not
39 | of
40 | on
41 | or
42 | such
43 | that
44 | the
45 | their
46 | then
47 | there
48 | these
49 | they
50 | this
51 | to
52 | was
53 | will
54 | with
55 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/playback/CssPlayback.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.playback;
2 |
3 | import org.slf4j.Logger;
4 | import org.slf4j.LoggerFactory;
5 |
6 | import dk.kb.netarchivesuite.solrwayback.parsers.HtmlParserUrlRewriter;
7 | import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry;
8 | import dk.kb.netarchivesuite.solrwayback.service.dto.IndexDoc;
9 |
10 | public class CssPlayback extends PlaybackHandler{
11 |
12 | private static final Logger log = LoggerFactory.getLogger(CssPlayback.class);
13 |
14 | public CssPlayback(ArcEntry arc, IndexDoc doc, boolean showToolbar){
15 | super(arc,doc,showToolbar);
16 | }
17 |
18 | // TODO: Enable propagation of lenient through HtmlParserUrlRewriter.replaceLinksCss(arc)
19 | @Override
20 | public ArcEntry playback(boolean lenient) throws Exception{
21 | //Never show the toolbar.
22 | // TODO: What was the purpose of this round trip? If re-enabled, please state why in a comment
23 | // arc.setBinary(IOUtils.toByteArray(arc.getStringContentAsStringSafe())); //TODO charset;
24 |
25 | String textReplaced = HtmlParserUrlRewriter.replaceLinksCss(arc);
26 | // content-encoding is about compression; not relevant for charset
27 | // if (!"gzip".equalsIgnoreCase(arc.getContentEncoding())){
28 | arc.setStringContent(textReplaced);
29 | return arc;
30 | }
31 |
32 | }
--------------------------------------------------------------------------------
/src/js/src/components/harvestCalendar/plugins/iterators.js:
--------------------------------------------------------------------------------
/**
 * Higher-order function that visits every month of every year in the dates object
 * and invokes the callback with the year key and the month key.
 *
 * @param {Object} datesObject Object keyed by year; each year holds a 'months' object keyed by month.
 * @param {Function} actionFunction Callback invoked as actionFunction(year, month) for every month. Keys are passed as strings.
 */
export function doForEachMonthInDatesObject(datesObject, actionFunction) {
  Object.keys(datesObject).forEach((year) => {
    Object.keys(datesObject[year]['months']).forEach((month) => {
      actionFunction(year, month)
    })
  })
}
15 |
/**
 * Higher-order function that visits every day of every week of every year in the dates object
 * and invokes the callback for each day whose entry is not null.
 *
 * @param {Object} datesObject Object keyed by year; each year holds a 'weeks' object keyed by week,
 *                             which in turn is keyed by day.
 * @param {Function} actionFunction Callback invoked as actionFunction(year, week, day) for every
 *                                  non-null day. Keys are passed as strings.
 */
export function doForEachWeekAndDayInDatesObject(datesObject, actionFunction) {
  Object.keys(datesObject).forEach((year) => {
    Object.keys(datesObject[year]['weeks']).forEach((week) => {
      Object.keys(datesObject[year]['weeks'][week]).forEach((day) => {
        // Only an explicit null marks a day to skip; undefined entries are still visited.
        if (datesObject[year]['weeks'][week][day] === null) {
          return
        }
        actionFunction(year, week, day)
      })
    })
  })
}
34 |
--------------------------------------------------------------------------------
/src/js/vite.config.js:
--------------------------------------------------------------------------------
1 | import { fileURLToPath, URL } from 'node:url';
2 | import vue from '@vitejs/plugin-vue';
3 | import { defineConfig } from 'vite';
4 |
// Vite configuration. `command` is 'serve' when running the dev server and 'build' otherwise.
export default defineConfig(({ command }) => ({
  base: '/solrwayback/',
  build: {
    rollupOptions: {
      input: {
        custom: 'solrwayback_index_page.html'
      }
    }
  },
  root: '.',
  // Dev-server settings are only supplied for `vite` / `vite dev`.
  server: command === 'serve'
    ? {
        open: 'solrwayback_index_page.html',
        proxy: {
          // Requests that already carry the /solrwayback prefix are forwarded with the path unchanged,
          // so no rewrite is needed (the previous rewrite replaced the prefix with itself).
          '^/solrwayback/services': {
            target: 'http://localhost:8080',
            changeOrigin: true,
          },
          // Requests without the prefix get /solrwayback prepended before being forwarded.
          '/services': {
            target: 'http://localhost:8080',
            changeOrigin: true,
            rewrite: (path) => path.replace(/^\/?services/, '/solrwayback/services'),
          },
        },
      }
    : undefined,
  preview: {
    open: 'solrwayback_index_page.html',
  },
  plugins: [vue()],
  resolve: {
    // `extensions` is an option of `resolve`, not an alias entry. Nested inside `alias` it would
    // define an alias literally named "extensions" and leave extension resolution at the default.
    extensions: ['.mjs', '.js', '.ts', '.jsx', '.tsx', '.json', '.vue'],
    alias: {
      '@': fileURLToPath(new URL('./src', import.meta.url)),
    },
  },
}));
46 |
--------------------------------------------------------------------------------
/src/test/java/dk/kb/netarchivesuite/solrwayback/parsers/HtmlParserUrlRewriterFromWarcTest.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.parsers;
2 |
3 | import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry;
4 |
5 | public class HtmlParserUrlRewriterFromWarcTest {
6 |
7 |
8 | /*
9 | * Integration test class to parse HTML from a warc-file.
10 | * Warc-files can not be in reposity so change path to a local warc-file
11 | *
12 | */
13 | public static void main(String []args) {
14 | try {
15 | String warcFile="/media/teg/1TB_SSD/solrwayback_package_3.2/indexing/warcs/denoffentlige-00000.warc";
16 | long offset=2691693;
17 |
18 | ArcEntry arc=ArcParserFileResolver.getArcEntry(warcFile, offset);
19 | String html = arc.getStringContentAsStringSafe();
20 |
21 |
22 | ParseResult rewritten = HtmlParserUrlRewriter.replaceLinks(
23 | html, "http://example.com/somefolder/", "2020-04-30T13:07:00",
24 | RewriteTestHelper.createOXResolver(true));
25 |
26 |
27 | //See the replaced HTML. See all urls are replaced with 'notfound'
28 | System.out.println(rewritten.getReplaced());
29 |
30 | }
31 | catch(Exception e) {
32 | e.printStackTrace();
33 | }
34 |
35 |
36 | }
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/PageResource.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.service.dto;
2 |
3 | import java.util.Date;
4 |
5 | import javax.xml.bind.annotation.XmlRootElement;
6 |
7 | @XmlRootElement
public class PageResource {
    /*
     * Plain data holder describing one resource of a page: its URL, download URL,
     * content type, crawl time and a textual time difference. All properties are
     * nullable and exposed through simple getter/setter pairs.
     */

    private String url;
    private String downloadUrl;
    private String contentType;
    private Date crawlTime;
    private String timeDifference;

    /** Creates an instance with every property unset ({@code null}). */
    public PageResource() {
    }

    /** @return the resource URL, or {@code null} if not set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the resource URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the download URL, or {@code null} if not set */
    public String getDownloadUrl() {
        return this.downloadUrl;
    }

    /** @param downloadUrl the download URL */
    public void setDownloadUrl(String downloadUrl) {
        this.downloadUrl = downloadUrl;
    }

    /** @return the content type, or {@code null} if not set */
    public String getContentType() {
        return this.contentType;
    }

    /** @param contentType the content type */
    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    /** @return the stored crawl time instance itself (no copy is made), or {@code null} */
    public Date getCrawlTime() {
        return this.crawlTime;
    }

    /** @param crawlTime the crawl time; the reference is stored as given */
    public void setCrawlTime(Date crawlTime) {
        this.crawlTime = crawlTime;
    }

    /** @return the time difference text, or {@code null} if not set */
    public String getTimeDifference() {
        return this.timeDifference;
    }

    /** @param timeDifference the time difference text */
    public void setTimeDifference(String timeDifference) {
        this.timeDifference = timeDifference;
    }
}
62 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/service/dto/statistics/DomainYearStatistics.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.service.dto.statistics;
2 |
3 | import javax.xml.bind.annotation.XmlRootElement;
4 |
5 | @XmlRootElement
public class DomainYearStatistics {
    /*
     * Plain data holder with per-year figures for a domain: the year, ingoing
     * link count, size in kilobytes, page count and the domain name.
     */

    private int year;
    private int ingoingLinks;
    private int sizeInKb;
    // Backing field for the "totalPages" property (getTotalPages/setTotalPages).
    private int uniquePages;
    private String domain;

    /** Creates an instance with numeric properties at 0 and no domain. */
    public DomainYearStatistics() {
    }

    /** @return the year */
    public int getYear() {
        return this.year;
    }

    /** @param year the year */
    public void setYear(int year) {
        this.year = year;
    }

    /** @return the number of ingoing links */
    public int getIngoingLinks() {
        return this.ingoingLinks;
    }

    /** @param ingoingLinks the number of ingoing links */
    public void setIngoingLinks(int ingoingLinks) {
        this.ingoingLinks = ingoingLinks;
    }

    /** @return the size in kilobytes */
    public int getSizeInKb() {
        return this.sizeInKb;
    }

    /** @param sizeInKb the size in kilobytes */
    public void setSizeInKb(int sizeInKb) {
        this.sizeInKb = sizeInKb;
    }

    /** @return the page count held in the {@code uniquePages} field */
    public int getTotalPages() {
        return this.uniquePages;
    }

    /** @param totalPages the page count, stored in the {@code uniquePages} field */
    public void setTotalPages(int totalPages) {
        this.uniquePages = totalPages;
    }

    /** @return the domain, or {@code null} if not set */
    public String getDomain() {
        return this.domain;
    }

    /** @param domain the domain */
    public void setDomain(String domain) {
        this.domain = domain;
    }
}
71 |
--------------------------------------------------------------------------------
/src/test/resources/example_rewrite/simple.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Some page
4 |
5 |
6 |
7 |
8 |
10 |
11 |
12 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/src/js/src/components/harvestCalendar/plugins/tranformationHelpers.js:
--------------------------------------------------------------------------------
/**
 * Maps a harvest count onto a linear activity scale from 0 to 4.
 * 0 means no activity level at all, 4 is the highest level. The range
 * (0, maximumHarvests] is split into four equal quarters; a count outside
 * that range yields 0.
 *
 * @param {number} harvestsInMonth number of harvests in the month
 * @param {number} maximumHarvests the count that corresponds to the top level
 * @returns {number} activity level, an integer from 0 to 4
 */
export function calculateLinearActivityLevel(harvestsInMonth, maximumHarvests) {
  // Counts outside (0, maximumHarvests] do not belong to any quarter.
  const withinScale = harvestsInMonth > 0 && harvestsInMonth <= maximumHarvests
  if (!withinScale) {
    return 0
  }

  // Walk the quarter boundaries from the top; each lower bound is exclusive.
  if (harvestsInMonth > maximumHarvests * 0.75) {
    return 4
  }
  if (harvestsInMonth > maximumHarvests * 0.50) {
    return 3
  }
  if (harvestsInMonth > maximumHarvests * 0.25) {
    return 2
  }
  return 1
}
18 |
19 |
/**
 * Maps a harvest count onto a logarithmic activity scale from 0 to 4.
 *
 * The level is derived from log base `maximumHarvests` of `harvestsInMonth`,
 * split into quarters of the range (0, 1]. Level 0 is reserved for counts
 * outside (0, maximumHarvests]; any count inside that range gets at least
 * level 1, so a month with a single harvest is not shown as "no activity".
 *
 * @param {number} harvestsInMonth number of harvests in the month
 * @param {number} maximumHarvests the count that corresponds to the top level
 * @returns {number} activity level, an integer from 0 to 4
 */
export function calculateLogarithmicActivityLevel(harvestsInMonth, maximumHarvests) {
  // No harvests, or a count above the stated maximum: no level.
  if (!(harvestsInMonth > 0 && harvestsInMonth <= maximumHarvests)) {
    return 0
  }

  // The busiest month is always the top level. Handling it here also covers
  // maximumHarvests === 1, where log(1) / log(1) would be NaN.
  if (harvestsInMonth >= maximumHarvests) {
    return 4
  }

  const logarithmicResult = getBaseLog(maximumHarvests, harvestsInMonth)

  if (logarithmicResult > 0.75) {
    return 4
  }
  if (logarithmicResult > 0.50) {
    return 3
  }
  if (logarithmicResult > 0.25) {
    return 2
  }
  // Includes logarithmicResult === 0, i.e. exactly one harvest.
  return 1
}


/**
 * Returns the logarithm of y with base x, i.e. logx(y).
 *
 * @param {number} x the base
 * @param {number} y the value
 * @returns {number} Math.log(y) / Math.log(x)
 */
function getBaseLog(x, y) {
  return Math.log(y) / Math.log(x)
}
47 |
48 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/util/CountingMap.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed under the Apache License, Version 2.0 (the "License");
3 | * you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS,
10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | * See the License for the specific language governing permissions and
12 | * limitations under the License.
13 | *
14 | */
15 | package dk.kb.netarchivesuite.solrwayback.util;
16 |
17 | import org.apache.commons.logging.LogFactory;
18 | import org.apache.commons.logging.Log;
19 |
20 | import java.util.HashMap;
21 |
/**
 * {@link HashMap} that keeps track of the number of successful and unsuccessful
 * {@link #get(Object)} calls.
 * <p>
 * A call counts as successful when it returns a non-null value. A missing key and a key
 * mapped to {@code null} both count as failures. The counters are plain {@code int}
 * fields updated without synchronization.
 *
 * @param <K> the type of keys maintained by this map
 * @param <V> the type of mapped values
 */
public class CountingMap<K, V> extends HashMap<K, V> {

    private int found = 0;
    private int fail = 0;

    /**
     * Looks up the value for the given key and records whether the lookup produced a value.
     *
     * @param o the key to look up
     * @return the mapped value, or {@code null} if there is none
     */
    @Override
    public V get(Object o) {
        V value = super.get(o);
        if (value == null) {
            fail++;
        } else {
            found++;
        }
        return value;
    }

    /**
     * @return the number of {@link #get(Object)} calls that returned a non-null value
     */
    public int getFoundCount() {
        return found;
    }

    /**
     * @return the number of {@link #get(Object)} calls that returned {@code null}
     */
    public int getFailCount() {
        return fail;
    }

}
51 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/ArcFileParserFactory.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.parsers;
2 |
3 | import dk.kb.netarchivesuite.solrwayback.interfaces.ArcSource;
4 | import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry;
5 |
6 | import java.util.Locale;
7 |
8 | public class ArcFileParserFactory {
9 |
10 | /*
11 | * Do not call this call. This class is called from ArcParseFileResolver and will use file-mapping first
12 | *
13 | * @param file_path is the file location, the file location must be resolved first.
14 | * @param offset offset in the warc file
15 | */
16 | public static ArcEntry getArcEntry(ArcSource arcSource, long offset) throws Exception{
17 |
18 | if (arcSource == null ){
19 | throw new IllegalArgumentException("No arcSupplier provided");
20 | }
21 |
22 | ArcEntry arcEntry = null;
23 | String sourceLowercase = arcSource.getSource().toLowerCase(Locale.ROOT);
24 |
25 |
26 | if (sourceLowercase.endsWith(".warc") || sourceLowercase.endsWith(".warc.gz") ) {
27 | arcEntry = WarcParser.getWarcEntry(arcSource, offset);
28 | }
29 |
30 | else if (sourceLowercase.endsWith(".arc") || sourceLowercase.endsWith("arc.gz")){
31 | arcEntry = ArcParser.getArcEntry(arcSource, offset);
32 | }
33 | else{
34 | throw new IllegalArgumentException(
35 | "Expected (W)ARC source not arc or warc: '"+ arcSource.getSource() + "'");
36 | }
37 |
38 | return arcEntry;
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/stopwords_ar.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization)
5 | # This means that when modifying this list, you might need to add some
6 | # redundant entries, for example containing forms with both أ and ا
7 | من
8 | ومن
9 | منها
10 | منه
11 | في
12 | وفي
13 | فيها
14 | فيه
15 | و
16 | ف
17 | ثم
18 | او
19 | أو
20 | ب
21 | بها
22 | به
23 | ا
24 | أ
25 | اى
26 | اي
27 | أي
28 | أى
29 | لا
30 | ولا
31 | الا
32 | ألا
33 | إلا
34 | لكن
35 | ما
36 | وما
37 | كما
38 | فما
39 | عن
40 | مع
41 | اذا
42 | إذا
43 | ان
44 | أن
45 | إن
46 | انها
47 | أنها
48 | إنها
49 | انه
50 | أنه
51 | إنه
52 | بان
53 | بأن
54 | فان
55 | فأن
56 | وان
57 | وأن
58 | وإن
59 | التى
60 | التي
61 | الذى
62 | الذي
63 | الذين
64 | الى
65 | الي
66 | إلى
67 | إلي
68 | على
69 | عليها
70 | عليه
71 | اما
72 | أما
73 | إما
74 | ايضا
75 | أيضا
76 | كل
77 | وكل
78 | لم
79 | ولم
80 | لن
81 | ولن
82 | هى
83 | هي
84 | هو
85 | وهى
86 | وهي
87 | وهو
88 | فهى
89 | فهي
90 | فهو
91 | انت
92 | أنت
93 | لك
94 | لها
95 | له
96 | هذه
97 | هذا
98 | تلك
99 | ذلك
100 | هناك
101 | كانت
102 | كان
103 | يكون
104 | تكون
105 | وكانت
106 | وكان
107 | غير
108 | بعض
109 | قد
110 | نحو
111 | بين
112 | بينما
113 | منذ
114 | ضمن
115 | حيث
116 | الان
117 | الآن
118 | خلال
119 | بعد
120 | قبل
121 | حتى
122 | عند
123 | عندما
124 | لدى
125 | جميع
126 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_ar.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization)
5 | # This means that when modifying this list, you might need to add some
6 | # redundant entries, for example containing forms with both أ and ا
7 | من
8 | ومن
9 | منها
10 | منه
11 | في
12 | وفي
13 | فيها
14 | فيه
15 | و
16 | ف
17 | ثم
18 | او
19 | أو
20 | ب
21 | بها
22 | به
23 | ا
24 | أ
25 | اى
26 | اي
27 | أي
28 | أى
29 | لا
30 | ولا
31 | الا
32 | ألا
33 | إلا
34 | لكن
35 | ما
36 | وما
37 | كما
38 | فما
39 | عن
40 | مع
41 | اذا
42 | إذا
43 | ان
44 | أن
45 | إن
46 | انها
47 | أنها
48 | إنها
49 | انه
50 | أنه
51 | إنه
52 | بان
53 | بأن
54 | فان
55 | فأن
56 | وان
57 | وأن
58 | وإن
59 | التى
60 | التي
61 | الذى
62 | الذي
63 | الذين
64 | الى
65 | الي
66 | إلى
67 | إلي
68 | على
69 | عليها
70 | عليه
71 | اما
72 | أما
73 | إما
74 | ايضا
75 | أيضا
76 | كل
77 | وكل
78 | لم
79 | ولم
80 | لن
81 | ولن
82 | هى
83 | هي
84 | هو
85 | وهى
86 | وهي
87 | وهو
88 | فهى
89 | فهي
90 | فهو
91 | انت
92 | أنت
93 | لك
94 | لها
95 | له
96 | هذه
97 | هذا
98 | تلك
99 | ذلك
100 | هناك
101 | كانت
102 | كان
103 | يكون
104 | تكون
105 | وكانت
106 | وكان
107 | غير
108 | بعض
109 | قد
110 | نحو
111 | بين
112 | بينما
113 | منذ
114 | ضمن
115 | حيث
116 | الان
117 | الآن
118 | خلال
119 | بعد
120 | قبل
121 | حتى
122 | عند
123 | عندما
124 | لدى
125 | جميع
126 |
--------------------------------------------------------------------------------
/src/test/java/dk/kb/netarchivesuite/solrwayback/export/TestGenerateCSV.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.export;
2 |
3 | import java.io.PrintWriter;
4 |
5 | import dk.kb.netarchivesuite.solrwayback.UnitTestUtils;
6 | import dk.kb.netarchivesuite.solrwayback.properties.PropertiesLoader;
7 | import dk.kb.netarchivesuite.solrwayback.solr.SolrStreamingExportClient;
8 | import org.apache.solr.client.solrj.SolrClient;
9 | import org.apache.solr.client.solrj.impl.HttpJdkSolrClient;
10 |
11 | public class TestGenerateCSV {
12 |
13 | private static final String SOLR = "http://localhost:8983/solr/netarchivebuilder";
14 |
15 | public static void main(String[] args) throws Exception{
16 |
17 | PropertiesLoader.initProperties(UnitTestUtils.getFile("properties/solrwayback_unittest.properties").getPath());
18 |
19 | String query = "thomas egense";
20 | String filter = null;
21 |
22 | String fields = "id, domain, hash , links_images ";
23 | SolrClient solrClient = new HttpJdkSolrClient.Builder(SOLR).build();
24 | SolrStreamingExportClient solr = SolrStreamingExportClient.createCvsExporter(solrClient, query,fields, filter);
25 |
26 | StreamingSolrExportBufferedInputStream streamExport = new StreamingSolrExportBufferedInputStream(solr,100);
27 |
28 | PrintWriter writer = new PrintWriter("export.txt", "UTF-8");
29 |
30 | int read = streamExport.read();
31 | while (read != -1){
32 | // System.out.print(Character.toString((char) read));
33 | writer.write(Character.toString((char) read));
34 | read=streamExport.read();
35 | }
36 | writer.close();
37 | streamExport.close();
38 |
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/src/test/java/dk/kb/netarchivesuite/solrwayback/export/TestGenerateLinkGraphCSV.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.export;
2 |
3 | import java.io.PrintWriter;
4 |
5 | import dk.kb.netarchivesuite.solrwayback.UnitTestUtils;
6 | import dk.kb.netarchivesuite.solrwayback.properties.PropertiesLoader;
7 | import dk.kb.netarchivesuite.solrwayback.solr.SolrStreamingLinkGraphCSVExportClient;
8 | import org.apache.solr.client.solrj.SolrClient;
9 | import org.apache.solr.client.solrj.impl.HttpJdkSolrClient;
10 |
11 | public class TestGenerateLinkGraphCSV {
12 |
13 | private static final String SOLR_SERVER = "http://belinda:8983/solr/netarchivebuilder";
14 |
15 | public static void main(String[] args) throws Exception{
16 |
17 | PropertiesLoader.initProperties(UnitTestUtils.getFile("properties/solrwayback_unittest.properties").getPath());
18 |
19 | String query = "katte";
20 |
21 | SolrClient solrClient = new HttpJdkSolrClient.Builder(PropertiesLoader.SOLR_SERVER).build();
22 | SolrStreamingLinkGraphCSVExportClient solr = SolrStreamingLinkGraphCSVExportClient.createExporter(solrClient, query);
23 |
24 | //MAX 100.000 results
25 | StreamingSolrExportBufferedInputStream streamExport = new StreamingSolrExportBufferedInputStream(solr,100000);
26 |
27 | PrintWriter writer = new PrintWriter("target/linkgraph.csv", "UTF-8");
28 |
29 | int read = streamExport.read();
30 | while (read != -1){
31 | // System.out.print(Character.toString((char) read));
32 | writer.write(Character.toString((char) read));
33 | read=streamExport.read();
34 | }
35 | writer.close();
36 |
37 | streamExport.close();
38 |
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/stopwords_gl.txt:
--------------------------------------------------------------------------------
1 | # Galician stopwords
2 | a
3 | aínda
4 | alí
5 | aquel
6 | aquela
7 | aquelas
8 | aqueles
9 | aquilo
10 | aquí
11 | ao
12 | aos
13 | as
14 | así
15 | á
16 | ben
17 | cando
18 | che
19 | co
20 | coa
21 | comigo
22 | con
23 | connosco
24 | contigo
25 | convosco
26 | coas
27 | cos
28 | cun
29 | cuns
30 | cunha
31 | cunhas
32 | da
33 | dalgunha
34 | dalgunhas
35 | dalgún
36 | dalgúns
37 | das
38 | de
39 | del
40 | dela
41 | delas
42 | deles
43 | desde
44 | deste
45 | do
46 | dos
47 | dun
48 | duns
49 | dunha
50 | dunhas
51 | e
52 | el
53 | ela
54 | elas
55 | eles
56 | en
57 | era
58 | eran
59 | esa
60 | esas
61 | ese
62 | eses
63 | esta
64 | estar
65 | estaba
66 | está
67 | están
68 | este
69 | estes
70 | estiven
71 | estou
72 | eu
73 | é
74 | facer
75 | foi
76 | foron
77 | fun
78 | había
79 | hai
80 | iso
81 | isto
82 | la
83 | las
84 | lle
85 | lles
86 | lo
87 | los
88 | mais
89 | me
90 | meu
91 | meus
92 | min
93 | miña
94 | miñas
95 | moi
96 | na
97 | nas
98 | neste
99 | nin
100 | no
101 | non
102 | nos
103 | nosa
104 | nosas
105 | noso
106 | nosos
107 | nós
108 | nun
109 | nunha
110 | nuns
111 | nunhas
112 | o
113 | os
114 | ou
115 | ó
116 | ós
117 | para
118 | pero
119 | pode
120 | pois
121 | pola
122 | polas
123 | polo
124 | polos
125 | por
126 | que
127 | se
128 | senón
129 | ser
130 | seu
131 | seus
132 | sexa
133 | sido
134 | sobre
135 | súa
136 | súas
137 | tamén
138 | tan
139 | te
140 | ten
141 | teñen
142 | teño
143 | ter
144 | teu
145 | teus
146 | ti
147 | tido
148 | tiña
149 | tiven
150 | túa
151 | túas
152 | un
153 | unha
154 | unhas
155 | uns
156 | vos
157 | vosa
158 | vosas
159 | voso
160 | vosos
161 | vós
162 |
--------------------------------------------------------------------------------
/src/test/java/dk/kb/netarchivesuite/solrwayback/solr/IndexWatcherTest.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.solr;
2 |
3 | import org.apache.solr.client.solrj.SolrClient;
4 | import org.apache.solr.client.solrj.impl.HttpJdkSolrClient;
5 |
6 | /*
7 | * Licensed under the Apache License, Version 2.0 (the "License");
8 | * you may not use this file except in compliance with the License.
9 | * You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | *
19 | */
20 | public class IndexWatcherTest {
21 |
22 | public static final String SOLR_SERVER = "http://localhost:8983/solr/netarchivebuilder";
23 |
24 | /**
25 | * Not a unit test!
26 | *
27 | * This method requires a running Solr and only outputs state changes.
28 | *
29 | * Use this by starting the test, then start, stop or update the Solr collection {@code netarchivebuilder} on
30 | * {@code localhost:8983} (default for the SolrWayback bundle) while watching the output.
31 | */
32 | public void disabledtestAgainstExistingIndex() throws InterruptedException {
33 | SolrClient solrClient = new HttpJdkSolrClient.Builder(SOLR_SERVER).build();
34 | IndexWatcher watcher = new IndexWatcher(
35 | solrClient, 500,
36 | status -> System.out.println("New status: " + status));
37 | Thread.sleep(100000000);
38 | }
39 | }
--------------------------------------------------------------------------------
/src/js/src/components/harvestCalendar/AllYearsGraph.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ year }}
6 |
7 |
8 |
9 |
10 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/util/LimitedReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed under the Apache License, Version 2.0 (the "License");
3 | * you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS,
10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | * See the License for the specific language governing permissions and
12 | * limitations under the License.
13 | *
14 | */
15 | package dk.kb.netarchivesuite.solrwayback.util;
16 |
17 | import java.io.IOException;
18 | import java.io.Reader;
19 |
/**
 * Wrapper for a {@link Reader} that limits the number of characters that are delivered.
 * Excess characters are ignored: once the limit is used up, reads report end-of-stream
 * without touching the wrapped reader. A limit of zero or less delivers nothing.
 */
public class LimitedReader extends Reader {
    private final Reader source;
    private long charactersLeft;

    /**
     * @param source the reader to take characters from
     * @param maxCharacters the maximum number of characters to deliver in total;
     *                      zero or a negative value means no characters are delivered
     */
    public LimitedReader(Reader source, long maxCharacters) {
        this.source = source;
        charactersLeft = maxCharacters;
    }

    /**
     * Reads at most {@code len} characters, further capped by the remaining allowance.
     *
     * @return the number of characters read, or -1 if the allowance is used up or the
     *         wrapped reader has reached its end
     * @throws IOException if the wrapped reader fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        // "<=" rather than "==": a negative limit must not turn into a negative read length.
        if (charactersLeft <= 0) {
            return -1;
        }
        int newLen = (int) Math.min(charactersLeft, len);
        int read = source.read(cbuf, off, newLen);
        if (read > 0) {
            charactersLeft -= read;
        }
        return read;
    }

    /**
     * Closes the wrapped reader.
     *
     * @throws IOException if the wrapped reader fails to close
     */
    @Override
    public void close() throws IOException {
        source.close();
    }
}
51 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_gl.txt:
--------------------------------------------------------------------------------
1 | # Galician stopwords
2 | a
3 | aínda
4 | alí
5 | aquel
6 | aquela
7 | aquelas
8 | aqueles
9 | aquilo
10 | aquí
11 | ao
12 | aos
13 | as
14 | así
15 | á
16 | ben
17 | cando
18 | che
19 | co
20 | coa
21 | comigo
22 | con
23 | connosco
24 | contigo
25 | convosco
26 | coas
27 | cos
28 | cun
29 | cuns
30 | cunha
31 | cunhas
32 | da
33 | dalgunha
34 | dalgunhas
35 | dalgún
36 | dalgúns
37 | das
38 | de
39 | del
40 | dela
41 | delas
42 | deles
43 | desde
44 | deste
45 | do
46 | dos
47 | dun
48 | duns
49 | dunha
50 | dunhas
51 | e
52 | el
53 | ela
54 | elas
55 | eles
56 | en
57 | era
58 | eran
59 | esa
60 | esas
61 | ese
62 | eses
63 | esta
64 | estar
65 | estaba
66 | está
67 | están
68 | este
69 | estes
70 | estiven
71 | estou
72 | eu
73 | é
74 | facer
75 | foi
76 | foron
77 | fun
78 | había
79 | hai
80 | iso
81 | isto
82 | la
83 | las
84 | lle
85 | lles
86 | lo
87 | los
88 | mais
89 | me
90 | meu
91 | meus
92 | min
93 | miña
94 | miñas
95 | moi
96 | na
97 | nas
98 | neste
99 | nin
100 | no
101 | non
102 | nos
103 | nosa
104 | nosas
105 | noso
106 | nosos
107 | nós
108 | nun
109 | nunha
110 | nuns
111 | nunhas
112 | o
113 | os
114 | ou
115 | ó
116 | ós
117 | para
118 | pero
119 | pode
120 | pois
121 | pola
122 | polas
123 | polo
124 | polos
125 | por
126 | que
127 | se
128 | senón
129 | ser
130 | seu
131 | seus
132 | sexa
133 | sido
134 | sobre
135 | súa
136 | súas
137 | tamén
138 | tan
139 | te
140 | ten
141 | teñen
142 | teño
143 | ter
144 | teu
145 | teus
146 | ti
147 | tido
148 | tiña
149 | tiven
150 | túa
151 | túas
152 | un
153 | unha
154 | unhas
155 | uns
156 | vos
157 | vosa
158 | vosas
159 | voso
160 | vosos
161 | vós
162 |
--------------------------------------------------------------------------------
/src/bundle/solr_config/conf/lang/stopwords_cz.txt:
--------------------------------------------------------------------------------
1 | a
2 | s
3 | k
4 | o
5 | i
6 | u
7 | v
8 | z
9 | dnes
10 | cz
11 | tímto
12 | budeš
13 | budem
14 | byli
15 | jseš
16 | můj
17 | svým
18 | ta
19 | tomto
20 | tohle
21 | tuto
22 | tyto
23 | jej
24 | zda
25 | proč
26 | máte
27 | tato
28 | kam
29 | tohoto
30 | kdo
31 | kteří
32 | mi
33 | nám
34 | tom
35 | tomuto
36 | mít
37 | nic
38 | proto
39 | kterou
40 | byla
41 | toho
42 | protože
43 | asi
44 | ho
45 | naši
46 | napište
47 | re
48 | což
49 | tím
50 | takže
51 | svých
52 | její
53 | svými
54 | jste
55 | aj
56 | tu
57 | tedy
58 | teto
59 | bylo
60 | kde
61 | ke
62 | pravé
63 | ji
64 | nad
65 | nejsou
66 | či
67 | pod
68 | téma
69 | mezi
70 | přes
71 | ty
72 | pak
73 | vám
74 | ani
75 | když
76 | však
77 | neg
78 | jsem
79 | tento
80 | článku
81 | články
82 | aby
83 | jsme
84 | před
85 | pta
86 | jejich
87 | byl
88 | ještě
89 | až
90 | bez
91 | také
92 | pouze
93 | první
94 | vaše
95 | která
96 | nás
97 | nový
98 | tipy
99 | pokud
100 | může
101 | strana
102 | jeho
103 | své
104 | jiné
105 | zprávy
106 | nové
107 | není
108 | vás
109 | jen
110 | podle
111 | zde
112 | už
113 | být
114 | více
115 | bude
116 | již
117 | než
118 | který
119 | by
120 | které
121 | co
122 | nebo
123 | ten
124 | tak
125 | má
126 | při
127 | od
128 | po
129 | jsou
130 | jak
131 | další
132 | ale
133 | si
134 | se
135 | ve
136 | to
137 | jako
138 | za
139 | zpět
140 | ze
141 | do
142 | pro
143 | je
144 | na
145 | atd
146 | atp
147 | jakmile
148 | přičemž
149 | já
150 | on
151 | ona
152 | ono
153 | oni
154 | ony
155 | my
156 | vy
157 | jí
158 | ji
159 | mě
160 | mne
161 | jemu
162 | tomu
163 | těm
164 | těmu
165 | němu
166 | němuž
167 | jehož
168 | jíž
169 | jelikož
170 | jež
171 | jakož
172 | načež
173 |
--------------------------------------------------------------------------------
/src/js/src/components/harvestCalendar/harvestDateHelper.js:
--------------------------------------------------------------------------------
/**
 * Builds the list of zero-based month indices for a year.
 *
 * @returns {number[]} a new array [0, 1, 2, ..., 11]
 */
export function getArrayOfMonths() {
  const monthsInYear = 12
  return Array.from({ length: monthsInYear }, (_unused, monthIndex) => monthIndex)
}
7 |
8 |
/**
 * Selects the harvests that fall in a given month of a given year.
 *
 * @param {number} year full year, compared against Date#getFullYear
 * @param {number} month zero-based month, compared against Date#getMonth
 * @param {Array} parsedHarvestDates harvest dates as Date objects
 * @returns {Array} a new array with the matching dates, in their original order
 */
export function getHarvestsForMonth(year, month, parsedHarvestDates) {
  const isInRequestedMonth = (harvest) => harvest.getMonth() === month && harvest.getFullYear() === year
  return parsedHarvestDates.filter(isInRequestedMonth)
}
20 |
21 |
/**
 * Selects the harvests that took place on the same calendar day as a given date.
 *
 * @param {Date} day the day to match
 * @param {Array} parsedHarvestDates harvest dates as Date objects
 * @returns {Array} a new array with the matching dates, in their original order
 */
export function getHarvestsForDay(day, parsedHarvestDates) {
  const happenedOnDay = (harvest) => isSameDay(harvest, day)
  return parsedHarvestDates.filter(happenedOnDay)
}


/**
 * Helper that determines whether two dates fall on the same calendar day,
 * comparing year, then month, then day of month. Time of day is ignored.
 *
 * @param {Date} day1
 * @param {Date} day2
 * @returns {boolean} true when year, month and day of month all match
 */
function isSameDay(day1, day2) {
  if (day1.getFullYear() !== day2.getFullYear()) {
    return false
  }
  if (day1.getMonth() !== day2.getMonth()) {
    return false
  }
  return day1.getDate() === day2.getDate()
}
43 |
44 |
/**
 * Given a Date object, returns the number of days in its month.
 * Source: http://stackoverflow.com/questions/1184334/get-number-days-in-a-specified-month-using-javascript
 *
 * It adds one to the month of the dateObject and sets the day to 0.
 * Day 0 of the following month is the last day of the dateObject's month,
 * so its day-of-month equals the length of that month.
 *
 * @param {Date} dateObject any date within the month of interest
 * @returns {number} the number of days in that month
 */
export function getDaysInMonth(dateObject) {
  const year = dateObject.getFullYear()
  const followingMonth = dateObject.getMonth() + 1
  const lastDayOfMonth = new Date(year, followingMonth, 0)
  return lastDayOfMonth.getDate()
}
57 |
58 |
--------------------------------------------------------------------------------
/src/test/resources/solr_9/netarchivebuilder/conf/lang/stopwords_cz.txt:
--------------------------------------------------------------------------------
1 | a
2 | s
3 | k
4 | o
5 | i
6 | u
7 | v
8 | z
9 | dnes
10 | cz
11 | tímto
12 | budeš
13 | budem
14 | byli
15 | jseš
16 | můj
17 | svým
18 | ta
19 | tomto
20 | tohle
21 | tuto
22 | tyto
23 | jej
24 | zda
25 | proč
26 | máte
27 | tato
28 | kam
29 | tohoto
30 | kdo
31 | kteří
32 | mi
33 | nám
34 | tom
35 | tomuto
36 | mít
37 | nic
38 | proto
39 | kterou
40 | byla
41 | toho
42 | protože
43 | asi
44 | ho
45 | naši
46 | napište
47 | re
48 | což
49 | tím
50 | takže
51 | svých
52 | její
53 | svými
54 | jste
55 | aj
56 | tu
57 | tedy
58 | teto
59 | bylo
60 | kde
61 | ke
62 | pravé
63 | ji
64 | nad
65 | nejsou
66 | či
67 | pod
68 | téma
69 | mezi
70 | přes
71 | ty
72 | pak
73 | vám
74 | ani
75 | když
76 | však
77 | neg
78 | jsem
79 | tento
80 | článku
81 | články
82 | aby
83 | jsme
84 | před
85 | pta
86 | jejich
87 | byl
88 | ještě
89 | až
90 | bez
91 | také
92 | pouze
93 | první
94 | vaše
95 | která
96 | nás
97 | nový
98 | tipy
99 | pokud
100 | může
101 | strana
102 | jeho
103 | své
104 | jiné
105 | zprávy
106 | nové
107 | není
108 | vás
109 | jen
110 | podle
111 | zde
112 | už
113 | být
114 | více
115 | bude
116 | již
117 | než
118 | který
119 | by
120 | které
121 | co
122 | nebo
123 | ten
124 | tak
125 | má
126 | při
127 | od
128 | po
129 | jsou
130 | jak
131 | další
132 | ale
133 | si
134 | se
135 | ve
136 | to
137 | jako
138 | za
139 | zpět
140 | ze
141 | do
142 | pro
143 | je
144 | na
145 | atd
146 | atp
147 | jakmile
148 | přičemž
149 | já
150 | on
151 | ona
152 | ono
153 | oni
154 | ony
155 | my
156 | vy
157 | jí
158 | ji
159 | mě
160 | mne
161 | jemu
162 | tomu
163 | těm
164 | těmu
165 | němu
166 | němuž
167 | jehož
168 | jíž
169 | jelikož
170 | jež
171 | jakož
172 | načež
173 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/normalise/NormalisationMinimal.java:
--------------------------------------------------------------------------------
1 | package dk.kb.netarchivesuite.solrwayback.normalise;
2 |
3 | import java.net.URL;
4 |
5 | import org.apache.commons.httpclient.URIException;
6 | import org.apache.commons.logging.Log;
7 | import org.apache.commons.logging.LogFactory;
8 | import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
9 |
10 | public class NormalisationMinimal {
11 | private static Log log = LogFactory.getLog( NormalisationLegacy.class );
12 |
13 | private static AggressiveUrlCanonicalizer canon = new AggressiveUrlCanonicalizer();
14 |
15 |
16 | public static String canonicaliseHost(String host) throws URIException {
17 | return canon.urlStringToKey(host.trim()).replace("/", "");
18 | }
19 |
20 | public static String canonicaliseURL(String url) {
21 | return canonicaliseURL(url, true, true);
22 | }
23 |
24 |
25 | public static String resolveRelative(String url, String relative) throws IllegalArgumentException {
26 | return resolveRelative(url, relative, true);
27 | }
28 |
29 | public static String resolveRelative(String url, String relative, boolean normalise) throws IllegalArgumentException {
30 | try {
31 | URL rurl = new URL(url);
32 | String resolved = new URL(rurl, relative).toString();
33 | return normalise ? canonicaliseURL(resolved) : resolved;
34 | } catch (Exception e) {
35 | throw new IllegalArgumentException(String.format(
36 | "Unable to resolve '%s' relative to '%s'", relative, url), e);
37 | }
38 | }
39 |
40 | public static String canonicaliseURL(String url, boolean allowHighOrder, boolean createUnambiguous) {
41 | //DO nothing
42 | return url;
43 | }
44 | }
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/src/main/java/dk/kb/netarchivesuite/solrwayback/util/StatusInputStream.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed under the Apache License, Version 2.0 (the "License");
3 | * you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS,
10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | * See the License for the specific language governing permissions and
12 | * limitations under the License.
13 | *
14 | */
15 | package dk.kb.netarchivesuite.solrwayback.util;
16 |
17 | import java.io.FilterInputStream;
18 | import java.io.InputStream;
19 |
/**
 * {@link FilterInputStream} that carries a status for its content, plus an optional
 * exception and an expected size. The wrapper only stores these values; it adds no
 * behaviour to the stream itself.
 */
public class StatusInputStream extends FilterInputStream {
    public enum STATUS {ok, exception, empty}

    private final STATUS status;
    private final Exception exception;
    private final long size;

    /**
     * Wraps a stream with an explicit status and no exception.
     *
     * @param in the stream to wrap
     * @param status the status to report
     * @param expectedSize the value reported by {@link #size()}
     */
    public StatusInputStream(InputStream in, STATUS status, long expectedSize) {
        this(in, status, null, expectedSize);
    }

    /**
     * Wraps a stream with the status {@link STATUS#exception} and the given exception.
     *
     * @param in the stream to wrap
     * @param exception the exception to report
     * @param expectedSize the value reported by {@link #size()}
     */
    public StatusInputStream(InputStream in, Exception exception, long expectedSize) {
        this(in, STATUS.exception, exception, expectedSize);
    }

    /** Shared initialisation for the public constructors. */
    private StatusInputStream(InputStream in, STATUS status, Exception exception, long expectedSize) {
        super(in);
        this.status = status;
        this.exception = exception;
        this.size = expectedSize;
    }

    /** @return the status given at construction */
    public STATUS getStatus() {
        return this.status;
    }

    /** @return the exception given at construction, or {@code null} if there is none */
    public Exception getException() {
        return this.exception;
    }

    /** @return the expected size given at construction */
    public long size() {
        return this.size;
    }
}
56 |
--------------------------------------------------------------------------------
/src/js/src/components/AppliedSearchFacets.vue:
--------------------------------------------------------------------------------
1 |
2 |