├── src
├── test
│ ├── resources
│ │ ├── XMLDumpParserErrorEmptyTest.xml.bz2
│ │ ├── XMLDumpParserTest.xml.bz2
│ │ ├── XMLDumpParserErrorHeaderTest.xml.bz2
│ │ ├── makeLarge.pl
│ │ ├── enwiktionary-20150224-pages-articles-multistream.xml.bz2
│ │ ├── enwiktionary-20150224-pages-articles-multistream-index.txt.bz2
│ │ ├── articles-de
│ │ │ ├── Abschlusz.txt
│ │ │ ├── Eingaben.txt
│ │ │ ├── Fote.txt
│ │ │ ├── Subdivisio.txt
│ │ │ ├── robber_baron.txt
│ │ │ ├── boulder.txt
│ │ │ ├── Flipchart.txt
│ │ │ ├── Angestellte.txt
│ │ │ ├── Hallo.txt
│ │ │ ├── Tetragraph.txt
│ │ │ ├── Kunsttherapie.txt
│ │ │ ├── mitreissen.txt
│ │ │ ├── Brathaehnchen.txt
│ │ │ ├── Mockumentary.txt
│ │ │ ├── Thulium.txt
│ │ │ ├── Generaladmiral.txt
│ │ │ ├── harness.txt
│ │ │ ├── Verbalsubstantiv.txt
│ │ │ ├── Nutella.txt
│ │ │ └── pittoresk.txt
│ │ ├── articles-ru
│ │ │ ├── lechu.txt
│ │ │ └── lodka.txt
│ │ ├── XMLDumpParserErrorXMLTest.xml
│ │ ├── XMLDumpParserTest.xml
│ │ ├── articles-en
│ │ │ ├── callously.txt
│ │ │ ├── dreier.txt
│ │ │ ├── seawater.txt
│ │ │ ├── granada.txt
│ │ │ ├── bamba.txt
│ │ │ ├── mangueira.txt
│ │ │ ├── varanda.txt
│ │ │ ├── goitrogenic.txt
│ │ │ ├── abele.txt
│ │ │ ├── aborted.txt
│ │ │ ├── for_good_measure.txt
│ │ │ ├── escritorio.txt
│ │ │ ├── sumo.txt
│ │ │ ├── batsman.txt
│ │ │ ├── garçon.txt
│ │ │ ├── cheio.txt
│ │ │ ├── it_s.txt
│ │ │ ├── as_much_as_possible.txt
│ │ │ └── gumbo.txt
│ │ ├── WiktionaryTestData_info.txt
│ │ ├── WiktionaryDumpParserNullTest.xml
│ │ └── WiktionaryDumpParserTest.xml
│ └── java
│ │ └── de
│ │ └── tudarmstadt
│ │ └── ukp
│ │ └── jwktl
│ │ ├── IntegrationTest.java
│ │ ├── parser
│ │ ├── en
│ │ │ └── components
│ │ │ │ └── ENWordFormHandlerTest.java
│ │ ├── de
│ │ │ ├── DEEntryLinkHandlerTest.java
│ │ │ ├── DEWiktionaryEntryParserTest.java
│ │ │ ├── DESenseExampleHandlerTest.java
│ │ │ └── components
│ │ │ │ └── nountable
│ │ │ │ ├── DEWordFormNounTableHandlerTest.java
│ │ │ │ └── DativeHandlerTest.java
│ │ ├── ChainedCBZip2InputStreamTest.java
│ │ └── util
│ │ │ └── PatternUtilsTest.java
│ │ └── WiktionaryTestCase.java
└── main
│ ├── filter
│ ├── META-INF
│ │ └── jwktl-version.properties
│ └── jwktl-version-filter.properties
│ ├── java
│ └── de
│ │ └── tudarmstadt
│ │ └── ukp
│ │ └── jwktl
│ │ ├── parser
│ │ ├── ru
│ │ │ └── wikokit
│ │ │ │ └── base
│ │ │ │ ├── wikipedia
│ │ │ │ ├── util
│ │ │ │ │ └── GraphMLFile.java
│ │ │ │ └── language
│ │ │ │ │ └── LanguageTypeLocal.java
│ │ │ │ └── wikt
│ │ │ │ ├── multi
│ │ │ │ ├── en
│ │ │ │ │ ├── WQuoteEn.java
│ │ │ │ │ ├── LabelEn.java
│ │ │ │ │ └── WRedirectEn.java
│ │ │ │ └── ru
│ │ │ │ │ ├── WRedirectRu.java
│ │ │ │ │ └── quote
│ │ │ │ │ └── TitleAndWikilink.java
│ │ │ │ ├── word
│ │ │ │ └── WSynonyms.java
│ │ │ │ ├── util
│ │ │ │ ├── LangText.java
│ │ │ │ └── POSText.java
│ │ │ │ └── constant
│ │ │ │ └── ContextLabel.java
│ │ ├── de
│ │ │ └── components
│ │ │ │ ├── nountable
│ │ │ │ ├── DativeHandler.java
│ │ │ │ ├── GenitiveHandler.java
│ │ │ │ ├── AccusativeHandler.java
│ │ │ │ ├── NominativeHandler.java
│ │ │ │ ├── CaseHandler.java
│ │ │ │ ├── MehrzahlHandler.java
│ │ │ │ ├── PatternBasedParameterHandler.java
│ │ │ │ ├── EinzahlHandler.java
│ │ │ │ └── PatternBasedIndexedParameterHandler.java
│ │ │ │ ├── DEBlockHandler.java
│ │ │ │ ├── DECollocationsHandler.java
│ │ │ │ ├── DEGenderText.java
│ │ │ │ ├── DEEtymologyHandler.java
│ │ │ │ └── DEWordLanguageHandler.java
│ │ ├── en
│ │ │ └── components
│ │ │ │ ├── ENBlockHandler.java
│ │ │ │ ├── IHeadwordLineHandler.java
│ │ │ │ ├── IWordFormHandler.java
│ │ │ │ ├── ENUsageNotesHandler.java
│ │ │ │ ├── ENDescendantRelationHandler.java
│ │ │ │ ├── ENEtymologyHandler.java
│ │ │ │ └── ENWordLanguageHandler.java
│ │ ├── IWiktionaryEntryParser.java
│ │ ├── IWiktionaryMultistreamDumpParser.java
│ │ ├── MultistreamFilter.java
│ │ ├── IWiktionaryDumpParser.java
│ │ ├── util
│ │ │ ├── IBlockHandler.java
│ │ │ └── PatternUtils.java
│ │ └── components
│ │ │ ├── BlockHandler.java
│ │ │ └── InterwikiLinkHandler.java
│ │ ├── api
│ │ ├── IWiktionaryExample.java
│ │ ├── IQuotation.java
│ │ ├── util
│ │ │ ├── GrammaticalPerson.java
│ │ │ ├── GrammaticalGender.java
│ │ │ ├── GrammaticalDegree.java
│ │ │ ├── GrammaticalNumber.java
│ │ │ ├── GrammaticalTense.java
│ │ │ ├── NonFiniteForm.java
│ │ │ ├── GrammaticalMood.java
│ │ │ ├── GrammaticalCase.java
│ │ │ ├── GrammaticalAspect.java
│ │ │ └── ILanguage.java
│ │ ├── filter
│ │ │ ├── IWiktionaryPageFilter.java
│ │ │ ├── IWiktionaryEntryFilter.java
│ │ │ ├── IWiktionarySenseFilter.java
│ │ │ └── WiktionarySenseFilter.java
│ │ ├── WiktionaryException.java
│ │ ├── entry
│ │ │ ├── WiktionaryExample.java
│ │ │ ├── Pronunciation.java
│ │ │ └── WiktionaryRelation.java
│ │ ├── IWikiString.java
│ │ └── IPronunciation.java
│ │ └── examples
│ │ └── Example5_MultipleLanguages.java
│ └── resources
│ └── assemblies
│ └── dist.xml
├── .codecov.yml
├── CONTRIBUTING.md
├── .gitignore
├── .github
└── workflows
│ └── build.yml
└── CONTRIBUTORS.txt
/src/test/resources/XMLDumpParserErrorEmptyTest.xml.bz2:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.codecov.yml:
--------------------------------------------------------------------------------
1 | ignore:
2 | - "src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/**/*"
3 |
--------------------------------------------------------------------------------
/src/test/resources/XMLDumpParserTest.xml.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkpro/dkpro-jwktl/HEAD/src/test/resources/XMLDumpParserTest.xml.bz2
--------------------------------------------------------------------------------
/src/main/filter/META-INF/jwktl-version.properties:
--------------------------------------------------------------------------------
1 | jwktl.version=${jwktl.version}
2 | build.number=${build.number}
3 | svn.revision=${svn.revision}
4 |
--------------------------------------------------------------------------------
/src/main/filter/jwktl-version-filter.properties:
--------------------------------------------------------------------------------
1 | jwktl.version=${pom.version}
2 | build.number=${BUILD_NUMBER}
3 | svn.revision=${SVN_REVISION}
4 |
--------------------------------------------------------------------------------
/src/test/resources/XMLDumpParserErrorHeaderTest.xml.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkpro/dkpro-jwktl/HEAD/src/test/resources/XMLDumpParserErrorHeaderTest.xml.bz2
--------------------------------------------------------------------------------
/src/test/resources/makeLarge.pl:
--------------------------------------------------------------------------------
1 | for ($i = 1; $i <= 30000; $i++) { # one printed line per index, 1 through 30000
2 | print " $iPage_$i\n"; # NOTE(review): Perl reads "$iPage_" here as one variable named $iPage_, not $i followed by "Page_"; text between them may be missing — confirm against the original script
3 | }
--------------------------------------------------------------------------------
/src/test/resources/enwiktionary-20150224-pages-articles-multistream.xml.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkpro/dkpro-jwktl/HEAD/src/test/resources/enwiktionary-20150224-pages-articles-multistream.xml.bz2
--------------------------------------------------------------------------------
/src/test/resources/enwiktionary-20150224-pages-articles-multistream-index.txt.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkpro/dkpro-jwktl/HEAD/src/test/resources/enwiktionary-20150224-pages-articles-multistream-index.txt.bz2
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to JWKTL
2 |
3 | Thank you very much for your willingness to participate in this project.
4 |
5 | Please read the DKPro contribution guidelines at https://dkpro.github.io/contributing/
6 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Abschlusz.txt:
--------------------------------------------------------------------------------
1 | == Abschluß ({{Sprache|Deutsch}}) ==
2 | {{Alte Schreibweise|Abschluss|Reform 1996}}
3 |
4 | [[hu:Abschluß]]
5 | [[is:Abschluß]]
6 | [[ru:Abschluß]]
7 | [[zh:Abschluß]]
8 |
--------------------------------------------------------------------------------
/src/test/resources/articles-ru/lechu.txt:
--------------------------------------------------------------------------------
1 | = {{-ru-|nocat}} =
2 | == лечу I ==
3 | '''ле-чу́''' //
4 | *{{Форма-гл|лететь|наст||1|ед|}}
5 | == лечу II ==
6 | '''ле-чу́''' //
7 | *{{Форма-гл|лечить|наст||1|ед|}}{{длина слова|4}}
8 |
9 | [[fi:лечу]]
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Eclipse ###
2 | /target/
3 | .classpath
4 | .project
5 | .settings/org.eclipse.core.resources.prefs
6 | .settings/org.eclipse.jdt.core.prefs
7 | .settings/org.eclipse.m2e.core.prefs
8 |
9 | ### IntelliJ IDEA ###
10 | *.iml
11 | .idea/
--------------------------------------------------------------------------------
/src/test/resources/XMLDumpParserErrorXMLTest.xml:
--------------------------------------------------------------------------------
1 |
2 |
6 |
7 | Some text content
8 |
9 | Some text content
10 |
11 |
12 |
--------------------------------------------------------------------------------
/src/test/resources/XMLDumpParserTest.xml:
--------------------------------------------------------------------------------
1 |
2 |
6 |
7 | Some text content
8 |
9 |
10 | Some text content
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/callously.txt:
--------------------------------------------------------------------------------
1 | ==English==
2 |
3 | ===Adverb===
4 | {{en-adv}}
5 |
6 | # In a [[callous]] manner; done without regard to others' [[sensitivities]].
7 |
8 | ====Synonyms====
9 | *(''in a callous manner''): [[carelessly]], [[hardheartedly]], [[indifferently]], [[unfeelingly]]
10 |
11 | [[et:callously]]
12 | [[es:callously]]
13 | [[fr:callously]]
14 | [[ru:callously]]
15 | [[vi:callously]]
16 | [[zh:callously]]
17 |
--------------------------------------------------------------------------------
/src/test/resources/WiktionaryTestData_info.txt:
--------------------------------------------------------------------------------
1 | de dump:
2 |
3 | 3 Parameter (GERMAN, NOUN)
4 | 2 Mönch (GERMAN, NOUN)
5 | 0 França (CATALAN, NOUN)
6 | 1 França (OCCITAN, NOUN)
7 | 4 Platz (GERMAN, NOUN)
8 |
9 |
10 |
11 | en dump:
12 |
13 | 0 parameter (ENGLISH, NOUN)
14 | 1 place (ENGLISH, NOUN)
15 | 2 place (ENGLISH, VERB)
16 | 3 place (FRENCH, NOUN)
17 | 4 place (FRENCH, VERB)
18 | 5 place (POLISH, NOUN)
19 | 6 place (ROMANIAN, VERB)
20 | 7 place (SPANISH, VERB)
21 |
22 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/dreier.txt:
--------------------------------------------------------------------------------
1 | ==German==
2 |
3 | ===Pronunciation===
4 | * {{IPA|/ˈdʁaɪ̯ɐ/|lang=de}}
5 | * {{hyphenation|drei|er|lang=de}}
6 |
7 | ===Numeral===
8 | {{head|de|numeral}}
9 |
10 | # {{genitive plural of|drei||three|lang=de}}
11 |
12 | ====Usage notes====
13 | Only in adjectival use and only when no article or pronoun is preceding. More at {{term|drei|lang=de}}.
14 |
15 | ----
16 | ==Norwegian Bokmål==
17 |
18 | ===Verb===
19 | {{head|nb|verb form}}
20 |
21 | # {{present tense of|dreie|lang=nb}}
22 |
23 | [[de:dreier]]
24 | [[pl:dreier]]
25 | [[fi:dreier]]
26 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/seawater.txt:
--------------------------------------------------------------------------------
1 | ==English==
2 | ===Etymology===
3 | [[sea]] + [[water]]
4 | ===Noun===
5 | {{en-noun|-}}
6 |
7 | #The saltwater of a [[sea]] or [[ocean]].
8 |
9 | ====Translations====
10 | {{top}}
11 | * Finnish: {{t-|fi|merivesi}}
12 | * French: [[eau de mer]] {{f}}
13 | * German: {{t+|de|Meerwasser|n}}, {{t+|de|Salzwasser|n}}
14 | {{mid}}
15 | * Japanese: [[海水]] ([[かいすい]], kaisui)
16 | * Spanish: {{t-|es|agua salada|f}}
17 | {{bottom}}
18 |
19 | [[et:seawater]]
20 | [[fr:seawater]]
21 | [[ko:seawater]]
22 | [[lo:seawater]]
23 | [[hu:seawater]]
24 | [[ja:seawater]]
25 |
--------------------------------------------------------------------------------
/src/test/resources/WiktionaryDumpParserNullTest.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 0
5 |
6 | 0
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | 2004-09-17
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/granada.txt:
--------------------------------------------------------------------------------
1 | ==Portuguese==
2 |
3 | ===Etymology===
4 | From {{etyl|la|pt}} {{m|la|grānātum||pomegranate}}, from {{m|la|grānātus||having many seeds}}, from {{m|la|grānum||seed, grain}}, from {{etyl|ine-pro|pt}} {{m|ine-pro|*ǵr̥h₂nóm||grain}}.
5 |
6 | ===Noun===
7 | {{pt-noun|f|s}}
8 |
9 | # [[pomegranate]] (fruit)
10 | # [[hand grenade]] (small explosive device)
11 | # [[shell]] (artillery)
12 | # [[garnet]] (mineral group)
13 |
14 | ====Synonyms====
15 | * (pomegranate) [[romã]]
16 | * (hand grenade) [[granada de mão]]
17 | * (garnet) [[granate]]
18 |
19 | [[Category:pt:Explosives]]
20 | [[Category:pt:Fruits]]
21 | [[Category:pt:Mineralogy]]
22 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Eingaben.txt:
--------------------------------------------------------------------------------
1 | == Eingaben ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Deklinierte Form|Deutsch}} ===
3 |
4 | {{Worttrennung}}
5 | :Ein·ga·ben
6 |
7 | {{Aussprache}}
8 | :{{IPA}} {{Lautschrift|ˈaɪ̯nˌɡaːbən}}, {{Lautschrift|ˈaɪ̯nˌɡaːbn̩}}
9 | :{{Hörbeispiele}} {{Audio|}}
10 |
11 | {{Grammatische Merkmale}}
12 | *Nominativ Plural des Substantivs '''[[Eingabe]]'''
13 | *Genitiv Plural des Substantivs '''[[Eingabe]]'''
14 | *Dativ Plural des Substantivs '''[[Eingabe]]'''
15 | *Akkusativ Plural des Substantivs '''[[Eingabe]]'''
16 |
17 | {{Grundformverweis Dekl|Eingabe}}
18 |
19 | [[en:Eingaben]]
20 | [[fi:Eingaben]]
21 | [[ku:Eingaben]]
22 | [[mg:Eingaben]]
23 | [[sv:Eingaben]]
24 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/bamba.txt:
--------------------------------------------------------------------------------
1 | ==Spanish==
2 |
3 | ===Etymology 1===
4 | onomatopoeia
5 |
6 | ====Noun====
7 | {{es-noun|f}}
8 |
9 | # [[hit]], [[strike]] {{gloss|in a game}}
10 | # a Latin American dance
11 |
12 | =====Synonyms=====
13 | * (''hit''): [[acierto]]
14 |
15 | ===Etymology 2===
16 | From a trademark
17 |
18 | ====Noun====
19 | {{es-noun|f}}
20 |
21 | # [[flip-flop]], [[thong]], [[jandal]]
22 | # [[sneaker]]
23 |
24 | =====See also=====
25 | * [[chancla]]
26 | * [[playera]]
27 | * [[sandalia]]
28 |
29 | [[fr:bamba]]
30 | [[ko:bamba]]
31 | [[id:bamba]]
32 | [[jv:bamba]]
33 | [[kn:bamba]]
34 | [[hu:bamba]]
35 | [[mg:bamba]]
36 | [[pt:bamba]]
37 | [[fi:bamba]]
38 | [[sv:bamba]]
39 | [[te:bamba]]
40 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/mangueira.txt:
--------------------------------------------------------------------------------
1 | ==Galician==
2 |
3 | ===Noun===
4 | {{gl-noun|f}}
5 |
6 | # [[mango]] [[tree]]
7 |
8 | ====Related terms====
9 | * [[manga]]
10 |
11 | [[Category:gl:Trees]]
12 |
13 | ----
14 |
15 | ==Portuguese==
16 | ===Etymology 1===
17 | ====Noun====
18 | '''mangueira''' {{g|f}}
19 | # [[hose]].
20 |
21 | ===Etymology 2===
22 | ====Noun====
23 | '''mangueira''' {{g|f}}
24 | # [[mango]] (tree).
25 |
26 | =====Related terms=====
27 | {{top2}}
28 | *[[manga]]
29 | {{mid2}}
30 | {{bottom}}
31 |
32 | [[Category:Portuguese nouns]]
33 |
34 | [[io:mangueira]]
35 | [[mg:mangueira]]
36 | [[fj:mangueira]]
37 | [[pt:mangueira]]
38 | [[ru:mangueira]]
39 | [[tl:mangueira]]
40 | [[chr:mangueira]]
41 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/varanda.txt:
--------------------------------------------------------------------------------
1 | ==Portuguese==
2 | {{wikipedia|lang=pt}}
3 | [[Image:2012 Portugal 7844271368.jpg|thumb|250px|varanda]]
4 |
5 | ===Etymology===
6 | {{unk.|lang=pt|title=Uncertain}}, but possibly related to Spanish {{term|baranda|lang=es}}.
7 |
8 | ===Pronunciation===
9 | * {{a|PT}} {{IPA|/vɐ.ˈɾɐ̃.dɐ/|lang=pt}}
10 | * {{hyphenation|va|ran|da|lang=pt}}
11 |
12 | ===Noun===
13 | {{pt-noun|f}}
14 |
15 | # [[balcony]], [[veranda]], [[terrace]]
16 | # [[porch]]
17 |
18 | ====Descendants====
19 | * Hindi: {{l|hi|बरामदा|tr=barāmdā|sc=Deva}}, {{l|hi|बरण्डा|tr=baraṇḍā|sc=Deva}}
20 | * English: {{l|en|veranda}}, {{l|en|verandah}}
21 | * French: {{l|fr|véranda}}
22 |
23 | [[eo:varanda]]
24 | [[fr:varanda]]
25 | [[mg:varanda]]
26 | [[pt:varanda]]
27 | [[fi:varanda]]
28 | [[zh:varanda]]
29 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build and test
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ubuntu-latest
9 | permissions:
10 | contents: read
11 | packages: write
12 |
13 | steps:
14 | - uses: actions/checkout@v4 # v4 runs on a supported Node runtime (v2 targeted the retired Node 12)
15 | - name: Set up JDK 11
16 | uses: actions/setup-java@v4
17 | with:
18 | java-version: '11'
19 | distribution: 'temurin'
20 | # server-id: github # Value of the distributionManagement/repository/id field of the pom.xml
21 | # settings-path: ${{ github.workspace }} # location for the settings.xml file
22 |
23 | - name: Build with Maven
24 | run: mvn test
25 |
26 | # - name: Publish to GitHub Packages Apache Maven
27 | # run: mvn deploy -s $GITHUB_WORKSPACE/settings.xml
28 | # env:
29 | # GITHUB_TOKEN: ${{ github.token }}
30 |
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/IntegrationTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl;
19 |
20 | public interface IntegrationTest { // declares no members; presumably a marker type for tagging integration tests — TODO confirm how it is used
21 | }
22 |
--------------------------------------------------------------------------------
/CONTRIBUTORS.txt:
--------------------------------------------------------------------------------
1 | # This is the list of people who have contributed code to the DKPro JWKTL repository.
2 | #
3 | # This list is to be used in preference to author attributions in individual files,
4 | # e.g. via @author tags.
5 | #
6 | # Code integrated from third parties by anyone other than its original authors must not have any
7 | # @author tags removed, and the respective names must not be added to this file. Integration of
8 | # third-party code should be avoided.
9 | #
10 | # After the name, one or more mail addresses may be specified in pointy brackets and one or more
11 | # GitHub IDs may be specified in square brackets.
12 | #
13 | # See also https://github.com/dkpro/dkpro-core/blob/master/CONTRIBUTING.md
14 |
15 | # Please keep the list sorted.
16 |
17 | Alexey Valikov [highsource]
18 | Christian M. Meyer [chmeyer]
19 | Christof Müller
20 | Ilya [intracer]
21 | Iryna Gurevych
22 | Jan Berkel [jberkel]
23 | Lizhen Qu
24 | Rafael Hoff [rafaelhoff]
25 | Torsten Zesch
26 | Tristan Miller [logological]
27 | Václav Slavík [vslavik]
28 | Yang Yang [geraint0923]
29 | Yevgen Chebotar
30 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/goitrogenic.txt:
--------------------------------------------------------------------------------
1 | ==English==
2 |
3 | ===Adjective===
4 | {{en-adj}}
5 |
6 | # Of or pertaining to a [[goitrogen]]
7 | #* '''1968''' July, G. A. Bray, “Increased sensitivity of the thyroid in iodine-depleted rats to the '''goitrogenic''' effects of thyrotropin,” in ''The Clinical Journal of Investigation'' 47(7): 1640–1647,
8 | #*: The present studies demonstrate that iodine depletion increases the sensitivity of the thyroid to the '''goitrogenic''' effects of thyrotropin.
9 | #*'''1948''' J. Seifter and W. E. Ehrich, “Goitrogenic Compounds: Pharmacological and Pathological Effects,” ''Journal of Pharmacology and Experimental Therapeutics'' 92(3): 303-314
10 | #*: Seventy-eight compounds were screened for '''goitrogenic''' action. Of these, 12 were found to be effective, but only thiouracil, propylthiouracil, 2-amino-thiazole and Dithane were found to be markedly active.
11 |
12 | ====Synonyms====
13 | * {{italbrac|of or pertaining to that which reduces the production or effects of thyroid hormones}}: [[antithyroid]]
14 |
15 | ====Usage notes====
16 | See usage note at [[antithyroid]].
17 |
18 | [[zh:goitrogenic]]
19 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/abele.txt:
--------------------------------------------------------------------------------
1 | {{wikipedia}}
2 | {{also|Abele|ābele}}
3 | ==English==
4 |
5 | ===Etymology===
6 | Dutch ''[[abeel]]'' (''abeel-boom''), Old French ''[[abel]]'', ''[[aubel]]'', from a diminutive of Latin ''[[albus]]'', white
7 | [[Image:Czajecice tree 20060812 1401.jpg|thumb|Abele; White poplar]]
8 |
9 | ===Noun===
10 | {{en-noun}}
11 |
12 | # The [[white poplar]] (''[[Populus]] [[alba]]'').
13 | #* Six '''abeles''' i' the churchyard grow - [[Mrs. Browning]]
14 |
15 | ====See also====
16 | * {{pedialite|Populus alba}}
17 | * {{commonslite|Populus alba}}
18 | * {{specieslite|Populus alba}}
19 |
20 | ====Translations====
21 | {{trans-top|the white poplar, Populus alba}}
22 | * Bulgarian: {{t|bg|бяла топола }}
23 | * Dutch: {{t+|nl|abeel|m}}
24 | {{trans-mid}}
25 | * Estonian: [[hõbepappel]]
26 | {{trans-bottom}}
27 |
28 | [[Category:Trees]]
29 |
30 | ----
31 |
32 | ==[[Novial]]==
33 |
34 | ===Noun===
35 | '''abele'''
36 |
37 | # [[bee]]
38 |
39 | [[lt:abele]]
40 | [[hu:abele]]
41 | [[ro:abele]]
42 | [[ru:abele]]
43 | [[fi:abele]]
44 | [[uk:abele]]
45 | [[vi:abele]]
46 | [[wa:abele]]
47 | [[zh:abele]]
48 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikipedia/util/GraphMLFile.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikipedia.util;
17 |
18 | /** GraphML loader/writer. Currently an empty placeholder: the class declares no fields or methods.
19 | */
20 | public class GraphMLFile {
21 |
22 | // The explicit no-argument constructor below is commented out, so only the compiler-supplied default constructor exists.
23 | // public GraphMLFile() {
24 | // }
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENWordFormHandlerTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.en.components;
19 |
20 | public class ENWordFormHandlerTest extends WordFormHandlerTest { // presumably runs the tests inherited from WordFormHandlerTest against ENWordFormHandler — confirm
21 | @Override
22 | public void setUp() throws Exception { // overrides the superclass setUp
23 | super.setUp(); // run the superclass setup first
24 | handler = new ENWordFormHandler("lemma"); // then point handler at an ENWordFormHandler constructed with the argument "lemma"
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/aborted.txt:
--------------------------------------------------------------------------------
1 | ==English==
2 |
3 | ===Etymology===
4 | Derivative of [[abort]]
5 |
6 | ===Adjective===
7 | {{en-adj|-}}
8 |
9 | # Brought forth [[prematurely]].
10 | # {{biology}} Rendered [[abortive]] or [[sterile]]; [[undeveloped]]; checked in normal development at a very early stage; as, spines are ''aborted'' branches.
11 | #:''"The eyes of the cirripeds are more or less '''aborted''' in their mature state."'' -[[w:Richard Owen|Richard Owen]].
12 |
13 | ====Translations====
14 | * [[Catalan]]: [[avortat]]
15 | * French: {{t+|fr|avorté}}
16 | * [[Interlingua]]: [[abortate]]
17 | * Italian: {{t-|it|terminato}}
18 | * Portuguese: {{t-|pt|abortado}}
19 | * Spanish: {{t-|es|abortado}}
20 | * Swedish: {{t|sv|aborterad}}
21 |
22 | ===Verb===
23 | '''aborted'''
24 |
25 | # {{past of|[[abort]]}}
26 |
27 | ===Anagrams===
28 | * {{alphagram|abdeort}}
29 | * [[borated#English|borated]]
30 |
31 |
32 | [[Category:English adjectives]]
33 |
34 | [[de:aborted]]
35 | [[fr:aborted]]
36 | [[it:aborted]]
37 | [[hu:aborted]]
38 | [[mg:aborted]]
39 | [[ml:aborted]]
40 | [[my:aborted]]
41 | [[pt:aborted]]
42 | [[ru:aborted]]
43 | [[fi:aborted]]
44 | [[ta:aborted]]
45 | [[vi:aborted]]
46 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/for_good_measure.txt:
--------------------------------------------------------------------------------
1 | ==English==
2 |
3 | ===Prepositional phrase===
4 | {{en-PP|[[for]] [[good]] [[measure]]}}
5 |
6 | # {{idiomatic}} In excess of the minimum required; Added as an [[extra]]
7 | #: ''He tossed in a couple of extra shirts '''for good measure''' and closed the suitcase.''
8 |
9 | ====Translations====
10 | {{trans-top|in excess of the required minimum}}
11 | * {{trreq|ar}}:
12 | * Chinese:
13 | :* Mandarin: {{zh-tsp||额外补充|}}, {{zh-tsp||保险起见|}}
14 | *: {{trreq|cmn}}:
15 | * {{trreq|cs}}:
16 | * {{trreq|nl}}:
17 | * Finnish: {{t-|fi|kaiken varalta}}, {{t+|fi|varmuuden vuoksi}}, {{t-|fi|varoiksi}}
18 | * French: [[pour]] {{t+|fr|faire bonne mesure}}
19 | {{trans-mid}}
20 | * German: {{t-|de|noch dazu}}
21 | * {{trreq|hi}}:
22 | * {{trreq|it}}:
23 | * {{trreq|ja}}:
24 | * {{trreq|ko}}:
25 | * {{trreq|pl}}:
26 | * Portuguese: {{t-|pt|por precaução}}
27 | * Russian: [[на всякий случай]] ''(for any case)''
28 | * Spanish: {{t-|es|por si acaso}}, {{t-|es|por precaución}}
29 | * {{trreq|vi}}:
30 | {{trans-bottom}}
31 | {{checktrans-top}}
32 | * {{ttbc|ro}}: [[pentru]] [[orice]] [[eventualitate]]
33 | {{trans-bottom}}
34 |
35 | [[et:for good measure]]
36 | [[my:for good measure]]
37 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Fote.txt:
--------------------------------------------------------------------------------
1 | === {{Wortart|Substantiv|Deutsch}}, {{f}} ===
2 |
3 | {{Deutsch Substantiv Dialekt
4 | |Singular=de Fote
5 | |Plural=de Foten
6 | }}
7 |
8 | {{Worttrennung}}
9 | :Fo·te, {{Pl.}} Fo·ten
10 |
11 | {{Aussprache}}
12 | :{{IPA}} {{Lautschrift|ˈfoːtə}}
13 | :{{Hörbeispiele}} {{Audio|}}
14 |
15 | {{Bedeutungen}}
16 | :[1] ''[[berlinisch]]:'' Pfote, Pote; Hand, Fuß
17 |
18 | {{Oberbegriffe}}
19 | :[1] [[Körperteil]]
20 |
21 | {{Beispiele}}
22 | :[1] Sach ma, biste noch janz reene oda wat!? Du kannst do nich eenfach da Katze uff de ''Foten'' tippeln.
23 | :[1] Quatsch nich! Nu mach ma hinne! Ick will da nich ständich uff de ''Foten'' kieken müssen.
24 | :[1] Ick gloob, ick hab ma meene ''Fote'' jeknackst.
25 |
26 | ==== {{Übersetzungen}} ====
27 | {{Ü-Tabelle|Ü-links=
28 | :*{{Übersetzungen umleiten|1|Pfote|1, 2}} {{f}}, {{Übersetzungen umleiten||Pote|}} {{f}}; {{Übersetzungen umleiten||Hand|1}} {{f}}, {{Übersetzungen umleiten||Fuß|1}} {{m}}
29 | |Ü-rechts=
30 | }}
31 |
32 | {{Referenzen}}
33 | :[1] Hans Meyer, Siegfried Mauermann, Walther Kiaulehn: ''Der richtige Berliner in Wörtern und Redensarten'', Neuausgabe der 10. Auflage, C. H. Beck, München 1985. Seite 100. ISBN 3-406-30611-X
34 |
35 | {{Ähnlichkeiten 1|[[Pfote]]}}
36 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/escritorio.txt:
--------------------------------------------------------------------------------
1 | ==Portuguese==
2 | {{wikipedia|lang=pt}}
3 |
4 | ===Alternative forms===
5 | * [[escriptório]] {{qualifier|obsolete}}
6 |
7 | ===Etymology===
8 | {{suffix|escritor|gloss1=writer|io|gloss2=ium|lang=pt}}.
9 |
10 | ===Pronunciation===
11 | * {{a|South Brazil}} {{IPA|/ˌes.kɾi.ˈtɔ.ɾi.o/|/ˌes.kɾi.ˈtɔ.ɾjo/|lang=pt}}
12 | * {{a|PT}} {{IPA|/ˌiʃ.kɾi.ˈtɔ.ɾju/|lang=pt}}
13 | * {{hyphenation|es|cri|tó|ri|o|lang=pt}}
14 |
15 | ===Noun===
16 | {{pt-noun|m|s}}
17 |
18 | # [[office]] (building or room)
19 |
20 | ====Synonyms====
21 | * [[gabinete]]
22 |
23 | ====Related terms====
24 | {{top4}}
25 | * [[alfabeto]]
26 | * [[escrevedor]]
27 | * [[escrever]]
28 | * [[escrevinhar]]
29 | * [[escriba]]
30 | {{mid4}}
31 | * [[escrita]]
32 | * [[escrito]]
33 | * [[escritor]]
34 | * [[escritura]]
35 | {{mid4}}
36 | * [[escrituração]]
37 | * [[escriturado]]
38 | * [[escriturar]]
39 | * [[escrituário]]
40 | {{mid4}}
41 | * [[escrivania]]
42 | * [[escrivaninha]]
43 | * [[escrivão]]
44 | * [[script]]
45 | {{bottom}}
46 |
47 | [[Category:pt:Business]]
48 |
49 | [[es:escritório]]
50 | [[fr:escritório]]
51 | [[lo:escritório]]
52 | [[hu:escritório]]
53 | [[mg:escritório]]
54 | [[fj:escritório]]
55 | [[pl:escritório]]
56 | [[pt:escritório]]
57 | [[sm:escritório]]
58 | [[chr:escritório]]
59 | [[zh:escritório]]
60 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/sumo.txt:
--------------------------------------------------------------------------------
1 | ==Portuguese==
2 |
3 | ===Pronunciation===
4 | * {{a|PT}} {{IPA|/ˈsu.mu/|lang=pt}}
5 |
6 | ===Etymology 1===
7 | From {{etyl|la|pt}} ''[[summus]]''.
8 |
9 | ====Adjective====
10 | {{pt-adj|sum|o}}
11 |
12 | # [[highest]], [[greatest]].
13 |
14 | ====Noun====
15 | {{pt-noun|m|s}}
16 |
17 | # [[summit]], [[top]].
18 |
19 | ===Etymology 2===
20 | [[Image:Orange juice 1 edit1.jpg|thumb|150px|sumo]]
21 | From {{etyl|roa-opt|pt}} {{term|çumo|lang=roa-opt}}, from {{etyl|ar|pt}} {{term|زُوم|lang=ar||juice, sap}}, from {{etyl|grc|pt}} {{term|ζωμός|lang=grc}}. Cognate of Galician {{term|zume|lang=gl}} and Spanish {{term|zumo|lang=es}}.
22 |
23 | ====Noun====
24 | {{pt-noun|m|s}}
25 |
26 | # {{context|Portugal|lang=pt}} [[juice]].
27 |
28 | =====Synonyms=====
29 | * [[suco]] {{qualifier|Brasil}}
30 |
31 | ===Etymology 3===
32 | [[Image:Bulgarian-sumists.jpg|thumb|150px|sumo]]
33 | From {{etyl|ja|pt}} {{term|相撲|tr=sumō|lang=ja}} ''to mutually rush at''.
34 |
35 | ====Alternative forms====
36 | * {{qualifier|Brazil}} [[sumô]]
37 |
38 | ====Noun====
39 | {{pt-noun|m|-}}
40 |
41 | # {{context|martial arts|Portugal|lang=pt}} {{l|en|sumo}}.
42 |
43 | ===Etymology 4===
44 |
45 | ====Verb====
46 | {{pt-verb-form}}
47 |
48 | # {{inflection of|sumir||1|s|pres|indc|lang=pt}}
49 |
50 | ----
51 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Subdivisio.txt:
--------------------------------------------------------------------------------
1 | == Subdivisio ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Substantiv|Deutsch}}, {{f}} ===
3 |
4 | {{Substantiv-Tabelle|
5 | Wer oder was? (Einzahl)=die Subdivisio
6 | |Wer oder was? (Mehrzahl)=die Subdivisiones
7 | |Wessen? (Einzahl)=der Subdivisio
8 | |Wessen? (Mehrzahl)=der Subdivisiones
9 | |Wem? (Einzahl)=der Subdivisio
10 | |Wem? (Mehrzahl)=den Subdivisiones
11 | |Wen? (Einzahl)=die Subdivisio
12 | |Wen? (Mehrzahl)=die Subdivisiones
13 | }}
14 |
15 | {{Silbentrennung}}
16 | : Sub·di·vi·si·o, {{Pl.}} Sub·di·vi·si·o·nes
17 |
18 | {{Aussprache}}
19 | :{{Hörbeispiele}} {{fehlend}}, {{Pl.}} {{fehlend}}
20 | :{{IPA}} {{Lautschrift|...}}, {{Pl.}} {{Lautschrift|...}}
21 |
22 | {{Bedeutungen}}
23 | :[1] [[Biologie]]: Die hierarchische Gliederungsstufe der [[Divisio]] (dt.: [[Abteilung]]) im Reich der Pflanzen und der Pilze kann weiter in '''Subdivisiones''' (dt.: [[Untterabteilung|Unterabteilungen]]) differenziert werden.
24 |
25 | {{Synonyme}}
26 | :[1] Unterabteilung (in der Biologie)
27 |
28 | {{Beispiele}}
29 | :[1]
30 |
31 | ==== Übersetzungen ====
32 | {{Ü-links}}
33 | *{{fr}}: [1] [[sub-division]]
34 | {{Ü-Abstand}}
35 | {{Ü-rechts}}
36 |
37 | {{Referenzen}}
38 | :[1] {{Wikipedia|Abteilung (Biologie)}}
39 |
40 | [[fr:Subdivisio]]
41 | [[ru:Subdivisio]]
42 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/IWiktionaryExample.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api;
19 | /** An example that offers its marked-up text, an {@link IWikiString} form and a possibly-null translation. */
20 | public interface IWiktionaryExample {
21 | /**
22 | * @return the text of this example, including wiki markup
23 | */
24 | String getText();
25 |
26 | /**
27 | * @return the text of this example as an {@link IWikiString}
28 | */
29 | IWikiString getExample();
30 |
31 | /**
32 | * @return the translation of this example as an {@link IWikiString}, or {@code null}
33 | */
34 | IWikiString getTranslation();
35 | }
36 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/robber_baron.txt:
--------------------------------------------------------------------------------
1 | == robber baron ({{Sprache|Englisch}}) ==
2 | === {{Wortart|Substantiv|Englisch}} ===
3 |
4 | {{Englisch Substantiv Übersicht
5 | |Singular=the robber baron
6 | |Plural=the robber barons
7 | }}
8 |
9 | {{Worttrennung}}
10 | : rob·ber bar·on, {{Pl.}} rob·ber bar·ons
11 |
12 | {{Aussprache}}
13 | :{{IPA}} {{Lautschrift|…}}, {{Pl.}} {{Lautschrift|…}}
14 | :{{Hörbeispiele}} {{fehlend}}, {{Pl.}} {{fehlend}}
15 |
16 | {{Bedeutungen}}
17 | :[1] Angehörige eines ritterlichen Standes im Spätmittelalter, die ihre schlechte finanzielle Lage durch Straßenraub, [[Fehde]]n und Plünderungszüge verbessern wollten
18 | :[2] ein [[skrupellos]]er [[Kapitalist]], [[Industrielle]]r oder [[Geschäftsmann]] des späten 19. Jahrhunderts
19 |
20 | {{Beispiele}}
21 | :[1]
22 |
23 | ==== Übersetzungen ====
24 | {{Ü-links}}
25 | *{{de}}: [1] [[Raubritter]]; [2] [[skrupellos]]er [[Kapitalist]]; ([[Räuber-Baron]])
26 | {{Ü-Abstand}}
27 | *{{fr}}: [1] {{Ü|fr|}}
28 | {{Ü-rechts}}
29 |
30 | {{Referenzen}}
31 | :[1] {{Wikipedia|spr=en|robber baron}}
32 | :[1] {{Ref-Leo|en|robber+baron}}
33 | :[1] {{Ref-Pons|en|robber+baron}}
34 | :[1] {{Ref-MWD|robber+baron}}
35 | :[1, 2] {{Ref-Dictionary|robber+baron}}
36 | :[1] {{Ref-dictcc|en|robber+baron}}
37 |
38 | {{Referenzen prüfen|Englisch}}
39 |
40 | [[en:robber baron]]
41 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/batsman.txt:
--------------------------------------------------------------------------------
1 | {{wikipedia}}
2 |
3 | ==English==
4 |
5 | ===Synonyms===
6 | * [[batter]]
7 |
8 | ===Noun===
9 | {{en-noun|pl=batsmen}}
10 | # {{cricket}} A [[player]] of the [[batting]] [[side]] now on the [[field]]
11 | # {{cricket}} The [[player]] now [[receiving]] [[strike]]; the [[striker]]
12 | #: 2001: ''The batsman, Kathryn Leng, (who has played for quite a few years for England) asked the umpire dumbfounded if Charlie was going to bowl with a helmet on.'' — [[w:Julia Price|Julia Price]] (Australian cricketer), her women's Ashes diary entry for 19 June 2001 [http://www.southernstars.org.au/ukdiary2001.htm]
13 | # {{cricket}} Any player selected for his or her [[team]] principally to [[bat]], as opposed to a [[bowler]]
14 |
15 | ====Usage notes====
16 | The term batsman is applied to both male and female cricketers; [[batswoman]] is much rarer.
17 |
18 | ====Derived terms====
19 | * [[batsmanship]]
20 |
21 | ====Related terms====
22 | * [[bat]]
23 |
24 | ===Anagrams===
25 | * [[bantams#English|bantams]], [[batmans#English|batmans]]
26 |
27 | [[et:batsman]]
28 | [[fr:batsman]]
29 | [[ko:batsman]]
30 | [[io:batsman]]
31 | [[kn:batsman]]
32 | [[hu:batsman]]
33 | [[ml:batsman]]
34 | [[my:batsman]]
35 | [[pl:batsman]]
36 | [[fi:batsman]]
37 | [[ta:batsman]]
38 | [[vi:batsman]]
39 | [[zh:batsman]]
40 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/IQuotation.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api;
19 |
20 | import java.util.List;
21 |
22 | /**
23 | * Represents a quotation, made up of a source and the lines of quoted text.
24 | * @author Christian M. Meyer
25 | * @author Christof Müller
26 | * @author Lizhen Qu
27 | */
28 | public interface IQuotation {
29 |
30 | /** Returns the source of the quotation as an {@link IWikiString}. */
31 | IWikiString getSource();
32 |
33 | /** Returns the lines of the quotation text as a typed list of {@link IWikiString}s. */
34 | List<IWikiString> getLines();
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikipedia/language/LanguageTypeLocal.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikipedia.language;
17 |
18 |
19 | /** Pairs the name of a language, written in some local language (e.g. Russian),
20 | * with the {@link LanguageType} code that this name stands for.
21 | */
22 | public abstract class LanguageTypeLocal {
23 |
24 | /** Language name, e.g. "Russian"; declared here without an initializer. */
25 | protected String name;
26 |
27 | /** {@link LanguageType} corresponding to {@link #name}, e.g. LanguageType.ru; declared here without an initializer. */
28 | protected LanguageType type;
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/DativeHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase;
21 | /** {@link CaseHandler} configured with {@link #DATIVE_PATTERN} and {@link GrammaticalCase#DATIVE}. */
22 | public class DativeHandler extends CaseHandler {
23 | /** Regular expression with two alternatives, each anchored with {@code ^}. */
24 | protected static final String DATIVE_PATTERN =
25 | // alternative 1: text beginning with "Dativ"
26 | "^Dativ|" +
27 | // alternative 2: text beginning with "Wem?" (the '?' is escaped so it matches literally)
28 | "^Wem\\?";
29 | /** Hands the dative pattern and {@link GrammaticalCase#DATIVE} to the {@link CaseHandler} constructor. */
30 | public DativeHandler() {
31 | super(DATIVE_PATTERN, GrammaticalCase.DATIVE);
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/GenitiveHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase;
21 | /** {@link CaseHandler} configured with {@link #GENITIVE_PATTERN} and {@link GrammaticalCase#GENITIVE}. */
22 | public class GenitiveHandler extends CaseHandler {
23 | /** Regular expression with two alternatives, each anchored with {@code ^}. */
24 | protected static final String GENITIVE_PATTERN =
25 | // alternative 1: text beginning with "Genitiv"
26 | "^Genitiv|" +
27 | // alternative 2: text beginning with "Wessen?" (the '?' is escaped so it matches literally)
28 | "^Wessen\\?";
29 | /** Hands the genitive pattern and {@link GrammaticalCase#GENITIVE} to the {@link CaseHandler} constructor. */
30 | public GenitiveHandler() {
31 | super(GENITIVE_PATTERN, GrammaticalCase.GENITIVE);
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/AccusativeHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase;
21 | /** {@link CaseHandler} configured with {@link #ACCUSATIVE_PATTERN} and {@link GrammaticalCase#ACCUSATIVE}. */
22 | public class AccusativeHandler extends CaseHandler {
23 | /** Regular expression with two alternatives, each anchored with {@code ^}. */
24 | protected static final String ACCUSATIVE_PATTERN =
25 | // alternative 1: text beginning with "Akkusativ"
26 | "^Akkusativ|" +
27 | // alternative 2: text beginning with "Wen?" (the '?' is escaped so it matches literally)
28 | "^Wen\\?";
29 | /** Hands the accusative pattern and {@link GrammaticalCase#ACCUSATIVE} to the {@link CaseHandler} constructor. */
30 | public AccusativeHandler() {
31 | super(ACCUSATIVE_PATTERN, GrammaticalCase.ACCUSATIVE);
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/multi/en/WQuoteEn.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.multi.en;
17 |
18 | /** Helper for quotations: a phrase or sentence that illustrates a meaning of a word.
19 | * NOTE(review): this comment used to say "in Russian Wiktionary", yet the package ends in ".multi.en" - confirm which is meant. */
20 | public class WQuoteEn {
21 |
22 |
23 | /** Removes every triple-apostrophe (''') highlight marker from a sentence, e.g.
24 | * "Sentence with '''words'''." becomes "Sentence with words."; a null {@code str} raises a NullPointerException.
25 | */
26 | public static String removeHighlightedMarksFromSentence(String str)
27 | {
28 | if(str.contains("'''"))
29 | return str.replace("'''", "");
30 | // no marker present: hand the input back unchanged
31 | return str;
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/DEBlockHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components;
19 |
20 | import de.tudarmstadt.ukp.jwktl.parser.components.BlockHandler;
21 |
22 | /**
23 | * Abstract base class for all parser components for the German Wiktionary; it only forwards to {@link BlockHandler}.
24 | * @author Christian M. Meyer
25 | */
26 | public abstract class DEBlockHandler extends BlockHandler {
27 |
28 | /** Initializes the block handler for parsing all sections starting with one of
29 | * the specified labels; the {@code labels} varargs are passed unchanged to the superclass constructor. */
30 | public DEBlockHandler(final String... labels) {
31 | super(labels);
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENBlockHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.en.components;
19 |
20 | import de.tudarmstadt.ukp.jwktl.parser.components.BlockHandler;
21 |
22 | /**
23 | * Abstract base class for all parser components for the English Wiktionary; it only forwards to {@link BlockHandler}.
24 | * @author Christian M. Meyer
25 | */
26 | public abstract class ENBlockHandler extends BlockHandler {
27 |
28 | /** Initializes the block handler for parsing all sections starting with one of
29 | * the specified labels; the {@code labels} varargs are passed unchanged to the superclass constructor. */
30 | public ENBlockHandler(final String... labels) {
31 | super(labels);
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalPerson.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
21 |
22 | /**
23 | * Enumeration of the grammatical person of a {@link IWiktionaryWordForm}.
24 | * @author Christian M. Meyer
25 | */
26 | public enum GrammaticalPerson {
27 |
28 | /** The first person; the speaker; referred to by "I", "we". */
29 | FIRST,
30 |
31 | /** The second person; the addressee; referred to by "you". */
32 | SECOND,
33 |
34 | /** The third person; someone or something other than speaker and addressee;
35 | * referred to by "he", "she", "it", "they". */
36 | THIRD;
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/NominativeHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase;
21 | /** {@link CaseHandler} configured with {@link #NOMINATIVE_PATTERN} and {@link GrammaticalCase#NOMINATIVE}. */
22 | public class NominativeHandler extends CaseHandler {
23 | /** Regular expression with two alternatives, each anchored with {@code ^}. */
24 | protected static final String NOMINATIVE_PATTERN =
25 | // alternative 1: text beginning with "Nominativ"
26 | "^Nominativ|" +
27 | // alternative 2: text beginning with "Wer oder was?", where \\s accepts any single
28 | // whitespace character between the words and the '?' is escaped to match literally
29 | "^Wer\\soder\\swas\\?";
30 | /** Hands the nominative pattern and {@link GrammaticalCase#NOMINATIVE} to the {@link CaseHandler} constructor. */
31 | public NominativeHandler() {
32 | super(NOMINATIVE_PATTERN, GrammaticalCase.NOMINATIVE);
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/garçon.txt:
--------------------------------------------------------------------------------
1 | ==French==
2 |
3 | ===Etymology===
4 | From {{etyl|frm|fr}}, from {{etyl|fro|fr}} {{m|fro|garçun||servant}}, oblique case of {{m|fro|gars}}, from {{etyl|frk|fr}} {{m|frk|*wrakjō||servant, boy}} from {{etyl|gem-pro|fr}} {{m|gem-pro|*wrakjô||exile, driven one}}, from {{etyl|ine-pro|fr}} {{m|ine-pro|*wreg-||to drive}}. Cognate with {{cog|goh|wrecheo}}, {{m|goh|recko||exile, warrior, hero}} (Modern {{cog|de|Recke}}), {{cog|osx|wrekkio||a banished person, exile, stranger}}, {{cog|ang|wreċċa||a wretch, stranger, exile}}, and perhaps to {{cog|non|rekkr||man, warrior, hero}}. More at {{l|en|wretch}}, {{l|en|wreak}}.
5 |
6 | ===Pronunciation===
7 | * {{audio|Fr-garçon.ogg|audio (un garçon)|lang=fr}}
8 | * {{IPA|/ɡaʁsɔ̃/|lang=fr}}
9 |
10 | ===Noun===
11 | {{fr-noun|m}}
12 |
13 | # {{l|en|boy}}
14 | #: {{ux|fr|Il a deux '''garçons''' et une fille.|He has two '''boys''' and a daughter.}}
15 | #: {{syn|fr|gamin}}
16 | #: {{ant|fr|adulte}}
17 | # {{lb|fr|by extension}} {{l|en|[[young]] [[man]]}}; {{l|en|man}}
18 | #: {{syn|fr|homme}}
19 | # {{l|en|waiter}}
20 | #: {{ux|fr|'''Garçon''', l'addition s'il vous plaît.|'''Waiter''', the bill please.|inline=1}}
21 | #: {{syn|fr|serveur|serviteur}}
22 |
23 |
24 | ====Synonyms====
25 | * {{sense|boy}} {{l|fr|fils}}
26 |
27 | ====Derived terms====
28 | * {{l|fr|garçonnet}}
29 |
30 | ====See also====
31 | * {{l|fr|fille}}
32 | * {{l|fr|fils}}
33 |
34 | ===References===
35 | * ''Merriam-Webster's Collegiate Dictionary: Tenth Edition'' (1997)
36 |
37 | ===External links===
38 | * {{R:TLFi}}
39 |
40 | ----
41 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalGender.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry;
21 |
22 | /**
23 | * Enumeration for modeling the grammatical gender of a
24 | * {@link IWiktionaryEntry}; only the three constants below are active.
25 | * @author Christian M. Meyer
26 | */
27 | public enum GrammaticalGender {
28 |
29 | /** Masculine gender (e.g., the German "Hund"). */
30 | MASCULINE,
31 |
32 | /** Feminine gender (e.g., the German "Katze"). */
33 | FEMININE,
34 |
35 | /** Neuter gender (e.g., the German "Haus"). */
36 | NEUTER;
37 | // Additional values sketched here but commented out, so they are not part of the enum:
38 | // ANIMATE,
39 | // INANIMATE,
40 | // HUMAN,
41 | // NON_HUMAN,
42 | // ANIMAL,
43 | // OTHER;
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/word/WSynonyms.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.word;
17 |
18 |
19 | /** Synonyms of a Wiktionary word. The class currently declares no active members.
20 | */
21 | public class WSynonyms {
22 | // Both fields below are disabled: each declaration sits entirely inside a block comment.
23 | /* Comment for the set of synonyms, e.g. synonyms for "entry":
24 | * * (''act of entering''): [[access]], [[enter]]ing, [[entrance]],
25 | * * (''doorway that provides a means of entering a building''): [[entrance]], [[way in]] {{UK}}
26 | * .comment=act of entering
27 | * .comment=doorway...
28 | * .words[1].tag=UK
29 | * /
30 | private String[] comment;*/
31 |
32 | /* Synonyms list with tags * /
33 | private WikiWord[] words;*/
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/test/resources/WiktionaryDumpParserTest.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Wiktionary
4 | http://de.wiktionary.org/wiki/Wiktionary:Hauptseite
5 | MediaWiki 1.16alpha-wmf
6 | case-sensitive
7 |
8 |
9 | Diskussion
10 |
11 |
12 |
13 | Page 1
14 | 9
15 |
16 | 10763
17 | 2004-09-17T08:23:57Z
18 |
19 | TJ
20 | 10
21 |
22 | Text 1
23 |
24 |
25 |
26 | Page 2
27 | 10
28 |
29 | 10764
30 | 2004-09-17T08:34:29Z
31 |
32 | TJ
33 | 10
34 |
35 |
36 | Text 2
37 |
38 | Test Test
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/filter/IWiktionaryPageFilter.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.filter;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage;
21 |
22 | /**
23 | * Interface for implementing a filter for {@link IWiktionaryPage}s.
24 | * That is, a way of selecting which pages are to be processed
25 | * (i.e., accepted) or skipped (i.e., filtered out); it declares a single abstract method.
26 | * @author Christian M. Meyer
27 | */
28 | @FunctionalInterface
29 | public interface IWiktionaryPageFilter {
30 |
31 | /** Returns {@code true} if the given page should be accepted, or
32 | * {@code false} if it should be filtered out. */
33 | boolean accept(final IWiktionaryPage page);
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalDegree.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
21 |
/**
 * The degrees of comparison that can be assigned to
 * a {@link IWiktionaryWordForm}.
 * @author Christian M. Meyer
 */
public enum GrammaticalDegree {

    /** The base form, which simply denotes a property
     *  (e.g., "Your flowers are _pretty_"). */
    POSITIVE,

    /** The form indicating a greater degree (e.g., "Your flowers are
     *  _prettier_ than mine"). */
    COMPARATIVE,

    /** The form indicating the greatest degree (e.g., "Your flowers are
     *  _prettiest_"). */
    SUPERLATIVE

    // Currently not modelled: ELATIVE

}
42 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/filter/IWiktionaryEntryFilter.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.filter;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry;
21 |
22 | /**
23 | * Interface for implementing a filter for {@link IWiktionaryEntry}s.
24 | * That is, a possibility for selecting which entries are to be processed
25 | * (i.e., accepted) or skipped (i.e., filtered out).
26 | * @author Christian M. Meyer
27 | */
28 | @FunctionalInterface
29 | public interface IWiktionaryEntryFilter {
30 |
31 | /** Return true if the given entry should be accepted or
32 | * false if it should be filtered out. */
33 | boolean accept(final IWiktionaryEntry entry);
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/filter/IWiktionarySenseFilter.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.filter;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionarySense;
21 |
22 | /**
23 | * Interface for implementing a filter for {@link IWiktionarySense}s.
24 | * That is, a possibility for selecting which senses are to be processed
25 | * (i.e., accepted) or skipped (i.e., filtered out).
26 | * @author Christian M. Meyer
27 | */
28 | @FunctionalInterface
29 | public interface IWiktionarySenseFilter {
30 |
31 | /** Return true if the given sense should be accepted or
32 | * false if it should be filtered out. */
33 | boolean accept(final IWiktionarySense sense);
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalNumber.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
21 |
/**
 * The grammatical numbers that can be assigned to
 * a {@link IWiktionaryWordForm}.
 * @author Christian M. Meyer
 */
public enum GrammaticalNumber {

    /** Exactly one item (e.g., "a book", "one pen", "the guy"). */
    SINGULAR,

    /** More than one item (e.g., "books", "two pens", "the guys"). */
    PLURAL

    // Currently not modelled:
    //   SINGULATIVE, COLLECTIVE,
    //   DUAL (2 items), TRIAL (3 items), QUADRAL (4 items),
    //   PAUCAL (few items), DISTRIBUTIVE_PLURAL (independent instances)

}
44 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/IWiktionaryEntryParser.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.WiktionaryException;
21 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryPage;
22 |
23 | /**
24 | * A parser for separating an article page's text into individual
25 | * Wiktionary word entries.
26 | * @author Christian M. Meyer
27 | */
28 | public interface IWiktionaryEntryParser {
29 |
30 | /** Creates Wiktionary word entry instances from the provided text, and
31 | * adds them to the given article page.
32 | * @throws WiktionaryException in case of any parser errors. */
33 | void parse(final WiktionaryPage page, final String text)
34 | throws WiktionaryException;
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/filter/WiktionarySenseFilter.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.filter;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionarySense;
21 |
22 | /**
23 | * Default implementation of the {@link IWiktionarySenseFilter} interface
24 | * which inherits all filter options of the {@link WiktionaryEntryFilter}
25 | * @author Christian M. Meyer
26 | */
27 | public class WiktionarySenseFilter extends WiktionaryEntryFilter
28 | implements IWiktionarySenseFilter {
29 |
30 | /** Initializes a page filter without any filter restrictions. */
31 | public WiktionarySenseFilter() {
32 | super();
33 | }
34 |
35 | public boolean accept(final IWiktionarySense sense) {
36 | if (!accept(sense.getEntry()))
37 | return false;
38 |
39 | return true;
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalTense.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
21 |
/**
 * The grammatical tenses that can be assigned to
 * a {@link IWiktionaryWordForm}. Tense frequently occurs together with
 * a verb aspect (e.g., present perfect); such cases can be modelled by
 * combining a tense with a value from {@link GrammaticalAspect}.
 * @author Christian M. Meyer
 */
public enum GrammaticalTense {

    /** Past: the utterance refers to the time before a reference time. */
    PAST,

    /** Present: the utterance refers to the reference time. */
    PRESENT,

    /** Future: the utterance refers to the time after a reference time. */
    FUTURE

}
41 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/DECollocationsHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.RelationType;
21 |
22 | /**
23 | * Parser component for extracting collocatoins from the German Wiktionary.
24 | * @author Christian M. Meyer
25 | * @author Lizhen Qu
26 | */
27 | public class DECollocationsHandler extends DERelationHandler {
28 |
29 | /** Initializes the block handler for parsing all sections starting with
30 | * one of the specified labels. */
31 | public DECollocationsHandler() {
32 | super(RelationType.CHARACTERISTIC_WORD_COMBINATION, "Charakteristische Wortkombinationen");
33 | }
34 |
35 | @Override
36 | protected String addDelimiters(final String text) {
37 | return super.addDelimiters(text.replace("''", ""));
38 | }
39 |
40 | }
41 |
42 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/IWiktionaryMultistreamDumpParser.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2015
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser;
19 |
20 | import java.io.File;
21 |
22 | import de.tudarmstadt.ukp.jwktl.api.WiktionaryException;
23 |
24 | public interface IWiktionaryMultistreamDumpParser extends IWiktionaryDumpParser {
25 | /**
26 | * Parses a multistream XML dump file
27 | *
28 | * @param multistreamDumpFile the dumpfile (*-pages-articles-multistream-index.txt.bz2)
29 | * @param indexFile the matching index file (*-pages-articles-multistream.xml.bz2)
30 | * @param filter the filter to use to constrain the parsed pages
31 | * @throws de.tudarmstadt.ukp.jwktl.api.WiktionaryException
32 | */
33 | void parseMultistream(File multistreamDumpFile,
34 | File indexFile,
35 | MultistreamFilter filter) throws WiktionaryException;
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/WiktionaryException.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api;
19 |
/**
 * Unchecked exception raised by the API in different situations, in
 * particular when the parsed Wiktionary data cannot be accessed properly.
 * @author Christian M. Meyer
 * @author Christof Müller
 */
public class WiktionaryException extends RuntimeException {

    private static final long serialVersionUID = 5373008056379642627L;

    /** Creates an exception that has neither a detail message nor a cause. */
    public WiktionaryException() {
    }

    /** Creates an exception with the given detail message.
     *  @param message the detail message. */
    public WiktionaryException(String message) {
        super(message);
    }

    /** Creates an exception with the given detail message and cause.
     *  @param message the detail message.
     *  @param cause the throwable that led to this exception. */
    public WiktionaryException(String message, Throwable cause) {
        super(message, cause);
    }

    /** Creates an exception that wraps the given cause.
     *  @param cause the throwable that led to this exception. */
    public WiktionaryException(Throwable cause) {
        super(cause);
    }

}
50 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/MultistreamFilter.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2015
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser;
19 |
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
22 |
/**
 * Decides for each page, identified by its id and title, whether it is
 * included in a parse.
 */
@FunctionalInterface
public interface MultistreamFilter {
    /**
     * @param pageId the id of the page in question
     * @param pageTitle the title of the page in question
     * @return whether to include the page with pageId and pageTitle in the parse
     */
    boolean accept(long pageId, String pageTitle);

    /** A filter which includes only page titles contained in the specified list */
    class IncludingNames implements MultistreamFilter {
        // A hash set gives constant-time lookups; the filter is consulted
        // once per page, so a linear list scan would be wasteful.
        private final Set<String> pageNames;

        /** @param pageNames the titles of the pages to be included */
        public IncludingNames(String... pageNames) {
            this(Arrays.asList(pageNames));
        }

        /**
         * @param pageNames the titles of the pages to be included; the list
         *   is copied, so later changes to it do not affect this filter
         * @throws NullPointerException if pageNames is null
         */
        public IncludingNames(List<String> pageNames) {
            this.pageNames = new HashSet<>(pageNames);
        }

        /** Accepts a page if its title is one of the configured names;
         *  the page id is not taken into account. */
        @Override
        public boolean accept(long pageId, String pageTitle) {
            return pageNames.contains(pageTitle);
        }
    }
}
46 |
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/DEEntryLinkHandlerTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry;
21 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage;
22 | import de.tudarmstadt.ukp.jwktl.parser.de.components.DEEntryLinkHandler;
23 |
24 | /**
25 | * Test case for {@link DEEntryLinkHandler}.
26 | */
27 | public class DEEntryLinkHandlerTest extends DEWiktionaryEntryParserTest {
28 |
29 | /***/
30 | public void testAbschlusz() throws Exception {
31 | IWiktionaryPage page = parse("Abschlusz.txt");
32 | IWiktionaryEntry entry = page.getEntry(0);
33 | assertEquals("Abschluss", entry.getEntryLink());
34 | }
35 |
36 | /***/
37 | public void testEingaben() throws Exception {
38 | IWiktionaryPage page = parse("Eingaben.txt");
39 | IWiktionaryEntry entry = page.getEntry(0);
40 | assertEquals("Eingabe", entry.getEntryLink());
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/cheio.txt:
--------------------------------------------------------------------------------
1 | ==Portuguese==
2 |
3 | ===Alternative forms===
4 | * {{l/pt|cheo}} {{qualifier|obsolete}}
5 |
6 | ===Etymology===
7 | Earlier {{m|pt|cheo}}, from {{etyl|roa-opt|pt}} {{term|chẽo|lang=roa-opt}}, from {{etyl|la|pt}} {{term|plenus|lang=la}}, from {{etyl|itc-pro|pt}} {{m|itc-pro|*plēnos}}, from {{etyl|ine-pro|pt}} {{m|ine-pro|*pl̥h₁nós||full}}. Compare {{etyl|ca|-}} {{m|ca|ple}}, {{etyl|eo|-}} {{m|eo|plena}}, {{etyl|fr|-}} {{m|fr|plein}}, {{etyl|io|-}} {{m|io|plena}}, {{etyl|it|-}} {{m|it|pieno}}, {{etyl|ro|-}} {{m|ro|plin}}, {{etyl|sc|-}} {{m|sc|prenu}}, {{etyl|es|-}} {{m|es|lleno}}.
8 |
9 | ===Pronunciation===
10 | * {{a|Portugal}} {{IPA|/ˈʃɐj.u/|/ˈʃej.u/|lang=pt}}
11 | * {{a|Brazil}} {{IPA|/ˈʃej.u/|lang=pt}}
12 | * {{hyphenation|chei|o|lang=pt}}
13 |
14 | ===Adjective===
15 | {{pt-adj|chei|o}}
16 |
17 | # {{l/en|full}}, {{l/en|filled}}, {{l/en|completed}}
18 | #: {{usex|lang=pt|A rua está '''cheia''' de trânsito|The street is full of traffic.}}
19 | #: {{usex|lang=pt|Estou '''cheio'''.|I'm full (not hungry anymore).}}
20 | # {{l/en|covered}}
21 | #: A rua está '''cheia''' de óleo.
22 | #:: The street is covered with oil.
23 | # {{context|figurative|lang=pt}} [[fed up]], [[tired]], [[annoyed]]
24 | #: {{usex|lang=pt|Estou '''cheio''' dele.|I'm fed up with him.}}
25 |
26 | ====Inflection====
27 | {{pt-adj-infl|chei|o|dim=1}}
28 |
29 | ====Synonyms====
30 | * {{sense|full}} {{l/pt|repleto}}, {{l/pt|completo}}, {{l/pt|lotado}}
31 | * {{sense|covered}} {{l/pt|coberto}}
32 | * {{sense|fed up}} {{l/pt|farto}}
33 |
34 | [[el:cheio]]
35 | [[es:cheio]]
36 | [[fr:cheio]]
37 | [[gl:cheio]]
38 | [[io:cheio]]
39 | [[ku:cheio]]
40 | [[hu:cheio]]
41 | [[mg:cheio]]
42 | [[fj:cheio]]
43 | [[nl:cheio]]
44 | [[ja:cheio]]
45 | [[pl:cheio]]
46 | [[pt:cheio]]
47 | [[fi:cheio]]
48 | [[chr:cheio]]
49 | [[zh:cheio]]
50 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/entry/WiktionaryExample.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.entry;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWikiString;
21 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryExample;
22 |
23 | public class WiktionaryExample implements IWiktionaryExample {
24 | protected IWikiString example;
25 | protected IWikiString translation;
26 |
27 | public WiktionaryExample() {}
28 |
29 | public WiktionaryExample(IWikiString example) {
30 | this(example, null);
31 | }
32 |
33 | public WiktionaryExample(IWikiString example, IWikiString translation) {
34 | this.example = example;
35 | this.translation = translation;
36 | }
37 |
38 | @Override
39 | public String getText() {
40 | return example.getText();
41 | }
42 |
43 | @Override
44 | public IWikiString getExample() {
45 | return example;
46 | }
47 |
48 | @Override
49 | public IWikiString getTranslation() {
50 | return translation;
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/util/LangText.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.util;
17 |
18 | import de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikipedia.language.LanguageType;
19 |
20 | /** Data structure consists of a language code and the corresponding text.
21 | */
22 | public class LangText {
23 |
24 | /** Language of the text, e.g. the article about one word can contain "en" block for English word, "de", "fr", etc. */
25 | private LanguageType lang;
26 |
27 | /** Text */
28 | public StringBuffer text;
29 |
30 | public LangText() {}
31 |
32 | public LangText(LanguageType _lang) { //, StringBuffer _text) {
33 | lang = _lang;
34 | text = new StringBuffer();
35 | //text = _text;
36 | }
37 |
38 | /** Gets language of the text, e.g. "en" for English word, "de", "fr", etc. */
39 | public LanguageType getLanguage() {
40 | return lang;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/NonFiniteForm.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
21 |
/**
 * The non-finite forms that can be assigned to
 * a {@link IWiktionaryWordForm}. Other form properties (such as
 * {@link GrammaticalTense}) are predominantly used for finite forms, but
 * they may describe non-finite forms as well; the English present
 * participle (tense = PRESENT) and past participle (tense = PAST) are
 * examples of this.
 * @author Christian M. Meyer
 */
public enum NonFiniteForm {

    /** A verb's infinitive form (e.g., "(to) do"). */
    INFINITIVE,

    /** A verb's participle form (e.g., "done"). Participle forms should
     *  be combined with a {@link GrammaticalTense}. */
    PARTICIPLE

    // Currently not modelled: ATTRIBUTIVE, CONVERB, GERUNDIVE, GERUND

}
45 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/multi/en/LabelEn.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.multi.en;
17 |
18 | import de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.constant.ContextLabel;
19 |
20 | /** Contexual information for definitions, or Synonyms, or Translations
21 | * in English Wiktionary.
22 | *
23 | * See http://en.wiktionary.org/wiki/Template_talk:context
24 | * http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained
25 | */
26 | public class LabelEn extends ContextLabel {
27 |
28 | private LabelEn(String label,String name,String category) {
29 | super(label, name, category);
30 | }
31 |
32 | public static final ContextLabel AU = new LabelEn("AU", "Australia", "");
33 | public static final ContextLabel slang = new LabelEn("slang", "slang", "");
34 |
35 | public static final ContextLabel astronomy = new LabelEn("astronomy","astronomy", "Astronomy");
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/CaseHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import java.util.Objects;
21 |
22 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm;
23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase;
24 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
25 |
26 | public abstract class CaseHandler extends PatternBasedParameterHandler {
27 |
28 | private final GrammaticalCase grammaticalCase;
29 |
30 | public CaseHandler(String regex, GrammaticalCase grammaticalCase) {
31 | super(regex);
32 | Objects.requireNonNull(grammaticalCase, "grammaticalCase must not be null");
33 | this.grammaticalCase = grammaticalCase;
34 | }
35 |
36 | @Override
37 | public void handle(String label, String value, WiktionaryWordForm wordForm, ParsingContext context) {
38 | wordForm.setCase(grammaticalCase);
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/it_s.txt:
--------------------------------------------------------------------------------
1 | ==English==
2 |
3 | ===Etymology===
4 | [[contraction|Contraction]] of ‘[[it]] [[is]]’ or ‘it [[has]]’.
5 |
6 | ===Pronunciation===
7 | * {{IPA|/ɪts/}}, {{SAMPA|/Its/}}
8 | * {{audio|en-us-it's.ogg|Audio (US)}}
9 | * {{rhymes|ɪts}}
10 | * {{homophones|its}}
11 |
12 | ===Contraction===
13 | {{en-cont}}
14 |
15 | # It [[is]].
16 | #: '''''It’s''' coming right for us!''
17 | # It [[has]].
18 | #: '''''It’s''' been a long time since I’ve had cheesecake.''
19 |
20 | ====Usage notes====
21 | * See [[its#Usage notes|Usage under "its"]]
22 |
23 | ====Translations====
24 | {{trans-top|it is}}
25 | * [[Catalan]]: [[ell]] [[és]]
26 | * Dutch: [[’t]] [[is]]
27 | * Finnish: [[se]] [[on]]
28 | * French: {{t-|fr|c'est}}
29 | {{trans-mid}}
30 | * Greek: {{t+|el|είναι}}
31 | * Italian: {{t+|it|è}}
32 | * Portuguese: {{t+|pt|é}}
33 | * Spanish: {{t-|es|es}}
34 | * Swedish: [[det]] [[är]], [[den]] är
35 | {{trans-bottom}}
36 |
37 | {{trans-top|it has}}
38 | * [[Catalan]]: [[ell]] [[té]]
39 | * Dutch: [[’t]] [[heeft]], ’t [[is]]
40 | * Greek: {{t|el|έχει}}
41 | * Italian: {{t+|it|ha}}
42 | {{trans-mid}}
43 | * Portuguese: {{t|pt|tem}}
44 | * Spanish: {{t+|es|ha}}
45 | * Swedish: det [[har]], den har
46 | {{trans-bottom}}
47 |
48 | {{checktrans-top}}
49 | * {{ttbc|Latvian}}: [[tas]] [[ir]], [[tā]] ir
50 | * {{ttbc|Lithuanian}}: [[tai]] [[yra]]
51 | {{trans-mid}}
52 | * {{ttbc|Swedish}}
53 | {{trans-bottom}}
54 |
55 | ===Anagrams===
56 | * {{alphagram|[[IST#English|IST]]}} [[sit#English|sit]], [[STI#English|STI]], [['tis#English|'tis]], [[TIS#English|TIS]]
57 |
58 | [[Category:English terms spelled with ']]
59 |
60 | [[es:it's]]
61 | [[fr:it's]]
62 | [[ko:it's]]
63 | [[ja:it's]]
64 | [[no:it's]]
65 | [[pl:it's]]
66 | [[simple:it's]]
67 | [[fi:it's]]
68 | [[tr:it's]]
69 | [[vi:it's]]
70 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/boulder.txt:
--------------------------------------------------------------------------------
1 | == boulder ({{Sprache|Englisch}}) ==
2 | === {{Wortart|Substantiv|Englisch}} ===
3 |
4 | {{erweitern| Beispiel(e) einfügen |Englisch}}
5 |
6 | {{Englisch Substantiv|s
7 | |Bild1=Ffionphort cracked granite boulder.jpg|BBezug1=1|BBeschreibung1=Cracked granite ''boulder''}}
8 |
9 | {{Alternative Schreibweisen}}
10 | :[[bowlder]]
11 |
12 | {{Worttrennung}}
13 | :boul·der, {{Pl.}} boul·ders
14 |
15 | {{Aussprache}}
16 | :{{IPA}} {{Lautschrift|ˈbəʊldəʳ}}, {{Pl.}} {{Lautschrift|ˈbəʊldəʳs}}
17 | :{{Hörbeispiele}} {{Audio|En-us-boulder.ogg|boulder (amerikanisch)}}, {{Pl.}} {{fehlend}}
18 |
19 | {{Bedeutungen}}
20 | :[1] einzelner Felsbrocken/Felsblock, theoretisch beweglich, mehr oder weniger rund
21 | :[2] Geröll
22 |
23 | {{Beispiele}}
24 | :[1]
25 |
26 | {{Abgeleitete Begriffe}}
27 | :[[boulders]], [[bouldered]], [[bouldery]], [[boulder period]], [[boulder clay]]
28 |
29 | ==== Übersetzungen ====
30 | {{Ü-links}}
31 | *{{de}}: [1] [[Felsbrocken]], [[Felsblock]], [[Stein]] {{m}}; [2] [[Geröll]] {{n}}
32 | {{Ü-rechts}}
33 |
34 | {{Referenzen}}
35 | :[1] {{Wikipedia|spr=en|boulder}}
36 | :[1] {{Ref-Oxford|boulder}}
37 | :[1] {{Ref-Macmillan|boulder}}
38 | :[1] {{Ref-MWD|boulder}}
39 | :[1] {{Ref-MWT|boulder}}
40 | :[1] {{Ref-Dictionary|boulder}}
41 | :[1] {{Ref-Pons|en|boulder}}
42 | :[1] {{Ref-dictcc|en|boulder}}
43 | :[1] {{Ref-Leo|en|boulder}}
44 |
45 | [[cy:boulder]]
46 | [[el:boulder]]
47 | [[en:boulder]]
48 | [[eo:boulder]]
49 | [[es:boulder]]
50 | [[et:boulder]]
51 | [[fa:boulder]]
52 | [[fr:boulder]]
53 | [[hu:boulder]]
54 | [[io:boulder]]
55 | [[it:boulder]]
56 | [[kn:boulder]]
57 | [[ko:boulder]]
58 | [[mg:boulder]]
59 | [[ml:boulder]]
60 | [[nl:boulder]]
61 | [[pl:boulder]]
62 | [[simple:boulder]]
63 | [[sv:boulder]]
64 | [[ta:boulder]]
65 | [[vi:boulder]]
66 | [[zh:boulder]]
67 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalMood.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
21 |
/**
 * Enumeration of the grammatical moods that a {@link IWiktionaryWordForm}
 * can be marked with.
 *
 * @author Christian M. Meyer
 */
public enum GrammaticalMood {

    /**
     * Declarative mood (modus indicativus), used for real events.
     * Example: "He built a house."
     */
    INDICATIVE,

    /**
     * Commanding mood (imperare).
     * Example: "Build a house!"
     */
    IMPERATIVE,

    /**
     * Conjunctive or subjunctive mood (modus coniunctivus), used for
     * unreal events. Examples: "The house that he build." (instead of
     * "builds"); "The house that he shall build." Also used to express
     * the German "Konjunktiv" ("Er hätte ein Haus gebaut").
     */
    CONJUNCTIVE;

    // Moods that are currently not modelled (kept as a reminder):
    // CONDITIONAL, OPTATIVE, JUSSIVE, POTENTIAL, INTERROGATIVE

}
49 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Flipchart.txt:
--------------------------------------------------------------------------------
1 | == Flipchart ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Substantiv|Deutsch}}, {{mfn}} ===
3 |
4 | {{Deutsch Substantiv Übersicht
5 | |Genus 1=n
6 | |Genus 2=m
7 | |Genus 3=f
8 | |Nominativ Singular 1=Flipchart
9 | |Nominativ Singular 2=Flipchart
10 | |Nominativ Singular 3=Flipchart
11 | |Nominativ Plural=Flipcharts
12 | |Genitiv Singular 1=Flipcharts
13 | |Genitiv Singular 2=Flipcharts
14 | |Genitiv Singular 3=Flipchart
15 | |Genitiv Plural=Flipcharts
16 | |Dativ Singular 1=Flipchart
17 | |Dativ Singular 2=Flipchart
18 | |Dativ Singular 3=Flipchart
19 | |Dativ Plural=Flipcharts
20 | |Akkusativ Singular 1=Flipchart
21 | |Akkusativ Singular 2=Flipchart
22 | |Akkusativ Singular 3=Flipchart
23 | |Akkusativ Plural=Flipcharts
24 | |Bild=Flipchart1-Asio.JPG|180px|1|Ein Flipchart
25 | }}
26 |
27 | {{Alternative Schreibweisen}}
28 | :[[Flip-Chart]]
29 |
30 | {{Worttrennung}}
31 | :Flip·chart, {{Pl.}} Flip·charts
32 |
33 | {{Aussprache}}
34 | :{{IPA}} {{Lautschrift|ˈflɪpʧaːɐ̯t}}, {{Lautschrift|ˈflɪpʧaʁt}}
35 | :{{Hörbeispiele}} {{Audio|De-Flipchart.ogg}}
36 |
37 | {{Bedeutungen}}
38 | :[1] ein großer auf einem Gestell befestigter Papierblock, dessen Blätter man beschreiben und nach hinten umblättern kann
39 |
40 | {{Herkunft}}
41 | :von dem englischen Begriff [[flip chart]] (deutsch: [[Umblätterdiagramm]]) mit gleicher Bedeutung
42 |
43 | {{Beispiele}}
44 | :[1] Während des Vortrags zeichnete er den neuen Arbeitsablauf auf einem ''Flipchart'' auf.
45 |
46 | {{Absatz}}
47 | ==== {{Übersetzungen}} ====
48 | {{Ü-Tabelle|Ü-links=
49 | *{{en}}: [1] {{Ü|en|flip chart}}
50 | |Ü-rechts=
51 | *{{fr}}: [] {{Ü|fr|}}
52 | *{{sv}}: [1] {{Ü|sv|blädderblock}}
53 | }}
54 |
55 | {{Referenzen}}
56 | :[1] {{Wikipedia|Flipchart}}
57 | :[*] {{Ref-Canoo|Flipchart}}
58 | :[1] {{Ref-UniLeipzig|Flipchart}}
59 | :[1] {{Ref-Duden|Flipchart}}
60 |
61 | [[Kategorie:Entlehnung aus dem Englischen (Deutsch)]]
--------------------------------------------------------------------------------
/src/main/resources/assemblies/dist.xml:
--------------------------------------------------------------------------------
1 |
5 | bin
6 |
7 | dir
8 | zip
9 |
10 |
11 | false
12 |
13 |
14 |
15 |
16 | false
17 | true
18 |
19 | com.sleepycat:je
20 |
21 | lib
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 | CHANGELOG.txt
30 | LICENSE.txt
31 | NOTICE.txt
32 | README.txt
33 | pom.xml
34 | license/*
35 |
36 |
37 |
38 |
39 |
40 | src/main/java
41 |
42 | de/tudarmstadt/ukp/jwktl/WiktionaryCli.java
43 | de/tudarmstadt/ukp/jwktl/examples/*.java
44 |
45 | examples
46 |
47 |
48 |
49 |
50 | target/site/apidocs
51 | javadoc
52 |
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/parser/ChainedCBZip2InputStreamTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2015
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser;
19 |
20 | import java.io.File;
21 | import java.io.InputStream;
22 | import java.math.BigInteger;
23 | import java.security.MessageDigest;
24 |
25 | import junit.framework.TestCase;
26 |
27 | public class ChainedCBZip2InputStreamTest extends TestCase {
28 | public void testConsumeWholeStream() throws Exception {
29 | MessageDigest md5 = MessageDigest.getInstance("MD5");
30 | InputStream stream =
31 | new ChainedCBZip2InputStream(new File("src/test/resources/enwiktionary-20150224-pages-articles-multistream.xml.bz2"));
32 | long count = 0;
33 | int n;
34 | byte[] buffer = new byte[8192];
35 | while ((n = stream.read(buffer)) != -1) {
36 | count += n;
37 | md5.update(buffer, 0, n);
38 | }
39 | String signature = new BigInteger(1, md5.digest()).toString(16);
40 | assertEquals(1800617, count);
41 | assertEquals("bde6a439065407c9c74c83b1f2f97520", signature);
42 |
43 | stream.close();
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/IHeadwordLineHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.en.components;
19 |
20 | import java.util.regex.Pattern;
21 | import java.util.stream.Stream;
22 |
/**
 * Mixin with default methods that decide whether a line of wikitext is a
 * headword line: either a legacy bold header or a template that is not on
 * the exclusion list.
 */
interface IHeadwordLineHandler {
    /** Matches a line that starts with a bold term, e.g. {@code '''word'''}. */
    Pattern LEGACY_PATTERN = Pattern.compile("\\A'''[^']+'''");

    /**
     * @param line a line of wikitext
     * @return whether the line starts with the template opener <code>{{</code>
     */
    default boolean isTemplate(String line) {
        return line.startsWith("{{");
    }

    /**
     * Checks whether the line contains one of the templates that must not
     * be treated as a headword line. The comparison ignores case.
     *
     * @param line a line of wikitext
     * @return whether any excluded template prefix occurs in the line
     */
    default boolean isExcludedTemplate(String line) {
        // Lower-case once, and with Locale.ROOT: the default-locale variant
        // maps 'I' to a dotless 'ı' under Turkish/Azeri locales, so e.g.
        // "{{WIKIPEDIA}}" would no longer match "{{wikipedia".
        final String lowerCased = line.toLowerCase(java.util.Locale.ROOT);
        return Stream.of(
                "{{wikipedia",
                "{{slim-wikipedia",
                "{{wiki}}",
                "{{wikispecies",
                "{{wikiversity",
                "{{wikiquote",
                "{{commons",
                "{{attention",
                "{{rfc",
                "{{examples",
                "{{enum|",
                "{{no entry"
        ).anyMatch(lowerCased::contains);
    }

    /**
     * @param line a line of wikitext
     * @return whether the line starts with a legacy bold header
     *         (see {@link #LEGACY_PATTERN})
     */
    default boolean isLegacyHeader(String line) {
        return LEGACY_PATTERN.matcher(line).find();
    }

    /**
     * @param line a line of wikitext
     * @return whether the line is a legacy header, or a template that is
     *         not excluded by {@link #isExcludedTemplate(String)}
     */
    default boolean isHeadwordLine(String line) {
        return isLegacyHeader(line) || (isTemplate(line) && !isExcludedTemplate(line));
    }
}
55 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalCase.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
21 |
/**
 * Enumeration of the grammatical cases that a {@link IWiktionaryWordForm}
 * can be marked with.
 *
 * @author Christian M. Meyer
 */
public enum GrammaticalCase {

    /**
     * Marks the subject of a finite verb. In German sentences, ask
     * "Wer/Was?" (e.g., "_Peter_ liest").
     */
    NOMINATIVE,

    /**
     * Marks the direct object of a verb. In German sentences, ask
     * "Wen/Was?" (e.g., "Peter liest _ein Buch_").
     */
    ACCUSATIVE,

    /**
     * Marks the indirect object of a verb. In German sentences, ask
     * "Wem?" (e.g., "Peter liest _ihr_ vor").
     */
    DATIVE,

    /**
     * Marks possession. In German sentences, ask "Wessen?"
     * (e.g., "_Peters_ Buch ist spannend").
     */
    GENITIVE;

    // Cases that are currently not modelled (kept as a reminder):
    // ABLATIVE     - indicates movement from something, or a cause
    // VOCATIVE     - indicates the addressee
    // LOCATIVE     - indicates a location
    // INSTRUMENTAL - indicates the instrument used for an action

}
50 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Angestellte.txt:
--------------------------------------------------------------------------------
1 | {{Siehe auch|[[angestellte]]}}
2 | == Angestellte ({{Sprache|Deutsch}}) ==
3 | === {{Wortart|Substantiv|Deutsch}}, adjektivische Deklination, {{f}} ===
4 |
5 | {{Deutsch adjektivische Deklination f|Angestellte}}
6 |
7 | {{Worttrennung}}
8 | :An·ge·stell·te, {{Pl.}} An·ge·stell·ten
9 |
10 | {{Aussprache}}
11 | :{{IPA}} {{Lautschrift|ˈanɡəʃtɛltə}}, {{Pl.}} {{Lautschrift|ˈanɡəʃtɛltn̩}}
12 | :{{Hörbeispiele}} {{fehlend}}, {{Pl.}} {{fehlend}}
13 |
14 | {{Bedeutungen}}
15 | :[1] Frau, die gegen Bezahlung ihre Arbeitskraft zu Verfügung stellt (nicht-körperliche Arbeit)
16 |
17 | {{Abkürzungen}}
18 | :[1] [[Angest.]]
19 |
20 | {{Gegenwörter}}
21 | :[1] [[Arbeiterin]], [[Beamte]]
22 |
23 | {{Männliche Wortformen}}
24 | :[1] [[Angestellter]]
25 |
26 | {{Oberbegriffe}}
27 | :[1] [[Arbeitnehmerin]]
28 |
29 | {{Beispiele}}
30 | :[1] Die sogenannten Markteinkommen von Arbeitern und ''Angestellten'' sind in den vergangenen Jahren deutlich gesunken.
31 |
32 | {{Absatz}}
33 | ==== Übersetzungen ====
34 | {{Ü-links}}
35 | *{{en}}: [1] {{Ü|en|employee}}
36 | *{{fr}}: [1] {{Ü|fr|employée}} {{f}}
37 | *{{nl}}: [1] {{Ü|nl|werknemer}}
38 | {{Ü-Abstand}}
39 | *{{sv}}: [1] {{Ü|sv|anställd}}
40 | *{{sk}}: [1] {{Ü|sk|zamestnanec}}
41 | *{{es}}: [1] {{Ü|es|empleado}}
42 | *{{hu}}: [1] {{Ü|hu|alkalmazott}}
43 | {{Ü-rechts}} <!-- für weitere Sprachkürzel siehe den Link rechts unterhalb des Editierfensters -->
44 |
45 | {{Referenzen}}
46 | :[1] {{Wikipedia|Angestellte}}
47 | :[1] {{Ref-DWDS|Angestellte}}
48 | :[1] {{Ref-Canoo|Angestellte}}
49 | :[1] {{Ref-UniLeipzig|Angestellte}}
50 | :[1] {{Ref-FreeDictionary|Angestellte}}
51 |
52 | {{Ähnlichkeiten}}
53 | :[[eingestellt]]
54 |
55 | [[el:Angestellte]]
56 | [[en:Angestellte]]
57 | [[fi:Angestellte]]
58 | [[fr:Angestellte]]
59 | [[hu:Angestellte]]
60 | [[id:Angestellte]]
61 | [[io:Angestellte]]
62 | [[it:Angestellte]]
63 | [[ko:Angestellte]]
64 | [[sv:Angestellte]]
65 | [[zh:Angestellte]]
66 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Hallo.txt:
--------------------------------------------------------------------------------
1 | {{Siehe auch|[[hallo]]}}
2 | == Hallo ({{Sprache|Deutsch}}) ==
3 | === {{Wortart|Substantiv|Deutsch}}, {{n}} ===
4 |
5 | {{Deutsch Substantiv Übersicht
6 | |Nominativ Singular=das Hallo
7 | |Nominativ Plural=die Hallos
8 | |Genitiv Singular=des Hallos
9 | |Genitiv Plural=der Hallos
10 | |Dativ Singular=dem Hallo
11 | |Dativ Plural=den Hallos
12 | |Akkusativ Singular=das Hallo
13 | |Akkusativ Plural=die Hallos
14 | }}
15 |
16 | {{Worttrennung}}
17 | :Hal·lo, {{Pl.}} Hal·los
18 |
19 | {{Aussprache}}
20 | :{{IPA}} {{Lautschrift|haˈloː}}, {{Pl.}} {{Lautschrift|haˈloːs}}
21 | :{{Hörbeispiele}} {{Audio|De-Hallo.ogg|Hallo}}, {{Pl.}} {{fehlend}}
22 |
23 | {{Bedeutungen}}
24 | :[1] {{ugs.|:}} große, auf eine Person gerichtete [[Aufmerksamkeit]]
25 |
26 | {{Herkunft}}
27 | :Substantivierung des Grußworts [[hallo]]
28 |
29 | {{Synonyme}}
30 | :[1] [[Hallihallo]], [[Trubel]], [[Jubel]], [[Aufstand]], [[Heiterkeit]]
31 |
32 | {{Beispiele}}
33 | :[1] Als er die Treppe hinaufkam, wurde er mit großem ''Hallo'' empfangen.
34 |
35 | {{Charakteristische Wortkombinationen}}
36 | :[1] großes ''Hallo''
37 |
38 | ==== Übersetzungen ====
39 | {{Ü-links}}
40 | *{{en}}: [1] {{Ü|en|uproar}}
41 | *{{fr}}: [1] {{Ü|fr|animaton}}
42 | {{Ü-Abstand}}
43 | *{{pt}}: [1] {{Ü|pt|olá}}
44 | *{{sv}}: [1] {{Ü|sv|hallå}}, {{Ü|sv|ståhej}}
45 | *{{es}}: [1] {{Ü|es|barullo}} {{m}}, {{Ü|es|jaleo}} {{m}}
46 | {{Ü-rechts}}
47 |
48 | {{Dialektausdrücke (Deutsch)|
49 | *Süddeutsch: Grüß´ Gott oder Servus
50 | |
51 | *Schwäbisch: Hallöle (langes ö)
52 | }}
53 |
54 | {{Referenzen}}
55 | :[1] {{Wikipedia|Hallo}}
56 | :[1] {{Ref-DWDS|Hallo}}
57 | :[*] {{Ref-Canoo|Hallo}}
58 | :[1] {{Ref-UniLeipzig|Hallo}}
59 | :[1] {{Ref-FreeDictionary|Hallo}}
60 |
61 | {{Ähnlichkeiten}}
62 | :[[Hall]], [[Halle]], [[halle]], [[Halo]], [[holla]], [[Holle]]
63 |
64 | [[fr:Hallo]]
65 | [[hu:Hallo]]
66 | [[io:Hallo]]
67 | [[it:Hallo]]
68 | [[mg:Hallo]]
69 | [[ru:Hallo]]
70 | [[zh:Hallo]]
71 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/IWordFormHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2015
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.en.components;
19 |
20 | import java.util.List;
21 |
22 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalGender;
24 |
25 | public interface IWordFormHandler {
26 | /**
27 | * Start parsing the specified text for inflected word forms. The
28 | * extracted forms can be accessed using {@link #getWordForms()}
29 | * once all lines have been parsed.
30 | *
31 | * @param line a line of wikitext
32 | * @return whether the handler could parse the line
33 | */
34 | boolean parse(String line);
35 |
36 | /**
37 | * @return a list of extracted word forms, or an empty list.
38 | */
39 | List getWordForms();
40 |
41 | /**
42 | * @return the extracted genders (might be null).
43 | */
44 | List getGenders();
45 |
46 | /**
47 | * @return the unprocessed headline
48 | * @see WT:EL Headword line
49 | */
50 | String getRawHeadwordLine();
51 | }
52 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Tetragraph.txt:
--------------------------------------------------------------------------------
1 | == Tetragraph ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Substantiv|Deutsch}}, {{mn}} ===
3 |
4 | {{Deutsch Substantiv Übersicht
5 | |Genus 1=m
6 | |Genus 2=n
7 | |Nominativ Singular 1=Tetragraph
8 | |Nominativ Singular 2=Tetragraph
9 | |Nominativ Plural=Tetragraphen
10 | |Genitiv Singular 1=Tetragraphen
11 | |Genitiv Singular 2=Tetragraphs
12 | |Genitiv Plural=Tetragraphen
13 | |Dativ Singular 1=Tetragraphen
14 | |Dativ Singular 2=Tetragraph
15 | |Dativ Plural=Tetragraphen
16 | |Akkusativ Singular 1=Tetragraphen
17 | |Akkusativ Singular 2=Tetragraph
18 | |Akkusativ Plural=Tetragraphen
19 | }}
20 |
21 | {{Alternative Schreibweisen}}
22 | :[[Tetragraf]]
23 |
24 | {{Worttrennung}}
25 | :Te·t·ra·graph, {{Pl.}} Te·t·ra·gra·phen
26 |
27 | {{Aussprache}}
28 | :{{IPA}} {{Lautschrift|tetʀaˈɡʀaːf}}
29 | :{{Hörbeispiele}} {{Audio|}}
30 | :{{Reime}} {{Reim|aːf|Deutsch}}
31 |
32 | {{Bedeutungen}}
33 | :[1] ''[[Linguistik]]:'' Folge von vier Buchstaben, die einen einzigen Laut repräsentieren
34 |
35 | {{Gegenwörter}}
36 | :[1] [[Digraph]], [[Trigraph]]
37 |
38 | {{Oberbegriffe}}
39 | :[1] [[Graph]]
40 |
41 | {{Beispiele}}
42 | :[1] Im Deutschen steht der ''Tetragraph'' "tsch" für den Laut [ʧ].
43 | :[1] „»Der obligatorische Wechsel zum Kyrillischen hat groteske Verrenkungen erforderlich gemacht, so die Verwendung von diakritischen Zeichen, von Digraphen, Trigraphen und sogar - zur Darstellung des entstimmten aspirierten labialisierten uvularen Plosivs im Kabardinischen - von einem ''Tetragraphen''«.“<ref>{{Literatur | Autor=Jonathan Littell | Titel=Die Wohlgesinnten | Verlag=Berliner Taschenbuch Verlag | Ort=Berlin | Jahr=2009 (französisches Original 2006)| ISBN=978-3-8333-0628-0}}, Seite 306f.</ref>
44 |
45 | ==== {{Übersetzungen}} ====
46 | {{Ü-Tabelle|Ü-links=
47 | *{{en}}: [1] {{Ü|en|}}
48 | *{{fr}}: [1] {{Ü|fr|}}
49 | |Ü-rechts=
50 | *{{es}}: [1] {{Ü|es|}}
51 | }}
52 |
53 | {{Referenzen}}
54 | :[1] {{Wikipedia|Digraph (Linguistik)#Trigraph, Tetragraph, …}}
55 |
56 | {{Quellen}}
57 |
58 | [[Kategorie:Fremdwort]]
59 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/MehrzahlHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import java.util.regex.Matcher;
21 |
22 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm;
23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalNumber;
24 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
25 |
26 | public class MehrzahlHandler extends PatternBasedIndexedParameterHandler {
27 |
28 | protected static final String MEHRZAHL_PATTERN =
29 | // endsWith(" (Mehrzahl)")
30 | "\\s\\(Mehrzahl\\)$|" +
31 | // endsWith(" (Mehrzahl 1)") || endsWith(" (Mehrzahl 2)") ||
32 | // endsWith(" (Mehrzahl 3)") || endsWith(" (Mehrzahl 4)")
33 | "\\s\\(Mehrzahl\\s([1-4])\\)$";
34 |
35 | public MehrzahlHandler(DEWordFormNounTableHandler nounTableHandler) {
36 | super(nounTableHandler, MEHRZAHL_PATTERN);
37 | }
38 |
39 | @Override
40 | public void handleIfFound(WiktionaryWordForm wordForm, String label, int index, String value, Matcher matcher,
41 | ParsingContext context) {
42 | wordForm.setNumber(GrammaticalNumber.PLURAL);
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/entry/Pronunciation.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.entry;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IPronunciation;
21 |
22 | /**
23 | * Default implementation of the {@link IPronunciation} interface.
24 | * See there for details.
25 | * @author Christian M. Meyer
26 | */
27 | public class Pronunciation implements IPronunciation {
28 |
29 | protected PronunciationType type;
30 | protected String text;
31 | protected String note;
32 |
33 | /** Creates a new, empty pronunciation. */
34 | public Pronunciation() {}
35 |
36 | /** Creates a new pronunciation for the given representation text,
37 | * notation type and addition information. For audio files, the
38 | * representation text refers to an audio file name. */
39 | public Pronunciation(final PronunciationType type,
40 | final String text, final String note) {
41 | this.type = type;
42 | this.text = text;
43 | this.note = note;
44 | }
45 |
46 | public PronunciationType getType() {
47 | return type;
48 | }
49 |
50 | public String getText() {
51 | return text;
52 | }
53 |
54 | public String getNote() {
55 | return note;
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Kunsttherapie.txt:
--------------------------------------------------------------------------------
1 | == Kunsttherapie ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Substantiv|Deutsch}}, {{f}} ===
3 |
4 | {{Deutsch Substantiv Übersicht
5 | |Nominativ Singular= die Kunsttherapie
6 | |Nominativ Plural=die Kunsttherapien
7 | |Genitiv Singular=der Kunsttherapie
8 | |Genitiv Plural=der Kunsttherapien
9 | |Dativ Singular=der Kunsttherapie
10 | |Dativ Plural=den Kunsttherapien
11 | |Akkusativ Singular=die Kunsttherapie
12 | |Akkusativ Plural=die Kunsttherapien
13 | }}
14 |
15 | {{Worttrennung}}
16 | :Kunst·the·ra·pie, {{Pl.}} Kunst·the·ra·pien
17 |
18 | {{Aussprache}}
19 | :{{Hörbeispiele}} {{fehlend}}, {{Pl.}} {{fehlend}}
20 | :{{IPA}} {{Lautschrift|ˈkʊnstteʀapiː}}, {{Pl.}} {{Lautschrift|ˈkʊnstteʀapiːən}}
21 |
22 | {{Bedeutungen}}
23 | :[1] Therapie mit bildnerischen Medien
24 |
25 | {{Bedeutungen}}
26 | :[[Determinativkompositum]] aus ''[[Kunst]]'' und ''[[Therapie]]''
27 |
28 | {{Oberbegriffe}}
29 | :[1] [[Therapie]]
30 |
31 | {{Unterbegriffe}}
32 | :[1] [[Maltherapie]], [[Gestaltungstherapie]]
33 |
34 | {{Beispiele}}
35 | :[1] Die ''Kunsttherapie'' wird zu den ''Künstlerischen Therapien'' gezählt. Sie ist eine u. a. in der [[Psychiatrie]] und [[Psychosomatik]] verbreitete therapeutische Disziplin.
36 |
37 |
38 | ==== Übersetzungen ====
39 | {{Ü-links}}
40 | *{{en}}: [1] {{Ü|en|art therapy}}
41 | *{{fr}}: [1] {{Ü|fr|Art-thérapie}}
42 | *{{he}}: [1] {{Ü|he|תרפיה בהבעה ויצירה}}
43 | *{{ca}}: [1] {{Ü|ca|Artteràpia}}
44 | {{Ü-Abstand}}
45 | *{{pl}}: [1] {{Ü|pl|Arteterapia}}
46 | *{{pt}}: [1] {{Ü|pt|Arte terapia}}
47 | *{{sv}}: [1] {{Ü|sv|konstterapi}}
48 | *{{sr}}: [1] {{Ü|sr|Арт терапија}}
49 | *{{sk}}: [1] {{Ü|sk|Arte terapia}}
50 | *{{es}}: [1] {{Ü|es|Arteterapia}}
51 | {{Ü-rechts}} <!-- für weitere Sprachkürzel siehe den Link unterhalb des Editierfensters -->
52 |
53 | {{Referenzen}}
54 | :[1] {{Wikipedia|Kunsttherapie}}
55 | :[1] {{Ref-DWDS|Kunsttherapie}}
56 | :[1] {{Ref-Canoo|Kunsttherapie}}
57 | :[1] {{Ref-UniLeipzig|Kunsttherapie}}
58 |
59 | [[pl:Kunsttherapie]]
60 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/as_much_as_possible.txt:
--------------------------------------------------------------------------------
1 | ==English==
2 |
3 | ===Adverb===
4 | {{en-adv|head=[[as]] [[much]] [[as]] [[possible]]|-}}
5 |
6 | # As [[much]] as is [[possible]].
7 |
8 | ====Usage notes====
9 | This is not an idiom. It is a particularly common instance of the general construction: "as X as Y", where both X and Y have a large range of possibilities. X can be an adjectival, an adverbial, or a quantifier determiner (much, little, many, few) and Y can a clauses or an ellipsis of a clause. The full clause for which "possible" is an ellipsis depends on the preceding verb. "He ate as much as possible" is an ellipsis for "He ate as much as it was possible for him to eat." When Y is a noun ("He ate as much as John."), the ellipsis is still for a clause ("He ate as much as John ate.").
10 |
11 | ====Translations====
12 | {{trans-top|as much as is possible}}
13 | * Arabic: {{t|ar|قَدْر اَلْمُسْتَطَاع}}, {{t|ar|قَدْر اَلْإِمْكَان}}
14 | * Chinese:
15 | *: Mandarin: {{t+|cmn|盡量|sc=Hani}} or {{t+|cmn|儘量|sc=Hani}}, {{t|cmn|尽量|tr=jǐnliàng|sc=Hani}}
16 | * Finnish: {{t|fi|niin paljon kuin mahdollista}}
17 | * French: {{t+|fr|autant que possible}}, {{t|fr|tout le possible}}, {{t|fr|le plus possible}}
18 | * German: {{t|de|so viel wie möglich}}, {{t|de|möglichst viel}}
19 | * Hebrew: {{t|he|ככל האפשר|tr=kekhol ha'efshar|sc=Hebr}}
20 | {{trans-mid}}
21 | * Japanese: {{t|ja|できるだけ|tr=dekiru dake|sc=Jpan}}, {{t|ja|有らん限り|tr=あらんかぎり, aran kagiri|sc=Jpan}}, {{t|ja|成るべく|tr=なるべく, naru-beku|sc=Jpan}}
22 | * Persian: {{t|fa|تا جای امکان|sc=fa-Arab}}, {{t|fa|حتیالامکان|tr=hattal-emkaan|sc=fa-Arab}}
23 | * Portuguese: {{t|pt|[[todo]] [[o]] [[possível]]}}, {{t|pt|[[o]] [[máximo]] [[possível]]}}
24 | * Russian: {{t|ru|как мо́жно бо́льше}}
25 | * Serbo-Croatian: {{t|sh|što je više moguće}}
26 | * Spanish: {{t|es|todo lo posible}}
27 | * Telugu: {{t|te|వీలైనంతవరకు|tr=veelainantavaraku|sc=Telu}}
28 | * Volapük: {{t+|vo|mögiküno}}
29 | {{trans-bottom}}
30 |
31 | [[Category:English phrasebook]]
32 |
33 | [[pt:as much as possible]]
34 | [[chr:as much as possible]]
35 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/IWiktionaryDumpParser.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser;
19 |
20 | import java.io.File;
21 |
22 | import de.tudarmstadt.ukp.jwktl.api.WiktionaryException;
23 |
24 | /**
25 | * Parser for Wiktionary dump files obtained from
26 | * http://download.wikimedia.org/backup-index.html.
27 | * @author Christian M. Meyer
28 | */
29 | public interface IWiktionaryDumpParser {
30 | public static final int BATCH_SIZE = 25000;
31 |
32 | /**
33 | * Starts the parsing of the given dump file. The file can be either
34 | * bzip2-compressed or the extracted XML version.
35 | *
36 | * @param dumpFile the dumpFile
37 | * @throws WiktionaryException in case of any parser errors.
38 | */
39 | void parse(final File dumpFile) throws WiktionaryException;
40 |
41 | /**
42 | * Register the given {@link IWiktionaryPageParser}. The registered
43 | * parser will then be notified once a Wiktionary-related XML tag
44 | * has been processed.
45 | */
46 | void register(final IWiktionaryPageParser pageParser);
47 |
48 | /**
49 | * Returns the list of all registered {@link IWiktionaryPageParser}s.
50 | */
51 | Iterable getPageParsers();
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/PatternBasedParameterHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import java.util.Objects;
21 | import java.util.regex.Matcher;
22 | import java.util.regex.Pattern;
23 |
24 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm;
25 | import de.tudarmstadt.ukp.jwktl.parser.util.IWiktionaryWordFormTemplateParameterHandler;
26 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
27 |
28 | public abstract class PatternBasedParameterHandler implements IWiktionaryWordFormTemplateParameterHandler {
29 |
30 | protected final Pattern pattern;
31 |
32 | public PatternBasedParameterHandler(String regex) {
33 | Objects.requireNonNull(regex, "regex must not be null.");
34 | this.pattern = Pattern.compile(regex);
35 | }
36 |
37 | @Override
38 | public void reset() {
39 | // Nothing to do
40 | }
41 |
42 | public boolean canHandle(String label, String value, WiktionaryWordForm wordForm, ParsingContext context) {
43 | if (label == null) {
44 | return false;
45 | }
46 | final Matcher matcher = pattern.matcher(label);
47 | return matcher.find();
48 | }
49 | }
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/constant/ContextLabel.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.constant;
17 |
18 | import java.util.Map;
19 | import java.util.HashMap;
20 |
/**
 * Contextual information for definitions, such as archaic, by analogy,
 * chemistry, etc.
 *
 * See http://en.wiktionary.org/wiki/Template_talk:context
 */
public abstract class ContextLabel {

    /** Maps each label code to the label name given at construction. */
    private static final Map<String, String> label2name = new HashMap<>();

    /** Maps each label code to the category given at construction. */
    private static final Map<String, String> label2category = new HashMap<>();

    /**
     * Records the name and the category of the given label in the
     * class-wide lookup tables. A later instance created with the same
     * label code replaces the earlier entries.
     *
     * @param label two (or more) letter label code, e.g. 'устар.', 'п.'
     * @param name label name, e.g. 'устарелое', 'переносное значение'
     * @param category category associated with this label
     */
    protected ContextLabel(String label, String name, String category) {
        label2name.put(label, name);
        label2category.put(label, category);
    }
}
48 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/mitreissen.txt:
--------------------------------------------------------------------------------
1 | == mitreißen ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Verb|Deutsch}}, ''unregelmäßig'' ===
3 |
4 | {{Verb-Tabelle
5 | |Gegenwart_ich=reiße mit
6 | |Gegenwart_du=reißt mit
7 | |Gegenwart_er, sie, es=reißt mit
8 | |1.Vergangenheit_ich=riss mit
9 | |Partizip II=mitgerissen
10 | |Konjunktiv II_ich=risse mit
11 | |Befehl_du=reiß mit!
12 | |Befehl_ihr=reißt mit!
13 | |Hilfsverb=haben
14 | |Weitere_Konjugationen= mitreißen (Konjugation)
15 | }}
16 |
17 | {{Alternative Schreibweisen}}
18 | :''[[Hilfe:Schweiz und Liechtenstein|Schweiz und Liechtenstein]]:'' [[mitreissen]]
19 |
20 | {{Worttrennung}}
21 | :mit·rei·ßen, {{Prät.}} riss mit, {{Part.}} mit·ge·ris·sen
22 |
23 | {{Aussprache}}
24 | :{{IPA}} {{Lautschrift|ˈmɪtˌʀaɪ̯sn̩}}, {{Prät.}} {{Lautschrift|ˌʀɪs ˈmɪt}}, {{Part.}} {{Lautschrift|ˈmɪtɡəˈʀɪsn̩}}
25 | :{{Hörbeispiele}} {{fehlend}}, {{Prät.}} {{fehlend}}, {{Part.}} {{fehlend}}
26 |
27 | {{Bedeutungen}}
28 | :[1] jemanden/etwas mit sich zerren
29 | :[2] Enthusiasmus verbreiten
30 |
31 | {{Sinnverwandte Wörter}}
32 | :[1] [[fortreißen]], [[wegreißen]]
33 | :[2] [[begeistern]]
34 |
35 | {{Oberbegriffe}}
36 | :[1] [[reißen]]
37 | :[2] [[beeindrucken]]
38 |
39 | {{Beispiele}}
40 | :[1] Der Felssturz ''riss'' sie ''mit'' in die Tiefe.
41 | :[2] Er ''riss'' die Menschenmenge ''mit.'' Der Enthusiasmus war groß.
42 |
43 | ==== Übersetzungen ====
44 | {{Ü-links}}
45 | *{{eu}}: [2] {{Ü|eu|poztu}}
46 | *{{en}}: [2] {{Ü|en|fill with enthusiasm}}
47 | *{{fr}}: {{in lateinischer Schrift}}[2] {{Ü|fr|enthousiasmer}}
48 | {{Ü-Abstand}}
49 | *{{no}}: {{Ü|no|dra}}/{{Ü|no|trekke}} {{Ü|no|med}} {{Ü|no|seg}}; [2] {{Ü|no|rive}} {{Ü|no|med}}, {{Ü|no|begeistre}}
50 | *{{sv}}: [1] ''refl.:'' {{Ü|sv|dra med sig}}, {{Ü|sv|svepa med sig}}; [1, 2] ''refl.:'' {{Ü|sv|rycka med sig}}
51 | *{{es}}: [2] {{Ü|es|apasionar}}, {{Ü|es|arrebatar}}
52 | {{Ü-rechts}}
53 |
54 | {{Referenzen}}
55 | :[1, 2] {{Ref-DWDS|mitreißen}}
56 | :[2] {{Ref-Canoo|mitrei%DFen}}
57 | :[2] {{Ref-UniLeipzig|mitrei%DFen}}
58 | :[1, 2] {{Ref-FreeDictionary|mitreißen}}
59 |
60 | {{Ähnlichkeiten}}
61 | :[[mitreisen]]
62 |
63 | [[ko:mitreißen]]
64 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/DEGenderText.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components;
19 |
20 | import java.text.MessageFormat;
21 | import java.util.Objects;
22 |
23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalGender;
24 |
25 | public enum DEGenderText {
26 |
27 | NULL(null, null),
28 | M("m", GrammaticalGender.MASCULINE),
29 | F("f", GrammaticalGender.FEMININE),
30 | N("n", GrammaticalGender.NEUTER),
31 | X("x", null),
32 | _0("0", null),
33 | PL("pl", null),
34 | P_L("Pl", null);
35 |
36 | private final String genderText;
37 | private final GrammaticalGender gender;
38 |
39 | private DEGenderText(String genderText, GrammaticalGender gender) {
40 | this.genderText = genderText;
41 | this.gender = gender;
42 | }
43 |
44 | public GrammaticalGender asGrammaticalGender() {
45 | return this.gender;
46 | }
47 |
48 | public static DEGenderText of(String genderText) {
49 | Objects.requireNonNull(genderText, "genderText must not be null");
50 | for (DEGenderText value : values()) {
51 | if (Objects.equals(genderText, value.genderText)) {
52 | return value;
53 | }
54 | }
55 | throw new IllegalArgumentException(MessageFormat.format("Unrecognized gender text [{0}].", genderText));
56 | }
57 | }
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/util/POSText.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.util;
17 |
18 | //import wikt.constant.POSType;
19 | import de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.constant.POS;
20 |
21 | /** Data structure consists of a POS code and the corresponding text. */
22 | public class POSText {
23 |
24 | /** Part of speech code of the text. */
25 | private POS pos;
26 |
27 | /** POS name found in text, e.g. explicitly: "Verb", or implicitly "stitch I". */
28 | //private String pos_name;
29 |
30 | /** Text */
31 | private StringBuffer text;
32 |
33 | public POSText() {}
34 |
35 | /*public POSText(POSType _pos) { //, StringBuffer _text) {
36 | pos = _pos;
37 | text = new StringBuffer();
38 | //text = _text;
39 | }*/
40 |
41 | //public POSText(POSType _pos, StringBuffer _text) {
42 | public POSText(POS _pos, String _text) {
43 | pos = _pos;
44 | text = new StringBuffer(_text);
45 | }
46 |
47 | public POSText(POS _pos, StringBuffer _text) {
48 | pos = _pos;
49 | text = _text;
50 | }
51 |
52 | public POS getPOSType() {
53 | return pos;
54 | }
55 |
56 | public StringBuffer getText() {
57 | return text;
58 | }
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENUsageNotesHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.en.components;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.entry.WikiString;
21 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryEntry;
22 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
23 |
24 | public class ENUsageNotesHandler extends ENBlockHandler {
25 | private StringBuilder usageNotes;
26 |
27 | public ENUsageNotesHandler() {
28 | super("Usage notes");
29 | }
30 |
31 | @Override
32 | public boolean processHead(String text, ParsingContext context) {
33 | usageNotes = new StringBuilder();
34 | return super.processHead(text, context);
35 | }
36 |
37 | @Override
38 | public boolean processBody(String textLine, ParsingContext context) {
39 | textLine = textLine.trim();
40 | if (!textLine.isEmpty()) {
41 | usageNotes.append(textLine).append("\n");
42 | }
43 | return super.processBody(textLine, context);
44 | }
45 |
46 | @Override
47 | public void fillContent(ParsingContext context) {
48 | if (usageNotes.length() > 0) {
49 | WiktionaryEntry entry = context.findEntry();
50 | if (entry == null) {
51 | throw new RuntimeException("entry is null");
52 | }
53 | entry.setUsageNotes(new WikiString(usageNotes.toString().trim()));
54 | }
55 | super.fillContent(context);
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/test/resources/articles-en/gumbo.txt:
--------------------------------------------------------------------------------
1 | ==English==
2 | {{wikipedia}}
3 | [[File:Bozogumbo.jpg|thumb|Gumbo (stew) with okra pods.]]
4 |
5 | ===Etymology===
6 | From {{etyl|bnt|en}} {{term|ngombo}}, {{term|kingombo||okra plant}}, possibly via {{etyl|gul|en}}.[Oxford American Dictionaries][The Chambers Dictionary, 1994, ISBN 0-550-10255-8] Cognate to {{etyl|pt|-}} {{term|quiabo|lang=pt}}, Caribbean {{etyl|es|-}} {{term|guingambó|lang=es}}, and cognates in other Romance languages.
7 |
8 | ===Pronunciation===
9 | * {{rhymes|ʌmbəʊ|lang=en}}
10 |
11 | ===Noun===
12 | {{en-noun|~|gumbos}}
13 |
14 | # {{context|countable|lang=en}} The [[okra]] plant or its pods.
15 | # {{context|uncountable|lang=en}} A soup or stew made with [[okra]].
16 | # {{context|uncountable|lang=en}} A fine [[silty]] [[soil]] that when wet becomes very thick and heavy.
17 | #* '''1909''', [[w:Ralph Connor|Ralph Connor]], ''The Foreigner'', ch. 11:
18 | #*: The team stuck fast in the black muck, and every effort to extricate them served only to imbed them more hopelessly in the sticky '''gumbo'''.
19 | #* '''1914''' April, "Making Good Roads by Firing Poor Ones," ''Popular Mechanics'', [http://books.google.ca/books?id=890DAAAAMBAJ&pg=PA567&dq=gumbo+caulk+OR+glue+OR+sticky+OR+adhesive+OR+gummy&hl=en&ei=e12LTuftL8LY0QG4osTkBA&sa=X&oi=book_result&ct=result&resnum=7&ved=0CF0Q6AEwBg#v=onepage&q=gumbo%20caulk%20OR%20glue%20OR%20sticky%20OR%20adhesive%20OR%20gummy&f=false p. 567]:
20 | #*: There are no poorer roads in all the United States than the "'''gumbo'''" roads of the south—'''gumbo''' being the name give a certain kind of mud or clay that is particularly sticky, clings tenaciously, seems to have no bottom, and will not support any weight.
21 | #* '''1950''' July 3, "[http://www.time.com/time/magazine/article/0,9171,812721,00.html Labor: Trouble at Lowland]," ''Time'':
22 | #*: The red '''gumbo''' soil uttered ugly sucking sounds at the touch of a man's boot.
23 |
24 | ====Synonyms====
25 | * {{sense|okra plant}} [[okra]], [[ladies' fingers]]
26 |
27 | ====Translations====
28 | {{trans-top|okra}}
29 | * Portuguese: [[quiabeiro]] {{g|m}} (''plant''), [[quiabo]] {{g|m}} (''pods'')
30 | {{trans-mid}}
31 | {{trans-bottom}}
32 |
33 | ===References===
34 |
35 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Brathaehnchen.txt:
--------------------------------------------------------------------------------
1 | >== Brathähnchen ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Substantiv|Deutsch}}, {{n}} ===
3 |
4 | {{Deutsch Substantiv Übersicht
5 | |Bild=Roast chicken.jpg|250px|1|Brathähnchen
6 | |Nominativ Singular=das Brathähnchen
7 | |Nominativ Plural=die Brathähnchen
8 | |Genitiv Singular=des Brathähnchens
9 | |Genitiv Plural=der Brathähnchen
10 | |Dativ Singular=dem Brathähnchen
11 | |Dativ Plural=den Brathähnchen
12 | |Akkusativ Singular=das Brathähnchen
13 | |Akkusativ Plural=die Brathähnchen
14 | }}
15 |
16 | {{Worttrennung}}
17 | :Brat·hähn·chen, {{Pl.}} Brat·hähn·chen
18 |
19 | {{Aussprache}}
20 | :{{IPA}} {{Lautschrift|ˈbʀaːthɛːnçən}}, {{Lautschrift|ˈbʀaːthɛːnçn̩}}, {{Pl.}} {{Lautschrift|ˈbʀaːthɛːnçən}}, {{Lautschrift|ˈbʀaːthɛːnçn̩}}
21 |
22 | {{Aussprache}}
23 | :{{Hörbeispiele}} {{fehlend}}, {{Pl.}} {{fehlend}}
24 |
25 | {{Bedeutungen}}
26 | :[1] ein am [[Grill]] oder im [[Backofen]] gebratenes [[Huhn]]
27 |
28 | {{Herkunft}}
29 | :[[Determinativkompositum]] aus dem Stamm des Verbs [[braten]] und [[Hähnchen]]
30 |
31 | {{Synonyme}}
32 | :[1] [[Brathendl]], [[Brathuhn]], [[Brathühnchen]], [[Broiler]], [[Grillhähnchen]], [[Gummiadler]]
33 |
34 | {{Oberbegriffe}}
35 | :[1] [[Fleischgericht]], [[Lebensmittel]]
36 |
37 | {{Unterbegriffe}}
38 | :[1] [[Huhn]], [[Hähnchen]], [[Hühnchen]]
39 |
40 | {{Beispiele}}
41 | :[1] Das ''Brathähnchen'' bitte mit Salat.
42 |
43 | {{Charakteristische Wortkombinationen}}
44 | :[1] ein ''Brathähnchen'' mit [[Brötchen]], [[Pommes frites]], [[Salat]]
45 |
46 | ==== Übersetzungen ====
47 | {{Ü-links}}
48 | *{{en}}: [1] {{Ü|en|roast chicken}}
49 | *{{fr}}: [1] {{Ü|fr|poulet rôti}} {{m}}
50 | *{{it}}: [1] {{Ü|it|}}
51 | {{Ü-Abstand}}
52 | *{{sv}}: [1] ''Grill:'' {{Ü|sv|grillad kyckling}}, ''Backofen:'' {{Ü|sv|ungsstekt kyckling}}
53 | *{{es}}: [1] {{Ü|es|pollo asado}}
54 | *{{hu}}: [1] {{Ü|hu|sültcsirke}}, {{Ü|hu|grillcsirke}}
55 | {{Ü-rechts}} <!-- für weitere Sprachkürzel siehe den Link unterhalb des Editierfensters -->
56 |
57 | {{Referenzen}}
58 | :[1] {{Wikipedia|Brathähnchen}}
59 | :[*] {{Ref-DWDS|Brathähnchen}}
60 | :[1] {{Ref-Canoo|Brath%E4hnchen}}
61 | :[1] {{Ref-UniLeipzig|Brath%E4hnchen}}
62 |
63 | [[ko:Brathähnchen]]
64 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Mockumentary.txt:
--------------------------------------------------------------------------------
1 | == Mockumentary ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Substantiv|Deutsch}}, {{mfn}} ===
3 |
4 | {{Deutsch Substantiv Übersicht
5 | |Genus 1=m
6 | |Genus 2=f
7 | |Genus 3=n
8 | |Nominativ Singular 1=Mockumentary
9 | |Nominativ Singular 2=Mockumentary
10 | |Nominativ Singular 3=Mockumentary
11 | |Nominativ Plural=Mockumentarys
12 | |Genitiv Singular 1=Mockumentary
13 | |Genitiv Singular 2=Mockumentary
14 | |Genitiv Singular 2*=Mockumentarys
15 | |Genitiv Singular 3=Mockumentary
16 | |Genitiv Singular 3*=Mockumentarys
17 | |Genitiv Plural=Mockumentarys
18 | |Dativ Singular 1=Mockumentary
19 | |Dativ Singular 2=Mockumentary
20 | |Dativ Singular 3=Mockumentary
21 | |Dativ Plural=Mockumentarys
22 | |Akkusativ Singular 1=Mockumentary
23 | |Akkusativ Singular 2=Mockumentary
24 | |Akkusativ Singular 3=Mockumentary
25 | |Akkusativ Plural=Mockumentarys
26 | }}
27 |
28 | {{Worttrennung}}
29 | :Mo·cku·men·ta·ry, {{Pl.}} Mo·cku·men·ta·rys
30 |
31 | {{Aussprache}}
32 | :{{IPA}} {{Lautschrift|mɔkjuˈmɛntəʀi}}
33 | :{{Hörbeispiele}} {{Audio|}}
34 |
35 | {{Bedeutungen}}
36 | :[1] Film, der eine Dokumentation oder das Genre selbst parodiert
37 |
38 | {{Herkunft}}
39 | :von gleichbedeutend englisch ''{{Ü|en|mockumentary}}'' entlehnt, das ein Kofferwort aus ''{{Ü|en|mock}}'' „unecht“ und ''{{Ü|en|documentary}}'' „Dokumentatarfilm“ ist<ref>{{Ref-Duden}}</ref>
40 |
41 | {{Oberbegriffe}}
42 | :[1] [[Film]]
43 |
44 | {{Beispiele}}
45 | :[1] „Zuletzt lief von Stein eine ''Mockumentary'' über das "wahre" Ende der DDR im Fernsehen.“<ref>{{Per-Spiegel Online | Online=http://www.spiegel.de/kultur/tv/tatort-hal-aus-stuttgart-big-data-in-little-stuttgart-a-1103576.html | Autor=Christian Buß | Titel=Stuttgart-"Tatort" über künstliche Intelligenz: Mensch, Technik, Katastrophe | Tag=26 | Monat=08 | Jahr=2016 | Zugriff=2017-08-24 }}</ref>
46 |
47 | ==== {{Übersetzungen}} ====
48 | {{Ü-Tabelle|Ü-links=
49 | *{{en}}: [1] {{Ü|en|mockumentary}}
50 | *{{fr}}: [1] {{Ü|fr|}}
51 | |Ü-rechts=
52 | *{{it}}: [1] {{Ü|it|}}
53 | *{{es}}: [1] {{Ü|es|}}
54 | }}
55 |
56 | {{Referenzen}}
57 | :[1] {{Wikipedia}}
58 | :[*] {{Ref-DWDS}}
59 | :[1] {{Ref-Duden}}
60 | :[*] {{Ref-UniLeipzig}}
61 |
62 | {{Quellen}}
63 |
64 | [[Kategorie:Entlehnung aus dem Englischen (Deutsch)]]
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/DEWiktionaryEntryParserTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de;
19 |
20 | import java.io.BufferedReader;
21 | import java.io.File;
22 | import java.io.FileInputStream;
23 | import java.io.IOException;
24 | import java.io.InputStreamReader;
25 |
26 | import junit.framework.TestCase;
27 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage;
28 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryPage;
29 | import de.tudarmstadt.ukp.jwktl.parser.WiktionaryEntryParser;
30 |
31 | /**
32 | * Abstract test case for German Wiktionary parsers.
33 | * @author Christian M. Meyer
34 | */
35 | public abstract class DEWiktionaryEntryParserTest extends TestCase {
36 |
37 | protected IWiktionaryPage parse(final String fileName) throws IOException {
38 | StringBuilder text = new StringBuilder();
39 | BufferedReader reader = new BufferedReader(
40 | new InputStreamReader(new FileInputStream(
41 | new File("src/test/resources/articles-de/" + fileName)),
42 | "UTF-8"));
43 | String line;
44 | while ((line = reader.readLine()) != null)
45 | text.append(line).append("\n");
46 | reader.close();
47 | WiktionaryPage result = new WiktionaryPage();
48 | result.setTitle(fileName);
49 | WiktionaryEntryParser parser = new DEWiktionaryEntryParser();
50 | parser.parse(result, text.toString());
51 | return result;
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalAspect.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
21 |
/**
 * The grammatical aspect that a {@link IWiktionaryWordForm} can carry.
 * Tense and verb aspect frequently occur together (e.g., present
 * perfect); such combinations can be expressed by pairing a value of
 * this enumeration with one from {@link GrammaticalTense}.
 * @author Christian M. Meyer
 */
public enum GrammaticalAspect {

    /**
     * A situation that is ongoing, habitual or repeated. Covers the
     * English simple forms (e.g., "I paint the house") as well as the
     * progressive forms ("I am painting the house"), and is also used
     * for the German "Partizip I" form (e.g. "die liebende Mutter").
     */
    IMPERFECT,

    /**
     * A situation that has been completed. Covers the English perfect
     * forms (e.g., "I have painted the house"), and is also used for
     * the German "Partizip II" form (e.g., "die geliebte Mutter").
     */
    PERFECT

    /*
     * Further aspect names noted by the original author; none of them
     * is defined as a constant here:
     *   Perfective
     *     Aorist
     *     Momentane
     *     Semelfactive
     *   Imperfective
     *     Continuous and progressive
     *     Durative
     *     Imperfect
     *     Iterative/distributive/frequentative
     */

}
55 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENDescendantRelationHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.en.components;
19 |
20 | import java.util.ArrayList;
21 | import java.util.List;
22 |
23 | import de.tudarmstadt.ukp.jwktl.api.RelationType;
24 |
25 | public class ENDescendantRelationHandler extends ENRelationHandler {
26 | public ENDescendantRelationHandler(String... labels) {
27 | super(RelationType.DESCENDANT, labels);
28 | }
29 |
30 | @Override
31 | protected WordList parseWordList(String text) {
32 | WordList list = super.parseWordList(text);
33 | if (list.size() > 1) {
34 | return new WordList(list.comment, fixDescendantWordList(list.words));
35 | } else {
36 | return list;
37 | }
38 | }
39 |
40 | private static List fixDescendantWordList(List wordList) {
41 | String firstWord = wordList.get(0);
42 | final int colon = (firstWord == null ? -1 : firstWord.indexOf(':'));
43 | if (colon != -1) {
44 | List fixed = new ArrayList<>(wordList.size());
45 | fixed.add(firstWord);
46 |
47 | String language = firstWord.substring(0, colon);
48 | for (int i = 1; i < wordList.size(); i++) {
49 | String word = wordList.get(i);
50 | if (word.indexOf(':') == -1)
51 | fixed.add(language + ": " + word);
52 | else
53 | fixed.add(word);
54 | }
55 | return fixed;
56 | } else {
57 | return wordList;
58 | }
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/EinzahlHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import java.util.regex.Matcher;
21 |
22 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm;
23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalNumber;
24 | import de.tudarmstadt.ukp.jwktl.parser.de.components.DEGenderText;
25 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
26 |
27 | public class EinzahlHandler extends PatternBasedIndexedParameterHandler {
28 |
29 | protected static final String EINZAHL_PATTERN =
30 | // endsWith(" (Einzahl)")
31 | " \\(Einzahl\\)$|" +
32 | // endsWith(" (Einzahl 1)") || endsWith(" (Einzahl 2)") ||
33 | // endsWith(" (Einzahl 3)") || endsWith(" (Einzahl 4)")
34 | " \\(Einzahl\\s([1-4])\\)$";
35 |
36 | public EinzahlHandler(DEWordFormNounTableHandler nounTableHandler) {
37 | super(nounTableHandler, EINZAHL_PATTERN);
38 | }
39 |
40 | @Override
41 | public void handleIfFound(WiktionaryWordForm wordForm, String label, int index, String value, Matcher matcher,
42 | ParsingContext context) {
43 | wordForm.setNumber(GrammaticalNumber.SINGULAR);
44 | final DEGenderText genderText = this.nounTableHandler.getGenusByIndex(index);
45 | if (genderText != null) {
46 | wordForm.setGender(genderText.asGrammaticalGender());
47 | }
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/WiktionaryTestCase.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl;
19 |
20 | import java.io.File;
21 | import java.util.logging.LogManager;
22 |
23 | import junit.framework.TestCase;
24 |
25 | /**
26 | * Abstract test case for JWKTL.
27 | * @author Christian M. Meyer
28 | */
29 | public abstract class WiktionaryTestCase extends TestCase {
30 |
31 | protected static final File RESOURCE_PATH = new File("src/test/resources");
32 |
33 | protected File workDir;
34 |
35 | @Override
36 | protected void setUp() throws Exception {
37 | super.setUp();
38 | workDir = new File("target/test-output/"
39 | + getClass().getName() + "_" + this.getName());
40 | deleteDirectory(workDir);
41 | workDir.mkdir();
42 | }
43 |
44 | @Override
45 | protected void tearDown() throws Exception {
46 | deleteDirectory(workDir);
47 | super.tearDown();
48 | }
49 |
50 | protected static boolean deleteDirectory(final File path) {
51 | if (path.exists()) {
52 | File[] files = path.listFiles();
53 | for (File file : files)
54 | if (file.isDirectory()) {
55 | if (!deleteDirectory(file))
56 | System.err.println("Unable to delete dir: " + file);
57 | } else {
58 | if (!file.delete())
59 | System.err.println("Unable to delete file: " + file);
60 | }
61 | }
62 | return path.delete();
63 | }
64 |
65 | static {
66 | LogManager.getLogManager().reset();
67 | }
68 |
69 | }
70 |
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/parser/util/PatternUtilsTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.util;
19 |
20 | import java.util.regex.Matcher;
21 | import java.util.regex.Pattern;
22 |
23 | import junit.framework.TestCase;
24 |
25 | /**
26 | * Test case for {@link PatternUtils}.
27 | *
28 | * @author Alexey Valikov
29 | */
30 | public class PatternUtilsTest extends TestCase {
31 |
32 | /***/
33 | public void testExtractIndex() {
34 | Pattern pattern = Pattern.compile("^Group$|^Group\\s([1-9,a-z])$");
35 | try {
36 | assertEquals(null, PatternUtils.extractIndex(matcher(pattern, "Puorg")));
37 | fail("Extracting index from non-matched matcher must fail.");
38 | } catch (IllegalArgumentException iaex) {
39 | assertTrue(true);
40 | }
41 | assertEquals(null, PatternUtils.extractIndex(matcher(pattern, "Group")));
42 | assertEquals(Integer.valueOf(1), PatternUtils.extractIndex(matcher(pattern, "Group 1")));
43 | assertEquals(Integer.valueOf(8), PatternUtils.extractIndex(matcher(pattern, "Group 8")));
44 | try {
45 | PatternUtils.extractIndex(matcher(pattern, "Group q"));
46 | fail("Extracting index from non-integer group must fail.");
47 | } catch (NumberFormatException nfex) {
48 | assertTrue(true);
49 | }
50 | }
51 |
52 | private static Matcher matcher(Pattern pattern, String str) {
53 | Matcher matcher = pattern.matcher(str);
54 | matcher.find();
55 | return matcher;
56 | }
57 | }
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Thulium.txt:
--------------------------------------------------------------------------------
1 | {{Periodensystem}}
2 | == Thulium ({{Sprache|Deutsch}}) ==
3 | === {{Wortart|Substantiv|Deutsch}}, {{n}} ===
4 | {{Substantiv-Tabelle|
5 | Bild=Tm-TableImage.png|210px|1|Thulium im Periodensystem
6 | |Wer oder was? (Einzahl)=das Thulium
7 | |Wer oder was? (Mehrzahl)=—
8 | |Wessen? (Einzahl)=des Thuliums
9 | |Wessen? (Mehrzahl)=—
10 | |Wem? (Einzahl)=dem Thulium
11 | |Wem? (Mehrzahl)=—
12 | |Wen? (Einzahl)=das Thulium
13 | |Wen? (Mehrzahl)=—
14 | }}
15 | {{Silbentrennung}} Thu·li·um, {{Pl.}} ''kein Plural''
16 |
17 | {{Aussprache}}
18 | :[[Hilfe:IPA|IPA]]: {{Lautschrift|'tuːli̯ʊm}}
19 | :[[Hilfe:Hörbeispiele|Hörbeispiele]]: {{fehlend}}
20 |
21 | {{Bedeutungen}}
22 | :[1] [[chemisch]]es [[Element]] mit der Ordnungszahl 69, das zu den [[Lanthanoid]]en gehört
23 |
24 | {{Abkürzungen}}
25 | :[1] [[Tm]] ''(chemisches Zeichen)''
26 |
27 | {{Herkunft}}
28 | :[1] nach Thule, dem mythischen Namen für Skandinavien
29 |
35 |
36 | {{Oberbegriffe}}
37 | :[1] [[Lanthanoid]], [[Metall]], chemisches [[Element]]
38 |
41 |
42 | {{Beispiele}}
43 | :[1] In der [[Natur]] kommt '''Thulium''' nur in [[Verbindung]]en vor.
44 |
45 | {{Charakteristische Wortkombinationen}}
46 |
47 | {{Abgeleitete Begriffe}}
48 | :[1]
49 |
50 | ==== Übersetzungen ====
51 | {{Ü-links}}
52 | *{{ar}}: [1] {{Ü|ar|ثليوم}}
53 | *{{hy}}: [1] {{Ü|hy|թուլիում}} (tulium)
54 | *{{zh}}: [1] {{Ü|zh|铥}}
55 | *{{en}}: [1] {{Ü|en|thulium}}
56 | *{{fr}}: [1] {{Ü|fr|thulium}} ''m''
57 | *{{he}}: [1] {{Ü|he|תוליום}} (tulium)
58 | *{{it}}: [1] {{Ü|it|tulio}}
59 | {{Ü-Abstand}}
60 | *{{lt}}: [1] {{Ü|lt|tulis}}
61 | *{{nl}}: [1] {{Ü|nl|thulium}}
62 | *{{pl}}: [1] {{Ü|pl|tul}}
63 | *{{pt}}: [1] {{Ü|pt|túlio}}
64 | *{{ru}}: [1] {{Ü|ru|тулий}} (tulij)
65 | *{{sv}}: [1] {{Ü|sv|tulium}}
66 | *{{es}}: [1] {{Ü|es|tulio}}
67 | {{Ü-rechts}}
68 |
69 | {{Referenzen}}
70 | :[1] {{Wikipedia|Thulium}}
71 | :[1] {{Ref-DWDS|Thulium}}
72 | :[1] {{Ref-Canoo|Thulium}}
73 | :[1] {{Ref-UniLeipzig|Thulium}}
74 |
75 | [[Kategorie:Illustration]]
76 |
77 | [[cs:Thulium]]
78 | [[en:Thulium]]
79 | [[fr:Thulium]]
80 | [[lt:Thulium]]
81 | [[pl:Thulium]]
82 | [[ro:Thulium]]
83 | [[ru:Thulium]]
84 |
85 |
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/DESenseExampleHandlerTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de;
19 |
20 | import java.util.Iterator;
21 |
22 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry;
23 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryExample;
24 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage;
25 | import de.tudarmstadt.ukp.jwktl.parser.de.components.DESenseExampleHandler;
26 |
27 | /**
28 | * Test case for {@link DESenseExampleHandler}.
29 | * @author Christian M. Meyer
30 | */
31 | public class DESenseExampleHandlerTest extends DEWiktionaryEntryParserTest {
32 |
33 | /***/
34 | public void testRuettelstreifen() throws Exception {
35 | IWiktionaryPage page = parse("Ruettelstreifen.txt");
36 | IWiktionaryEntry entry = page.getEntry(0);
37 | Iterator exampleIter = entry.getSense(1).getExamples().iterator();
38 | assertEquals("„Eine wirksame Maßnahme die Verkehrssicherheit zu steigern, sind z.B.: profilierte Fahrbahnmarkierungen oder ''Rüttelstreifen'' auf der Standspur.“", exampleIter.next().getText());
39 | assertEquals("„''Rüttelstreifen'' am Fahrbahnrand von Autobahnen können die Zahl übermüdungsbedingter Verkehrsunfälle deutlich reduzieren.“", exampleIter.next().getText());
40 | assertEquals("„Schwere Autobahn-Unfälle können mit Hilfe von sogenannten ''Rüttelstreifen'' deutlich verringert werden.“", exampleIter.next().getText());
41 | assertFalse(exampleIter.hasNext());
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/DEEtymologyHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.entry.WikiString;
21 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryEntry;
22 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
23 |
24 | /**
25 | * Parser component for extracting etymological information from the
26 | * German Wiktionary.
27 | * @author Christian M. Meyer
28 | * @author Lizhen Qu
29 | */
30 | public class DEEtymologyHandler extends DEBlockHandler {
31 |
32 | protected StringBuilder etymology;
33 |
34 | /** Initializes the block handler for parsing all sections starting with
35 | * one of the specified labels. */
36 | public DEEtymologyHandler() {
37 | super("Herkunft");
38 | }
39 |
40 | @Override
41 | public boolean processHead(final String textLine, final ParsingContext context) {
42 | etymology = new StringBuilder();
43 | return super.processHead(textLine, context);
44 | }
45 |
46 | @Override
47 | public boolean processBody(String textLine, final ParsingContext context) {
48 | textLine = textLine.trim();
49 | if (!textLine.isEmpty())
50 | etymology.append(textLine);
51 | return false;
52 | }
53 |
54 | public void fillContent(final ParsingContext context) {
55 | if (etymology.length() > 0) {
56 | WiktionaryEntry posEntry = context.findEntry();
57 | posEntry.setWordEtymology(new WikiString(etymology.toString()));
58 | }
59 | }
60 |
61 | }
62 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/PatternBasedIndexedParameterHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import java.util.Objects;
21 | import java.util.regex.Matcher;
22 |
23 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm;
24 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
25 | import de.tudarmstadt.ukp.jwktl.parser.util.PatternUtils;
26 |
27 | public abstract class PatternBasedIndexedParameterHandler extends PatternBasedParameterHandler {
28 |
29 | protected final DEWordFormNounTableHandler nounTableHandler;
30 |
31 | public PatternBasedIndexedParameterHandler(DEWordFormNounTableHandler nounTableHandler, String regex) {
32 | super(regex);
33 | Objects.requireNonNull(nounTableHandler, "nounTableHandler must not be null.");
34 | this.nounTableHandler = nounTableHandler;
35 | }
36 |
37 | public void handle(String label, String value, WiktionaryWordForm wordForm, ParsingContext context) {
38 | final Matcher matcher = pattern.matcher(label);
39 | if (matcher.find()) {
40 | final Integer index = PatternUtils.extractIndex(matcher);
41 | final int i = index == null ? 1 : index.intValue();
42 | handleIfFound(wordForm, label, i, value, matcher, context);
43 | }
44 | }
45 |
46 | public abstract void handleIfFound(WiktionaryWordForm wordForm, String label, int index, String value, Matcher matcher,
47 | ParsingContext context);
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/src/test/resources/articles-ru/lodka.txt:
--------------------------------------------------------------------------------
1 | {{wikipedia}}
2 | = {{-ru-}} =
3 | ===Морфологические и синтаксические свойства===
4 | {{сущ ru f ina 3*a
5 | |основа=ло́дк
6 | |основа1=ло́док
7 | |слоги={{по-слогам|ло́д|ка}}
8 | }}
9 |
10 | {{морфо||лодк||а}} {{Тихонов}}
11 |
12 | ===Произношение===
13 | {{transcription|ˈlotkə}} {{медиа|Ru-лодка.ogg}}
14 |
15 | ===Семантические свойства===
16 | {{илл|Kayaks and canoes from above.jpg|Лодки[1]|size=200px}}
17 | ====Значение====
18 | # водное транспортное средство, небольшое [[судно]], идущее на вёслах, под [[парус]]ом или на моторной тяге {{пример|Мы все уселись в {{выдел|лодку}} и подъехали к левому берегу, ища места, где бы высадиться.|Джером К. Джером|Трое в одной лодке, не считая собаки|перев=М. Салье}}
19 |
20 | ====Синонимы====
21 | #
22 |
23 | ====Антонимы====
24 | #
25 |
26 | ====Гиперонимы====
27 | # [[судно]]
28 |
29 | ====Гипонимы====
30 | # [[баркас]], [[ялик]], [[шлюпка]], [[бот]], [[вельбот]], [[гичка]], [[байдарка]], [[берестянка]]
31 |
32 | ===Родственные слова===
33 | {{родств-блок
34 | |имена-собственные=
35 | |существительные=[[лодочка]], [[лодчонка]], [[подлодка]]
36 | |прилагательные=[[лодочный]]
37 | |глаголы=
38 | |наречия=
39 | }}
40 | ===Фразеологизмы и устойчивые сочетания===
41 | * [[подводная лодка]]
42 | * [[летающая лодка]]
43 | * [[канонерская лодка]]
44 |
45 | ===Загадки===
46 | * [[в лесу родилась, на воде живёт]]
47 |
48 | ===Этимология===
49 | Происходит от слова [[ладья]], далее от {{этимология:лодка}}
50 |
51 | ===Перевод===
52 | {{перев-блок||
53 | |en=[[boat]], [[dinghy]], [[gig]], [[yawl]]
54 | |br=[[bag]]
55 | |vep=[[veneh]]
56 | |vro=[[vineh]], [[loodsik]]
57 | |es=[[lancha]], [[bote]], [[barca]], [[canoa]], [[yola]]
58 | |it=[[barca]], [[canotto]], [[lancia]]
59 | |krl=[[veneh]]
60 | |mdf=[[венеж]]
61 | |de=[[Boot]] n -(e)s, -e
62 | |art=[[ilo tawa telo]]
63 | |fi=[[vene]]
64 | |fr=[[bateau]], [[canot]], [[barque]], [[embarcation]]
65 | |myv=[[венч]]
66 | |eo=[[boato]]
67 | |et=[[paat]], [[lootsik]], (''чёлн'') [[vene]]
68 | }}
69 | {{длина слова|5}}
70 |
71 | [[Категория:Суда]]
72 |
73 | [[bg:лодка]]
74 | [[de:лодка]]
75 | [[el:лодка]]
76 | [[en:лодка]]
77 | [[fi:лодка]]
78 | [[fr:лодка]]
79 | [[fy:лодка]]
80 | [[hu:лодка]]
81 | [[hy:лодка]]
82 | [[io:лодка]]
83 | [[ko:лодка]]
84 | [[li:лодка]]
85 | [[lo:лодка]]
86 | [[nl:лодка]]
87 | [[pl:лодка]]
88 | [[pt:лодка]]
89 | [[ro:лодка]]
90 | [[tr:лодка]]
91 | [[vi:лодка]]
92 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/multi/en/WRedirectEn.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.multi.en;
17 |
18 | import java.util.regex.Matcher;
19 | import java.util.regex.Pattern;
20 |
/** Redirect related functions in wiki and English Wiktionary.
 *
 * see http://en.wiktionary.org/wiki/Wiktionary:Redirections
 */
public class WRedirectEn {

    /** Matches a redirect directive (case-insensitively); group 1 captures
     * the page name between the double brackets. */
    private final static Pattern ptrn_redirect = Pattern.compile(
            "#REDIRECT \\[\\[(.+?)\\]\\]",
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /** Checks whether this is a redirect page. If this is true then
     * the title of the target (redirected) page will be returned.
     *
     * @param page_title word which is described in this article
     *                   (not used by this method)
     * @param text defines source wiki text; may be null
     * @return the target page title, or null if this is not a redirect
     */
    public static String getRedirect(String page_title,
            StringBuffer text) {

        // #REDIRECT [[pagename]] (or #redirect [[pagename]])

        // Cheap pre-check: "#REDIRECT [[".length() == 12, and a redirect
        // always starts with '#'.
        if (text == null || text.length() < 12 || text.charAt(0) != '#')
            return null;

        Matcher m = ptrn_redirect.matcher(text);
        if (m.find()) {
            // The pattern has exactly one capturing group; asking for
            // group 2 would throw an IndexOutOfBoundsException.
            return m.group(1);
        }

        return null;
    }

}
57 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Generaladmiral.txt:
--------------------------------------------------------------------------------
1 | == Generaladmiral ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Substantiv|Deutsch}}, {{m}} ===
3 |
4 | {{Deutsch Substantiv Übersicht
5 | |Genus=m
6 | |Nominativ Singular=Generaladmiral
7 | |Nominativ Plural 1=Generaladmirale
8 | |Nominativ Plural 2=Generaladmiräle
9 | |Genitiv Singular=Generaladmirals
10 | |Genitiv Plural 1=Generaladmirale
11 | |Genitiv Plural 2=Generaladmiräle
12 | |Dativ Singular=Generaladmiral
13 | |Dativ Singular*=Generaladmirale
14 | |Dativ Plural 1=Generaladmiralen
15 | |Dativ Plural 2=Generaladmirälen
16 | |Akkusativ Singular 1=Generaladmiral
17 | |Akkusativ Plural 1=Generaladmirale
18 | |Akkusativ Plural 2=Generaladmiräle
19 | |Bild=Generaladmiral Ehrensvärd.gif|mini|2|Der schwedische ''Generaladmiral'' Carl August Ehrensvärd
20 | }}
21 |
22 | {{Worttrennung}}
23 | :Ge·ne·ral·ad·mi·ral, {{Pl.}} Ge·ne·ral·ad·mi·ra·le
24 |
25 | {{Aussprache}}
26 | :{{IPA}} {{Lautschrift|ɡenəˈʀaːlʔatmiˌʀaːl}}
27 | :{{Hörbeispiele}} {{Audio|De-Generaladmiral.ogg}}
28 |
29 | {{Bedeutungen}}
30 | :[1] {{K|Kaiserliche Marine|Kriegsmarine|Reichsmarine}} Zweithöchster Admiralsrang, unter dem [[Großadmiral]] und dem [[Admiral]]
31 | :[3] {{K|Marine}} Titel des jeweilig ältesten Admirals im 17. und 18. Jahrhundert
32 | :[2] {{K|Kaiserliche Marine|Kriegsmarine|Reichsmarine}} Admiral im Range eines Generaladmirals
33 |
34 | {{Herkunft}}
35 | :[[Determinativkompositum]] aus den [[Substantiven]] ''[[General]]'' und ''[[Admiral]]''
36 |
37 | {{Synonyme}}
38 | :[1] ''[[Heer]], [[Luftwaffe]]:'' [[Generaloberst]]
39 |
40 | {{Oberbegriffe}}
41 | :[1] [[Admiralsrang]]
42 | :[2] [[Admiral]]
43 |
44 | {{Beispiele}}
45 | :[1] Die Deutsche Marine kennt keinen ''Großadmiral.''
46 | :[2] Nicht viele schafften es, ''Großadmiral'' zu werden.
47 | :[3] Gegen Mittag trifft der Führer gemeinsam mit ''Generaladmiral'' Raeder ein.<ref>o. A.: ''1939'', in: Manfred Overresch und Friedrich Wilhelm Saal (Hgg.): ''Deutsche Geschichte von Tag zu Tag 1918-1949'', 2000 [1983], S. 2772</ref>
48 |
49 | ==== {{Übersetzungen}} ====
50 | {{Ü-Tabelle|Ü-links=
51 | *{{en}}: [1] {{Ü|en|}}
52 | *{{fr}}: [1] {{Ü|fr|}}
53 | |Ü-rechts=
54 | *{{it}}: [1] {{Ü|it|}}
55 | *{{es}}: [1] {{Ü|es|}}
56 | }}
57 |
58 | {{Referenzen}}
59 | :[1–3] {{Wikipedia|Generaladmiral}}
60 | :[1–3] {{Ref-DWDS|Generaladmiral}}
61 | :[*] {{Ref-Canoo|Generaladmiral}}
62 | :[*] {{Ref-OWID|Generaladmiral}}
63 | :[1–3] {{Ref-Duden|Generaladmiral}}
64 |
65 | {{Quellen}}
66 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/harness.txt:
--------------------------------------------------------------------------------
1 | == harness ({{Sprache|Englisch}}) ==
2 | === {{Wortart|Substantiv|Englisch}} ===
3 |
4 | {{erweitern|Beispiel|Englisch}}
5 |
6 | {{Englisch Substantiv Übersicht
7 | |Bild 1=Harness (PSF).png|220px|2|horse ''harness''
8 | |Bild 2=PSM V39 D327 Crompton thirty six harness worsted loom.jpg|220px|6|''harness''
9 | |Singular=the harness
10 | |Plural=the harnesses
11 | }}
12 |
13 | {{Worttrennung}}
14 | :har·ness, {{Pl.}} har·nesses
15 |
16 | {{Aussprache}}
17 | :{{IPA}} {{Lautschrift|ˈhɑ:nɪs}}, {{amer.|:}} {{Lautschrift|ˈhɑ:rnɪs}}, {{Pl.}} {{Lautschrift|…}}
18 | :{{Hörbeispiele}} {{Audio|En-us-harness.ogg|harness (amerikanisch)}}, {{Pl.}} {{fehlend}}
19 |
20 | {{Bedeutungen}}
21 | :[1] das Zuggeschirr, Geschirr, Gurtzeug, Harnisch, Beschirrung
22 | :[2] das Pferdegeschirr
23 | :[3] das Zaumzeug
24 | :[4] der Kabelstrang
25 | :[5] der Klettergurt
26 | :[6] das Webgeschirr
27 | :[7] ''umgangssprachlich:'' täglliche Routine, Alltagstrott
28 | :[8] der Harnisch, die Rüstung
29 |
30 | {{Beispiele}}
31 | :[1]
32 |
33 | {{Redewendungen}}
34 | :[[double harness]]
35 | :[[in harness]]
36 |
37 | {{Abgeleitete Begriffe}}
38 | :[[baby harness]], [[harness assembly]], [[harness attachment]], [[saventy harness]], [[wiring harness]], [[harnesser]], [[harnessless]], [[harnesslike]], [[reharness]], [[well-harnessed]], [[cable harness]], [[wire harness]], [[harness horse]]
39 |
40 | ==== Übersetzungen ====
41 | {{Ü-links}}
42 | *{{de}}: [1] [[Zuggeschirr]], [[Geschirr]], [[Gurtzeug]], [[Harnisch]], [[Beschirrung]]; [2] das [[Pferdegeschirr]]; [3] das [[Zaumzeug]] [4] der [[Kabelstrang]] [5] der [[Klettergurt]] [6] das [[Webgeschirr]]; [7] [[Alltagstrott]]; [8] [[Harnisch]], [[Rüstung]]
43 | {{Ü-rechts}}
44 |
45 | {{Referenzen}}
46 | :[1,2,5,8] {{Wikipedia|spr=en|harness}}
47 | :[1-3,6,7] {{Ref-MWD|harness}}
48 | :[1,2,4,6,8] {{Ref-Dictionary|harness}}
49 | :[1,2,5,7] {{Ref-Pons|en|harness}}
50 | :[1–5] {{Ref-dictcc|harness}}
51 | :[1–6] {{Ref-Leo|en|harness}}
52 |
53 | [[cs:harness]]
54 | [[en:harness]]
55 | [[eo:harness]]
56 | [[et:harness]]
57 | [[fi:harness]]
58 | [[fr:harness]]
59 | [[hu:harness]]
60 | [[id:harness]]
61 | [[io:harness]]
62 | [[it:harness]]
63 | [[kn:harness]]
64 | [[ko:harness]]
65 | [[mg:harness]]
66 | [[ml:harness]]
67 | [[my:harness]]
68 | [[pl:harness]]
69 | [[pt:harness]]
70 | [[sv:harness]]
71 | [[ta:harness]]
72 | [[te:harness]]
73 | [[vi:harness]]
74 | [[zh:harness]]
75 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/multi/ru/WRedirectRu.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.multi.ru;
17 |
18 | import java.util.regex.Matcher;
19 | import java.util.regex.Pattern;
20 |
/** Redirect related functions in wiki and Russian Wiktionary.
 *
 * see http://ru.wiktionary.org/wiki/Викисловарь:Перенаправления
 */
public class WRedirectRu {

    /** Redirect directive in its English or Russian spelling, matched
     * case-insensitively. Group 1 is the keyword, group 2 the page name
     * between the double brackets. */
    private static final Pattern REDIRECT_DIRECTIVE = Pattern.compile(
            "#(REDIRECT|ПЕРЕНАПРАВЛЕНИЕ) \\[\\[(.+?)\\]\\]",
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /** Determines the target of a redirect page.
     *
     * @param page_title title of the article (not used by this method)
     * @param text source wiki text of the article
     * @return title of the page the article redirects to, or null if the
     *         text is not a redirect
     */
    public static String getRedirect(String page_title,
            StringBuffer text) {

        // A redirect starts with '#' and is at least as long as
        // "#REDIRECT [[" (12 characters); anything else is rejected
        // without running the regular expression.
        final boolean candidate = text.length() >= 12 && text.charAt(0) == '#';
        if (!candidate) {
            return null;
        }

        final Matcher directive = REDIRECT_DIRECTIVE.matcher(text);
        return directive.find() ? directive.group(2) : null;
    }

}
58 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/util/IBlockHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.util;
19 |
20 | /**
21 | * A handler encapsulated the extraction of the information items encoded in
22 | * a certain article constituent. There might be, for example, a handler
23 | * for extracting pronunciation information.
24 | * @author Christian M. Meyer
25 | * @author Lizhen Qu
26 | */
27 | public interface IBlockHandler {
28 |
29 | /** Return true if the handler requests to process the article
30 | * constituent starting at the given line of text. */
31 | boolean canHandle(final String blockHeader);
32 |
33 | /** If the handler requested to process this constituent, this hotspot
34 | * will be called for processing the section header of this
35 | * article constituent. Return true if the handler
36 | * requests to handle also the body of this constituent. */
37 | boolean processHead(final String line, final ParsingContext context);
38 |
39 | /** If the handler requested to process the body of this constituent, this
40 | * hotspot will be called for processing each line of the constituent's
41 | * body. Return true if the handler requests to handle also
42 | * the next line using this handler. */
43 | boolean processBody(final String line, final ParsingContext context);
44 |
45 | /** This hotspot is invoked if the parser releases this handler. It can be
46 | * used to store the extracted information to the Wiktionary data
47 | * objects stored in the parsing context. */
48 | void fillContent(final ParsingContext context);
49 |
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENEtymologyHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.en.components;
19 |
20 | import java.util.Arrays;
21 | import java.util.List;
22 |
23 | import de.tudarmstadt.ukp.jwktl.api.entry.WikiString;
24 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
25 | import de.tudarmstadt.ukp.jwktl.parser.util.StringUtils;
26 |
27 | /**
28 | * Parser component for extracting etymological information from
29 | * the English Wiktionary.
30 | * @author Christian M. Meyer
31 | * @author Lizhen Qu
32 | */
33 | public class ENEtymologyHandler extends ENBlockHandler {
34 | private static List SPELLINGS = Arrays.asList("etymology", "etymolgy",
35 | "eytomology", "etmology", "eymology");
36 | protected StringBuffer contentBuffer;
37 |
38 | public boolean canHandle(String blockHeader) {
39 | return SPELLINGS.contains(StringUtils.strip(blockHeader, "{}=: 1234567890").toLowerCase());
40 | }
41 |
42 | @Override
43 | public boolean processHead(String textLine, ParsingContext context) {
44 | contentBuffer = new StringBuffer();
45 | return true;
46 | }
47 |
48 | @Override
49 | public boolean processBody(String textLine, ParsingContext context) {
50 | if (!textLine.isEmpty() && !textLine.startsWith("===")) {
51 | contentBuffer.append(textLine);
52 | }
53 | return false;
54 | }
55 |
56 | public void fillContent(final ParsingContext context) {
57 | if (!contentBuffer.toString().trim().isEmpty()) {
58 | context.setEtymology(new WikiString(contentBuffer.toString().trim()));
59 | } else {
60 | context.setEtymology(null);
61 | }
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/util/ILanguage.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.util;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry;
21 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage;
22 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryTranslation;
23 |
/**
 * Generic interface for languages used in Wiktionary. Instances of ILanguage
 * serve as the entry language of {@link IWiktionaryPage}s, the word
 * language of {@link IWiktionaryEntry}s, and the target language
 * of {@link IWiktionaryTranslation}s. Each language is encoded using the
 * international standard of language classification (ISO 639). Languages
 * are compared by their internal code.
 * @author Christian M. Meyer
 * @author Christof M&uuml;ller
 * @author Lizhen Qu
 */
public interface ILanguage extends Comparable {

    /**
     * Returns the internal language code used by JWKTL. These codes roughly
     * correspond to ISO 639-3, but also include language families,
     * deprecated classifications, and not yet classified languages.
     */
    String getCode();

    /**
     * Returns the name of the language (in English).
     */
    String getName();

    /**
     * Returns the ISO 639-1 code, or an empty string if there is none.
     */
    String getISO639_1();

    /**
     * Returns the ISO 639-2b code, or an empty string if there is none.
     */
    String getISO639_2B();

    /**
     * Returns the ISO 639-2t code, or an empty string if there is none.
     */
    String getISO639_2T();

    /**
     * Returns the ISO 639-3 code, or an empty string if there is none.
     */
    String getISO639_3();

}
58 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/multi/ru/quote/TitleAndWikilink.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2008 Andrew Krizhanovsky
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.multi.ru.quote;
17 |
/**
 * (Wikified) title of a quoted phrase or sentence: the title itself plus an
 * optional link target taken from wiki markup of the form
 * {@code [[s:target|title]]} or {@code [[:s:target|title]]}.
 */
public class TitleAndWikilink {

    /** Markup prefixes that introduce a link in the "s:" namespace. */
    private static final String LONG_PREFIX = "[[:s:";
    private static final String SHORT_PREFIX = "[[s:";
    private static final String LINK_SUFFIX = "]]";

    /** Title of the work. */
    public String title;

    /** Link to a book in Wikipedia (format: [[s:title|]] or [[:s:title|]]). */
    public String title_wikilink;

    /** Creates an instance with an empty title and an empty link. */
    public TitleAndWikilink() {
        title = "";
        title_wikilink = "";
    }

    /**
     * Parses text such as {@code [[:s:Work (Author)|Work]]} into
     * title_wikilink "Work (Author)" and title "Work".
     * <p>
     * Non-breaking spaces, written either as the entity {@code &nbsp;} or
     * as the character U+00A0, are replaced by ordinary blanks first. If the
     * text is not a piped link of the expected form, the whole (normalized)
     * text becomes the title and title_wikilink is left as it was.
     *
     * @param text wiki markup of the title; if null, nothing is changed.
     */
    public void parseTitle(String text) {
        if (text == null) {
            return;
        }

        // Spell the non-breaking space out explicitly (entity and escaped
        // character) so the literal cannot be mistaken for a plain blank.
        text = text.replace("&nbsp;", " ").replace('\u00A0', ' ');

        title = text; // first version: used whenever the text is not a link

        final int prefixLength;
        if (text.startsWith(LONG_PREFIX)) {
            prefixLength = LONG_PREFIX.length();
        } else if (text.startsWith(SHORT_PREFIX)) {
            prefixLength = SHORT_PREFIX.length();
        } else {
            return;
        }
        if (!text.endsWith(LINK_SUFFIX)) {
            return;
        }

        // "[[:s:" . inner . "]]"  or  "[[s:" . inner . "]]"
        final String inner = text.substring(prefixLength,
                text.length() - LINK_SUFFIX.length());

        // Split at the first pipe: target|caption. Neither the prefix nor
        // the suffix contains a pipe, so checking the inner part is enough.
        final int pipe = inner.indexOf('|');
        if (pipe < 0) {
            return;
        }

        title_wikilink = inner.substring(0, pipe);
        title = inner.substring(pipe + 1);
    }
}
64 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Verbalsubstantiv.txt:
--------------------------------------------------------------------------------
1 | == Verbalsubstantiv ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Substantiv|Deutsch}}, {{n}} ===
3 |
4 | {{Substantiv-Tabelle|
5 | Wer oder was? (Einzahl)=das Verbalsubstantiv
6 | |Wer oder was? (Mehrzahl)=die Verbalsubstantive
7 | |Wessen? (Einzahl)=des Verbalsubstantivs
8 | |Wessen? (Mehrzahl)=der Verbalsubstantive
9 | |Wem? (Einzahl)=dem Verbalsubstantiv
10 | |Wem? (Mehrzahl)=den Verbalsubstantiven
11 | |Wen? (Einzahl)=das Verbalsubstantiv
12 | |Wen? (Mehrzahl)=die Verbalsubstantive
13 | }}
14 |
15 | {{Silbentrennung}} Ver·bal·sub·stan·tiv, {{Pl.}} Ver·bal·sub·stan·ti·ve
16 |
17 | {{Aussprache}}
18 | :[[Hilfe:IPA|IPA]]: {{Lautschrift|vɛʁˈbaːlzʊpstanˌtiːf}}, {{Pl.}} {{Lautschrift|vɛʁˈbaːlzʊpstanˌtiːvə}}
19 | :[[Hilfe:Hörbeispiele|Hörbeispiele]]: {{fehlend}}, {{Pl.}} {{fehlend}}
20 |
21 | {{Bedeutungen}}
22 | :[1] [[Linguistik]]: Sammelbegriff für verschiedene Klassen von [[Substantiv]]en, die durch [[Substantivierung]] von [[Verbform]]en enstanden sind oder noch entstehen
23 |
24 | {{Abkürzungen}}
25 |
26 | {{Herkunft}}
27 | : [[Determinativkompositum]] aus dem Adjektiv [[verbal]] und [[Substantiv]]
28 |
29 | {{Synonyme}}
30 | :[1]
31 |
32 | {{Gegenworte}}
33 | :
34 |
35 | {{Oberbegriffe}}
36 | :[1] [[Substantiv]], [[Wortart]], [[Grammatik]]
37 |
38 | {{Unterbegriffe}}
39 | :[1] [[Gerundium]], [[Gerundiv]]/ [[Gerundivum]], [[Nomen actionis]] [ Helmut Glück (Hrsg.), unter Mitarbeit von Friederike Schmöe: ''Metzler Lexikon Sprache.'' Dritte, neubearbeitete Auflage. Metzler, Stuttgart/ Weimar 2005. ISBN 978-3-476-02056-7 ]
40 |
41 |
42 | {{Beispiele}}
43 | :[1] Die Wortwurzel von ''Verbalsubstantiven'' ist ein [[Verb]].
44 |
45 |
46 |
47 | :
48 | {{Abgeleitete Begriffe}}
49 |
50 | :[1]
51 |
52 | ==== Übersetzungen ====
53 | {{Ü-links}}
54 | *{{en}}: [1] {{Ü|en|}}
55 | *{{fr}}: [1] {{Ü|fr|}}
56 | {{Ü-Abstand}}
57 | *{{ru}}: [1] {{Ü|ru|}}
58 | *{{es}}: [1] {{Ü|es|}}
59 | {{Ü-rechts}}
60 |
61 | {{Referenzen}}
62 | :[1] {{Wikipedia|Verbalsubstantiv}}
63 | :[1] {{Ref-DWDS|Verbalsubstantiv}}
64 | :[1] {{Ref-Canoo|Verbalsubstantiv}}
65 | :[1] {{Ref-UniLeipzig|Verbalsubstantiv}}
66 |
67 |
68 | {{Ähnlichkeiten}}
69 |
70 | [[Kategorie:Fremdwort]]
71 |
72 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/IWikiString.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api;
19 |
20 | import java.util.List;
21 |
/**
 * A text that contains wiki markup. Besides the original text including
 * the markup, the interface gives access to the list of wiki-internal
 * links and to a plain text representation (i.e., the text without markup).
 * @author Christian M. Meyer
 * @author Christof M&uuml;ller
 * @author Lizhen Qu
 */
public interface IWikiString {

    /**
     * Returns the original text including all wiki markup.
     */
    String getText();

    /**
     * Returns a human-readable version of the text, obtained by parsing the
     * original text and filtering out all wiki markup. The parsing might be
     * done on demand, so avoid invoking this method repeatedly for the
     * same text.
     */
    String getPlainText();

    /**
     * Returns the wiki-internal links, that is, all substrings enclosed by
     * two square brackets, with link captions removed. An empty list is
     * returned if no wiki links are found. The parsing might be done on
     * demand, so avoid invoking this method repeatedly for the same text.
     */
    List getWikiLinks();

    /* Not part of the interface (declaration kept commented out):
     * Returns a list of external links. That is, all valid URLs in the
     * original text. If no external links are found, an empty list will
     * be returned. Note that the parsing might be done on demand, so
     * avoid invoking this method repeatedly for the same text. */
//  public List getExternalLinks();

}
56 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/Nutella.txt:
--------------------------------------------------------------------------------
1 | == Nutella ® ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Substantiv|Deutsch}}, {{m}}, {{f}}, {{n}} ===
3 |
4 | {{Deutsch Substantiv Übersicht
5 | |Genus 1=f
6 | |Genus 2=n
7 | |Genus 3=m
8 | |Nominativ Singular 1=Nutella
9 | |Nominativ Singular 2=Nutella
10 | |Nominativ Singular 3=Nutella
11 | |Nominativ Plural=—
12 | |Genitiv Singular 1=Nutella
13 | |Genitiv Singular 2=Nutellas
14 | |Genitiv Singular 3=Nutellas
15 | |Genitiv Plural=—
16 | |Dativ Singular 1=Nutella
17 | |Dativ Singular 2=Nutella
18 | |Dativ Singular 3=Nutella
19 | |Dativ Plural=—
20 | |Akkusativ Singular 1=Nutella
21 | |Akkusativ Singular 2=Nutella
22 | |Akkusativ Singular 3=Nutella
23 | |Akkusativ Plural=—
24 | }}
25 |
26 | {{Anmerkung|zum Genus}}
27 | :Wie bei Markennamen üblich, hat auch das Wort ''Nutella'' kein festgelegtes Geschlecht. Die häufigste anzutreffende Form ist allerdings je nach Region das weibliche oder das sächliche Geschlecht, am seltensten wird ''der Nutella'' benutzt.[Nach: Duden: Duden – Richtiges und gutes Deutsch (Der Duden in zwölf Bden., Bd. 9). Mannheim, Leipzig, Wien, Zürich 52005, Seite 631.]
28 |
29 | {{Worttrennung}}
30 | :Nu·tel·la, {{kPl.}}
31 |
32 | {{Aussprache}}
33 | :{{IPA}} {{Lautschrift|ˌnuːˈtɛla}}
34 | :{{Hörbeispiele}} {{Audio|}}
35 | :{{Reime}} {{Reim|ɛla|Deutsch}}
36 |
37 | {{Bedeutungen}}
38 | :[1] Markenname einer [[Nuss-Nugat-Creme]] (als Brotaufstrich), als [[Gattungsname]] umgangssprachlich auch für vergleichbare Produkte anderer Hersteller[Seite „Nutella“. In: Wikipedia, Die freie Enzyklopädie. Bearbeitungsstand: 22. Juli 2010, 11:31 UTC. URL: http://de.wikipedia.org/w/index.php?title=Nutella&oldid=76944151 (Abgerufen: 26. Juli 2010, 15:30 UTC)]
39 |
40 | {{Herkunft}}
41 | :Vom Hersteller Ferrero kreiertes Kunstwort, das wahrscheinlich auf dem englisch Wort ''{{Ü|en|nut}}'' „[[Nuss]]“ und der italienischen weiblichen Verniedlichungsform "-ella" beruht.
42 |
43 | {{Oberbegriffe}}
44 | :[1] [[Nuss-Nugat-Creme]], [[Brotaufstrich]]
45 |
46 | {{Beispiele}}
47 | :[1] „Kann ich bitte das (die, den) Nutella haben?"
48 |
49 | ==== Übersetzungen ====
50 | {{Ü-Tabelle|Ü-links=
51 | *{{en}}: [1] {{Ü|en|Nutella}}
52 | *{{fr}}: [1] {{Ü|fr|Nutella}} {{m}}
53 | |Ü-rechts=
54 | *{{it}}: [1] {{Ü|it|Nutella}} {{f}}
55 | *{{sv}}: [1] {{Ü|sv|Nutella}}
56 | }}
57 |
58 | {{Referenzen}}
59 | :[1] {{Wikipedia|Nutella}}
60 | :[1] {{Ref-UniLeipzig|Nutella}}
61 |
62 | {{Quellen}}
63 |
64 | [[en:Nutella]]
65 | [[fr:Nutella]]
66 | [[ru:Nutella]]
67 |
--------------------------------------------------------------------------------
/src/test/resources/articles-de/pittoresk.txt:
--------------------------------------------------------------------------------
1 | == pittoresk ({{Sprache|Deutsch}}) ==
2 | === {{Wortart|Adjektiv|Deutsch}} ===
3 |
4 | {{Adjektiv-Tabelle (Deklination)
5 | |Grundform=pittoresk
6 | |1. Steigerung=pittoresker
7 | |2. Steigerung=am pittoreskesten
8 | }}
9 |
10 | {{Worttrennung}}
11 | :pit·to·resk, {{Komp.}} pit·to·res·ker, {{Sup.}} pit·to·res·kes·ten
12 |
13 | {{Aussprache}}
14 | :{{IPA}} {{Lautschrift|ˌpɪtoˈʀɛsk}}, {{Komp.}} {{Lautschrift|ˌpɪtoˈʀɛskɐ}}, {{Sup.}} {{Lautschrift|ˌpɪtoˈʀɛskəstn̩}}, {{Lautschrift|ˌpɪtoˈʀɛskəstən}}
15 | :{{Hörbeispiele}} {{Audio|De-at-pittoresk.ogg|pittoresk (österreichisch)}}
16 |
17 | {{Bedeutungen}}
18 | :[1] [[malerisch]]
19 |
20 | {{Herkunft}}
21 | :abgeleitet von {{lat.}} ''{{Ü|la|pictus}}'' „gemalt“, zu [[italienisch]] ''{{Ü|it|pittoresco}}''<ref>[http://www.zeno.org/Brockhaus-1837/A/Pittoresk?hl=pittoresk Brockhaus Bilder-Conversations-Lexikon, Band 3. Leipzig 1839., Seite 507.]</ref>; zu [[französisch]] ''{{Ü|fr|pittoresque}}''<ref>[http://www.zeno.org/Brockhaus-1809/B/Pittoresk?hl=pittoresk Brockhaus Conversations-Lexikon Bd. 8. Leipzig 1811, Seite 251.]</ref>
22 |
23 | {{Sinnverwandte Wörter}}
24 | :[1] [[bildschön]], [[hübsch]], [[malerisch]]
25 |
26 | {{Beispiele}}
27 | :[1] Die kleine Stadt mit ihrem Labyrinth enger Straßen und ihren alten Häusern macht einen ''pittoresken'' Eindruck.
28 | :[1] Wir waren in dem ''pittoreskesten'' Dorf der ganzen Umgebung gelandet.
29 |
30 | ==== Übersetzungen ====
31 | {{Ü-links}}
32 | *{{en}}: [1] {{Ü|en|picturesque}}
33 | *{{fr}}: [1] {{Ü|fr|pittoresque}} {{mf}}
34 | *{{it}}: [1] {{Ü|it|pittoresco}}, {{Ü|it|suggestivo}}
35 | *{{nl}}: [1] {{Ü|nl|#pittoresk (Niederländisch)|pittoresk}}, {{Ü|nl|schilderachtig}}
36 | *{{no}}: [1] {{Ü|no|#pittoresk (Norwegisch)|pittoresk}}
37 | *{{pl}}: [1] {{Ü|pl|malowniczy}}
38 | {{Ü-Abstand}}
39 | *{{pt}}: [1] {{Ü|pt|pitoresco}}
40 | *{{ru}}: [1] {{Üxx|ru|shiwopisnyj|живописный}}
41 | *{{sk}}: [1] {{Ü|sk|pitoreskný}}
42 | *{{sl}}: [1] {{Ü|sl|pitoresken}}, {{Ü|sl|razgiban}}, {{Ü|sl|slikovit}}
43 | *{{es}}: [1] {{Ü|es|pintoresco}} {{m}}, {{Ü|es|pintoresca}} {{f}}
44 | *{{cs}}: [1] {{Ü|cs|pitoreskní}}, {{Ü|cs|malebný}}
45 | {{Ü-rechts}}
46 |
47 | {{Referenzen}}
48 | :[1] {{Ref-DWDS|pittoresk}}
49 | :[1] {{Ref-Canoo|pittoresk}}
50 | :[1] {{Ref-UniLeipzig|pittoresk}}
51 |
52 | {{Quellen}}
53 |
54 | [[en:pittoresk]]
55 | [[fi:pittoresk]]
56 | [[fr:pittoresk]]
57 | [[hu:pittoresk]]
58 | [[io:pittoresk]]
59 | [[ko:pittoresk]]
60 | [[pl:pittoresk]]
61 | [[sv:pittoresk]]
62 | [[zh:pittoresk]]
63 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/IPronunciation.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api;
19 |
/**
 * Pronunciation information for {@link IWiktionaryEntry}s. Several kinds
 * of pronunciation information exist: standardized written representations
 * in IPA or SAMPA notation, audio files of people reading a word aloud,
 * and information on the rhyming suffix of a lexical entry.
 * @author Christian M. Meyer
 */
public interface IPronunciation {

    /**
     * The kinds of pronunciation information, as returned by
     * {@link IPronunciation#getType()}.
     */
    enum PronunciationType {

        /** International Phonetic Alphabet. */
        IPA,
        /** Speech Assessment Methods Phonetic Alphabet. */
        SAMPA,
        /** Audio file of this pronunciation. */
        AUDIO,
        /** Suffix used to identify rhymes. */
        RHYME,
        /** Unprocessed pronunciation template. */
        RAW
    }

    /**
     * Returns the type of this pronunciation: either an audio file or the
     * notation schema used to represent the pronunciation information.
     */
    PronunciationType getType();

    /**
     * Returns the representation of the pronunciation in a standardized
     * notation such as IPA. For audio files, the file name of the sound
     * file is returned; the corresponding URL needs to be obtained by
     * querying http://[LANGUAGE].wiktionary.org/wiki/File:[FILENAME].
     */
    String getText();

    /**
     * Returns additional information for this pronunciation, such as
     * a geographical reference.
     */
    String getNote();

}
63 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/DEWordLanguageHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components;
19 |
20 | import java.util.regex.Matcher;
21 | import java.util.regex.Pattern;
22 |
23 | import de.tudarmstadt.ukp.jwktl.api.util.ILanguage;
24 | import de.tudarmstadt.ukp.jwktl.api.util.Language;
25 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
26 |
27 | /**
28 | * Parser component for extracting the lemma and its language from the
29 | * German Wiktionary.
30 | * @author Christian M. Meyer
31 | * @author Lizhen Qu
32 | */
33 | public class DEWordLanguageHandler extends DEBlockHandler {
34 |
35 | /** language regular expression pattern*/
36 | private static final Pattern LANGUAGE_PATTERN = Pattern.compile("^==\\s*([^=].*)\\s*\\(?\\{\\{\\s*Sprache\\s*\\|\\s*([^}]+?)\\s*\\}\\}");
37 |
38 | protected String lemma;
39 | protected ILanguage language;
40 |
41 | /** Determine if the text line contains the language pattern. If the
42 | * language pattern is found, the entry's word and its language will
43 | * be extracted from the text line. */
44 | public boolean canHandle(final String blockHeader) {
45 | if (blockHeader == null)
46 | return false;
47 |
48 | lemma = null;
49 | language = null;
50 | Matcher matcher = LANGUAGE_PATTERN.matcher(blockHeader);
51 | if (!matcher.find())
52 | return false;
53 |
54 | lemma = matcher.group(1);
55 | language = Language.findByName(matcher.group(2));
56 | return true;
57 | }
58 |
59 | /** Store the word and its language in the parsing context. */
60 | public void fillContent(final ParsingContext context) {
61 | context.setLanguage(language);
62 | context.setHeader(lemma);
63 | }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/components/BlockHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.components;
19 |
20 | import de.tudarmstadt.ukp.jwktl.parser.util.IBlockHandler;
21 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
22 | import de.tudarmstadt.ukp.jwktl.parser.util.StringUtils;
23 |
24 | /**
25 | * Abstract parser component for processing article constituents. The handler
26 | * can be initialized with a set of fixed labels that denote the header of
27 | * an article constituent that is to be parsed by this handler.
28 | * @author Christian M. Meyer
29 | * @author Lizhen Qu
30 | */
31 | /**
32 | * Default implementation of the {@link IBlockHandler} interface that serves
33 | * as a base class for parsing any article constituent.
34 | */
35 | public abstract class BlockHandler implements IBlockHandler {
36 |
37 | protected String[] labels;
38 |
39 | /** Initializes the block handler for parsing all sections starting with
40 | * one of the specified labels. */
41 | public BlockHandler(final String... labels) {
42 | this.labels = labels;
43 | }
44 |
45 | public boolean canHandle(String blockHeader) {
46 | blockHeader = StringUtils.strip(blockHeader, "{}=: ");
47 | for (String label : labels)
48 | if (label.equals(blockHeader))
49 | return true;
50 |
51 | return false;
52 | }
53 |
54 | public boolean processHead(final String text, final ParsingContext context) {
55 | return true;
56 | }
57 |
58 | public boolean processBody(final String textLine, final ParsingContext context) {
59 | return false;
60 | }
61 |
62 | public void fillContent(final ParsingContext context) {}
63 |
64 | protected String[] getLabels() {
65 | return labels;
66 | }
67 |
68 | }
69 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENWordLanguageHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.en.components;
19 |
20 | import java.util.regex.Matcher;
21 | import java.util.regex.Pattern;
22 |
23 | import de.tudarmstadt.ukp.jwktl.api.util.ILanguage;
24 | import de.tudarmstadt.ukp.jwktl.api.util.Language;
25 | import de.tudarmstadt.ukp.jwktl.parser.util.IBlockHandler;
26 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
27 |
28 | /**
29 | * Parser component for extracting a words language from the English Wiktionary.
30 | * @author Christian M. Meyer
31 | * @author Lizhen Qu
32 | */
33 | public class ENWordLanguageHandler extends ENBlockHandler implements IBlockHandler {
34 |
35 | protected static final Pattern LANGUAGE_HEADER = Pattern.compile("^\\s*=+\\s*\\[*\\s*(.*?)\\s*\\]*\\s*=+");
36 |
37 | protected ILanguage language;
38 |
39 | public boolean canHandle(String blockHeader) {
40 | if ("----".equals(blockHeader)) {
41 | language = null;
42 | return true;
43 | }
44 |
45 | language = null;
46 | // System.out.println(textLine);
47 | Matcher matcher = LANGUAGE_HEADER.matcher(blockHeader);
48 | if (!matcher.find())
49 | return false;
50 |
51 | // System.out.println(matcher.group(1));
52 | language = Language.findByName(matcher.group(1));
53 | return (language != null);
54 | }
55 |
56 | @Override
57 | public boolean processHead(final String textLine, final ParsingContext context) {
58 | return true;
59 | }
60 |
61 | @Override
62 | public boolean processBody(final String textLine, final ParsingContext context) {
63 | return false;
64 | }
65 |
66 | public void fillContent(final ParsingContext context) {
67 | context.setLanguage(language);
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/DEWordFormNounTableHandlerTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import de.tudarmstadt.ukp.jwktl.parser.de.components.DEGenderText;
21 | import junit.framework.TestCase;
22 |
23 | public class DEWordFormNounTableHandlerTest extends TestCase {
24 |
25 | private DEWordFormNounTableHandler nounTableHandler;
26 |
27 | @Override
28 | protected void setUp() throws Exception {
29 | nounTableHandler = new DEWordFormNounTableHandler();
30 | }
31 |
32 | public void testGetsSetGenus() {
33 | nounTableHandler.setGenusByIndex(DEGenderText.F, 2);
34 | assertEquals(DEGenderText.F, nounTableHandler.getGenusByIndex(2));
35 | }
36 |
37 | public void testGetsNotSetGenus() {
38 | assertNull(nounTableHandler.getGenusByIndex(3));
39 | }
40 |
41 | public void testThrowsExceptionSettingGenusWithInvalidIndex() {
42 | try {
43 | nounTableHandler.setGenusByIndex(DEGenderText.F, 0);
44 | fail();
45 | } catch (IllegalArgumentException expected) {
46 | assertTrue(true);
47 | }
48 | try {
49 | nounTableHandler.setGenusByIndex(DEGenderText.F, 5);
50 | fail();
51 | } catch (IllegalArgumentException expected) {
52 | assertTrue(true);
53 | }
54 | }
55 |
56 | public void testThrowsExceptionGettingGenusWithInvalidIndex() {
57 | try {
58 | nounTableHandler.getGenusByIndex(0);
59 | fail();
60 | } catch (IllegalArgumentException expected) {
61 | assertTrue(true);
62 | }
63 | try {
64 | nounTableHandler.getGenusByIndex(5);
65 | fail();
66 | } catch (IllegalArgumentException expected) {
67 | assertTrue(true);
68 | }
69 | }
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/DativeHandlerTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm;
21 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase;
22 | import junit.framework.TestCase;
23 |
24 | public class DativeHandlerTest extends TestCase {
25 |
26 | private DativeHandler dativeHandler;
27 |
28 | @Override
29 | protected void setUp() throws Exception {
30 | dativeHandler = new DativeHandler();
31 | }
32 |
33 | public void testCanHandle() {
34 | assertFalse(dativeHandler.canHandle(null, null, null, null));
35 | assertFalse(dativeHandler.canHandle("Vitad", null, null, null));
36 | assertTrue(dativeHandler.canHandle("Dativ Singular", null, null, null));
37 | assertTrue(dativeHandler.canHandle("Dativ", null, null, null));
38 | assertFalse(dativeHandler.canHandle(" Dativ Singular", null, null, null));
39 | assertTrue(dativeHandler.canHandle("Wem? (Einzahl)", null, null, null));
40 | assertTrue(dativeHandler.canHandle("Wem?(Einzahl)", null, null, null));
41 | assertFalse(dativeHandler.canHandle(" Wem? (Einzahl)", null, null, null));
42 | }
43 |
44 | public void testDativeSingular() {
45 | WiktionaryWordForm wordForm = new WiktionaryWordForm("test");
46 | dativeHandler.handle("Dativ Singular", "test", wordForm, null);
47 | assertEquals(GrammaticalCase.DATIVE, wordForm.getCase());
48 | }
49 |
50 | public void testWemEinzahl() {
51 | WiktionaryWordForm wordForm = new WiktionaryWordForm("test");
52 | dativeHandler.handle("Wem? (Einzahl)", "test", wordForm, null);
53 | assertEquals(GrammaticalCase.DATIVE, wordForm.getCase());
54 | }
55 | }
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/components/InterwikiLinkHandler.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.components;
19 |
20 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext;
21 |
22 | /**
23 | * Generic parser component for extracting interwiki links (e.g., [de:dog])
24 | * from the Wiktionary article pages.
25 | * @author Christian M. Meyer
26 | * @author Lizhen Qu
27 | *
28 | */
29 | public class InterwikiLinkHandler extends BlockHandler {
30 |
31 | protected String categoryHead;
32 | protected String language;
33 |
34 | /** Initializes the handler for the specified category head
35 | * (e.g., "Category"). The category head is required for distinugishing
36 | * between categories and interwiki links. */
37 | public InterwikiLinkHandler(final String categoryHead) {
38 | this.categoryHead = categoryHead;
39 | }
40 |
41 | public boolean canHandle(String blockHeader) {
42 | // Check if the line encodes an interwiki link.
43 | String line = blockHeader.trim();
44 | boolean isBracketed = line.startsWith("[[") && line.endsWith("]]");
45 | return (line.contains(":") && !line.contains(categoryHead) && isBracketed);
46 | }
47 |
48 | @Override
49 | public boolean processHead(String textLine, ParsingContext context) {
50 | // Extract the language of the interwiki link.
51 | language = null;
52 | String line = textLine.trim();
53 | int colonIndex = line.indexOf(":");
54 | if (colonIndex != -1) {
55 | language = line.substring(2, colonIndex).trim();
56 | }
57 | return true;
58 | }
59 |
60 | public void fillContent(final ParsingContext context) {
61 | // Add the interwiki link to the current page.
62 | if (language != null)
63 | context.getPage().addInterWikiLink(language);
64 | }
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/parser/util/PatternUtils.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.parser.util;
19 |
20 | import java.util.Objects;
21 | import java.util.regex.Matcher;
22 |
/**
 * Static helper methods for working with regular expression matchers.
 * The class cannot be instantiated.
 */
public final class PatternUtils {

    private PatternUtils() {
    }

    /**
     * Extracts the index from the given previously matched/found regex
     * matcher. If the matcher was not matched yet, throws an exception.
     * Otherwise searches for the first non-null group, parses it as an
     * integer and returns the result. If no non-null groups are found,
     * returns null.
     *
     * @param matcher
     *            regular expression matcher.
     * @return Extracted index or null.
     * @throws NumberFormatException
     *             If value of the first non-null group could not be
     *             parsed as an integer.
     * @throws IllegalArgumentException
     *             If the matcher was not matched yet or its last match
     *             attempt failed.
     * @throws NullPointerException
     *             If the matcher is null.
     */
    public static Integer extractIndex(Matcher matcher) throws NumberFormatException, IllegalArgumentException {
        Objects.requireNonNull(matcher, "matcher must not be null.");
        try {
            // start() is called only as a probe: it throws unless the last
            // match attempt succeeded and never returns a negative index,
            // so its return value needs no further check.
            matcher.start();
        } catch (IllegalStateException isex) {
            throw new IllegalArgumentException("The matcher was not matched yet.", isex);
        }
        final int groupCount = matcher.groupCount();
        // Group 0 is the whole match; only capturing groups are considered.
        for (int index = 1; index <= groupCount; index++) {
            String group = matcher.group(index);
            if (group != null) {
                return Integer.valueOf(group);
            }
        }
        return null;
    }

}
62 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/api/entry/WiktionaryRelation.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.api.entry;
19 |
20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryRelation;
21 | import de.tudarmstadt.ukp.jwktl.api.RelationType;
22 |
23 | /**
24 | * Default implementation of the {@link IWiktionaryRelation} interface.
25 | * See there for details.
26 | * @author Christian M. Meyer
27 | */
28 | public class WiktionaryRelation implements IWiktionaryRelation {
29 |
30 | protected String target;
31 | protected RelationType type;
32 | protected String targetSense;
33 | protected LinkType linkType;
34 | //protected RelationSourceType relationSourceType;
35 |
36 | /** Creates a new, empty relation. */
37 | public WiktionaryRelation() {}
38 |
39 | /** Creates a new relation for the given target and relation type. */
40 | public WiktionaryRelation(final String target, final RelationType type) {
41 | this.target = target;
42 | this.type = type;
43 | //this.relationSourceType = RelationSourceType.ENTRY;
44 | }
45 |
46 | public RelationType getRelationType() {
47 | return type;
48 | }
49 |
50 | public String getTarget() {
51 | return target;
52 | }
53 |
54 | public String getTargetSense() {
55 | return targetSense;
56 | }
57 |
58 | /** Specifies additional information on the target word sense. */
59 | public void setTargetSense(final String targetSense){
60 | this.targetSense = targetSense;
61 | }
62 |
63 | public LinkType getLinkType() {
64 | return linkType;
65 | }
66 |
67 | /** Assigns a new link type for this relation. */
68 | public void setLinkType(final LinkType linkType) {
69 | this.linkType = linkType;
70 | }
71 |
72 | @Override
73 | public String toString() {
74 | return type + ":" + target;
75 | }
76 |
77 | }
78 |
--------------------------------------------------------------------------------
/src/main/java/de/tudarmstadt/ukp/jwktl/examples/Example5_MultipleLanguages.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2013
3 | * Ubiquitous Knowledge Processing (UKP) Lab
4 | * Technische Universität Darmstadt
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | ******************************************************************************/
18 | package de.tudarmstadt.ukp.jwktl.examples;
19 |
20 | import java.io.File;
21 |
22 | import de.tudarmstadt.ukp.jwktl.JWKTL;
23 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryCollection;
24 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry;
25 |
26 | /**
27 | * Example for combining information from multiple Wiktionary language editions
28 | * in a so-called collection.
29 | * @author Yevgen Chebotar
30 | * @author Christian M. Meyer
31 | */
32 | public class Example5_MultipleLanguages {
33 |
34 | /** Runs the example.
35 | * @param args two names of a directory containing parsed Wiktionary data
36 | * (German and English in the example). */
37 | public static void main(String[] args) {
38 | if (args.length != 2)
39 | throw new IllegalArgumentException("Too few arguments. "
40 | + "Required arguments: "
41 | + "");
42 |
43 | // Create new IWiktionaryCollection for the parsed databases.
44 | IWiktionaryCollection wktColl = JWKTL.openCollection(new File(args[0]), new File(args[1]));
45 |
46 | // Query for "arm" in both language editions and print the resulting entries.
47 | for (IWiktionaryEntry entry : wktColl.getEntriesForWord("arm")) {
48 | // Print the language of the defining language edition.
49 | System.out.println(entry.getPage().getEntryLanguage() + ":");
50 |
51 | // Print the word and its language and part of speech.
52 | System.out.println(" " + entry.getWord()
53 | + "/" + entry.getPartOfSpeech()
54 | + "/" + entry.getWordLanguage());
55 | }
56 |
57 | // Close the Wiktionary edition (closes all attached editions).
58 | wktColl.close();
59 | }
60 |
61 | }
62 |
--------------------------------------------------------------------------------