├── src ├── test │ ├── resources │ │ ├── XMLDumpParserErrorEmptyTest.xml.bz2 │ │ ├── XMLDumpParserTest.xml.bz2 │ │ ├── XMLDumpParserErrorHeaderTest.xml.bz2 │ │ ├── makeLarge.pl │ │ ├── enwiktionary-20150224-pages-articles-multistream.xml.bz2 │ │ ├── enwiktionary-20150224-pages-articles-multistream-index.txt.bz2 │ │ ├── articles-de │ │ │ ├── Abschlusz.txt │ │ │ ├── Eingaben.txt │ │ │ ├── Fote.txt │ │ │ ├── Subdivisio.txt │ │ │ ├── robber_baron.txt │ │ │ ├── boulder.txt │ │ │ ├── Flipchart.txt │ │ │ ├── Angestellte.txt │ │ │ ├── Hallo.txt │ │ │ ├── Tetragraph.txt │ │ │ ├── Kunsttherapie.txt │ │ │ ├── mitreissen.txt │ │ │ ├── Brathaehnchen.txt │ │ │ ├── Mockumentary.txt │ │ │ ├── Thulium.txt │ │ │ ├── Generaladmiral.txt │ │ │ ├── harness.txt │ │ │ ├── Verbalsubstantiv.txt │ │ │ ├── Nutella.txt │ │ │ └── pittoresk.txt │ │ ├── articles-ru │ │ │ ├── lechu.txt │ │ │ └── lodka.txt │ │ ├── XMLDumpParserErrorXMLTest.xml │ │ ├── XMLDumpParserTest.xml │ │ ├── articles-en │ │ │ ├── callously.txt │ │ │ ├── dreier.txt │ │ │ ├── seawater.txt │ │ │ ├── granada.txt │ │ │ ├── bamba.txt │ │ │ ├── mangueira.txt │ │ │ ├── varanda.txt │ │ │ ├── goitrogenic.txt │ │ │ ├── abele.txt │ │ │ ├── aborted.txt │ │ │ ├── for_good_measure.txt │ │ │ ├── escritorio.txt │ │ │ ├── sumo.txt │ │ │ ├── batsman.txt │ │ │ ├── garçon.txt │ │ │ ├── cheio.txt │ │ │ ├── it_s.txt │ │ │ ├── as_much_as_possible.txt │ │ │ └── gumbo.txt │ │ ├── WiktionaryTestData_info.txt │ │ ├── WiktionaryDumpParserNullTest.xml │ │ └── WiktionaryDumpParserTest.xml │ └── java │ │ └── de │ │ └── tudarmstadt │ │ └── ukp │ │ └── jwktl │ │ ├── IntegrationTest.java │ │ ├── parser │ │ ├── en │ │ │ └── components │ │ │ │ └── ENWordFormHandlerTest.java │ │ ├── de │ │ │ ├── DEEntryLinkHandlerTest.java │ │ │ ├── DEWiktionaryEntryParserTest.java │ │ │ ├── DESenseExampleHandlerTest.java │ │ │ └── components │ │ │ │ └── nountable │ │ │ │ ├── DEWordFormNounTableHandlerTest.java │ │ │ │ └── DativeHandlerTest.java │ │ ├── 
ChainedCBZip2InputStreamTest.java │ │ └── util │ │ │ └── PatternUtilsTest.java │ │ └── WiktionaryTestCase.java └── main │ ├── filter │ ├── META-INF │ │ └── jwktl-version.properties │ └── jwktl-version-filter.properties │ ├── java │ └── de │ │ └── tudarmstadt │ │ └── ukp │ │ └── jwktl │ │ ├── parser │ │ ├── ru │ │ │ └── wikokit │ │ │ │ └── base │ │ │ │ ├── wikipedia │ │ │ │ ├── util │ │ │ │ │ └── GraphMLFile.java │ │ │ │ └── language │ │ │ │ │ └── LanguageTypeLocal.java │ │ │ │ └── wikt │ │ │ │ ├── multi │ │ │ │ ├── en │ │ │ │ │ ├── WQuoteEn.java │ │ │ │ │ ├── LabelEn.java │ │ │ │ │ └── WRedirectEn.java │ │ │ │ └── ru │ │ │ │ │ ├── WRedirectRu.java │ │ │ │ │ └── quote │ │ │ │ │ └── TitleAndWikilink.java │ │ │ │ ├── word │ │ │ │ └── WSynonyms.java │ │ │ │ ├── util │ │ │ │ ├── LangText.java │ │ │ │ └── POSText.java │ │ │ │ └── constant │ │ │ │ └── ContextLabel.java │ │ ├── de │ │ │ └── components │ │ │ │ ├── nountable │ │ │ │ ├── DativeHandler.java │ │ │ │ ├── GenitiveHandler.java │ │ │ │ ├── AccusativeHandler.java │ │ │ │ ├── NominativeHandler.java │ │ │ │ ├── CaseHandler.java │ │ │ │ ├── MehrzahlHandler.java │ │ │ │ ├── PatternBasedParameterHandler.java │ │ │ │ ├── EinzahlHandler.java │ │ │ │ └── PatternBasedIndexedParameterHandler.java │ │ │ │ ├── DEBlockHandler.java │ │ │ │ ├── DECollocationsHandler.java │ │ │ │ ├── DEGenderText.java │ │ │ │ ├── DEEtymologyHandler.java │ │ │ │ └── DEWordLanguageHandler.java │ │ ├── en │ │ │ └── components │ │ │ │ ├── ENBlockHandler.java │ │ │ │ ├── IHeadwordLineHandler.java │ │ │ │ ├── IWordFormHandler.java │ │ │ │ ├── ENUsageNotesHandler.java │ │ │ │ ├── ENDescendantRelationHandler.java │ │ │ │ ├── ENEtymologyHandler.java │ │ │ │ └── ENWordLanguageHandler.java │ │ ├── IWiktionaryEntryParser.java │ │ ├── IWiktionaryMultistreamDumpParser.java │ │ ├── MultistreamFilter.java │ │ ├── IWiktionaryDumpParser.java │ │ ├── util │ │ │ ├── IBlockHandler.java │ │ │ └── PatternUtils.java │ │ └── components │ │ │ ├── BlockHandler.java │ │ │ └── 
InterwikiLinkHandler.java │ │ ├── api │ │ ├── IWiktionaryExample.java │ │ ├── IQuotation.java │ │ ├── util │ │ │ ├── GrammaticalPerson.java │ │ │ ├── GrammaticalGender.java │ │ │ ├── GrammaticalDegree.java │ │ │ ├── GrammaticalNumber.java │ │ │ ├── GrammaticalTense.java │ │ │ ├── NonFiniteForm.java │ │ │ ├── GrammaticalMood.java │ │ │ ├── GrammaticalCase.java │ │ │ ├── GrammaticalAspect.java │ │ │ └── ILanguage.java │ │ ├── filter │ │ │ ├── IWiktionaryPageFilter.java │ │ │ ├── IWiktionaryEntryFilter.java │ │ │ ├── IWiktionarySenseFilter.java │ │ │ └── WiktionarySenseFilter.java │ │ ├── WiktionaryException.java │ │ ├── entry │ │ │ ├── WiktionaryExample.java │ │ │ ├── Pronunciation.java │ │ │ └── WiktionaryRelation.java │ │ ├── IWikiString.java │ │ └── IPronunciation.java │ │ └── examples │ │ └── Example5_MultipleLanguages.java │ └── resources │ └── assemblies │ └── dist.xml ├── .codecov.yml ├── CONTRIBUTING.md ├── .gitignore ├── .github └── workflows │ └── build.yml └── CONTRIBUTORS.txt /src/test/resources/XMLDumpParserErrorEmptyTest.xml.bz2: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/**/*" 3 | -------------------------------------------------------------------------------- /src/test/resources/XMLDumpParserTest.xml.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkpro/dkpro-jwktl/HEAD/src/test/resources/XMLDumpParserTest.xml.bz2 -------------------------------------------------------------------------------- /src/main/filter/META-INF/jwktl-version.properties: -------------------------------------------------------------------------------- 1 | jwktl.version=${jwktl.version} 2 | 
build.number=${build.number} 3 | svn.revision=${svn.revision} 4 | -------------------------------------------------------------------------------- /src/main/filter/jwktl-version-filter.properties: -------------------------------------------------------------------------------- 1 | jwktl.version=${pom.version} 2 | build.number=${BUILD_NUMBER} 3 | svn.revision=${SVN_REVISION} 4 | -------------------------------------------------------------------------------- /src/test/resources/XMLDumpParserErrorHeaderTest.xml.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkpro/dkpro-jwktl/HEAD/src/test/resources/XMLDumpParserErrorHeaderTest.xml.bz2 -------------------------------------------------------------------------------- /src/test/resources/makeLarge.pl: -------------------------------------------------------------------------------- 1 | for ($i = 1; $i <= 30000; $i++) { 2 | print " $iPage_$i\n"; 3 | } -------------------------------------------------------------------------------- /src/test/resources/enwiktionary-20150224-pages-articles-multistream.xml.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkpro/dkpro-jwktl/HEAD/src/test/resources/enwiktionary-20150224-pages-articles-multistream.xml.bz2 -------------------------------------------------------------------------------- /src/test/resources/enwiktionary-20150224-pages-articles-multistream-index.txt.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkpro/dkpro-jwktl/HEAD/src/test/resources/enwiktionary-20150224-pages-articles-multistream-index.txt.bz2 -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to JWKTL 2 | 3 | Thank you very much for your willingness to 
participate in this project. 4 | 5 | Please read the DKPro contribution guidelines at https://dkpro.github.io/contributing/ 6 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/Abschlusz.txt: -------------------------------------------------------------------------------- 1 | == Abschluß ({{Sprache|Deutsch}}) == 2 | {{Alte Schreibweise|Abschluss|Reform 1996}} 3 | 4 | [[hu:Abschluß]] 5 | [[is:Abschluß]] 6 | [[ru:Abschluß]] 7 | [[zh:Abschluß]] 8 | -------------------------------------------------------------------------------- /src/test/resources/articles-ru/lechu.txt: -------------------------------------------------------------------------------- 1 | = {{-ru-|nocat}} = 2 | == лечу I == 3 | '''ле-чу́''' // 4 | *{{Форма-гл|лететь|наст||1|ед|}} 5 | == лечу II == 6 | '''ле-чу́''' // 7 | *{{Форма-гл|лечить|наст||1|ед|}}{{длина слова|4}} 8 | 9 | [[fi:лечу]] 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Eclipse ### 2 | /target/ 3 | .classpath 4 | .project 5 | .settings/org.eclipse.core.resources.prefs 6 | .settings/org.eclipse.jdt.core.prefs 7 | .settings/org.eclipse.m2e.core.prefs 8 | 9 | ### IntelliJ IDEA ### 10 | *.iml 11 | .idea/ -------------------------------------------------------------------------------- /src/test/resources/XMLDumpParserErrorXMLTest.xml: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 |
6 | 7 | Some text content 8 | 9 | Some text content 10 | 11 |
12 | -------------------------------------------------------------------------------- /src/test/resources/XMLDumpParserTest.xml: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 |
6 | 7 | Some text content 8 | 9 | 10 | Some text content 11 | 12 |
13 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/callously.txt: -------------------------------------------------------------------------------- 1 | ==English== 2 | 3 | ===Adverb=== 4 | {{en-adv}} 5 | 6 | # In a [[callous]] manner; done without regard to others' [[sensitivities]]. 7 | 8 | ====Synonyms==== 9 | *(''in a callous manner''): [[carelessly]], [[hardheartedly]], [[indifferently]], [[unfeelingly]] 10 | 11 | [[et:callously]] 12 | [[es:callously]] 13 | [[fr:callously]] 14 | [[ru:callously]] 15 | [[vi:callously]] 16 | [[zh:callously]] 17 | -------------------------------------------------------------------------------- /src/test/resources/WiktionaryTestData_info.txt: -------------------------------------------------------------------------------- 1 | de dump: 2 | 3 | 3 Parameter (GERMAN, NOUN) 4 | 2 Mönch (GERMAN, NOUN) 5 | 0 França (CATALAN, NOUN) 6 | 1 França (OCCITAN, NOUN) 7 | 4 Platz (GERMAN, NOUN) 8 | 9 | 10 | 11 | en dump: 12 | 13 | 0 parameter (ENGLISH, NOUN) 14 | 1 place (ENGLISH, NOUN) 15 | 2 place (ENGLISH, VERB) 16 | 3 place (FRENCH, NOUN) 17 | 4 place (FRENCH, VERB) 18 | 5 place (POLISH, NOUN) 19 | 6 place (ROMANIAN, VERB) 20 | 7 place (SPANISH, VERB) 21 | 22 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/dreier.txt: -------------------------------------------------------------------------------- 1 | ==German== 2 | 3 | ===Pronunciation=== 4 | * {{IPA|/ˈdʁaɪ̯ɐ/|lang=de}} 5 | * {{hyphenation|drei|er|lang=de}} 6 | 7 | ===Numeral=== 8 | {{head|de|numeral}} 9 | 10 | # {{genitive plural of|drei||three|lang=de}} 11 | 12 | ====Usage notes==== 13 | Only in adjectival use and only when no article or pronoun is preceding. More at {{term|drei|lang=de}}. 
14 | 15 | ---- 16 | ==Norwegian Bokmål== 17 | 18 | ===Verb=== 19 | {{head|nb|verb form}} 20 | 21 | # {{present tense of|dreie|lang=nb}} 22 | 23 | [[de:dreier]] 24 | [[pl:dreier]] 25 | [[fi:dreier]] 26 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/seawater.txt: -------------------------------------------------------------------------------- 1 | ==English== 2 | ===Etymology=== 3 | [[sea]] + [[water]] 4 | ===Noun=== 5 | {{en-noun|-}} 6 | 7 | #The saltwater of a [[sea]] or [[ocean]]. 8 | 9 | ====Translations==== 10 | {{top}} 11 | * Finnish: {{t-|fi|merivesi}} 12 | * French: [[eau de mer]] {{f}} 13 | * German: {{t+|de|Meerwasser|n}}, {{t+|de|Salzwasser|n}} 14 | {{mid}} 15 | * Japanese: [[海水]] ([[かいすい]], kaisui) 16 | * Spanish: {{t-|es|agua salada|f}} 17 | {{bottom}} 18 | 19 | [[et:seawater]] 20 | [[fr:seawater]] 21 | [[ko:seawater]] 22 | [[lo:seawater]] 23 | [[hu:seawater]] 24 | [[ja:seawater]] 25 | -------------------------------------------------------------------------------- /src/test/resources/WiktionaryDumpParserNullTest.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 0 5 | 6 | 0 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 2004-09-17 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/granada.txt: -------------------------------------------------------------------------------- 1 | ==Portuguese== 2 | 3 | ===Etymology=== 4 | From {{etyl|la|pt}} {{m|la|grānātum||pomegranate}}, from {{m|la|grānātus||having many seeds}}, from {{m|la|grānum||seed, grain}}, from {{etyl|ine-pro|pt}} {{m|ine-pro|*ǵr̥h₂nóm||grain}}. 
5 | 6 | ===Noun=== 7 | {{pt-noun|f|s}} 8 | 9 | # [[pomegranate]] (fruit) 10 | # [[hand grenade]] (small explosive device) 11 | # [[shell]] (artillery) 12 | # [[garnet]] (mineral group) 13 | 14 | ====Synonyms==== 15 | * (pomegranate) [[romã]] 16 | * (hand grenade) [[granada de mão]] 17 | * (garnet) [[granate]] 18 | 19 | [[Category:pt:Explosives]] 20 | [[Category:pt:Fruits]] 21 | [[Category:pt:Mineralogy]] 22 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/Eingaben.txt: -------------------------------------------------------------------------------- 1 | == Eingaben ({{Sprache|Deutsch}}) == 2 | === {{Wortart|Deklinierte Form|Deutsch}} === 3 | 4 | {{Worttrennung}} 5 | :Ein·ga·ben 6 | 7 | {{Aussprache}} 8 | :{{IPA}} {{Lautschrift|ˈaɪ̯nˌɡaːbən}}, {{Lautschrift|ˈaɪ̯nˌɡaːbn̩}} 9 | :{{Hörbeispiele}} {{Audio|}} 10 | 11 | {{Grammatische Merkmale}} 12 | *Nominativ Plural des Substantivs '''[[Eingabe]]''' 13 | *Genitiv Plural des Substantivs '''[[Eingabe]]''' 14 | *Dativ Plural des Substantivs '''[[Eingabe]]''' 15 | *Akkusativ Plural des Substantivs '''[[Eingabe]]''' 16 | 17 | {{Grundformverweis Dekl|Eingabe}} 18 | 19 | [[en:Eingaben]] 20 | [[fi:Eingaben]] 21 | [[ku:Eingaben]] 22 | [[mg:Eingaben]] 23 | [[sv:Eingaben]] 24 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/bamba.txt: -------------------------------------------------------------------------------- 1 | ==Spanish== 2 | 3 | ===Etymology 1=== 4 | onomatopoeia 5 | 6 | ====Noun==== 7 | {{es-noun|f}} 8 | 9 | # [[hit]], [[strike]] {{gloss|in a game}} 10 | # a Latin American dance 11 | 12 | =====Synonyms===== 13 | * (''hit''): [[acierto]] 14 | 15 | ===Etymology 2=== 16 | From a trademark 17 | 18 | ====Noun==== 19 | {{es-noun|f}} 20 | 21 | # [[flip-flop]], [[thong]], [[jandal]] 22 | # [[sneaker]] 23 | 24 | =====See also===== 25 | * [[chancla]] 26 | * [[playera]] 27 | * [[sandalia]] 28 | 29 | 
[[fr:bamba]] 30 | [[ko:bamba]] 31 | [[id:bamba]] 32 | [[jv:bamba]] 33 | [[kn:bamba]] 34 | [[hu:bamba]] 35 | [[mg:bamba]] 36 | [[pt:bamba]] 37 | [[fi:bamba]] 38 | [[sv:bamba]] 39 | [[te:bamba]] 40 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/mangueira.txt: -------------------------------------------------------------------------------- 1 | ==Galician== 2 | 3 | ===Noun=== 4 | {{gl-noun|f}} 5 | 6 | # [[mango]] [[tree]] 7 | 8 | ====Related terms==== 9 | * [[manga]] 10 | 11 | [[Category:gl:Trees]] 12 | 13 | ---- 14 | 15 | ==Portuguese== 16 | ===Etymology 1=== 17 | ====Noun==== 18 | '''mangueira''' {{g|f}} 19 | # [[hose]]. 20 | 21 | ===Etymology 2=== 22 | ====Noun==== 23 | '''mangueira''' {{g|f}} 24 | # [[mango]] (tree). 25 | 26 | =====Related terms===== 27 | {{top2}} 28 | *[[manga]] 29 | {{mid2}} 30 | {{bottom}} 31 | 32 | [[Category:Portuguese nouns]] 33 | 34 | [[io:mangueira]] 35 | [[mg:mangueira]] 36 | [[fj:mangueira]] 37 | [[pt:mangueira]] 38 | [[ru:mangueira]] 39 | [[tl:mangueira]] 40 | [[chr:mangueira]] 41 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/varanda.txt: -------------------------------------------------------------------------------- 1 | ==Portuguese== 2 | {{wikipedia|lang=pt}} 3 | [[Image:2012 Portugal 7844271368.jpg|thumb|250px|varanda]] 4 | 5 | ===Etymology=== 6 | {{unk.|lang=pt|title=Uncertain}}, but possibly related to Spanish {{term|baranda|lang=es}}. 
7 | 8 | ===Pronunciation=== 9 | * {{a|PT}} {{IPA|/vɐ.ˈɾɐ̃.dɐ/|lang=pt}} 10 | * {{hyphenation|va|ran|da|lang=pt}} 11 | 12 | ===Noun=== 13 | {{pt-noun|f}} 14 | 15 | # [[balcony]], [[veranda]], [[terrace]] 16 | # [[porch]] 17 | 18 | ====Descendants==== 19 | * Hindi: {{l|hi|बरामदा|tr=barāmdā|sc=Deva}}, {{l|hi|बरण्डा|tr=baraṇḍā|sc=Deva}} 20 | * English: {{l|en|veranda}}, {{l|en|verandah}} 21 | * French: {{l|fr|véranda}} 22 | 23 | [[eo:varanda]] 24 | [[fr:varanda]] 25 | [[mg:varanda]] 26 | [[pt:varanda]] 27 | [[fi:varanda]] 28 | [[zh:varanda]] 29 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build and test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | permissions: 10 | contents: read 11 | packages: write 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up JDK 11 16 | uses: actions/setup-java@v2 17 | with: 18 | java-version: '11' 19 | distribution: 'temurin' 20 | # server-id: github # Value of the distributionManagement/repository/id field of the pom.xml 21 | # settings-path: ${{ github.workspace }} # location for the settings.xml file 22 | 23 | - name: Build with Maven 24 | run: mvn test 25 | 26 | # - name: Publish to GitHub Packages Apache Maven 27 | # run: mvn deploy -s $GITHUB_WORKSPACE/settings.xml 28 | # env: 29 | # GITHUB_TOKEN: ${{ github.token }} 30 | -------------------------------------------------------------------------------- /src/test/java/de/tudarmstadt/ukp/jwktl/IntegrationTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you 
may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl; 19 | 20 | /** Empty marker interface: it declares no methods or constants. NOTE(review): presumably used to tag slow integration tests (e.g. as a JUnit category) so the build can include or exclude them - confirm against the build configuration. */ public interface IntegrationTest { 21 | } 22 | -------------------------------------------------------------------------------- /CONTRIBUTORS.txt: -------------------------------------------------------------------------------- 1 | # This is the list of people who have contributed code to the DKPro JWKTL repository. 2 | # 3 | # This list is to be used in favor over author attributions in individual files, 4 | # e.g. via @author tags. 5 | # 6 | # Code integrated from third parties by others than their original authors may not have any 7 | # @author tags removed and respective names must not be added to this file. Integration of 8 | # third party code should be avoided. 9 | # 10 | # After the name, one or more mail addresses may be specified in pointy brackets and one or more 11 | # GitHub IDs may be specified in square brackets. 12 | # 13 | # See also https://github.com/dkpro/dkpro-core/blob/master/CONTRIBUTING.md 14 | 15 | # Please keep the list sorted. 16 | 17 | Alexey Valikov [highsource] 18 | Christian M. 
Meyer [chmeyer] 19 | Christof Müller 20 | Ilya [intracer] 21 | Iryna Gurevych 22 | Jan Berkel [jberkel] 23 | Lizhen Qu 24 | Rafael Hoff [rafaelhoff] 25 | Torsten Zesch 26 | Tristan Miller [logological] 27 | Václav Slavík [vslavik] 28 | Yang Yang [geraint0923] 29 | Yevgen Chebotar 30 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/goitrogenic.txt: -------------------------------------------------------------------------------- 1 | ==English== 2 | 3 | ===Adjective=== 4 | {{en-adj}} 5 | 6 | # Of or pertaining to a [[goitrogen]] 7 | #* '''1968''' July, G. A. Bray, “Increased sensitivity of the thyroid in iodine-depleted rats to the '''goitrogenic''' effects of thyrotropin,” in ''The Clinical Journal of Investigation'' 47(7): 1640–1647, 8 | #*: The present studies demonstrate that iodine depletion increases the sensitivity of the thyroid to the '''goitrogenic''' effects of thyrotropin. 9 | #*'''1948''' J. Seifter and W. E. Ehrich, “Goitrogenic Compounds: Pharmacological and Pathological Effects,” ''Journal of Pharmacology and Experimental Therapeutics'' 92(3): 303-314 10 | #*: Seventy-eight compounds were screened for '''goitrogenic''' action. Of these, 12 were found to be effective, but only thiouracil, propylthiouracil, 2-amino-thiazole and Dithane were found to be markedly active. 11 | 12 | ====Synonyms==== 13 | * {{italbrac|of or pertaining to that which reduces the production or effects of thyroid hormones}}: [[antithyroid]] 14 | 15 | ====Usage notes==== 16 | See usage note at [[antithyroid]]. 
17 | 18 | [[zh:goitrogenic]] 19 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/abele.txt: -------------------------------------------------------------------------------- 1 | {{wikipedia}} 2 | {{also|Abele|ābele}} 3 | ==English== 4 | 5 | ===Etymology=== 6 | Dutch ''[[abeel]]'' (''abeel-boom''), Old French ''[[abel]]'', ''[[aubel]]'', from a diminutive of Latin ''[[albus]]'', white 7 | [[Image:Czajecice tree 20060812 1401.jpg|thumb|Abele; White poplar]] 8 | 9 | ===Noun=== 10 | {{en-noun}} 11 | 12 | # The [[white poplar]] (''[[Populus]] [[alba]]''). 13 | #* Six '''abeles''' i' the churchyard grow - [[Mrs. Browning]] 14 | 15 | ====See also==== 16 | * {{pedialite|Populus alba}} 17 | * {{commonslite|Populus alba}} 18 | * {{specieslite|Populus alba}} 19 | 20 | ====Translations==== 21 | {{trans-top|the white poplar, Populus alba}} 22 | * Bulgarian: {{t|bg|бяла топола }} 23 | * Dutch: {{t+|nl|abeel|m}} 24 | {{trans-mid}} 25 | * Estonian: [[hõbepappel]] 26 | {{trans-bottom}} 27 | 28 | [[Category:Trees]] 29 | 30 | ---- 31 | 32 | ==[[Novial]]== 33 | 34 | ===Noun=== 35 | '''abele''' 36 | 37 | # [[bee]] 38 | 39 | [[lt:abele]] 40 | [[hu:abele]] 41 | [[ro:abele]] 42 | [[ru:abele]] 43 | [[fi:abele]] 44 | [[uk:abele]] 45 | [[vi:abele]] 46 | [[wa:abele]] 47 | [[zh:abele]] 48 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikipedia/util/GraphMLFile.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2008 Andrew Krizhanovsky 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikipedia.util; 17 | 18 | /** GraphML loader/writer. Currently an empty placeholder: the class declares no fields or methods, and its only constructor is commented out, so only the implicit default constructor exists. 19 | */ 20 | public class GraphMLFile { 21 | 22 | /** Creates a new instance of GraphMLFile (this comment documents the commented-out constructor below). */ 23 | // public GraphMLFile() { 24 | // } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/test/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENWordFormHandlerTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.en.components; 19 | 20 | /** Test class that extends WordFormHandlerTest and, in setUp, assigns an ENWordFormHandler to the handler field. It declares no test methods of its own; any tests that run are inherited from WordFormHandlerTest (not visible here). */ public class ENWordFormHandlerTest extends WordFormHandlerTest { 21 | /** Runs the superclass setUp first, then replaces handler with a new ENWordFormHandler built from the string "lemma". NOTE(review): handler is not declared in this class - presumably a field inherited from WordFormHandlerTest; confirm. */ @Override 22 | public void setUp() throws Exception { 23 | super.setUp(); 24 | handler = new ENWordFormHandler("lemma"); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/aborted.txt: -------------------------------------------------------------------------------- 1 | ==English== 2 | 3 | ===Etymology=== 4 | Derivative of [[abort]] 5 | 6 | ===Adjective=== 7 | {{en-adj|-}} 8 | 9 | # Brought forth [[prematurely]]. 10 | # {{biology}} Rendered [[abortive]] or [[sterile]]; [[undeveloped]]; checked in normal development at a very early stage; as, spines are ''aborted'' branches. 11 | #:''"The eyes of the cirripeds are more or less '''aborted''' in their mature state."'' -[[w:Richard Owen|Richard Owen]]. 
12 | 13 | ====Translations==== 14 | * [[Catalan]]: [[avortat]] 15 | * French: {{t+|fr|avorté}} 16 | * [[Interlingua]]: [[abortate]] 17 | * Italian: {{t-|it|terminato}} 18 | * Portuguese: {{t-|pt|abortado}} 19 | * Spanish: {{t-|es|abortado}} 20 | * Swedish: {{t|sv|aborterad}} 21 | 22 | ===Verb=== 23 | '''aborted''' 24 | 25 | # {{past of|[[abort]]}} 26 | 27 | ===Anagrams=== 28 | * {{alphagram|abdeort}} 29 | * [[borated#English|borated]] 30 | 31 | 32 | [[Category:English adjectives]] 33 | 34 | [[de:aborted]] 35 | [[fr:aborted]] 36 | [[it:aborted]] 37 | [[hu:aborted]] 38 | [[mg:aborted]] 39 | [[ml:aborted]] 40 | [[my:aborted]] 41 | [[pt:aborted]] 42 | [[ru:aborted]] 43 | [[fi:aborted]] 44 | [[ta:aborted]] 45 | [[vi:aborted]] 46 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/for_good_measure.txt: -------------------------------------------------------------------------------- 1 | ==English== 2 | 3 | ===Prepositional phrase=== 4 | {{en-PP|[[for]] [[good]] [[measure]]}} 5 | 6 | # {{idiomatic}} In excess of the minimum required; Added as an [[extra]] 7 | #: ''He tossed in a couple of extra shirts '''for good measure''' and closed the suitcase.'' 8 | 9 | ====Translations==== 10 | {{trans-top|in excess of the required minimum}} 11 | * {{trreq|ar}}: 12 | * Chinese: 13 | :* Mandarin: {{zh-tsp||额外补充|}}, {{zh-tsp||保险起见|}} 14 | *: {{trreq|cmn}}: 15 | * {{trreq|cs}}: 16 | * {{trreq|nl}}: 17 | * Finnish: {{t-|fi|kaiken varalta}}, {{t+|fi|varmuuden vuoksi}}, {{t-|fi|varoiksi}} 18 | * French: [[pour]] {{t+|fr|faire bonne mesure}} 19 | {{trans-mid}} 20 | * German: {{t-|de|noch dazu}} 21 | * {{trreq|hi}}: 22 | * {{trreq|it}}: 23 | * {{trreq|ja}}: 24 | * {{trreq|ko}}: 25 | * {{trreq|pl}}: 26 | * Portuguese: {{t-|pt|por precaução}} 27 | * Russian: [[на всякий случай]] ''(for any case)'' 28 | * Spanish: {{t-|es|por si acaso}}, {{t-|es|por precaución}} 29 | * {{trreq|vi}}: 30 | {{trans-bottom}} 31 | {{checktrans-top}} 32 | 
* {{ttbc|ro}}: [[pentru]] [[orice]] [[eventualitate]] 33 | {{trans-bottom}} 34 | 35 | [[et:for good measure]] 36 | [[my:for good measure]] 37 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/Fote.txt: -------------------------------------------------------------------------------- 1 | === {{Wortart|Substantiv|Deutsch}}, {{f}} === 2 | 3 | {{Deutsch Substantiv Dialekt 4 | |Singular=de Fote 5 | |Plural=de Foten 6 | }} 7 | 8 | {{Worttrennung}} 9 | :Fo·te, {{Pl.}} Fo·ten 10 | 11 | {{Aussprache}} 12 | :{{IPA}} {{Lautschrift|ˈfoːtə}} 13 | :{{Hörbeispiele}} {{Audio|}} 14 | 15 | {{Bedeutungen}} 16 | :[1] ''[[berlinisch]]:'' Pfote, Pote; Hand, Fuß 17 | 18 | {{Oberbegriffe}} 19 | :[1] [[Körperteil]] 20 | 21 | {{Beispiele}} 22 | :[1] Sach ma, biste noch janz reene oda wat!? Du kannst do nich eenfach da Katze uff de ''Foten'' tippeln. 23 | :[1] Quatsch nich! Nu mach ma hinne! Ick will da nich ständich uff de ''Foten'' kieken müssen. 24 | :[1] Ick gloob, ick hab ma meene ''Fote'' jeknackst. 25 | 26 | ==== {{Übersetzungen}} ==== 27 | {{Ü-Tabelle|Ü-links= 28 | :*{{Übersetzungen umleiten|1|Pfote|1, 2}} {{f}}, {{Übersetzungen umleiten||Pote|}} {{f}}; {{Übersetzungen umleiten||Hand|1}} {{f}}, {{Übersetzungen umleiten||Fuß|1}} {{m}} 29 | |Ü-rechts= 30 | }} 31 | 32 | {{Referenzen}} 33 | :[1] Hans Meyer, Siegfried Mauermann, Walther Kiaulehn: ''Der richtige Berliner in Wörtern und Redensarten'', Neuausgabe der 10. Auflage, C. H. Beck, München 1985. Seite 100. 
ISBN 3-406-30611-X 34 | 35 | {{Ähnlichkeiten 1|[[Pfote]]}} 36 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/escritorio.txt: -------------------------------------------------------------------------------- 1 | ==Portuguese== 2 | {{wikipedia|lang=pt}} 3 | 4 | ===Alternative forms=== 5 | * [[escriptório]] {{qualifier|obsolete}} 6 | 7 | ===Etymology=== 8 | {{suffix|escritor|gloss1=writer|io|gloss2=ium|lang=pt}}. 9 | 10 | ===Pronunciation=== 11 | * {{a|South Brazil}} {{IPA|/ˌes.kɾi.ˈtɔ.ɾi.o/|/ˌes.kɾi.ˈtɔ.ɾjo/|lang=pt}} 12 | * {{a|PT}} {{IPA|/ˌiʃ.kɾi.ˈtɔ.ɾju/|lang=pt}} 13 | * {{hyphenation|es|cri|tó|ri|o|lang=pt}} 14 | 15 | ===Noun=== 16 | {{pt-noun|m|s}} 17 | 18 | # [[office]] (building or room) 19 | 20 | ====Synonyms==== 21 | * [[gabinete]] 22 | 23 | ====Related terms==== 24 | {{top4}} 25 | * [[alfabeto]] 26 | * [[escrevedor]] 27 | * [[escrever]] 28 | * [[escrevinhar]] 29 | * [[escriba]] 30 | {{mid4}} 31 | * [[escrita]] 32 | * [[escrito]] 33 | * [[escritor]] 34 | * [[escritura]] 35 | {{mid4}} 36 | * [[escrituração]] 37 | * [[escriturado]] 38 | * [[escriturar]] 39 | * [[escrituário]] 40 | {{mid4}} 41 | * [[escrivania]] 42 | * [[escrivaninha]] 43 | * [[escrivão]] 44 | * [[script]] 45 | {{bottom}} 46 | 47 | [[Category:pt:Business]] 48 | 49 | [[es:escritório]] 50 | [[fr:escritório]] 51 | [[lo:escritório]] 52 | [[hu:escritório]] 53 | [[mg:escritório]] 54 | [[fj:escritório]] 55 | [[pl:escritório]] 56 | [[pt:escritório]] 57 | [[sm:escritório]] 58 | [[chr:escritório]] 59 | [[zh:escritório]] 60 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/sumo.txt: -------------------------------------------------------------------------------- 1 | ==Portuguese== 2 | 3 | ===Pronunciation=== 4 | * {{a|PT}} {{IPA|/ˈsu.mu/|lang=pt}} 5 | 6 | ===Etymology 1=== 7 | From {{etyl|la|pt}} ''[[summus]]''. 
8 | 9 | ====Adjective==== 10 | {{pt-adj|sum|o}} 11 | 12 | # [[highest]], [[greatest]]. 13 | 14 | ====Noun==== 15 | {{pt-noun|m|s}} 16 | 17 | # [[summit]], [[top]]. 18 | 19 | ===Etymology 2=== 20 | [[Image:Orange juice 1 edit1.jpg|thumb|150px|sumo]] 21 | From {{etyl|roa-opt|pt}} {{term|çumo|lang=roa-opt}}, from {{etyl|ar|pt}} {{term|زُوم|lang=ar||juice, sap}}, from {{etyl|grc|pt}} {{term|ζωμός|lang=grc}}. Cognate of Galician {{term|zume|lang=gl}} and Spanish {{term|zumo|lang=es}}. 22 | 23 | ====Noun==== 24 | {{pt-noun|m|s}} 25 | 26 | # {{context|Portugal|lang=pt}} [[juice]]. 27 | 28 | =====Synonyms===== 29 | * [[suco]] {{qualifier|Brasil}} 30 | 31 | ===Etymology 3=== 32 | [[Image:Bulgarian-sumists.jpg|thumb|150px|sumo]] 33 | From {{etyl|ja|pt}} {{term|相撲|tr=sumō|lang=ja}} ''to mutually rush at''. 34 | 35 | ====Alternative forms==== 36 | * {{qualifier|Brazil}} [[sumô]] 37 | 38 | ====Noun==== 39 | {{pt-noun|m|-}} 40 | 41 | # {{context|martial arts|Portugal|lang=pt}} {{l|en|sumo}}. 42 | 43 | ===Etymology 4=== 44 | 45 | ====Verb==== 46 | {{pt-verb-form}} 47 | 48 | # {{inflection of|sumir||1|s|pres|indc|lang=pt}} 49 | 50 | ---- 51 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/Subdivisio.txt: -------------------------------------------------------------------------------- 1 | == Subdivisio ({{Sprache|Deutsch}}) == 2 | === {{Wortart|Substantiv|Deutsch}}, {{f}} === 3 | 4 | {{Substantiv-Tabelle| 5 | Wer oder was? (Einzahl)=die Subdivisio 6 | |Wer oder was? (Mehrzahl)=die Subdivisiones 7 | |Wessen? (Einzahl)=der Subdivisio 8 | |Wessen? (Mehrzahl)=der Subdivisiones 9 | |Wem? (Einzahl)=der Subdivisio 10 | |Wem? (Mehrzahl)=den Subdivisiones 11 | |Wen? (Einzahl)=die Subdivisio 12 | |Wen? 
(Mehrzahl)=die Subdivisiones 13 | }} 14 | 15 | {{Silbentrennung}} 16 | : Sub·di·vi·si·o, {{Pl.}} Sub·di·vi·si·o·nes 17 | 18 | {{Aussprache}} 19 | :{{Hörbeispiele}} {{fehlend}}, {{Pl.}} {{fehlend}} 20 | :{{IPA}} {{Lautschrift|...}}, {{Pl.}} {{Lautschrift|...}} 21 | 22 | {{Bedeutungen}} 23 | :[1] [[Biologie]]: Die hierarchische Gliederungsstufe der [[Divisio]] (dt.: [[Abteilung]]) im Reich der Pflanzen und der Pilze kann weiter in '''Subdivisiones''' (dt.: [[Untterabteilung|Unterabteilungen]]) differenziert werden. 24 | 25 | {{Synonyme}} 26 | :[1] Unterabteilung (in der Biologie) 27 | 28 | {{Beispiele}} 29 | :[1] 30 | 31 | ==== Übersetzungen ==== 32 | {{Ü-links}} 33 | *{{fr}}: [1] [[sub-division]] 34 | {{Ü-Abstand}} 35 | {{Ü-rechts}} 36 | 37 | {{Referenzen}} 38 | :[1] {{Wikipedia|Abteilung (Biologie)}} 39 | 40 | [[fr:Subdivisio]] 41 | [[ru:Subdivisio]] 42 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/IWiktionaryExample.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api; 19 | 20 | /** Accessors for a single example: its text including wiki markup, its text as an {@link IWikiString}, and its translation, which may be {@code null}. */ public interface IWiktionaryExample { 21 | /** 22 | * @return the text, including wiki markup 23 | */ 24 | String getText(); 25 | 26 | /** 27 | * @return the text of this example as wiki string 28 | */ 29 | IWikiString getExample(); 30 | 31 | /** 32 | * @return the translation of this example, or null 33 | */ 34 | IWikiString getTranslation(); 35 | } 36 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/robber_baron.txt: -------------------------------------------------------------------------------- 1 | == robber baron ({{Sprache|Englisch}}) == 2 | === {{Wortart|Substantiv|Englisch}} === 3 | 4 | {{Englisch Substantiv Übersicht 5 | |Singular=the robber baron 6 | |Plural=the robber barons 7 | }} 8 | 9 | {{Worttrennung}} 10 | : rob·ber bar·on, {{Pl.}} rob·ber bar·ons 11 | 12 | {{Aussprache}} 13 | :{{IPA}} {{Lautschrift|…}}, {{Pl.}} {{Lautschrift|…}} 14 | :{{Hörbeispiele}} {{fehlend}}, {{Pl.}} {{fehlend}} 15 | 16 | {{Bedeutungen}} 17 | :[1] Angehörige eines ritterlichen Standes im Spätmittelalter, die ihre schlechte finanzielle Lage durch Straßenraub, [[Fehde]]n und Plünderungszüge verbessern wollten 18 | :[2] ein [[skrupellos]]er [[Kapitalist]], [[Industrielle]]r oder [[Geschäftsmann]] des späten 19.
Jahrhunderts 19 | 20 | {{Beispiele}} 21 | :[1] 22 | 23 | ==== Übersetzungen ==== 24 | {{Ü-links}} 25 | *{{de}}: [1] [[Raubritter]]; [2] [[skrupellos]]er [[Kapitalist]]; ([[Räuber-Baron]]) 26 | {{Ü-Abstand}} 27 | *{{fr}}: [1] {{Ü|fr|}} 28 | {{Ü-rechts}} 29 | 30 | {{Referenzen}} 31 | :[1] {{Wikipedia|spr=en|robber baron}} 32 | :[1] {{Ref-Leo|en|robber+baron}} 33 | :[1] {{Ref-Pons|en|robber+baron}} 34 | :[1] {{Ref-MWD|robber+baron}} 35 | :[1, 2] {{Ref-Dictionary|robber+baron}} 36 | :[1] {{Ref-dictcc|en|robber+baron}} 37 | 38 | {{Referenzen prüfen|Englisch}} 39 | 40 | [[en:robber baron]] 41 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/batsman.txt: -------------------------------------------------------------------------------- 1 | {{wikipedia}} 2 | 3 | ==English== 4 | 5 | ===Synonyms=== 6 | * [[batter]] 7 | 8 | ===Noun=== 9 | {{en-noun|pl=batsmen}} 10 | # {{cricket}} A [[player]] of the [[batting]] [[side]] now on the [[field]] 11 | # {{cricket}} The [[player]] now [[receiving]] [[strike]]; the [[striker]] 12 | #: 2001: ''The batsman, Kathryn Leng, (who has played for quite a few years for England) asked the umpire dumbfounded if Charlie was going to bowl with a helmet on.'' — [[w:Julia Price|Julia Price]] (Australian cricketer), her women's Ashes diary entry for 19 June 2001 [http://www.southernstars.org.au/ukdiary2001.htm] 13 | # {{cricket}} Any player selected for his or her [[team]] principally to [[bat]], as opposed to a [[bowler]] 14 | 15 | ====Usage notes==== 16 | The term batsman is applied to both male and female cricketers; [[batswoman]] is much rarer. 
17 | 18 | ====Derived terms==== 19 | * [[batsmanship]] 20 | 21 | ====Related terms==== 22 | * [[bat]] 23 | 24 | ===Anagrams=== 25 | * [[bantams#English|bantams]], [[batmans#English|batmans]] 26 | 27 | [[et:batsman]] 28 | [[fr:batsman]] 29 | [[ko:batsman]] 30 | [[io:batsman]] 31 | [[kn:batsman]] 32 | [[hu:batsman]] 33 | [[ml:batsman]] 34 | [[my:batsman]] 35 | [[pl:batsman]] 36 | [[fi:batsman]] 37 | [[ta:batsman]] 38 | [[vi:batsman]] 39 | [[zh:batsman]] 40 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/IQuotation.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api; 19 | 20 | import java.util.List; 21 | 22 | /** 23 | * Represents a quotation. 24 | * @author Christian M. Meyer 25 | * @author Christof Müller 26 | * @author Lizhen Qu 27 | */ 28 | public interface IQuotation { 29 | 30 | /** Returns the source of the quotation. */ 31 | IWikiString getSource(); 32 | 33 | /** Returns the text of the quotation as a list of {@link IWikiString}s. NOTE(review): the return type declared below is a raw {@code List}; presumably it is meant to be parameterized with {@link IWikiString} - confirm against the implementations.
*/ 34 | List getLines(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikipedia/language/LanguageTypeLocal.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2008 Andrew Krizhanovsky 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikipedia.language; 17 | 18 | 19 | /** Names of languages in some language (e.g. Russian) 20 | * and the links to the LanguageType codes. 21 | */ 22 | public abstract class LanguageTypeLocal { 23 | 24 | /** Language name, e.g. "Russian" */ 25 | protected String name; 26 | 27 | /** LanguageType corresponding to this name, e.g.
LanguageType.ru */ 28 | protected LanguageType type; 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/DativeHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase; 21 | 22 | /** {@link CaseHandler} for the dative: hands {@link #DATIVE_PATTERN} and {@link GrammaticalCase#DATIVE} to the superclass constructor. */ public class DativeHandler extends CaseHandler { 23 | 24 | /** Regular expression with two start-anchored alternatives, "Dativ" and "Wem?". */ protected static final String DATIVE_PATTERN = 25 | // startsWith("Dativ") 26 | "^Dativ|" + 27 | // startsWith("Wem?") 28 | "^Wem\\?"; 29 | 30 | /** Creates the handler with the dative pattern and {@link GrammaticalCase#DATIVE}. */ public DativeHandler() { 31 | super(DATIVE_PATTERN, GrammaticalCase.DATIVE); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/GenitiveHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase; 21 | 22 | /** {@link CaseHandler} for the genitive: hands {@link #GENITIVE_PATTERN} and {@link GrammaticalCase#GENITIVE} to the superclass constructor. */ public class GenitiveHandler extends CaseHandler { 23 | 24 | /** Regular expression with two start-anchored alternatives, "Genitiv" and "Wessen?". */ protected static final String GENITIVE_PATTERN = 25 | // startsWith("Genitiv") 26 | "^Genitiv|" + 27 | // startsWith("Wessen?") 28 | "^Wessen\\?"; 29 | 30 | /** Creates the handler with the genitive pattern and {@link GrammaticalCase#GENITIVE}. */ public GenitiveHandler() { 31 | super(GENITIVE_PATTERN, GrammaticalCase.GENITIVE); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/AccusativeHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase; 21 | 22 | /** {@link CaseHandler} for the accusative: hands {@link #ACCUSATIVE_PATTERN} and {@link GrammaticalCase#ACCUSATIVE} to the superclass constructor. */ public class AccusativeHandler extends CaseHandler { 23 | 24 | /** Regular expression with two start-anchored alternatives, "Akkusativ" and "Wen?". */ protected static final String ACCUSATIVE_PATTERN = 25 | // startsWith("Akkusativ") 26 | "^Akkusativ|" + 27 | // startsWith("Wen?") 28 | "^Wen\\?"; 29 | 30 | /** Creates the handler with the accusative pattern and {@link GrammaticalCase#ACCUSATIVE}. */ public AccusativeHandler() { 31 | super(ACCUSATIVE_PATTERN, GrammaticalCase.ACCUSATIVE); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/multi/en/WQuoteEn.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2008 Andrew Krizhanovsky 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.multi.en; 17 | 18 | /** Phrase or sentence that illustrates a meaning of a word in Russian Wiktionary. NOTE(review): this class lives in the {@code multi.en} package, so "Russian" may be a copy-paste leftover - confirm. 19 | */ 20 | public class WQuoteEn { 21 | 22 | 23 | /** Removes highlighted marks from a sentence. 24 | * Sentence with '''words'''. -> Sentence with words. Every occurrence of the triple apostrophe is deleted; a string that contains none is returned as the same instance. @param str the sentence to clean; must not be null because {@code contains} is invoked on it @return the sentence without any triple-apostrophe sequence
25 | */ 26 | public static String removeHighlightedMarksFromSentence(String str) 27 | { 28 | if(str.contains("'''")) 29 | return str.replace("'''", ""); 30 | 31 | return str; 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/DEBlockHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components; 19 | 20 | import de.tudarmstadt.ukp.jwktl.parser.components.BlockHandler; 21 | 22 | /** 23 | * Abstract base class for all parser components for the German Wiktionary. 24 | * @author Christian M. Meyer 25 | */ 26 | public abstract class DEBlockHandler extends BlockHandler { 27 | 28 | /** Initializes the block handler for parsing all sections starting with 29 | * one of the specified labels. */ 30 | public DEBlockHandler(final String...
labels) { 31 | super(labels); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENBlockHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.en.components; 19 | 20 | import de.tudarmstadt.ukp.jwktl.parser.components.BlockHandler; 21 | 22 | /** 23 | * Abstract base class for all parser components for the English Wiktionary. 24 | * @author Christian M. Meyer 25 | */ 26 | public abstract class ENBlockHandler extends BlockHandler { 27 | 28 | /** Initializes the block handler for parsing all sections starting with 29 | * one of the specified labels. */ 30 | public ENBlockHandler(final String... 
labels) { 31 | super(labels); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalPerson.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.util; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm; 21 | 22 | /** 23 | * Enumeration of the grammatical person of a {@link IWiktionaryWordForm}. 24 | * @author Christian M. Meyer 25 | */ 26 | public enum GrammaticalPerson { 27 | 28 | /** The first person; the speaker; referred to by "I", "we". */ 29 | FIRST, 30 | 31 | /** The second person; the addressee; referred to by "you". */ 32 | SECOND, 33 | 34 | /** The third person; the other; referred to by "he", "she", 35 | * "it", "they".
*/ 36 | THIRD; 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/NominativeHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase; 21 | 22 | /** {@link CaseHandler} for the nominative: hands {@link #NOMINATIVE_PATTERN} and {@link GrammaticalCase#NOMINATIVE} to the superclass constructor. */ public class NominativeHandler extends CaseHandler { 23 | 24 | /** Regular expression with two start-anchored alternatives, "Nominativ" and "Wer oder was?" (any whitespace character between the words). */ protected static final String NOMINATIVE_PATTERN = 25 | // startsWith("Nominativ") 26 | "^Nominativ|" + 27 | // startsWith("Wer oder was?"), where \s accepts any whitespace character between the words 28 | // NOTE(review): this comment used to list equals("Genus 1") .. equals("Genus 4"); the pattern has no such alternative 29 | "^Wer\\soder\\swas\\?"; 30 | 31 | /** Creates the handler with the nominative pattern and {@link GrammaticalCase#NOMINATIVE}. */ public NominativeHandler() { 32 | super(NOMINATIVE_PATTERN, GrammaticalCase.NOMINATIVE); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/garçon.txt: -------------------------------------------------------------------------------- 1 | ==French== 2 | 3 | ===Etymology=== 4 | From {{etyl|frm|fr}}, from {{etyl|fro|fr}} {{m|fro|garçun||servant}}, oblique case of {{m|fro|gars}}, from {{etyl|frk|fr}} {{m|frk|*wrakjō||servant, boy}} from {{etyl|gem-pro|fr}} {{m|gem-pro|*wrakjô||exile, driven one}}, from {{etyl|ine-pro|fr}} {{m|ine-pro|*wreg-||to drive}}. Cognate with {{cog|goh|wrecheo}}, {{m|goh|recko||exile, warrior, hero}} (Modern {{cog|de|Recke}}), {{cog|osx|wrekkio||a banished person, exile, stranger}}, {{cog|ang|wreċċa||a wretch, stranger, exile}}, and perhaps to {{cog|non|rekkr||man, warrior, hero}}. More at {{l|en|wretch}}, {{l|en|wreak}}.
5 | 6 | ===Pronunciation=== 7 | * {{audio|Fr-garçon.ogg|audio (un garçon)|lang=fr}} 8 | * {{IPA|/ɡaʁsɔ̃/|lang=fr}} 9 | 10 | ===Noun=== 11 | {{fr-noun|m}} 12 | 13 | # {{l|en|boy}} 14 | #: {{ux|fr|Il a deux '''garçons''' et une fille.|He has two '''boys''' and a daughter.}} 15 | #: {{syn|fr|gamin}} 16 | #: {{ant|fr|adulte}} 17 | # {{lb|fr|by extension}} {{l|en|[[young]] [[man]]}}; {{l|en|man}} 18 | #: {{syn|fr|homme}} 19 | # {{l|en|waiter}} 20 | #: {{ux|fr|'''Garçon''', l'addition s'il vous plaît.|'''Waiter''', the bill please.|inline=1}} 21 | #: {{syn|fr|serveur|serviteur}} 22 | 23 | 24 | ====Synonyms==== 25 | * {{sense|boy}} {{l|fr|fils}} 26 | 27 | ====Derived terms==== 28 | * {{l|fr|garçonnet}} 29 | 30 | ====See also==== 31 | * {{l|fr|fille}} 32 | * {{l|fr|fils}} 33 | 34 | ===References=== 35 | * ''Merriam-Webster's Collegiate Dictionary: Tenth Edition'' (1997) 36 | 37 | ===External links=== 38 | * {{R:TLFi}} 39 | 40 | ---- 41 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalGender.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.util; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry; 21 | 22 | /** 23 | * Enumeration for modeling the grammatical gender of a 24 | * {@link IWiktionaryEntry}. 25 | * @author Christian M. Meyer 26 | */ 27 | public enum GrammaticalGender { 28 | 29 | /** Masculine gender (e.g., the German "Hund"). */ 30 | MASCULINE, 31 | 32 | /** Feminine gender (e.g., the German "Katze"). */ 33 | FEMININE, 34 | 35 | /** Neuter gender (e.g., the German "Haus"). */ 36 | NEUTER; 37 | /* The names below are commented out, so they are not constants of this enum. */ 38 | // ANIMATE, 39 | // INANIMATE, 40 | // HUMAN, 41 | // NON_HUMAN, 42 | // ANIMAL, 43 | // OTHER; 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/word/WSynonyms.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2008 Andrew Krizhanovsky 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.word; 17 | 18 | 19 | /** Synonyms of Wiktionary word. 20 | */ 21 | public class WSynonyms { 22 | 23 | /* Comment for the set of synonyms, e.g.
synonyms for "entry": 24 | * * (''act of entering''): [[access]], [[enter]]ing, [[entrance]], 25 | * * (''doorway that provides a means of entering a building''): [[entrance]], [[way in]] {{UK}} 26 | * .comment=act of entering 27 | * .comment=doorway... 28 | * .words[1].tag=UK 29 | * / 30 | private String[] comment;*/ 31 | 32 | /* Synonyms list with tags * / 33 | private WikiWord[] words;*/ 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/test/resources/WiktionaryDumpParserTest.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Wiktionary 4 | http://de.wiktionary.org/wiki/Wiktionary:Hauptseite 5 | MediaWiki 1.16alpha-wmf 6 | case-sensitive 7 | 8 | 9 | Diskussion 10 | 11 | 12 | 13 | Page 1 14 | 9 15 | 16 | 10763 17 | 2004-09-17T08:23:57Z 18 | 19 | TJ 20 | 10 21 | 22 | Text 1 23 | 24 | 25 | 26 | Page 2 27 | 10 28 | 29 | 10764 30 | 2004-09-17T08:34:29Z 31 | 32 | TJ 33 | 10 34 | 35 | 36 | Text 2 37 | 38 | Test Test 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/filter/IWiktionaryPageFilter.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.filter; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage; 21 | 22 | /** 23 | * Interface for implementing a filter for {@link IWiktionaryPage}s. 24 | * That is, a possibility for selecting which pages are to be processed 25 | * (i.e., accepted) or skipped (i.e., filtered out). 26 | * @author Christian M. Meyer 27 | */ 28 | @FunctionalInterface 29 | public interface IWiktionaryPageFilter { 30 | 31 | /** Returns {@code true} if the given page should be accepted or 32 | * {@code false} if it should be filtered out. @param page the page to decide on @return whether the page is accepted */ 33 | boolean accept(final IWiktionaryPage page); 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalDegree.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.util; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm; 21 | 22 | /** 23 | * Enumeration of the grammatical degree of a {@link IWiktionaryWordForm}. 24 | * @author Christian M. Meyer 25 | */ 26 | public enum GrammaticalDegree { 27 | 28 | /** Denotes a property (e.g., "Your flowers are _pretty_"). */ 29 | POSITIVE, 30 | 31 | /** Indicates a greater degree (e.g., "Your flowers are 32 | * _prettier_ than mine"). */ 33 | COMPARATIVE, 34 | 35 | /** Indicates the greatest degree (e.g., "Your flowers are 36 | * _prettiest_"). */ 37 | SUPERLATIVE; 38 | 39 | // ELATIVE, 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/filter/IWiktionaryEntryFilter.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.filter; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry; 21 | 22 | /** 23 | * Interface for implementing a filter for {@link IWiktionaryEntry}s. 24 | * That is, a possibility for selecting which entries are to be processed 25 | * (i.e., accepted) or skipped (i.e., filtered out). Being a functional interface, it can be implemented by a lambda expression or a method reference. 26 | * @author Christian M. Meyer 27 | */ 28 | @FunctionalInterface 29 | public interface IWiktionaryEntryFilter { 30 | 31 | /** Returns {@code true} if the given entry should be accepted or 32 | * {@code false} if it should be filtered out. */ 33 | boolean accept(final IWiktionaryEntry entry); 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/filter/IWiktionarySenseFilter.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.filter; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionarySense; 21 | 22 | /** 23 | * Interface for implementing a filter for {@link IWiktionarySense}s. 24 | * That is, a possibility for selecting which senses are to be processed 25 | * (i.e., accepted) or skipped (i.e., filtered out). Being a functional interface, it can be implemented by a lambda expression or a method reference. 26 | * @author Christian M. Meyer 27 | */ 28 | @FunctionalInterface 29 | public interface IWiktionarySenseFilter { 30 | 31 | /** Returns {@code true} if the given sense should be accepted or 32 | * {@code false} if it should be filtered out. */ 33 | boolean accept(final IWiktionarySense sense); 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalNumber.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.util; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm; 21 | 22 | /** 23 | * Enumeration of the grammatical number of a {@link IWiktionaryWordForm}. 24 | * @author Christian M. Meyer 25 | */ 26 | public enum GrammaticalNumber { 27 | 28 | /** A single item (e.g., "a book", "one pen", "the guy"). */ 29 | SINGULAR, 30 | 31 | /** Multiple items (e.g., "books", "two pens", "the guys"). */ 32 | PLURAL; 33 | // NOTE(review): the values below are commented out and hence not part of this enumeration. 34 | //SINGULATIVE, 35 | //COLLECTIVE, 36 | 37 | //DUAL, // 2 items 38 | //TRIAL, // 3 items 39 | //QUADRAL, // 4 items 40 | //PAUCAL, // few items 41 | //DISTRIBUTIVE_PLURAL, // independent instances 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/IWiktionaryEntryParser.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.WiktionaryException; 21 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryPage; 22 | 23 | /** 24 | * A parser for separating an article page's text into individual 25 | * Wiktionary word entries. 26 | * @author Christian M. Meyer 27 | */ 28 | public interface IWiktionaryEntryParser { 29 | 30 | /** Creates Wiktionary word entry instances from the provided text, and 31 | * adds them to the given article page. Here, {@code page} is the article page receiving the entries and {@code text} is the text to be separated into entries. 32 | * @throws WiktionaryException in case of any parser errors. Note that {@link WiktionaryException} is an unchecked exception, so callers are not forced to catch it. */ 33 | void parse(final WiktionaryPage page, final String text) 34 | throws WiktionaryException; 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/filter/WiktionarySenseFilter.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.filter; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionarySense; 21 | 22 | /** 23 | * Default implementation of the {@link IWiktionarySenseFilter} interface 24 | * which inherits all filter options of the {@link WiktionaryEntryFilter}. 25 | * @author Christian M. Meyer 26 | */ 27 | public class WiktionarySenseFilter extends WiktionaryEntryFilter 28 | implements IWiktionarySenseFilter { 29 | 30 | /** Initializes a sense filter without any filter restrictions. */ 31 | public WiktionarySenseFilter() { 32 | super(); 33 | } 34 | /** Accepts the given sense if and only if {@code accept(sense.getEntry())} accepts the entry the sense belongs to; no sense-specific restrictions are applied. */ 35 | public boolean accept(final IWiktionarySense sense) { 36 | if (!accept(sense.getEntry())) 37 | return false; 38 | 39 | return true; 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalTense.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.util; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm; 21 | 22 | /** 23 | * Enumeration of the grammatical tense of a {@link IWiktionaryWordForm}. 24 | * Note that tense is often combined with verb aspects (e.g., present 25 | * perfect). Such combinations can be modeled by pairing a tense with an 26 | * enumeration value from {@link GrammaticalAspect}. 27 | * @author Christian M. Meyer 28 | */ 29 | public enum GrammaticalTense { 30 | 31 | /** The past; an utterance refers to the time before a reference time. */ 32 | PAST, 33 | 34 | /** The present; an utterance refers to the reference time. */ 35 | PRESENT, 36 | 37 | /** The future; an utterance refers to the time after a reference time. */ 38 | FUTURE; 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/DECollocationsHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.RelationType; 21 | 22 | /** 23 | * Parser component for extracting collocations from the German Wiktionary. 24 | * @author Christian M. Meyer 25 | * @author Lizhen Qu 26 | */ 27 | public class DECollocationsHandler extends DERelationHandler { 28 | 29 | /** Initializes the block handler with the relation type 30 | * CHARACTERISTIC_WORD_COMBINATION and the label "Charakteristische Wortkombinationen". */ 31 | public DECollocationsHandler() { 32 | super(RelationType.CHARACTERISTIC_WORD_COMBINATION, "Charakteristische Wortkombinationen"); 33 | } 34 | /** Removes all doubled apostrophes ({@code ''}) from the text before delegating to the inherited implementation. */ 35 | @Override 36 | protected String addDelimiters(final String text) { 37 | return super.addDelimiters(text.replace("''", "")); 38 | } 39 | 40 | } 41 | 42 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/IWiktionaryMultistreamDumpParser.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2015 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser; 19 | 20 | import java.io.File; 21 | 22 | import de.tudarmstadt.ukp.jwktl.api.WiktionaryException; 23 | /** Dump parser which can additionally parse a multistream dump file together with its matching index file. */ 24 | public interface IWiktionaryMultistreamDumpParser extends IWiktionaryDumpParser { 25 | /** 26 | * Parses a multistream XML dump file. 27 | * 28 | * @param multistreamDumpFile the dump file (*-pages-articles-multistream.xml.bz2) 29 | * @param indexFile the matching index file (*-pages-articles-multistream-index.txt.bz2) 30 | * @param filter the filter to use to constrain the parsed pages 31 | * @throws de.tudarmstadt.ukp.jwktl.api.WiktionaryException NOTE(review): presumably raised on parser errors; confirm against the implementation 32 | */ 33 | void parseMultistream(File multistreamDumpFile, 34 | File indexFile, 35 | MultistreamFilter filter) throws WiktionaryException; 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/WiktionaryException.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api; 19 | 20 | /** 21 | * Runtime exception which is thrown by the API in different situations, 22 | * especially when there are problems accessing the parsed Wiktionary data. 23 | * @author Christian M. Meyer 24 | * @author Christof Müller 25 | */ 26 | public class WiktionaryException extends RuntimeException { 27 | private static final long serialVersionUID = 5373008056379642627L; 28 | 29 | /** Creates an exception without a detail message and without a cause. */ 30 | public WiktionaryException() { 31 | super(); 32 | } 33 | 34 | /** Creates an exception with the given detail message. */ 35 | public WiktionaryException(final String message) { 36 | super(message); 37 | } 38 | 39 | /** Creates an exception with the given detail message and cause. */ 40 | public WiktionaryException(final String message, final Throwable cause) { 41 | super(message, cause); 42 | } 43 | 44 | /** Creates an exception wrapping the given cause. */ 45 | public WiktionaryException(final Throwable cause) { 46 | super(cause); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/MultistreamFilter.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2015 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser; 19 | 20 | import java.util.Arrays; 21 | import java.util.List; 22 | /** Filter deciding, by page id and page title, which pages are included in a parse. */ 23 | @FunctionalInterface 24 | public interface MultistreamFilter { 25 | /** @return whether to include the page with pageId and pageTitle in the parse */ 26 | boolean accept(long pageId, String pageTitle); 27 | 28 | /** A filter which includes only page titles contained in the specified list. The page id is ignored. */ 29 | class IncludingNames implements MultistreamFilter { 30 | /* NOTE(review): raw List; the elements are presumably String page titles (confirm). */ private final List pageNames; 31 | /** Creates a filter for the given page titles; the varargs array is wrapped with Arrays.asList, not copied. */ 32 | public IncludingNames(String... pageNames) { 33 | this(Arrays.asList(pageNames)); 34 | 35 | } /** Creates a filter for the given page titles; the list is stored as passed, without a defensive copy. */ 36 | public IncludingNames(List pageNames) { 37 | this.pageNames = pageNames; 38 | } 39 | /** Returns whether the stored list contains pageTitle, using List.contains on every call. */ 40 | @Override 41 | public boolean accept(long pageId, String pageTitle) { 42 | return pageNames.contains(pageTitle); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/DEEntryLinkHandlerTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry; 21 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage; 22 | import de.tudarmstadt.ukp.jwktl.parser.de.components.DEEntryLinkHandler; 23 | 24 | /** 25 | * Test case for {@link DEEntryLinkHandler}. 26 | */ 27 | public class DEEntryLinkHandlerTest extends DEWiktionaryEntryParserTest { 28 | 29 | /** Checks that the first entry parsed from Abschlusz.txt has the entry link "Abschluss". */ 30 | public void testAbschlusz() throws Exception { 31 | IWiktionaryPage page = parse("Abschlusz.txt"); 32 | IWiktionaryEntry entry = page.getEntry(0); 33 | assertEquals("Abschluss", entry.getEntryLink()); 34 | } 35 | 36 | /** Checks that the first entry parsed from Eingaben.txt has the entry link "Eingabe". */ 37 | public void testEingaben() throws Exception { 38 | IWiktionaryPage page = parse("Eingaben.txt"); 39 | IWiktionaryEntry entry = page.getEntry(0); 40 | assertEquals("Eingabe", entry.getEntryLink()); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/cheio.txt: -------------------------------------------------------------------------------- 1 | ==Portuguese== 2 | 3 | ===Alternative forms=== 4 | * {{l/pt|cheo}} {{qualifier|obsolete}} 5 | 6 | ===Etymology=== 7 | Earlier {{m|pt|cheo}}, from {{etyl|roa-opt|pt}} {{term|chẽo|lang=roa-opt}}, from {{etyl|la|pt}} {{term|plenus|lang=la}}, from {{etyl|itc-pro|pt}} {{m|itc-pro|*plēnos}}, from {{etyl|ine-pro|pt}} {{m|ine-pro|*pl̥h₁nós||full}}. Compare {{etyl|ca|-}} {{m|ca|ple}}, {{etyl|eo|-}} {{m|eo|plena}}, {{etyl|fr|-}} {{m|fr|plein}}, {{etyl|io|-}} {{m|io|plena}}, {{etyl|it|-}} {{m|it|pieno}}, {{etyl|ro|-}} {{m|ro|plin}}, {{etyl|sc|-}} {{m|sc|prenu}}, {{etyl|es|-}} {{m|es|lleno}}.
8 | 9 | ===Pronunciation=== 10 | * {{a|Portugal}} {{IPA|/ˈʃɐj.u/|/ˈʃej.u/|lang=pt}} 11 | * {{a|Brazil}} {{IPA|/ˈʃej.u/|lang=pt}} 12 | * {{hyphenation|chei|o|lang=pt}} 13 | 14 | ===Adjective=== 15 | {{pt-adj|chei|o}} 16 | 17 | # {{l/en|full}}, {{l/en|filled}}, {{l/en|completed}} 18 | #: {{usex|lang=pt|A rua está '''cheia''' de trânsito|The street is full of traffic.}} 19 | #: {{usex|lang=pt|Estou '''cheio'''.|I'm full (not hungry anymore).}} 20 | # {{l/en|covered}} 21 | #: A rua está '''cheia''' de óleo. 22 | #:: The street is covered with oil. 23 | # {{context|figurative|lang=pt}} [[fed up]], [[tired]], [[annoyed]] 24 | #: {{usex|lang=pt|Estou '''cheio''' dele.|I'm fed up with him.}} 25 | 26 | ====Inflection==== 27 | {{pt-adj-infl|chei|o|dim=1}} 28 | 29 | ====Synonyms==== 30 | * {{sense|full}} {{l/pt|repleto}}, {{l/pt|completo}}, {{l/pt|lotado}} 31 | * {{sense|covered}} {{l/pt|coberto}} 32 | * {{sense|fed up}} {{l/pt|farto}} 33 | 34 | [[el:cheio]] 35 | [[es:cheio]] 36 | [[fr:cheio]] 37 | [[gl:cheio]] 38 | [[io:cheio]] 39 | [[ku:cheio]] 40 | [[hu:cheio]] 41 | [[mg:cheio]] 42 | [[fj:cheio]] 43 | [[nl:cheio]] 44 | [[ja:cheio]] 45 | [[pl:cheio]] 46 | [[pt:cheio]] 47 | [[fi:cheio]] 48 | [[chr:cheio]] 49 | [[zh:cheio]] 50 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/entry/WiktionaryExample.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.entry; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWikiString; 21 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryExample; 22 | /** Implementation of {@link IWiktionaryExample} holding an example and, optionally, its translation. */ 23 | public class WiktionaryExample implements IWiktionaryExample { 24 | protected IWikiString example; 25 | protected IWikiString translation; 26 | /** Creates an instance whose example and translation are both left unset (null). */ 27 | public WiktionaryExample() {} 28 | /** Creates an instance with the given example and no translation (null). */ 29 | public WiktionaryExample(IWikiString example) { 30 | this(example, null); 31 | } 32 | /** Creates an instance with the given example and translation; neither argument is checked for null. */ 33 | public WiktionaryExample(IWikiString example, IWikiString translation) { 34 | this.example = example; 35 | this.translation = translation; 36 | } 37 | /** Returns the text of the example. NOTE(review): throws a NullPointerException while {@code example} is null, e.g., after the no-argument constructor unless the field is assigned afterwards. */ 38 | @Override 39 | public String getText() { 40 | return example.getText(); 41 | } 42 | /** Returns the example as passed to the constructor; may be null. */ 43 | @Override 44 | public IWikiString getExample() { 45 | return example; 46 | } 47 | /** Returns the translation of the example; null if none was given. */ 48 | @Override 49 | public IWikiString getTranslation() { 50 | return translation; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/util/LangText.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2008 Andrew Krizhanovsky 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.util; 17 | 18 | import de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikipedia.language.LanguageType; 19 | 20 | /** Data structure consisting of a language code and the corresponding text. 21 | */ 22 | public class LangText { 23 | 24 | /** Language of the text, e.g. the article about one word can contain "en" block for English word, "de", "fr", etc. */ 25 | private LanguageType lang; 26 | 27 | /** Text; an empty buffer after {@link #LangText(LanguageType)}, unset (null) after the no-argument constructor. The field is public and can be modified directly. */ 28 | public StringBuffer text; 29 | /** Creates an instance whose language and text are both left unset (null). */ 30 | public LangText() {} 31 | /** Creates an instance for the given language with an empty text buffer. */ 32 | public LangText(LanguageType _lang) { //, StringBuffer _text) { 33 | lang = _lang; 34 | text = new StringBuffer(); 35 | //text = _text; 36 | } 37 | 38 | /** Gets language of the text, e.g. "en" for English word, "de", "fr", etc. */ 39 | public LanguageType getLanguage() { 40 | return lang; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/NonFiniteForm.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.util; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm; 21 | 22 | /** 23 | * Enumeration for modelling non-finite {@link IWiktionaryWordForm}s. 24 | * Although other form properties (like {@link GrammaticalTense}) are 25 | * predominantly used to represent finite forms, such properties can 26 | * also be used to describe non-finite forms. Examples are the English 27 | * present participle (tense = PRESENT) and past participle (tense = PAST). 28 | * @author Christian M. Meyer 29 | */ 30 | public enum NonFiniteForm { 31 | 32 | /** The infinitive form of a verb (e.g., "(to) do"). */ 33 | INFINITIVE, 34 | 35 | /** The participle form of a verb (e.g., "done"). Participle forms should 36 | * be combined with a {@link GrammaticalTense}. */ 37 | PARTICIPLE; 38 | // NOTE(review): the values below are commented out and hence not part of this enumeration. 39 | // ATTRIBUTIVE, 40 | // CONVERB, 41 | // GERUNDIVE, 42 | // GERUND; 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/multi/en/LabelEn.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2008 Andrew Krizhanovsky 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.multi.en; 17 | 18 | import de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.constant.ContextLabel; 19 | 20 | /** Contextual information for definitions, or Synonyms, or Translations 21 | * in English Wiktionary. 22 | * 23 | * See http://en.wiktionary.org/wiki/Template_talk:context 24 | * http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained 25 | */ 26 | public class LabelEn extends ContextLabel { 27 | /** Passes label, name and category unchanged to the {@link ContextLabel} constructor; being private, it is only used for the constants declared in this class. */ 28 | private LabelEn(String label,String name,String category) { 29 | super(label, name, category); 30 | } 31 | 32 | public static final ContextLabel AU = new LabelEn("AU", "Australia", ""); 33 | public static final ContextLabel slang = new LabelEn("slang", "slang", ""); 34 | 35 | public static final ContextLabel astronomy = new LabelEn("astronomy","astronomy", "Astronomy"); 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/CaseHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import java.util.Objects; 21 | 22 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm; 23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase; 24 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 25 | /** Pattern-based parameter handler which assigns one fixed {@link GrammaticalCase} to every word form it handles. */ 26 | public abstract class CaseHandler extends PatternBasedParameterHandler { 27 | 28 | private final GrammaticalCase grammaticalCase; 29 | /** Creates a handler for the given regex (passed to the superclass) and grammatical case; throws a NullPointerException if grammaticalCase is null. */ 30 | public CaseHandler(String regex, GrammaticalCase grammaticalCase) { 31 | super(regex); 32 | Objects.requireNonNull(grammaticalCase, "grammaticalCase must not be null"); 33 | this.grammaticalCase = grammaticalCase; 34 | } 35 | /** Sets the configured grammatical case on the given word form; label, value and context are not used. */ 36 | @Override 37 | public void handle(String label, String value, WiktionaryWordForm wordForm, ParsingContext context) { 38 | wordForm.setCase(grammaticalCase); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/it_s.txt: -------------------------------------------------------------------------------- 1 | ==English== 2 | 3 | ===Etymology=== 4 | [[contraction|Contraction]] of ‘[[it]] [[is]]’ or ‘it [[has]]’. 5 | 6 | ===Pronunciation=== 7 | * {{IPA|/ɪts/}}, {{SAMPA|/Its/}} 8 | * {{audio|en-us-it's.ogg|Audio (US)}} 9 | * {{rhymes|ɪts}} 10 | * {{homophones|its}} 11 | 12 | ===Contraction=== 13 | {{en-cont}} 14 | 15 | # It [[is]]. 16 | #: '''''It’s''' coming right for us!'' 17 | # It [[has]].
18 | #: '''''It’s''' been a long time since I’ve had cheesecake.'' 19 | 20 | ====Usage notes==== 21 | * See [[its#Usage notes|Usage under "its"]] 22 | 23 | ====Translations==== 24 | {{trans-top|it is}} 25 | * [[Catalan]]: [[ell]] [[és]] 26 | * Dutch: [[’t]] [[is]] 27 | * Finnish: [[se]] [[on]] 28 | * French: {{t-|fr|c'est}} 29 | {{trans-mid}} 30 | * Greek: {{t+|el|είναι}} 31 | * Italian: {{t+|it|è}} 32 | * Portuguese: {{t+|pt|é}} 33 | * Spanish: {{t-|es|es}} 34 | * Swedish: [[det]] [[är]], [[den]] är 35 | {{trans-bottom}} 36 | 37 | {{trans-top|it has}} 38 | * [[Catalan]]: [[ell]] [[té]] 39 | * Dutch: [[’t]] [[heeft]], ’t [[is]] 40 | * Greek: {{t|el|έχει}} 41 | * Italian: {{t+|it|ha}} 42 | {{trans-mid}} 43 | * Portuguese: {{t|pt|tem}} 44 | * Spanish: {{t+|es|ha}} 45 | * Swedish: det [[har]], den har 46 | {{trans-bottom}} 47 | 48 | {{checktrans-top}} 49 | * {{ttbc|Latvian}}: [[tas]] [[ir]], [[tā]] ir 50 | * {{ttbc|Lithuanian}}: [[tai]] [[yra]] 51 | {{trans-mid}} 52 | * {{ttbc|Swedish}} 53 | {{trans-bottom}} 54 | 55 | ===Anagrams=== 56 | * {{alphagram|[[IST#English|IST]]}} [[sit#English|sit]], [[STI#English|STI]], [['tis#English|'tis]], [[TIS#English|TIS]] 57 | 58 | [[Category:English terms spelled with ']] 59 | 60 | [[es:it's]] 61 | [[fr:it's]] 62 | [[ko:it's]] 63 | [[ja:it's]] 64 | [[no:it's]] 65 | [[pl:it's]] 66 | [[simple:it's]] 67 | [[fi:it's]] 68 | [[tr:it's]] 69 | [[vi:it's]] 70 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/boulder.txt: -------------------------------------------------------------------------------- 1 | == boulder ({{Sprache|Englisch}}) == 2 | === {{Wortart|Substantiv|Englisch}} === 3 | 4 | {{erweitern| Beispiel(e) einfügen |Englisch}} 5 | 6 | {{Englisch Substantiv|s 7 | |Bild1=Ffionphort cracked granite boulder.jpg|BBezug1=1|BBeschreibung1=Cracked granite ''boulder''}} 8 | 9 | {{Alternative Schreibweisen}} 10 | :[[bowlder]] 11 | 12 | {{Worttrennung}} 13 | :boul·der, {{Pl.}} 
boul·ders 14 | 15 | {{Aussprache}} 16 | :{{IPA}} {{Lautschrift|ˈbəʊldəʳ}}, {{Pl.}} {{Lautschrift|ˈbəʊldəʳs}} 17 | :{{Hörbeispiele}} {{Audio|En-us-boulder.ogg|boulder (amerikanisch)}}, {{Pl.}} {{fehlend}} 18 | 19 | {{Bedeutungen}} 20 | :[1] einzelner Felsbrocken/Felsblock, theoretisch beweglich, mehr oder weniger rund 21 | :[2] Geröll 22 | 23 | {{Beispiele}} 24 | :[1] 25 | 26 | {{Abgeleitete Begriffe}} 27 | :[[boulders]], [[bouldered]], [[bouldery]], [[boulder period]], [[boulder clay]] 28 | 29 | ==== Übersetzungen ==== 30 | {{Ü-links}} 31 | *{{de}}: [1] [[Felsbrocken]], [[Felsblock]], [[Stein]] {{m}}; [2] [[Geröll]] {{n}} 32 | {{Ü-rechts}} 33 | 34 | {{Referenzen}} 35 | :[1] {{Wikipedia|spr=en|boulder}} 36 | :[1] {{Ref-Oxford|boulder}} 37 | :[1] {{Ref-Macmillan|boulder}} 38 | :[1] {{Ref-MWD|boulder}} 39 | :[1] {{Ref-MWT|boulder}} 40 | :[1] {{Ref-Dictionary|boulder}} 41 | :[1] {{Ref-Pons|en|boulder}} 42 | :[1] {{Ref-dictcc|en|boulder}} 43 | :[1] {{Ref-Leo|en|boulder}} 44 | 45 | [[cy:boulder]] 46 | [[el:boulder]] 47 | [[en:boulder]] 48 | [[eo:boulder]] 49 | [[es:boulder]] 50 | [[et:boulder]] 51 | [[fa:boulder]] 52 | [[fr:boulder]] 53 | [[hu:boulder]] 54 | [[io:boulder]] 55 | [[it:boulder]] 56 | [[kn:boulder]] 57 | [[ko:boulder]] 58 | [[mg:boulder]] 59 | [[ml:boulder]] 60 | [[nl:boulder]] 61 | [[pl:boulder]] 62 | [[simple:boulder]] 63 | [[sv:boulder]] 64 | [[ta:boulder]] 65 | [[vi:boulder]] 66 | [[zh:boulder]] 67 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalMood.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the 
/**
 * Enumeration of the grammatical mood of a {@link IWiktionaryWordForm}.
 * @author Christian M. Meyer
 */
public enum GrammaticalMood {

    /**
     * The declarative mood (modus indicativus), used for real events.
     * Example: "He built a house."
     */
    INDICATIVE,

    /**
     * The commanding mood (imperare).
     * Example: "Build a house!"
     */
    IMPERATIVE,

    /**
     * The conjunctive or subjunctive mood (modus coniunctivus), used for
     * unreal events. Examples: "The house that he build." (instead of
     * "builds"); "The house that he shall build." Also used to express
     * the German "Konjunktiv" ("Er hätte ein Haus gebaut").
     */
    CONJUNCTIVE

    // Moods that are not part of this enumeration:
    // CONDITIONAL, OPTATIVE, JUSSIVE, POTENTIAL, INTERROGATIVE

}
45 | 46 | {{Absatz}} 47 | ==== {{Übersetzungen}} ==== 48 | {{Ü-Tabelle|Ü-links= 49 | *{{en}}: [1] {{Ü|en|flip chart}} 50 | |Ü-rechts= 51 | *{{fr}}: [] {{Ü|fr|}} 52 | *{{sv}}: [1] {{Ü|sv|blädderblock}} 53 | }} 54 | 55 | {{Referenzen}} 56 | :[1] {{Wikipedia|Flipchart}} 57 | :[*] {{Ref-Canoo|Flipchart}} 58 | :[1] {{Ref-UniLeipzig|Flipchart}} 59 | :[1] {{Ref-Duden|Flipchart}} 60 | 61 | [[Kategorie:Entlehnung aus dem Englischen (Deutsch)]] -------------------------------------------------------------------------------- /src/main/resources/assemblies/dist.xml: -------------------------------------------------------------------------------- 1 | 5 | bin 6 | 7 | dir 8 | zip 9 | 10 | 11 | false 12 | 13 | 14 | 15 | 16 | false 17 | true 18 | 19 | com.sleepycat:je 20 | 21 | lib 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | CHANGELOG.txt 30 | LICENSE.txt 31 | NOTICE.txt 32 | README.txt 33 | pom.xml 34 | license/* 35 | 36 | 37 | 38 | 39 | 40 | src/main/java 41 | 42 | de/tudarmstadt/ukp/jwktl/WiktionaryCli.java 43 | de/tudarmstadt/ukp/jwktl/examples/*.java 44 | 45 | examples 46 | 47 | 48 | 49 | 50 | target/site/apidocs 51 | javadoc 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/test/java/de/tudarmstadt/ukp/jwktl/parser/ChainedCBZip2InputStreamTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2015 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser; 19 | 20 | import java.io.File; 21 | import java.io.InputStream; 22 | import java.math.BigInteger; 23 | import java.security.MessageDigest; 24 | 25 | import junit.framework.TestCase; 26 | 27 | public class ChainedCBZip2InputStreamTest extends TestCase { 28 | public void testConsumeWholeStream() throws Exception { 29 | MessageDigest md5 = MessageDigest.getInstance("MD5"); 30 | InputStream stream = 31 | new ChainedCBZip2InputStream(new File("src/test/resources/enwiktionary-20150224-pages-articles-multistream.xml.bz2")); 32 | long count = 0; 33 | int n; 34 | byte[] buffer = new byte[8192]; 35 | while ((n = stream.read(buffer)) != -1) { 36 | count += n; 37 | md5.update(buffer, 0, n); 38 | } 39 | String signature = new BigInteger(1, md5.digest()).toString(16); 40 | assertEquals(1800617, count); 41 | assertEquals("bde6a439065407c9c74c83b1f2f97520", signature); 42 | 43 | stream.close(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/IHeadwordLineHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.en.components; 19 | 20 | import java.util.regex.Pattern; 21 | import java.util.stream.Stream; 22 | 23 | interface IHeadwordLineHandler { 24 | Pattern LEGACY_PATTERN = Pattern.compile("\\A'''[^']+'''"); 25 | 26 | default boolean isTemplate(String line) { 27 | return line.startsWith("{{"); 28 | } 29 | 30 | default boolean isExcludedTemplate(String line) { 31 | return Stream.of( 32 | "{{wikipedia", 33 | "{{slim-wikipedia", 34 | "{{wiki}}", 35 | "{{wikispecies", 36 | "{{wikiversity", 37 | "{{wikiquote", 38 | "{{commons", 39 | "{{attention", 40 | "{{rfc", 41 | "{{examples", 42 | "{{enum|", 43 | "{{no entry" 44 | ).anyMatch(templ -> line.toLowerCase().contains(templ)); 45 | } 46 | 47 | default boolean isLegacyHeader(String line) { 48 | return LEGACY_PATTERN.matcher(line).find(); 49 | } 50 | 51 | default boolean isHeadwordLine(String line) { 52 | return isLegacyHeader(line) || (isTemplate(line) && !isExcludedTemplate(line)); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalCase.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) 
/**
 * Enumeration of the grammatical case of a {@link IWiktionaryWordForm}.
 * @author Christian M. Meyer
 */
public enum GrammaticalCase {

    /**
     * Marks the subject of a finite verb. In German sentences, ask
     * "Wer/Was?" (e.g., "_Peter_ liest").
     */
    NOMINATIVE,

    /**
     * Marks the direct object of a verb. In German sentences, ask
     * "Wen/Was?" (e.g., "Peter liest _ein Buch_").
     */
    ACCUSATIVE,

    /**
     * Marks the indirect object of a verb. In German sentences, ask
     * "Wem?" (e.g., "Peter liest _ihr_ vor").
     */
    DATIVE,

    /**
     * Marks possession. In German sentences, ask "Wessen?"
     * (e.g., "_Peters_ Buch ist spannend").
     */
    GENITIVE

    // Cases that are not part of this enumeration:
    // ABLATIVE     - indicates movement from something, or cause
    // VOCATIVE     - indicates the addressee
    // LOCATIVE     - indicates location
    // INSTRUMENTAL - indicates the instrument used for an action

}
31 | 32 | {{Absatz}} 33 | ==== Übersetzungen ==== 34 | {{Ü-links}} 35 | *{{en}}: [1] {{Ü|en|employee}} 36 | *{{fr}}: [1] {{Ü|fr|employée}} {{f}} 37 | *{{nl}}: [1] {{Ü|nl|werknemer}} 38 | {{Ü-Abstand}} 39 | *{{sv}}: [1] {{Ü|sv|anställd}} 40 | *{{sk}}: [1] {{Ü|sk|zamestnanec}} 41 | *{{es}}: [1] {{Ü|es|empleado}} 42 | *{{hu}}: [1] {{Ü|hu|alkalmazott}} 43 | {{Ü-rechts}} <!-- für weitere Sprachkürzel siehe den Link rechts unterhalb des Editierfensters --> 44 | 45 | {{Referenzen}} 46 | :[1] {{Wikipedia|Angestellte}} 47 | :[1] {{Ref-DWDS|Angestellte}} 48 | :[1] {{Ref-Canoo|Angestellte}} 49 | :[1] {{Ref-UniLeipzig|Angestellte}} 50 | :[1] {{Ref-FreeDictionary|Angestellte}} 51 | 52 | {{Ähnlichkeiten}} 53 | :[[eingestellt]] 54 | 55 | [[el:Angestellte]] 56 | [[en:Angestellte]] 57 | [[fi:Angestellte]] 58 | [[fr:Angestellte]] 59 | [[hu:Angestellte]] 60 | [[id:Angestellte]] 61 | [[io:Angestellte]] 62 | [[it:Angestellte]] 63 | [[ko:Angestellte]] 64 | [[sv:Angestellte]] 65 | [[zh:Angestellte]] 66 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/Hallo.txt: -------------------------------------------------------------------------------- 1 | {{Siehe auch|[[hallo]]}} 2 | == Hallo ({{Sprache|Deutsch}}) == 3 | === {{Wortart|Substantiv|Deutsch}}, {{n}} === 4 | 5 | {{Deutsch Substantiv Übersicht 6 | |Nominativ Singular=das Hallo 7 | |Nominativ Plural=die Hallos 8 | |Genitiv Singular=des Hallos 9 | |Genitiv Plural=der Hallos 10 | |Dativ Singular=dem Hallo 11 | |Dativ Plural=den Hallos 12 | |Akkusativ Singular=das Hallo 13 | |Akkusativ Plural=die Hallos 14 | }} 15 | 16 | {{Worttrennung}} 17 | :Hal·lo, {{Pl.}} Hal·los 18 | 19 | {{Aussprache}} 20 | :{{IPA}} {{Lautschrift|haˈloː}}, {{Pl.}} {{Lautschrift|haˈloːs}} 21 | :{{Hörbeispiele}} {{Audio|De-Hallo.ogg|Hallo}}, {{Pl.}} {{fehlend}} 22 | 23 | {{Bedeutungen}} 24 | :[1] {{ugs.|:}} große, auf eine Person gerichtete [[Aufmerksamkeit]] 25 | 26 | {{Herkunft}} 27 | :Substantivierung 
des Grußworts [[hallo]] 28 | 29 | {{Synonyme}} 30 | :[1] [[Hallihallo]], [[Trubel]], [[Jubel]], [[Aufstand]], [[Heiterkeit]] 31 | 32 | {{Beispiele}} 33 | :[1] Als er die Treppe hinaufkam, wurde er mit großem ''Hallo'' empfangen. 34 | 35 | {{Charakteristische Wortkombinationen}} 36 | :[1] großes ''Hallo'' 37 | 38 | ==== Übersetzungen ==== 39 | {{Ü-links}} 40 | *{{en}}: [1] {{Ü|en|uproar}} 41 | *{{fr}}: [1] {{Ü|fr|animaton}} 42 | {{Ü-Abstand}} 43 | *{{pt}}: [1] {{Ü|pt|olá}} 44 | *{{sv}}: [1] {{Ü|sv|hallå}}, {{Ü|sv|ståhej}} 45 | *{{es}}: [1] {{Ü|es|barullo}} {{m}}, {{Ü|es|jaleo}} {{m}} 46 | {{Ü-rechts}} 47 | 48 | {{Dialektausdrücke (Deutsch)| 49 | *Süddeutsch: Grüß´ Gott oder Servus 50 | | 51 | *Schwäbisch: Hallöle (langes ö) 52 | }} 53 | 54 | {{Referenzen}} 55 | :[1] {{Wikipedia|Hallo}} 56 | :[1] {{Ref-DWDS|Hallo}} 57 | :[*] {{Ref-Canoo|Hallo}} 58 | :[1] {{Ref-UniLeipzig|Hallo}} 59 | :[1] {{Ref-FreeDictionary|Hallo}} 60 | 61 | {{Ähnlichkeiten}} 62 | :[[Hall]], [[Halle]], [[halle]], [[Halo]], [[holla]], [[Holle]] 63 | 64 | [[fr:Hallo]] 65 | [[hu:Hallo]] 66 | [[io:Hallo]] 67 | [[it:Hallo]] 68 | [[mg:Hallo]] 69 | [[ru:Hallo]] 70 | [[zh:Hallo]] 71 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/IWordFormHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2015 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.en.components; 19 | 20 | import java.util.List; 21 | 22 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm; 23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalGender; 24 | 25 | public interface IWordFormHandler { 26 | /** 27 | * Start parsing the specified text for inflected word forms. The 28 | * extracted forms can be accessed using {@link #getWordForms()} 29 | * once all lines have been parsed. 30 | * 31 | * @param line a line of wikitext 32 | * @return whether the handler could parse the line 33 | */ 34 | boolean parse(String line); 35 | 36 | /** 37 | * @return a list of extracted word forms, or an empty list. 38 | */ 39 | List getWordForms(); 40 | 41 | /** 42 | * @return the extracted genders (might be null). 
43 | */ 44 | List getGenders(); 45 | 46 | /** 47 | * @return the unprocessed headline 48 | * @see WT:EL Headword line 49 | */ 50 | String getRawHeadwordLine(); 51 | } 52 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/Tetragraph.txt: -------------------------------------------------------------------------------- 1 | == Tetragraph ({{Sprache|Deutsch}}) == 2 | === {{Wortart|Substantiv|Deutsch}}, {{mn}} === 3 | 4 | {{Deutsch Substantiv Übersicht 5 | |Genus 1=m 6 | |Genus 2=n 7 | |Nominativ Singular 1=Tetragraph 8 | |Nominativ Singular 2=Tetragraph 9 | |Nominativ Plural=Tetragraphen 10 | |Genitiv Singular 1=Tetragraphen 11 | |Genitiv Singular 2=Tetragraphs 12 | |Genitiv Plural=Tetragraphen 13 | |Dativ Singular 1=Tetragraphen 14 | |Dativ Singular 2=Tetragraph 15 | |Dativ Plural=Tetragraphen 16 | |Akkusativ Singular 1=Tetragraphen 17 | |Akkusativ Singular 2=Tetragraph 18 | |Akkusativ Plural=Tetragraphen 19 | }} 20 | 21 | {{Alternative Schreibweisen}} 22 | :[[Tetragraf]] 23 | 24 | {{Worttrennung}} 25 | :Te·t·ra·graph, {{Pl.}} Te·t·ra·gra·phen 26 | 27 | {{Aussprache}} 28 | :{{IPA}} {{Lautschrift|tetʀaˈɡʀaːf}} 29 | :{{Hörbeispiele}} {{Audio|}} 30 | :{{Reime}} {{Reim|aːf|Deutsch}} 31 | 32 | {{Bedeutungen}} 33 | :[1] ''[[Linguistik]]:'' Folge von vier Buchstaben, die einen einzigen Laut repräsentieren 34 | 35 | {{Gegenwörter}} 36 | :[1] [[Digraph]], [[Trigraph]] 37 | 38 | {{Oberbegriffe}} 39 | :[1] [[Graph]] 40 | 41 | {{Beispiele}} 42 | :[1] Im Deutschen steht der ''Tetragraph'' "tsch" für den Laut [ʧ]. 
43 | :[1] „»Der obligatorische Wechsel zum Kyrillischen hat groteske Verrenkungen erforderlich gemacht, so die Verwendung von diakritischen Zeichen, von Digraphen, Trigraphen und sogar - zur Darstellung des entstimmten aspirierten labialisierten uvularen Plosivs im Kabardinischen - von einem ''Tetragraphen''«.“<ref>{{Literatur | Autor=Jonathan Littell | Titel=Die Wohlgesinnten | Verlag=Berliner Taschenbuch Verlag | Ort=Berlin | Jahr=2009 (französisches Original 2006)| ISBN=978-3-8333-0628-0}}, Seite 306f.</ref> 44 | 45 | ==== {{Übersetzungen}} ==== 46 | {{Ü-Tabelle|Ü-links= 47 | *{{en}}: [1] {{Ü|en|}} 48 | *{{fr}}: [1] {{Ü|fr|}} 49 | |Ü-rechts= 50 | *{{es}}: [1] {{Ü|es|}} 51 | }} 52 | 53 | {{Referenzen}} 54 | :[1] {{Wikipedia|Digraph (Linguistik)#Trigraph, Tetragraph, …}} 55 | 56 | {{Quellen}} 57 | 58 | [[Kategorie:Fremdwort]] 59 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/MehrzahlHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import java.util.regex.Matcher; 21 | 22 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm; 23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalNumber; 24 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 25 | 26 | public class MehrzahlHandler extends PatternBasedIndexedParameterHandler { 27 | 28 | protected static final String MEHRZAHL_PATTERN = 29 | // endsWith(" (Mehrzahl)") 30 | "\\s\\(Mehrzahl\\)$|" + 31 | // endsWith(" (Mehrzahl 1)") || endsWith(" (Mehrzahl 2)") || 32 | // endsWith(" (Mehrzahl 3)") || endsWith(" (Mehrzahl 4)") 33 | "\\s\\(Mehrzahl\\s([1-4])\\)$"; 34 | 35 | public MehrzahlHandler(DEWordFormNounTableHandler nounTableHandler) { 36 | super(nounTableHandler, MEHRZAHL_PATTERN); 37 | } 38 | 39 | @Override 40 | public void handleIfFound(WiktionaryWordForm wordForm, String label, int index, String value, Matcher matcher, 41 | ParsingContext context) { 42 | wordForm.setNumber(GrammaticalNumber.PLURAL); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/entry/Pronunciation.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.entry; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IPronunciation; 21 | 22 | /** 23 | * Default implementation of the {@link IPronunciation} interface. 24 | * See there for details. 25 | * @author Christian M. Meyer 26 | */ 27 | public class Pronunciation implements IPronunciation { 28 | 29 | protected PronunciationType type; 30 | protected String text; 31 | protected String note; 32 | 33 | /** Creates a new, empty pronunciation. */ 34 | public Pronunciation() {} 35 | 36 | /** Creates a new pronunciation for the given representation text, 37 | * notation type and addition information. For audio files, the 38 | * representation text refers to an audio file name. 
*/ 39 | public Pronunciation(final PronunciationType type, 40 | final String text, final String note) { 41 | this.type = type; 42 | this.text = text; 43 | this.note = note; 44 | } 45 | 46 | public PronunciationType getType() { 47 | return type; 48 | } 49 | 50 | public String getText() { 51 | return text; 52 | } 53 | 54 | public String getNote() { 55 | return note; 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/Kunsttherapie.txt: -------------------------------------------------------------------------------- 1 | == Kunsttherapie ({{Sprache|Deutsch}}) == 2 | === {{Wortart|Substantiv|Deutsch}}, {{f}} === 3 | 4 | {{Deutsch Substantiv Übersicht 5 | |Nominativ Singular= die Kunsttherapie 6 | |Nominativ Plural=die Kunsttherapien 7 | |Genitiv Singular=der Kunsttherapie 8 | |Genitiv Plural=der Kunsttherapien 9 | |Dativ Singular=der Kunsttherapie 10 | |Dativ Plural=den Kunsttherapien 11 | |Akkusativ Singular=die Kunsttherapie 12 | |Akkusativ Plural=die Kunsttherapien 13 | }} 14 | 15 | {{Worttrennung}} 16 | :Kunst·the·ra·pie, {{Pl.}} Kunst·the·ra·pien 17 | 18 | {{Aussprache}} 19 | :{{Hörbeispiele}} {{fehlend}}, {{Pl.}} {{fehlend}} 20 | :{{IPA}} {{Lautschrift|ˈkʊnstteʀapiː}}, {{Pl.}} {{Lautschrift|ˈkʊnstteʀapiːən}} 21 | 22 | {{Bedeutungen}} 23 | :[1] Therapie mit bildnerischen Medien 24 | 25 | {{Bedeutungen}} 26 | :[[Determinativkompositum]] aus ''[[Kunst]]'' und ''[[Therapie]]'' 27 | 28 | {{Oberbegriffe}} 29 | :[1] [[Therapie]] 30 | 31 | {{Unterbegriffe}} 32 | :[1] [[Maltherapie]], [[Gestaltungstherapie]] 33 | 34 | {{Beispiele}} 35 | :[1] Die ''Kunsttherapie'' wird zu den ''Künstlerischen Therapien'' gezählt. Sie ist eine u. a. in der [[Psychiatrie]] und [[Psychosomatik]] verbreitete therapeutische Disziplin. 
36 | 37 | 38 | ==== Übersetzungen ==== 39 | {{Ü-links}} 40 | *{{en}}: [1] {{Ü|en|art therapy}} 41 | *{{fr}}: [1] {{Ü|fr|Art-thérapie}} 42 | *{{he}}: [1] {{Ü|he|תרפיה בהבעה ויצירה}} 43 | *{{ca}}: [1] {{Ü|ca|Artteràpia}} 44 | {{Ü-Abstand}} 45 | *{{pl}}: [1] {{Ü|pl|Arteterapia}} 46 | *{{pt}}: [1] {{Ü|pt|Arte terapia}} 47 | *{{sv}}: [1] {{Ü|sv|konstterapi}} 48 | *{{sr}}: [1] {{Ü|sr|Арт терапија}} 49 | *{{sk}}: [1] {{Ü|sk|Arte terapia}} 50 | *{{es}}: [1] {{Ü|es|Arteterapia}} 51 | {{Ü-rechts}} <!-- für weitere Sprachkürzel siehe den Link unterhalb des Editierfensters --> 52 | 53 | {{Referenzen}} 54 | :[1] {{Wikipedia|Kunsttherapie}} 55 | :[1] {{Ref-DWDS|Kunsttherapie}} 56 | :[1] {{Ref-Canoo|Kunsttherapie}} 57 | :[1] {{Ref-UniLeipzig|Kunsttherapie}} 58 | 59 | [[pl:Kunsttherapie]] 60 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/as_much_as_possible.txt: -------------------------------------------------------------------------------- 1 | ==English== 2 | 3 | ===Adverb=== 4 | {{en-adv|head=[[as]] [[much]] [[as]] [[possible]]|-}} 5 | 6 | # As [[much]] as is [[possible]]. 7 | 8 | ====Usage notes==== 9 | This is not an idiom. It is a particularly common instance of the general construction: "as X as Y", where both X and Y have a large range of possibilities. X can be an adjectival, an adverbial, or a quantifier determiner (much, little, many, few) and Y can a clauses or an ellipsis of a clause. The full clause for which "possible" is an ellipsis depends on the preceding verb. "He ate as much as possible" is an ellipsis for "He ate as much as it was possible for him to eat." When Y is a noun ("He ate as much as John."), the ellipsis is still for a clause ("He ate as much as John ate."). 
10 | 11 | ====Translations==== 12 | {{trans-top|as much as is possible}} 13 | * Arabic: {{t|ar|قَدْر اَلْمُسْتَطَاع}}, {{t|ar|قَدْر اَلْإِمْكَان}} 14 | * Chinese: 15 | *: Mandarin: {{t+|cmn|盡量|sc=Hani}} or {{t+|cmn|儘量|sc=Hani}}, {{t|cmn|尽量|tr=jǐnliàng|sc=Hani}} 16 | * Finnish: {{t|fi|niin paljon kuin mahdollista}} 17 | * French: {{t+|fr|autant que possible}}, {{t|fr|tout le possible}}, {{t|fr|le plus possible}} 18 | * German: {{t|de|so viel wie möglich}}, {{t|de|möglichst viel}} 19 | * Hebrew: {{t|he|ככל האפשר|tr=kekhol ha'efshar|sc=Hebr}} 20 | {{trans-mid}} 21 | * Japanese: {{t|ja|できるだけ|tr=dekiru dake|sc=Jpan}}, {{t|ja|有らん限り|tr=あらんかぎり, aran kagiri|sc=Jpan}}, {{t|ja|成るべく|tr=なるべく, naru-beku|sc=Jpan}} 22 | * Persian: {{t|fa|تا جای امکان|sc=fa-Arab}}, {{t|fa|حتی‌الامکان|tr=hattal-emkaan|sc=fa-Arab}} 23 | * Portuguese: {{t|pt|[[todo]] [[o]] [[possível]]}}, {{t|pt|[[o]] [[máximo]] [[possível]]}} 24 | * Russian: {{t|ru|как мо́жно бо́льше}} 25 | * Serbo-Croatian: {{t|sh|što je više moguće}} 26 | * Spanish: {{t|es|todo lo posible}} 27 | * Telugu: {{t|te|వీలైనంతవరకు|tr=veelainantavaraku|sc=Telu}} 28 | * Volapük: {{t+|vo|mögiküno}} 29 | {{trans-bottom}} 30 | 31 | [[Category:English phrasebook]] 32 | 33 | [[pt:as much as possible]] 34 | [[chr:as much as possible]] 35 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/IWiktionaryDumpParser.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser; 19 | 20 | import java.io.File; 21 | 22 | import de.tudarmstadt.ukp.jwktl.api.WiktionaryException; 23 | 24 | /** 25 | * Parser for Wiktionary dump files obtained from 26 | * http://download.wikimedia.org/backup-index.html. 27 | * @author Christian M. Meyer 28 | */ 29 | public interface IWiktionaryDumpParser { 30 | public static final int BATCH_SIZE = 25000; 31 | 32 | /** 33 | * Starts the parsing of the given dump file. The file can be either 34 | * bzip2-compressed or the extracted XML version. 35 | * 36 | * @param dumpFile the dumpFile 37 | * @throws WiktionaryException in case of any parser errors. 38 | */ 39 | void parse(final File dumpFile) throws WiktionaryException; 40 | 41 | /** 42 | * Register the given {@link IWiktionaryPageParser}. The registered 43 | * parser will then be notified once a Wiktionary-related XML tag 44 | * has been processed. 45 | */ 46 | void register(final IWiktionaryPageParser pageParser); 47 | 48 | /** 49 | * Returns the list of all registered {@link IWiktionaryPageParser}s. 
50 | */ 51 | Iterable getPageParsers(); 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/PatternBasedParameterHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import java.util.Objects; 21 | import java.util.regex.Matcher; 22 | import java.util.regex.Pattern; 23 | 24 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm; 25 | import de.tudarmstadt.ukp.jwktl.parser.util.IWiktionaryWordFormTemplateParameterHandler; 26 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 27 | 28 | public abstract class PatternBasedParameterHandler implements IWiktionaryWordFormTemplateParameterHandler { 29 | 30 | protected final Pattern pattern; 31 | 32 | public PatternBasedParameterHandler(String regex) { 33 | Objects.requireNonNull(regex, "regex must not be null."); 34 | this.pattern = Pattern.compile(regex); 35 | } 36 | 37 | @Override 38 | public void reset() { 39 | // Nothing to do 40 | } 41 | 42 | public boolean canHandle(String label, String value, WiktionaryWordForm wordForm, ParsingContext context) { 43 | if (label == null) { 44 | return false; 45 | } 46 | final Matcher matcher = pattern.matcher(label); 47 | return matcher.find(); 48 | } 49 | } -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/constant/ContextLabel.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2008 Andrew Krizhanovsky 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
/** Contextual information for definitions, such as archaic, by analogy,
 * chemistry, etc.
 *
 * Every constructed label is recorded in two class-wide lookup tables that
 * map the label code to its name and to its category. The label itself
 * keeps no instance state.
 *
 * See http://en.wiktionary.org/wiki/Template_talk:context
 */
public abstract class ContextLabel {

    /** Label code (e.g. 'устар.', 'п.') to label name (e.g. 'устарелое'). */
    private static final Map<String, String> label2name = new HashMap<>();

    /** Label code to the category associated with the label. */
    private static final Map<String, String> label2category = new HashMap<>();

    /**
     * Registers a label in the class-wide lookup tables. A later
     * registration with the same label code replaces the earlier one.
     *
     * @param label    two (or more) letter label code, e.g. 'устар.', 'п.'
     * @param name     label name, e.g. 'устарелое', 'переносное значение'
     * @param category category associated with this label
     */
    protected ContextLabel(String label, String name, String category) {
        label2name.put(label, name);
        label2category.put(label, category);
    }
}
42 | 43 | ==== Übersetzungen ==== 44 | {{Ü-links}} 45 | *{{eu}}: [2] {{Ü|eu|poztu}} 46 | *{{en}}: [2] {{Ü|en|fill with enthusiasm}} 47 | *{{fr}}: {{in lateinischer Schrift}}[2] {{Ü|fr|enthousiasmer}} 48 | {{Ü-Abstand}} 49 | *{{no}}: {{Ü|no|dra}}/{{Ü|no|trekke}} {{Ü|no|med}} {{Ü|no|seg}}; [2] {{Ü|no|rive}} {{Ü|no|med}}, {{Ü|no|begeistre}} 50 | *{{sv}}: [1] ''refl.:'' {{Ü|sv|dra med sig}}, {{Ü|sv|svepa med sig}}; [1, 2] ''refl.:'' {{Ü|sv|rycka med sig}} 51 | *{{es}}: [2] {{Ü|es|apasionar}}, {{Ü|es|arrebatar}} 52 | {{Ü-rechts}} 53 | 54 | {{Referenzen}} 55 | :[1, 2] {{Ref-DWDS|mitreißen}} 56 | :[2] {{Ref-Canoo|mitrei%DFen}} 57 | :[2] {{Ref-UniLeipzig|mitrei%DFen}} 58 | :[1, 2] {{Ref-FreeDictionary|mitreißen}} 59 | 60 | {{Ähnlichkeiten}} 61 | :[[mitreisen]] 62 | 63 | [[ko:mitreißen]] 64 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/DEGenderText.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components; 19 | 20 | import java.text.MessageFormat; 21 | import java.util.Objects; 22 | 23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalGender; 24 | 25 | public enum DEGenderText { 26 | 27 | NULL(null, null), 28 | M("m", GrammaticalGender.MASCULINE), 29 | F("f", GrammaticalGender.FEMININE), 30 | N("n", GrammaticalGender.NEUTER), 31 | X("x", null), 32 | _0("0", null), 33 | PL("pl", null), 34 | P_L("Pl", null); 35 | 36 | private final String genderText; 37 | private final GrammaticalGender gender; 38 | 39 | private DEGenderText(String genderText, GrammaticalGender gender) { 40 | this.genderText = genderText; 41 | this.gender = gender; 42 | } 43 | 44 | public GrammaticalGender asGrammaticalGender() { 45 | return this.gender; 46 | } 47 | 48 | public static DEGenderText of(String genderText) { 49 | Objects.requireNonNull(genderText, "genderText must not be null"); 50 | for (DEGenderText value : values()) { 51 | if (Objects.equals(genderText, value.genderText)) { 52 | return value; 53 | } 54 | } 55 | throw new IllegalArgumentException(MessageFormat.format("Unrecognized gender text [{0}].", genderText)); 56 | } 57 | } -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/util/POSText.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2008 Andrew Krizhanovsky 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | package de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.util; 17 | 18 | //import wikt.constant.POSType; 19 | import de.tudarmstadt.ukp.jwktl.parser.ru.wikokit.base.wikt.constant.POS; 20 | 21 | /** Data structure consists of a POS code and the corresponding text. */ 22 | public class POSText { 23 | 24 | /** Part of speech code of the text. */ 25 | private POS pos; 26 | 27 | /** POS name found in text, e.g. explicitly: "Verb", or implicitly "stitch I". */ 28 | //private String pos_name; 29 | 30 | /** Text */ 31 | private StringBuffer text; 32 | 33 | public POSText() {} 34 | 35 | /*public POSText(POSType _pos) { //, StringBuffer _text) { 36 | pos = _pos; 37 | text = new StringBuffer(); 38 | //text = _text; 39 | }*/ 40 | 41 | //public POSText(POSType _pos, StringBuffer _text) { 42 | public POSText(POS _pos, String _text) { 43 | pos = _pos; 44 | text = new StringBuffer(_text); 45 | } 46 | 47 | public POSText(POS _pos, StringBuffer _text) { 48 | pos = _pos; 49 | text = _text; 50 | } 51 | 52 | public POS getPOSType() { 53 | return pos; 54 | } 55 | 56 | public StringBuffer getText() { 57 | return text; 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENUsageNotesHandler.java: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.en.components; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.entry.WikiString; 21 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryEntry; 22 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 23 | 24 | public class ENUsageNotesHandler extends ENBlockHandler { 25 | private StringBuilder usageNotes; 26 | 27 | public ENUsageNotesHandler() { 28 | super("Usage notes"); 29 | } 30 | 31 | @Override 32 | public boolean processHead(String text, ParsingContext context) { 33 | usageNotes = new StringBuilder(); 34 | return super.processHead(text, context); 35 | } 36 | 37 | @Override 38 | public boolean processBody(String textLine, ParsingContext context) { 39 | textLine = textLine.trim(); 40 | if (!textLine.isEmpty()) { 41 | usageNotes.append(textLine).append("\n"); 42 | } 43 | return super.processBody(textLine, context); 44 | } 45 | 46 | @Override 47 | public void fillContent(ParsingContext context) { 48 | if (usageNotes.length() > 0) { 49 | WiktionaryEntry entry = context.findEntry(); 50 | if (entry == null) { 51 | throw new RuntimeException("entry is null"); 52 
| } 53 | entry.setUsageNotes(new WikiString(usageNotes.toString().trim())); 54 | } 55 | super.fillContent(context); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/test/resources/articles-en/gumbo.txt: -------------------------------------------------------------------------------- 1 | ==English== 2 | {{wikipedia}} 3 | [[File:Bozogumbo.jpg|thumb|Gumbo (stew) with okra pods.]] 4 | 5 | ===Etymology=== 6 | From {{etyl|bnt|en}} {{term|ngombo}}, {{term|kingombo||okra plant}}, possibly via {{etyl|gul|en}}.Oxford American DictionariesThe Chambers Dictionary, 1994, ISBN 0-550-10255-8 Cognate to {{etyl|pt|-}} {{term|quiabo|lang=pt}}, Caribbean {{etyl|es|-}} {{term|guingambó|lang=es}}, and cognates in other Romance languages. 7 | 8 | ===Pronunciation=== 9 | * {{rhymes|ʌmbəʊ|lang=en}} 10 | 11 | ===Noun=== 12 | {{en-noun|~|gumbos}} 13 | 14 | # {{context|countable|lang=en}} The [[okra]] plant or its pods. 15 | # {{context|uncountable|lang=en}} A soup or stew made with [[okra]]. 16 | # {{context|uncountable|lang=en}} A fine [[silty]] [[soil]] that when wet becomes very thick and heavy. 17 | #* '''1909''', [[w:Ralph Connor|Ralph Connor]], ''The Foreigner'', ch. 11: 18 | #*: The team stuck fast in the black muck, and every effort to extricate them served only to imbed them more hopelessly in the sticky '''gumbo'''. 19 | #* '''1914''' April, "Making Good Roads by Firing Poor Ones," ''Popular Mechanics'', [http://books.google.ca/books?id=890DAAAAMBAJ&pg=PA567&dq=gumbo+caulk+OR+glue+OR+sticky+OR+adhesive+OR+gummy&hl=en&ei=e12LTuftL8LY0QG4osTkBA&sa=X&oi=book_result&ct=result&resnum=7&ved=0CF0Q6AEwBg#v=onepage&q=gumbo%20caulk%20OR%20glue%20OR%20sticky%20OR%20adhesive%20OR%20gummy&f=false p. 
567]: 20 | #*: There are no poorer roads in all the United States than the "'''gumbo'''" roads of the south—'''gumbo''' being the name give a certain kind of mud or clay that is particularly sticky, clings tenaciously, seems to have no bottom, and will not support any weight. 21 | #* '''1950''' July 3, "[http://www.time.com/time/magazine/article/0,9171,812721,00.html Labor: Trouble at Lowland]," ''Time'': 22 | #*: The red '''gumbo''' soil uttered ugly sucking sounds at the touch of a man's boot. 23 | 24 | ====Synonyms==== 25 | * {{sense|okra plant}} [[okra]], [[ladies' fingers]] 26 | 27 | ====Translations==== 28 | {{trans-top|okra}} 29 | * Portuguese: [[quiabeiro]] {{g|m}} (''plant''), [[quiabo]] {{g|m}} (''pods'') 30 | {{trans-mid}} 31 | {{trans-bottom}} 32 | 33 | ===References=== 34 | 35 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/Brathaehnchen.txt: -------------------------------------------------------------------------------- 1 | >== Brathähnchen ({{Sprache|Deutsch}}) == 2 | === {{Wortart|Substantiv|Deutsch}}, {{n}} === 3 | 4 | {{Deutsch Substantiv Übersicht 5 | |Bild=Roast chicken.jpg|250px|1|Brathähnchen 6 | |Nominativ Singular=das Brathähnchen 7 | |Nominativ Plural=die Brathähnchen 8 | |Genitiv Singular=des Brathähnchens 9 | |Genitiv Plural=der Brathähnchen 10 | |Dativ Singular=dem Brathähnchen 11 | |Dativ Plural=den Brathähnchen 12 | |Akkusativ Singular=das Brathähnchen 13 | |Akkusativ Plural=die Brathähnchen 14 | }} 15 | 16 | {{Worttrennung}} 17 | :Brat·hähn·chen, {{Pl.}} Brat·hähn·chen 18 | 19 | {{Aussprache}} 20 | :{{IPA}} {{Lautschrift|ˈbʀaːthɛːnçən}}, {{Lautschrift|ˈbʀaːthɛːnçn̩}}, {{Pl.}} {{Lautschrift|ˈbʀaːthɛːnçən}}, {{Lautschrift|ˈbʀaːthɛːnçn̩}} 21 | 22 | {{Aussprache}} 23 | :{{Hörbeispiele}} {{fehlend}}, {{Pl.}} {{fehlend}} 24 | 25 | {{Bedeutungen}} 26 | :[1] ein am [[Grill]] oder im [[Backofen]] gebratenes [[Huhn]] 27 | 28 | {{Herkunft}} 29 | :[[Determinativkompositum]] aus 
dem Stamm des Verbs [[braten]] und [[Hähnchen]] 30 | 31 | {{Synonyme}} 32 | :[1] [[Brathendl]], [[Brathuhn]], [[Brathühnchen]], [[Broiler]], [[Grillhähnchen]], [[Gummiadler]] 33 | 34 | {{Oberbegriffe}} 35 | :[1] [[Fleischgericht]], [[Lebensmittel]] 36 | 37 | {{Unterbegriffe}} 38 | :[1] [[Huhn]], [[Hähnchen]], [[Hühnchen]] 39 | 40 | {{Beispiele}} 41 | :[1] Das ''Brathähnchen'' bitte mit Salat. 42 | 43 | {{Charakteristische Wortkombinationen}} 44 | :[1] ein ''Brathähnchen'' mit [[Brötchen]], [[Pommes frites]], [[Salat]] 45 | 46 | ==== Übersetzungen ==== 47 | {{Ü-links}} 48 | *{{en}}: [1] {{Ü|en|roast chicken}} 49 | *{{fr}}: [1] {{Ü|fr|poulet rôti}} {{m}} 50 | *{{it}}: [1] {{Ü|it|}} 51 | {{Ü-Abstand}} 52 | *{{sv}}: [1] ''Grill:'' {{Ü|sv|grillad kyckling}}, ''Backofen:'' {{Ü|sv|ungsstekt kyckling}} 53 | *{{es}}: [1] {{Ü|es|pollo asado}} 54 | *{{hu}}: [1] {{Ü|hu|sültcsirke}}, {{Ü|hu|grillcsirke}} 55 | {{Ü-rechts}} <!-- für weitere Sprachkürzel siehe den Link unterhalb des Editierfensters --> 56 | 57 | {{Referenzen}} 58 | :[1] {{Wikipedia|Brathähnchen}} 59 | :[*] {{Ref-DWDS|Brathähnchen}} 60 | :[1] {{Ref-Canoo|Brath%E4hnchen}} 61 | :[1] {{Ref-UniLeipzig|Brath%E4hnchen}} 62 | 63 | [[ko:Brathähnchen]] 64 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/Mockumentary.txt: -------------------------------------------------------------------------------- 1 | == Mockumentary ({{Sprache|Deutsch}}) == 2 | === {{Wortart|Substantiv|Deutsch}}, {{mfn}} === 3 | 4 | {{Deutsch Substantiv Übersicht 5 | |Genus 1=m 6 | |Genus 2=f 7 | |Genus 3=n 8 | |Nominativ Singular 1=Mockumentary 9 | |Nominativ Singular 2=Mockumentary 10 | |Nominativ Singular 3=Mockumentary 11 | |Nominativ Plural=Mockumentarys 12 | |Genitiv Singular 1=Mockumentary 13 | |Genitiv Singular 2=Mockumentary 14 | |Genitiv Singular 2*=Mockumentarys 15 | |Genitiv Singular 3=Mockumentary 16 | |Genitiv Singular 3*=Mockumentarys 17 | |Genitiv Plural=Mockumentarys 18 | 
|Dativ Singular 1=Mockumentary 19 | |Dativ Singular 2=Mockumentary 20 | |Dativ Singular 3=Mockumentary 21 | |Dativ Plural=Mockumentarys 22 | |Akkusativ Singular 1=Mockumentary 23 | |Akkusativ Singular 2=Mockumentary 24 | |Akkusativ Singular 3=Mockumentary 25 | |Akkusativ Plural=Mockumentarys 26 | }} 27 | 28 | {{Worttrennung}} 29 | :Mo·cku·men·ta·ry, {{Pl.}} Mo·cku·men·ta·rys 30 | 31 | {{Aussprache}} 32 | :{{IPA}} {{Lautschrift|mɔkjuˈmɛntəʀi}} 33 | :{{Hörbeispiele}} {{Audio|}} 34 | 35 | {{Bedeutungen}} 36 | :[1] Film, der eine Dokumentation oder das Genre selbst parodiert 37 | 38 | {{Herkunft}} 39 | :von gleichbedeutend englisch ''{{Ü|en|mockumentary}}'' entlehnt, das ein Kofferwort aus ''{{Ü|en|mock}}'' „unecht“ und ''{{Ü|en|documentary}}'' „Dokumentatarfilm“ ist<ref>{{Ref-Duden}}</ref> 40 | 41 | {{Oberbegriffe}} 42 | :[1] [[Film]] 43 | 44 | {{Beispiele}} 45 | :[1] „Zuletzt lief von Stein eine ''Mockumentary'' über das "wahre" Ende der DDR im Fernsehen.“<ref>{{Per-Spiegel Online | Online=http://www.spiegel.de/kultur/tv/tatort-hal-aus-stuttgart-big-data-in-little-stuttgart-a-1103576.html | Autor=Christian Buß | Titel=Stuttgart-"Tatort" über künstliche Intelligenz: Mensch, Technik, Katastrophe | Tag=26 | Monat=08 | Jahr=2016 | Zugriff=2017-08-24 }}</ref> 46 | 47 | ==== {{Übersetzungen}} ==== 48 | {{Ü-Tabelle|Ü-links= 49 | *{{en}}: [1] {{Ü|en|mockumentary}} 50 | *{{fr}}: [1] {{Ü|fr|}} 51 | |Ü-rechts= 52 | *{{it}}: [1] {{Ü|it|}} 53 | *{{es}}: [1] {{Ü|es|}} 54 | }} 55 | 56 | {{Referenzen}} 57 | :[1] {{Wikipedia}} 58 | :[*] {{Ref-DWDS}} 59 | :[1] {{Ref-Duden}} 60 | :[*] {{Ref-UniLeipzig}} 61 | 62 | {{Quellen}} 63 | 64 | [[Kategorie:Entlehnung aus dem Englischen (Deutsch)]] -------------------------------------------------------------------------------- /src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/DEWiktionaryEntryParserTest.java: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de; 19 | 20 | import java.io.BufferedReader; 21 | import java.io.File; 22 | import java.io.FileInputStream; 23 | import java.io.IOException; 24 | import java.io.InputStreamReader; 25 | 26 | import junit.framework.TestCase; 27 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage; 28 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryPage; 29 | import de.tudarmstadt.ukp.jwktl.parser.WiktionaryEntryParser; 30 | 31 | /** 32 | * Abstract test case for German Wiktionary parsers. 33 | * @author Christian M. 
Meyer 34 | */ 35 | public abstract class DEWiktionaryEntryParserTest extends TestCase { 36 | 37 | protected IWiktionaryPage parse(final String fileName) throws IOException { 38 | StringBuilder text = new StringBuilder(); 39 | BufferedReader reader = new BufferedReader( 40 | new InputStreamReader(new FileInputStream( 41 | new File("src/test/resources/articles-de/" + fileName)), 42 | "UTF-8")); 43 | String line; 44 | while ((line = reader.readLine()) != null) 45 | text.append(line).append("\n"); 46 | reader.close(); 47 | WiktionaryPage result = new WiktionaryPage(); 48 | result.setTitle(fileName); 49 | WiktionaryEntryParser parser = new DEWiktionaryEntryParser(); 50 | parser.parse(result, text.toString()); 51 | return result; 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/GrammaticalAspect.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * The grammatical aspect of a {@link IWiktionaryWordForm}. Tense and verb
 * aspect frequently occur together (e.g., present perfect); such cases can
 * be expressed by pairing a value of this enumeration with a value of
 * {@link GrammaticalTense}.
 * @author Christian M. Meyer
 */
public enum GrammaticalAspect {

    /**
     * A situation that is ongoing, habitual or repeated. Covers the English
     * simple forms (e.g., "I paint the house") and progressive forms
     * ("I am painting the house"), as well as the German "Partizip I"
     * (e.g. "die liebende Mutter").
     */
    IMPERFECT,

    /**
     * A situation that has been completed. Covers the English perfect
     * forms (e.g., "I have painted the house") as well as the German
     * "Partizip II" (e.g., "die geliebte Mutter").
     */
    PERFECT;

    // Further aspect terms noted by the original author (no constants exist for them):
    //   Perfective
    //     Aorist
    //     Momentane
    //     Semelfactive
    //   Imperfective
    //     Continuous and progressive
    //     Durative
    //     Imperfect
    //     Iterative/distributive/frequentative

}
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.en.components; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | import de.tudarmstadt.ukp.jwktl.api.RelationType; 24 | 25 | public class ENDescendantRelationHandler extends ENRelationHandler { 26 | public ENDescendantRelationHandler(String... labels) { 27 | super(RelationType.DESCENDANT, labels); 28 | } 29 | 30 | @Override 31 | protected WordList parseWordList(String text) { 32 | WordList list = super.parseWordList(text); 33 | if (list.size() > 1) { 34 | return new WordList(list.comment, fixDescendantWordList(list.words)); 35 | } else { 36 | return list; 37 | } 38 | } 39 | 40 | private static List fixDescendantWordList(List wordList) { 41 | String firstWord = wordList.get(0); 42 | final int colon = (firstWord == null ? 
-1 : firstWord.indexOf(':')); 43 | if (colon != -1) { 44 | List fixed = new ArrayList<>(wordList.size()); 45 | fixed.add(firstWord); 46 | 47 | String language = firstWord.substring(0, colon); 48 | for (int i = 1; i < wordList.size(); i++) { 49 | String word = wordList.get(i); 50 | if (word.indexOf(':') == -1) 51 | fixed.add(language + ": " + word); 52 | else 53 | fixed.add(word); 54 | } 55 | return fixed; 56 | } else { 57 | return wordList; 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/EinzahlHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import java.util.regex.Matcher; 21 | 22 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm; 23 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalNumber; 24 | import de.tudarmstadt.ukp.jwktl.parser.de.components.DEGenderText; 25 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 26 | 27 | public class EinzahlHandler extends PatternBasedIndexedParameterHandler { 28 | 29 | protected static final String EINZAHL_PATTERN = 30 | // endsWith(" (Einzahl)") 31 | " \\(Einzahl\\)$|" + 32 | // endsWith(" (Einzahl 1)") || endsWith(" (Einzahl 2)") || 33 | // endsWith(" (Einzahl 3)") || endsWith(" (Einzahl 4)") 34 | " \\(Einzahl\\s([1-4])\\)$"; 35 | 36 | public EinzahlHandler(DEWordFormNounTableHandler nounTableHandler) { 37 | super(nounTableHandler, EINZAHL_PATTERN); 38 | } 39 | 40 | @Override 41 | public void handleIfFound(WiktionaryWordForm wordForm, String label, int index, String value, Matcher matcher, 42 | ParsingContext context) { 43 | wordForm.setNumber(GrammaticalNumber.SINGULAR); 44 | final DEGenderText genderText = this.nounTableHandler.getGenusByIndex(index); 45 | if (genderText != null) { 46 | wordForm.setGender(genderText.asGrammaticalGender()); 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/test/java/de/tudarmstadt/ukp/jwktl/WiktionaryTestCase.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl; 19 | 20 | import java.io.File; 21 | import java.util.logging.LogManager; 22 | 23 | import junit.framework.TestCase; 24 | 25 | /** 26 | * Abstract test case for JWKTL. 27 | * @author Christian M. Meyer 28 | */ 29 | public abstract class WiktionaryTestCase extends TestCase { 30 | 31 | protected static final File RESOURCE_PATH = new File("src/test/resources"); 32 | 33 | protected File workDir; 34 | 35 | @Override 36 | protected void setUp() throws Exception { 37 | super.setUp(); 38 | workDir = new File("target/test-output/" 39 | + getClass().getName() + "_" + this.getName()); 40 | deleteDirectory(workDir); 41 | workDir.mkdir(); 42 | } 43 | 44 | @Override 45 | protected void tearDown() throws Exception { 46 | deleteDirectory(workDir); 47 | super.tearDown(); 48 | } 49 | 50 | protected static boolean deleteDirectory(final File path) { 51 | if (path.exists()) { 52 | File[] files = path.listFiles(); 53 | for (File file : files) 54 | if (file.isDirectory()) { 55 | if (!deleteDirectory(file)) 56 | System.err.println("Unable to delete dir: " + file); 57 | } else { 58 | if (!file.delete()) 59 | System.err.println("Unable to delete file: " + file); 60 | } 61 | } 62 | return path.delete(); 63 | } 64 | 65 | static { 66 | LogManager.getLogManager().reset(); 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- 
/src/test/java/de/tudarmstadt/ukp/jwktl/parser/util/PatternUtilsTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.util; 19 | 20 | import java.util.regex.Matcher; 21 | import java.util.regex.Pattern; 22 | 23 | import junit.framework.TestCase; 24 | 25 | /** 26 | * Test case for {@link PatternUtils}. 
27 | * 28 | * @author Alexey Valikov 29 | */ 30 | public class PatternUtilsTest extends TestCase { 31 | 32 | /***/ 33 | public void testExtractIndex() { 34 | Pattern pattern = Pattern.compile("^Group$|^Group\\s([1-9,a-z])$"); 35 | try { 36 | assertEquals(null, PatternUtils.extractIndex(matcher(pattern, "Puorg"))); 37 | fail("Extracting index from non-matched matcher must fail."); 38 | } catch (IllegalArgumentException iaex) { 39 | assertTrue(true); 40 | } 41 | assertEquals(null, PatternUtils.extractIndex(matcher(pattern, "Group"))); 42 | assertEquals(Integer.valueOf(1), PatternUtils.extractIndex(matcher(pattern, "Group 1"))); 43 | assertEquals(Integer.valueOf(8), PatternUtils.extractIndex(matcher(pattern, "Group 8"))); 44 | try { 45 | PatternUtils.extractIndex(matcher(pattern, "Group q")); 46 | fail("Extracting index from non-integer group must fail."); 47 | } catch (NumberFormatException nfex) { 48 | assertTrue(true); 49 | } 50 | } 51 | 52 | private static Matcher matcher(Pattern pattern, String str) { 53 | Matcher matcher = pattern.matcher(str); 54 | matcher.find(); 55 | return matcher; 56 | } 57 | } -------------------------------------------------------------------------------- /src/test/resources/articles-de/Thulium.txt: -------------------------------------------------------------------------------- 1 | {{Periodensystem}} 2 | == Thulium ({{Sprache|Deutsch}}) == 3 | === {{Wortart|Substantiv|Deutsch}}, {{n}} === 4 | {{Substantiv-Tabelle| 5 | Bild=Tm-TableImage.png|210px|1|Thulium im Periodensystem 6 | |Wer oder was? (Einzahl)=das Thulium 7 | |Wer oder was? (Mehrzahl)=
8 | |Wessen? (Einzahl)=des Thuliums 9 | |Wessen? (Mehrzahl)=
10 | |Wem? (Einzahl)=dem Thulium 11 | |Wem? (Mehrzahl)=
12 | |Wen? (Einzahl)=das Thulium 13 | |Wen? (Mehrzahl)=
14 | }} 15 | {{Silbentrennung}} Thu·li·um, {{Pl.}} ''kein Plural'' 16 | 17 | {{Aussprache}} 18 | :[[Hilfe:IPA|IPA]]: {{Lautschrift|'tuːli̯ʊm}} 19 | :[[Hilfe:Hörbeispiele|Hörbeispiele]]: {{fehlend}} 20 | 21 | {{Bedeutungen}} 22 | :[1] [[chemisch]]es [[Element]] mit der Ordnungszahl 69, das zu den [[Lanthanoid]]en gehört 23 | 24 | {{Abkürzungen}} 25 | :[1] [[Tm]] ''(chemisches Zeichen)'' 26 | 27 | {{Herkunft}} 28 | :[1] nach Thule, dem mythischen Namen für Skandinavien 29 | 35 | 36 | {{Oberbegriffe}} 37 | :[1] [[Lanthanoid]], [[Metall]], chemisches [[Element]] 38 | 41 | 42 | {{Beispiele}} 43 | :[1] In der [[Natur]] kommt '''Thulium''' nur in [[Verbindung]]en vor. 44 | 45 | {{Charakteristische Wortkombinationen}} 46 | 47 | {{Abgeleitete Begriffe}} 48 | :[1] 49 | 50 | ==== Übersetzungen ==== 51 | {{Ü-links}} 52 | *{{ar}}: [1] {{Ü|ar|ثليوم}} 53 | *{{hy}}: [1] {{Ü|hy|թուլիում}} (tulium) 54 | *{{zh}}: [1] {{Ü|zh|铥}} 55 | *{{en}}: [1] {{Ü|en|thulium}} 56 | *{{fr}}: [1] {{Ü|fr|thulium}} ''m'' 57 | *{{he}}: [1] {{Ü|he|תוליום}} (tulium) 58 | *{{it}}: [1] {{Ü|it|tulio}} 59 | {{Ü-Abstand}} 60 | *{{lt}}: [1] {{Ü|lt|tulis}} 61 | *{{nl}}: [1] {{Ü|nl|thulium}} 62 | *{{pl}}: [1] {{Ü|pl|tul}} 63 | *{{pt}}: [1] {{Ü|pt|túlio}} 64 | *{{ru}}: [1] {{Ü|ru|тулий}} (tulij) 65 | *{{sv}}: [1] {{Ü|sv|tulium}} 66 | *{{es}}: [1] {{Ü|es|tulio}} 67 | {{Ü-rechts}} 68 | 69 | {{Referenzen}} 70 | :[1] {{Wikipedia|Thulium}} 71 | :[1] {{Ref-DWDS|Thulium}} 72 | :[1] {{Ref-Canoo|Thulium}} 73 | :[1] {{Ref-UniLeipzig|Thulium}} 74 | 75 | [[Kategorie:Illustration]] 76 | 77 | [[cs:Thulium]] 78 | [[en:Thulium]] 79 | [[fr:Thulium]] 80 | [[lt:Thulium]] 81 | [[pl:Thulium]] 82 | [[ro:Thulium]] 83 | [[ru:Thulium]] 84 | 85 | -------------------------------------------------------------------------------- /src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/DESenseExampleHandlerTest.java: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de; 19 | 20 | import java.util.Iterator; 21 | 22 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry; 23 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryExample; 24 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage; 25 | import de.tudarmstadt.ukp.jwktl.parser.de.components.DESenseExampleHandler; 26 | 27 | /** 28 | * Test case for {@link DESenseExampleHandler}. 29 | * @author Christian M. 
Meyer 30 | */ 31 | public class DESenseExampleHandlerTest extends DEWiktionaryEntryParserTest { 32 | 33 | /***/ 34 | public void testRuettelstreifen() throws Exception { 35 | IWiktionaryPage page = parse("Ruettelstreifen.txt"); 36 | IWiktionaryEntry entry = page.getEntry(0); 37 | Iterator exampleIter = entry.getSense(1).getExamples().iterator(); 38 | assertEquals("„Eine wirksame Maßnahme die Verkehrssicherheit zu steigern, sind z.B.: profilierte Fahrbahnmarkierungen oder ''Rüttelstreifen'' auf der Standspur.“", exampleIter.next().getText()); 39 | assertEquals("„''Rüttelstreifen'' am Fahrbahnrand von Autobahnen können die Zahl übermüdungsbedingter Verkehrsunfälle deutlich reduzieren.“", exampleIter.next().getText()); 40 | assertEquals("„Schwere Autobahn-Unfälle können mit Hilfe von sogenannten ''Rüttelstreifen'' deutlich verringert werden.“", exampleIter.next().getText()); 41 | assertFalse(exampleIter.hasNext()); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/DEEtymologyHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.entry.WikiString; 21 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryEntry; 22 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 23 | 24 | /** 25 | * Parser component for extracting etymological information from the 26 | * German Wiktionary. 27 | * @author Christian M. Meyer 28 | * @author Lizhen Qu 29 | */ 30 | public class DEEtymologyHandler extends DEBlockHandler { 31 | 32 | protected StringBuilder etymology; 33 | 34 | /** Initializes the block handler for parsing all sections starting with 35 | * one of the specified labels. */ 36 | public DEEtymologyHandler() { 37 | super("Herkunft"); 38 | } 39 | 40 | @Override 41 | public boolean processHead(final String textLine, final ParsingContext context) { 42 | etymology = new StringBuilder(); 43 | return super.processHead(textLine, context); 44 | } 45 | 46 | @Override 47 | public boolean processBody(String textLine, final ParsingContext context) { 48 | textLine = textLine.trim(); 49 | if (!textLine.isEmpty()) 50 | etymology.append(textLine); 51 | return false; 52 | } 53 | 54 | public void fillContent(final ParsingContext context) { 55 | if (etymology.length() > 0) { 56 | WiktionaryEntry posEntry = context.findEntry(); 57 | posEntry.setWordEtymology(new WikiString(etymology.toString())); 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/PatternBasedIndexedParameterHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache 
License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import java.util.Objects; 21 | import java.util.regex.Matcher; 22 | 23 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm; 24 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 25 | import de.tudarmstadt.ukp.jwktl.parser.util.PatternUtils; 26 | 27 | public abstract class PatternBasedIndexedParameterHandler extends PatternBasedParameterHandler { 28 | 29 | protected final DEWordFormNounTableHandler nounTableHandler; 30 | 31 | public PatternBasedIndexedParameterHandler(DEWordFormNounTableHandler nounTableHandler, String regex) { 32 | super(regex); 33 | Objects.requireNonNull(nounTableHandler, "nounTableHandler must not be null."); 34 | this.nounTableHandler = nounTableHandler; 35 | } 36 | 37 | public void handle(String label, String value, WiktionaryWordForm wordForm, ParsingContext context) { 38 | final Matcher matcher = pattern.matcher(label); 39 | if (matcher.find()) { 40 | final Integer index = PatternUtils.extractIndex(matcher); 41 | final int i = index == null ? 
1 : index.intValue(); 42 | handleIfFound(wordForm, label, i, value, matcher, context); 43 | } 44 | } 45 | 46 | public abstract void handleIfFound(WiktionaryWordForm wordForm, String label, int index, String value, Matcher matcher, 47 | ParsingContext context); 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/test/resources/articles-ru/lodka.txt: -------------------------------------------------------------------------------- 1 | {{wikipedia}} 2 | = {{-ru-}} = 3 | ===Морфологические и синтаксические свойства=== 4 | {{сущ ru f ina 3*a 5 | |основа=ло́дк 6 | |основа1=ло́док 7 | |слоги={{по-слогам|ло́д|ка}} 8 | }} 9 | 10 | {{морфо||лодк||а}} {{Тихонов}} 11 | 12 | ===Произношение=== 13 | {{transcription|ˈlotkə}} {{медиа|Ru-лодка.ogg}} 14 | 15 | ===Семантические свойства=== 16 | {{илл|Kayaks and canoes from above.jpg|Лодки[1]|size=200px}} 17 | ====Значение==== 18 | # водное транспортное средство, небольшое [[судно]], идущее на вёслах, под [[парус]]ом или на моторной тяге {{пример|Мы все уселись в {{выдел|лодку}} и подъехали к левому берегу, ища места, где бы высадиться.|Джером К. Джером|Трое в одной лодке, не считая собаки|перев=М. 
Салье}} 19 | 20 | ====Синонимы==== 21 | # 22 | 23 | ====Антонимы==== 24 | # 25 | 26 | ====Гиперонимы==== 27 | # [[судно]] 28 | 29 | ====Гипонимы==== 30 | # [[баркас]], [[ялик]], [[шлюпка]], [[бот]], [[вельбот]], [[гичка]], [[байдарка]], [[берестянка]] 31 | 32 | ===Родственные слова=== 33 | {{родств-блок 34 | |имена-собственные= 35 | |существительные=[[лодочка]], [[лодчонка]], [[подлодка]] 36 | |прилагательные=[[лодочный]] 37 | |глаголы= 38 | |наречия= 39 | }} 40 | ===Фразеологизмы и устойчивые сочетания=== 41 | * [[подводная лодка]] 42 | * [[летающая лодка]] 43 | * [[канонерская лодка]] 44 | 45 | ===Загадки=== 46 | * [[в лесу родилась, на воде живёт]] 47 | 48 | ===Этимология=== 49 | Происходит от слова [[ладья]], далее от {{этимология:лодка}} 50 | 51 | ===Перевод=== 52 | {{перев-блок|| 53 | |en=[[boat]], [[dinghy]], [[gig]], [[yawl]] 54 | |br=[[bag]] 55 | |vep=[[veneh]] 56 | |vro=[[vineh]], [[loodsik]] 57 | |es=[[lancha]], [[bote]], [[barca]], [[canoa]], [[yola]] 58 | |it=[[barca]], [[canotto]], [[lancia]] 59 | |krl=[[veneh]] 60 | |mdf=[[венеж]] 61 | |de=[[Boot]] n -(e)s, -e 62 | |art=[[ilo tawa telo]] 63 | |fi=[[vene]] 64 | |fr=[[bateau]], [[canot]], [[barque]], [[embarcation]] 65 | |myv=[[венч]] 66 | |eo=[[boato]] 67 | |et=[[paat]], [[lootsik]], (''чёлн'') [[vene]] 68 | }} 69 | {{длина слова|5}} 70 | 71 | [[Категория:Суда]] 72 | 73 | [[bg:лодка]] 74 | [[de:лодка]] 75 | [[el:лодка]] 76 | [[en:лодка]] 77 | [[fi:лодка]] 78 | [[fr:лодка]] 79 | [[fy:лодка]] 80 | [[hu:лодка]] 81 | [[hy:лодка]] 82 | [[io:лодка]] 83 | [[ko:лодка]] 84 | [[li:лодка]] 85 | [[lo:лодка]] 86 | [[nl:лодка]] 87 | [[pl:лодка]] 88 | [[pt:лодка]] 89 | [[ro:лодка]] 90 | [[tr:лодка]] 91 | [[vi:лодка]] 92 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/ru/wikokit/base/wikt/multi/en/WRedirectEn.java: -------------------------------------------------------------------------------- 1 | 
/** Redirect related functions in wiki and English Wiktionary.
 *
 * see http://en.wiktionary.org/wiki/Wiktionary:Redirections
 */
public class WRedirectEn {

    /** Gets target page of the redirect, extracts [[pagename]] from double brackets.
     *  The page name is the pattern's only capturing group (group 1). */
    private final static Pattern ptrn_redirect = Pattern.compile(
            "#REDIRECT \\[\\[(.+?)\\]\\]",
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /** Checks whether this is a redirect page. If this is true then
     * the title of the target (redirected) page will be returned.
     *
     * @param page_title word which is described in this article (not used
     *                   by this method)
     * @param text       defines source wiki text
     * @return the redirect target, or null if this is not a redirect
     */
    public static String getRedirect(String page_title,
                                     StringBuffer text) {

        // #REDIRECT [[pagename]] (or #redirect [[pagename]])

        // Cheap pre-check: "#REDIRECT [[" alone is 12 characters long, and
        // only texts starting with '#' are considered.
        if (text.length() < 12 || text.charAt(0) != '#')
            return null;

        Matcher m = ptrn_redirect.matcher(text);
        if (m.find()) {
            // The pattern has exactly one capturing group; asking for
            // group(2) threw IndexOutOfBoundsException on every redirect.
            return m.group(1);
        }

        return null;
    }

}
jeweilig ältesten Admirals im 17. und 18. Jahrhundert 32 | :[2] {{K|Kaiserliche Marine|Kriegsmarine|Reichsmarine}} Admiral im Range eines Generaladmirals 33 | 34 | {{Herkunft}} 35 | :[[Determinativkompositum]] aus den [[Substantiven]] ''[[General]]'' und ''[[Admiral]]'' 36 | 37 | {{Synonyme}} 38 | :[1] ''[[Heer]], [[Luftwaffe]]:'' [[Generaloberst]] 39 | 40 | {{Oberbegriffe}} 41 | :[1] [[Admiralsrang]] 42 | :[2] [[Admiral]] 43 | 44 | {{Beispiele}} 45 | :[1] Die Deutsche Marine kennt keinen ''Großadmiral.'' 46 | :[2] Nicht viele schafften es, ''Großadmiral'' zu werden. 47 | :[3] Gegen Mittag trifft der Führer gemeinsam mit ''Generaladmiral'' Raeder ein.<ref>o. A.: ''1939'', in: Manfred Overresch und Friedrich Wilhelm Saal (Hgg.): ''Deutsche Geschichte von Tag zu Tag 1918-1949'', 2000 [1983], S. 2772</ref> 48 | 49 | ==== {{Übersetzungen}} ==== 50 | {{Ü-Tabelle|Ü-links= 51 | *{{en}}: [1] {{Ü|en|}} 52 | *{{fr}}: [1] {{Ü|fr|}} 53 | |Ü-rechts= 54 | *{{it}}: [1] {{Ü|it|}} 55 | *{{es}}: [1] {{Ü|es|}} 56 | }} 57 | 58 | {{Referenzen}} 59 | :[1–3] {{Wikipedia|Generaladmiral}} 60 | :[1–3] {{Ref-DWDS|Generaladmiral}} 61 | :[*] {{Ref-Canoo|Generaladmiral}} 62 | :[*] {{Ref-OWID|Generaladmiral}} 63 | :[1–3] {{Ref-Duden|Generaladmiral}} 64 | 65 | {{Quellen}} 66 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/harness.txt: -------------------------------------------------------------------------------- 1 | == harness ({{Sprache|Englisch}}) == 2 | === {{Wortart|Substantiv|Englisch}} === 3 | 4 | {{erweitern|Beispiel|Englisch}} 5 | 6 | {{Englisch Substantiv Übersicht 7 | |Bild 1=Harness (PSF).png|220px|2|horse ''harness'' 8 | |Bild 2=PSM V39 D327 Crompton thirty six harness worsted loom.jpg|220px|6|''harness'' 9 | |Singular=the harness 10 | |Plural=the harnesses 11 | }} 12 | 13 | {{Worttrennung}} 14 | :har·ness, {{Pl.}} har·nesses 15 | 16 | {{Aussprache}} 17 | :{{IPA}} {{Lautschrift|ˈhɑ:nɪs}}, {{amer.|:}} 
{{Lautschrift|ˈhɑ:rnɪs}}, {{Pl.}} {{Lautschrift|…}} 18 | :{{Hörbeispiele}} {{Audio|En-us-harness.ogg|harness (amerikanisch)}}, {{Pl.}} {{fehlend}} 19 | 20 | {{Bedeutungen}} 21 | :[1] das Zuggeschirr, Geschirr, Gurtzeug, Harnisch, Beschirrung 22 | :[2] das Pferdegeschirr 23 | :[3] das Zaumzeug 24 | :[4] der Kabelstrang 25 | :[5] der Klettergurt 26 | :[6] das Webgeschirr 27 | :[7] ''umgangssprachlich:'' täglliche Routine, Alltagstrott 28 | :[8] der Harnisch, die Rüstung 29 | 30 | {{Beispiele}} 31 | :[1] 32 | 33 | {{Redewendungen}} 34 | :[[double harness]] 35 | :[[in harness]] 36 | 37 | {{Abgeleitete Begriffe}} 38 | :[[baby harness]], [[harness assembly]], [[harness attachment]], [[saventy harness]], [[wiring harness]], [[harnesser]], [[harnessless]], [[harnesslike]], [[reharness]], [[well-harnessed]], [[cable harness]], [[wire harness]], [[harness horse]] 39 | 40 | ==== Übersetzungen ==== 41 | {{Ü-links}} 42 | *{{de}}: [1] [[Zuggeschirr]], [[Geschirr]], [[Gurtzeug]], [[Harnisch]], [[Beschirrung]]; [2] das [[Pferdegeschirr]]; [3] das [[Zaumzeug]] [4] der [[Kabelstrang]] [5] der [[Klettergurt]] [6] das [[Webgeschirr]]; [7] [[Alltagstrott]]; [8] [[Harnisch]], [[Rüstung]] 43 | {{Ü-rechts}} 44 | 45 | {{Referenzen}} 46 | :[1,2,5,8] {{Wikipedia|spr=en|harness}} 47 | :[1-3,6,7] {{Ref-MWD|harness}} 48 | :[1,2,4,6,8] {{Ref-Dictionary|harness}} 49 | :[1,2,5,7] {{Ref-Pons|en|harness}} 50 | :[1–5] {{Ref-dictcc|harness}} 51 | :[1–6] {{Ref-Leo|en|harness}} 52 | 53 | [[cs:harness]] 54 | [[en:harness]] 55 | [[eo:harness]] 56 | [[et:harness]] 57 | [[fi:harness]] 58 | [[fr:harness]] 59 | [[hu:harness]] 60 | [[id:harness]] 61 | [[io:harness]] 62 | [[it:harness]] 63 | [[kn:harness]] 64 | [[ko:harness]] 65 | [[mg:harness]] 66 | [[ml:harness]] 67 | [[my:harness]] 68 | [[pl:harness]] 69 | [[pt:harness]] 70 | [[sv:harness]] 71 | [[ta:harness]] 72 | [[te:harness]] 73 | [[vi:harness]] 74 | [[zh:harness]] 75 | -------------------------------------------------------------------------------- 
/** Redirect related functions in wiki and Russian Wiktionary.
 *
 * see http://ru.wiktionary.org/wiki/Викисловарь:Перенаправления
 */
public class WRedirectRu {

    /** Recognizes a redirect directive; group 1 is the keyword, group 2 the
     *  page name found between the double brackets. */
    private final static Pattern REDIRECT_DIRECTIVE = Pattern.compile(
            "#(REDIRECT|ПЕРЕНАПРАВЛЕНИЕ) \\[\\[(.+?)\\]\\]",
            Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /** Determines the target of a redirect page.
     *
     * Accepted forms are #REDIRECT [[pagename]], #redirect [[pagename]] and
     * #ПЕРЕНАПРАВЛЕНИЕ [[pagename]]. Only texts that start with '#' and are
     * at least 12 characters long ("#REDIRECT [[".length()) are searched.
     *
     * @param page_title word which is described in this article (not used
     *                   by this method)
     * @param text       source wiki text
     * @return title of the target page, or null if this is not a redirect
     */
    public static String getRedirect(String page_title,
                                     StringBuffer text) {

        final boolean candidate = text.length() >= 12 && text.charAt(0) == '#';
        if (!candidate) {
            return null;
        }

        final Matcher directive = REDIRECT_DIRECTIVE.matcher(text);
        return directive.find() ? directive.group(2) : null;
    }

}
There might be, for example, a handler 23 | * for extracting pronunciation information. 24 | * @author Christian M. Meyer 25 | * @author Lizhen Qu 26 | */ 27 | public interface IBlockHandler { 28 | 29 | /** Return true if the handler requests to process the article 30 | * constituent starting at the given line of text. */ 31 | boolean canHandle(final String blockHeader); 32 | 33 | /** If the handler requested to process this constituent, this hotspot 34 | * will be called for processing the section header of this 35 | * article constituent. Return true if the handler 36 | * requests to handle also the body of this constituent. */ 37 | boolean processHead(final String line, final ParsingContext context); 38 | 39 | /** If the handler requested to process the body of this constituent, this 40 | * hotspot will be called for processing each line of the constituent's 41 | * body. Return true if the handler requests to handle also 42 | * the next line using this handler. */ 43 | boolean processBody(final String line, final ParsingContext context); 44 | 45 | /** This hotspot is invoked if the parser releases this handler. It can be 46 | * used to store the extracted information to the Wiktionary data 47 | * objects stored in the parsing context. */ 48 | void fillContent(final ParsingContext context); 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENEtymologyHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.en.components; 19 | 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | import de.tudarmstadt.ukp.jwktl.api.entry.WikiString; 24 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 25 | import de.tudarmstadt.ukp.jwktl.parser.util.StringUtils; 26 | 27 | /** 28 | * Parser component for extracting etymological information from 29 | * the English Wiktionary. 30 | * @author Christian M. Meyer 31 | * @author Lizhen Qu 32 | */ 33 | public class ENEtymologyHandler extends ENBlockHandler { 34 | private static List SPELLINGS = Arrays.asList("etymology", "etymolgy", 35 | "eytomology", "etmology", "eymology"); 36 | protected StringBuffer contentBuffer; 37 | 38 | public boolean canHandle(String blockHeader) { 39 | return SPELLINGS.contains(StringUtils.strip(blockHeader, "{}=: 1234567890").toLowerCase()); 40 | } 41 | 42 | @Override 43 | public boolean processHead(String textLine, ParsingContext context) { 44 | contentBuffer = new StringBuffer(); 45 | return true; 46 | } 47 | 48 | @Override 49 | public boolean processBody(String textLine, ParsingContext context) { 50 | if (!textLine.isEmpty() && !textLine.startsWith("===")) { 51 | contentBuffer.append(textLine); 52 | } 53 | return false; 54 | } 55 | 56 | public void fillContent(final ParsingContext context) { 57 | if (!contentBuffer.toString().trim().isEmpty()) { 58 | context.setEtymology(new 
WikiString(contentBuffer.toString().trim())); 59 | } else { 60 | context.setEtymology(null); 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/util/ILanguage.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.util; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry; 21 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryPage; 22 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryTranslation; 23 | 24 | /** 25 | * Generic interface for languages used in Wiktionary. Instances of ILanguage 26 | * are used in as entry language of {@link IWiktionaryPage}s, the word 27 | * language of {@link IWiktionaryEntry}s, and the target language 28 | * of {@link IWiktionaryTranslation}s. Each language is encoded using the 29 | * international standard of language classification (ISO 639). Languages 30 | * are compared by their internal code. 31 | * @author Christian M. 
/**
 * Generic interface for languages used in Wiktionary. Instances of ILanguage
 * are used as the entry language of {@link IWiktionaryPage}s, the word
 * language of {@link IWiktionaryEntry}s, and the target language
 * of {@link IWiktionaryTranslation}s. Each language is encoded using the
 * international standard of language classification (ISO 639). Languages
 * are compared by their internal code.
 * <p>
 * NOTE(review): {@code Comparable} is declared without a type argument;
 * presumably {@code Comparable<ILanguage>} is intended - confirm against
 * the implementing classes before changing the declaration.
 * @author Christian M. Meyer
 * @author Christof Müller
 * @author Lizhen Qu
 */
public interface ILanguage extends Comparable {

    /** Returns the internal language code used by JWKTL. These codes roughly
     *  correspond to ISO 639-3, but also include language families,
     *  deprecated classifications, and not yet classified languages. */
    String getCode();

    /** Returns the language name (in English language). */
    String getName();

    /** Returns the ISO 639-1 code or an empty string if none. */
    String getISO639_1();

    /** Returns the ISO 639-2b code or an empty string if none. */
    String getISO639_2B();

    /** Returns the ISO 639-2t code or an empty string if none. */
    String getISO639_2T();

    /** Returns the ISO 639-3 code or an empty string if none. */
    String getISO639_3();

}
/**
 * (Wikified) title of a quote phrase / sentence: the displayed title of a
 * work together with the optional Wikisource link target it points to.
 */
public class TitleAndWikilink {

    /** Opening of a Wikisource link written with a leading colon. */
    private static final String COLON_PREFIX = "[[:s:";

    /** Opening of a Wikisource link written without a leading colon. */
    private static final String PLAIN_PREFIX = "[[s:";

    /** Closing brackets of a wiki link. */
    private static final String LINK_SUFFIX = "]]";

    /** Title of the work. */
    public String title;

    /** Link to a book in Wikipedia (format: [[s:title|]] or [[:s:title|]]). */
    public String title_wikilink;

    /** Creates an instance with an empty title and an empty link. */
    public TitleAndWikilink() {
        title = "";
        title_wikilink = "";
    }

    /**
     * Parses text (e.g. "[[:s:Book (Author)|Book]]") into
     * title_wikilink "Book (Author)" and title "Book".
     * <p>
     * Text that is not a piped Wikisource link is stored unchanged as the
     * title and the link is left empty. Both fields are reset on every
     * call, so an instance can be reused without leaking the link of a
     * previously parsed text. A null text yields an empty title and link.
     *
     * @param text the raw title markup, may be null
     */
    public void parseTitle(String text) {
        // Start from a clean state; otherwise a plain title parsed after a
        // linked one would keep the stale link of the earlier call.
        title = "";
        title_wikilink = "";
        if (text == null) {
            return;
        }

        // Normalize non-breaking spaces to ordinary blanks. Both the HTML
        // entity and the literal character (U+00A0) are covered, because
        // the character is indistinguishable from a blank in source form.
        text = text.replace("&nbsp;", " ").replace('\u00A0', ' ');

        title = text; // fallback: the whole text is the title

        final String prefix;
        if (text.startsWith(COLON_PREFIX)) {
            prefix = COLON_PREFIX;
        } else if (text.startsWith(PLAIN_PREFIX)) {
            prefix = PLAIN_PREFIX;
        } else {
            return;
        }
        if (!text.endsWith(LINK_SUFFIX) || !text.contains("|")) {
            return;
        }

        // Neither the prefix nor the suffix contains '|', so the pipe lies
        // strictly between them and the substring bounds are always valid.
        String inner = text.substring(prefix.length(),
                text.length() - LINK_SUFFIX.length());

        // [[:s:The target|The title]] -> split at the first pipe
        int pipeIndex = inner.indexOf('|');
        title_wikilink = inner.substring(0, pipeIndex);
        title = inner.substring(pipeIndex + 1);
    }
}
(Einzahl)=dem Verbalsubstantiv 10 | |Wem? (Mehrzahl)=den Verbalsubstantiven 11 | |Wen? (Einzahl)=das Verbalsubstantiv 12 | |Wen? (Mehrzahl)=die Verbalsubstantive 13 | }} 14 | 15 | {{Silbentrennung}} Ver·bal·sub·stan·tiv, {{Pl.}} Ver·bal·sub·stan·ti·ve 16 | 17 | {{Aussprache}} 18 | :[[Hilfe:IPA|IPA]]: {{Lautschrift|vɛʁˈbaːlzʊpstanˌtiːf}}, {{Pl.}} {{Lautschrift|vɛʁˈbaːlzʊpstanˌtiːvə}} 19 | :[[Hilfe:Hörbeispiele|Hörbeispiele]]: {{fehlend}}, {{Pl.}} {{fehlend}} 20 | 21 | {{Bedeutungen}} 22 | :[1] [[Linguistik]]: Sammelbegriff für verschiedene Klassen von [[Substantiv]]en, die durch [[Substantivierung]] von [[Verbform]]en enstanden sind oder noch entstehen 23 | 24 | {{Abkürzungen}} 25 | 26 | {{Herkunft}} 27 | : [[Determinativkompositum]] aus dem Adjektiv [[verbal]] und [[Substantiv]] 28 | 29 | {{Synonyme}} 30 | :[1] 31 | 32 | {{Gegenworte}} 33 | : 34 | 35 | {{Oberbegriffe}} 36 | :[1] [[Substantiv]], [[Wortart]], [[Grammatik]] 37 | 38 | {{Unterbegriffe}} 39 | :[1] [[Gerundium]], [[Gerundiv]]/ [[Gerundivum]], [[Nomen actionis]] Helmut Glück (Hrsg.), unter Mitarbeit von Friederike Schmöe: ''Metzler Lexikon Sprache.'' Dritte, neubearbeitete Auflage. Metzler, Stuttgart/ Weimar 2005. ISBN 978-3-476-02056-7 40 | 41 | 42 | {{Beispiele}} 43 | :[1] Die Wortwurzel von ''Verbalsubstantiven'' ist ein [[Verb]]. 
44 | 45 | 46 | 47 | : 48 | {{Abgeleitete Begriffe}} 49 | 50 | :[1] 51 | 52 | ==== Übersetzungen ==== 53 | {{Ü-links}} 54 | *{{en}}: [1] {{Ü|en|}} 55 | *{{fr}}: [1] {{Ü|fr|}} 56 | {{Ü-Abstand}} 57 | *{{ru}}: [1] {{Ü|ru|}} 58 | *{{es}}: [1] {{Ü|es|}} 59 | {{Ü-rechts}} 60 | 61 | {{Referenzen}} 62 | :[1] {{Wikipedia|Verbalsubstantiv}} 63 | :[1] {{Ref-DWDS|Verbalsubstantiv}} 64 | :[1] {{Ref-Canoo|Verbalsubstantiv}} 65 | :[1] {{Ref-UniLeipzig|Verbalsubstantiv}} 66 | 67 | 68 | {{Ähnlichkeiten}} 69 | 70 | [[Kategorie:Fremdwort]] 71 | 72 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/IWikiString.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api; 19 | 20 | import java.util.List; 21 | 22 | /** 23 | * Represents a text that contains wiki markup. 
/**
 * Represents a text that contains wiki markup. In addition to the original
 * text containing the wiki markup, the interface allows extracting a list
 * of wiki-internal and external links as well as a plain text representation
 * (i.e., a text without markup).
 * <p>
 * NOTE(review): the accessor for external links is currently commented
 * out below, so only wiki-internal links can be queried at present.
 * @author Christian M. Meyer
 * @author Christof Müller
 * @author Lizhen Qu
 */
public interface IWikiString {

    /** Returns the original text including all wiki markup. */
    String getText();

    /** Parses the original text to filter out all wiki markup and thus
     *  returns a human-readable version of the text. Note that the parsing
     *  might be done on demand, so avoid invoking this method repeatedly
     *  for the same text. */
    String getPlainText();

    /** Returns a list of wiki-internal links. That is, all substrings
     *  enclosed by two square brackets. Link captions will be removed.
     *  If no wiki links are found, an empty list will be returned. Note that
     *  the parsing might be done on demand, so avoid invoking this method
     *  repeatedly for the same text.
     *  NOTE(review): the list is declared as a raw type; presumably its
     *  elements are strings - confirm against the implementation. */
    List getWikiLinks();

    /* Returns a list of external links. That is, all valid URLs in the
     * original text. If no external links are found, an empty list will
     * be returned. Note that the parsing might be done on demand, so
     * avoid invoking this method repeatedly for the same text. */
//  public List getExternalLinks();

}
Juli 2010, 15:30 UTC) 39 | 40 | {{Herkunft}} 41 | :Vom Hersteller Ferrero kreiertes Kunstwort, das wahrscheinlich auf dem englisch Wort ''{{Ü|en|nut}}'' „[[Nuss]]“ und der italienischen weiblichen Verniedlichungsform "-ella" beruht. 42 | 43 | {{Oberbegriffe}} 44 | :[1] [[Nuss-Nugat-Creme]], [[Brotaufstrich]] 45 | 46 | {{Beispiele}} 47 | :[1] „Kann ich bitte das (die, den) Nutella haben?" 48 | 49 | ==== Übersetzungen ==== 50 | {{Ü-Tabelle|Ü-links= 51 | *{{en}}: [1] {{Ü|en|Nutella}} 52 | *{{fr}}: [1] {{Ü|fr|Nutella}} {{m}} 53 | |Ü-rechts= 54 | *{{it}}: [1] {{Ü|it|Nutella}} {{f}} 55 | *{{sv}}: [1] {{Ü|sv|Nutella}} 56 | }} 57 | 58 | {{Referenzen}} 59 | :[1] {{Wikipedia|Nutella}} 60 | :[1] {{Ref-UniLeipzig|Nutella}} 61 | 62 | {{Quellen}} 63 | 64 | [[en:Nutella]] 65 | [[fr:Nutella]] 66 | [[ru:Nutella]] 67 | -------------------------------------------------------------------------------- /src/test/resources/articles-de/pittoresk.txt: -------------------------------------------------------------------------------- 1 | == pittoresk ({{Sprache|Deutsch}}) == 2 | === {{Wortart|Adjektiv|Deutsch}} === 3 | 4 | {{Adjektiv-Tabelle (Deklination) 5 | |Grundform=pittoresk 6 | |1. Steigerung=pittoresker 7 | |2. Steigerung=am pittoreskesten 8 | }} 9 | 10 | {{Worttrennung}} 11 | :pit·to·resk, {{Komp.}} pit·to·res·ker, {{Sup.}} pit·to·res·kes·ten 12 | 13 | {{Aussprache}} 14 | :{{IPA}} {{Lautschrift|ˌpɪtoˈʀɛsk}}, {{Komp.}} {{Lautschrift|ˌpɪtoˈʀɛskɐ}}, {{Sup.}} {{Lautschrift|ˌpɪtoˈʀɛskəstn̩}}, {{Lautschrift|ˌpɪtoˈʀɛskəstən}} 15 | :{{Hörbeispiele}} {{Audio|De-at-pittoresk.ogg|pittoresk (österreichisch)}} 16 | 17 | {{Bedeutungen}} 18 | :[1] [[malerisch]] 19 | 20 | {{Herkunft}} 21 | :abgeleitet von {{lat.}} ''{{Ü|la|pictus}}'' „gemalt“, zu [[italienisch]] ''{{Ü|it|pittoresco}}''<ref>[http://www.zeno.org/Brockhaus-1837/A/Pittoresk?hl=pittoresk Brockhaus Bilder-Conversations-Lexikon, Band 3. 
Leipzig 1839., Seite 507.]</ref>; zu [[französisch]] ''{{Ü|fr|pittoresque}}''<ref>[http://www.zeno.org/Brockhaus-1809/B/Pittoresk?hl=pittoresk Brockhaus Conversations-Lexikon Bd. 8. Leipzig 1811, Seite 251.]</ref> 22 | 23 | {{Sinnverwandte Wörter}} 24 | :[1] [[bildschön]], [[hübsch]], [[malerisch]] 25 | 26 | {{Beispiele}} 27 | :[1] Die kleine Stadt mit ihrem Labyrinth enger Straßen und ihren alten Häusern macht einen ''pittoresken'' Eindruck. 28 | :[1] Wir waren in dem ''pittoreskesten'' Dorf der ganzen Umgebung gelandet. 29 | 30 | ==== Übersetzungen ==== 31 | {{Ü-links}} 32 | *{{en}}: [1] {{Ü|en|picturesque}} 33 | *{{fr}}: [1] {{Ü|fr|pittoresque}} {{mf}} 34 | *{{it}}: [1] {{Ü|it|pittoresco}}, {{Ü|it|suggestivo}} 35 | *{{nl}}: [1] {{Ü|nl|#pittoresk (Niederländisch)|pittoresk}}, {{Ü|nl|schilderachtig}} 36 | *{{no}}: [1] {{Ü|no|#pittoresk (Norwegisch)|pittoresk}} 37 | *{{pl}}: [1] {{Ü|pl|malowniczy}} 38 | {{Ü-Abstand}} 39 | *{{pt}}: [1] {{Ü|pt|pitoresco}} 40 | *{{ru}}: [1] {{Üxx|ru|shiwopisnyj|живописный}} 41 | *{{sk}}: [1] {{Ü|sk|pitoreskný}} 42 | *{{sl}}: [1] {{Ü|sl|pitoresken}}, {{Ü|sl|razgiban}}, {{Ü|sl|slikovit}} 43 | *{{es}}: [1] {{Ü|es|pintoresco}} {{m}}, {{Ü|es|pintoresca}} {{f}} 44 | *{{cs}}: [1] {{Ü|cs|pitoreskní}}, {{Ü|cs|malebný}} 45 | {{Ü-rechts}} 46 | 47 | {{Referenzen}} 48 | :[1] {{Ref-DWDS|pittoresk}} 49 | :[1] {{Ref-Canoo|pittoresk}} 50 | :[1] {{Ref-UniLeipzig|pittoresk}} 51 | 52 | {{Quellen}} 53 | 54 | [[en:pittoresk]] 55 | [[fi:pittoresk]] 56 | [[fr:pittoresk]] 57 | [[hu:pittoresk]] 58 | [[io:pittoresk]] 59 | [[ko:pittoresk]] 60 | [[pl:pittoresk]] 61 | [[sv:pittoresk]] 62 | [[zh:pittoresk]] 63 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/api/IPronunciation.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge 
/**
 * Pronunciation information for {@link IWiktionaryEntry}s. There can be
 * different types of pronunciation information including standardized
 * written representations using the IPA or SAMPA notation, audio files
 * of people reading a word aloud, and information on the rhyming suffix
 * of a lexical entry.
 * @author Christian M. Meyer
 */
public interface IPronunciation {

    /** Types of different pronunciation information used by
     *  {@link IPronunciation#getType()}. */
    enum PronunciationType {

        /** International Phonetic Alphabet */
        IPA,
        /** Speech Assessment Methods Phonetic Alphabet */
        SAMPA,
        /** Audio file of this pronunciation. */
        AUDIO,
        /** Suffix used to identify rhymes. */
        RHYME,
        /** Unprocessed pronunciation template */
        RAW
    }

    /** Returns the type of this pronunciation, which can be audio files
     *  or a specific notation schema used to represent pronunciation
     *  information. */
    PronunciationType getType();

    /** The representation of the pronunciation using a standardized
     *  notation such as IPA. In case of audio files, the file name of
     *  the sound file is returned. The corresponding URL of this sound file
     *  needs to be obtained by querying
     *  http://[LANGUAGE].wiktionary.org/wiki/File:[FILENAME]. */
    String getText();

    /** Returns additional information for this pronunciation, such as
     *  a geographical reference. */
    String getNote();

}
Meyer 31 | * @author Lizhen Qu 32 | */ 33 | public class DEWordLanguageHandler extends DEBlockHandler { 34 | 35 | /** language regular expression pattern*/ 36 | private static final Pattern LANGUAGE_PATTERN = Pattern.compile("^==\\s*([^=].*)\\s*\\(?\\{\\{\\s*Sprache\\s*\\|\\s*([^}]+?)\\s*\\}\\}"); 37 | 38 | protected String lemma; 39 | protected ILanguage language; 40 | 41 | /** Determine if the text line contains the language pattern. If the 42 | * language pattern is found, the entry's word and its language will 43 | * be extracted from the text line. */ 44 | public boolean canHandle(final String blockHeader) { 45 | if (blockHeader == null) 46 | return false; 47 | 48 | lemma = null; 49 | language = null; 50 | Matcher matcher = LANGUAGE_PATTERN.matcher(blockHeader); 51 | if (!matcher.find()) 52 | return false; 53 | 54 | lemma = matcher.group(1); 55 | language = Language.findByName(matcher.group(2)); 56 | return true; 57 | } 58 | 59 | /** Store the word and its language in the parsing context. */ 60 | public void fillContent(final ParsingContext context) { 61 | context.setLanguage(language); 62 | context.setHeader(lemma); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/components/BlockHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.components; 19 | 20 | import de.tudarmstadt.ukp.jwktl.parser.util.IBlockHandler; 21 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 22 | import de.tudarmstadt.ukp.jwktl.parser.util.StringUtils; 23 | 24 | /** 25 | * Abstract parser component for processing article constituents. The handler 26 | * can be initialized with a set of fixed labels that denote the header of 27 | * an article constituent that is to be parsed by this handler. 28 | * @author Christian M. Meyer 29 | * @author Lizhen Qu 30 | */ 31 | /** 32 | * Default implementation of the {@link IBlockHandler} interface that serves 33 | * as a base class for parsing any article constituent. 34 | */ 35 | public abstract class BlockHandler implements IBlockHandler { 36 | 37 | protected String[] labels; 38 | 39 | /** Initializes the block handler for parsing all sections starting with 40 | * one of the specified labels. */ 41 | public BlockHandler(final String... 
labels) { 42 | this.labels = labels; 43 | } 44 | 45 | public boolean canHandle(String blockHeader) { 46 | blockHeader = StringUtils.strip(blockHeader, "{}=: "); 47 | for (String label : labels) 48 | if (label.equals(blockHeader)) 49 | return true; 50 | 51 | return false; 52 | } 53 | 54 | public boolean processHead(final String text, final ParsingContext context) { 55 | return true; 56 | } 57 | 58 | public boolean processBody(final String textLine, final ParsingContext context) { 59 | return false; 60 | } 61 | 62 | public void fillContent(final ParsingContext context) {} 63 | 64 | protected String[] getLabels() { 65 | return labels; 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/en/components/ENWordLanguageHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.en.components; 19 | 20 | import java.util.regex.Matcher; 21 | import java.util.regex.Pattern; 22 | 23 | import de.tudarmstadt.ukp.jwktl.api.util.ILanguage; 24 | import de.tudarmstadt.ukp.jwktl.api.util.Language; 25 | import de.tudarmstadt.ukp.jwktl.parser.util.IBlockHandler; 26 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 27 | 28 | /** 29 | * Parser component for extracting a words language from the English Wiktionary. 30 | * @author Christian M. Meyer 31 | * @author Lizhen Qu 32 | */ 33 | public class ENWordLanguageHandler extends ENBlockHandler implements IBlockHandler { 34 | 35 | protected static final Pattern LANGUAGE_HEADER = Pattern.compile("^\\s*=+\\s*\\[*\\s*(.*?)\\s*\\]*\\s*=+"); 36 | 37 | protected ILanguage language; 38 | 39 | public boolean canHandle(String blockHeader) { 40 | if ("----".equals(blockHeader)) { 41 | language = null; 42 | return true; 43 | } 44 | 45 | language = null; 46 | // System.out.println(textLine); 47 | Matcher matcher = LANGUAGE_HEADER.matcher(blockHeader); 48 | if (!matcher.find()) 49 | return false; 50 | 51 | // System.out.println(matcher.group(1)); 52 | language = Language.findByName(matcher.group(1)); 53 | return (language != null); 54 | } 55 | 56 | @Override 57 | public boolean processHead(final String textLine, final ParsingContext context) { 58 | return true; 59 | } 60 | 61 | @Override 62 | public boolean processBody(final String textLine, final ParsingContext context) { 63 | return false; 64 | } 65 | 66 | public void fillContent(final ParsingContext context) { 67 | context.setLanguage(language); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/DEWordFormNounTableHandlerTest.java: 
-------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import de.tudarmstadt.ukp.jwktl.parser.de.components.DEGenderText; 21 | import junit.framework.TestCase; 22 | 23 | public class DEWordFormNounTableHandlerTest extends TestCase { 24 | 25 | private DEWordFormNounTableHandler nounTableHandler; 26 | 27 | @Override 28 | protected void setUp() throws Exception { 29 | nounTableHandler = new DEWordFormNounTableHandler(); 30 | } 31 | 32 | public void testGetsSetGenus() { 33 | nounTableHandler.setGenusByIndex(DEGenderText.F, 2); 34 | assertEquals(DEGenderText.F, nounTableHandler.getGenusByIndex(2)); 35 | } 36 | 37 | public void testGetsNotSetGenus() { 38 | assertNull(nounTableHandler.getGenusByIndex(3)); 39 | } 40 | 41 | public void testThrowsExceptionSettingGenusWithInvalidIndex() { 42 | try { 43 | nounTableHandler.setGenusByIndex(DEGenderText.F, 0); 44 | fail(); 45 | } catch (IllegalArgumentException expected) { 46 | assertTrue(true); 47 | } 48 | try { 49 | nounTableHandler.setGenusByIndex(DEGenderText.F, 
5); 50 | fail(); 51 | } catch (IllegalArgumentException expected) { 52 | assertTrue(true); 53 | } 54 | } 55 | 56 | public void testThrowsExceptionGettingGenusWithInvalidIndex() { 57 | try { 58 | nounTableHandler.getGenusByIndex(0); 59 | fail(); 60 | } catch (IllegalArgumentException expected) { 61 | assertTrue(true); 62 | } 63 | try { 64 | nounTableHandler.getGenusByIndex(5); 65 | fail(); 66 | } catch (IllegalArgumentException expected) { 67 | assertTrue(true); 68 | } 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/test/java/de/tudarmstadt/ukp/jwktl/parser/de/components/nountable/DativeHandlerTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.de.components.nountable; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.entry.WiktionaryWordForm; 21 | import de.tudarmstadt.ukp.jwktl.api.util.GrammaticalCase; 22 | import junit.framework.TestCase; 23 | 24 | public class DativeHandlerTest extends TestCase { 25 | 26 | private DativeHandler dativeHandler; 27 | 28 | @Override 29 | protected void setUp() throws Exception { 30 | dativeHandler = new DativeHandler(); 31 | } 32 | 33 | public void testCanHandle() { 34 | assertFalse(dativeHandler.canHandle(null, null, null, null)); 35 | assertFalse(dativeHandler.canHandle("Vitad", null, null, null)); 36 | assertTrue(dativeHandler.canHandle("Dativ Singular", null, null, null)); 37 | assertTrue(dativeHandler.canHandle("Dativ", null, null, null)); 38 | assertFalse(dativeHandler.canHandle(" Dativ Singular", null, null, null)); 39 | assertTrue(dativeHandler.canHandle("Wem? (Einzahl)", null, null, null)); 40 | assertTrue(dativeHandler.canHandle("Wem?(Einzahl)", null, null, null)); 41 | assertFalse(dativeHandler.canHandle(" Wem? (Einzahl)", null, null, null)); 42 | } 43 | 44 | public void testDativeSingular() { 45 | WiktionaryWordForm wordForm = new WiktionaryWordForm("test"); 46 | dativeHandler.handle("Dativ Singular", "test", wordForm, null); 47 | assertEquals(GrammaticalCase.DATIVE, wordForm.getCase()); 48 | } 49 | 50 | public void testWemEinzahl() { 51 | WiktionaryWordForm wordForm = new WiktionaryWordForm("test"); 52 | dativeHandler.handle("Wem? 
(Einzahl)", "test", wordForm, null); 53 | assertEquals(GrammaticalCase.DATIVE, wordForm.getCase()); 54 | } 55 | } -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/components/InterwikiLinkHandler.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.parser.components; 19 | 20 | import de.tudarmstadt.ukp.jwktl.parser.util.ParsingContext; 21 | 22 | /** 23 | * Generic parser component for extracting interwiki links (e.g., [de:dog]) 24 | * from the Wiktionary article pages. 25 | * @author Christian M. Meyer 26 | * @author Lizhen Qu 27 | * 28 | */ 29 | public class InterwikiLinkHandler extends BlockHandler { 30 | 31 | protected String categoryHead; 32 | protected String language; 33 | 34 | /** Initializes the handler for the specified category head 35 | * (e.g., "Category"). The category head is required for distinugishing 36 | * between categories and interwiki links. 
*/ 37 | public InterwikiLinkHandler(final String categoryHead) { 38 | this.categoryHead = categoryHead; 39 | } 40 | 41 | public boolean canHandle(String blockHeader) { 42 | // Check if the line encodes an interwiki link. 43 | String line = blockHeader.trim(); 44 | boolean isBracketed = line.startsWith("[[") && line.endsWith("]]"); 45 | return (line.contains(":") && !line.contains(categoryHead) && isBracketed); 46 | } 47 | 48 | @Override 49 | public boolean processHead(String textLine, ParsingContext context) { 50 | // Extract the language of the interwiki link. 51 | language = null; 52 | String line = textLine.trim(); 53 | int colonIndex = line.indexOf(":"); 54 | if (colonIndex != -1) { 55 | language = line.substring(2, colonIndex).trim(); 56 | } 57 | return true; 58 | } 59 | 60 | public void fillContent(final ParsingContext context) { 61 | // Add the interwiki link to the current page. 62 | if (language != null) 63 | context.getPage().addInterWikiLink(language); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/parser/util/PatternUtils.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Static helpers for working with regular expression matchers.
 */
public class PatternUtils {

    /** Not instantiable; the class only offers static methods. */
    private PatternUtils() {
    }

    /**
     * Extracts an index from a regex matcher that has already been
     * matched/found successfully. The capturing groups are inspected in
     * order, starting with group 1; the first group that is not
     * <code>null</code> is parsed as an integer and returned. If all
     * groups are <code>null</code> (or the pattern has no groups),
     * <code>null</code> is returned.
     *
     * @param matcher
     *            regular expression matcher.
     * @return Extracted index or <code>null</code>.
     * @throws NullPointerException
     *             If the matcher is <code>null</code>.
     * @throws NumberFormatException
     *             If value of the first non-<code>null</code> group could
     *             not be parsed as an integer.
     * @throws IllegalArgumentException
     *             If the matcher was not matched yet.
     */
    public static Integer extractIndex(Matcher matcher) throws NumberFormatException, IllegalArgumentException {
        Objects.requireNonNull(matcher, "matcher must not be null.");
        ensureMatched(matcher);

        final int groupCount = matcher.groupCount();
        int groupNo = 1;
        while (groupNo <= groupCount) {
            final String captured = matcher.group(groupNo);
            groupNo++;
            if (captured == null) {
                continue;
            }
            return Integer.valueOf(captured);
        }
        return null;
    }

    /**
     * Verifies that the matcher holds a successful match, translating the
     * matcher's own IllegalStateException into an IllegalArgumentException.
     */
    private static void ensureMatched(Matcher matcher) {
        int matchStart;
        try {
            matchStart = matcher.start();
        } catch (IllegalStateException notMatched) {
            throw new IllegalArgumentException("The matcher was not matched yet.", notMatched);
        }
        if (matchStart < 0) {
            throw new IllegalArgumentException("The matcher was not matched yet.");
        }
    }

}
17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.api.entry; 19 | 20 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryRelation; 21 | import de.tudarmstadt.ukp.jwktl.api.RelationType; 22 | 23 | /** 24 | * Default implementation of the {@link IWiktionaryRelation} interface. 25 | * See there for details. 26 | * @author Christian M. Meyer 27 | */ 28 | public class WiktionaryRelation implements IWiktionaryRelation { 29 | 30 | protected String target; 31 | protected RelationType type; 32 | protected String targetSense; 33 | protected LinkType linkType; 34 | //protected RelationSourceType relationSourceType; 35 | 36 | /** Creates a new, empty relation. */ 37 | public WiktionaryRelation() {} 38 | 39 | /** Creates a new relation for the given target and relation type. */ 40 | public WiktionaryRelation(final String target, final RelationType type) { 41 | this.target = target; 42 | this.type = type; 43 | //this.relationSourceType = RelationSourceType.ENTRY; 44 | } 45 | 46 | public RelationType getRelationType() { 47 | return type; 48 | } 49 | 50 | public String getTarget() { 51 | return target; 52 | } 53 | 54 | public String getTargetSense() { 55 | return targetSense; 56 | } 57 | 58 | /** Specifies additional information on the target word sense. */ 59 | public void setTargetSense(final String targetSense){ 60 | this.targetSense = targetSense; 61 | } 62 | 63 | public LinkType getLinkType() { 64 | return linkType; 65 | } 66 | 67 | /** Assigns a new link type for this relation. 
*/ 68 | public void setLinkType(final LinkType linkType) { 69 | this.linkType = linkType; 70 | } 71 | 72 | @Override 73 | public String toString() { 74 | return type + ":" + target; 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/de/tudarmstadt/ukp/jwktl/examples/Example5_MultipleLanguages.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2013 3 | * Ubiquitous Knowledge Processing (UKP) Lab 4 | * Technische Universität Darmstadt 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | ******************************************************************************/ 18 | package de.tudarmstadt.ukp.jwktl.examples; 19 | 20 | import java.io.File; 21 | 22 | import de.tudarmstadt.ukp.jwktl.JWKTL; 23 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryCollection; 24 | import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry; 25 | 26 | /** 27 | * Example for combining information from multiple Wiktionary language editions 28 | * in a so-called collection. 29 | * @author Yevgen Chebotar 30 | * @author Christian M. Meyer 31 | */ 32 | public class Example5_MultipleLanguages { 33 | 34 | /** Runs the example. 35 | * @param args two names of a directory containing parsed Wiktionary data 36 | * (German and English in the example). 
*/ 37 | public static void main(String[] args) { 38 | if (args.length != 2) 39 | throw new IllegalArgumentException("Too few arguments. " 40 | + "Required arguments: " 41 | + ""); 42 | 43 | // Create new IWiktionaryCollection for the parsed databases. 44 | IWiktionaryCollection wktColl = JWKTL.openCollection(new File(args[0]), new File(args[1])); 45 | 46 | // Query for "arm" in both language editions and print the resulting entries. 47 | for (IWiktionaryEntry entry : wktColl.getEntriesForWord("arm")) { 48 | // Print the language of the defining language edition. 49 | System.out.println(entry.getPage().getEntryLanguage() + ":"); 50 | 51 | // Print the word and its language and part of speech. 52 | System.out.println(" " + entry.getWord() 53 | + "/" + entry.getPartOfSpeech() 54 | + "/" + entry.getWordLanguage()); 55 | } 56 | 57 | // Close the Wiktionary edition (closes all attached editions). 58 | wktColl.close(); 59 | } 60 | 61 | } 62 | --------------------------------------------------------------------------------