├── mak ├── mkemmorph.sh ├── setroot.sh ├── errchk.sh ├── mkX.sh ├── mkhuhfst.sh ├── xlx2lglexc.sh ├── mk1.sh ├── mkxlxrmseg.sh ├── xlx2lglexc.make ├── hu-hfst.make └── xlxu.make ├── quotethis.bib ├── LICENSE.txt ├── src ├── filesX.hpl ├── sfxfsa.hpl ├── sfxfsagen.hpl ├── fixrps.pat └── rev.srt ├── pl ├── generic │ ├── mtouch.pl │ ├── bsort.pl │ ├── stemalt.pl │ ├── sfxalt.pl │ ├── convtags.pl │ ├── banner.pl │ ├── dumpsh.pl │ ├── multich.pl │ ├── lcase.pl │ ├── diewarn.pl │ ├── lx3lex.pl │ ├── selrep.pl │ ├── m2getopt.pl │ ├── entfix.pat │ ├── dumpdata.pl │ ├── mtxlex.pl │ ├── scanmeta.pl │ ├── newproplst.pl │ ├── normform.pl │ ├── mtx2hash.pl │ ├── metalex.pl │ ├── set.pl │ ├── sort.pl │ ├── metadict.pl │ └── lex2.pl └── mkavs │ ├── uniq.pl │ ├── irreg.pl │ ├── fixpron.pl │ ├── getsfxtags.pl │ ├── sfxlex1.pl │ ├── addfea.pl │ └── stmlex2.pl ├── README.md └── lexc └── casenormhuX.xfs /mak/mkemmorph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | bash mkX.sh xlx &&\ 3 | bash mkxlxrmseg.sh &&\ 4 | bash xlx2lglexc.sh huX &&\ 5 | bash mkhuhfst.sh &&\ 6 | mkdir -p ../hfst &&\ 7 | mv ../lexc/hu.hfstol ../hfst/hu.hfstol # reached only when every build step above succeeded 8 | -------------------------------------------------------------------------------- /mak/setroot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$hpldir" != "" ]; then export ROOT="$hpldir"; fi 3 | if [ "$ROOT" == "" ]; then export ROOT=..; fi 4 | export bsort="$ROOT"/pl/generic 5 | export humlogdir="$ROOT"/gen/ 6 | -------------------------------------------------------------------------------- /mak/errchk.sh: -------------------------------------------------------------------------------- 1 | ERR=$? 2 | if [ $ERR -gt 0 ]; then 3 | echo There were errors in ${0##*/}... see $logfile 4 | echo There were errors in ${0##*/}... 
>>"$logfile" 5 | else 6 | echo Done ${0##*/} 7 | echo Done ${0##*/} >>"$logfile" 8 | fi 9 | exit $ERR 10 | -------------------------------------------------------------------------------- /mak/mkX.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$hpldir" == "" ]; then export hpldir=..; fi 4 | export bsort="$hpldir/pl/generic" 5 | export ANA=1 6 | export BIT2MTX=1 7 | export METADICT=metadict 8 | export GSFX=X 9 | 10 | bash "$hpldir/mak/mk1.sh" X xlx 11 | 12 | -------------------------------------------------------------------------------- /mak/mkhuhfst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$hpldir" == "" ]; then export hpldir=..; fi 3 | source "$hpldir"/mak/setroot.sh 4 | export logfile="$humlogdir"make"$GEN""$APP"hfst.log 5 | 6 | cd $hpldir/lexc 7 | make -rR -f "$hpldir"/mak/hu-hfst.make 2>"$logfile" 8 | 9 | source $hpldir/mak/errchk.sh 10 | 11 | -------------------------------------------------------------------------------- /mak/xlx2lglexc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$hpldir" == "" ]; then export hpldir=..; fi 3 | source "$hpldir"/mak/setroot.sh 4 | export F=huX 5 | 6 | export logfile="$humlogdir"make"$GEN""$APP"lexc.log 7 | 8 | make -rR -f "$hpldir"/mak/xlx2lglexc.make 2>"$logfile" 9 | 10 | source $hpldir/mak/errchk.sh 11 | 12 | -------------------------------------------------------------------------------- /mak/mk1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$hpldir" == "" ]; then export hpldir=..; fi 3 | source "$hpldir"/mak/setroot.sh 4 | mkdir $hpldir/gen 5 | export APP=$1 6 | if [ "$1" == "." 
]; then export APP=; fi 7 | shift 8 | export logfile="$humlogdir"make"$GEN""$APP".log 9 | cd $hpldir/gen 10 | make -rR -f "$hpldir"/mak/uhun.make 2>"$logfile" $1 11 | 12 | source $hpldir/mak/errchk.sh 13 | 14 | -------------------------------------------------------------------------------- /mak/mkxlxrmseg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export X=X 3 | export BASELEX= 4 | export S=X 5 | export LEX=genX.lx2 6 | 7 | export hpldir=.. 8 | source "$hpldir/mak/setroot.sh" 9 | export EXCL="\"-excl=restr:[gGm]+|infsfx.*(?:IN|EX|AD)L\"" 10 | export SRFONLY=-rmseg 11 | 12 | logfile="$humlogdir""makeXLX$GUESS$x$s.log" 13 | cd $hpldir/gen 14 | make -rR -f $hpldir/mak/xlxu.make 2> "$logfile" $1 15 | 16 | source $hpldir/mak/errchk.sh 17 | 18 | -------------------------------------------------------------------------------- /quotethis.bib: -------------------------------------------------------------------------------- 1 | @InProceedings{novak14, 2 | author = {Attila Novák}, 3 | title = {A New Form of {Humor} -- {Mapping} Constraint-Based Computational Morphologies to a Finite-State Representation}, 4 | booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)}, 5 | year = {2014}, 6 | month = {may}, 7 | date = {26-31}, 8 | address = {Reykjavik, Iceland}, 9 | editor = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Hrafn Loftsson and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis}, 10 | publisher = {European Language Resources Association (ELRA)}, 11 | isbn = {978-2-9517408-8-4}, 12 | language = {english} 13 | } 14 | 15 | @InProceedings{novak16, 16 | author = {Attila Novák and Borbála Siklósi and Csaba Oravecz}, 17 | title = {A New Integrated Open-source Morphological Analyzer for {Hungarian}}, 18 | booktitle = {Proceedings of the Tenth International {Conference on Language Resources and Evaluation 
(LREC 2016)}}, 19 | year = {2016}, 20 | month = {may}, 21 | date = {23-28}, 22 | location = {Portorož, Slovenia}, 23 | editor = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis}, 24 | publisher = {European Language Resources Association (ELRA)}, 25 | address = {Paris, France}, 26 | isbn = {978-2-9517408-9-1}, 27 | language = {english} 28 | } 29 | 30 | @inproceedings{novak03, 31 | address = {Szeged}, 32 | title = {Milyen a jó {Humor}? [{What} is good {Humor} like?]}, 33 | booktitle = {I. {Magyar} {Számítógépes} {Nyelvészeti} {Konferencia}}, 34 | publisher = {SZTE}, 35 | author = {Novák, Attila}, 36 | year = {2003}, 37 | pages = {138--144} 38 | } 39 | 40 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 2 | Copyright (C) 2001-2016 Attila Novák 3 | 4 | The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 5 | The resource is available from: https://github.com/dlt-rilmta/emMorph 6 | 7 | The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 8 | (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 9 | with the following amendments: 10 | 11 | By downloading/cloning/using this database and tools you accept the following terms: 12 | 13 | 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 14 | clearly indicating what you use this database or tool for in your application/experiment/resource. 15 | 16 | 2. 
If possible, please publish a scientific paper about each application, experimental system 17 | or linguistic resource you create or experiment you perform using this resource quoting the articles below, 18 | and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 19 | 20 | Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 21 | (See the BibTeX file quotethis.bib in the root directory): 22 | 23 | Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 24 | In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 25 | 26 | Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 27 | In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 28 | 29 | Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 30 | 31 | 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 32 | 33 | 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. -------------------------------------------------------------------------------- /src/filesX.hpl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 
4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] 
In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | $proplst='proplstX.hpl'; 52 | $propsets='propsets'; 53 | $humsrc='../gen'; 54 | -------------------------------------------------------------------------------- /pl/generic/mtouch.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | $now = time; 56 | utime $now, $now, map glob,(@ARGV); 57 | -------------------------------------------------------------------------------- /pl/mkavs/uniq.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | while(<>) 56 | { 57 | print unless $_ eq $p; 58 | $p=$_; 59 | } 60 | -------------------------------------------------------------------------------- /pl/generic/bsort.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | BEGIN{ 55 | $hpldir=$ENV{'hpldir'} if !$hpldir; 56 | $hpldir='../..' if !$hpldir; 57 | } 58 | 59 | use lib "$hpldir/pl/generic"; 60 | use lib "$hpldir/src"; 61 | 62 | do 'm2getopt.pl'; 63 | 64 | for(sort{$a cmp $b}(<>)) 65 | { 66 | if($uniq) 67 | { 68 | next if $p eq $_; 69 | $p=$_; 70 | } 71 | print $_; 72 | } 73 | 74 | -------------------------------------------------------------------------------- /pl/generic/stemalt.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | BEGIN{ 56 | $hpldir=$ENV{'hpldir'} if !$hpldir; 57 | $hpldir='../..' if !$hpldir; 58 | } 59 | 60 | use lib "$hpldir/pl/generic"; 61 | use lib "$hpldir/src"; 62 | 63 | require 'mkrulef.pl'; 64 | require 'banner.pl'; 65 | 66 | start_banner('Rule file to perl script converter'); 67 | mkrulefile (qw/stemalt rp rr/); 68 | end_banner(); 69 | -------------------------------------------------------------------------------- /pl/generic/sfxalt.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | BEGIN{ 56 | $hpldir=$ENV{'hpldir'} if !$hpldir; 57 | $hpldir='../..' if !$hpldir; 58 | } 59 | 60 | use lib "$hpldir/pl/generic"; 61 | use lib "$hpldir/src"; 62 | 63 | require 'mkrulef.pl'; 64 | 65 | require 'banner.pl'; 66 | 67 | start_banner('Rule file to perl script converter'); 68 | mkrulefile (qw/sfxalt1 lp lr/); 69 | end_banner(); 70 | -------------------------------------------------------------------------------- /pl/generic/convtags.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ##################################################

use strict;
use warnings;
use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

# Usage: perl convtags.pl tagconvfile lexcfile
#
# Reads a tab-separated "from<TAB>to" tag mapping from tagconvfile, then copies
# the remaining input (files named on the command line, or STDIN) to STDOUT,
# replacing every bracketed tag "[from]" by "[to]".

my $convfile = shift;
die "Usage: perl convtags.pl tagconvfile lexcfile\n" unless defined $convfile;

open my $conv_fh, '<', $convfile
    or die "Cannot open tag conversion file '$convfile': $!\n";

# Load the mapping. Square brackets are deleted first, so the table may list
# the tags with or without them.
my %conv;
while (<$conv_fh>)
{
    tr/[]//d;
    my ($from, $to) = split /[\t\n]/, $_;
    # A blank line (or one with an empty first column) names no tag to convert.
    next unless defined $from and length $from;
    $conv{$from} = defined $to ? $to : '';
}
close $conv_fh;

# One alternation over all source tags, escaped so they match literally.
my $tag_alternatives = join '|', map {quotemeta} sort {$b cmp $a} keys %conv;
#warn "$tag_alternatives\n";
my $tag_re = qr/\[($tag_alternatives)\]/;

while (<>)
{
    # Insert 'N' before a trailing e1..e3 / t1..t3 tag ending that follows
    # '|NM' ... '_' (pattern kept exactly as in the original script).
    s/(\|NM.*_)([et][1-3]\])/$1N$2/g;
    # With an empty table the alternation would match "[]"; skip it then.
    s/$tag_re/\[$conv{$1}\]/g if %conv;
    print;
}
-------------------------------------------------------------------------------- /pl/generic/banner.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ##################################################

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

# Start/finish banners written to STDERR.
# The start time is taken when this file is loaded, so the elapsed time printed
# by end_banner() is measured from the `require`, not from start_banner().
{
my ($stm,$end,$oldfh);

$stm=time;
# Turn on autoflush for STDERR, then re-select the previously selected handle.
$oldfh = select(STDERR); $| = 1; select($oldfh);

# start_banner($title)
# Prints a time-stamped "STARTING" line with the title and the script name
# ($0), followed by the current contents of @ARGV.
sub start_banner
{
    print STDERR "<<<",scalar(localtime),": STARTING $_[0] ($0)\n---args: @ARGV\n";
}

# end_banner()
# Prints the elapsed time as minutes:seconds, the current input line number
# ($.) and the script name ($0).
sub end_banner
{
    $end=time-$stm;
    # $. and $0 are passed as arguments rather than interpolated into the
    # format string, so a '%' in the script path cannot be taken for a
    # conversion specification.
    printf STDERR ">>>elapsed: %02d:%02d, input records=%s, FINISHED %s\n\n",
        $end/60, $end%60, (defined $. ? $. : ''), $0;
}
}
# Marks the file as loaded; as the last statement it also supplies the true
# value that `require` expects.
$banner_loaded=1;
-------------------------------------------------------------------------------- /pl/generic/dumpsh.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian.
4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] 
In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ##################################################

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

# Resolve the tool-chain root at compile time for the `use lib` line below.
BEGIN {
    $hpldir ||= $ENV{'hpldir'} || '../..';
}
use lib "$hpldir/pl/generic";

require 'selrep.pl';

use Data::Dumper;

# dumpsh($values, $names)
# Serialises the data with Data::Dumper->Dumpxs and hands the text to
# selrep() together with a selector pattern (a '[' ... ']' list whose first
# element is a quoted string, spread over several lines) and a substitution
# that deletes newline-plus-indentation.
# NOTE(review): presumably selrep() applies the substitution inside each
# selected span, joining such lists onto one line -- confirm in selrep.pl.
# Returns whatever selrep() returns.
sub dumpsh
{
    my ($values, $names) = @_;

    # Package-wide Data::Dumper settings; they remain in force after the call.
    ($Data::Dumper::Terse, $Data::Dumper::Indent, $Data::Dumper::Deepcopy) = (0, 1, 1);

    my $dump = Data::Dumper->Dumpxs($values, $names);
    selrep($dump, '/\[\n\s+\'.*?\n\s+\]/s', ('s/\n\s+//sg'));
}
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # emMorph (Humor) Hungarian morphological analyzer 2 | 3 | ## Requirements 4 | 5 | The morphological analyzer can be compiled on Unix systems. 6 | You'll need perl and hfst. Hfst version above 9.12.0 required.
7 | 8 | ### Installing hfst 9 | 10 | ``` 11 | wget http://apertium.projectjj.com/rpm/install-nightly.sh -O - | sudo bash 12 | sudo apt-get install hfst 13 | ``` 14 | 15 | ## Compilation of the morphology 16 | 17 | ``` 18 | cd mak 19 | bash mkemmorph.sh 20 | ``` 21 | 22 | The compiled lexicon is `hfst/hu.hfstol` 23 | 24 | ## Usage 25 | 26 | ``` 27 | hfst-lookup --cascade=composition hu.hfstol 28 | ``` 29 | 30 | If you want to redirect input from a file, use: 31 | 32 | ``` 33 | hfst-lookup --pipe-mode=input --cascade=composition hu.hfstol <intext >outtext 34 | ``` 35 | 36 | ### Lemmatized output 37 | 38 | If you want lemmatized output, download the lemmatizer from [https://github.com/dlt-rilmta/hunlp-GATE/tree/master/Lang_Hungarian/resources/hfst](https://github.com/dlt-rilmta/hunlp-GATE/tree/master/Lang_Hungarian/resources/hfst). 39 | 40 | One way to download only this tool from the repository: 41 | Visit [http://kinolien.github.io/gitzip/](http://kinolien.github.io/gitzip/) and paste the link above. 42 | 43 | ### Using the compiled lexicon on Windows 44 | 45 | Install hfst from: [http://apertium.projectjj.com/win32/nightly/hfst-latest.7z](http://apertium.projectjj.com/win32/nightly/hfst-latest.7z). 46 | 47 | Usage is the same as above. 48 | 49 | ## License 50 | 51 | Copyright (C) 2001-2016 Attila Novák 52 | 53 | The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 54 | with the amendments below under Publication. 55 | 56 | If you are interested in using or adapting the resource for commercial purposes, please contact the author at: [novakat@gmail.com](mailto:novakat@gmail.com) 57 | 58 | ## Publication 59 | 60 | If you use this database and/or the tools: 61 | 62 | 1. 
Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools clearly indicating what you use this database or tool for in your application/experiment/resource. 63 | 64 | 2. If possible, please publish a scientific paper about each application, experimental system or linguistic resource you create or experiment you perform using this resource quoting the articles below, and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 65 | 66 | Articles to quote are the following: (See the BibTeX file [quotethis.bib](https://github.com/nytud/emMorph/blob/master/quotethis.bib) in the root directory): 67 | 68 | * Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 69 | 70 | * Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 71 | 72 | * Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 73 | 74 | 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 75 | 76 | ## References 77 | 78 | This tool is also [integrated](https://github.com/dlt-rilmta/hunlp-GATE) into the [e-magyar](http://www.e-magyar.hu) language processing system and its successor, [emtsv](https://github.com/nytud/emtsv). 79 | If you use the e-magyar/emtsv/hunlp-GATE system, please also refer to Novák et al. (2016). 
80 | -------------------------------------------------------------------------------- /pl/mkavs/irreg.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 
26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ##################################################

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

# Resolve the tool-chain root at compile time for the `use lib` lines below.
BEGIN {
    $hpldir ||= $ENV{'hpldir'} || '../..';
}

use lib "$hpldir/gen";
use lib "$hpldir/pl/generic";
use lib "$hpldir/src";

require 'diewarn.pl';
require 'banner.pl';
start_banner('Irregular word gatherer');
require 'm2getopt.pl';

# Copy the selected input records to STDOUT, each one marked with ' **...'.
RECORD:
while (<>)
{
    # Skipped outright: records with a '+' before the first ';', records
    # carrying |ROV or |BETU, lines whose first non-blank character is '*'
    # unless they begin with '**...', and blank lines.
    if (/^[^;]+\+|\|(?:ROV|BETU)|^\s*\*(?!\*\.\.\.)|^\s*$/)
    {
        next RECORD;
    }

    # Remove the zarte:/isa:/loc: properties (up to their ';') and 'rp:male;'.
    s/(zarte|isa|loc):.*?;|rp:male;//g;

    # Drop the record when '];' is followed only by whitespace or by a '*'.
    if (/\];\s*(?:$|\*)/)
    {
        next RECORD;
    }

    # Add the ' **...' marker unless the line already has one, then reduce the
    # first run of six dots to three.
    s/^(?!\s*\*\*\.\.\.)/ **.../;
    s/\.{6}/.../;
    print;
}

# warn1() and die_if_errors() are not defined here; presumably they come from
# diewarn.pl.
warn1("There were errors.\n") if $error;
die_if_errors();

end_banner();
-------------------------------------------------------------------------------- /pl/generic/multich.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2.
If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | BEGIN{ 56 | $hpldir=$ENV{'hpldir'} if !$hpldir; 57 | $hpldir='../..' if !$hpldir; 58 | } 59 | 60 | use lib "$hpldir/pl/generic"; 61 | use lib "$hpldir/src"; 62 | use lib "$hpldir/gen"; 63 | 64 | require 'm2getopt.pl'; 65 | #extract multi character symbols form X lexicons 66 | while(<>) 67 | { 68 | next if /^\s*!|LEXICON\s/; 69 | ($w,$cont)=split(/(? {}, 53 | 'mcat_deriv;' => {}, 54 | }; 55 | $sfxfsa->{''}{''}{''}{'mcat:CASE;'} = {}; 56 | $sfxfsa->{''}{''}{'mcat:ANP;'} = $sfxfsa->{''}{''}{''}; 57 | $sfxfsa->{''}{'mcat:FAM;'} = $sfxfsa->{''}{''}; 58 | $sfxfsa->{'mcat:PL;'} = $sfxfsa->{''}{''}; 59 | $sfxfsa->{'mcat:POSS;'} = $sfxfsa->{''}; 60 | $sfxfsa->{'mcat:POSS;'}{'mcat:KEPP;'} = {}; 61 | #Ás+Om+kor 62 | $sfxfsa->{'tag:VNAs;'}{'lp:POSS;'}{'tag:TEM;'} = {}; 63 | $sfxfsa->{'tag:VNAs;'}{'phon:LA;'} = $sfxfsa->{'tag:VNAs;'}{'lp:POSS;'}; 64 | $sfxfsa->{'tag:VNAs;'}{'phon:LUkL;'} = $sfxfsa->{'tag:VNAs;'}{'lp:POSS;'}; 65 | #bb+An/Ul 66 | $sfxfsa->{'tag:CMP;'}{'tag:AAdvMANR;'} = {}; 67 | #cskA+bb+An/Ul 68 | $sfxfsa->{'tag:DIMA;'}{'tag:CMP;'} = $sfxfsa->{'tag:CMP;'}; 69 | #cskA+An/Ul 70 | $sfxfsa->{'tag:DIMA;'}{''} = $sfxfsa->{'tag:CMP;'}; 71 | 72 | 1; -------------------------------------------------------------------------------- /src/sfxfsagen.hpl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 
4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] 
In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ##################################################

# Builds the global $sfxfsa, a nested hash of hashes keyed by property strings
# such as 'mcat:PL;' or 'tag:CMP;'. Several keys are deliberately bound to the
# SAME sub-hash reference, so the statement order matters: a sub-hash has to
# exist before it is aliased, and anything added to it later is visible
# through every alias.
# NOTE(review): presumably each sub-hash lists what may follow the key that
# leads to it, and the empty-string key stands for a skipped (optional) step --
# confirm against the code that walks $sfxfsa.
$sfxfsa = {
'mcat:INFL;' => {},
# NOTE(review): '_' here where every other key uses ':' -- confirm intended.
'mcat_deriv;' => {},
};
# Autovivifies three nested '' levels and puts 'mcat:CASE;' on the innermost.
$sfxfsa->{''}{''}{''}{'mcat:CASE;'} = {};
# 'mcat:ANP;' on the second '' level shares the innermost hash.
$sfxfsa->{''}{''}{'mcat:ANP;'} = $sfxfsa->{''}{''}{''};
# 'mcat:FAM;' on the first '' level shares the second-level hash.
$sfxfsa->{''}{'mcat:FAM;'} = $sfxfsa->{''}{''};
# Top-level 'mcat:PL;' shares that same second-level hash.
$sfxfsa->{'mcat:PL;'} = $sfxfsa->{''}{''};
# Top-level 'mcat:POSS;' shares the first-level hash ...
$sfxfsa->{'mcat:POSS;'} = $sfxfsa->{''};
# ... so this key is also reachable as $sfxfsa->{''}{'mcat:KEPP;'}.
$sfxfsa->{'mcat:POSS;'}{'mcat:KEPP;'} = {};
#Ás+Om+kor
$sfxfsa->{'tag:VNAs;'}{'lp:POSS;'}{'tag:TEM;'} = {};
# 'phon:LA;' and 'phon:LUkL;' share the hash under 'lp:POSS;'.
$sfxfsa->{'tag:VNAs;'}{'phon:LA;'} = $sfxfsa->{'tag:VNAs;'}{'lp:POSS;'};
$sfxfsa->{'tag:VNAs;'}{'phon:LUkL;'} = $sfxfsa->{'tag:VNAs;'}{'lp:POSS;'};
#bb+An/Ul
$sfxfsa->{'tag:CMP;'}{'tag:AAdvMANR;'} = {};
#cskA+bb+An/Ul
$sfxfsa->{'tag:DIMA;'}{'tag:CMP;'} = $sfxfsa->{'tag:CMP;'};
#cskA+An/Ul
$sfxfsa->{'tag:DIMA;'}{''} = $sfxfsa->{'tag:CMP;'};

# True value for `require`/`do`.
1;
-------------------------------------------------------------------------------- /pl/generic/lcase.pl:
-------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 
1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | BEGIN{ 56 | $hpldir=$ENV{'hpldir'} if !$hpldir; 57 | $hpldir='../..' 
if !$hpldir; 58 | } 59 | 60 | use lib "$hpldir/pl/generic"; 61 | 62 | require 'm2getopt.pl'; 63 |
# Main filter loop: reads records from the files named in @ARGV (or from STDIN)
# and writes the result to STDOUT.
#  - Next to an existing @D.A.+@ (and, on lines without '[', @P.O.+@ / @P.D.+@)
#    symbol the corresponding *.LCA.* symbols are inserted.
#  - A record "lexical:surface<TAB>rest" whose surface side contains an
#    upper-case letter next to a lower-case one is replaced by two records:
#    one with the surface side lower-cased (marked @P.LCA.+@) and one with the
#    surface side unchanged (marked @P.LCA.-@).
# Every other line is printed (after the symbol insertion) as it is.
while(<>)
{
    if(/\[/)
    {
        # lines containing '[': the inserted symbols are separated by spaces
        s/(\@D\.A\.\+\@)/$1 \@D.LCA.+\@ \@D.LCA.-\@ \@P.LCA.+\@ \@P.LCA.-\@ \@C.LCA\@/;
    }
    else
    {
        s/(\@D\.A\.\+\@)/$1\@D.LCA.+\@/;
        s/(\@P\.[OD]\.\+\@)/$1\@D.LCA.-\@\@C.LCA\@/;
    }

    # the ':' separating the two sides must not be directly preceded by '%'
    if(/^(.+?[^%]):(.+?)\t(.*)/)
    {
        my($lexical,$surface,$tail)=($1,$2,$3);
        if($surface=~/\p{Lu}\p{Ll}|\p{Ll}\p{Lu}/)
        {
#           print ;#"$1\t$2\t$3\n";
            my $lowered=lc $surface;
            print "\@P.LCA.+\@$lexical:\@P.LCA.+\@$lowered\t$tail\n";
            print "\@P.LCA.-\@$lexical:\@P.LCA.-\@$surface\t$tail\n";
            next;
        }
    }
    print;
}
-------------------------------------------------------------------------------- /pl/generic/diewarn.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2.
If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 |
# This provides warn and die subs which prefix "WARN: " / "ERROR: " to the
# message, respectively.  Unless the last message fragment already ends in a
# newline, the line number of the caller and the current input record number
# ($.) are appended to it.
{
    # number of errors reported through die1(); checked by die_if_errors()
    my $errorflag;

    # dienow(@message): terminate immediately with the decorated message.
    sub dienow
    {
        my(@a)=@_;
        $a[0]="ERROR: ".$a[0];
        $a[-1].=" at line ".(caller())[2].", input record: $.\n" if $a[-1]!~/\n$/;
        # die with the decorated copy; passing @_ would drop prefix and location
        die @a;
    }

    # warn1(@message): print the decorated message as a warning and continue.
    sub warn1
    {
        my(@a)=@_;
        $a[0]="WARN: ".$a[0];
        $a[-1].=" at line ".(caller())[2].", input record: $.\n" if $a[-1]!~/\n$/;
        warn @a;
    }

    # die1(@message): deferred error - the decorated message is printed as a
    # warning and the error is counted, so that processing can go on and
    # die_if_errors() can abort later.
    sub die1
    {
        my(@a)=@_;
        $a[0]="ERROR: ".$a[0];
        $a[-1].=" at line ".(caller())[2].", input record: $.\n" if $a[-1]!~/\n$/;
        warn(@a);
        $errorflag++;
    }

    # die_if_errors(): terminate if die1() has been called at least once.
    sub die_if_errors
    {
        die "$errorflag fatal error(s) occured.\n" if $errorflag;
    }

    # true value, so that require'ing this file succeeds
    1;
}
-------------------------------------------------------------------------------- /pl/generic/lx3lex.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | BEGIN{ 56 | $hpldir=$ENV{'hpldir'} if !$hpldir; 57 | $hpldir='..' 
if !$hpldir; 58 | } 59 | 60 | use lib "$hpldir/pl/generic"; 61 | use lib "$hpldir/src"; 62 | 63 | require 'm2getopt.pl'; 64 | 65 | require 'banner.pl'; 66 | start_banner('X allomorph lexicon converter'); 67 | 68 | $"=''; 69 |
# Main loop: every input record consists of tab-separated fields.  The first
# field (with a leading "lex:" / "lex%:" removed) names the lexicon the record
# belongs to; a "LEXICON name" header is printed whenever that name differs
# from the one of the previous record.  The second and third fields are then
# printed separated by a tab.
while(<>)
{
    # drop everything from "<TAB>restr:" up to the end of the line
    s/\trestr:.*//;
    my($lexname,$form,$continuation)=split /[\t\n]/,$_;
#   $cont{$cont}++;
    $lexname=~s/^lex\%?://;
    unless($lexname eq $plex)
    {
        print "\nLEXICON $lexname\n";
    }
    # $plex (name seen in the previous record) deliberately stays a global
    $plex=$lexname;
    print "$form\t$continuation\n";
}
80 | 81 | =cmt 82 | print STDERR "\nMultichar_Symbols\n"; 83 | for(sort keys %multi) 84 | { 85 | print STDERR "$_ "; 86 | } 87 | print STDERR "\n\n"; 88 | 89 | delete $cont{"#;"}; 90 | 91 | for(sort keys %cont) 92 | { 93 | tr/;//d; 94 | print STDERR "\nLEXICON $_\n"; 95 | } 96 | =cut 97 | end_banner(); 98 | -------------------------------------------------------------------------------- /pl/generic/selrep.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource.
17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 
42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 |
# Worker behind selrep() and selexrep().
#   $exc_  false: apply the operations to the selected parts of the string
#          true:  apply them to the parts outside the selection
#   $str_  the string to process
#   $sel_  source text of a match operator selecting a part, e.g. '/\{@.*?@\}/g'
#   @rep_  source texts of operations that may follow '=~', e.g. 's/[|+=]//g'
# $sel_ and @rep_ are pasted into generated code which is run with string
# eval, so they must come from trusted code.  The generated loop repeatedly
# matches $sel_ against the not yet processed rest of the string and splits
# it into the part before the match, the selected part and the rest; when the
# whole match equals $1.$2.$3, only $2 counts as selected.  Errors raised by
# the generated code are not inspected.
# Returns the reassembled string.
sub selrep0
{
    my($exc_,$str_,$sel_,@rep_)=@_;
    # referenced by name from the generated code - do not rename
    my($a_,$b_,$c_,$d_);
    # name of the variable the operations are applied to
    my $spc_=$exc_?'$a_':'$b_';
    my $todo_='$c_=$str_;$d_=\'\';while($a_=($c_=~'.$sel_.')'.($exc_?',$a_||$c_':'').'){($a_,$b_,$c_)=!$a_?($c_,\'\',\'\'):(($& eq $1.$2.$3)?($`.$1,$2,$3.$\'):($`,$&,$\'));'."\n";
    $todo_.=join('',map { "$spc_=~".$_.";\n" } @rep_);
    $todo_.='$d_=$d_.$a_.$b_;}';
#   print $todo_;
    eval $todo_;
    return $d_.$c_;
}

#selective replace
#replace only in selected part
#selrep($in_what,$selection,@subst_expr_list)
#e.g.:
#$selection='/\{@.*?@\}/g';
#@subst_expr_list=('s/[|+=]//g');
sub selrep
{
    return selrep0(0,@_);
}

#selective replace
#replace only outside the selected part
#selexrep($in_what,$selection,@subst_expr_list)
sub selexrep
{
    return selrep0(1,@_);
}

1;
-------------------------------------------------------------------------------- /pl/mkavs/fixpron.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | #this script fixes pronouns so that they have the lemma én/te etc. 56 | #as determined by person+number marking e1/e2 etc. 
57 | 58 | use lib "$hpldir/pl/generic"; 59 | use lib "$hpldir/src"; 60 | 61 | do 'delim.hpl' unless $delim; 62 | $delim="\x1" unless $delim; 63 |
# Pronoun lemma for each person/number code (e1..e3, t1..t3).  When
# $zarte_ana is set, the code e2 maps to 'të' instead of 'te'.
%ppron=qw/e1 én e2 te e3 ő t1 mi t2 ti t3 ők/;
$ppron{e2}='të' if $zarte_ana;

# Main loop: retags punctuation-like categories and replaces the lemma of
# pronoun records; every input line is printed, changed or not.
while(<>)
{
    # remove "abbrperiodPUNCT" and the last following "S_PUNCT", keeping the text between them
    s/abbrperiodPUNCT(.*)S_PUNCT/$1/;
    # S_PUNCT / S_KSZ become I_PUNCT / I_KSZ
    s/S_(PUNCT|KSZ)/I_$1/g;
    # S_ directly followed by KJ, HKJ or PER becomes H_
    s/S_(?=H?KJ|PER)/H_/g;
    # two $delim-introduced fields of the form "x+y" followed by a field starting
    # with FN|NM that carries a person/number code: $1 is the x of the first
    # of these fields, $2 the person/number code
    if (/${delim}([^${delim}+]*)\+[^${delim}]+${delim}[^${delim}+]*\+[^${delim}]+${delim}FN\|NM[^${delim}]*\+([et][1-3])/o)
    {
        my($lemma,$persnum)=($1,$2);
        if($lemma eq 'ugyanő')
        {
            $pro="ugyan$ppron{$persnum}";
        }
        else
        {
            $pro=$ppron{$persnum};
        }
        # put the pronoun lemma before the '+' of the field that directly
        # precedes the FN|NM field
        s/${delim}[^${delim}+]*\+(?=[^${delim}]+${delim}FN\|NM(?:[^${delim}]+\+[et][1-3]|[^${delim}]*\+[et][1-3]\+(?:ACC|NOM)))/"${delim}$pro+"/oe;
    }
    print;
}
84 | -------------------------------------------------------------------------------- /pl/generic/m2getopt.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1.
Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 
36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 |
# Minimal command line option parser; it runs when this file is require'd.
# Leading arguments that start with '-' followed by a character other than '-'
# are removed from @ARGV and interpreted:
#   -=FILE        redirect STDOUT to FILE (FILE is also stored in $OutFileName)
#   -name=value   set the global variable $name to 'value'
#   -name         set the global variable $name to the true string '1;'
# Parsing stops at the first argument of any other form, which stays in @ARGV.
# After the loop, a defined $stderr (e.g. from -stderr=FILE) redirects STDERR.
# NOTE: option text is executed with string eval and file names are passed to
# two-argument open, so only trusted command lines may be used.
while ( $_=shift(@ARGV) )
{
    if(/^-([^-])/)
    {
        if($1 eq '=')
        {
            $OutFileName=$';
            # an empty pattern reuses the last successful one (/^-([^-])/),
            # so the leading "-=" is replaced by ">"
            s//>/;
            die "unable to open $_ for writing" unless open (GO_OUT,$_);
            print STDERR "Output to $_\n";
            open(STDOUT, ">&GO_OUT") || die "Can't dup stdout";
#           select(GO_OUT);
        }
        else
        {
            # same empty-pattern trick: "-x..." becomes "$x..."
            s//\$$1/;
            $_.='=1;' if($_!~/=/);
            # $_ starts with '$' here, so this condition always holds and the
            # value is always single-quoted
            s/=(.*)/='$1'/ if($_!~/^=[0-9.-]+$/);
            print STDERR "$_\n";
            eval $_;
            # report options that could not be evaluated instead of dropping them silently
            print STDERR "Cannot set option: $@" if $@;
        }
    }
    elsif($_)
    {
        # first non-option argument: put it back and stop
        unshift(@ARGV,$_);
        last;
    }
    else
    {
        last;
    }
}
#   print STDERR "@ARGV\n";
if(defined $stderr)
{
    die "unable to open $stderr for writing" unless open (ERR_OUT,">$stderr");
    print STDERR "Error output to $stderr\n";
    open(STDERR, ">&ERR_OUT") || die "Can't dup stderr";
}
1;
96 | -------------------------------------------------------------------------------- /pl/generic/entfix.pat: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of
the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 
30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # The Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA) 39 | # license is available at: https://creativecommons.org/licenses/by-nc-sa/4.0/ 40 | # 41 | # Disclaimer of Warranties and Limitation of Liability. 42 | # 43 | # Unless otherwise separately undertaken by the Licensor, to the extent possible, 44 | # the Licensor offers the Licensed Material as-is and as-available, and makes no 45 | # representations or warranties of any kind concerning the Licensed Material, 46 | # whether express, implied, statutory, or other. This includes, without 47 | # limitation, warranties of title, merchantability, fitness for a particular 48 | # purpose, non-infringement, absence of latent or other defects, accuracy, or the 49 | # presence or absence of errors, whether or not known or discoverable. Where 50 | # disclaimers of warranties are not allowed in full or in part, this disclaimer 51 | # may not apply to You. 52 | # 53 | # To the extent possible, in no event will the Licensor be liable to You on any 54 | # legal theory (including, without limitation, negligence) or otherwise for any 55 | # direct, special, indirect, incidental, consequential, punitive, exemplary, or 56 | # other losses, costs, expenses, or damages arising out of this Public License or 57 | # use of the Licensed Material, even if the Licensor has been advised of the 58 | # possibility of such losses, costs, expenses, or damages. 
Where a limitation of 59 | # liability is not allowed in full or in part, this limitation may not apply to You. 60 | # 61 | ################################################## END OF LICENSE ################################################## 62 | 63 | /&plus%;/+/ 64 | /±%;/±/ 65 | /(&#\d+)%;/$1;/ 66 | #/&(?!#)/&/ 67 | -------------------------------------------------------------------------------- /pl/generic/dumpdata.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 
21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 
47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 |
# dumpdata($ref,$name,$filesfx)
# Returns Perl source text that recreates the data structure $ref in the
# variable $name.
#  - With the global $store set, the data is written with Storable to the file
#    "$filesfx.$name.str" and the returned code retrieves it from there.
#  - Otherwise the returned code is the Data::Dumper dump of $ref, named $name.
sub dumpdata
{
    my($ref,$name,$filesfx)=@_;
#   require 'dumpsh.pl';
    unless($store)
    {
        use Data::Dumper;
        $Data::Dumper::Terse=0;
        $Data::Dumper::Indent=1;
        $Data::Dumper::Deepcopy=1;

        return Data::Dumper->Dumpxs([$ref],[$name]);
#       use Data::Dump qw(dump);
#       my $a=dump($ref);
#       $a=~s/ {8,}/ /g;
#       $a=~s/(\d,|\[)\n/$1/g;
#       return "\n\$$name=\n$a;\n";
    }

    use Storable;
    my $file="$filesfx.$name.str";
    store ($ref,$file);
    # NOTE(review): the indentation inside this generated snippet could not be
    # recovered from the flattened source; it has no effect when the snippet
    # is executed as Perl code.
    return "\nuse Storable;\n\$$name=retrieve('$file');\n";

#   print Data::Dumper->Dumpxs([$mexp_sort],['mexp_sort']);
#   print Data::Dumper->Dumpxs([$mtxexps],['mtxexps']);
#   print Data::Dumper->Dumpxs([$matrices],['matrices']);
#   print Data::Dumper->Dumpxs([$mtxenc],['mtxenc']);
#   print Data::Dumper->Dumpxs([$Gprops],['Gprops']);
#   print dumpsh([$Gprops],['Gprops']);
#   print Data::Dumper->Dumpxs([$matrixsel],['matrixsel']);
#   print dumpsh([$Gpropset],['Gpropset']);
}

1;
-------------------------------------------------------------------------------- /pl/generic/mtxlex.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
# Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses.
#
# 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author.
# ***
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
################################################## END OF LICENSE ##################################################

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

# Toolkit root: a preset $hpldir wins, then the environment variable of the
# same name, then the relative default.
BEGIN{
	$hpldir||=$ENV{'hpldir'}||'../..';
}

use lib "$hpldir/pl/generic";
use lib "$hpldir/src";
use lib "$hpldir/gen";

require 'm2getopt.pl';

require 'diewarn.pl';
require 'banner.pl';

start_banner('X matrix lexicon generator');

$encoding||="encoding$gen.hpl";

# NOTE(review): $mtx, $Mclasses and $Lclasses are not defined in this script;
# presumably the two generated files required below provide them - confirm.
require "$encoding.mtxcont.hpl";
require "$encoding.morphclasses.hpl";

# For every key of %$Mclasses (named M_<rm>_<rc>) print a LEXICON section that
# lists, one per line, each continuation class L_<rm>_<c> (c taken from
# $mtx->{rm}{rc}) that is present in %$Lclasses.  A section left without any
# valid continuation is reported through die1 together with the rejected names.
for(sort keys %$Mclasses)
{
	print "\nLEXICON $_\n";
	($rm,$rc)=/M_(.*?)_(.*)/;
	($classes,$invalid)=('','');
	for my $cont(@{$mtx->{$rm}{$rc}})
	{
		$lc="L_${rm}_$cont";
		if(!$Lclasses->{$lc})
		{
			$invalid.="$lc; ";
			next;
		}
		$classes.="$lc;\n";
	}
	print $classes;
	next if $classes;
	die1("No valid continuations for $_:\nNo lex entries for: $invalid\n");
}

die_if_errors();
end_banner();

# --------------------------------------------------------------------------------
# /src/fixrps.pat:
# --------------------------------------------------------------------------------
################################################## START OF LICENSE ##################################################
#
# This file is part of the emMorph / Humor morphological analyzer description for Hungarian.
# Copyright (C) 2001-2016 Attila Novák
#
# The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
# The resource is available from: https://github.com/dlt-rilmta/emMorph
#
# The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0
# (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3)
# with the following amendments:
#
# By downloading/cloning/using this database and tools you accept the following terms:
#
# 1.
Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 
36 | # *** 37 | # 38 | # The Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA) 39 | # license is available at: https://creativecommons.org/licenses/by-nc-sa/4.0/ 40 | # 41 | # Disclaimer of Warranties and Limitation of Liability. 42 | # 43 | # Unless otherwise separately undertaken by the Licensor, to the extent possible, 44 | # the Licensor offers the Licensed Material as-is and as-available, and makes no 45 | # representations or warranties of any kind concerning the Licensed Material, 46 | # whether express, implied, statutory, or other. This includes, without 47 | # limitation, warranties of title, merchantability, fitness for a particular 48 | # purpose, non-infringement, absence of latent or other defects, accuracy, or the 49 | # presence or absence of errors, whether or not known or discoverable. Where 50 | # disclaimers of warranties are not allowed in full or in part, this disclaimer 51 | # may not apply to You. 52 | # 53 | # To the extent possible, in no event will the Licensor be liable to You on any 54 | # legal theory (including, without limitation, negligence) or otherwise for any 55 | # direct, special, indirect, incidental, consequential, punitive, exemplary, or 56 | # other losses, costs, expenses, or damages arising out of this Public License or 57 | # use of the Licensed Material, even if the Licensor has been advised of the 58 | # possibility of such losses, costs, expenses, or damages. Where a limitation of 59 | # liability is not allowed in full or in part, this limitation may not apply to You. 60 | # 61 | ################################################## END OF LICENSE ################################################## 62 | 63 | sel /\+\+.*/ 64 | /!/!:/ 65 | unsel 66 | 67 | sel /(rp:)(.*?);/ 68 | /\.[^&;]+// 69 | /\%// 70 | /!(?!:)/not_/ 71 | /ESS_UlL/ESS_Ul/ 72 | unsel 73 | /(?:no_)?inh:.*?;// 74 | /loc:/rp:loc_/ 75 | /(?) 
73 | { 74 | next unless /$tagname:([^;]+)/o; 75 | $a=$_; 76 | $tag=$1; 77 | for(split(/\+/,$tag)) 78 | { 79 | $sfxtags{$_}++ unless /$stemtags/o; 80 | # $tag=~s/.*\+//; 81 | next if $_=~/$stemtags/o; 82 | next if $mcat{$_}; 83 | $mcat{$_}='I',next unless $a=~/mcat:[^;>]*>(=?)([^;]+)/; 84 | $dercat{$_}=$1.cat2hum($2); 85 | $mcat{$_}="D=$dercat{$_}"; 86 | } 87 | 88 | } 89 | print '$sfxtags=\'(?:'.join('|',sort keys %sfxtags).")(?=\$|[]+])';\n"; 90 | print '%mcat=(\''.join('\',\'',%mcat)."');\n"; 91 | print '%dercat=(\''.join('\',\'',%dercat)."');\n"; 92 | print '$dertags=\'(?:'.join('|',map(quotemeta,sort keys %dercat)).")(?=\$|[]+])';\n"; 93 | print "1;\n"; 94 | -------------------------------------------------------------------------------- /src/rev.srt: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. 
If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # The Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA) 39 | # license is available at: https://creativecommons.org/licenses/by-nc-sa/4.0/ 40 | # 41 | # Disclaimer of Warranties and Limitation of Liability. 
42 | # 43 | # Unless otherwise separately undertaken by the Licensor, to the extent possible, 44 | # the Licensor offers the Licensed Material as-is and as-available, and makes no 45 | # representations or warranties of any kind concerning the Licensed Material, 46 | # whether express, implied, statutory, or other. This includes, without 47 | # limitation, warranties of title, merchantability, fitness for a particular 48 | # purpose, non-infringement, absence of latent or other defects, accuracy, or the 49 | # presence or absence of errors, whether or not known or discoverable. Where 50 | # disclaimers of warranties are not allowed in full or in part, this disclaimer 51 | # may not apply to You. 52 | # 53 | # To the extent possible, in no event will the Licensor be liable to You on any 54 | # legal theory (including, without limitation, negligence) or otherwise for any 55 | # direct, special, indirect, incidental, consequential, punitive, exemplary, or 56 | # other losses, costs, expenses, or damages arising out of this Public License or 57 | # use of the Licensed Material, even if the Licensor has been advised of the 58 | # possibility of such losses, costs, expenses, or damages. Where a limitation of 59 | # liability is not allowed in full or in part, this limitation may not apply to You. 
#
################################################## END OF LICENSE ##################################################

# Sort-key definition.  $pat holds Perl code as text.
# NOTE(review): presumably the sorting tool evaluates it once per input line
# to turn the line in $_ into its sort key - confirm against the consumer.
#
# What the code in $pat does with the current line:
#   1. removes the line terminator;
#   2. takes the text up to the first ';' (after optional leading whitespace
#      and one optional leading ';');
#   3. remembers a trailing "[...]" group of that text, if there is one;
#   4. deletes the listed punctuation characters, every <...> / [...] group,
#      "..." and everything from the first '_' onwards;
#   5. reverses what is left;
#   6. makes a copy in which upper-case letters (including the listed accented
#      ones) are lower-cased and '.' and '-' are removed;
#   7. replaces $_ with: lower-cased copy, reversed text and remembered group,
#      separated by \004 characters.
#$keeptmp=1;
$pat='
chomp;
my($a,$b,$c);
($a)=/^\s*;?([^;]+)/;
($c)=$a=~/(\[[^\]]*\])$/;
$a=~s/[?!#=+%@&^(){}"]|[<[].*?[]>]|\.\.\.|_.*//g;
$a=reverse($a);
$b=$a;
$b=~tr/A-ZÁÉÍÓÚÖÜŐŰ.-/a-záéíóúöüőű/d;
$_="$b\004$a\004$c";
';
# --------------------------------------------------------------------------------
# /pl/generic/scanmeta.pl:
# --------------------------------------------------------------------------------
################################################## START OF LICENSE ##################################################
#
# This file is part of the emMorph / Humor morphological analyzer description for Hungarian.
# Copyright (C) 2001-2016 Attila Novák
#
# The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
# The resource is available from: https://github.com/dlt-rilmta/emMorph
#
# The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0
# (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3)
# with the following amendments:
#
# By downloading/cloning/using this database and tools you accept the following terms:
#
# 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools
# clearly indicating what you use this database or tool for in your application/experiment/resource.
#
# 2. If possible, please publish a scientific paper about each application, experimental system
# or linguistic resource you create or experiment you perform using this resource quoting the articles below,
# and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish.
21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 
#
#
################################################## END OF LICENSE ##################################################

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

use Data::Dumper;# qw(dump);

# Reads a state/transition description from the files named on the command
# line (or from standard input) and prints Perl source that defines $root,
# $final and $transitions.
#
# Input lines handled:
#   # ...                        comment, skipped
#   STATE: [%|$]                 state header; '%' makes STATE the root,
#                                '$' marks it as final
#   CAT -> STATE2 ?{..} ={..}    transition from the most recent STATE to
#                                STATE2 for category CAT, with optional checks
#
# Each check string is read right to left; positions are lettered 'A', 'B', ...
#   .        position is skipped (the letter still advances)
#   + or 1   emits @R.<letter>.+@ for a '?' check, @P.<letter>.+@ for '='
#   - or 0   emits @D.<letter>.+@ for a '?' check, @N.<letter>.+@ for '='
# Any other character is ignored and does not advance the letter.

# Operator letter for set (+/1) and unset (-/0) positions, keyed by check type.
%flop1=qw/? R = P/;
%flop0=qw/? D = N/;

# Emit hash keys in sorted order so that repeated runs on the same input
# produce identical output.
$Data::Dumper::Sortkeys=1;

while(<>)
{
	next if /^\s*#/;
	if(/(\S+?):\s*([\%\$]?)/)
	{
		($st,$f)=($1,$2);
		$root=$st if $f eq '%';
		$final{$st}++ if $f eq '$';
		next;
	}
	if(/(\S+)\s*\->\s*(\S+)/)
	{
		($cat,$st2)=($1,$2);
		$fl='';
		@fl=/([?=])\{(.*?)\}/g;
		while(@fl)
		{
			($op1,$chk)=(shift @fl,shift @fl);
			@chk=reverse split //,$chk;
			$i='A';
			for(@chk)
			{
				# The '-' must be escaped: an unescaped "+-0" inside the
				# class is a range that also matches ',' and '/'.
				next if !/[.+\-01]/;
				$i++,next if /\./;
				$op=/[+1]/?$flop1{$op1}:$flop0{$op1};
				$fl.="\@$op.$i.+\@";
				$i++;
			}
		}
		# The hash collapses repeated identical transitions.
		$transitions->{$cat}{"$st\->$st2;$fl"}++;
	}
}
# Replace each per-category set by a list, sorted for reproducible output.
for(keys %$transitions)
{
	$transitions->{$_}=[sort keys %{$transitions->{$_}}];
}

print "\$root='$root';\n";
#$a=dump({%final});
#$a=~s/ {8,}/ /g;
#print "\n\$final=\n$a;\n";
$a=Data::Dumper->Dumpxs([{%final}],['final']);
print "\n$a\n";

#$a=dump($transitions);
#$a=~s/ {8,}/ /g;
#print "\n\$transitions=\n$a;\n";

$a=Data::Dumper->Dumpxs([$transitions],['transitions']);
print "\n$a\n";

# --------------------------------------------------------------------------------
# /pl/generic/newproplst.pl:
# --------------------------------------------------------------------------------
################################################## START OF LICENSE ##################################################
#
# This file is part of the emMorph / Humor morphological analyzer description for Hungarian.
4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] 
# In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145
#
# 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses.
#
# 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author.
# ***
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
################################################## END OF LICENSE ##################################################

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

#program to list properties in a .propset file not yet listed in the
#proplst.hpl property-encoding definition file

# Toolkit root: a preset $hpldir wins, then the environment variable of the
# same name, then the relative default.
BEGIN{
$hpldir=$ENV{'hpldir'} if !$hpldir;
$hpldir='../..' if !$hpldir;
}
use lib "$hpldir/pl/generic";
use lib "$hpldir/src";

require 'm2getopt.pl';
$proplst='proplst.hpl' if !$proplst;

require 'dumpsh.pl';
# NOTE(review): $Gprops and $Gcmp_prop are not defined in this script;
# presumably the property list required here provides them - confirm.
require $proplst;

#$/='';

#produce property list (part of proplst.hpl) from the propsets file
#
# Every input line of the form  <l|r>,<properties>[,<requirements>];  is
# scanned.  Names in <properties> are recorded for the side given on the line,
# names in <requirements> for the opposite side.  A name is new when it is in
# none of $Gprops, $Gnprops and $Gcmp_prop.
while(<>)
{
	$l=$_;
	($lr,$prp,$req)=(/^([lr]),(.*?)(?:,(.*?))?;/);
	next unless defined $lr;	# not a property-set line
	my $other=$lr eq 'r'?'l':'r';
	for my $pair([$lr,$prp],[$other,$req])
	{
		my($side,$expr)=@$pair;
		next unless defined $expr;	# line without a requirements part
		for(split(/[&,!()|]+/,$expr))
		{
			next if $_ eq '' || $Gprops->{$_} || $Gnprops->{$_} || $Gcmp_prop->{$_};
			# warn is called with parentheses on purpose: without them the
			# assignment below becomes part of warn's argument list and the
			# message ends in "ARRAY(0x...)".
			warn("New property: \"$_\" found in $ARGV:\n$l\n");
			$Gnprops->{$_}=[$side,'','',''];
		}
	}
}
# Print the new entries as a ready-to-paste $Gprops fragment, ordered by side
# and then by property name, and finish with a failure status.
if(keys %$Gnprops)
{
	$a=(dumpsh([$Gnprops],['Gprops']));
#	print $a;
	$"=",\n";
	@a=split(/,?\n/,$a);
	# drop the opening and closing lines of the dump, keep the entries
	shift(@a);
	pop(@a);
	@a=sort {$a=~/'(.*?)'.*?\['([rl])'/;$aa=$2.$1;$b=~/'(.*?)'.*?\['([rl])'/;$bb=$2.$1;$aa cmp $bb}(@a);
	print "\$Gprops = {\n@a,\n};\n";
	die "New properties found.";
}
# --------------------------------------------------------------------------------
# /mak/xlx2lglexc.make:
# --------------------------------------------------------------------------------
################################################## START OF LICENSE ##################################################
#
# This file is part of the emMorph / Humor morphological analyzer description for Hungarian.
# Copyright (C) 2001-2016 Attila Novák
#
# The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
# Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses.
#
# 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author.
# ***
#
# The Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA)
# license is available at: https://creativecommons.org/licenses/by-nc-sa/4.0/
#
# Disclaimer of Warranties and Limitation of Liability.
#
# Unless otherwise separately undertaken by the Licensor, to the extent possible,
# the Licensor offers the Licensed Material as-is and as-available, and makes no
# representations or warranties of any kind concerning the Licensed Material,
# whether express, implied, statutory, or other. This includes, without
# limitation, warranties of title, merchantability, fitness for a particular
# purpose, non-infringement, absence of latent or other defects, accuracy, or the
# presence or absence of errors, whether or not known or discoverable. Where
# disclaimers of warranties are not allowed in full or in part, this disclaimer
# may not apply to You.
#
# To the extent possible, in no event will the Licensor be liable to You on any
# legal theory (including, without limitation, negligence) or otherwise for any
# direct, special, indirect, incidental, consequential, punitive, exemplary, or
# other losses, costs, expenses, or damages arising out of this Public License or
# use of the Licensed Material, even if the Licensor has been advised of the
# possibility of such losses, costs, expenses, or damages. Where a limitation of
# liability is not allowed in full or in part, this limitation may not apply to You.
#
################################################## END OF LICENSE ##################################################

# Builds $(F)lg.lexc from $(F).xlx in three steps:
#   $(F).xlx   --greplace.pl + entfix.pat, lcase.pl-->  $(F)f.xlx
#   $(F)f.xlx  --hum2lgrh.pl -listtags-->               $(F)tags
#   $(F)tags + $(F)f.xlx  --convtags.pl-->              $(F)lg.lexc
# NOTE(review): hum2lgrh.pl and greplace.pl are prerequisites here but do not
# appear in the pl/generic listing of this tree - confirm they are provided.

ifndef ROOT
ROOT := ..
endif

PL := $(ROOT)/pl/generic
GENDIR := $(ROOT)/gen
LEXC := $(ROOT)/lexc
# F comes in as a bare base name (xlx2lglexc.sh exports F=huX) and is turned
# into a path prefix inside $(LEXC).
F := $(LEXC)/$(F)

# 'do' is a goal name, not a file: without this a stray file called "do" in
# the working directory would make the build a silent no-op.
.PHONY: do

# The recipes write their targets through shell redirection, so a failing
# command leaves a truncated file that looks up to date on the next run.
# Have make remove such a target instead.
.DELETE_ON_ERROR:

do: $(F)lg.lexc

$(F)lg.lexc: $(PL)/convtags.pl $(F)tags $(F)f.xlx
	perl $(PL)/convtags.pl $(F)tags $(F)f.xlx >$@

$(F)tags: $(PL)/hum2lgrh.pl $(F)f.xlx
	perl $(PL)/hum2lgrh.pl -=$@ -listtags $(F)f.xlx

$(F)f.xlx: $(PL)/greplace.pl $(PL)/entfix.pat $(F).xlx $(PL)/lcase.pl
	perl $(PL)/greplace.pl -all $(PL)/entfix.pat $(F).xlx | perl $(PL)/lcase.pl >$@

# --------------------------------------------------------------------------------
# /pl/generic/normform.pl:
# --------------------------------------------------------------------------------
################################################## START OF LICENSE ##################################################
#
# This file is part of the emMorph / Humor morphological analyzer description for Hungarian.
# Copyright (C) 2001-2016 Attila Novák
#
# The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
# The resource is available from: https://github.com/dlt-rilmta/emMorph
#
# The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0
# (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3)
# with the following amendments:
#
# By downloading/cloning/using this database and tools you accept the following terms:
#
# 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools
# clearly indicating what you use this database or tool for in your application/experiment/resource.
#
# 2.
If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 |

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

use Storable;

#normalize allomorph structure
#
# norm_allomf($m)
#   $m - hash reference describing one allomorph; it is modified in place.
# Every value whose key does not match /allomf/ is replaced by its normform().
# Values of the "global" keys (gp, gr, glp, glr, grp, grr) additionally get
# their operator characters |&()! transliterated to #%<>~ and are appended,
# as "_<key>_<value>", to the 'rp' value (joined with '&').
sub norm_allomf
{
    my $m=shift;
    my ($gl,$k);
    # Walk the keys in sorted order: the pieces collected in $gl are
    # concatenated in iteration order, and plain hash order differs from run
    # to run, which would yield different 'rp' strings for the same input.
    for $k(sort keys %$m)
    {
        next if $k=~/allomf/;
        $m->{$k}=normform($m->{$k});
        #encode global properties/requirements as a single
        #right property ( :( )
        if($k=~/^g[lr]?[pr]$/)
        {
            $m->{$k}=~tr/|&()!/#%<>~/;
            $gl.="_${k}_$m->{$k}";
        }
    }
    $m->{'rp'}.="&$gl" if defined $gl;
    # no leading '&' when 'rp' was empty before the global part was appended
    $m->{'rp'}=~s/^&//;
}

# Cache of already normalized formulas (raw string => normal form).
# It is preloaded from normfrm.tmp in the current directory when that file is
# readable; a failing retrieve() is deliberately ignored (best effort).
my $normfrm;
eval{$normfrm=retrieve('normfrm.tmp') if -r 'normfrm.tmp'};

# normform($formula) - return the normal form of a property formula.
#   * leading/trailing '&' and spaces are dropped, remaining runs of spaces
#     become a single '&';
#   * the formula is cut at runs of parentheses; inside each parenthesis-free
#     chunk the terms are sorted (ignoring the first '!' of a term) and joined
#     with the first operator (& or |) found in that chunk;
#   * terms repeated with '&' are collapsed to one occurrence.
# The result is stored in the cache under the unmodified input string and
# returned.
sub normform
{
    my $mm=shift;
    return $normfrm->{$mm} if $normfrm->{$mm};
    my($i,$op,@t,$r,$aa,$bb,$m);

    $m=$mm;
    $m=~s/^[& ]+|[& ]+$//g;
    $m=~tr/ /&/s;#;#;/
    # even indices of @t: chunks without parentheses, odd indices: the
    # parenthesis runs that separated them
    @t=split(/([()]+)/,$m);
    for($i=0;$i<=$#t;$i+=2)
    {
        ($op)=$t[$i]=~/([\&|])/;
        $m=join($op,sort({$aa=$a;$bb=$b;$aa=~s/!//;$bb=~s/!//;$aa cmp $bb} split(/[\&|]/,$t[$i])));
        $m=~s/(^|[\&|])([^\&|]+)(?:\&\2)+(?=$|[\&|])/$1$2/g;
        $r.=$m;
        # split() dropped the operator that stood right before an opening
        # parenthesis; put it back
        $r.=$op if $t[$i+1]=~/^\(/;
        $r.=$t[$i+1];
    }
    return $normfrm->{$mm}=$r;
}

# savenormfrm() - write the cache to normfrm.tmp (only if it holds anything).
sub savenormfrm
{
    store ($normfrm,'normfrm.tmp') if defined $normfrm;
}

#$/='';
#
#while(<>)
#{
# for(split(/\n/))
# {
# s/req=>(.*?)\}/'req=>'.normform(' '.$1).'}'/e;
# print "$_\n";
# }
# print "\n";
#}

1;
# -------------------------------------------------------------------------------- /mak/hu-hfst.make: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE
################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 
1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # The Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA) 39 | # license is available at: https://creativecommons.org/licenses/by-nc-sa/4.0/ 40 | # 41 | # Disclaimer of Warranties and Limitation of Liability. 42 | # 43 | # Unless otherwise separately undertaken by the Licensor, to the extent possible, 44 | # the Licensor offers the Licensed Material as-is and as-available, and makes no 45 | # representations or warranties of any kind concerning the Licensed Material, 46 | # whether express, implied, statutory, or other. This includes, without 47 | # limitation, warranties of title, merchantability, fitness for a particular 48 | # purpose, non-infringement, absence of latent or other defects, accuracy, or the 49 | # presence or absence of errors, whether or not known or discoverable. Where 50 | # disclaimers of warranties are not allowed in full or in part, this disclaimer 51 | # may not apply to You. 
# 52 | # 53 | # To the extent possible, in no event will the Licensor be liable to You on any 54 | # legal theory (including, without limitation, negligence) or otherwise for any 55 | # direct, special, indirect, incidental, consequential, punitive, exemplary, or 56 | # other losses, costs, expenses, or damages arising out of this Public License or 57 | # use of the Licensed Material, even if the Licensor has been advised of the 58 | # possibility of such losses, costs, expenses, or damages. Where a limitation of 59 | # liability is not allowed in full or in part, this limitation may not apply to You. 60 | # 61 | ################################################## END OF LICENSE ################################################## 62 |

# Builds the transducer files hu.hfstol / hu.hfst from huXlg.lexc and
# casenormhuX.xfs. All files are read from and written to the directory
# make is started in.

# Several recipes below create their target through shell redirection or as a
# side effect of a tool run; if such a step fails, remove the half-written
# target so that the next run rebuilds it instead of treating it as current.
.DELETE_ON_ERROR:

# Default goal: the two .hfstol files concatenated into one.
hu.hfstol: casenormhu.hfstol huXlgNoSt.hfstol
	cat casenormhu.hfstol huXlgNoSt.hfstol >hu.hfstol

# .hfst -> .hfstol conversion (presumably -O selects the optimized-lookup
# format -- confirm against the hfst-fst2fst documentation).
casenormhu.hfstol: casenormhu.hfst
	hfst-fst2fst -O -o casenormhu.hfstol casenormhu.hfst

huXlgNoSt.hfstol: huXlgNoSt.hfst
	hfst-fst2fst -O -o huXlgNoSt.hfstol huXlgNoSt.hfst

# Same concatenation for the .hfst files; not needed by the default goal.
hu.hfst: casenormhu.hfst huXlgNoSt.hfst
	cat casenormhu.hfst huXlgNoSt.hfst >hu.hfst

# The recipe names no output file: casenormhu.hfst is expected to be written
# by the casenormhuX.xfs script itself -- TODO confirm.
casenormhu.hfst: casenormhuX.xfs
	echo "" | hfst-xfst -F casenormhuX.xfs

# Load huXlg.hfst, eliminate the flag named St and save the result.
huXlgNoSt.hfst: huXlg.hfst
	echo "" | hfst-xfst -e "load huXlg.hfst" -e "eliminate flag St" -e "ss huXlgNoSt.hfst" -e "exit"

# Compile the lexc source, invert the result and save it.
huXlg.hfst: huXlg.lexc
	echo "" | hfst-xfst -e "read lexc huXlg.lexc" -e "invert" -e "ss huXlg.hfst" -e "exit"

# -------------------------------------------------------------------------------- /pl/generic/mtx2hash.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian.
4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] 
# In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 |

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

BEGIN{
    $hpldir=$ENV{'hpldir'} if !$hpldir;
    $hpldir='../..' if !$hpldir;
}

use lib "$hpldir/pl/generic";

require 'm2getopt.pl';
require 'banner.pl';
require 'diewarn.pl';
$store=1;
require 'dumpdata.pl';

start_banner('Matrices to hash converter');
$stm=time;

# printmsg($text) - progress message on MSG (a duplicate of STDERR), prefixed
# with the number of seconds elapsed since start-up.
sub printmsg
{
    print MSG time-$stm,": ",shift,"\n";
}

open(MSG,'>&STDERR');
select(MSG);$|=1;select(STDOUT);

#transpose matrix
#
# transpose(\@rows): every row is a string of cells separated by runs of
# spaces. The array is replaced in place by its transposition (each new row is
# the cells of one column, every cell followed by a single space); the new rows
# are also returned as a list.
sub transpose
{
    my($me)=shift;
    my(@met,$i,$j);
    printmsg("transposing matrix...");
    for($i=0;$i<=$#$me;$i++)
    {
        $j=0;
        for(split / +/,$me->[$i])
        {
            unless($i)
            {
                $met[$j]=$_.' ';
            }
            else
            {
                $met[$j].=$_.' ';
            }
            $j++;
        }
    }
    return @$me=@met;
}

# mtx2hash(\@lines) - turn the lines of one matrix file into a flat
# (code => [names...]) list, ready to be wrapped in a hash constructor.
# The lines are chomped in place and transposed. The first transposed row
# supplies the names (its first field is discarded and a leading '#' on a name
# is dropped). For every other row the first field, minus its first character,
# is the code, and the list holds the names of all positions marked with '*'.
# A row without any '*' is reported with warn().
# Note: $_ of the caller is overwritten.
sub mtx2hash
{
    my @mtx=map{chomp;$_}(@{$_[0]});
    @mtx=transpose(\@mtx);
    $_=shift @mtx;
    printmsg("converting matrix...");
    my @head=split /\s+#?/;
    shift @head;
    my @idx=(0..$#head);
    my(@row,$code);

    map
    {
        @row=split/\s+/,$_;
        $code=shift @row;
        $code=substr($code,1);
        @row=map{$row[$_] eq '*'?$head[$_]:()}@idx;
        warn("Code $_ unjoinable\n") if !@row;
        $code,[@row];
    }@mtx;
}

# Main: every command line argument is a matrix file whose name contains
# "_<name>.txt"; <name> becomes the key of that matrix in $mtx.
my(@a,$mtxname);
for(@ARGV)
{
    open(I,'<',$_) or die "Matrix file $_ missing.\n";
    ($mtxname)=/_([^_]*?)\.txt/i;
    # read the whole file; without this the converter would always be fed an
    # empty matrix
    @a=<I>;
    printmsg("processing matrix $mtxname...");
    $mtx->{$mtxname}={mtx2hash(\@a)};
    close I;
}

printmsg("generating output...");
$store=1;
print dumpdata($mtx,'mtx',$OutFileName);
#$a=dump($mtx);
#$a=~s/ {8,}/ /g;
#$a=~s/(\d,|\[)\n/$1/g;
#print "\n\$mtx=\n$a;\n";

die_if_errors();
end_banner();
# -------------------------------------------------------------------------------- /pl/generic/metalex.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
# Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 |

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

BEGIN{
    $hpldir=$ENV{'hpldir'} if !$hpldir;
    $hpldir='../..' if !$hpldir;
}

use lib "$hpldir/pl/generic";
use lib "$hpldir/src";
use lib "$hpldir/gen";

#use Data::Dump qw(dump);

require 'm2getopt.pl';

require 'diewarn.pl';
require 'banner.pl';

start_banner('X word grammar lexicon generator');

# Defaults for the two data files; $encoding, $trans, $gen, $guess and $root
# are presumably set from the command line by m2getopt.pl -- TODO confirm.
$encoding="encoding$gen.hpl" unless $encoding;
$trans="metadict.txt.trans.hpl" unless $trans;

# NOTE(review): $Rclasses, $transitions, $final, $Gpropset and
# $startcond_propset are not defined in this script; they are expected to come
# from the three files required here -- verify which file provides which.
require "$encoding.morphclasses.hpl";
require $trans;
require $encoding;

# getencoding($pr) - numeric class id for property set $pr.
# Returns 0 for an empty $pr, -1 when element 6 of $Gpropset->{$pr} is true,
# otherwise element 0 of that entry. Reports an error through die1() when
# element 0 is false. The result is also left in the global $mc.
sub getencoding
{
    my($pr)=$_[0];
    if($pr)
    {
        $mc=$Gpropset->{$pr}[6]?-1:$Gpropset->{$pr}[0];#mark matrix-unjoinable morphs with #-1
        die1("Encoding not found for: $pr ($ssrf)\n") if !$Gpropset->{$pr}[0];
    }
    else
    {
        $mc=0; # default id is #0; it matches anything
    }
    $mc;
}

#generate root lexicon with start conditions defined in encoding

die "\$startcond_propset not defined in $encoding\n" if !defined $startcond_propset;

$rc=getencoding($startcond_propset);
@rm=@{$Gpropset->{$startcond_propset}[3]};#right matrix

# Root lexicon: a single entry that sets the St flag to $root and continues in
# the lexicon named after the first right matrix and the start class id.
# With $guess set the entry is written in angle brackets and starts with "?* ".
if($guess)
{
    print "\nLEXICON Root\n";
    print "<";
    print "?* " if $guess;
    print "%\@U%.St%.$root%\@> M_$rm[0]_$rc;\n";
}
else
{
    print "\nLEXICON Root\n\@U.St.$root\@ M_$rm[0]_$rc;\n";
}

# One lexicon per key of $Rclasses. A key has the form "R_(<wcat>)<rest>";
# every transition "<s1>-><s2>;<cond>" listed for <wcat> becomes an entry that
# requires St=<s1>, sets St=<s2> and continues in lexicon "M<rest>".
for(sort keys %$Rclasses)
{
    ($cc=$_)=~s/([\s!\%;"<>])/\%$1/go;#escape special characters in lexicon name
    print "\nLEXICON $cc\n";
    ($wcat,$rmrc)=/R_\((.*?)\)(_.*)/;
    for(@{$transitions->{$wcat}})
    {
# warn "$_\n";
        ($s1,$s2,$cond)=/^(.*?)->(.*?);(.*)/;
# warn "$s1\t$s2\t$cond\n";
        print "\@U.St.$s1\@\@P.St.$s2\@${cond} M$rmrc;\n";
# print "\@U.St.$s1\@\@P.St.$s2\@${cond}+:\@U.St.$s1\@\@P.St.$s2\@${cond}0 M$rmrc;\n";
        #for final states: add a transition to #
        print "\@U.St.$s1\@\@P.St.$s2\@${cond} #;\n" if $final->{$s2};
    }
    #to make empty lexicons OK
    if(!@{$transitions->{$wcat}})
    {
        print "\@D.St\@ #;\n";
    }
}

die_if_errors();
end_banner();

# -------------------------------------------------------------------------------- /pl/mkavs/sfxlex1.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish.
21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 
# 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 |

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

#generate Hungarian level 1 suffix lexicon file sfx.1 from sfx.txt (Excel file)
BEGIN{
    $hpldir=$ENV{'hpldir'} if !$hpldir;
    $hpldir='../..' if !$hpldir;
}

use lib "$hpldir/pl/generic";
use lib "$hpldir/src";

require 'banner.pl';
start_banner('Level 1 suffix lexicon generator');

# index of the gloss field in a row after its fields have been rearranged
$glossfld=25;

# Pass 1: skip comment, empty and format lines; split every other line on
# tabs, move field 2 to position 20, and stop at the first such line whose
# first field is empty. Its fields stay in @attr and are used below as
# per-column attribute prefixes.
while(<>)
{
    chomp;
    next if /^#|^\t{19}|^$|^\@ME.FORMAT/;
    @attr=split /\t/;
    splice @attr,20,0,splice(@attr,2,1);
    last if $attr[0] eq '';
}

# Pass 2: the remaining lines. One output line is printed per allomorph form
# of every suffix row.
while(<>)
{
    # heading lines (not starting with a tab) switch the current suffix type
    # (deriv/infl) and category (Nom/V)
    $type=$1 if /^[^\t].*(deriv|infl)/;
    $cat=$2.$3 if /^[^\t]*((Nom)inal|(V)erbal)/;
    next if /^#|^\t{21}|^$|^\@ME.FORMAT/;
    # lines without a "non-blank followed by tab" pair are echoed as comments
    # NOTE(review): the two conditions are combined with the bitwise '|' --
    # presumably a logical 'or' was intended; confirm before changing.
    s/\t+//,print("#$_"),next if /\t{21}/|!/\S\t/;
    chomp;
    @a=split /\t/;
    # NOTE(review): the comment below mentions $a[20] while $glossfld is 25
    $a[$glossfld]='' unless $a[$glossfld]; #just to make sure that $a[20] exists (splice needs this)
    splice @a,20,0,splice @a,2,1;
    # the row whose first field is 'props' supplies the column labels
    $a[0]='',@lbl=@a,$l=$#lbl,next if $a[0] eq 'props';
    # field 2 lists the forms separated by '/'; when it starts with '+', @f is
    # not reassigned and the rest of the field is kept in $c, which is appended
    # to every form further down.
    # NOTE(review): in that case @f still holds the forms of the previous row --
    # confirm that this reuse is intended.
    @f=split m#/#,$a[2] if $a[2]!~/^\+/;
    @f=($a[2]) if $#f<0;#to save 0 morphs
    $c='';
    $c=$a[2],$c=~s/^\+// if $a[2]=~/^\+/;
    next if $a[1]=~/\.\./;
    # 'L' marker put before / after the form when field 6 / field 14 starts
    # with '+'
    $fvl='';
    $low='';
    $fvl="L" if $a[6]=~/^\+/;# && $a[2]!~/^\+/;
    $low="L" if $a[14]=~/^\+/;
# $a[2]="L$a[2]" if $a[6]=~/^\+/ && $a[2]!~/^\+/;
# $a[2].="L" if $a[13]=~/^\+/;
    $a[$glossfld]="#$a[$glossfld]";
    print ("#$a[0]\n"),$a[0]='' if $a[0];
    $f='';
    # fields 1 and 20: a value that is empty or ends in ':' is filled in from
    # field 1 or from the value remembered from the previous row
    $a[20]=$a[1] if $a[20]=~/:$|^$/&&$a[1]!~/:$|^$/;
    $a[1]=$tag if $a[1]=~/:$|^$/;
    $a[20]=$hum if $a[20]=~/:$|^$/;
    $tag=$a[1];
    $hum=$a[20];
    for $b(@f)
    {
        @b=@a;
        for($i=1;$i<=$l;$i++)
        {
            $b[$i]='' if $b[$i]=~/^%/; # % marks a feature commented out
            # a value of 'A' or 'A+' becomes '+' for forms starting with A or U
            # and '-' for all other forms
            $b[$i]='+' if $b[$i]=~/^A\+?$/ && $b=~/^[AU]/;
            $b[$i]='-' if $b[$i]=~/^A\+?$/ && $b!~/^[AU]/;
            # prefix the value with its column label ...
            $b[$i]="$lbl[$i]_$b[$i]" if $b[$i]!~/[\*-]|^\s*$/ && $lbl[$i]!~/^(\s*|\*)$/;
            # ... and with the attribute name of the column, if there is one
            do
            {
                $b[$i]="$attr[$i]:$b[$i]";
                $b[$i]="props_$b[$i]" if $lbl[$i]!~/^\s*$/
            } if $attr[$i]!~/^\s*$/;
            $b[$i]=~s/_\+//g;
            # drop values still containing '-' or '*' in labelled columns
            $b[$i]=~s/(^|:)[^:]*[-*][^:]*$/$1/g if $lbl[$i]!~/^\*?$/;
        }
        # forms get an 'F' prefix unless field 4 mentions VH
        $b="F$b" if $b[4]!~/VH/;#$b~!/AÁOÓUÚV/;
        if(!$c)
        {
            $b[2]=$b;
        }
        else
        {
            $b[2]=$b.$c;
        }
        $b[2]="phon:$fvl$b[2]$low";
        # From here on the whole entry is a single string in $_; the order of
        # the substitutions below matters.
        $_=join ',',@b;
        s/(lp:.*)VH_([BF])/lr:VH$2,$1/;
        while(s/(lp:.*?)(VH(_[^,]+)?|RH)/$1/){};
        s/,+/,/g;
        s/^,+//;
        s/\n(.+)/$1\n/;
        s/,(?=[^:,]+:)/;/g;
        s/;(?!phon:)[^:,;]+:(?=;|,#)//g;
        s/,#/;#/g;
        s/:,/:/g;
        s/props_rp:.*?;// if $type eq 'infl';
# $f='phon:d;' if s/Ad\/d;/Ad;/;
        $f="phon:$1Ál;" if s/([^:]+)\/\+Ál/$1/;
        #derivational suffixes (mcat:lcat>rcat)
        #have lr:cat_lcat category requirement (unless lcat=*) and
        #rp:cat_rcat right category
        $lcat=$cat;
        $lcat=$1 if /mcat:.*?([^" ;:,\]]+)\]?>/;
        ($rcat)=/mcat:.*?>=?([^" ;:,]+)/;
        $frt="type:$type;props_rp:mcat_$type;";
        $frt.="props_lr:cat_$lcat inflable;" if $lcat ne '*';
        $frt.="props_rp:cat_$rcat;" if $rcat;
        print "$frt$_\n";
        # second entry with the alternative phon value kept in $f
        s/phon:.*?;/$f/,print "$frt$_\n" if $f=~/phon:[jsz]/
    }
# s/phon:.*?;/$f/,print "$frt$_\n" if $f eq 'phon:d;';
}
end_banner();
# -------------------------------------------------------------------------------- /pl/generic/set.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian.
4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] 
# In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 |

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

BEGIN{
    $hpldir=$ENV{'hpldir'} if !$hpldir;
    $hpldir='../..' if !$hpldir;
}
use lib "$hpldir/pl/generic";
#require 'umlaut.pl';

# Set helpers. A set is passed as a reference to an array of strings; the
# arguments are never modified. The order of elements in returned arrays is
# not specified.

# union(\@a,\@b) - reference to an array with every element that occurs in @a
# or @b, each listed once.
sub union
{
    my ($a,$b)=@_;
    my (%a);
    @a{@$a}=();
    @a{@$b}=();
    return [keys(%a)];
}

# intersect(\@a,\@b) - reference to an array with every element that occurs in
# both @a and @b, each listed once, also when an input contains duplicates.
sub intersect
{
    my ($a,$b)=@_;
    my (%in_a,%seen);
    @in_a{@$a}=();
    return [grep {exists($in_a{$_}) && !$seen{$_}++} @$b];
}

# issubset(\@a,\@b) - true (1) if every element of @a occurs in @b; an empty
# @a is a subset of anything.
sub issubset
{
    my ($a,$b)=@_;
    my (%b);
    @b{@$b}=();
    for(@$a){return !1 unless exists($b{$_})};
    return 1;
}

# equal1(\@a,\@b) - true if both arrays hold the same elements the same number
# of times, in any order. Elements are compared one by one, so the result does
# not depend on any separator string being absent from the data.
sub equal1
{
    my ($a,$b)=@_;
    return !1 if @$a!=@$b;
    my @x=sort(@$a);
    my @y=sort(@$b);
    for my $i (0..$#x){return !1 if $x[$i] ne $y[$i]};
    return 1;
}

# equal2(\@a,\@b) - true if both arrays contain the same set of elements;
# duplicates and order are ignored. Membership is tested directly: the order in
# which keys() lists two hashes with equal keys need not be the same.
sub equal2
{
    my ($a,$b)=@_;
    my (%a,%b);
    @a{@$a}=();
    @b{@$b}=();
    return keys(%a)==keys(%b) && !grep {!exists($b{$_})} keys(%a);
}

# 107 | =cmt 108 | sub mergecatcomps 109 | { 110 | my ($a,$b)=@_; 111 | my (@a,@b,%a); 112 | my ($x,$y,$z,$ay,$bx); 113 | # @a=split(/\+/,$a); 114 | # @b=split(/\+/,$b); 115 | ($x)=$a=~s/^(.*(?:\+|$))//; 116 | ($y)=$b=~s/^(.*(?:\+|$))//; 117 | while($x||$y) 118 | { 119 | if($x eq $y) 120 | { 121 | $z.=$x; 122 | ($x)=$a=~s/^(.*(?:\+|$))//; 123 | ($y)=$b=~s/^(.*(?:\+|$))//; 124 | } 125 | else 126 | { 127 | ($bx)=$b=~/(.*?\+?)$x/; 128 | ($ay)=$a=~/(.*?\+?)$y/; 129 | if(!defined $bx) 130 | { 131 | $z.=$x; 132 | ($x)=$a=~s/^(.*(?:\+|$))//; 133 | } 134 | elsif(!defined $ay) 135 | { 136 | $z.=$y; 137 | ($y)=$b=~s/^(.*(?:\+|$))//; 138 | } 139 | elsif(length($bx)$(LEXC)/hu$(S).xlx 90 | 91 | $(LEXC): 92 | mkdir $(LEXC) 93 | 94 | $(GENDIR)/multich$(X).xlx: $(PL)/multich.pl $(GENDIR)/mrf$(S).xlx $(GENDIR)/mtx$(X).xlx $(GENDIR)/meta$(X).xlx 95 | perl $(PL)/multich.pl $(GENDIR)/mrf$(S).xlx $(GENDIR)/mtx$(X).xlx $(GENDIR)/meta$(X).xlx >$(GENDIR)/multich$(X).xlx 96 | 97 | $(GENDIR)/mrf$(S).xlx: $(PL)/lx3lex.pl $(PL)/m2getopt.pl $(PL)/banner.pl $(GENDIR)/mrf$(S).s 98 | perl $(PL)/lx3lex.pl -=$(GENDIR)/mrf$(S).xlx
$(GENDIR)/mrf$(S).s 99 | 100 | $(GENDIR)/mtx$(X).xlx: $(PL)/mtxlex.pl $(PL)/m2getopt.pl $(PL)/diewarn.pl $(PL)/banner.pl $(GENDIR)/encoding$(X)bit2mtx.hpl $(GENDIR)/encoding$(X)bit2mtx.hpl.mtxcont.hpl $(GENDIR)/encoding$(X)bit2mtx.hpl.morphclasses.hpl 101 | perl $(PL)/mtxlex.pl -encoding=$(GENDIR)/encoding$(X)bit2mtx.hpl -=$(GENDIR)/mtx$(X).xlx 102 | 103 | $(GENDIR)/meta$(X).xlx: $(PL)/metalex.pl $(PL)/m2getopt.pl $(PL)/diewarn.pl $(PL)/banner.pl $(GENDIR)/encoding$(X)bit2mtx.hpl $(GENDIR)/metadict$(GUESS).txt.trans.hpl 104 | perl $(PL)/metalex.pl $(SRFONLY) -guess=$(GUESS) -encoding=$(GENDIR)/encoding$(X)bit2mtx.hpl -trans=$(GENDIR)/metadict$(GUESS).txt.trans.hpl -=$(GENDIR)/meta$(X).xlx 105 | 106 | $(GENDIR)/mrf$(S).s: $(GENDIR)/mrf$(S).lx3 $(PL)/bsort.pl 107 | perl $(PL)/bsort.pl -uniq $(GENDIR)/mrf$(S).lx3 >$(GENDIR)/mrf$(S).s 108 | 109 | $(GENDIR)/mrf$(S).lx3 $(GENDIR)/encoding$(X)bit2mtx.hpl.morphclasses.hpl: $(PL)/morphlex.pl $(PL)/m2getopt.pl $(PL)/diewarn.pl $(PL)/banner.pl $(GENDIR)/encoding$(X)bit2mtx.hpl $(GENDIR)/sfxgen.lx2 $(BASELEX) $(LEX) $(PL)/banner.pl 110 | perl $(PL)/morphlex.pl $(EXCL) $(SRFONLY) -encoding=$(GENDIR)/encoding$(X)bit2mtx.hpl -=$(GENDIR)/mrf$(S).lx3 $(GENDIR)/sfxgen.lx2 $(BASELEX) $(LEX) 111 | 112 | $(GENDIR)/encoding$(X)bit2mtx.hpl.mtxcont.hpl: $(PL)/mtx2hash.pl $(PL)/m2getopt.pl $(PL)/banner.pl $(PL)/diewarn.pl $(PL)/dumpdata.pl $(HUM)/mtx$(X)_n.txt $(HUM)/mtx$(X)_v.txt 113 | perl $(PL)/mtx2hash.pl -=$(GENDIR)/encoding$(X)bit2mtx.hpl.mtxcont.hpl $(HUM)/mtx$(X)_n.txt $(HUM)/mtx$(X)_v.txt 114 | 115 | $(GENDIR)/metadict$(GUESS).txt.trans.hpl: $(PL)/scanmeta.pl $(HUM)/metadict$(GUESS).txt 116 | perl $(PL)/scanmeta.pl $(HUM)/metadict$(GUESS).txt >$(GENDIR)/metadict$(GUESS).txt.trans.hpl 117 | 118 | -------------------------------------------------------------------------------- /pl/generic/sort.pl: -------------------------------------------------------------------------------- 1 | ################################################## 
START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 
1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | BEGIN{ 56 | $bsort=$ENV{'bsort'} if !$bsort; 57 | $bsort.='/' unless !$bsort||$bsort=~/(^|\/)$/; 58 | $bsort=$ENV{'PL'} if !$bsort; 59 | $bsort.='/' unless !$bsort||$bsort=~/(^|\/)$/; 60 | $hpldir=$ENV{'hpldir'} if !$hpldir; 61 | $hpldir='../..' 
if !$hpldir; 62 | } 63 | 64 | use lib "$hpldir/pl/generic"; 65 | use lib "$hpldir/src"; 66 | 67 | require 'banner.pl'; 68 | start_banner('Regex pattern driven sort program'); 69 | 70 | while ( $_=shift(@ARGV) ) 71 | { 72 | print SDTERR $_; 73 | if(/^-([^-])/) 74 | { 75 | if($1 eq '=') 76 | { 77 | $OutFileName=$'; 78 | s//>/; 79 | die "unable to open $_ for writing" unless open (GO_OUT,$_); 80 | print STDERR "Output to $_\n"; 81 | select(GO_OUT); 82 | } 83 | else 84 | { 85 | s//\$$1/; 86 | $_.='=1;' if($_!~/=/); 87 | s/=(.*)/='$1'/ if($_!~/=[0-9]/); 88 | print STDERR "$_\n"; 89 | eval $_; 90 | } 91 | } 92 | elsif($_) 93 | { 94 | unshift(@ARGV,$_); 95 | last; 96 | } 97 | else 98 | { 99 | last; 100 | } 101 | } 102 | # print STDERR "@ARGV\n"; 103 | # Option loop above: '-=FILE' opens FILE for writing and select()s it; any other '-name[=value]' is rewritten into a '$name=...' assignment and eval'ed; the first non-option argument is pushed back and ends the loop. NOTE(review): 'print SDTERR $_' at the top of the loop names a handle that is never opened in this file -- presumably a typo for STDERR; confirm before changing, since fixing it would start echoing every argument. 104 | #$stime0=time; 105 | if($#ARGV<0&&!$nohelp) 106 | { 107 | print STDERR <outfile 111 | scriptfile 'script' is eval'd before sorting 112 | The options are: 113 | -pat="s///;" pattern to sort by (expression 'eval'ed on each input line) 114 | -\$/='' set input record separator 115 | -00 paragraph mode 116 | -0oct set input record separator as octal number 'oct' 117 | -sw="-[a-z]..."
switches to pass to bsort 118 | -keeptmp keep tmpfile 119 | -q quiet operation (no progress indication) 120 | EOM 121 | exit -1; 122 | } 123 | 124 | sub dispperc 125 | { 126 | local(*IN)=@_; 127 | my($pos,$perc,$now,$gone,$left); 128 | $pos=tell(IN); 129 | $perc=$pos/$len; 130 | $now=time; 131 | $gone=$now-$stime; 132 | if($now-$ptime) 133 | { 134 | $avg=.1*($pos-$ppos/($now-$ptime))+.9*$avg; 135 | $ppos=$pos; 136 | $ptime=$now; 137 | } 138 | $left=($len-$pos)/$avg; 139 | printf STDERR "$pos of $len (%0.4f%%) %02d:%02d %02d:%02d \r",100*$perc,$gone/60,$gone%60,$left/60,$left%60; 140 | $i=0; 141 | } 142 | 143 | sub getlen 144 | { 145 | local(*IN)=@_; 146 | $stime=time; 147 | seek(IN,0,2); 148 | $len=tell(IN); 149 | seek(IN,$start,0); 150 | $avg=1; 151 | } 152 | 153 | #$pat='/.*/$&/' if(!$pat); 154 | #if(0) 155 | #{ 156 | # die "Unable to open script file $f\n" unless open(SCR,$f); 157 | # while() 158 | # { 159 | # eval; die "Invalid script file $f:\n $@" if $@; 160 | # } 161 | # close SCR; 162 | #} 163 | # dispperc(*FH), defined above: prints a progress line for FH to STDERR (position from tell, percentage of $len, elapsed and estimated remaining mm:ss) and resets the counter $i. NOTE(review): in '$avg=.1*($pos-$ppos/($now-$ptime))+.9*$avg' the division binds tighter than the subtraction, so the term is $pos-($ppos/dt), not ($pos-$ppos)/dt -- presumably a smoothed rate was intended; confirm. getlen(*FH), defined above: stores the current time in $stime and the size of FH in $len, seeks FH to offset $start (not assigned anywhere in the visible code) and sets $avg to 1. 164 | $f=shift if !defined $f &&!defined $pat; # with neither -f nor -pat given, the first argument is a script file to require 165 | $file=shift; 166 | $out=shift; 167 | #$file eq '-' && IN=STDIN or 168 | die "Unable to open input file: $file\n" unless (open(IN,$file)); 169 | print STDERR "Sorting $file...\n"; 170 | require $f if defined $f; 171 | $sw.=' -q' if $q; 172 | $RSep=$/; # remember the record separator; the output pass switches $/ back and forth 173 | goto doout if $doout; # -doout skips key extraction and sorting and goes straight to the output pass, which reads _plsrt2.tmp 174 | #die "szar: $doout"; 175 | &getlen(*IN); # the string assigned to $expr next is Perl source that is eval'ed later: per input record it runs $pat, then writes one "key\001offset" line to TMP for each entry of @keys (the record itself when @keys is empty), cutting keys longer than 240 characters 176 | $expr='while() 177 | { 178 | '.$pat.'; 179 | '.($q?'':' if($i&0x800) 180 | { 181 | &dispperc(*IN); 182 | } 183 | $i++; 184 | ').' chomp; 185 | @keys=($_) if(!@keys); 186 | foreach $key(@keys) 187 | { 188 | if (length($key)>240) 189 | { 190 | warn "line too long (".length($key)."):\n$key"; 191 | $key=substr($key,0,240); 192 | } 193 | printf TMP "%s\001%09d\n",$key,$pos if defined $key; 194 | die "\\\\n in string:\n$key" if $key=~/\n/; 195 | } 196 | $pos=tell; 197 | undef @keys; 198 | }'; 199 | #".pack("L",$pos)."
200 | print STDERR "Running script:\n$expr\n" if $v; 201 | print STDERR "Creating tempfile...\n"; 202 | open(TMP,">_plsort.tmp"); 203 | $pos=0; 204 | eval $expr; die "Error eval'ing expression:\n $expr\n$@" if $@; 205 | close TMP; 206 | #print STDERR "${bsort}bsort _plsort.tmp _plsrt2.tmp $sw\n"; 207 | die if system("perl ${bsort}bsort.pl $sw _plsort.tmp >_plsrt2.tmp"); # sort the key file with bsort.pl; a non-zero exit status aborts the run 208 | unlink '_plsort.tmp' if !$keeptmp; 209 | doout: 210 | print STDERR "Creating output...\n"; 211 | select OUT if (open(OUT,">$out")); # print goes to $out when it can be opened, otherwise to the handle already selected 212 | open(IN,$file); # NOTE(review): the result of this open and of the next one is not checked 213 | open(TMP,"_plsrt2.tmp"); 214 | $outexpr=' 215 | &getlen(*TMP); 216 | while(!eof(TMP)) 217 | { 218 | $/="\n"; 219 | $_=; 220 | '.($q?'':' if($i&0x800) 221 | { 222 | &dispperc(*TMP); 223 | } 224 | $i++; 225 | ').' ($key,$pos)=/^(.*?)\001([0-9]+)$/; 226 | die "Error positioning in file $file\n" unless seek(IN,$pos,0); 227 | $/=$RSep; 228 | $_=; 229 | '.$opat.';print; 230 | }'; 231 | print STDERR "Running script:\n$outexpr\n" if $v; 232 | eval $outexpr; die "Error eval'ing expression:\n $outexpr\n$@" if $@; # output pass: for every sorted "key\001offset" line, seek IN to the offset, read one record with the saved separator, apply $opat and print it 233 | $pos=$.; 234 | close TMP; 235 | if(!$keeptmp) 236 | { 237 | print STDERR "Deleting tempfile...\n"; 238 | unlink '_plsrt2.tmp'; 239 | } 240 | #$end=time-$stime0; 241 | #printf STDERR "elapsed: %02d:%02d\n",$end/60,$end%60; 242 | $.=$pos; 243 | end_banner(); 244 | -------------------------------------------------------------------------------- /lexc/casenormhuX.xfs: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com).
7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. 
Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # The Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA) 39 | # license is available at: https://creativecommons.org/licenses/by-nc-sa/4.0/ 40 | # 41 | # Disclaimer of Warranties and Limitation of Liability. 42 | # 43 | # Unless otherwise separately undertaken by the Licensor, to the extent possible, 44 | # the Licensor offers the Licensed Material as-is and as-available, and makes no 45 | # representations or warranties of any kind concerning the Licensed Material, 46 | # whether express, implied, statutory, or other. This includes, without 47 | # limitation, warranties of title, merchantability, fitness for a particular 48 | # purpose, non-infringement, absence of latent or other defects, accuracy, or the 49 | # presence or absence of errors, whether or not known or discoverable. Where 50 | # disclaimers of warranties are not allowed in full or in part, this disclaimer 51 | # may not apply to You. 52 | # 53 | # To the extent possible, in no event will the Licensor be liable to You on any 54 | # legal theory (including, without limitation, negligence) or otherwise for any 55 | # direct, special, indirect, incidental, consequential, punitive, exemplary, or 56 | # other losses, costs, expenses, or damages arising out of this Public License or 57 | # use of the Licensed Material, even if the Licensor has been advised of the 58 | # possibility of such losses, costs, expenses, or damages. Where a limitation of 59 | # liability is not allowed in full or in part, this limitation may not apply to You. 
60 | # 61 | ################################################## END OF LICENSE ################################################## 62 | 63 | #upper case letters (NOTE(review): Z is listed twice, after Y and after Ý -- redundant in a union; confirm no other letter was meant) 64 | 65 | define UC [ 66 | A|Á|B|C|D|E|É|F|G|H|I|Í|J|K|L|M|N|O|Ó|Ö|Ő|P|Q|R|S|T|U|Ú|Ü|Ű|V|W|X|Y|Z|Â|Ă|Ä|Ą|Ć|Ç|Č|Ď|Đ|Ę|Ë|Ě|Î|Ł|Ľ|Ń|Ň|Ô|Ŕ|Ř|Ş|Š|Ś|Ţ|Ť|Ů|Ý|Z|Ž|Ź|Ż 67 | ]; 68 | 69 | #lower case letters (NOTE(review): z is listed twice, after y and after ý, mirroring the upper case list) 70 | define LC [ 71 | a|á|b|c|d|e|é|f|g|h|i|í|j|k|l|m|n|o|ó|ö|ő|p|q|r|s|t|u|ú|ü|ű|v|w|x|y|z|â|ă|ä|ą|ć|ç|č|ď|đ|ę|ë|ě|î|ł|ľ|ń|ň|ô|ŕ|ř|ş|š|ś|ţ|ť|ů|ý|z|ž|ź|ż 72 | ]; 73 | 74 | #not case markup: any single symbol except the case tags ^2U ^2L ^U ^L and the separators - / ' 75 | define NCM [ 76 | \ [ 77 | "^2U"|"^2L"|"^U"|"^L"|"-"|"/"|"'" 78 | ]]; 79 | 80 | #proper word elements are either 81 | define PropWElem [ 82 | [ 83 | #all lower case (optionally decapitalized) 84 | #[ ("^2L") ("^L") "^L" * ] | 85 | #we exclude decapitalization 86 | #[ ("^L") "^L" * ] | 87 | #or all uppercase (all-capitalized) 88 | #[ ("^2U") "^U" ["^2U" "^U"]+ ] | 89 | [ [("^2U") "^U" ["^2U" "^U"]+| "^U"+]] | 90 | #or capitalized 91 | [ ("^2U") "^U" "^L"*] 92 | ]/NCM | 93 | #or contain no case conversion at all 94 | \["^2U"|"^2L"]+ 95 | 96 | ]; 97 | 98 | #bad capitalization: the complement of proper word elements joined by - / or ' 99 | define BadCap [ 100 | ~[ 101 | PropWElem [["-"|"/"|"'"] PropWElem] * 102 | ]]; 103 | 104 | #proper case normalization 105 | define CaseConv [ 106 | 107 | #capitalization 108 | [ 109 | a "^2U" -> A, 110 | á "^2U" -> Á, 111 | b "^2U" -> B, 112 | c "^2U" -> C, 113 | d "^2U" -> D, 114 | e "^2U" -> E, 115 | é "^2U" -> É, 116 | f "^2U" -> F, 117 | g "^2U" -> G, 118 | h "^2U" -> H, 119 | i "^2U" -> I, 120 | í "^2U" -> Í, 121 | j "^2U" -> J, 122 | k "^2U" -> K, 123 | l "^2U" -> L, 124 | m "^2U" -> M, 125 | n "^2U" -> N, 126 | o "^2U" -> O, 127 | ó "^2U" -> Ó, 128 | ö "^2U" -> Ö, 129 | ő "^2U" -> Ő, 130 | p "^2U" -> P, 131 | q "^2U" -> Q, 132 | r "^2U" -> R, 133 | s "^2U" -> S, 134 | t "^2U" -> T, 135 | u "^2U" -> U, 136 | ú "^2U" -> Ú, 137 | ü "^2U" -> Ü, 138 | ű "^2U" -> Ű, 139 | v "^2U" -> V, 140 | w "^2U" -> W, 141 | x "^2U" -> X, 142 | y "^2U" -> Y, 143 |
z "^2U" -> Z, 144 | â "^2U" -> Â, 145 | ă "^2U" -> Ă, 146 | ä "^2U" -> Ä, 147 | ą "^2U" -> Ą, 148 | ć "^2U" -> Ć, 149 | ç "^2U" -> Ç, 150 | č "^2U" -> Č, 151 | ď "^2U" -> Ď, 152 | đ "^2U" -> Đ, 153 | ę "^2U" -> Ę, 154 | ë "^2U" -> Ë, 155 | ě "^2U" -> Ě, 156 | î "^2U" -> Î, 157 | ł "^2U" -> Ł, 158 | ľ "^2U" -> Ľ, 159 | ń "^2U" -> Ń, 160 | ň "^2U" -> Ň, 161 | ô "^2U" -> Ô, 162 | ŕ "^2U" -> Ŕ, 163 | ř "^2U" -> Ř, 164 | ş "^2U" -> Ş, 165 | š "^2U" -> Š, 166 | ś "^2U" -> Ś, 167 | ţ "^2U" -> Ţ, 168 | ť "^2U" -> Ť, 169 | ů "^2U" -> Ů, 170 | ý "^2U" -> Ý, 171 | z "^2U" -> Z, 172 | ž "^2U" -> Ž, 173 | ź "^2U" -> Ź, 174 | ż "^2U" -> Ż, 175 | 176 | #decapitalization 177 | 178 | A "^2L" -> a, 179 | Á "^2L" -> á, 180 | B "^2L" -> b, 181 | C "^2L" -> c, 182 | D "^2L" -> d, 183 | E "^2L" -> e, 184 | É "^2L" -> é, 185 | F "^2L" -> f, 186 | G "^2L" -> g, 187 | H "^2L" -> h, 188 | I "^2L" -> i, 189 | Í "^2L" -> í, 190 | J "^2L" -> j, 191 | K "^2L" -> k, 192 | L "^2L" -> l, 193 | M "^2L" -> m, 194 | N "^2L" -> n, 195 | O "^2L" -> o, 196 | Ó "^2L" -> ó, 197 | Ö "^2L" -> ö, 198 | Ő "^2L" -> ő, 199 | P "^2L" -> p, 200 | Q "^2L" -> q, 201 | R "^2L" -> r, 202 | S "^2L" -> s, 203 | T "^2L" -> t, 204 | U "^2L" -> u, 205 | Ú "^2L" -> ú, 206 | Ü "^2L" -> ü, 207 | Ű "^2L" -> ű, 208 | V "^2L" -> v, 209 | W "^2L" -> w, 210 | X "^2L" -> x, 211 | Y "^2L" -> y, 212 | Z "^2L" -> z, 213 | Â "^2L" -> â, 214 | Ă "^2L" -> ă, 215 | Ä "^2L" -> ä, 216 | Ą "^2L" -> ą, 217 | Ć "^2L" -> ć, 218 | Ç "^2L" -> ç, 219 | Č "^2L" -> č, 220 | Ď "^2L" -> ď, 221 | Đ "^2L" -> đ, 222 | Ę "^2L" -> ę, 223 | Ë "^2L" -> ë, 224 | Ě "^2L" -> ě, 225 | Î "^2L" -> î, 226 | Ł "^2L" -> ł, 227 | Ľ "^2L" -> ľ, 228 | Ń "^2L" -> ń, 229 | Ň "^2L" -> ň, 230 | Ô "^2L" -> ô, 231 | Ŕ "^2L" -> ŕ, 232 | Ř "^2L" -> ř, 233 | Ş "^2L" -> ş, 234 | Š "^2L" -> š, 235 | Ś "^2L" -> ś, 236 | Ţ "^2L" -> ţ, 237 | Ť "^2L" -> ť, 238 | Ů "^2L" -> ů, 239 | Ý "^2L" -> ý, 240 | Z "^2L" -> z, 241 | Ž "^2L" -> ž, 242 | Ź "^2L" -> ź, 243 | Ż "^2L" -> ż 244 | 245
| .o. 246 | 247 | [ 248 | #mark upper case letters as such (the rule appends ^U after each capital; the enclosing .i inverts the relation) 249 | 250 | UC @-> ... "^U" 251 | 252 | .o. 253 | 254 | #mark lower case letters as such (the rule appends ^L after each lower case letter) 255 | LC @-> ... "^L" 256 | ].i 257 | ].i 258 | 259 | ]; 260 | 261 | regex [ 262 | CaseConv 263 | #disallow bad capitalization on upper side 264 | .o. 265 | 266 | ~[BadCap] 267 | 268 | #remove case markup from lexical side 269 | 270 | .o. 271 | 272 | [["^2U"|"^2L"|"^U"|"^L"] -> 0] 273 | 274 | ]; 275 | #NOTE(review): the result is saved as casenormhu.hfst although this script is named casenormhuX.xfs -- confirm the build expects this name 276 | save stack casenormhu.hfst 277 | -------------------------------------------------------------------------------- /pl/generic/metadict.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2.
If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | BEGIN{ 56 | $hpldir=$ENV{'hpldir'} if !$hpldir; 57 | $hpldir='../..' if !$hpldir; 58 | } 59 | 60 | use lib "$hpldir/pl/generic"; 61 | use lib "$hpldir/src"; 62 | 63 | require 'banner.pl'; 64 | start_banner('Word grammar converter'); 65 | 66 | require 'm2getopt.pl'; 67 | require 'diewarn.pl'; 68 | 69 | do 'files.hpl' if !$files || !defined do $files; 70 | 71 | do 'delim.hpl' unless $delim; 72 | $delim="\x1" unless $delim; 73 | 74 | $proplst='proplst.hpl' if !$proplst; 75 | $humsrc='.' if !$humsrc; 76 | 77 | require $proplst; 78 | 79 | $bits_set='='; 80 | $bits_neg='!'; 81 | 82 | #convert flag expressions to binary flag strings 83 | #individual flags are delimited by space or & 84 | 85 | #the binary pattern for each flag must be defined in $flags hash in input; 86 | 87 | #operator example result 88 | #none flag ..1... 89 | #negation ! !flag ..0... 90 | #increment (+ (+flag1 ..(++)... (stop when reaching ..11...) 91 | #decrement (- (-flag1 ..(--)... (stop when reaching ..00...) 92 | #increment [+ [+flag1 ..[++]... (..00... follows ..11...) 93 | #decrement [- [-flag1 ..[--]... (..11... follows ..00...) 94 | #genbits($expr): returns the combined bit string for $expr with leading '.' positions removed; masks are read from the global hash reference $flags, and die1() is called when two flags put different non-'.' values on the same position 95 | sub genbits 96 | { 97 | my($expr)=@_; 98 | my(@bits)=('.') x 32; 99 | my($op,$mask,@mask,$par,$flag,$ovr_); 100 | for $flag(split /[& ]+/,$expr) 101 | { 102 | $flag=~s/^([([]?)([!+\-]?)//; 103 | $par=$1; 104 | $op=$2; 105 | $flag=~s/[])]$//; 106 | $flags->{$flag}=~s/[<>](\d+)/'.' x $1/e; 107 | $mask=$flags->{$flag}; 108 | $op eq '' && $mask=~tr#x#1# or 109 | $op eq '!'
&& $mask=~tr#01x#10# or 110 | $op eq '+' && $mask=~tr#01x#+# or 111 | $op eq '-' && $mask=~tr#01x#-#; 112 | $par eq '(' && $mask=~s#([\-+]+)#($1)#g or 113 | $par eq '[' && $mask=~s#([\-+]+)#[$1]#g; 114 | @mask=$mask=~/[([]?[+\-01.][])]?/g; 115 | $i=0; 116 | for $m(reverse @mask) 117 | { 118 | if($m eq '.') 119 | { 120 | $bits[$i]=$m if !defined $bits[$i]; 121 | $i++; 122 | next; 123 | } 124 | $ovr_.="$expr:$bits[$i] -> $m at $i;" if $bits[$i] && $bits[$i] ne '.' && ($bits[$i] ne $m); 125 | $bits[$i]=$m; 126 | $i++; 127 | } 128 | } 129 | $bits=join('',reverse @bits); 130 | $bits=~s/^\.+//; 131 | die1("Inconsistent bit operations:\n$ovr_\nResult:$bits\nin: $expr\n\n") if $ovr_; 132 | $bits; 133 | } 134 | 135 | sub printlines 136 | { 137 | my $cat; 138 | for(@_) 139 | { 140 | #for every line passed in: translate bit checking and setting expressions ?{}, ={}, LCF{} etc. (the text after each '{' is replaced by genbits of it) 141 | s/\{(.*?)(?=\}\s)/'{'.genbits($1)/eg; 142 | #check if the (meta)category at the beginning of the line is defined 143 | if(($cat)=/^\s*(\S+)\s*->/) 144 | { 145 | if(!$metacteg->{$cat}&&!$undef{$cat}) 146 | { 147 | my($cat1,$constr,$side); 148 | $undef{$cat}++; 149 | die1("Category $cat is not defined in \$metacteg section of $proplst\n"); 150 | #die1 presumably only records the error (die_if_errors() is called at the end of the script): the code below still runs and writes a suggested definition to MC 151 | #for category names containing '_' or '&', cut parts beginning with _/& successively from end and try to find the definition of the rest 152 | #then add the cut part as an additional constraint 153 | $cat1=$cat; 154 | $side='l'; 155 | while($cat1=~s/(.*)[_&](.*)/$1/) 156 | { 157 | #add constraint 158 | $constr.="&$2"; 159 | #check if the base category is defined 160 | #and add constraints if found 161 | $side=$metacteg->{$cat1}[0], 162 | $constr=$metacteg->{$cat1}[1].$constr, 163 | last 164 | if($metacteg->{$cat1}); 165 | } 166 | print MC "'$cat' => ['$side','$constr'],\n"; 167 | } 168 | else 169 | { 170 | $used{$cat}++; 171 | } 172 | } 173 | print; 174 | } 175 | } 176 | 177 | open MC,">$hpldir/src/metacteg.new"; 178 | 179 | my($l,$i,$wait_for_semi,%vars,@vars,$var,$i,$j,@l);
180 | 181 | while(<>) 182 | { 183 | 184 | s/^[ \t]+//; 185 | #main loop over the grammar source: ignore comments and multiedit format line 186 | next if /^#|^\Q\@ME.FORMAT/; 187 | #if line looks like a variable (macro) definition, then wait for semicolon 188 | if(/^[\$\@]\w+\s*=/) 189 | { 190 | $l=''; 191 | $wait_for_semi=1; 192 | } 193 | #wait until the definition is finished 194 | #then evaluate it 195 | if($wait_for_semi) 196 | { 197 | $l.=$_; 198 | if(/;\s*$/) 199 | { 200 | $wait_for_semi=0; 201 | eval $l; # runs the accumulated definition as Perl code, which creates the macro variable 202 | #check if the definition is syntactically correct 203 | die1($@) if $@; 204 | } 205 | next; 206 | } 207 | #print line if no macro expansion character (@) is present 208 | printlines($_), next unless /\@/; 209 | #gather all macro variables used in the line 210 | undef %vars; 211 | for(/\@([A-Za-z\d]+\[?)/g) 212 | { 213 | $vars{$_}++; 214 | } 215 | @vars=keys %vars; 216 | @l=($_); # start from the unexpanded line; every macro below multiplies the list by its number of values 217 | #substitute each variable with all its possible values respectively 218 | while($var=shift @vars) 219 | { 220 | #if the variable contains a list of lists, the reference must contain a numeric index 221 | if($var=~s/\[$//) 222 | { 223 | @l=map 224 | { 225 | $i=$_; 226 | map 227 | { 228 | $j=$i; 229 | $j=~s/\@$var\[(\d+)\]/$_->[$1]/g; 230 | #check if the macro is defined 231 | die1("macro $_\[$1] is not defined") if !defined $_->[$1]; 232 | $j; 233 | }@$var; # symbolic reference: iterates the global array whose name is the macro name 234 | }@l; 235 | } 236 | #no index if the variable is a simple list 237 | else 238 | { 239 | @l=map 240 | { 241 | $i=$_; 242 | map 243 | { 244 | $j=$i; 245 | $j=~s/\@$var(?![a-zA-Z\d])/$_/g; 246 | #check if the macro is defined 247 | die1("macro $_ is not defined") if !defined $_; 248 | $j; 249 | }@$var; 250 | }@l; 251 | } 252 | } 253 | printlines(@l); 254 | } 255 | 256 | close MC; 257 | #check if all defined categories are actually used in the grammar 258 | for(sort keys %$metacteg) 259 | { 260 | warn("Category $_ is defined in $proplst but not used in the grammar.\n") if !$used{$_}; 261 | } 262 | die_if_errors(); 263 | end_banner(); 264 | 265 |
-------------------------------------------------------------------------------- /pl/mkavs/addfea.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 
26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | BEGIN{ 56 | $hpldir=$ENV{'hpldir'} if !$hpldir; 57 | $hpldir='../..' 
if !$hpldir; 58 | } 59 | 60 | use lib "$hpldir/pl/generic"; 61 | use lib "$hpldir/src"; 62 | 63 | require 'banner.pl'; 64 | start_banner('Level 1 stem lexicon unifier'); 65 | require 'm2getopt.pl'; 66 | 67 | $warnaddonly=1 unless defined $warnaddonly; 68 | $strictcat=1 unless defined $strictcat; 69 | 70 | die "Add properties from addlex to baselex (the files must be identically sorted): 71 | Usage: perl addfea.pl [-switches] {baselex.lx1} {addlex.lx1} >{result.lx1} 72 | Switches: 73 | -errtofile write error comments to output file 74 | -warnbaseonly warn about base entries not in additional lexicon 75 | -warnaddonly=0 do not warn about additional entries not in base lexicon 76 | -strictcat=0 do not be strict on category match 77 | -ignorecase ignore case mismatches 78 | -addonlytag=.. define tag to be added to additional entries not in base lex. 79 | -baseonlytag=.. define tag to be added to base entries not in add. lex. 80 | -noaddonly do not add additional entries not in base lexicon 81 | (only add features to already existing entries) 82 | -nobaseonly do not add base entries not in the additional lexicon 83 | -baseonly list entries which only appear in base lexicon 84 | -addonly list entries which only appear in additional lexicon 85 | -addseg use segmented form as it appears in additional lexicon 86 | -addpropfirst properties coming form the additional lexicon should precede those 87 | coming from the base lexicon 88 | 89 | " unless $#ARGV==1; 90 | 91 | if($errtofile) 92 | { 93 | open(ERR, ">&STDOUT") || die "Can't dup stdout"; 94 | } 95 | else 96 | { 97 | open(ERR, ">&STDERR") || die "Can't dup stderr"; 98 | } 99 | select(ERR); $| = 1; 100 | select(STDOUT); $| = 1; 101 | 102 | #remove segmentation characters: deletes the listed punctuation, every span from '<' or '[' to the nearest ']' or '>', and everything from the first underscore; with -ignorecase it also lowercases and deletes '.' and '-' 103 | sub rmsgm 104 | { 105 | local $_=shift; 106 | # s/[?!#=+%@&^(){}"]|[<[].*?[]>]|\.\.\.|_.*//g; 107 | s/[?!#=+%@&^(){}"]|[<[].*?[]>]|_.*//g; 108 | tr/A-ZÁÉÍÓÚÖÜŐŰ.-/a-záéíóúöüőű/d if $ignorecase; 109 | $_; 110 | }; 111 | 112 | #read and parse an input
word 113 | sub readwd 114 | { 115 | $a=shift; 116 | local $_=<$a>; 117 | chomp; 118 | my ($as,$acat,$ap)=/^(\s*;?[^;]*?)\[([^\]]+)\];(.*)/; 119 | #captured above: segmented form, category, properties 120 | $aw=$as; 121 | $aw=~s/^\s*;?//; 122 | my $aw=reverse(rmsgm($aw)); # comparison key: the cleaned form, reversed 123 | my $awlc=$aw; 124 | $awlc=~tr/A-ZÁÉÍÓÚÖÜŐŰ.-/a-záéíóúöüőű/d; 125 | ($_,$aw,$awlc,$as,$acat,$ap); # returns (raw line, reversed cleaned form, its lowercased copy, segmented form, category, properties) 126 | } 127 | #compatible($a,$b): true when either category is empty or '?'; otherwise both are reduced (text from '|' up to the next '&' dropped, lowercase letters after a known category code dropped), sorted and joined with '#', and the result is whether one category code can be matched on both sides of the '#' 128 | $cats='(?:[FM]N|SZN|HA|IGE|IK|NU)'; 129 | sub compatible 130 | { 131 | my ($a,$b)=@_; 132 | 133 | return 1 if $a=~/^\??$/||$b=~/^\??$/; 134 | $a=join '#',sort map{s/\|[^&]*//g;s/($cats)[a-z]+/$1/go;$_}($a,$b); 135 | # return 1 if $a=~/FN#MN/; 136 | $a=~/($cats)(?:\&$cats)*#(?:$cats\&)*\1/o; 137 | 138 | } 139 | #merge2(base_file, add_file): single-pass merge of two lexicons ordered by the keys readwd returns (lowercased reversed form, then reversed form). Entries found only in the base file are printed with $baseonlytag unless -nobaseonly or -addonly is set; entries found only in the additional file are printed with $addonlytag unless -noaddonly or -baseonly is set; entries with the same form and an accepted category are printed once as segmented-form[base category];properties with both property strings concatenated. Warnings go to ERR. 140 | sub merge2 141 | { 142 | my $f; 143 | open(B,$f=shift) or die "Unable to open base lexicon file $f"; 144 | open(A,$f=shift) or die "Unable to open additional lexicon file $f"; 145 | 146 | my $files=2; # number of input files not yet exhausted; the outer loop ends at 0 147 | undef($a),$files-- if eof(A); # NOTE(review): at this point $a and $b are still the package variables, the lexicals are declared two lines below 148 | undef($b),$files-- if eof(B); 149 | my ($a,$aw,$awlc,$as,$acat,$ap)=readwd(\*A); 150 | my ($b,$bw,$bwlc,$bs,$bcat,$bp)=readwd(\*B); 151 | while($files) 152 | { 153 | while(defined $b && (!defined($a)||$bwlc lt $awlc||($bwlc eq $awlc&&$bw lt $aw))) 154 | { 155 | unless($nobaseonly||$addonly) 156 | { 157 | print ERR ("*$b*WARN:Base lexicon entry missing from additional lexicon\n") if $warnbaseonly; 158 | print "$b$baseonlytag\n"; 159 | } 160 | undef($b),$files--,last if eof(B); 161 | ($b,$bw,$bwlc,$bs,$bcat,$bp)=readwd(\*B); 162 | } 163 | while(defined $a && (!defined($b)||$bwlc gt $awlc||($bwlc eq $awlc&&$bw gt $aw))) 164 | { 165 | unless($noaddonly||$baseonly) 166 | { 167 | print ERR ("*$a*WARN:Additional lexicon entry missing from base lexicon\n") if $warnaddonly; 168 | print "$a$addonlytag\n"; 169 | } 170 | undef($a),$files--,last if eof(A); 171 | ($a,$aw,$awlc,$as,$acat,$ap)=readwd(\*A); 172 | } 173 | while(defined $a && defined $b && ($bw eq $aw)) 174 | { 175 | if(!$strictcat&&compatible($bcat,$acat)||$bcat eq $acat) 176 | { 177 |
unless($baseonly||$addonly) 178 | { 179 | print ERR ("*$bs\[$bcat];$bp$ap*WARN:Category mismatch $bcat/$acat (base category $bcat assumed)\n") if $bcat ne $acat; 180 | $prop=$addpropfirst?"$ap$bp":"$bp$ap"; 181 | $seg=$addseg?$as:$bs; 182 | print "$seg\[$bcat];$prop\n"; 183 | } 184 | undef($a),$files-- if eof(A); 185 | undef($b),$files-- if eof(B); 186 | ($a,$aw,$awlc,$as,$acat,$ap)=readwd(\*A); 187 | while($bcat=~/&/&&defined $a && defined $b && ($bw eq $aw)&&(compatible($bcat,$acat)||$bcat eq $acat)) 188 | { 189 | unless($baseonly||$addonly) 190 | { 191 | print ERR ("*$as\[$acat];$ap*WARN:Category mismatch $bcat/$acat (additional entry skipped)\n") if $bcat ne $acat; 192 | } 193 | undef($a),$files-- if eof(A); 194 | ($a,$aw,$awlc,$as,$acat,$ap)=readwd(\*A); 195 | } 196 | ($b,$bw,$bwlc,$bs,$bcat,$bp)=readwd(\*B); 197 | } 198 | elsif($bcat lt $acat) 199 | { 200 | unless($nobaseonly||$addonly) 201 | { 202 | print ERR ("*$b*WARN:Category mismatch $bcat/$acat: base lexicon entry\n"); 203 | print "$b$baseonlytag\n"; 204 | } 205 | undef($b),$files--,last if eof(B); 206 | ($b,$bw,$bwlc,$bs,$bcat,$bp)=readwd(\*B); 207 | } 208 | else 209 | { 210 | unless($noaddonly||$baseonly) 211 | { 212 | print ERR ("*$a*WARN:Category mismatch $bcat/$acat: additional lexicon entry\n"); 213 | print "$a$addonlytag\n"; 214 | } 215 | undef($a),$files--,last if eof(A); 216 | ($a,$aw,$awlc,$as,$acat,$ap)=readwd(\*A); 217 | } 218 | } 219 | } 220 | } 221 | 222 | merge2(shift,shift); 223 | end_banner(); 224 | -------------------------------------------------------------------------------- /pl/mkavs/stmlex2.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian.
4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] 
In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 47 | # 48 | # 49 | ################################################## END OF LICENSE ################################################## 50 | 51 | use utf8; 52 | use open qw/:encoding(utf8)/; 53 | use open qw/:std :encoding(utf8)/; 54 | 55 | #generate Hungarian level 2 stem lexicon from level 1 56 | BEGIN{ 57 | $hpldir=$ENV{'hpldir'} if !$hpldir; 58 | $hpldir='../..' 
if !$hpldir;
}

use lib "$hpldir/pl/generic";
use lib "$hpldir/src";
use lib "$hpldir/gen";

#name of the morph attribute which stores the category tag (read by lex2() too)
$tagname='tag';

#identity hooks: the phonological form and the Humor category are taken as given
sub abcphon{$_[0]}
sub cat2hum{$_[0]}
#extract the category from a tag: keep what follows the last + or >,
#drop everything from the first | on
#NOTE: this definition is replaced by the one-line hum2cat below, because
#a later definition of a sub with the same name wins
sub hum2cat
{
	local $_=shift;
	s/(?:.*[+>]|^)(.*)/$1/;
	s/\|.*//;
	$_;
}
#the definition in effect: the tag is returned unchanged
sub hum2cat{$_[0]}

require 'dumpsh.pl';
require 'stemalt1.pl';
require 'vhrm.pl';
require 'unif.pl';
require 'lex2.pl';
require 'diewarn.pl';

#the .avs dump is produced unless $avs was explicitly set before
$avs=1 if !defined $avs;

#append the current line ($_) to $out.noalm; the file is opened at the first call
sub noalm
{
	open(NOA,">$out.noalm"),$noalm=1 unless $noalm;
	print NOA $_;
}

#NOTE(review): the text is damaged from here on. The line numbering embedded
#in the dump jumps from 96 to 133, so the rest of chkline() and the statements
#up to the opening of the LX2 output file are missing. The fragment below is
#kept exactly as found; restore it from the original stmlex2.pl.
sub chkline
{
	my($c,$err,$w);
# for $w(split(/(?$out.lx2") or dienow("Unable to create level2 lexicon file $out.lx2");
#open the avs dump file; from here on $avs holds the name of the file handle
(open(AVS,">$out.avs") or dienow("Unable to create level2 avs lexicon file $out.avs")),$avs='AVS' if $avs;

#main loop: one level 1 lexicon entry per (logical) input line
while(<>)
{
	#skip empty lines and lines beginning with *; cut off ;* comments
	next if /^\s*\*|^\s*$/;#||$excl && /$excl/o;
	s/;\s*\*.*/;/;
	chkline();
	#lines ending in a backslash are merged with the next line
	$_=$prevline.$_,undef $prevline if defined $prevline;
	$prevline=$_,next if s/\\+\s*\n//;
	#the entry may contain lexically specified allomorphs, these begin with ++
	@almfs1=split(/(?<=;)\s*\+\+\s*/);
	undef @almfs2;
	#parse the entry and each of its allomorphs into an attribute-value hash
	for(@almfs1)
	{
#		($seg,$hum,$r)=/\s*(.*?)\[([^]]*)\];(.*)/;
		#$seg: the form up to the first separating semicolon, $r: the properties
		($seg,$r)=/^\s*(;?(?:[^&;]|\&[^&;\[\]]+;)*?|.*?\[[^][;]*\]);\s*(.*)/;
		while ($r=~s/((?:;|^)r[pr]:[^;]*?)&/$1 /g){}; #change & to space after rp:/rr:
		@fd=split /;\s*/,$r;#split into properties
		undef $mrf;
		$mrf->{'allomf'}=$seg if defined $seg;
		for(@fd)
		{
			($attr,$val)=split/:/;#separate attr and value
			if(defined $val)
			{
#				$val=[split /,\s*/,$val] if $val=~/,/||$attr=~/_/;
				$val=~s/,\s*/ /g;#multiple values separated by space
				blk:{
					do{
						if(!$mrf->{$attr}){$mrf->{$attr}=$val;}#if mrf has no such attribute yet
						else{$mrf->{$attr}.=" $val";}#else add new value
						last blk;
					}if $attr!~/_/;#if attribute has no path prefix
					($prp,$attr)=split(/_/,$attr);#else split path
					blk2:{
						#adding an array value to path
						#(never taken as long as the array conversion above stays commented out)
						push(@{$mrf->{$prp}{$attr}},@$val),last blk2 if ref $val eq 'ARRAY';
						#or a single scalar value
						$mrf->{$prp}{$attr}=$val;
					}
				}}
		}
		push @almfs2,$mrf;
	}
	#the first item is the entry itself, the rest are its lexically given allomorphs
	$mrf=shift(@almfs2);
	($seg)=$mrf->{'allomf'}=~/\s*(.*?)$/;
	$seg=~s/([^]\\])\+(?![@#=])/$1*/g; #change compound separator from + to * unless preceded by [CAT] or backslash or followed by @ or # or =
	$hum=join('+',$seg=~/\[([^][]*)\](?=\+|$)/g); #create segmented category tag list
	$seg=~s/\[([^][]*)\](?=\+|$)//g; #remove tags from $seg
#	$seg=~s/([^\\]\+)(?![@#=])/$1*/g; #change + to +* unless followed by @ or # or =
#	($seg,$hum)=$mrf->{'allomf'}=~/\s*(.*?)\[([^][]*)\]$/;

	#$seg0: the part of the form up to and including its last * # or +
	if (($seg0)=$seg=~/^(.*[*#+])/)
	{
		$seg0=~s/$srf2rm//og if defined $srf2rm;#remove whatever is defined in $srf2rm (e.g. parentheses)
		#prefix the allomorphs with $seg0, except those beginning with =, which only lose the =
		for(@almfs2)
		{
			$_->{'allomf'}=~s/^(?!=)/$seg0/ or
			$_->{'allomf'}=~s/^=//;
		}
	}
	elsif(/\+\+=/&&$seg!~/^\s*(=)/)
	{
		#no such prefix: just remove the leading = from the allomorphs
		for(@almfs2)
		{
			$_->{'allomf'}=~s/^=//;
		}
	}
	$mrf->{'seg'}=$seg if defined $seg;
	$mrf->{phon}=abcphon($seg) if !$mrf->{phon}&&$hum=~/\|ABC(?!x)|\|BETU/;
	#push as many copies of $mrf into @mrfs as there are different cat's in $hum
	@mrfs=map
	{
		$mrf->{cat}=hum2cat($_) unless $mrf->{cat}&&$hum!~/&/;
		$mrf->{$tagname}=$_;
		$hum!~/&/?$mrf:avscpy($mrf);
	}
	(split /&/,$hum);
	#split morphs having multiple pronunciations
	if($mrf->{phon}=~/&/)
	{
		@ph=split (/&/,$mrf->{phon});
		@mrfs=map
		{
			$mrf=avscpy($_);
			map
			{
				$mrf->{phon}=$_;
				avscpy($mrf);
			}
			@ph;
		}
		@mrfs;
	}
	#split morphs having multiple equivalents
	if($mrf->{equ}=~/&/)
	{
		@ph=split (/&/,$mrf->{equ});
		@mrfs=map
		{
			$mrf=avscpy($_);
			map
			{
				$mrf->{equ}=$_;
				avscpy($mrf);
			}
			@ph;
		}
		@mrfs;
	}
	warn1 ("Possible syntax error in input lexicon:\n\n$_\n(No allomorphs)"),$error++ if $#mrfs<0;
	#generate the allomorphs of each morph and write them out
	for(@mrfs)
	{
#		$alm2=stemalt($_,\@almfs2);
		#each category variant gets its own copy of the lexically given allomorphs
		$almfs2=$hum!~/&/?\@almfs2:avscpy(\@almfs2);
		$alm2=stemalt($_,$almfs2);
		warn1 ("No allomorphs for $_->{seg}\[$_->{cat}\/$_->{$tagname}]"),$error++ if $#$alm2<0;
		$_->{allomfs}=$alm2;#add allomorphs
		$_->{hcat}=cat2hum($_->{cat});
		if($avs)
		{
			#remove allomorph-level properties from the root level:
			delete @$_{lr,rr,lp,rp,gp,glr,grr,allomf};
			$a=dumpsh([$_],['mrf']);
			$a=~s/=> ' +(?![ '])/=> '/g;
			#$avs holds the handle name 'AVS', used here as a symbolic file handle
			print $avs $a,"\n";
		}
		#write the level 2 lexicon lines of the morph
		lex2($_);
	}
}
close(LX2);
close(AVS);
print_propsets($out);
savenormfrm();
#$end=time-$stm;
#warn sprintf
"elapsed: %02d:%02d",$end/60,$end%60; 270 | warn1("$error entries produced no allomorphs") if $error; 271 | die_if_errors(); 272 | end_banner(); 273 | -------------------------------------------------------------------------------- /pl/generic/lex2.pl: -------------------------------------------------------------------------------- 1 | ################################################## START OF LICENSE ################################################## 2 | # 3 | # This file is part of the emMorph / Humor morphological analyzer description for Hungarian. 4 | # Copyright (C) 2001-2016 Attila Novák 5 | # 6 | # The author of the database and the database compilation environment is Attila Novák (novakat@gmail.com). 7 | # The resource is available from: https://github.com/dlt-rilmta/emMorph 8 | # 9 | # The database files are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 10 | # (CC BY-NC-SA) license, the compilation scripts under the GNU General Public License (GPL v3) 11 | # with the following amendments: 12 | # 13 | # By downloading/cloning/using this database and tools you accept the following terms: 14 | # 15 | # 1. Please inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about your use of the database/tools 16 | # clearly indicating what you use this database or tool for in your application/experiment/resource. 17 | # 18 | # 2. If possible, please publish a scientific paper about each application, experimental system 19 | # or linguistic resource you create or experiment you perform using this resource quoting the articles below, 20 | # and inform the author at [novakat@gmail.com](mailto:novakat@gmail.com) about each article you publish. 
21 | # 22 | # Articles to quote are listed at https://github.com/dlt-rilmta/emMorph, the list is currently the following: 23 | # (See the BibTeX file quotethis.bib in the root directory): 24 | # 25 | # Attila Novák (2014): A New Form of Humor – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation. 26 | # In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC-2014). Reykjavík, pp. 1068–1073 (ISBN 978-2-9517408-8-4) 27 | # 28 | # Attila Novák; Borbála Siklósi; Csaba Oravecz (2016): A New Integrated Open-source Morphological Analyzer for Hungarian 29 | # In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016). Portorož, pp. 1315–1322. 30 | # 31 | # Novák Attila (2003): Milyen a jó Humor? [What is good Humor like?] In: Magyar Számítógépes Nyelvészeti Konferencia (MSZNY 2003). Szegedi Tudományegyetem, pp. 138–145 32 | # 33 | # 3. Please do share your adaptations of the morphology (vocabulary extensions etc.) using the same licenses. 34 | # 35 | # 4. If you are interested in using or adapting the resource for commercial purposes, please contact the author. 36 | # *** 37 | # 38 | # This program is free software: you can redistribute it and/or modify 39 | # it under the terms of the GNU General Public License as published by 40 | # the Free Software Foundation, either version 3 of the License, or 41 | # (at your option) any later version. 42 | # 43 | # This program is distributed in the hope that it will be useful, 44 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 45 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 46 | # GNU General Public License for more details. 
#
#
################################################## END OF LICENSE ##################################################

use utf8;
use open qw/:encoding(utf8)/;
use open qw/:std :encoding(utf8)/;

#root of the tool directory: the hpldir environment variable, else ../..
BEGIN{
	$hpldir||=$ENV{'hpldir'};
	$hpldir||='../..';
}

use lib "$hpldir/pl/generic";
use lib "$hpldir/src";
#use Storable;

require 'normform.pl';
require 'sfxtags.hpl';

#field delimiter of the level 2 lexicon lines: taken from delim.hpl
#if that sets one, \x1 otherwise
do 'delim.hpl' unless $delim;
$delim||="\x1";
#default hook: the line was not changed
sub linechange
{
	return;
}
#if linechange() is defined in linechange.pl,
#it should return true if it changed $lx2line
do 'linechange.pl';

#my $propsets=retrieve('propsets.tmp');

{
	#results of mtags() remembered by tag sequence
	my %cache;

	#convert category tag sequences: add a prefix denoting morphological category
	#to each tag as given in %mcat (loaded from sfxtags.hpl)

	#usually:
	#I_ inflection
	#D=POS_ derivational suffix converting to POS
	#S_ stem (this is the default)
	#P_ prefix

	sub mtags
	{
		my($tags,$hcat)=@_;
		my $known=$cache{$tags};
		return $known if defined $known;
		#prefix each tag of the +-separated sequence
		my @prefixed;
		for my $tag (split(/\+/,$tags))
		{
			push @prefixed,($mcat{$tag}||'S').'_'.$tag;
		}
		my $res=join('+',@prefixed);
#		$res=~s/D=(?:[^_]+)_(?!.*\+[PSD][=_])/D=${hcat}_/ if $hcat;
		#a 'D==...' element gets the category of the preceding stem or
		#derivational suffix instead; each affected form is reported
		my $unresolved=qr/((?:^|\+)(?:S_([^+]+)|D=([^=_]+)_[^+]+)\+D=)=[^_]+/;
		warn $res if $res=~$unresolved;
		while($res=~s/$unresolved/$1$2$3/)
		{
			warn $res;
		}
		#sequences containing rov or abbr are not remembered
		$cache{$tags}=$res if $tags!~/rov|abbr/i;
		$res;
	}
}

#pair up the +-separated segments with the +-separated categories:
#('a+b','X+Y') gives 'a[X]b[Y]'; there is one item for each category
sub segcat
{
	my ($seg,$cat)=@_;
	my @segs=split(/\+/,$seg);
	my @cats=split(/\+/,$cat);
	my $res;

	for my $i (0..$#cats)
	{
		$res.=$segs[$i].'['.$cats[$i].']';
	}
	$res;
}

my @lr=('r','l');
my @lr1=('right','left');

#$srf: surface form
#$ssrf: +-segmented surface form
#$slex: +-segmented lexical
#       form
#$scat: +-separated list of morpheme category names
#$smcat: +-separated list of morpheme categories, each prefixed
#        with morphological category P_, S_ (stem), D=POS_ (deriv sfx) or I_ (inflection)
#$lem: lemma
#$hyph: hyphenated form
#$srf1: surface form of 1st morpheme
#$cat: category tag of 1st morpheme
#$hcat: head category of the morph sequence

#write the level 2 lexicon lines of a morph to the LX2 file handle
#  in: $mrf - attribute-value hash of the morph; its 'allomfs' attribute is
#      the list of the allomorphs to generate
#Each allomorph yields one line of $delim-separated fields; an additional
#line with G appended to the restriction field is written if the morph has a
#'gseg' attribute and the restriction does not contain a.
#Side effects: sets $mrf->{gseg} if the morph has an 'equ' attribute, records
#the property sets in the global $propsets, and leaves the last line in the
#global $lx2line (which the linechange() hook may modify).
sub lex2
{
#	my $out=shift;
	my $mrf=shift;

	my $cat=$mrf->{$tagname};
	$cat='?' if !$cat;#!!! if we do not know category
	my $seg=$mrf->{'seg'};
	$seg=~s/(^|\+)[=@*]/$1/;
	if(defined $mrf->{'equ'}&&!$noequtag)
	{
		my $equ=$mrf->{'equ'};
		$equ=~tr/+@#//d;
		$equ=~s/ +/_/g;
		$mrf->{gseg}=$seg unless defined $mrf->{gseg};
		$seg.="_$equ";#add what it is an abbreviation for
	}
	my ($srf,$scat,$srf1,$slex,$ssrf,$lem,$lemmrfs,$pr,$smcat,$AB63);
	#$AB63 is used for 6-3 rule in Hungarian
	#NOTE(review): $AB63 is not reset for each allomorph, so an allomorph
	#without A=/B= properties keeps the value of the previous one - confirm
	#that this is intended
	for(@{$mrf->{'allomfs'}}) #generate allomorphs
	{

		next if !defined $_->{'allomf'}||($_->{'allomf'}=~/\@$/&&!$generator);
#		next if $mrf->{'lemma'}=~/,/; # avoid ji, ji, ji
		$srf=$_->{'allomf'}; #surface form
#		$srf=~s/[\*|\@](?!$)|[;].+//g;
#		$srf=~s/[*\@]|[;].+//g;
#		$scat=$srf; #segmented category
		$scat=$_->{'cats'}; #segmented category
		$srf=~s/\[.*?\]//g; #remove tags from srf form
		$slex=$srf;
		$slex=~s/[*\@]//g;
		#change {a>á} to a in lex form, to á in srf form
#		$srf=~s/\{[^}>]*>([^}>]*)\}/$1/g;
#		$slex=~s/\{([^}>]*)>[^}>]*\}/$1/g;
		$srf1=$ssrf=$slex;
		$hyph=$srf;
		$srf=~s/[\+\*@]//go; #remove +'s from srf form
		$srf1=~s/\+.*//; # srf form of 1st morpheme
		#the lexseg feature may contain a segmented lexical form which differs from seg
		#but it must contain just as many segments
		#the first segment of the segmented lexical form (slex)
		#is the value of the lexseg feature (of the allomorph, else of the morph)
		#or if that's empty
		#it is the value of the seg feature or if that's empty
		#and $mrf->{'seg'} is not empty
		#it is the first surface segment
		$lem=$_->{'lexseg'} or $lem=$mrf->{'lexseg'} or $lem=$seg or !$mrf->{'seg'} or $lem=$srf1;
		#count the morphs in the lemma (minus 1)
		$lemmrfs=$lem=~tr/+//;
#		$lem=~s/[+,].*//;
#		$slex=~s/^[^+]*/$lem/;
		#substitute the lemma for the first $lemmrfs+1 morphs in slex
		$slex=~s/^[^+]*(?:\+[^+]*){$lemmrfs}/$lem/;
		#remove everything from scat other than categories and +'s
		if($scat=~/\[./)
		{
			$scat=~s/\+/\[+\]/g;
			$scat=~s/.*?\[(.*?)\]/$1/g;
			$scat=~s/^\++/+/;
		}
		else
		{
			$scat='';
		}
		$scat=$cat.$scat if $scat=~/^\+|^$/;
		$ssrf=~s/\+//g,$slex=~s/\+//g if $scat!~/\+/; #remove +'s if no + in cat
		#create property set of allomorph
		$mtxbit='';
		norm_allomf($_);#normalize allomorph
		#right side first, then left side; $_ is still the allomorph here
		for my $k('r','l')
		{
			$pr=$_->{"${k}p"};
			$pr.=','.$_->{"${k}r"} if $_->{"${k}r"};
			if($pr)
			{
				$pr="$k,$pr";
				$pr1="$pr;<<";
				#restrictions (if any) are part of the key
				$pr1.=";restr:$_->{'restr'}" if $_->{'restr'};
				#store to $propsets hash
				$propsets->{$pr1}="$_->{'allomf'}" if !defined $propsets->{$pr1};
			}
			$mtxbit.="$pr${delim}";
		}
		$smcat=mtags($scat,$mrf->{'hcat'});
		$AB63="$1$2" if $mtxbit=~/A=(.)[& ]B=(.)/;
=cmt
#gather possible suffix category sequences
if($smcat=~/(?:^|\+)[DI]/)
{
$sfxcat=$smcat;
$sfxcat=~s/(^|\+)(?:[SP]_[^+]*|I_|D=[^_]*)/$1/g;
$sfxcat=~s/^\++//;
$sfxcat=~s/\|[^+]*//g;
$sfxcat=~s/\+/][/g;
$sfxcat{'['.$sfxcat.']'}++;
}
=cut
		$cat!~/\+/ and $segcat="$seg\[$cat]" or $segcat=segcat($seg,$cat);
		#protect the delimiters occurring inside the fields with a backslash
		for($srf,$ssrf,$slex,$scat,$smcat,$lem){s/(?=$delim)/\\/go;}
		$lx2line="$mtxbit$srf${delim}$ssrf${delim}$slex${delim}$scat${delim}$lem${delim}$hyph${delim}$_->{'restr'}${delim}$smcat${delim}$segcat${delim}$mrf->{'phon'}${delim}$mrf->{'root'}${delim}$mrf->{'UR'}${delim}$mrf->{'X'}${delim}$AB63`\n";
		print LX2 $lx2line;
		#print the line once more if the hook changed it
		print LX2 $lx2line if linechange();
		if($mrf->{'gseg'}&&$_->{'restr'}!~/a/)
		{
#			$slex=~s/^\Q$seg/$mrf->{'gseg'}/;
			#the same line with gseg as the first lexical segment and lemma
			$slex=~s/^[^+]*/$mrf->{'gseg'}/;
			$lem=$slex;
			$lem=~s/\+.*//;
			$lx2line="$mtxbit$srf${delim}$ssrf${delim}$slex${delim}$scat${delim}$lem${delim}$hyph${delim}$_->{'restr'}G${delim}$smcat${delim}$segcat${delim}$mrf->{'phon'}${delim}$mrf->{'root'}${delim}$mrf->{'UR'}${delim}$mrf->{'X'}${delim}$AB63`\n";
			print LX2 $lx2line;
			print LX2 $lx2line if linechange();
		}
	}
}

#write the property sets gathered by lex2() to $out.propsets, sorted by key,
#one 'key;first allomorph>>' line each
#  in: $out - output file name without extension
#Dies if the file cannot be created or completely written.
sub print_propsets
{
	my $out=shift;
	my $file="$out.propsets";
	open(my $pps,'>',$file) or die "Unable to create property list file $file: $!";
	for my $key (sort(keys(%$propsets)))
	{
		print $pps "$key;$propsets->{$key}>>\n";
	}
	#buffered write errors only show up when the handle is closed
	close($pps) or die "Unable to write property list file $file: $!";
=cmt
#print gathered suffix category sequences
open(PPS,">$out.sfxseq") or die "Unable to create suffix tag sequence file $out.sfxseq";
for(sort(keys(%sfxcat)))
{
print PPS "$_\n";
}
close(PPS);
=cut
}

--------------------------------------------------------------------------------