├── .gitignore ├── Changes ├── LICENSE ├── MANIFEST ├── Makefile.PL ├── README ├── cu.txt ├── lib └── Lingua │ ├── CU.pm │ └── CU │ ├── Collate.pm │ ├── Hyphenate.pm │ ├── Scripts.pm │ └── Scripts │ ├── HIP.pm │ ├── UCS.pm │ ├── hipequivs │ ├── hipequivs_Cyrl │ ├── hipequivs_Latn │ ├── hipequivs_Zf │ ├── ostrogequivs │ └── ucsequivs ├── mklocale.pl ├── scripts ├── hip2unicode └── ucs2unicode └── t └── Lingua-CU.t /.gitignore: -------------------------------------------------------------------------------- 1 | !Build/ 2 | .last_cover_stats 3 | /META.yml 4 | /META.json 5 | /MYMETA.* 6 | *.o 7 | *.bs 8 | 9 | # Devel::Cover 10 | cover_db/ 11 | 12 | # Devel::NYTProf 13 | nytprof.out 14 | 15 | # Dizt::Zilla 16 | /.build/ 17 | 18 | # Module::Build 19 | _build/ 20 | Build 21 | Build.bat 22 | 23 | # Module::Install 24 | inc/ 25 | 26 | # ExtUitls::MakeMaker 27 | /blib/ 28 | /_eumm/ 29 | /*.gz 30 | /Makefile 31 | /Makefile.old 32 | /MANIFEST.bak 33 | /pm_to_blib 34 | /*.zip 35 | -------------------------------------------------------------------------------- /Changes: -------------------------------------------------------------------------------- 1 | Revision history for Perl extension Lingua::CU. 2 | 3 | 0.04 Wed Dec 30 11:02:30 2015 4 | - adding scripts, rewrite of numeric methods 5 | 6 | 0.03 Tue Feb 10 16:19:40 2015 7 | - updated Collation algorithm based on changes in Roadmap 8 | 9 | 0.02 Sun Oct 19 14:51:30 2014 10 | - updated Collation algorithm to support Unicode 8.0 upcoming updates 11 | 12 | 0.01 Tue Aug 20 21:01:20 2013 13 | - original version; created by h2xs 1.23 with options 14 | -AXc -n Lingua::CU 15 | 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Preamble 2 | 3 | The intent of this document is to state the conditions under which a Package may be copied, such that the Copyright Holder maintains some semblance of artistic control over the development of the package, while giving the users of the package the right to use and distribute the Package in a more-or-less customary fashion, plus the right to make reasonable modifications. 4 | 5 | Definitions: 6 | 7 | "Package" refers to the collection of files distributed by the Copyright Holder, and derivatives of that collection of files created through textual modification. 8 | 9 | "Standard Version" refers to such a Package if it has not been modified, or has been modified in accordance with the wishes of the Copyright Holder as specified below. 10 | 11 | "Copyright Holder" is whoever is named in the copyright or copyrights for the package. 12 | 13 | "You" is you, if you're thinking about copying or distributing this Package. 14 | 15 | "Reasonable copying fee" is whatever you can justify on the basis of media cost, duplication charges, time of people involved, and so on. (You will not be required to justify it to the Copyright Holder, but only to the computing community at large as a market that must bear the fee.) 16 | 17 | "Freely Available" means that no fee is charged for the item itself, though there may be fees involved in handling the item. It also means that recipients of the item may redistribute it under the same conditions they received it. 18 | 19 | 1. You may make and give away verbatim copies of the source form of the Standard Version of this Package without restriction, provided that you duplicate all of the original copyright notices and associated disclaimers. 20 | 21 | 2. You may apply bug fixes, portability fixes and other modifications derived from the Public Domain or from the Copyright Holder. A Package modified in such a way shall still be considered the Standard Version. 22 | 23 | 3. You may otherwise modify your copy of this Package in any way, provided that you insert a prominent notice in each changed file stating how and when you changed that file, and provided that you do at least ONE of the following: 24 | 25 | a) place your modifications in the Public Domain or otherwise make them Freely Available, such as by posting said modifications to Usenet or an equivalent medium, or placing the modifications on a major archive site such as uunet.uu.net, or by allowing the Copyright Holder to include your modifications in the Standard Version of the Package. 26 | b) use the modified Package only within your corporation or organization. 27 | c) rename any non-standard executables so the names do not conflict with standard executables, which must also be provided, and provide a separate manual page for each non-standard executable that clearly documents how it differs from the Standard Version. 28 | d) make other distribution arrangements with the Copyright Holder. 29 | 30 | 4.You may distribute the programs of this Package in object code or executable form, provided that you do at least ONE of the following: 31 | 32 | a) distribute a Standard Version of the executables and library files, together with instructions (in the manual page or equivalent) on where to get the Standard Version. 33 | b) accompany the distribution with the machine-readable source of the Package with your modifications. 34 | c) give non-standard executables non-standard names, and clearly document the differences in manual pages (or equivalent), together with instructions on where to get the Standard Version. 35 | d) make other distribution arrangements with the Copyright Holder. 36 | 37 | 5. You may charge a reasonable copying fee for any distribution of this Package. You may charge any fee you choose for support of this Package. You may not charge a fee for this Package itself. However, you may distribute this Package in aggregate with other (possibly commercial) programs as part of a larger (possibly commercial) software distribution provided that you do not advertise this Package as a product of your own. You may embed this Package's interpreter within an executable of yours (by linking); this shall be construed as a mere form of aggregation, provided that the complete Standard Version of the interpreter is so embedded. 38 | 39 | 6. The scripts and library files supplied as input to or produced as output from the programs of this Package do not automatically fall under the copyright of this Package, but belong to whoever generated them, and may be sold commercially, and may be aggregated with this Package. If such scripts or library files are aggregated with this Package via the so-called "undump" or "unexec" methods of producing a binary executable image, then distribution of such an image shall neither be construed as a distribution of this Package nor shall it fall under the restrictions of Paragraphs 3 and 4, provided that you do not represent such an executable image as a Standard Version of this Package. 40 | 41 | 7. C subroutines (or comparably compiled subroutines in other languages) supplied by you and linked into this Package in order to emulate subroutines and variables of the language defined by this Package shall not be considered part of this Package, but are the equivalent of input as in Paragraph 6, provided these subroutines do not change the language in any way that would cause it to fail the regression tests for the language. 42 | 43 | 8. Aggregation of this Package with a commercial distribution is always permitted provided that the use of this Package is embedded; that is, when no overt attempt is made to make this Package's interfaces visible to the end user of the commercial distribution. Such use shall not be construed as a distribution of this Package. 44 | 45 | 9. The name of the Copyright Holder may not be used to endorse or promote products derived from this software without specific prior written permission. 46 | 47 | 10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 48 | 49 | The End 50 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | Changes 2 | cu.txt 3 | LICENSE 4 | Makefile.PL 5 | MANIFEST 6 | mklocale.pl 7 | README 8 | t/Lingua-CU.t 9 | lib/Lingua/CU.pm 10 | lib/Lingua/CU/Collate.pm 11 | lib/Lingua/CU/Hyphenate.pm 12 | lib/Lingua/CU/Scripts.pm 13 | lib/Lingua/CU/Scripts/hipequivs 14 | lib/Lingua/CU/Scripts/hipequivs_Cyrl 15 | lib/Lingua/CU/Scripts/hipequivs_Latn 16 | lib/Lingua/CU/Scripts/hipequivs_Zf 17 | lib/Lingua/CU/Scripts/HIP.pm 18 | lib/Lingua/CU/Scripts/ucsequivs 19 | lib/Lingua/CU/Scripts/ostrogequivs 20 | lib/Lingua/CU/Scripts/UCS.pm 21 | -------------------------------------------------------------------------------- /Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.014002; 2 | use ExtUtils::MakeMaker; 3 | 4 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 5 | # the contents of the Makefile that is written. 6 | 7 | my @scripts = grep {-f && !m/\./o && !m/~$/o } glob("scripts/*"); 8 | 9 | ## generate the cu.pl file 10 | print "Generating collation tailoring for Church Slavonic...\n"; 11 | unless (my $return = do './mklocale.pl') { 12 | die "Couldn't create collation tailoring: $@" if $@; 13 | die "couldn't do mklocale.pl: $!" unless defined $return; 14 | die "couldn't run mklocale.pl" unless $return; 15 | } 16 | 17 | WriteMakefile( 18 | NAME => 'Lingua::CU', 19 | clean => { FILES => "lib/Lingua/CU/cu.pl" }, 20 | VERSION_FROM => 'lib/Lingua/CU.pm', # finds $VERSION 21 | PREREQ_PM => { 'Tie::IxHash' => 0 }, # TODO: remove dependency on Tie::IxHash 22 | EXE_FILES => \@scripts, 23 | ($] >= 5.005 ? ## Add these new keywords supported since 5.005 24 | (ABSTRACT_FROM => 'lib/Lingua/CU.pm', # retrieve abstract from module 25 | AUTHOR => 'Aleksandr Andreev ') : ()), 26 | ); 27 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Lingua-CU version 0.04 2 | ====================== 3 | 4 | Lingua::CU is a Perl module for working with Church Slavonic text. 5 | It provides the following features: 6 | 7 | * Resolve Church Slavonic abbreviations and _nomina sacra_ 8 | * Convert between Cyrillic numerals and ASCII digits 9 | * Convert Church Slavonic text to Russian characters (not yet supported) 10 | * Romanize (transliterate to Latin) Church Slavonic text (not yet supported) 11 | * Convert between Cyrillic and Glagolitic text (not yet supported) 12 | * Convert between Unicode and legacy UCS and HIP encodings 13 | * Sort Church Slavonic words using a tailoring of the DUCET 14 | * Perform stemming of Church Slavonic words (not yet supported) 15 | * Perform hyphenation of Church Slavonic words (work in progress) 16 | 17 | This program is ALPHA-PHASE software and is thus provided with ABSOLUTELY NO WARRANTY, 18 | not even the implied warranties of merchantability or fitness for a purpose. 19 | 20 | The development of this program is part of the Slavonic Computing Initiative (SCI) at the 21 | Ponomar Project. For more information, please visit http://www.ponomar.net/ 22 | 23 | INSTALLATION 24 | 25 | To install this module type the following: 26 | 27 | perl Makefile.PL 28 | make 29 | make test 30 | make install 31 | 32 | DEPENDENCIES 33 | 34 | This program requires Unicode support (Perl 5.8.1 or higher for sure) 35 | Unicode::Collate version 1.04 or newer is required (DUCET version 6.3.0) 36 | 37 | COPYRIGHT AND LICENCE 38 | 39 | Copyright (C) 2012-2015 by Aleksandr Andreev 40 | 41 | This program is part of the Ponomar Project and uses certain components 42 | development by Ponomar Project coauthors Yuri Shardt and Nikita Simmons. 43 | See http://www.ponomar.net/ for more information. 44 | 45 | This library is free software; you can redistribute it and/or modify 46 | it under the same terms as Perl itself, either Perl version 5.14.2 or, 47 | at your option, any later version of Perl 5 you may have available. 48 | 49 | 50 | -------------------------------------------------------------------------------- /cu.txt: -------------------------------------------------------------------------------- 1 | # Church Slavic sort order according to UTN #41: http://www.unicode.org/notes/tn41/ 2 | # Copyright 2018 Aleksandr Andreev and others 3 | # Slavonic Computing Initiative: http://sci.ponomar.net/ 4 | # 5 | # This code is FREE SOFTWARE and is licensed under the same terms as Perl itself 6 | upper 7 | backwards 8 | suppress 0400 040D 0419 0450 045D 0439 0407 0457 9 | 0487;[.0000.0001.0000] 10 | A67C;<0487> 11 | A67E; 12 | 0485;<0487>++2 13 | 0486;<0485>++2 14 | 0301;<0486>++2 15 | 0300;<0301>++2 16 | 0311;<0300>++2 17 | 0483;<0311>++2 18 | 0306;<0483>++2 19 | 0308;<0306>++2 20 | 030F;<0308> 21 | 2DF6;<0308>++2 22 | 2DE0;<2DF6>++2 23 | 2DE1;<2DE0>++2 24 | 2DE2;<2DE1>++2 25 | 2DE3;<2DE2>++2 26 | 2DF7;<2DE3>++2 27 | A674;<2DF7>++2 28 | 2DE4;++2 29 | 2DE5;<2DE4>++2 30 | A675;<2DE5>++2 31 | A676;++2 32 | 2DE6;++2 33 | 2DE7;<2DE6>++2 34 | 2DE8;<2DE7>++2 35 | 2DE9;<2DE8>++2 36 | 2DEA;<2DE9>++2 37 | A67B;<2DEA>++2 38 | 2DEB;++2 39 | 2DEC;<2DEB>++2 40 | 2DED;<2DEC>++2 41 | 2DEE;<2DED>++2 42 | 2DF9;<2DEE>++2 43 | A677;<2DF9>++2 44 | A69E;++2 45 | 2DEF;++2 46 | 2DF0;<2DEF>++2 47 | 2DF1;<2DF0>++2 48 | 2DF2;<2DF1>++2 49 | 2DF3;<2DF2>++2 50 | 033E;<2DF3>++2 51 | A678;<033E> 52 | 2E2F; 53 | A679;<033E>++2 54 | A67F;++2 55 | A67D; 56 | A67A; 57 | 2DFA;++2 58 | 2DFB;<2DFA>++2 59 | 2DFE;<2DFB>++2 60 | 2DFC;<2DFE>++2 61 | 2DFD;<2DFC>++2 62 | 2DF4;<2DFD>++2 63 | 2DF5;<2DED><2DEE> 64 | 1C81;<0434> 65 | 0454;<0415>+++2 66 | 0404;<0454>+++2 67 | 0437;<0455>+2 68 | A641;<0437> 69 | 0417;<0437>+++2 70 | A640;<0417> 71 | 0456;<0418>+2 72 | 0406;<0456>+++2 73 | 047B;<041D>+2 74 | 047A;<047B>+++2 75 | 043E;<047A>+++2 76 | 1C82;<043E> 77 | 041E;<043E>+++2 78 | 0461;<041E>+++2 79 | 0460;<0461>+++2 80 | A64D;<0460>+++2 81 | A64C;+++2 82 | 047C;<0486><0311> 83 | 047D;<0486><0311> 84 | 047E;<0460><0442> 85 | 047F;<0461><0442> 86 | 0479;<0422>+2 87 | 043E 0443;<0479> 88 | 1C82 0443;<0479> 89 | 0478;<0479>+++2 90 | 041E 0443;<0478> 91 | 041E 0423;<0478> 92 | A64B;<0478>+++2 93 | A64A;+++2 94 | 0443;+++2 95 | 0423;<0443>+++2 96 | 0463;<042D>+2 97 | 0462;<0463>+++2 98 | 046B;<044E>+2 99 | 046A;<046B>+++2 100 | 044F;<046A>+2 101 | 042F;<044F>+++2 102 | A657;<042F>+2 103 | A656;+++2 104 | 0467;+++2 105 | 0466;<0467>+++2 106 | 046F;<0466>+2 107 | 046E;<046F>+++2 108 | 0471;<046E>+2 109 | 0470;<0471>+++2 110 | 0473;<0470>+2 111 | 0472;<0473>+++2 112 | 0475;<0472>+2 113 | 0474;<0475>+++2 114 | -------------------------------------------------------------------------------- /lib/Lingua/CU.pm: -------------------------------------------------------------------------------- 1 | package Lingua::CU; 2 | 3 | require 5.014002; 4 | use strict; 5 | use warnings; 6 | use utf8; 7 | 8 | require Exporter; 9 | require Carp; 10 | use Unicode::Normalize qw( NFD NFC ); 11 | 12 | our @ISA = qw(Exporter); 13 | 14 | # Items to export into callers namespace by default. Note: do not export 15 | # names by default without a very good reason. Use EXPORT_OK instead. 16 | # Do not simply export all your public functions/methods/constants. 17 | 18 | # This allows declaration use Lingua::CU ':all'; 19 | # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK 20 | # will save memory. 21 | our %EXPORT_TAGS = ( 'all' => [ qw( 22 | asciiToCyrillic cyrillicToAscii resolve cu2ru hip2unicode zf2unicode 23 | ) ] ); 24 | 25 | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); 26 | 27 | our @EXPORT = qw( 28 | 29 | ); 30 | 31 | our $VERSION = '0.04'; 32 | my %definitions; 33 | my %digits = ( 34 | '' => 0, 35 | 'а' => 1, 36 | 'в' => 2, 37 | 'г' => 3, 38 | 'д' => 4, 39 | 'є' => 5, 40 | 'ѕ' => 6, 41 | 'з' => 7, 42 | 'и' => 8, 43 | 'ѳ' => 9, 44 | 'і' => 10, 45 | 'к' => 20, 46 | 'л' => 30, 47 | 'м' => 40, 48 | 'н' => 50, 49 | 'ѯ' => 60, 50 | 'о' => 70, 51 | 'п' => 80, 52 | 'ч' => 90, 53 | 'р' => 100, 54 | 'с' => 200, 55 | 'т' => 300, 56 | 'у' => 400, 57 | 'ф' => 500, 58 | 'х' => 600, 59 | 'ѱ' => 700, 60 | 'ѿ' => 800, 61 | 'ц' => 900); 62 | my @letters = qw/а б в г д е є ж ѕ з ꙁ и і ї й к л м н о ѻ п р с т у ꙋ ф х ѡ ѿ ѽ ꙍ ц ч ш щ ъ ы ь ѣ ю ѧ ѫ ꙗ ѯ ѱ ѳ ѵ ѷ/; 63 | my @LETTERS = qw/А Б В Г Д Е Є Ж Ѕ З Ꙁ И І Ї Й К Л М Н О Ѻ П Р С Т У Ꙋ Ф Х Ѡ Ѿ Ѽ Ꙍ Ц Ч Ш Щ Ъ Ы Ь Ѣ Ю Ѧ Ѫ Ꙗ Ѯ Ѱ Ѳ Ѵ Ѷ/; 64 | 65 | my %resolver = ( 66 | chr(0x0405) => "З", # capital Zelo 67 | chr(0x0404) => "Е", # capital wide Est 68 | chr(0x0454) => "е", # lowercase wide est 69 | chr(0x0455) => "з", # lowercase zelo 70 | chr(0x0456) . chr(0x0308) => chr(0x0456), # double-dotted i 71 | chr(0x0457) => chr(0x0456), 72 | chr(0x0460) => "О", # capital Omega 73 | chr(0x0461) => "о", # lowercase omega 74 | chr(0x0466) => "Я", # capital small Yus 75 | chr(0x0467) => "я", # lowercase small yus 76 | chr(0x046E) => "Кс", # capital Xi 77 | chr(0x046F) => "кс", #lowercase xi 78 | chr(0x0470) => "Пс", # capital Psi 79 | chr(0x0471) => "пс", # lowercase psi 80 | chr(0x0472) => "Ф", # capital Theta 81 | chr(0x0473) => "ф", # lowercase theta 82 | chr(0x0474) => "В", # izhitsa 83 | chr(0x0475) => "в", # izhitsa 84 | # chr(0x041E) . chr(0x0443) => "У", # Ou 85 | # chr(0x1C82) . chr(0x0443) => "у", # ou 86 | chr(0x047A) => "О", # wide O 87 | chr(0x047B) => "о", # wide o 88 | chr(0x047C) => "О", # omega with great apostrophe 89 | chr(0x047D) => "о", # omega with great apostrophe 90 | chr(0x047E) => "Отъ", # Ot 91 | chr(0x047F) => "отъ", # ot 92 | chr(0xA64A) => "У", # Uk 93 | chr(0xA64B) => "у", # uk 94 | chr(0xA64C) => "О", # wide omega 95 | chr(0xA64D) => "о", # wide omega 96 | chr(0xA656) => "Я", # Ioted a 97 | chr(0xA657) => "я" # ioted a 98 | ); 99 | 100 | our @explodable = qw( 0x0400 0x0401 0x0403 0x0407 0x040c 0x040d 0x040e 0x0419 0x0439 0x0450 0x0451 0x0453 0x0457 0x045c 0x045d 0x045e 0x0476 0x0477 0x0479 ); 101 | 102 | INIT { 103 | # load the Titlo resolution Data into memory 104 | while () { 105 | next if (substr($_, 1, 1) eq "#"); 106 | s/\r?\n//g; 107 | next unless (length $_); 108 | my @parts = split /\t/; 109 | $parts[0] =~ s/\./\\b/g; 110 | $definitions{$parts[0]} = $parts[1]; 111 | } 112 | close DATA; 113 | } 114 | 115 | END { 116 | undef %definitions; 117 | } 118 | 119 | # Preloaded methods go here. 120 | sub resolve { 121 | my $text = shift; 122 | 123 | my $what = join("|", keys %definitions); 124 | $text =~ s/($what)/$definitions{$1}/g; 125 | return $text; 126 | } 127 | 128 | sub cyrillicToAscii { 129 | my $number = shift; 130 | 131 | my $o = join ('|', grep { $digits{$_} < 10 } keys %digits); 132 | my $t = join ('|', grep { $digits{$_} >= 10 && $digits{$_} < 100 } keys %digits); 133 | my $h = join ('|', grep { $digits{$_} >= 100 } keys %digits); 134 | 135 | # remove all occurences of titlo 136 | $number =~ s/\x{0483}//g; 137 | my $result = 0; 138 | if (index($number, " ") != -1) { 139 | my $umpteen = substr($number, 0, index($number, " ")); 140 | $umpteen =~ s/҂//g; 141 | if ($umpteen =~ /^([$h]?)([клмнѯопч]?)([$o]?)$/) { 142 | $umpteen =~ s/([$h]?)([клмнѯопч]?)([$o]?)/$digits{$1} + $digits{$2} + $digits{$3}/e; 143 | } elsif ($umpteen =~ /^([$h]?)([$o]?)(і)$/) { 144 | $umpteen =~ s/([$h]?)([$o]?)(і)/$digits{$1} + $digits{$2} + $digits{$3}/e; 145 | } else { 146 | Carp::carp (__PACKAGE__ . "::cyrillicToAscii ($number) - Error: $number is not a valid Cyrillic number"); 147 | } 148 | $result += $umpteen * 1000; 149 | $number = substr($number, index($number, " ") + 1); 150 | } 151 | 152 | if ($number =~ /^(?:҂([$h]))*(?:҂([$t]))*(?:҂([$o]))*([$h]?)([клмнѯопч]?)([$o]?)$/) { 153 | $number =~ s/(?:҂([$h]))*(?:҂([$t]))*(?:҂([$o]))*([$h]?)([клмнѯопч]?)([$o]?)/1000 * ($digits{$1||''} + $digits{$2||''} + $digits{$3||''}) + $digits{$4} + $digits{$5} + $digits{$6}/e; 154 | } elsif ($number =~ /^(?:҂([$h]))*(?:҂([$t]))*(?:҂([$o]))*([$h]?)([$o]?)(і)$/) { 155 | $number =~ s/(?:҂([$h]))*(?:҂([$t]))*(?:҂([$o]))*([$h]?)([$o]?)(і)/1000 * ($digits{$1||''} + $digits{$2||''} + $digits{$3||''}) + $digits{$4} + $digits{$5} + $digits{$6}/e; 156 | } else { 157 | Carp::carp (__PACKAGE__ . "::cyrillicToAscii ($number) - Error: $number is not a valid Cyrillic number"); 158 | } 159 | $result += $number; #$ * 1000 ** (scalar(@parts) - 1 - $i); 160 | return $result; 161 | 162 | } 163 | 164 | sub asciiToCyrillic { 165 | my $number = shift; 166 | my $omitTitlo = shift; 167 | 168 | # check if $number is in fact numeric 169 | unless ($number =~ /^[+-]?\d+(\.\d+)?$/) { 170 | Carp::carp (__PACKAGE__ . "::asciiToCyrillic ($number) - Error: $number is not a valid ASCII digit"); 171 | } 172 | 173 | my $output = ""; 174 | my %numerals = reverse %digits; 175 | if ($number >= 1000) { 176 | $output .= "҂"; 177 | $output .= asciiToCyrillic($number / 1000, 1); 178 | $output .= ' ' if($number > 10000 && ($number - 10000) % 1000 != 0); 179 | $omitTitlo = $number % 1000 == 0; 180 | } 181 | 182 | my $num = $number % 1000; 183 | foreach my $i (sort { $b <=> $a } keys %numerals) { 184 | last if ($num <= 0); 185 | if ($num < 20 && $num > 10) { 186 | $num -= 10; 187 | $output .= asciiToCyrillic($num, 1) . $numerals{10}; 188 | last; 189 | } 190 | 191 | if ($i <= $num) { 192 | $num -= $i; 193 | $output .= $numerals{$i}; 194 | } 195 | } 196 | 197 | # add the titlo character 198 | unless ($omitTitlo) { 199 | if (length($output) == 1) { 200 | $output .= chr(0x0483); 201 | } elsif (length($output) == 2) { 202 | if ($number > 800 && $number < 900) { 203 | $output .= chr(0x0483); 204 | } else { 205 | substr($output, 1, 0, chr(0x0483)); 206 | } 207 | } else { 208 | if (index($output, " ") == length($output) - 2) { 209 | $output .= chr(0x0483); 210 | } else { 211 | substr($output, length($output) - 1, 0, chr(0x0483)); 212 | } 213 | } 214 | } 215 | return $output; 216 | } 217 | 218 | sub isNumericCu { 219 | my $text = shift; 220 | my $o = join ('|', grep { $digits{$_} < 10 } keys %digits); 221 | my $t = join ('|', grep { $digits{$_} >= 10 && $digits{$_} < 100 } keys %digits); 222 | my $h = join ('|', grep { $digits{$_} >= 100 } keys %digits); 223 | return $text =~ /^(҂+[$h])?(҂+[$t])?(҂+[$o])?[$h]?([клмнѯопч]?[$o]?|[$o]?і)$/; 224 | } 225 | 226 | sub cu2ru { 227 | my $text = shift; 228 | my $params = shift; # params: noaccent, modernrules 229 | 230 | unless (length $text) { 231 | Carp::carp (__PACKAGE__ . "::cu2ru ($text) - Error: argument is empty"); 232 | } 233 | 234 | study $text; 235 | $text =~ s/\r?\n//g; 236 | # resolve titli via the dictionary 237 | unless (exists $params->{skiptitlos}) { 238 | $text = resolve $text; 239 | } 240 | 241 | ### STEP ONE: CONVERT YEROK TO HARD SIGN 242 | $text =~ s/\x{033E}|\x{2E2F}/ъ/g; 243 | 244 | ## STEP TWO: CONVERT GRAVE AND CIRCUMFLEX ACCENTS TO ACUTE 245 | $text =~ s/\x{0300}|\x{0311}/\x{0301}/g; 246 | 247 | ### STEP THREE: CONVERT IZHITSA 248 | $text =~ s/\x{0474}([\x{0486}\x{0301}])/И$1/g; 249 | $text =~ s/\x{0475}([\x{0486}\x{0301}])/и$1/g; 250 | 251 | ## STEP FOUR: REMOVE ALL BREATHING MARKS AND DOUBLE DOTS 252 | $text =~ s/\x{0486}|\x{A67C}|\x{A67E}|\x{0308}//g; 253 | 254 | ## STEP FIVE: CHARACTER INITIALIZATION 255 | # RESOLVE DIAGRAPH OU TO U 256 | $text =~ s/ᲂу|ѹ/у/g; 257 | $text =~ s/Оу|Ѹ/У/g; 258 | 259 | # RESOLVE ALL FORMS OF IZHITSA WITH ACCENT 260 | $text =~ s/\x{0474}\x{0301}/И\x{0301}/g; 261 | $text =~ s/\x{0475}\x{0301}/и\x{0301}/g; 262 | $text =~ s/\x{0474}\x{030F}/И/g; 263 | $text =~ s/\x{0475}\x{030F}/и/g; 264 | 265 | # REMOVE ALL VARIATION SELECTORS 266 | $text =~ s/[\x{FE00}\x{FE01}]//g; 267 | 268 | # convert semicolon to question mark 269 | $text =~ s/;/\?/g; 270 | 271 | ### AT THIS POINT, ATTEMPT TO RESOLVE ANY NUMERALS 272 | # XXX: WE CAN ONLY CONVERT IN THIS WAY NUMERALS BELOW ONE THOUSAND 273 | my $who = join ("|", keys %digits); 274 | 275 | $text =~ s/([$who][$who][\x{0483}][$who])/&cyrillicToAscii($1)/ge; 276 | $text =~ s/([$who][\x{0483}][$who])/&cyrillicToAscii($1)/ge; 277 | $text =~ s/([$who][$who][\x{0483}])/&cyrillicToAscii($1)/ge; 278 | $text =~ s/([$who][\x{0483}])/&cyrillicToAscii($1)/ge; 279 | 280 | $who = join("|", keys %resolver); 281 | ## STEP SIX: RESOLVE LETTERS PECULIAR TO CHURCH SLAVONIC 282 | $text =~ s/($who)/$resolver{$1}/g; 283 | 284 | ## STEP SEVEN: STANDARDIZE RUSSIAN-STYLE SPELLING 285 | # ъи => ы 286 | $text =~ s/ъи/ы/g; 287 | # жы, шы, щы => и 288 | $text =~ s/([жшщ])ы/$1и/g; 289 | # жя, шя, щя, чя => а 290 | $text =~ s/([жшщч])я/$1а/g; 291 | # отъ[consonant|hard vowel] -> от$1 292 | $text =~ s/([оО])тъ([абвгджзклмнопрстуфхцчшщ])/$1т$2/g; 293 | ## other normalizations may be performed here, if desired 294 | 295 | if (exists $params->{modernrules}) { 296 | ### STEP EIGHT: ADDITIONAL CONVERSIONS FOR MODERN ORTHOGRAPHY 297 | # GET RID OF THE DECIMAL I 298 | $text =~ s/\x{0406}/И/g; 299 | $text =~ s/\x{0456}/и/g; 300 | 301 | ## GET RID OF THE YAT 302 | $text =~ s/\x{0462}/Е/g; 303 | $text =~ s/\x{0463}/е/g; 304 | 305 | ## GET RID OF ALL TRAILING HARD SIGNS 306 | $text =~ s/ъ\b|Ъ\b//g; 307 | } 308 | 309 | if (exists $params->{noaccents}) { 310 | ### STEP NINE: IF DESIRED, REMOVE STRESS MARK (ACUTE ACCENT) 311 | $text =~ s/\x{0301}//g; 312 | } 313 | 314 | return $text; 315 | } 316 | 317 | sub hip2unicode { 318 | my $text = shift; 319 | my $script = shift || 'Cyrs'; 320 | use Lingua::CU::Scripts::HIP; 321 | return $script eq 'Cyrs' ? Lingua::CU::Scripts::HIP::convert($text) : 322 | $script eq 'Cyrl' ? Lingua::CU::Scripts::HIP::convert_Cyrl($text) : 323 | $script eq 'Latn' ? Lingua::CU::Scripts::HIP::convert_Latn($text) : 324 | $text; 325 | } 326 | 327 | sub zf2unicode { 328 | my $text = shift; 329 | use Lingua::CU::Scripts::HIP; 330 | return Lingua::CU::Scripts::HIP::convert_Zf($text); 331 | } 332 | 333 | sub explodeNFD { 334 | my $text = shift; 335 | 336 | $text = NFC( $text ); 337 | my %explodeMap = map { chr(hex( $_ )) => NFD(chr(hex( $_ ))) } @explodable; 338 | $explodeMap{chr(hex('0x047d'))} = chr(hex('0xa64d')) . chr(hex('0x0486')) . chr(hex('0x0311')); # broad omega with veliky apostrof (Not in UAX 15) 339 | $explodeMap{chr(hex('0x0479'))} = chr(hex('0x1c82')) . chr(hex('0x0443')); # digraph uk (not in UAX 15) 340 | my $explodeRex = join('|', keys %explodeMap); 341 | unless ($text =~ m/$explodeRex/) { 342 | return $text; 343 | } 344 | 345 | my @array; 346 | my ($first, $rest) = split(//, $text, 2); 347 | push @array, map { $first . $_ } explodeNFD($rest); 348 | if ($first =~ m/$explodeRex/) { 349 | push @array, map { $explodeMap{$first} . $_ } explodeNFD($rest); 350 | } 351 | return @array; 352 | } 353 | 354 | 1; 355 | 356 | =pod 357 | 358 | =encoding utf8 359 | 360 | =head1 NAME 361 | 362 | Lingua::CU - Perl extension for working with Church Slavonic text in Unicode 363 | 364 | =head1 SYNOPSIS 365 | 366 | use Lingua::CU ':all'; 367 | asciiToCyrillic (21); # returns к҃а 368 | cyrillicToAscii ("к҃а"); # returns 21 369 | resolve ("ст҃ъ"); # returns свѧ́тъ 370 | cu2ru ("ст҃ъ"); # returns свя́тъ 371 | cu2ru ("ст҃ъ", { noaccents => 1, modernrules => 1 }); # returns свят 372 | 373 | =head1 DESCRIPTION 374 | 375 | Lingua::CU is a module for performing various operations with Church Slavonic texts. 376 | 377 | It includes the following capabilities: 378 | 379 | =over 4 380 | 381 | =item Resolve Church Slavonic abbreviations and I 382 | 383 | =item Convert between Cyrillic numerals and Ascii digits 384 | 385 | =item Convert Church Slavonic text to Russian characters (both traditional and reformed orthography) 386 | 387 | =item Romanize (transliterate to Latin) Church Slavonic text using various systems 388 | 389 | =item Convert between Unicode and legacy UCS and HIP encodings 390 | 391 | =item Sort Church Slavonic words using a tailoring of the DUCET 392 | 393 | =item Perform hyphenation of Church Slavonic words (TODO) 394 | 395 | =back 396 | 397 | All text supplied to this library must be encoded in UTF-8 and, unless otherwise specified, is assumed to be 398 | in Unicode. For more on Church Slavonic using Unicode, please see 399 | Unicode Technical Note #41, I 400 | available at L. 401 | 402 | This program is ALPHA STAGE SOFTWARE and is provided with ABSOLUTELY NO WARRANTY of any kind, 403 | express or implied, not even the implied warranties of merchantability, fitness for a purpose, or non-infringement. 404 | 405 | =head2 EXPORT 406 | 407 | No methods are exported by default. 408 | 409 | The following methods may be exported if specified explicitly: 410 | C C C C. 411 | 412 | You may also export all of the above methods by writing: C. 413 | 414 | =head1 METHODS 415 | 416 | =head2 asciiToCyrillic 417 | 418 | Usage: C 419 | 420 | Takes a number in ASCII digits and returns the corresponding Cyrillic numeral. 421 | Croaks if C<$number> is not numeric. 422 | 423 | Example: C returns C<рк҃а>. 424 | 425 | =head2 cyrillicToAscii 426 | 427 | Usage: C 428 | 429 | Takes a Cyrillic numeral and returns the corresponding ASCII digits. 430 | Carps if C<$numeral> is not a well-formatted Slavonic number. 431 | 432 | Example: C returns C<121>. 433 | 434 | =head2 isNumericCu 435 | 436 | Usage: C 437 | 438 | Given UTF-8 encoded text C<$text>, return C if C<$text> is a 439 | well-formatted Cyrillic numeral and C otherwise. 440 | 441 | Example: C returns C. 442 | 443 | =head2 resolve 444 | 445 | Usage: C 446 | 447 | Takes a word that is written with a titlo or lettered titlo (as an abbreviation or I) 448 | and writes it out in full, resolving the abbreviation. 449 | 450 | Bugs: correct placement of stress marks and capitalization are not guaranteed. 451 | Titlo resolution relies on a list that can still be improved. 452 | 453 | Warning: the Slavonic word B<сн҃а> could both be an abbreviation for Сы́на and a numeral (251). Thus, 454 | C will return C<Сы́на> but C will return C<251>. 455 | 456 | =head2 cu2ru 457 | 458 | Usage: C 459 | 460 | Takes well-formatted Church Slavonic C<$text> and transforms it into Russian (civil) orthography. 461 | The following operations are performed: 462 | 463 | =over 4 464 | 465 | =item Titli and lettered titli are resolved (if the C parameter is not zero, this step is skipped; 466 | this is only useful if you are converting text known to have no abbreviations and wish to save time or 467 | if you are writing your own titlo processor for non standard text like text with XML markup or pre-Nikonian editions) 468 | 469 | =item Cyrillic numerals are resolved to Ascii numbers (but see note above concerning B<сн҃а>) 470 | 471 | =item Stress marks are transformed to the acute accent (U+0301) and all other diacritical marks are removed 472 | 473 | =item Letters that do not occur in Russian are transformed into their Russian analogs (e.g., ѧ is transformed to я) 474 | 475 | =item Some spelling is normalized to agree with common Russian rules (e.g., шы is transformed to ши) 476 | 477 | =item If optional parameter C is not zero, all stress marks are removed; otherwise, 478 | stress indications remain in the text, but only as the acute accent (U+0301) 479 | 480 | =item If optional parameter C is not zero, the text is further simplified 481 | into modern Russian orthography (that means that і is resolved to и, ѣ is resolved to е, 482 | and trailing ъ is removed); otherwise, traditional (pre-1918) orthography is assumed. 483 | 484 | =back 485 | 486 | =head2 hip2unicode 487 | 488 | Usage: C 489 | 490 | Takes C<$text> encoded in the legacy HyperInvariant Presentation (HIP) 491 | and returns its Unicode representation. 492 | For more on HIP, see L (in Russian). 493 | 494 | Optional parameter C<$script> is an ISO 15924 script code specifying the script 495 | of the HIP file (I -- Slavonic, I -- Civil Cyrillic, I -- Glagolitic, 496 | I -- Greek). If C<$script> is not specified, I is assumed. 497 | 498 | Text in C<$text> must be encoded in UTF-8. 499 | 500 | For converting entire files, see the command line I script provided by 501 | B. 502 | 503 | =head2 zf2unicode 504 | 505 | Usage: C 506 | 507 | Takes C<$text> encoded in the modified HyperInvariant Presentation (HIP) 508 | as used by the Znamenny Fund and returns its Unicode representation. 509 | For more on HIP, see L (in Russian). 510 | 511 | Text in C<$text> must be encoded in UTF-8. 512 | 513 | For converting entire files, see the command line I script provided by 514 | B. 515 | 516 | =head2 explodeNFD 517 | 518 | Usage: C 519 | 520 | Takes C<$text> and generates all possible equivalent representations by 521 | substituting each expandable character with all possible NFD expansions. 522 | 523 | =head1 SEE ALSO 524 | 525 | This software is part of the Ponomar Project (see http://www.ponomar.net/​). 526 | 527 | Be sure to read Unicode Technical Note #21 I 528 | and to download the Unicode-compatible Ponomar Unicode font. 529 | 530 | Be sure to read as well C and C in the Perl manual. 531 | 532 | =head1 AUTHOR 533 | 534 | Aleksandr Andreev 535 | 536 | =head1 COPYRIGHT AND LICENSE 537 | 538 | Copyright (C) 2012-2015 by Aleksandr Andreev 539 | 540 | This library is free software; you can redistribute it and/or modify 541 | it under the same terms as Perl itself, either Perl version 5.14.2 or, 542 | at your option, any later version of Perl you may have available. 543 | 544 | =cut 545 | 546 | __DATA__ 547 | ## THIS IS A LIST OF RULES FOR THE RESOLUTION OF TITLI IN CHURCH SLAVONIC 548 | ## LINES BEGINNING WITH # ARE IGNORED 549 | ## COLUMNS ARE SEPARATED BY TAB 550 | гг҃л нгел 551 | пⷭ҇л по́стол 552 | пⷭ҇тол по́стол 553 | пⷭ҇тѡл по́стѡл 554 | пⷭ҇кп пи́скоп 555 | пⷭ҇коп пи́скоп 556 | гг҃єл нгєл 557 | бг҃а. Бо́га 558 | бг҃ови. Бо́гови 559 | .бг҃ома́т Богома́т 560 | .бг҃омлад Богомлад 561 | .бг҃омт҃ Богома́т 562 | бг҃ом бо́гом 563 | .бг҃оро́д Богоро́д 564 | .бг҃ꙋ. Бо́гу 565 | бг҃ъ Бо́гъ 566 | бг҃ бог 567 | Бг҃ Бог 568 | .бж҃е. Бо́же 569 | бж҃е боже 570 | Бж҃е Боже 571 | бж҃ї бо́жї 572 | Бж҃ї Бо́жї 573 | бжⷭ҇т боже́ст 574 | Бжⷭ҇т Боже́ст 575 | бз҃и бози 576 | Бз҃и Бози 577 | .бз҃ѣ Бозѣ 578 | бз҃ѣ бозѣ 579 | Бз҃ѣ Бозѣ 580 | блгⷣт благода́т 581 | Блгⷣт Благода́т 582 | блгⷭ҇в благослов 583 | Блгⷭ҇в Благослов 584 | .бл҃га. бла́га 585 | .бл҃ги. бла́ги 586 | .бл҃го. бла́го 587 | .бл҃гъ. бла́гъ 588 | бл҃г благ 589 | Бл҃г Благ 590 | бл҃ж блаж 591 | Бл҃ж Блаж 592 | бл҃з блаз 593 | Бл҃з Блаз 594 | .бцⷣ Богоро́диц 595 | бцⷣ богоро́диц 596 | Бцⷣ Богоро́диц 597 | бчⷣ богоро́дич 598 | Бчⷣ Богоро́дич 599 | .влⷣк Влады́к 600 | влⷣк влады́к 601 | Влⷣк Влады́к 602 | влⷣц влады́ц 603 | Влⷣц Влады́ц 604 | влⷣч влады́ч 605 | Влⷣч Влады́ч 606 | кр҃с кре́с 607 | кр҃л кре́сл 608 | кр҃ш креш 609 | крⷭ҇и креси 610 | крⷭ҇ кре́с 611 | гдⷭ҇а Го́спода 612 | гдⷭ҇ви Го́сподеви 613 | гдⷭ҇е Го́споде 614 | гдⷭ҇и Го́споди 615 | Гдⷭ҇ Го́спод 616 | гдⷭ҇и́н господи́н 617 | гдⷭ҇и Го́споди 618 | гдⷭ҇к госпо́дск 619 | гдⷭ҇н госпо́дн 620 | гдⷭ҇о господо 621 | гдⷭ҇р госуда́р 622 | гдⷭ҇с госпо́дс 623 | гдⷭ҇ꙋ. Го́сподꙋ 624 | гдⷭ҇ь. Госпо́дь 625 | гдⷭ҇ь госпо́дь 626 | гдⷭ҇ѣ Го́сподѣ 627 | глаⷡ҇ глава̀ 628 | гл҃а глаго́ла 629 | Гл҃а Глаго́ла 630 | гл҃г глаг 631 | Гл҃г Глаг 632 | гл҃е глаго́ле 633 | Гл҃е Глаго́ле 634 | гл҃и глаго́ли 635 | Гл҃и Глаго́ли 636 | гл҃ꙋ глаго́лꙋ 637 | Гл҃ꙋ Глаго́лꙋ 638 | гл҃ъ глаго́лъ 639 | Гл҃ъ Глаго́лъ 640 | гл҃ю глаго́лю 641 | Гл҃ю Глаго́лю 642 | гл҃ѧ глаго́лѧ 643 | Гл҃ѧ Глаго́лѧ 644 | глⷡ҇а глава̀ 645 | Глⷡ҇а Глава̀ 646 | гпⷭ҇ж госпож 647 | Гпⷭ҇ж Госпож 648 | дваⷤ два́жды 649 | Дваⷤ Два́жды 650 | .дв҃а Дѣ́ва 651 | дв҃а дѣ́ва 652 | дв҃д Дави́д 653 | Дв҃д Дави́д 654 | дв҃и́ дѣви́ 655 | дв҃и̑ дѣви̑ 656 | .дв҃о Дѣ́во 657 | дв҃о дѣ́во 658 | дв҃с дѣ́вс 659 | .дв҃ꙋ Дѣ́вꙋ 660 | дв҃ꙋ дѣ́вꙋ 661 | дв҃ц дѣ́виц 662 | дв҃ч дѣви́ч 663 | .дв҃ы Дѣ́вы 664 | дв҃ы дѣ́вы 665 | .дв҃ѣ Дѣ́вѣ 666 | дв҃ѣ дѣ́вѣ 667 | двⷭ҇т дѣ́вст 668 | Дв҃ Дѣ́в 669 | дс҃ѣ ду́сѣ 670 | Дс҃ Ду́с 671 | дх҃а ду́ха 672 | дх҃и ду́хи 673 | дх҃н духн 674 | дх҃о духо 675 | дх҃ѡ духѡ 676 | дх҃ꙋ ду́хꙋ 677 | дх҃ъ ду́хъ 678 | дх҃ѣ ду́хѣ 679 | Дх҃ Ду́х 680 | дш҃а душа 681 | дш҃е. ду́ше 682 | дш҃е душе 683 | дш҃и души 684 | дш҃ꙋ ду́шꙋ 685 | дш҃ы ду́шы 686 | Дш҃ Душ 687 | ѵⷢ҇л ѵа́нгел 688 | заⷱ҇ зача́ло 689 | Заⷱ҇ Зача́ло 690 | мⷬ҇к мѧрѣ́к 691 | рⷭ҇л русал 692 | і҆и҃л І҆сра́ил 693 | і҆и҃с І҆исꙋ́с 694 | кн҃г кнѧг 695 | кн҃же. кнѧ́же 696 | кн҃ж кнѧж 697 | кн҃з кнѧ́з 698 | Кн҃ Кнѧ́ 699 | кр҃с крес 700 | кр҃щ крещ 701 | крⷭ҇т крест 702 | Крⷭ҇т Крест 703 | мл҃т моли́т 704 | Мл҃т Моли́т 705 | млⷣн младе́н 706 | Млⷣн Младе́н 707 | млⷭ҇р милосѣ́р 708 | Млⷭ҇р Милосѣ́р 709 | млⷭ҇т ми́лост 710 | Млⷭ҇т Ми́лост 711 | мнⷭ҇т монаст 712 | Мнⷭ҇т Монаст 713 | мр҃і Марі 714 | мр҃ї Марї 715 | Мр҃і Марі 716 | Мр҃ї Марї 717 | мт҃е ма́те 718 | .мт҃и Ма́ти 719 | мт҃и ма́ти 720 | мт҃р ма́тер 721 | мт҃ь ма́ть 722 | Мт҃ Ма́т 723 | мцⷭ҇ мѣ́сѧц 724 | Мцⷭ҇ Мѣ́сѧц 725 | мч҃е му́че 726 | Мч҃е Му́че 727 | мч҃є му́чє 728 | Мч҃є Му́чє 729 | мч҃н му́чен 730 | Мч҃н Му́чен 731 | мчⷭ҇н мѣ́сѧчн 732 | м҃-ц четыредесѧ́тниц 733 | мⷭ҇ц мѣ́сѧц 734 | Мⷭ҇ц Мѣ́сѧц 735 | мⷭ҇ч мѣ́сѧч 736 | Мⷭ҇ч Mѣ́сѧч 737 | нб҃а не́ба 738 | нб҃е небе 739 | нб҃о не́бо 740 | нб҃с небе́с 741 | нб҃ꙋ не́бꙋ 742 | нб҃ѣ не́бѣ 743 | Нб҃ Не́б 744 | нбⷭ҇н небе́сн 745 | Нбⷭ҇н Небе́сн 746 | нлⷣ недѣ́л 747 | Нлⷣ Недѣ́л 748 | нн҃ѣ ны́нѣ 749 | Нн҃ѣ Ны́нѣ 750 | ѻ҆ц҃а̀ ѻ҆тца̀ 751 | Ѻ҆ц҃а̀ Ѻ҆тца̀ 752 | ѻ҆ц҃а́ ѻ҆тца́ 753 | Ѻ҆ц҃а́ Ѻ҆тца́ 754 | ѻ҆ц҃е́ ѻ҆тце́ 755 | ѻ҆ц҃е́ ѻ҆тце́ 756 | Ѻ҆ц҃е́ Ѻ҆тце́ 757 | ѻ҆ц҃ꙋ̀ ѻ҆тцꙋ̀ 758 | Ѻ҆ц҃ꙋ̀ Ѻ҆тцꙋ̀ 759 | ѻ҆ц҃ꙋ́ ѻ҆тцꙋ́ 760 | Ѻ҆ц҃ꙋ́ Ѻ҆тцꙋ́ 761 | ѻ҆ц҃ъ ѻ҆те́цъ 762 | Ѻ҆ц҃ъ Ѻ҆те́цъ 763 | ѻ҆ц҃ы̀ ѻ҆тцы̀ 764 | Ѻ҆ц҃ы̀ Ѻ҆тцы̀ 765 | ѻ҆ц҃ѣ̀ ѻ҆тцѣ̀ 766 | Ѻ҆ц҃ѣ̀ Ѻ҆тцѣ̀ 767 | ѻ҆ч҃ес ѻ҆те́чес 768 | Ѻ҆ч҃ес Ѻ҆те́чес 769 | ѻ҆ч҃єс ѻ҆те́чєс 770 | Ѻ҆ч҃єс Ѻ҆те́чєс 771 | ѻ҆ч҃е ѻ҆́тче 772 | Ѻ҆ч҃е Ѻ҆тче 773 | ѻ҆ч҃с ѻ҆те́чес 774 | Ѻ҆ч҃с Ѻ҆те́чес 775 | ѻ҆ч҃ь ѻ҆те́чь 776 | Ѻ҆ч҃ь Ѻ҆те́чь 777 | ѻ҆́ч҃а ѻ҆́тча 778 | Ѻ҆́ч҃а Ѻ҆́тча 779 | ѻ҆́ч҃ес ѻ҆́течес 780 | Ѻ҆́ч҃ес Ѻ҆́течес 781 | ѻ҆́ч҃е ѻ҆́тче 782 | Ѻ҆́ч҃е Ѻ҆́тче 783 | Ѻ҆́ч҃е Ѻ҆́тче 784 | ѻ҆́ч҃и ѻ҆́тчи 785 | Ѻ҆́ч҃и Ѻ҆́тчи 786 | ѻ҆́ч҃ї ѻ҆́тчї 787 | Ѻ҆́ч҃ї Ѻ҆́тчї 788 | ѻ҆́ч҃ꙋ ѻ҆́тчꙋ 789 | Ѻ҆́ч҃ꙋ Ѻ҆́тчꙋ 790 | ѻ҆́ч҃ь ѻ҆́течь 791 | Ѻ҆́ч҃ь Ѻ҆́течь 792 | ѡⷮ ѿ 793 | ѡ҆сщ҃а́ ѡ҆свѧща́ 794 | Ѡ҆сщ҃а́ Ѡ҆свѧща́ 795 | ѡ҆чⷭ҇т ѡ҆чи́ст 796 | Ѡ҆чⷭ҇т Ѡ҆чи́ст 797 | ѻц҃а ѻтца 798 | Ѻц҃а Ѻтца 799 | ѻц҃є ѻтцє 800 | ѻ҆ц҃є́ ѻ҆тцє́ 801 | Ѻц҃є Ѻтцє 802 | ѻц҃ꙋ ѻтцꙋ 803 | Ѻц҃ꙋ Ѻтцꙋ 804 | ѻц҃ъ ѻтецъ 805 | Ѻц҃ъ Ѻтецъ 806 | ѻц҃ы ѻтцы 807 | Ѻц҃ы Ѻтцы 808 | ѻц҃ѣ ѻтцѣ 809 | Ѻц҃ѣ Ѻтцѣ 810 | ѻч҃е ѻтече 811 | Ѻч҃е Ѻтече 812 | ѻч҃є ѻтечє 813 | Ѻч҃є Ѻтечє 814 | ѻч҃и ѻтчи 815 | Ѻч҃и Ѻтчи 816 | ѻч҃ї ѻтчї 817 | Ѻч҃ї Ѻтчї 818 | ѻц҃ъ ѻтецъ 819 | Ѻц҃ъ Ѻтецъ 820 | пл҃т пло́т 821 | Пл҃т Пло́т 822 | поⷣ подо́бенъ 823 | Поⷣ Подо́бенъ 824 | првⷣ пра́вед 825 | Првⷣ Пра́вед 826 | пречⷭ҇т пречи́ст 827 | Пречⷭ҇т Пречи́ст 828 | мⷣр му́др 829 | прпⷣб преподо́б 830 | Прпⷣб Преподо́б 831 | пⷣб подо́б 832 | прпⷣн преподо́бн 833 | Прпⷣн Преподо́бн 834 | прⷣт предт 835 | Прⷣт Предт 836 | прⷪ҇рк проро́к 837 | Прⷪ҇рк Проро́к 838 | прⷪ҇р прор 839 | Прⷪ҇р Прор 840 | прⷭ҇н присн 841 | Прⷭ҇н Присн 842 | прⷭ҇т прест 843 | Прⷭ҇т Прест 844 | пѧⷦ҇ пѧто́къ 845 | Пѧⷦ҇ Пѧто́къ 846 | ржⷭ҇т рождест 847 | Ржⷭ҇т Рождест 848 | рожⷭ҇т рождест 849 | Рожⷭ҇т Рождест 850 | сл҃н со́лн 851 | Сл҃н Со́лн 852 | см҃рт сме́рт 853 | См҃рт Сме́рт 854 | сн҃а сы́на 855 | Сн҃а Сы́на 856 | сн҃е сы́не 857 | Сн҃е Сы́не 858 | сн҃є сы́нє 859 | Сн҃є Сы́нє 860 | сн҃о сыно 861 | Сн҃о Сыно 862 | сн҃ѡ сынѡ 863 | Сн҃ѡ Сынѡ 864 | сн҃ꙋ сы́нꙋ 865 | Сн҃ꙋ Сы́нꙋ 866 | сн҃ъ сы́нъ 867 | Сн҃ъ Сы́нъ 868 | сн҃ы сы́ны 869 | Сн҃ы Сы́ны 870 | сн҃ѣ сы́нѣ 871 | Сн҃ѣ Сы́нѣ 872 | сп҃са. Cпа́са 873 | сп҃са спаса 874 | сп҃се Cпа́се 875 | сп҃сѐ спасѐ 876 | сп҃се́ спасе́ 877 | сп҃сє́ спасє́ 878 | сп҃си спаси 879 | сп҃сл спасл 880 | сп҃со спасо 881 | сп҃сѡ спасѡ 882 | сп҃сс спа́сс 883 | сп҃ст спаст 884 | сп҃сꙋ спасꙋ 885 | сп҃сш спа́сш 886 | сп҃съ Спа́съ 887 | сп҃сы спасы 888 | сп҃сѣ спа́сѣ 889 | Сп҃с Спас 890 | спⷭ҇л спа́сл 891 | Cпⷭ҇л Cпа́сл 892 | спⷭ҇н спасе́н 893 | Cпⷭ҇н Cпасе́н 894 | спⷭ҇т спаст 895 | Cпⷭ҇т Cпаст 896 | спⷭ҇ш спа́сш 897 | Cпⷭ҇ш Cпа́сш 898 | срⷣц се́рдц 899 | Срⷣц Се́рдц 900 | стрⷭ҇ти стра́сти 901 | стрⷭ҇тї стра́стї 902 | стрⷭ҇ть стра́сть 903 | стрⷭ҇т страст 904 | Стрⷭ҇т Стра́ст 905 | ст҃а. свѧ́та 906 | ст҃а свѧта 907 | ст҃е свѧ́те 908 | ст҃и свѧти 909 | ст҃і свѧті 910 | ст҃л свѧти́тел 911 | ст҃о свѧ́то 912 | ст҃ѡ свѧ́тѡ 913 | ст҃ꙋ свѧ́тꙋ 914 | ст҃ъ свѧ́тъ 915 | ст҃ы свѧты 916 | ст҃ѣ свѧтѣ 917 | ст҃ѧ свѧтѧ 918 | Ст҃ Свѧт 919 | сꙋⷠ҇ сꙋббо́та 920 | Cꙋⷠ҇ Cꙋббо́та 921 | сщ҃е свѧще 922 | сщ҃є свѧщє 923 | сщ҃ꙋ свѧщꙋ 924 | Сщ҃ Свѧщ 925 | сⷯ сти́хъ 926 | трⷪ҇ц Тро́иц 927 | Трⷪ҇ц Тро́иц 928 | трⷪ҇ч тро́ич 929 | Трⷪ҇ч Тро́ич 930 | трⷭ҇т трисвѧт 931 | Трⷭ҇т Трисвѧт 932 | триⷤ три́жды 933 | Триⷤ Три́жды 934 | у҆чн҃и у҆чени 935 | Оу҆ч҃ни Оу҆чени 936 | у҆ч҃ни у҆чени 937 | Оу҆ч҃н Оу҆чен 938 | у҆ч҃н у҆чен 939 | Оу҆чн҃к Оу҆ченик 940 | у҆чн҃к у҆ченик 941 | Оу҆чн҃ц Оу҆чениц 942 | у҆чн҃ц у҆чениц 943 | Оу҆чт҃л Оу҆чи́тел 944 | у҆чт҃л у҆чи́тел 945 | Оу҆ч҃нї Оу҆че́нї 946 | у҆ч҃нї у҆че́нї 947 | Оу҆ч҃те Оу҆чи́те 948 | у҆ч҃те у҆чи́те 949 | Оу҆ч҃тл Оу҆чите́л 950 | у҆ч҃тл у҆чите́л 951 | хрⷭ҇т Христ 952 | Хрⷭ҇т Христ 953 | х҃с Христо́съ 954 | цр҃е царе 955 | Цр҃е Царе 956 | цр҃є царє 957 | Цр҃є Царє 958 | цр҃и цари 959 | Цр҃и Цари 960 | цр҃ї ца́рї 961 | Цр҃ї Ца́рї 962 | цр҃ква це́рква 963 | Цр҃ква Це́рква 964 | цр҃кве церкве́ 965 | Цр҃кве Церкве́ 966 | цр҃кви це́ркви 967 | Цр҃кви Це́ркви 968 | цр҃квь це́рковь 969 | Цр҃квь Це́рковь 970 | цр҃ков це́рков 971 | Цр҃ков Це́рков 972 | цр҃ко́в церко́в 973 | Цр҃ко́в Церко́в 974 | цр҃кѡ́в церкѡ́в 975 | Цр҃кѡ́в Церкѡ́в 976 | цр҃с ца́рс 977 | Цр҃с Ца́рс 978 | цр҃ц цари́ц 979 | Цр҃ц Цари́ц 980 | цр҃ь ца́рь 981 | Цр҃ь Ца́рь 982 | цр҃ѣ царѣ 983 | Цр҃ѣ Царѣ 984 | цр҃ю царю 985 | Цр҃ю Царю 986 | цр҃ѧ царѧ 987 | Цр҃ѧ Царѧ 988 | црⷭ҇к ца́рск 989 | Црⷭ҇к Ца́рск 990 | црⷭ҇т ца́рст 991 | Црⷭ҇т Ца́рст 992 | чеⷦ҇ человѣ́къ 993 | Чеⷦ҇ Человѣ́къ 994 | чл҃вѣ человѣ 995 | Чл҃вѣ Человѣ 996 | чл҃к человѣк 997 | Чл҃к Человѣк 998 | чтⷭ҇а чи́ста 999 | Чтⷭ҇а Чи́ста 1000 | чтⷭ҇е че́сте 1001 | Чтⷭ҇е Че́сте 1002 | чтⷭ҇ї чи́стї 1003 | Чтⷭ҇ї Чи́стї 1004 | чтⷭ҇н че́стн 1005 | Чтⷭ҇н Че́стн 1006 | чтⷭ҇о чи́сто 1007 | Чтⷭ҇о Чи́сто 1008 | чтⷭ҇ꙋ чи́стꙋ 1009 | Чтⷭ҇ꙋ Чи́стꙋ 1010 | чтⷭ҇ъ чи́стъ 1011 | Чтⷭ҇ъ Чи́стъ 1012 | чтⷭ҇ы чи́сты 1013 | Чтⷭ҇ы Чи́сты 1014 | чтⷭ҇ь че́сть 1015 | Чтⷭ҇ь Че́сть 1016 | чтⷭ҇ѣ чи́стѣ 1017 | Чтⷭ҇ѣ Чи́стѣ 1018 | чⷭ҇та чи́ста 1019 | Чⷭ҇та Чи́ста 1020 | чⷭ҇те че́сте 1021 | Чⷭ҇те Че́сте 1022 | чⷭ҇тї чи́стї 1023 | Чⷭ҇тї Чи́стї 1024 | чⷭ҇тн че́стн 1025 | Чⷭ҇тн Че́стн 1026 | чⷭ҇то чи́сто 1027 | Чⷭ҇то Чи́сто 1028 | чⷭ҇тꙋ чи́стꙋ 1029 | Чⷭ҇тꙋ Чи́стꙋ 1030 | чⷭ҇ты чи́сты 1031 | Чⷭ҇ты Чи́сты 1032 | чⷭ҇тѣ чи́стѣ 1033 | Чⷭ҇тѣ Чи́стѣ 1034 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Collate.pm: -------------------------------------------------------------------------------- 1 | package Lingua::CU::Collate; 2 | 3 | require 5.006; 4 | use strict; 5 | use warnings; 6 | use utf8; 7 | 8 | require Exporter; 9 | require Carp; 10 | 11 | no warnings 'utf8'; 12 | use Unicode::Collate '1.04'; 13 | 14 | our @ISA = qw(Exporter Unicode::Collate); 15 | our $VERSION = '1.04'; ## XXX: keep version alligned with version of Unicode::Collate 16 | 17 | my $return; 18 | 19 | unless ($return = do 'Lingua/CU/cu.pl') { 20 | Carp::carp( "Couldn't get result of cu.pl: $@" ) if $@; 21 | Carp::carp( "couldn't do cu.pl: $!" ) unless defined $return; 22 | Carp::carp( "couldn't run cu.pl" ) unless $return; 23 | } 24 | 25 | sub new { 26 | my $class = shift; 27 | my %tailoring = @_; 28 | 29 | while (my ($k, $v) = each %$return) { 30 | if (!exists $tailoring{$k}) { 31 | $tailoring{$k} = $v; 32 | } elsif ($k eq "entry") { 33 | $tailoring{$k} = $v . $tailoring{$k}; 34 | } else { 35 | Carp::croak (__PACKAGE__ . "::new - Error: $k is reserved and cannot be overwritten"); 36 | } 37 | } 38 | 39 | return new Unicode::Collate(%tailoring); 40 | } 41 | 42 | 1; 43 | 44 | =pod 45 | 46 | =encoding utf8 47 | 48 | =head1 NAME 49 | 50 | Lingua::CU::Collate - Collation for Church Slavonic in Unicode 51 | 52 | =head1 SYNOPSIS 53 | 54 | use Lingua::CU::Collate; 55 | #construct 56 | $Collator = Lingua::CU::Collate->new(); # custom %tailoring may also be specified 57 | 58 | #sort 59 | @sorted = $Collator->sort(@not_sorted); 60 | 61 | #compare 62 | $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 63 | 64 | =head1 DESCRIPTION 65 | 66 | Lingua::CU::Collate is a wrapper around Unicode::Collate that provides a custom collation tailoring for Church Slavonic 67 | 68 | All text supplied to this library must be encoded in UTF-8 and, unless otherwise specified, is assumed to be 69 | in Unicode. For more on Church Slavonic using Unicode, and for a description of the tailoring, please see the paper 70 | I available at 71 | http://www.ponomar.net/. 72 | 73 | This program is ALPHA STAGE SOFTWARE and is provided with ABSOLUTELY NO WARRANTY of any kind, 74 | express or implied, not even the implied warranties of merchantability, fitness for a purpose, or non-infringement. 75 | 76 | =head2 EXPORT 77 | 78 | This module provides no methods except for new. It only serves as a wrapper around Unicode::Collate. 79 | 80 | =head1 METHODS 81 | 82 | =head2 new 83 | 84 | Usage: C 85 | 86 | Creates a new Unicode::Collate object. The same keys in C may be specified as would be specified in creating Unicode::Collate directly 87 | however the following are reserved by Lingua::CU::Collate and may not be overwritten: C, C 88 | and C. If C is specified, it is appended to the entries specified by Lingua::CU::Collate. 89 | 90 | =head1 SEE ALSO 91 | 92 | This software is part of the Ponomar Project (see http://www.ponomar.net/​). 93 | 94 | Be sure to read the I and to download the 95 | Unicode-compatible Hirmos Ponomar font. 96 | 97 | Be sure to read as well C and C in the Perl manual. 98 | 99 | =head1 AUTHOR 100 | 101 | Aleksandr Andreev 102 | 103 | =head1 COPYRIGHT AND LICENSE 104 | 105 | Copyright (C) 2012-2014 by Aleksandr Andreev 106 | 107 | This library is free software; you can redistribute it and/or modify 108 | it under the same terms as Perl itself, either Perl version 5.14.2 or, 109 | at your option, any later version of Perl you may have available. 110 | 111 | =cut 112 | 113 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Hyphenate.pm: -------------------------------------------------------------------------------- 1 | package Lingua::CU::Hyphenate; 2 | 3 | use strict; 4 | use utf8; 5 | 6 | our $VERSION = '0.01'; 7 | 8 | 1; 9 | 10 | =head1 NAME 11 | 12 | Lingua::CU::Hyphenate - Provides a hyphenation and syllabification algorithm for Church Slavonic. 13 | 14 | =head1 DESCRIPTION 15 | 16 | This is presently just a wrapper for some data used to create Church Slavonic hyphenation support. 17 | 18 | =head1 SEE ALSO 19 | 20 | perl(1), TeX::Hyphen. 21 | 22 | =head1 AUTHOR 23 | 24 | Aleksandr Andreev L. 25 | 26 | =head1 LICENSING 27 | 28 | Copyright (c) 2015 Aleksandr Andreev (http://sci.ponomar.net/) 29 | 30 | This library is free software; you can redistribute it and/or modify 31 | it under the same terms as Perl itself, either Perl version 5.14.2 or, 32 | at your option, any later version of Perl you may have available. 33 | 34 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Scripts.pm: -------------------------------------------------------------------------------- 1 | package Lingua::CU::Scripts; 2 | 3 | use strict; 4 | 5 | our $VERSION = '0.01'; 6 | 7 | 1; 8 | 9 | =head1 NAME 10 | 11 | Lingua::CU::Scripts - Script supporting modules and scripts for Church Slavonic implementation 12 | 13 | =head1 DESCRIPTION 14 | 15 | This module contains a number of useful command-line programs for working with Church Slavonic: 16 | 17 | hip2unicode - converts legacy HyperInvariant Presentation (HIP) encoding to Unicode 18 | ucs2unicode - converts legacy Universal Church Slavonic (UCS) encoding to Unicode 19 | 20 | When executed without parameters, most commands will emit usage message. 21 | 22 | =head1 SEE ALSO 23 | 24 | L, 25 | L. 26 | 27 | =head1 AUTHOR 28 | 29 | Aleksandr Andreev L. 30 | 31 | =head1 LICENSING 32 | 33 | Copyright (c) 2015 Aleksandr Andreev (http://sci.ponomar.net/) 34 | 35 | This library is free software; you can redistribute it and/or modify 36 | it under the same terms as Perl itself, either Perl version 5.14.2 or, 37 | at your option, any later version of Perl you may have available. 38 | 39 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Scripts/HIP.pm: -------------------------------------------------------------------------------- 1 | package Lingua::CU::Scripts::HIP; 2 | 3 | use warnings; 4 | use strict; 5 | use utf8; 6 | 7 | use vars qw($VERSION @ISA); 8 | use Unicode::Normalize; 9 | use Tie::IxHash; 10 | use Carp qw( croak ); 11 | 12 | our $VERSION = '0.04'; 13 | our @ISA = (); 14 | 15 | tie my %dictionary, "Tie::IxHash"; 16 | tie my %latin, "Tie::IxHash"; 17 | tie my %civil, "Tie::IxHash"; 18 | 19 | my %numsigns = ( 20 | "<тьма>&" => chr(0x20DD), 21 | "<легион>&" => chr(0x0488), 22 | "<леодр>&" => chr(0x0489), 23 | "<вран>&" => chr(0xA670), 24 | "<колода>&" => chr(0xA671) 25 | ); 26 | my $diactrics = join('', map { chr($_) } ((0x0300 .. 0x036F), (0x0483 .. 0x0487), (0x2DE0 .. 0x2DFF))); 27 | unless (%dictionary = do "Lingua/CU/Scripts/hipequivs") { 28 | croak "Couldn't parse hipequivs: $@" if ($@); 29 | croak "Couldn't do hipequivs: $!" unless (%dictionary); 30 | croak "Couldn't run hipequivs" unless (keys %dictionary); 31 | } 32 | unless (%latin = do "Lingua/CU/Scripts/hipequivs_Latn") { 33 | croak "Couldn't parse hipequivs_Latn: $@" if ($@); 34 | croak "Couldn't do hipequivs_Latn: $!" unless (%latin); 35 | croak "Couldn't run hipequivs_Latn" unless (keys %latin); 36 | } 37 | unless (%civil = do "Lingua/CU/Scripts/hipequivs_Cyrl") { 38 | croak "Couldn't parse hipequivs_Cyrl: $@" if ($@); 39 | croak "Couldn't do hipequivs_Cyrl: $!" unless (%civil); 40 | croak "Couldn't run hipequivs_Cyrl" unless (keys %civil); 41 | } 42 | 43 | sub convert_Latn { 44 | my $string = shift; 45 | study $string; 46 | my $what = join("|", map (quotemeta, keys %latin)); 47 | $string =~ s/($what)/$latin{$1}/g; 48 | return NFC($string); 49 | } 50 | 51 | sub convert_Cyrl { 52 | my $string = shift; 53 | study $string; 54 | my $what = join("|", map (quotemeta, keys %civil)); 55 | $string =~ s/($what)/$civil{$1}/g; 56 | return NFC($string); 57 | } 58 | 59 | sub convert_Zf { 60 | my $string = shift; 61 | 62 | # add in the additional equivs used by ZF 63 | my %zfequivs = (); 64 | unless (%zfequivs = do "Lingua/CU/Scripts/hipequivs_Zf") { 65 | croak "Couldn't parse hipequivs_Zf: $@" if ($@); 66 | croak "Couldn't do hipequivs_Zf: $!" unless (%civil); 67 | croak "Couldn't run hipequivs_Zf" unless (keys %civil); 68 | } 69 | 70 | @dictionary{keys %zfequivs} = values %zfequivs; 71 | delete $dictionary{'*'}; 72 | study $string; 73 | my $what = join("|", map (quotemeta, keys %dictionary)); 74 | $string =~ s/($what)/$dictionary{$1}/g; 75 | 76 | # dot the i's 77 | $what = chr(0x0456) . "([^$diactrics])"; 78 | $string =~ s/$what/\x{0456}\x{0308}$1/g; 79 | $string =~ s/\x{F8FF}/\x{0456}/g; 80 | return NFC($string); 81 | } 82 | 83 | sub convert { 84 | my $string = shift; 85 | 86 | study $string; 87 | my $what = join("|", map (quotemeta, keys %dictionary)); 88 | foreach my $number (keys %numsigns) { 89 | next unless index($string, $number) != -1; 90 | s/$number\{(\w+)\}/$1$numsigns{$number}/g; 91 | } 92 | 93 | $string =~ s/($what)/$dictionary{$1}/g; 94 | 95 | # dot the i's 96 | $what = chr(0x0456) . "([^$diactrics])"; 97 | $string =~ s/$what/\x{0456}\x{0308}$1/g; 98 | $string =~ s/\x{F8FF}/\x{0456}/g; 99 | return NFC($string); 100 | } 101 | 102 | 1; 103 | 104 | =head1 NAME 105 | 106 | Lingua::CU::Scripts::HIP - process HIP (HyperInvariant Presentation) pseudocoding 107 | 108 | =head1 AUTHOR 109 | 110 | Aleksandr Andreev L. 111 | 112 | =head1 LICENSING 113 | 114 | Copyright (c) 2015 Aleksandr Andreev (http://sci.ponomar.net/) 115 | 116 | This library is free software; you can redistribute it and/or modify 117 | it under the same terms as Perl itself, either Perl version 5.14.2 or, 118 | at your option, any later version of Perl you may have available. 119 | 120 | =cut 121 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Scripts/UCS.pm: -------------------------------------------------------------------------------- 1 | package Lingua::CU::Scripts::UCS; 2 | 3 | use warnings; 4 | use strict; 5 | use utf8; 6 | 7 | use vars qw($VERSION @ISA); 8 | use Unicode::Normalize; 9 | use Tie::IxHash; 10 | use Carp qw( croak ); 11 | 12 | our $VERSION = '0.04'; 13 | our @ISA = (); 14 | 15 | tie my %dictionary, "Tie::IxHash"; 16 | 17 | unless (%dictionary = do "Lingua/CU/Scripts/ucsequivs") { 18 | croak "Couldn't parse ucsequivs: $@" if ($@); 19 | croak "Couldn't do ucsequivs: $!" unless (%dictionary); 20 | croak "Couldn't run ucsequivs" unless (keys %dictionary); 21 | } 22 | 23 | sub convert { 24 | my $string = shift; 25 | study $string; 26 | my $what = join("|", map (quotemeta, keys %dictionary)); 27 | $string =~ s/($what)/$dictionary{$1}/g; 28 | # fix initial e, which is not specified correctly in UCS 29 | $string =~ s/\x{0415}\x{0486}/\x{0404}\x{0486}/g; 30 | return NFC($string); 31 | } 32 | 33 | 1; 34 | 35 | =head1 NAME 36 | 37 | Lingua::CU::Scripts::UCS - process UCS (Universal Church Slavonic) legacy encoding 38 | 39 | =head1 AUTHOR 40 | 41 | Aleksandr Andreev L. 42 | 43 | =head1 LICENSING 44 | 45 | Copyright (c) 2015 Aleksandr Andreev (http://sci.ponomar.net/) 46 | 47 | This library is free software; you can redistribute it and/or modify 48 | it under the same terms as Perl itself, either Perl version 5.14.2 or, 49 | at your option, any later version of Perl you may have available. 50 | 51 | =cut 52 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Scripts/hipequivs: -------------------------------------------------------------------------------- 1 | use utf8; 2 | # this is hipequivs 3 | 'A' => chr(0x0410), 4 | 'a' => chr(0x0430), 5 | 'B' => chr(0x0412), 6 | 'b' => chr(0x0432), 7 | 'E' => chr(0x0045), 8 | 'e' => chr(0x0435), 9 | 'K' => chr(0x041A), 10 | 'k' => chr(0x043A), 11 | 'M' => chr(0x041C), 12 | 'm' => chr(0x043C), 13 | 'H' => chr(0x041D), 14 | 'h' => chr(0x043D), 15 | 'O' => chr(0x041E), 16 | 'o' => chr(0x043E), 17 | 'P' => chr(0x0420), 18 | 'p' => chr(0x0440), 19 | 'C' => chr(0x0421), 20 | 'c' => chr(0x0441), 21 | 'T' => chr(0x0422), 22 | 't' => chr(0x0442), 23 | 'Y' => chr(0x0423), 24 | 'y' => chr(0x0443), 25 | 'X' => chr(0x0425), 26 | 'x' => chr(0x0445), 27 | 'І' => chr(0x0049), 28 | 'і' => chr(0x0069), 29 | 'jь' => chr(0x0463), 30 | 'Jь' => chr(0x0462), 31 | 'JЬ' => chr(0x0462), 32 | '<кс>' => chr(0x046F), 33 | '<КС>' => chr(0x046E), 34 | '<Кс>' => chr(0x046E), 35 | 'ju' => chr(0x046D), 36 | '<е>' => chr(0x0454), 37 | '<Е>' => chr(0x0404), 38 | 's' => chr(0x0455), 39 | 'S' => chr(0x0405), 40 | 'v"' => chr(0x0477), 41 | 'V"' => chr(0x0476), 42 | 'v' => chr(0x0475), 43 | 'V' => chr(0x0474), 44 | 'f' => chr(0x0473), 45 | 'F' => chr(0x0472), 46 | 'u' => chr(0x046B), 47 | '' => chr(0x0456), 48 | '_i' => chr(0xF8FF), 49 | 'i' => chr(0x0456), 50 | '' => chr(0xA647), 51 | '<е>' => chr(0x0454), 52 | '_е' => chr(0x0454), 53 | '_Е' => chr(0x0404), 54 | '<Е>' => chr(0x0404), 55 | '_кс' => chr(0x046F), 56 | '_КС' => chr(0x046E), 57 | '_Кс' => chr(0x046E), 58 | '<кс>' => chr(0x046F), 59 | '<КС>' => chr(0x046E), 60 | '<Кс>' => chr(0x046E), 61 | '<о>' => chr(0x047B), 62 | '<О>' => chr(0x047A), 63 | '_о' => chr(0x047B), 64 | '_О' => chr(0x047A), 65 | 'w\т' => chr(0x047F), 66 | 'W\т' => chr(0x047E), 67 | 'W\Т' => chr(0x047E), 68 | '' => chr(0x047D), 69 | '' => chr(0x047C), 70 | '_w' => chr(0x047D), 71 | '_W' => chr(0x047C), 72 | 'w' => chr(0x0461), 73 | 'W' => chr(0x0460), 74 | '<пс>' => chr(0x0471), 75 | '<ПС>' => chr(0x0470), 76 | '<Пс>' => chr(0x0470), 77 | '_пс' => chr(0x0471), 78 | '_ПС' => chr(0x0470), 79 | '_Пс' => chr(0x0470), 80 | 'о<у>' => chr(0x1C82) . chr(0x0443), 81 | 'о_у' => chr(0x1C82) . chr(0x0443), 82 | 'О<у>' => chr(0x041E) . chr(0x0443), 83 | 'О_у' => chr(0x041E) . chr(0x0443), 84 | '<у>' => chr(0x0443), 85 | '_у' => chr(0x0443), 86 | 'у' => chr(0xA64B), 87 | 'У' => chr(0xA64A), 88 | 'jа' => chr(0xA657), 89 | 'JА' => chr(0xA656), 90 | 'Jа' => chr(0xA656), 91 | 'я' => chr(0x0467), 92 | 'Я' => chr(0x0466), 93 | '<а>' => chr(0xFFFD), 94 | '<г>' => chr(0x0491), 95 | '<Г>' => chr(0x0490), 96 | '<д>' => chr(0x1C81), 97 | '<дг>' => chr(0xA663), 98 | '<ДГ>' => chr(0xA662), 99 | '<дж>' => chr(0x045F), 100 | '<ДЖ>' => chr(0x040F), 101 | 'jе' => chr(0x0465), 102 | 'JЕ' => chr(0x0464), 103 | 'Jе' => chr(0x0464), 104 | 'jjь' => chr(0xA653), 105 | 'JJЬ' => chr(0xA652), 106 | '' => chr(0x1C87), 107 | '' => chr(0xFFFD), 108 | '' => chr(0xA643), 109 | '' => chr(0xA643), 110 | '' => chr(0xA645), 111 | '<з>' => chr(0xA641), 112 | 'g' => chr(0xA649), 113 | '<лг>' => chr(0xA665), 114 | '<нг>' => chr(0x04A5), 115 | '<оо>' => chr(0xA699), 116 | '<о+>' => chr(0xA69B), 117 | '<о.о.>' => chr(0xA66D), 118 | '<о.>' => chr(0xA669), 119 | '<о:>' => chr(0xA66B), 120 | '<о_>' => chr(0x1C82), 121 | '<_w>' => chr(0xA64D), 122 | '<с>' => chr(0x1C83), 123 | '<т>' => chr(0x1C84), 124 | '<|т|>' => chr(0x1C85), 125 | '<оv>' => chr(0x1C88), 126 | '<ч>' => chr(0x0447), 127 | 'q' => chr(0x0481), 128 | '<ъ>' => chr(0x1C86), 129 | '<ы>' => chr(0xA651), 130 | '<ьi_>' => chr(0x044C) . chr(0xA647), 131 | '<ъi_>' => chr(0x044A) . chr(0xA647), 132 | '<ьи>' => chr(0x044C) . chr(0x0438), 133 | '<ъи>' => chr(0x044A) . chr(0x0438), 134 | '<ь-i>' => chr(0xFFFD), 135 | '<ъ-i>' => chr(0xFFFD), 136 | '<ь-i_>' => chr(0xFFFD), 137 | '<ъ-i_>' => chr(0xFFFD), 138 | '<ь-и>' => chr(0xFFFD), 139 | '<ъ-и>' => chr(0xFFFD), 140 | 'э' => chr(0x044D), 141 | 'Э' => chr(0x042D), 142 | '<ю>' => chr(0xA655), 143 | 'ю@' => chr(0x044E) . chr(0x0306), 144 | 'I' => chr(0x0406), 145 | '' => chr(0xA657), 146 | 'jя' => chr(0x0469), 147 | '<яu>' => chr(0xA65B), 148 | '<я.>' => chr(0xFFFD), 149 | '<я>' => chr(0xA659), 150 | '<я_>' => chr(0xFFFD), 151 | '' => chr(0xFFFD), 152 | '<ын>' => chr(0xA65F), 153 | '=' => chr(0x0486), 154 | "'" => chr(0x0301), 155 | '^' => chr(0x0311), 156 | '`' => chr(0x0300), 157 | '~' => chr(0x0483), 158 | '"' => chr(0x030F), 159 | '\ъ' => chr(0x033E), 160 | '\Ъ' => chr(0x033E), 161 | '\б' => chr(0x2DE0) . chr(0x0487), 162 | '\Б' => chr(0x2DE0) . chr(0x0487), 163 | '\в' => chr(0x2DE1) . chr(0x0487), 164 | '\В' => chr(0x2DE1) . chr(0x0487), 165 | '\г' => chr(0x2DE2) . chr(0x0487), 166 | '\Г' => chr(0x2DE2) . chr(0x0487), 167 | "\\д" => chr(0x2DE3), 168 | '\Д' => chr(0x2DE3), 169 | '\ж' => chr(0x2DE4), 170 | '\Ж' => chr(0x2DE4), 171 | '\з' => chr(0x2DE5), 172 | '\З' => chr(0x2DE5), 173 | '\к' => chr(0x2DE6) . chr(0x0487), 174 | '\К' => chr(0x2DE6) . chr(0x0487), 175 | '\л' => chr(0x2DE7) . chr(0x0487), 176 | '\Л' => chr(0x2DE7) . chr(0x0487), 177 | '\м' => chr(0x2DE8), 178 | '\М' => chr(0x2DE8), 179 | '\н' => chr(0x2DE9) . chr(0x0487), 180 | '\Н' => chr(0x2DE9) . chr(0x0487), 181 | '\о' => chr(0x2DEA) . chr(0x0487), 182 | '\О' => chr(0x2DEA) . chr(0x0487), 183 | '\п' => chr(0x2DEB), 184 | '\П' => chr(0x2DE8), 185 | '\р' => chr(0x2DEC) . chr(0x0487), 186 | '\Р' => chr(0x2DEC) . chr(0x0487), 187 | '\с' => chr(0x2DED) . chr(0x0487), 188 | '\С' => chr(0x2DED) . chr(0x0487), 189 | '\т' => chr(0x2DEE), 190 | '\Т' => chr(0x2DEE), 191 | '\х' => chr(0x2DEF), 192 | '\Х' => chr(0x2DEF), 193 | '\ф' => chr(0xA69E) . chr(0x0487), 194 | '\Ф' => chr(0xA69E) . chr(0x0487), 195 | '\ц' => chr(0x2DF0), 196 | '\Ц' => chr(0x2DF0), 197 | '\ч' => chr(0x2DF1) . chr(0x0487), 198 | '\Ч' => chr(0x2DF1) . chr(0x0487), 199 | '\ш' => chr(0x2DF2) . chr(0x0487), 200 | '\Ш' => chr(0x2DF2) . chr(0x0487), 201 | '\щ' => chr(0x2DF3) . chr(0x0487), 202 | '\Щ' => chr(0x2DF3) . chr(0x0487), 203 | '\f' => chr(0x2DF4) . chr(0x0487), 204 | '\F' => chr(0x2DF4) . chr(0x0487), 205 | '\а' => chr(0x2DF6) . chr(0x0487), 206 | '\А' => chr(0x2DF6) . chr(0x0487), 207 | '\е' => chr(0x2DF7), 208 | '\Е' => chr(0x2DF7), 209 | '\g' => chr(0x2DF8), 210 | '\G' => chr(0x2DF8), 211 | '\у' => chr(0x2DF9), 212 | '\У' => chr(0x2DF9), 213 | '\jе' => chr(0x2DFA), 214 | '\ю' => chr(0x2DFB), 215 | '\Ю' => chr(0x2DFB), 216 | '\jа' => chr(0x2DFC), 217 | '\я' => chr(0x2DFD), 218 | '\Я' => chr(0x2DFD), 219 | '\u' => chr(0x2DFE), 220 | '\U' => chr(0x2DFE), 221 | '\ju' => chr(0x2DFF), 222 | '\и' => chr(0xA675), 223 | '\И' => chr(0xA675), 224 | '\^' => chr(0x0487), 225 | '\-' => chr(0x0487), 226 | '#' => chr(0x0482), 227 | '@' => chr(0xA67E), 228 | '+' => chr(0x2020), 229 | '*' => chr(0xA673), 230 | '<*>' => chr(0xA673), 231 | '<М\р>' => chr(0x1F545), 232 | '<+>' => chr(0x1F542), 233 | '<\+/>' => chr(0x1F541), 234 | '<(+)>' => chr(0x1F540), 235 | '<(:.>' => chr(0x1F543), 236 | '<.:)>' => chr(0x1F544), 237 | '<тьматем>' => chr(0x044B) . chr(0xA672), 238 | '<->' => chr(0x2013), 239 | '<тьма>&' => chr(0x20DD), 240 | '<легион>&' => chr(0x0488), 241 | '<леодр>&' => chr(0x0489), 242 | '<вран>&' => chr(0xA670), 243 | '<колода>&' => chr(0xA671), 244 | '<>' => chr(0x000A) . chr(0x000A), 245 | '_/' => chr(0x000A) . chr(0x000A), 246 | '<_>' => chr(0x200B), 247 | '__' => chr(0x200B), 248 | '<|>' => chr(0x034F), 249 | '&' => chr(0x200D), 250 | '%-' => chr(0x00AD), 251 | '<((>' => chr(0x00AB), 252 | '<))>' => chr(0x00BB), 253 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Scripts/hipequivs_Cyrl: -------------------------------------------------------------------------------- 1 | use utf8; 2 | # this is hipequivs_Cyrl 3 | 'A' => chr(0x0410), 4 | 'a' => chr(0x0430), 5 | 'B' => chr(0x0412), 6 | 'b' => chr(0x0432), 7 | 'E' => chr(0x0045), 8 | 'e' => chr(0x0435), 9 | 'K' => chr(0x041A), 10 | 'k' => chr(0x043A), 11 | 'M' => chr(0x041C), 12 | 'm' => chr(0x043C), 13 | 'H' => chr(0x041D), 14 | 'h' => chr(0x043D), 15 | 'O' => chr(0x041E), 16 | 'o' => chr(0x043E), 17 | 'P' => chr(0x0420), 18 | 'p' => chr(0x0440), 19 | 'C' => chr(0x0421), 20 | 'c' => chr(0x0441), 21 | 'T' => chr(0x0422), 22 | 't' => chr(0x0442), 23 | 'Y' => chr(0x0423), 24 | 'y' => chr(0x0443), 25 | 'X' => chr(0x0425), 26 | 'x' => chr(0x0445), 27 | "у" => chr(0x0443), 28 | "У" => chr(0x0423), 29 | 'jь' => chr(0x0463), 30 | 'Jь' => chr(0x0462), 31 | 'JЬ' => chr(0x0462), 32 | '<кс>' => chr(0x046F), 33 | '<КС>' => chr(0x046E), 34 | '<Кс>' => chr(0x046E), 35 | 'ju' => chr(0x046D), 36 | '<е>' => chr(0x0454), 37 | '<Е>' => chr(0x0404), 38 | 's' => chr(0x0455), 39 | 'S' => chr(0x0405), 40 | 'v"' => chr(0x0477), 41 | 'V"' => chr(0x0476), 42 | 'v' => chr(0x0475), 43 | 'V' => chr(0x0474), 44 | 'f' => chr(0x0473), 45 | 'F' => chr(0x0472), 46 | 'u' => chr(0x046B), 47 | "я" => chr(0x044F), 48 | "Я" => chr(0x042F), # Russian YA - Note that this is not the same as SMALL YUS in the Slavonic case 49 | "г'" => chr(0x0453), 50 | "Г'" => chr(0x0403), # македонское ГЬ 51 | "<г>" => chr(0x0491), 52 | "<Г>" => chr(0x0490), # украинское Г ТВЕРДОЕ 53 | "<дж>" => chr(0x045F), 54 | "<ДЖ>" => chr(0x040F), 55 | "<Дж>" => chr(0x040F), # сербско-македонское ДЖ (похоже на Ц, но хвостик посередине) 56 | "е\\:" => chr(0x0451), 57 | "Е\\:" => chr(0x0401), # Ё (йо) 58 | "к'" => chr(0x045C), 59 | "К'" => chr(0x040C), # македонское КЬ 60 | "<ль>" => chr(0x0459), 61 | "<ЛЬ>" => chr(0x0409), 62 | "<Ль>" => chr(0x0409), # сербско-македонское ЛЬ 63 | "<нь>" => chr(0x045A), 64 | "<Нь>" => chr(0x040A), 65 | "<НЬ>" => chr(0x040A), # сербско-македонское НЬ 66 | "у\\@" => chr(0x045E), 67 | "У\\@" => chr(0x040E), # белорусское У КРАТКОЕ 68 | "g" => chr(0x045B), 69 | "G" => chr(0x040B), # сербское ЧЬ/ТЬ/КЬ (в виде перечеркнутого "h") 70 | "" => chr(0x0452), 71 | "" => chr(0x0402), # сербское ДЖЬ/ДЬ/ГЬ (перечеркнутое "h" с круглой ножкой) 72 | "i" => chr(0x0456), 73 | "I" => chr(0x0406), # "И" 10-ричное (строчная по умолчанию с одной точкой) 74 | "i\\:" => chr(0x0456) . chr(0x0308), 75 | "I\\:" => chr(0x0407), # украинское ЙИ 76 | "" => chr(0x0458), 77 | "" => chr(0x0408), # сербско-македонское J 78 | "" => chr(0x046D), 79 | "" => chr(0x046C), 80 | "" => chr(0x046C), # ЙОТИРОВАННЫЙ ЮС БОЛЬШОЙ (болгарский 19 века; без точки над "i") 81 | "<г->" => chr(0x0493), 82 | "<Г->" => chr(0x0492), # перечеркнутое Г (азерб.) 83 | "<к->" => chr(0x049D), 84 | "<К->" => chr(0x049C), # перечеркнутое К (азерб.) 85 | "<о>" => chr(0x04E9), 86 | "<О>" => chr(0x04E8), # фитообразный знак для звука вроде нем. О-УМЛЯУТ или фр. ОЕ 87 | "<у>" => chr(0x04AF), 88 | "<У>" => chr(0x04AE), # "у" с вертикальной ножкой (прописная буква выглядит как игрек) 89 | "<у->" => chr(0x04B1), 90 | "<У->" => chr(0x04B0), # "у" с вертикальной ножкой, перечеркнутое 91 | "" => chr(0x04AF), 92 | "" => chr(0x04AE), # latinates of above 93 | "<х>" => chr(0x04BB), 94 | "<Х>" => chr(0x04BA), # h-образная казахская буква 95 | "<э>" => chr(0x04D9), 96 | "<Э>" => chr(0x04D8), # перевернутое "е" -> presumably this refers to SCHWA 97 | "<г,>" => chr(0x04F7), # GHE with descender used in Yupik 98 | "<Г,>" => chr(0x04F6), 99 | "<ж,>" => chr(0x0497), # ZHE with descender used in Tatar 100 | "<Ж,>" => chr(0x0496), 101 | "<з,>" => chr(0x0499), # ZE with descender used in Bashkir 102 | "<З,>" => chr(0x0498), 103 | "<к,>" => chr(0x049B), 104 | "<К,>" => chr(0x049A), # KA with descender used in Abkhazian 105 | "<н,>" => chr(0x04A3), 106 | "<Н,>" => chr(0x04A2), # EN with descender used in Bashkir 107 | "<с,>" => chr(0x04AB), 108 | "<С,>" => chr(0x04AA), # ES with descender used in Bashkir 109 | "<т,>" => chr(0x04AD), 110 | "<Т,>" => chr(0x04AC), # TE with descender used in Abkhazian 111 | "" => chr(0x04AD), 112 | "" => chr(0x04AC), # Latinate forms 113 | "<х,>" => chr(0x04B3), 114 | "<Х,>" => chr(0x04B2), # HA with descender used in Abkhazian 115 | "<ч,>" => chr(0x04B7), 116 | "<Ч,>" => chr(0x04B6), # CHE with descender used in Abkhazian 117 | "<,,>" => chr(0x201E), 118 | "<``>" => chr(0x201C), 119 | "<(>" => chr(0x007B), 120 | "<)>" => chr(0x007D), 121 | "<[>" => chr(0x003C), 122 | "<]>" => chr(0x003E), 123 | "<'>" => chr(0x0027), 124 | "<пгф>" => chr(0x00A7), # or is it 00B6? not clear from the description: знак параграфа 125 | "" => chr(0x0023), 126 | "<проц>" => chr(0x0025), 127 | "*" => chr(0x002A), 128 | "+" => chr(0x2020), 129 | '<>' => chr(0x000A) . chr(0x000A), 130 | '_/' => chr(0x000A) . chr(0x000A), 131 | '<_>' => chr(0x200B), 132 | '__' => chr(0x200B), 133 | '<|>' => chr(0x034F), 134 | '&' => chr(0x200D), 135 | '%-' => chr(0x00AD), 136 | '<((>' => chr(0x00AB), 137 | '<))>' => chr(0x00BB), 138 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Scripts/hipequivs_Latn: -------------------------------------------------------------------------------- 1 | use utf8; 2 | # this is hipequivs_Latn 3 | 'І' => chr(0x0049), 4 | 'і' => chr(0x0069), 5 | "а" => chr(0x0061), 6 | "А" => chr(0x0041), 7 | "в" => chr(0x0062), 8 | "В" => chr(0x0042), 9 | "с" => chr(0x0063), 10 | "С" => chr(0x0043), 11 | "е" => chr(0x0065), 12 | "Е" => chr(0x0045), 13 | "н" => chr(0x0068), 14 | "Н" => chr(0x0048), 15 | "к" => chr(0x006B), 16 | "К" => chr(0x004B), 17 | "м" => chr(0x006D), 18 | "М" => chr(0x004D), 19 | "о" => chr(0x006F), 20 | "О" => chr(0x004F), 21 | "р" => chr(0x0070), 22 | "Р" => chr(0x0050), 23 | "т" => chr(0x0074), 24 | "Т" => chr(0x0054), 25 | "х" => chr(0x0078), 26 | "Х" => chr(0x0058), 27 | "у" => chr(0x0079), 28 | "У" => chr(0x0059), 29 | "а&е" => chr(0x00E6), 30 | "A&E" => chr(0x00C6), # лигатура АЕ 31 | "е&т" => chr(0x0026), # лигатура ЕТ (знак "&") 32 | "о&е" => chr(0x0153), 33 | "О&E" => chr(0x0152), # лигатура ОЕ 34 | "<о>" => chr(0x0275), 35 | "<О>" => chr(0x019F), # перечеркнутое О 36 | "" => chr(0x0111), 37 | "" => chr(0x0110), # хорватское ДЖЬ (перечеркнутое d) 38 | "" => chr(0x0131), # I без точек 39 | "" => chr(0x0237), # J без точек 40 | "" => chr(0x0142), 41 | "" => chr(0x0141), # польское твердое L 42 | "" => chr(0x00DF), # немецкое ЭС-ЦЕТ (В-образное) 43 | "" => chr(0x017F), # S высокое (f-образное) 44 | '<>' => chr(0x000A) . chr(0x000A), 45 | '_/' => chr(0x000A) . chr(0x000A), 46 | '<_>' => chr(0x200B), 47 | '__' => chr(0x200B), 48 | '<|>' => chr(0x034F), 49 | '&' => chr(0x200D), 50 | '%-' => chr(0x00AD), 51 | '<((>' => chr(0x00AB), 52 | '<))>' => chr(0x00BB), 53 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Scripts/hipequivs_Zf: -------------------------------------------------------------------------------- 1 | use utf8; 2 | # additional hip equivs used by ZF 3 | '_я' => chr(0xA657), 4 | '_Я' => chr(0xA656), 5 | 'ё' => chr(0x0463), 6 | 'Ё' => chr(0x0462), 7 | 'э' => chr(0x044D), 8 | 'Э' => chr(0x042D), 9 | '*о' => chr(0x0461), 10 | '*О' => chr(0x0460), 11 | '*и' => chr(0xF8FF), 12 | '*И' => chr(0xF8FF), 13 | '*й' => chr(0x0456), # this will need to be converted to 0456 14 | '*Й' => chr(0x0456), # this will need to be converted to 0406 15 | '*е' => chr(0x0465), 16 | '*Е' => chr(0x0464), 17 | '*я' => chr(0x0469), 18 | '*Я' => chr(0x0468), 19 | '*ю' => chr(0x046D), 20 | '*Ю' => chr(0x046C), 21 | '_*о' => chr(0xA64D), 22 | '_*О' => chr(0xA64C), 23 | '_з' => chr(0x0455), 24 | '_З' => chr(0x0405), 25 | '_ф' => chr(0x0473), 26 | '_Ф' => chr(0x0472), 27 | '_ю' => chr(0x046B), 28 | '_Ю' => chr(0x046A), 29 | '_в' => chr(0x0475), 30 | '_В' => chr(0x0474), 31 | '_и' => chr(0x0477), 32 | '_И' => chr(0x0476), 33 | 'w\т' => chr(0x047F), 34 | 'W\т' => chr(0x047E), 35 | '*о\т' => chr(0x047F), 36 | '*О\т' => chr(0x047E), 37 | '\c\т' => chr(0x2DED) . chr(0xA66F), 38 | '\р\т' => chr(0x2DEC) . chr(0xA66F), 39 | '\т' => chr(0x2DEE), 40 | '\Т' => chr(0xA66F), 41 | '\ъ' => chr(0x033E), 42 | '0' => chr(0x1C82), 43 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Scripts/ostrogequivs: -------------------------------------------------------------------------------- 1 | use utf8; 2 | # this is ostrogequivs :: encoding of Ostrog Bible files 3 | '"' => chr(0xA67C), 4 | '#' => chr(0xA67D), 5 | '$' => chr(0x0486), 6 | '%' => chr(0x0486) . chr(0x0301), 7 | '&' => chr(0x0486) . chr(0x0300), 8 | "'" => chr(0x0301), 9 | '*' => chr(0xA673), 10 | '/' => chr(0x0301), 11 | '=' => chr(0x030F), 12 | '@' => chr(0x0301), 13 | 'A' => chr(0x2DF6), 14 | 'C' => chr(0x2DF0), 15 | 'D' => chr(0x2DE3), 16 | 'E' => chr(0x0483), 17 | 'G' => chr(0x2DE3), 18 | 'H' => chr(0x2DE2), 19 | 'L' => chr(0x2DE7), 20 | 'M' => chr(0x2DE8), 21 | 'O' => chr(0x0486) . chr(0x0311), 22 | 'Q' => chr(0x2DE4) . chr(0x2DF7), 23 | 'T' => chr(0x2DEE), 24 | 'W' => chr(0x0482), 25 | 'X' => chr(0x2DEF), 26 | 'Z' => chr(0x2DE5), 27 | '^' => chr(0x0486), 28 | '`' => chr(0x0300), 29 | 'b' => chr(0x2DE0) . chr(0x0487), 30 | 'c' => chr(0x2DF0), 31 | 'd' => chr(0x2DE3) . chr(0xA675), # combining DE-I 32 | 'e' => chr(0x0300), 33 | 'f' => chr(0xA69E) . chr(0x0487), 34 | 'g' => chr(0x2DF4) . chr(0x0487), 35 | 'h' => chr(0x2DE2) . chr(0x0487), 36 | 'j' => chr(0x2DF2) . chr(0x0487), 37 | 'k' => chr(0x2DE6) . chr(0x0487), 38 | 'l' => chr(0x2DE7) . chr(0x0487), 39 | 'n' => chr(0x2DE9) . chr(0x0487), 40 | 'o' => chr(0x2DEA) . chr(0x0487), 41 | 'p' => chr(0x2DEB) . chr(0x0487), 42 | 'q' => chr(0x2DF1) . chr(0x0487), 43 | 'r' => chr(0x2DEC) . chr(0x0487), 44 | 's' => chr(0x2DED) . chr(0x0487), 45 | 't' => chr(0x2DEE), 46 | 'u' => chr(0x2DF3) . chr(0x0487), 47 | 'v' => chr(0x2DE1) . chr(0x0487), 48 | 'w' => chr(0x2DF0) . chr(0x0487), 49 | 'x' => chr(0x2DEF), 50 | 'y' => chr(0x2DE4) . chr(0x2DF7), 51 | 'z' => chr(0x2DE5), 52 | '|' => chr(0x033E), 53 | '~' => chr(0x0483), 54 | chr(0x0087) => chr(0x02CA), 55 | '³' => chr(0x0456), 56 | 'ç' => chr(0x0437), 57 | 'Ѐ' => chr(0x041E) . chr(0x0423), 58 | 'Ё' => chr(0xA64B), 59 | 'ЈА' => chr(0xA656), 60 | 'Ћ' => chr(0x046E), 61 | 'Ў' => chr(0x0470), 62 | 'Џ' => chr(0x0472), 63 | 'Э' => chr(0xA64C), 64 | 'о' => chr(0x1C82), 65 | 'э' => chr(0x0461), 66 | 'я' => chr(0x0467), 67 | 'ё' => chr(0x1C82) . chr(0x0443), 68 | 'ђ' => chr(0xA64B), 69 | 'ѓ' => chr(0x047B), 70 | 'ја' => chr(0xA657), 71 | 'јя' => chr(0x0467), 72 | 'јќ' => chr(0x0469), 73 | 'љ' => chr(0x0475), 74 | 'њ' => chr(0x0463), 75 | 'ћ' => chr(0x045B), 76 | 'ќ' => chr(0x046B), 77 | 'ў' => chr(0x0471), 78 | 'џ' => chr(0x0473), 79 | "”" => chr(0x00B7), 80 | "‡" => chr(0x0301), 81 | 82 | -------------------------------------------------------------------------------- /lib/Lingua/CU/Scripts/ucsequivs: -------------------------------------------------------------------------------- 1 | use utf8; 2 | # this is ucsequivs 3 | '#' => chr(0x0486), 4 | '$' => chr(0x0486) . chr(0x0301), 5 | '%' => chr(0x0486) . chr(0x0300), 6 | '&' => chr(0x0483), 7 | '*' => chr(0xA673), 8 | '+' => chr(0x2DE1) . chr(0x0487), # combining VE 9 | '0' => chr(0x043E) . chr(0x0301), 10 | '1' => chr(0x0301), 11 | '2' => chr(0x0300), 12 | '3' => chr(0x0486), 13 | '4' => chr(0x0486) . chr(0x0301), 14 | '5' => chr(0x0486) . chr(0x0300), 15 | '6' => chr(0x0311), # combining inverted breve 16 | '7' => chr(0x0483), # titlo 17 | '8' => chr(0x033E), # combining vertical tilde 18 | '9' => chr(0x0436) . chr(0x0483), # zhe with titlo above 19 | '<' => chr(0x2DEF), # combining HA 20 | '=' => chr(0x2DE9) . chr(0x0487), # combining EN 21 | '>' => chr(0x2DEC) . chr(0x0487), # combining ER 22 | '?' => chr(0x2DF1) . chr(0x0487), # combining CHE 23 | '@' => chr(0x0300), 24 | 'A' => chr(0x0430) . chr(0x0300), # latin A maps to AZ with grave accent 25 | 'B' => chr(0x0463) . chr(0x0311), # latin B maps to Yat' with inverted breve 26 | 'C' => chr(0x2DED) . chr(0x0487), # combining ES 27 | 'D' => chr(0x0434) . chr(0x2DED) . chr(0x0487), 28 | 'E' => chr(0x0435) . chr(0x0300), # latin E maps to e with grave accent 29 | 'F' => chr(0x0472), # F maps to THETA 30 | 'G' => chr(0x0433) . chr(0x0483), # G maps to ge with TITLO 31 | 'H' => chr(0x0461) . chr(0x0301), # latin H maps to omega with acute accent 32 | 'I' => chr(0x0406), 33 | 'J' => chr(0x0456) . chr(0x0300), 34 | 'K' => chr(0xA656) . chr(0x0486), # YA with psili 35 | 'L' => chr(0x043B) . chr(0x2DE3), # el with cobining de 36 | 'M' => chr(0x0476), # capital IZHITSA with kendema 37 | 'N' => chr(0x047A) . chr(0x0486), # capital WIDE ON with psili 38 | 'O' => chr(0x047A), # just capital WIDE ON 39 | 'P' => chr(0x0470), # capital PSI 40 | 'Q' => chr(0x047C), # capital omega with great apostrophe 41 | 'R' => chr(0x0440) . chr(0x0483), # lowercase re with titlo 42 | 'S' => chr(0x0467) . chr(0x0300), # lowercase small yus with grave 43 | 'T' => chr(0x047E), # capital OT 44 | 'U' => chr(0x041E) . chr(0x0443), # diagraph capital UK 45 | 'V' => chr(0x0474), # capital IZHITSA 46 | 'W' => chr(0x0460), # capital OMEGA 47 | 'X' => chr(0x046E), # capital XI 48 | 'Y' => chr(0xA64B) . chr(0x0300), # monograph uk with grave 49 | 'Z' => chr(0x0466), # capital SMALL YUS 50 | '\\' => chr(0x0483), # yet another titlo 51 | '^' => chr(0x0311), # combining inverted breve 52 | '_' => chr(0x033E), # yet another yerik 53 | 'a' => chr(0x0430) . chr(0x0301), # latin A maps to AZ with acute accent 54 | 'b' => chr(0x2DEA) . chr(0x0487), # combining ON 55 | 'c' => chr(0x2DED) . chr(0x0487), # combining ES 56 | 'd' => chr(0x2DE3), # combining DE 57 | 'e' => chr(0x0435) . chr(0x0301), # latin E maps to e with acute accent 58 | 'f' => chr(0x0473), # lowercase theta 59 | 'g' => chr(0x2DE2) . chr(0x0487), # combining ge 60 | 'h' => chr(0x044B) . chr(0x0301), # ery with acute accent 61 | 'i' => chr(0x0456), 62 | 'j' => chr(0x0456) . chr(0x0301), # i with acute accent 63 | 'k' => chr(0xA657) . chr(0x0486), # iotaed a with psili 64 | 'l' => chr(0x043B) . chr(0x0483), # el with titlo 65 | 'm' => chr(0x0477), # izhitsa with izhe titlo 66 | 'n' => chr(0x047B) . chr(0x0486), # wide on with psili 67 | 'o' => chr(0x047B), # wide on 68 | 'p' => chr(0x0471), # lowercase psi 69 | 'q' => chr(0x047D), # lowercase omega with great apostrophe 70 | 'r' => chr(0x0440) . chr(0x2DED) . chr(0x0487), # lowercase er with combining es 71 | 's' => chr(0x0467) . chr(0x0301), # lowercase small yus with acute accent 72 | 't' => chr(0x047F), # lowercase ot 73 | 'u' => chr(0x1C82) . chr(0x0443), # diagraph uk 74 | 'v' => chr(0x0475), # lowercase izhitsa 75 | 'w' => chr(0x0461), # lowercase omega 76 | 'x' => chr(0x046F), # lowercase xi 77 | 'y' => chr(0xA64B) . chr(0x0301), # monograph uk with acute accent 78 | 'z' => chr(0x0467), # lowercase small yus 79 | '{' => chr(0xA64B) . chr(0x0311), # monograph uk with inverted breve 80 | '|' => chr(0x0467) . chr(0x0486) . chr(0x0300), # lowercase small yus with apostroph 81 | '}' => chr(0x0438) . chr(0x0483), # the numeral eight 82 | '~' => chr(0x0301), # yet another acute accent 83 | ### SECOND HALF IS THE CYRILLIC BLOCK 84 | 'Ђ' => chr(0x0475) . chr(0x0301), # lowercase izhitsa with acute 85 | 'Ѓ' => chr(0x0410) . chr(0x0486) . chr(0x0301), # uppercase A with psili and acute 86 | '‚' => chr(0x201A), 87 | 'ѓ' => chr(0x0430) . chr(0x0486) . chr(0x0301), # lowercase A with psili and acute 88 | '„' => chr(0x201E), 89 | '…' => chr(0x046F) . chr(0x0483), # the numberal sixty 90 | '†' => chr(0x0430) . chr(0x0311), # lowercase a with inverted breve 91 | '‡' => chr(0x0456) . chr(0x0311), # lowercase i with inverted breve 92 | '€' => chr(0x2DE5), # combining ze 93 | '‰' => chr(0x0467) . chr(0x0311), # lowercase small yus with inverted breve 94 | 'Љ' => chr(0x0466) . chr(0x0486), # upercase small yus with psili 95 | '‹' => chr(0x0456) . chr(0x0483), # the numeral ten 96 | 'Њ' => chr(0x0460) . chr(0x0486), # capital OMEGA with psili 97 | 'Ќ' => chr(0x041E) . chr(0x0443) . chr(0x0486) . chr(0x0301), # diagraph uk with apostroph 98 | 'Ћ' => chr(0xA656) . chr(0x0486) . chr(0x0301), # uppercase Iotated A with apostroph 99 | 'Џ' => chr(0x047A) . chr(0x0486) . chr(0x0301), # uppercase Round O with apostroph 100 | 'ђ' => chr(0x0475) . chr(0x2DE2) . chr(0x0487), # lowercase izhitsa with combining ge 101 | '‘' => chr(0x2018), 102 | '’' => chr(0x2019), 103 | '“' => chr(0x201C), 104 | '”' => chr(0x201D), 105 | '•' => chr(0x2DE4), # combining zhe 106 | '–' => chr(0x2013), 107 | '—' => chr(0x2014), 108 | '™' => chr(0x0442) . chr(0x0483), 109 | 'љ' => chr(0x0467) . chr(0x0486), # lowercase small yus with psili 110 | '›' => chr(0x0475) . chr(0x0311), # izhitsa with inverted breve 111 | 'њ' => chr(0x0461) . chr(0x0486), # lowercase omega with psili 112 | 'ќ' => chr(0x1C82) . chr(0x0443) . chr(0x0486) . chr(0x0301), # diagraph uk with apostroph 113 | 'ћ' => chr(0xA657) . chr(0x0486) . chr(0x0301), # lowercase iotaed a with apostroph 114 | 'џ' => chr(0x047B) . chr(0x0486) . chr(0x0301), # lowercase Round O with apostroph 115 | 'Ў' => chr(0x041E) . chr(0x0443) . chr(0x0486), # Capital Diagraph Uk with psili 116 | 'ў' => chr(0x1C82) . chr(0x0443) . chr(0x0486), # lowercase of the above 117 | 'Ј' => chr(0x0406) . chr(0x0486) . chr(0x0301), # Uppercase I with apostroph 118 | '¤' => chr(0x0482), # cyrillic thousands sign 119 | 'Ґ' => chr(0x0410) . chr(0x0486), # capital A with psili 120 | '¦' => chr(0x0445) . chr(0x0483), # lowercase kha with titlo 121 | '§' => chr(0x0447) . chr(0x0483), # the numeral ninety 122 | 'Ё' => chr(0x0463) . chr(0x0300), # lowecase yat with grave accent 123 | '©' => chr(0x0441) . chr(0x0483), # the numeral two hundred 124 | '«' => chr(0x00AB), 125 | '¬' => chr(0x00AC), 126 | '®' => chr(0x0440) . chr(0x2DE3), # lowercase er with dobro titlo 127 | 'Ї' => chr(0x0406) . chr(0x0486), 128 | '°' => chr(0xA67E), # kavyka 129 | '±' => chr(0xA657) . chr(0x0486) . chr(0x0300), 130 | 'І' => chr(0x0406), 131 | 'і' => chr(0x0456) . chr(0x0308), 132 | 'ґ' => chr(0x0430) . chr(0x0486), 133 | 'µ' => chr(0x0443), # small letter u (why encoded at the micro sign?!) 134 | 'ё' => chr(0x0463) . chr(0x0301), # lowercase yat with acute accent 135 | '№' => chr(0x0430) . chr(0x0483), # the numeral one 136 | 'є' => chr(0x0454), # wide E 137 | '»' => chr(0x00BB), 138 | 'ј' => chr(0x0456) . chr(0x0486) . chr(0x0301), # lowercase i with apostroph 139 | 'Ѕ' => chr(0x0405), 140 | 'ѕ' => chr(0x0455), 141 | 'ї' => chr(0x0456) . chr(0x0486), # lowercase i with psili 142 | 'У' => chr(0xA64A), 143 | 'Э' => chr(0x0462), # capital yat 144 | 'Я' => chr(0xA656), # capital Iotified A 145 | 'у' => chr(0xA64B), # monograph Uk (why?!) 146 | 'э' => chr(0x0463), # lowercase yat 147 | 'я' => chr(0xA657), # iotaed a 148 | -------------------------------------------------------------------------------- /mklocale.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # This script makes the CU Locale based on the version of DUCET being used 4 | require 5.006; 5 | use strict; 6 | use utf8; 7 | use Carp; 8 | use Unicode::Collate '1.04'; 9 | 10 | BEGIN { 11 | unless ("A" eq pack('U', 0x41)) { 12 | die "Lingua::CU cannot stringify a Unicode code point\n"; 13 | } 14 | unless (0x41 == unpack('U', 'A')) { 15 | die "Lingua::CU cannot get a Unicode code point\n"; 16 | } 17 | } 18 | 19 | use constant SBase => 0xAC00; 20 | use constant SFinal => 0xD7A3; 21 | use constant NCount => 588; 22 | use constant TCount => 28; 23 | use constant LBase => 0x1100; 24 | use constant VBase => 0x1161; 25 | use constant TBase => 0x11A7; 26 | 27 | use constant Min2Wt => 0x20; 28 | use constant Min3Wt => 0x02; 29 | 30 | my $OvCJK = 'overrideCJK'; 31 | my $OvHang = 'overrideHangul'; 32 | 33 | my $vDUCET; # from @version, such as "6.0.0" 34 | my $DEFAULT_LOCALE_VERSION = Unicode::Collate->VERSION; 35 | my $Use4th; # Use 4th level (Unicode 6.2.0 or before) 36 | my %Keys; # "0300" => "[.0000.0035.0002.0300]" 37 | my %Code; # "[.0000.0035.0002.0300]" => "0300" 38 | my %Name; # "0300" => "COMBINING GRAVE ACCENT" 39 | my %Equiv; # "[.0000.0035.0002.0300]" => ["0340", "0953"] 40 | 41 | sub get_resource_by_name { 42 | my $path = shift; 43 | my @found = (); 44 | INC_ENTRY: 45 | foreach my $inc_entry (@INC) { 46 | if ( ref $inc_entry ) { 47 | warn q{Don't know how to handle @INC entries of type: } . ref $inc_entry; 48 | next INC_ENTRY; 49 | } 50 | my $full_path = File::Spec->join($inc_entry, $path); 51 | if ( -e $full_path ) { 52 | if ( ! wantarray ) { 53 | return $full_path; 54 | } 55 | push @found, $full_path; 56 | } 57 | } 58 | wantarray ? return @found : return; 59 | } 60 | 61 | sub trim { $_[0] =~ s/^\ +//; $_[0] =~ s/\ +\z// } 62 | sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g } 63 | sub ce { 64 | my $var = shift; 65 | my $vc = $var ? '*' : '.'; 66 | my $hx = join '.', map { sprintf '%04X', $_ } @_; 67 | return "[$vc$hx]"; 68 | } 69 | 70 | 71 | # figure out where DUCET is located 72 | my $ducet = get_resource_by_name('Unicode/Collate/allkeys.txt'); 73 | my $ENT_FMT = "%-9s ; %s # %s\n"; 74 | my $RE_CE = '(?:\[[0-9A-Fa-f\.\*]+\])'; 75 | 76 | croak "Unable to locate DUCET. Execution stopped " unless (defined $ducet); 77 | croak "Unable to locate DUCET. Execution stopped " unless (length $ducet); 78 | croak "Unable to locate DUCET. Execution stopped " unless (-e $ducet); 79 | 80 | # now read DUCET 81 | open (DUCET, $ducet) || croak ("Unable to read from DUCET. Maybe you need to be root?"); 82 | while (my $line = ) { 83 | chomp $line; 84 | next if $line =~ /^\s*#/; 85 | $vDUCET = $1 if $line =~ /^\@version\s*(\S*)/; 86 | 87 | next if $line !~ /^\s*[0-9A-Fa-f]/; 88 | my $name = ($line =~ s/[#%]\s*(.*)//) ? $1 : ''; 89 | my($e, $k) = split /;/, $line; 90 | trim($e); 91 | trim($k); 92 | $name =~ s/; QQ[A-Z]+//; 93 | $name =~ s/^ ?\[[0-9A-F]+\] ?//; 94 | 95 | if ($k =~ /\[\.0000\.0000\.0000(\.?0*)\]/) { 96 | $Use4th = 1 if $1; 97 | $Name{$e} = $name; 98 | next; 99 | } 100 | croak "Wrong Entry: must be separated by ';' " . "from " if ! $k; 101 | push @{ $Equiv{$k} }, $e if exists $Code{$k}; 102 | 103 | $Keys{$e} = $k; 104 | $Code{$k} = $e if !exists $Code{$k}; 105 | $Name{$e} = $name; 106 | # ignoring all the CJK stuff -- not needed for Church Slavic 107 | } 108 | close (DUCET); 109 | 110 | # $Code{$k} : precomposed (such as 04D1, CYRILLIC SMALL LETTER A WITH BREVE) 111 | # $eqs : equivalent sequence (such as <0430><0306>) 112 | # $starter : starter codepoint (integer such as hex '0430') 113 | my @Contractions; # store Cyrillic, currently required, and others. 114 | for my $k (sort keys %Equiv) { 115 | if ($Code{$k} !~ / / && $Equiv{$k}[0] =~ / /) { 116 | (my $eqs = "<$Equiv{$k}[0]>") =~ s/ />/ ? hex($1) : ''; 118 | push @Contractions, [$starter, "$Code{$k};$eqs"]; 119 | } 120 | } 121 | 122 | # read the cu.txt file 123 | my $in = "cu.txt"; 124 | my $out = "lib/Lingua/CU/cu.pl"; 125 | my %locale_keys; 126 | 127 | open (INPUT, "< $in") || croak "Cannot read from collate definition file cu.txt: $!"; 128 | open (OUTPUT, "> $out") || croak "Cannot create output file cu.pl: $!"; 129 | binmode OUTPUT; 130 | 131 | my $ptxt = ''; 132 | my $entry = ''; 133 | my $locale_version = $DEFAULT_LOCALE_VERSION; 134 | 135 | while () { 136 | chomp; 137 | next if /^\s*\z/; 138 | if (s/^locale_version//) { 139 | $locale_version = $1 if /(\S+)/; 140 | next; 141 | } 142 | if (/^(alternate)\s+(\S+)/) { 143 | my $v = "variable"; 144 | $ptxt .= " $v => '$2',\n"; 145 | $ptxt .= " $1 => '$2',\n"; 146 | next; 147 | } 148 | if (/^backwards$/) { 149 | $ptxt .= " backwards => 2,\n"; 150 | next; 151 | } 152 | if (/^upper$/) { 153 | $ptxt .= " upper_before_lower => 1,\n"; 154 | next; 155 | } 156 | if (s/^suppress//) { #/ 157 | s/\s*-\s*/../g; 158 | my @c = split; 159 | s/(?:0[Xx])?([0-9A-Fa-f]+)/0x$1/g for @c; 160 | my $list = join ", ", @c; 161 | $ptxt .= " suppress => [$list],\n"; 162 | next; 163 | } 164 | if (/^([\s\-0-9A-Fa-fXx]+)\z/) { # continue the last list 165 | s/\s*-\s*/../g; 166 | my @c = split; 167 | s/(?:0[Xx])?([0-9A-Fa-f]+)/0x$1/g for @c; 168 | my $list = join ", ", @c; 169 | $ptxt =~ s/\](,$)/$1/; 170 | $ptxt .= "\t\t$list],\n"; 171 | next; 172 | } 173 | if (/^\s*(#\s*)/) { 174 | $ptxt .= "$_\n" if $1 ne '#'; 175 | next; 176 | } 177 | 178 | $entry .= parse_entry($_, \%locale_keys); 179 | } 180 | 181 | # precomposed chars to be suppressed as additional equivalents 182 | if ($ptxt =~ /suppress => \[(.*)\]/s) { 183 | my @suplist = eval $1; 184 | my %suppressed; 185 | @suppressed{@suplist} = (1) x @suplist; 186 | 187 | for my $contract (@Contractions) { 188 | my $starter = $contract->[0]; 189 | my $addline = $contract->[1]; 190 | next if ! $suppressed{$starter}; 191 | $entry .= parse_entry($addline, \%locale_keys); 192 | } 193 | } 194 | 195 | if ($entry) { 196 | my $v = $vDUCET ? " # for DUCET v$vDUCET" : ''; 197 | $ptxt .= " entry => <<'ENTRY',$v\n"; 198 | $ptxt .= $entry; 199 | $ptxt .= "ENTRY\n"; 200 | } 201 | 202 | my $lv = " locale_version => $locale_version,\n"; 203 | print OUTPUT "+{\n$lv$ptxt};\n"; 204 | 205 | close (OUTPUT); 206 | close (INPUT); 207 | 208 | sub parse_entry { 209 | my $line = shift; 210 | my $lockeys = shift; 211 | 212 | my($e,$rule) = split_e_rule($line); 213 | my $name = getname($e); 214 | my $eq_rule = $rule eq '='; 215 | $rule = join '', map "<$_>", split ' ', $e if $eq_rule; 216 | my ($newce, $simpdec) = parse_rule($e, $rule, $lockeys); 217 | 218 | my $newentry = ''; 219 | 220 | if (!$lockeys->{$e}) { 221 | $newentry .= sprintf $ENT_FMT, $e, $newce, $name if !$eq_rule; 222 | $lockeys->{$e} = $newce; 223 | } else { 224 | $newentry .= "# already tailored: $_\n"; 225 | } 226 | 227 | if (!$simpdec && $Keys{$e}) { # duplicate for the decomposition 228 | my $key = $Keys{$e}; 229 | my @ce = $key =~ /$RE_CE/go; 230 | if (@ce > 1) { 231 | my $ok = 1; 232 | my $ee = ''; 233 | for my $c (@ce) { 234 | $ok = 0, last if !$Code{$c}; 235 | $ee .= ' ' if $ee ne ''; 236 | $ee .= $Code{$c}; 237 | } 238 | if ($ok && !$lockeys->{$ee}) { 239 | $newentry .= sprintf $ENT_FMT, $ee, $newce, $name; 240 | $lockeys->{$ee} = $newce; 241 | } 242 | if ($ee =~ s/ 030([01])/ 034$1/ && $ok && !$lockeys->{$ee}) { 243 | $newentry .= sprintf $ENT_FMT, $ee, $newce, $name; 244 | $lockeys->{$ee} = $newce; 245 | } 246 | } 247 | if ($Equiv{$key}) { 248 | for my $eq (@{ $Equiv{$key} }) { 249 | next if $key =~ /^\[\.0000\.[^]]+\]\z/; # primary ignorable 250 | next if $lockeys->{$eq}; 251 | next if $eq eq '3038'; # 3038 is identical to 2F17 in DUCET, 252 | $newentry .= sprintf $ENT_FMT, $eq, $newce, $Name{$eq}; 253 | $lockeys->{$eq} = $newce; 254 | } 255 | } 256 | } 257 | return $newentry; 258 | } 259 | 260 | sub getunicode { 261 | return join ' ', map { sprintf '%04X', $_ } unpack 'U*', shift; 262 | } 263 | 264 | sub parse_element { 265 | my $e = shift; 266 | $e =~ s/\{([A-Za-z']+)\}/' '.getunicode($1).' '/ge; 267 | $e =~ s/ +/ /g; 268 | trim($e); 269 | return $e; 270 | } 271 | 272 | sub split_e_rule { 273 | my $line = shift; 274 | my($e, $r) = split /;/, $line; 275 | return (parse_element($e), $r); 276 | } 277 | 278 | sub getname { 279 | my $e = shift; 280 | return $Name{$e} if $Name{$e}; # single collation element (without <>) 281 | my @e = split ' ', $e; 282 | my @name = map { $Name{$_} ? $Name{$_} : 283 | /^FD[DE][0-9A-F]\z/ ? "noncharacter-$_" : 284 | 'unknown' } @e; 285 | return sprintf '<%s>', join ', ', @name; 286 | } 287 | 288 | sub parse_rule { 289 | my $e = shift; 290 | my $e1 = $e =~ /^([0-9A-F]+)/ ? $1 : ''; 291 | my $rule = shift; 292 | my $lockeys = shift; 293 | my $result = ''; 294 | my $simple_decomp = 1; # rules containing only [A-Za-z'"] or 295 | 296 | for (my $prerule = $rule; $rule ne ''; $prerule = $rule) { 297 | $rule =~ s/^ +//; 298 | last if $rule =~ /^#/; 299 | if ($rule =~ s/^($RE_CE)//o) { 300 | my $k = $1; 301 | my $var = $k =~ /^\[\*/ ? 1 : 0; 302 | my @c = _getHexArray($k); 303 | @c = @c[0..2] if !$Use4th; 304 | $result .= ce($var, @c); 305 | next; 306 | } 307 | 308 | if ($rule =~ s/^(<([0-9A-F ]+)>\+\+\+\?)//) { 309 | my $cr = $1; 310 | my @c = split ' ', $2; 311 | my $compat = $Keys{$e}; 312 | my $decomp = join '', map { 313 | $Keys{$_} ? $Keys{$_} : $Keys{$_} #simple_cjk_deriv($_) 314 | } @c; 315 | my $regexp = $decomp; 316 | $regexp =~ s/([\[\]\.\*])/\\$1/g; 317 | $regexp =~ s/\.00(?:0[1-9A-F]|1[0-9A-F])(?:\\\.[0-9A-F]+|)\\\] 318 | /.(00(?:0[1-9A-F]|1[0-9A-F]))(?:\\.[0-9A-F]+|)\\\]/gx; 319 | # tertiary weights of 01-1F (excluding 00) 320 | my @tD = map hex($_), $decomp =~ /^$regexp\z/; 321 | my @tC = map hex($_), $compat =~ /^$regexp\z/; 322 | croak "wrong at $cr" unless @c == @tD && @c == @tC; 323 | my $r = join ' ', map "<$c[$_]>+++".($tC[$_] - $tD[$_]), 0..@c-1; 324 | $rule = $r.$rule; 325 | next; 326 | } 327 | 328 | my $key; 329 | if ($rule =~ s/^(<[0-9A-Za-z'{ }]+>|[A-Za-z'"])//) { 330 | my $e = $1; 331 | my $c = $e =~ tr/<>//d ? parse_element($e) : getunicode($e); 332 | croak "<$c> is too short" if 4 > length $c; 333 | $key = $lockeys->{$c} || $Keys{$c}; 334 | if (!defined $key) { 335 | my $u = hex $c; 336 | my @u = $Use4th ? ($u) : (); 337 | my @r; 338 | if (SBase <= $u && $u <= SFinal) { 339 | @r = $lockeys->{$OvHang}->($u) if $lockeys->{$OvHang}; 340 | } else { 341 | # but no check if $u is in CJK ideographs 342 | @r = $lockeys->{$OvCJK} ->($u) if $lockeys->{$OvCJK}; 343 | } 344 | if (@r) { 345 | $key = join '', map { 346 | ref $_ ? ce(0, @$_) : ce(0, $_, Min2Wt, Min3Wt, @u) 347 | } @r; 348 | } 349 | } 350 | } 351 | 352 | my @base; 353 | for my $k ($key =~ /$RE_CE/go) { 354 | my $var = $k =~ /^\[\*/ ? 1 : 0; 355 | push @base, [$var, _getHexArray($k)]; 356 | } 357 | croak "the rule seems wrong at $prerule" if !@base; 358 | 359 | my $replaced = 0; 360 | while ($rule =~ s/^(([+-])\2*)(\d+)//) { 361 | my $idx = length($1); 362 | my $num = $2 eq '-' ? -$3 : $3; 363 | $base[0][$idx] += $num; 364 | ++$replaced; 365 | } 366 | 367 | $simple_decomp = 0 if $replaced; 368 | for my $c (@base) { 369 | $c->[4] = hex $e1 if $replaced && $Use4th; 370 | $result .= ce(@$c); 371 | } 372 | croak "something wrong at $rule" if $prerule eq $rule; 373 | } 374 | return($result, $simple_decomp); 375 | } 376 | 377 | -------------------------------------------------------------------------------- /scripts/hip2unicode: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | use utf8; 6 | use Encode; 7 | 8 | # use lib "/home/sasha/Documents/API/Perl-Lingua-CU/lib/";#XXX: DEBUG ONLY 9 | use Lingua::CU::Scripts::HIP; 10 | use Getopt::Long; 11 | use Pod::Usage; 12 | use File::Basename; 13 | 14 | our(%opts); 15 | GetOptions(\%opts, 16 | 'encoding|c=s', 17 | 'format|f=s', 18 | 'help|h'); 19 | 20 | unless ($ARGV[0] || $opts{'help'}) 21 | { 22 | pod2usage(1); 23 | exit; 24 | } 25 | 26 | if ($opts{'help'}) 27 | { 28 | pod2usage(-verbose => 2, -noperldoc => 1); 29 | exit; 30 | } 31 | 32 | my $encoding = $opts{'encoding'} || "UTF-8"; 33 | if ($encoding) { 34 | my %list = map { $_ => 1 } Encode->encodings(":all"); 35 | $list{"UTF-8"} = 1; # hack 36 | unless (exists $list{$encoding}) { 37 | print "Error: Encoding $encoding is not defined.$/"; 38 | print "Possible encodings are:$/"; 39 | print join(", ", keys %list); 40 | print "$/"; 41 | exit; 42 | } 43 | } 44 | 45 | my $opt_f = $opts{"format"} || "txt"; 46 | # formatting setup 47 | my %format_chars_html = ( 48 | "%<" => "", 49 | "%>" => "", 50 | "%[" => "", 51 | "%]" => "", 52 | "%(" => "", 53 | "%)" => ""); 54 | 55 | my %format_chars_xml = ( 56 | "%<" => "", 57 | "%>" => "", 58 | "%[" => "", 59 | "%]" => "", 60 | "%(" => "", 61 | "%)" => ""); 62 | 63 | my %format_chars_text = ( 64 | "%<" => "", 65 | "%>" => "", 66 | "%[" => "", 67 | "%]" => "", 68 | "%(" => "", 69 | "%)" => ""); # I.E., all formatting information is lost 70 | 71 | my %format_chars_latex = ( 72 | "%<" => "\\textcolor{red}{", 73 | "%>" => "}", 74 | "%[" => "{\\emph{", # user should override emph to do what he would like 75 | "%]" => "}}", 76 | "%(" => "{\\scriptsize{", 77 | "%)" => "}}", 78 | "%" => "\\%", 79 | ); # TODO: must escape all other occurences of % 80 | my $infile = $ARGV[0]; 81 | unless (-e $infile) { 82 | print "Error: unable to find $infile.$/"; 83 | exit; 84 | } 85 | 86 | my ($name, $path, $suffix) = fileparse($infile, qr/\.[^.]*/); 87 | 88 | # set up file IO 89 | my $newline = $opt_f eq "html" ? "
$/" : $opt_f eq "tex" ? $/ . $/ : $opt_f eq "xml" ? "
$/" : $/; 90 | my $outfile = $path . $name; 91 | $outfile .= $opt_f eq "html" ? ".html" : $opt_f eq "tex" ? ".tex" : $opt_f eq "xml" ? ".xml" : ".txt"; 92 | 93 | my %format_dict = $opt_f eq "html" ? %format_chars_html : $opt_f eq "tex" ? %format_chars_latex : $opt_f eq "xml" ? %format_chars_xml : %format_chars_text; 94 | my @Separators = qw/<::лат> <::рус> <::слав> <::греч> <::глаг>/; 95 | my $rejex = join("|", @Separators); 96 | my $mode = 2; # default mode is Slavonic 97 | my @Starts = $opt_f eq "html" ? ("", "", "", "", "") : 98 | $opt_f eq "tex" ? ("", "", "{\\slv ", "", "") : 99 | $opt_f eq "xml" ? ("", "", "", "", "") : ("", "", "", "", ""); 100 | my @Stops = $opt_f eq "html" ? (("") x 5, "") : $opt_f eq "tex" ? ("", "", "}", "", "", "") : 101 | $opt_f eq "xml" ? ("", "", "", "", "", "") : ("") x 6; 102 | 103 | my $header = $opt_f eq "html" ? qq($outfile 104 | 105 | 125 | $/$/) : $opt_f eq "tex" ? qq(\\documentclass[12pt,a4paper]{article}\\usepackage{color} 126 | \\usepackage{xltxtra}\\newfontfamily{\\slv}{Ponomar Unicode} 127 | \\setmainfont[Mapping=tex-text]{Linux Libertine O} 128 | \\newcommand{\\comments}[1]{} 129 | \\begin{document}$/$/) : 130 | $opt_f eq "xml" ? qq( 131 | 132 | ) : ""; 133 | my $footer = $opt_f eq "html" ? qq($/) : $opt_f eq "tex" ? qq($/\\end{document}) : $opt_f eq "xml" ? "