/\n/g;
178 | tr/\x01-\x09\x1b-\x1f\x21-\x2d\x2f-\x5a\x5c\x5e-\x9f//d; # Deletes all chars below xa0 except: 0a,20,2e,5b,5d
179 | }
180 |
181 | if ($input_type eq 'utf8') {
182 | tr/اأبپتثجچحخدذرزژسشصضطظعغفقكگلمنوهيَُِآةکیءىۀئؤًّ،؛؟٪/ABbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoeOPkiMiXIUN~,;?%*\-/;
183 | }
184 |
185 | elsif ($input_type eq 'ncr') {
186 | my %unihtml2roman = (
187 | 'ا' => 'A', '☿' => 'A', 'أ' => 'B', 'ب' => 'b', 'ة' => 'P', 'پ' => 'p', 'ت' => 't', 'ث' => 'V', 'ج' => 'j', 'چ' => 'c', 'ح' => 'H', 'خ' => 'x', 'د' => 'd', 'ذ' => 'L', 'ر' => 'r', 'ز' => 'z', 'ژ' => 'J', 'س' => 's', 'ش' => 'C', 'ص' => 'S', 'ض' => 'D', 'ط' => 'T', 'ظ' => 'Z', 'ع' => 'E', 'غ' => 'G', 'ف' => 'f', 'ق' => 'q', 'ك' => 'k', 'ک' => 'k', 'گ' => 'g', 'ل' => 'l', 'م' => 'm', 'ن' => 'n', 'و' => 'u', 'ه' => 'h', 'ي' => 'i', 'ی' => 'i', 'ى' => 'A', 'َ' => 'a', 'ُ' => 'o', 'ِ' => 'e', 'ّ' => '~', 'آ' => 'O', 'ء' => 'M', 'ً' => 'N', 'أ' => 'A', 'ؤ' => 'U', 'إ' => 'A', 'ئ' => 'I', 'ۀ' => 'X', '٪' => '%', '،' => ',', '؛' => ';', '؟' => '?', '' => "-", ' ' => ' ', '.' => '.', ':' => ':', );
188 | my @charx = split(/(?=\&\#)|(?=\s)|(?=\n)/, $_);
189 | $_ = "";
190 | foreach my $charx (@charx) {
191 | $_ .= $unihtml2roman{$charx};
192 | }
193 | } # ends elsif ($input_type eq 'ncr')
194 |
195 | elsif ($input_type eq 'cp1256') {
196 | tr/\xc7\xc3\xc8\x81\xca\xcb\xcc\x8d\xcd\xce\xcf\xd0\xd1\xd2\x8e\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdd\xde\xdf\x90\xe1\xe3\xe4\xe6\xe5\xed\xf3\xf5\xf6\xc2\xc9\x98\xc1\xc0\xc6\xc4\xf0\xf8\xa1\xba\xbf\xab\xbb\x9d\xec/ABbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoeOPkMXIUN~,;?{}\-i/; }
197 |
198 | elsif ($input_type eq 'isiri3342') {
199 | tr/\xc1\xf8\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xfe\xf0\xf2\xf1\xc0\xc1\xfc\xda\xe1\xc2\xfb\xfa\xf3\xf6\xac\xbb\xbf\xa5\xe7\xe6\xa1/ABbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoeO\x7cPkiMIUN~,;?%{}\-/; }
200 |
201 | else { die "Perstem error: unrecognized --input type\n\n" . $usage }
202 |
203 | } # if ($input_type)
204 |
205 | @_ = split(/(?\S*?)(?:\S{3}(? mA Ast, but sentence-final punctuation not necessary
337 |
338 | ### Non-verbal ###
339 | s/\b([^+ ]{3,}?)(? 'ا', '|' => 'ا', 'B' => 'أ', 'b' => 'ب', 'p' => 'پ', 't' => 'ت', 'V' => 'ث', 'j' => 'ج', 'c' => 'چ', 'H' => 'ح', 'x' => 'خ', 'd' => 'د', 'L' => 'ذ', 'r' => 'ر', 'z' => 'ز', 'J' => 'ژ', 's' => 'س', 'C' => 'ش', 'S' => 'ص', 'D' => 'ض', 'T' => 'ط', 'Z' => 'ظ', 'E' => 'ع', 'G' => 'غ', 'f' => 'ف', 'q' => 'ق', 'k' => 'ک', 'K' => 'ك', 'g' => 'گ', 'l' => 'ل', 'm' => 'م', 'n' => 'ن', 'u' => 'و', 'v' => 'و', 'w' => 'و', 'h' => 'ه', 'X' => 'ۀ', 'i' => 'ی', 'I' => 'ئ', 'a' => 'َ', 'o' => 'ُ', 'e' => 'ِ', '~' => 'ّ', ',' => '،', ';' => '؛', '?' => '؟', 'O' => 'آ', 'M' => 'ء', 'N' => 'ً', 'U' => 'ؤ', '-' => '', ' ' => ' ', '_' => '_', '+' => '+', "\n" => '
', '.' => '.', );
457 | my @charx = split(//, $_);
458 | $_ = '';
459 | foreach my $charx (@charx) {
460 | $_ .= $roman2unihtml{$charx};
461 | }
462 | } # ends elsif (ncr)
463 |
464 | elsif ($output_type eq 'cp1256') {
465 | tr/ABbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoeOPkMXIUN~,;?{}\-i/\xc7\xc3\xc8\x81\xca\xcb\xcc\x8d\xcd\xce\xcf\xd0\xd1\xd2\x8e\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdd\xde\xdf\x90\xe1\xe3\xe4\xe6\xe5\xed\xf3\xf5\xf6\xc2\xc9\x98\xc1\xc0\xc6\xc4\xf0\xf8\xa1\xba\xbf\xab\xbb\x9d\xec/;
466 |
467 | # s/\x2e/\xfe\x2e\xfd/g; # Corrects periods to be RTL embedded; broken
468 | }
469 |
470 | elsif ($output_type eq 'isiri3342') {
471 | tr/ABbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoeO\x7cPkiMIUN~,;?%{}\-/\xc1\xf8\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xfe\xf0\xf2\xf1\xc0\xc1\xfc\xda\xe1\xc2\xfb\xfa\xf3\xf6\xac\xbb\xbf\xa5\xe7\xe6\xa1/; }
472 |
473 | elsif ($output_type eq 'arabtex') {
474 | my %roman2arabtex = (
475 | 'A' => 'A', '|' => 'a', 'b' => 'b', 'p' => 'p', 't' => 't', 'V' => '_t', 'j' => 'j', 'c' => '^c', 'H' => '.h', 'x' => 'x', 'd' => 'd', 'L' => '_d', 'r' => 'r', 'z' => 'z', 'J' => '^z', 's' => 's', 'C' => '^s', 'S' => '.s', 'D' => '.d', 'T' => '.t', 'Z' => '.z', 'E' => '`', 'G' => '.g', 'f' => 'f', 'q' => 'q', 'K' => 'k', 'k' => 'k', 'g' => 'g', 'l' => 'l', 'm' => 'm', 'n' => 'n', 'u' => 'U', 'v' => 'w', 'w' => 'w', 'h' => 'h', 'X' => 'H-i', 'i' => 'I', 'I' => '\'y', 'a' => 'a', 'o' => 'o', 'e' => 'e', 'P' => 'T', '~' => '', ',' => ',', ';' => ';', '?' => '?', 'O' => '^A', 'M' => '\'', 'N' => 'aN', 'U' => 'U\'', '{' => '\lq ', '}' => '\rq ', '-' => '\hspace{0ex}', '.' => '.', ' ' => ' ', '_' => '_', '+' => '+', );
476 | my @charx = split(//, $_);
477 | $_ = '';
478 | foreach my $charx (@charx) {
479 | $_ .= $roman2arabtex{$charx};
480 | }
481 |
482 | # $_ .= '\\\\'; # Appends LaTeX newline '\\' after each line
483 | } # ends elsif (arabtex)
484 |
485 | else { die "Perstem error: unrecognized --output type\n\n" . $usage }
486 |
487 | ## Restore temporary Latin doppelgaenger characters to their normal forms
488 | tr/ⓐ-ⓩⒶ-Ⓩ⓿①-⑨⁆⁓‚;⁇‰⁎‐✢/a-zA-Z01-9~,;?%*\-+/;
489 |
490 | if ($output_type eq 'utf8' && m/[^ \n]/) { # If utf8 & non-empty
491 | binmode(STDOUT, ":utf8"); # Uses the :utf8 output layer
492 | $full_line .= "$_ ";
493 | }
494 | elsif ( /[^ \n]/ ) { # if arabic-script line is non-empty
495 | $full_line .= "$_ ";
496 | }
497 |
498 | } # ends if ($output_type ne 'translit') -- for native Perso-Arabic-script input
499 | elsif ( /[^ \n]/ ) { # if latin-script line is non-empty
500 | if ($input_type ne 'translit') {
501 | ## Deal with latin-script strings from arabic-script input
502 | tr/ⓐ-ⓩⒶ-Ⓩ⓿①-⑨⁆⁓‚;⁇‰⁎‐✢/a-zA-Z01-9~,;?%*\-+/;
503 | }
504 | $full_line .= "$_ ";
505 | }
506 |
507 | } # ends foreach @_
508 |
509 | $full_line =~ s/ $//;
510 | print $full_line;
511 |
512 | } # ends while (<>)
513 |
514 | ### Resolve section
515 | ## The format of the Resolve section ( __DATA__ ) is as follows (entries below may also carry a trailing tag field, e.g. 'N' or 'V+PST.3.SG'):
516 | ## 1. Mokassar (broken plurals): 'ktb ktAb' OR 'ktb ktAb_+PL'
517 | ## 2. Preparsed (speed): 'krdn kr_+dn'
518 | ## 3. Don't stem (false positive): 'bArAn bArAn'
519 | ## 4. Stop word (delete): 'u '
520 | __DATA__
521 | u u CONJ
522 | iA iA CONJ
523 | AmA AmA CONJ
524 | uli uli CONJ
525 | dr dr P
526 | bh bh P
527 | Az Az P
528 | bA bA P
529 | tA tA P
530 | bi bi P
531 | br br P
532 | brAi brAi P
533 | rui ru_+e P+EZ
534 | Hti Hti P
535 | sui su_+e P+EZ
536 | kh kh C
537 | Ain Ain DT+PROX
538 | On On DT+DIST
539 | ik ik DT
540 | hr hr DT
541 | rA rA ACC
542 | rAi rA_+e ACC+EZ
543 | mi mi MORPH.IPFV
544 | hA hA MORPH.PL
545 | Ai Ai MORPH
546 | hm hm
547 | mn mn PRON+1.SG
548 | tu tu PRON+2.SG
549 | Au Au PRON+3.SG
550 | mA mA PRON+1.PL
551 | CmA CmA PRON+2
552 | AiCAn AiCAn PRON+3.PL
553 | OnhA OnhA PRON+3.PL
554 | OnAn OnAn PRON+3.PL
555 | iki iki PRON+3.SG
556 | Agr Agr PRT+COND
557 | ps ps INTJ
558 | ch ch
559 | hic hic NEG
560 | nh nh NEG
561 | bArAn bArAn N
562 | tim tim N
563 | hfth hfth N
564 | kihAn kihAn N
565 | zndgi zndgi N
566 | sAzmAn sAzmAn N
567 | EnuAn EnuAn N
568 | nZAm nZAm N
569 | jhAn jhAn N
570 | pAiAn pAiAn N
571 | miAn miAn N
572 | frmAndh frmAndh N
573 | nmAindh nmAindh N
574 | nmAiC nmAiC N
575 | nuisndh nuisndh N
576 | prundh prundh N
577 | xndh xndh N
578 | bzrgi bzrg_+i N+ATTR
579 | bEid bEid A
580 | biCtr biC A
581 | digr digr A
582 | nhAii nhAii A
583 | nhAIi nhAii A
584 | frxndh frxndh A
585 | milAdi milAdi A
586 | Oindh O_+ndh A+PRPT
587 | frhngi frhngi
588 | tnhA tnhA
589 | AntxAbAt AntxAbAt N
590 | AstfAdh AstfAdh N
591 | iAzdh iAzdh NUM
592 | duAzdh duAzdh NUM
593 | pAnzdh pAnzdh NUM
594 | sizdh sizdh NUM
595 | CAnzdh CAnzdh NUM
596 | nuzdh nuzdh NUM
597 | miliArd miliArd NUM
598 | rIis rIis N
599 | lndn lndn N
600 | mEdn mEdn N
601 | tmdn tmdn
602 | grdn grdn N
603 | lAdn lAdn
604 | kudn kudn
605 | mAdh mAdh
606 | kilumtr kilumtr N
607 | jAdh jAdh
608 | ktb ktAb N
609 | AfkAr fkr N
610 | AEDA EDu
611 | AfGAnstAn AfGAnstAn N
612 | AslAmi AslAm_+i N
613 | Ardn Ardn N
614 | OmrikA OmrikA N
615 | OmrikAii OmrikA_+i
616 | AnsAni AnsAn_+i N
617 | bnglAdC bnglAdC N
618 | thrAn thrAn N
619 | pArlmAn pArlmAn N
620 | zbAnhAi zbAn_+hA_+e N+PL+EZ
621 | zbAnhA zbAn_+hA N+PL
622 | kCurhAi kCur_+hA_+e N+PL+EZ
623 | kCurhA kCur_+hA N+PL
624 | tBsisAt tBsis_+At N+PL
625 | mrdm mrdm N
626 | dftr dftr N
627 | dfAtr dftr N
628 | dktr dktr N
629 | jAi jA_+e N+EZ
630 | ksAni ks N+PL+INDEF
631 | OVAr AVr N+PL.BROKEN
632 | Amur Amr N+PL.BROKEN
633 | AfrAd frd N+PL.BROKEN
634 | AfrAdi frd_+i N+PL.BROKEN+INDEF
635 | muAd mAdh N+PL.BROKEN
636 | ruAbT rAbTh N+PL.BROKEN
637 | CrAiT CrT N+PL.BROKEN
638 | mnATq mnTqh N+PL.BROKEN
639 | mnAbE mnbE N+PL.BROKEN
640 | msAIl msIlh N+PL.BROKEN
641 | SnAiE SniEh N+PL.BROKEN
642 | ntAij ntijh N+PL.BROKEN
643 | mll mlt N+PL.BROKEN
644 | Hdud Hd N+PL.BROKEN
645 | Hquq Hq N+PL.BROKEN
646 | mrAsm rsm N+PL.BROKEN
647 | AnuAE nuE N+PL.BROKEN
648 | muArd murd N+PL.BROKEN
649 | EuAml EAml N+PL.BROKEN
650 | mrAkz mrkz N+PL.BROKEN
651 | Elum Elm N+PL.BROKEN
652 | nqAT nqTh N+PL.BROKEN
653 | AfkAr fkr N+PL.BROKEN
654 | ASul ASl N+PL.BROKEN
655 | quAnin qAnun N+PL.BROKEN
656 | mnAfE mnfEt N+PL.BROKEN
657 | EnASr EnSr N+PL.BROKEN
658 | ATrAf Trf N+PL.BROKEN
659 | xTuT xT N+PL.BROKEN
660 | EuArD EArDh N+PL.BROKEN
661 | AHzAb Hzb N+PL.BROKEN
662 | AEDAi EDu_+e N+PL.BROKEN+EZ
663 | mrA mn rA
664 | trA tu rA
665 | cist ch Ast
666 | kjAst kjA Ast
667 | xuAhd xuAh_+d AUX+3.SG
668 | bAid bA_+d AUX+3.SG
669 | CAid CA_+d AUX+3.SG
670 | Omdh Om_+dh V+PSPT
671 | Ourdh Our_+dh V+PSPT
672 | Ast Ast V.3.SG.PRS
673 | bAxt bAx_+t V+PST.3.SG
674 | brdh br_+dh V+PSPT
675 | bud bu_+d V+PST.3.SG
676 | budh bu_+dh V+PSPT
677 | budn bu_+dn V+GER
678 | budnd bu_+d_+nd V+PST+3.PL
679 | Cdh C_+dh V+PSPT
680 | Cdn C_+dn V+GER
681 | Cud Cu_+d V.PRS+3.SG
682 | Cundh Cu_+ndh V.PRS+PRPT
683 | dACt dAC_+t V+PST.3.SG
684 | dACth dAC_+th V+PSPT
685 | dAdh dA_+dh V+PSPT
686 | dAdn dA_+dn V+GER
687 | dAdnd dA_+d_+nd V+PST+3.PL
688 | midAd mi-+_dA_+d V+IPFV+PST.3.SG
689 | mi-dAd mi-+_dA_+d V+IPFV+PST.3.SG
690 | dAnst dAns_+t V+PST.3.SG
691 | dArd dAr_+d V.PRS+3.SG
692 | dhd dh_+d V.PRS+3.SG
693 | dhndh dh_+ndh V.PRS+PRPT
694 | didn di_+dn V+GER
695 | didh di_+dh V+PSPT
696 | binndh bin_+ndh V.PRS+PRPT
697 | gft gf_+t V+PST.3.SG
698 | gLACt gLAC_+t V+PST.3.SG
699 | gLACth gLAC_+th V+PSPT
700 | gLCth gLC_+th V+PSPT
701 | grfth grf_+th V+PSPT
702 | grft grf_+t V+PST.3.SG
703 | iAft iAf_+t V+PST.3.SG
704 | kCt kC_+t V+PST.3.SG
705 | knnd kn_+nd V.PRS+3.PL
706 | knndh kn_+ndh V.PRS+PRPT
707 | knd kn_+d V.PRS+3.SG
708 | krdn kr_+dn V+GER
709 | krdh kr_+dh V+PSPT
710 | krdnd kr_+d_+nd V+PST+3.PL
711 | hst hs_+t V+PST.3.SG
712 | nCdh n+_C_+dh V+NEG+PSPT
713 | nist n+_Ast V+NEG+3.SG.PRS
714 | ntuAnst ntuAns_+t V+PST.3.SG
715 | prdAxt prdAx_+t V+PST.3.SG
716 | rft rf_+t V+PST.3.SG
717 | sAxt sAx_+t V+PST.3.SG
718 | sAxth sAx_+th V+PSPT
719 | tuAnst tuAns_+t V+PST.3.SG
720 | xuAst xuAs_+t V+PST.3.SG
721 | zdh z_+dh V+PSPT
722 | zdn z_+dn V+GER
723 | zdnd z_+d_+nd V+PST+3.PL
724 | znndh zn_+ndh V.PRS+PRPT
725 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/FarsiAnalysisBinderProcessor.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 | /** Analysis binder processor that registers the "farsi" analyzer and no token filters. */
3 | public class FarsiAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
4 |
5 | /** Empty body: no token filters are bound by this processor. */
6 | @Override
7 | public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
8 | // Nothing is registered on tokenFiltersBindings here.
9 | }
10 |
11 | /** Binds the analyzer name "farsi" to FarsiAnalyzerProvider. */
12 | @Override
13 | public void processAnalyzers(AnalyzersBindings analyzersBindings) {
14 | analyzersBindings.processAnalyzer("farsi", FarsiAnalyzerProvider.class);
15 | }
16 |
17 | }
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/FarsiAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.elasticsearch.common.inject.Inject;
4 | import org.elasticsearch.common.inject.assistedinject.Assisted;
5 | import org.elasticsearch.common.settings.Settings;
6 | import org.elasticsearch.env.Environment;
7 | import org.elasticsearch.index.Index;
8 | import org.elasticsearch.index.settings.IndexSettingsService;
9 |
10 | import ir.areka.analyzer.lucene.FarsiAnalyzer;
11 |
12 | public class FarsiAnalyzerProvider extends AbstractIndexAnalyzerProvider