├── util.py
├── VERSION
├── tokenize
│   ├── __init__.py
│   └── tokenizer.py
├── README.md
├── stem
│   ├── __init__.py
│   ├── api.py
│   └── itrstem.py
├── __init__.py
├── rmvstopwords.py
└── vectorizer.py

/util.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1.0
--------------------------------------------------------------------------------
/tokenize/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# np_lang
Modules for Nepali language processing.
--------------------------------------------------------------------------------
/stem/__init__.py:
--------------------------------------------------------------------------------
# Natural Language Toolkit: Stemmers

from np_lang.stem.api import Stemmer
from np_lang.stem.itrstem import IterativeStemmer
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
# Nepali Language Processing
###########################################################
# PACKAGES
###########################################################

from np_lang import stem
from np_lang import tokenize
from np_lang import util, rmvstopwords, vectorizer
--------------------------------------------------------------------------------
/rmvstopwords.py:
--------------------------------------------------------------------------------
# coding: utf-8


class StopWordRemover:
    def __init__(self, stopwords=None):
        # Use None instead of a mutable default; fall back to the bundled list.
        self.stopwords = stopwords or self.get_stopwords()

    def get_stopwords(self):
        # The stopword list is read relative to the current working directory.
        with open('stopwords.txt', 'r', encoding='utf-8') as f:
            stopwords = f.read().splitlines()
        return stopwords

    def remove_stopwords(self, text):
        result = []
        for word in text.split():
            if word not in self.stopwords:
                result.append(word)
        return ' '.join(result)
--------------------------------------------------------------------------------
/stem/api.py:
--------------------------------------------------------------------------------
# Nepali Language Processing: Stemmer Interface

from abc import ABCMeta, abstractmethod
from six import add_metaclass


@add_metaclass(ABCMeta)
class Stemmer(object):
    """
    A processing interface for removing morphological affixes from
    words. This process is known as stemming.
    """

    @abstractmethod
    def stem(self, token):
        """
        Strip affixes from the token and return the stem.

        :param token: The token that should be stemmed.
        :type token: str
        """
--------------------------------------------------------------------------------
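A minimal usage sketch for the stop-word remover (not part of the repository). It passes the stopword list explicitly, so it does not depend on a stopwords.txt file being present in the working directory; the two stopwords are chosen only for illustration.

from np_lang.rmvstopwords import StopWordRemover

# Illustrative stopword list; the packaged list normally comes from stopwords.txt.
remover = StopWordRemover(stopwords=['छ', 'हो'])
print(remover.remove_stopwords('त्यो घर रातो छ'))  # -> त्यो घर रातो

--------------------------------------------------------------------------------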
/tokenize/tokenizer.py:
--------------------------------------------------------------------------------
# coding: utf-8
import re


class Tokenizer:
    """Base class for all tokenizers."""

    def __init__(self):
        # We need this for the build_repr to work properly in py2.7
        pass

    def sentence_tokenize(self, text):
        """
        :param text: text to split into sentences
        :return: a list of sentences from the text
        """
        return re.split(r'(?<=[।?!]) +', text)

    def word_tokenize(self, text):
        """Tokenizes text into words.

        Parameters
        ----------
        text: text to split into words

        Returns
        -------
        words: list of non-ASCII words
        """
        # Words that legitimately end in a colon and must keep it.
        colon_lexicon = ['अंशत:', 'मूलत:', 'सर्वत:', 'प्रथमत:', 'सम्भवत:', 'सामान्यत:', 'विशेषत:', 'प्रत्यक्षत:',
                         'मुख्यत:', 'स्वरुपत:', 'अन्तत:', 'पूर्णत:', 'फलत:', 'क्रमश:', 'अक्षरश:', 'प्रायश:',
                         'कोटिश:', 'शतश:', 'शब्दश:']

        # Handling punctuations: , " ' ) ( { } [ ] ! ‘ ’ “ ” :- ? । / —
        text = re.sub(r'\,|\"|\'| \)|\(|\)| \{| \}| \[| \]|!|‘|’|“|”| \:-|\?|।|/|\—', ' ', text)
        words_original = text.split()

        words = []
        for word in words_original:
            if word[-1:] == '-':
                # Strip a trailing hyphen; drop a bare hyphen altogether.
                if word != '-':
                    words.append(word[:-1])
            elif word[-1:] == ':' and word not in colon_lexicon:
                # Strip a trailing colon unless the word is in the colon lexicon.
                words.append(word[:-1])
            else:
                words.append(word)

        return words
--------------------------------------------------------------------------------
/stem/itrstem.py:
--------------------------------------------------------------------------------
# encoding: utf-8
from np_lang.stem.api import Stemmer


class IterativeStemmer(Stemmer):
    def __init__(self):
        self.category_1 = self.read_file('category_1.txt')
        self.category_2 = self.read_file('category_2.txt')
        self.category_3 = self.read_file('category_3.txt')

    def read_file(self, filename):
        # Rule files are read relative to the current working directory.
        with open(filename, 'r', encoding='utf-8') as f:
            rule = f.read().splitlines()
        return rule

    def stem(self, word):
        result = self.remove_category_1(word)
        return self.remove_category_2(result)

    def remove_category_1(self, word):
        # Strip the first matching category-1 suffix, if any.
        for rule in self.category_1:
            if word.endswith(rule):
                return word[:-len(rule)]
        return word

    def remove_category_3(self, word):
        # Strip the first matching category-3 suffix and report whether one matched.
        for rule in self.category_3:
            if word.endswith(rule):
                return word[:-len(rule)], True
        return word, False

    def remove_category_2(self, word):
        if word.endswith(tuple(self.category_2)):
            if word.endswith("ँ") or word.endswith("ं"):
                if word[:-1].endswith(("ो", "ु", "उ", "े", "ोै")):
                    return self.remove_category_2(word[:-1])
                else:
                    return word
            elif word.endswith("ै"):
                if word[:-1].endswith("त्र"):
                    return self.remove_category_2(word[:-1])
                else:
                    return word
            else:
                return self.remove_category_2(word[:-1])

        result, success = self.remove_category_3(word)
        if success:
            return self.remove_category_2(result)
        return result
--------------------------------------------------------------------------------
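A short usage sketch for the tokenizer (not part of the repository). sentence_tokenize splits after the danda (।), question mark, or exclamation mark when followed by a space; word_tokenize strips punctuation and trailing colons outside the colon lexicon. The IterativeStemmer above expects category_1.txt, category_2.txt and category_3.txt to be readable from the working directory, so it is not exercised here.

from np_lang.tokenize.tokenizer import Tokenizer

tokenizer = Tokenizer()

print(tokenizer.sentence_tokenize('त्यो घर रातो छ। यो निलो कलम हो?'))
# -> ['त्यो घर रातो छ।', 'यो निलो कलम हो?']

print(tokenizer.word_tokenize('त्यो घर रातो छ।'))
# -> ['त्यो', 'घर', 'रातो', 'छ']

--------------------------------------------------------------------------------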
/vectorizer.py:
--------------------------------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from np_lang.stem.itrstem import IterativeStemmer
from np_lang.tokenize.tokenizer import Tokenizer
from np_lang.rmvstopwords import StopWordRemover

"""
>>> from np_lang.vectorizer import VectorSpaceModel
>>> d1 = "त्यो घर रातो छ"
>>> d2 = "यो निलो कलम हो"
>>> d3 = "भाईको घर हो"
>>> documents = [d1, d2, d3]

>>> vector_space_model = VectorSpaceModel(documents)
>>> vector_space_model.compute_tf_idf()

>>> print(vector_space_model.tf)
  (0, 4)    1
  (0, 1)    1
  (1, 0)    1
  (1, 2)    1
  (2, 3)    1
  (2, 1)    1

>>> print(vector_space_model.tf_idf)
  (0, 1)    0.605348508106
  (0, 4)    0.795960541568
  (1, 2)    0.707106781187
  (1, 0)    0.707106781187
  (2, 1)    0.605348508106
  (2, 3)    0.795960541568

>>> print(vector_space_model.vocabulary())
{'निलो': 2, 'कलम': 0, 'रातो': 4, 'घर': 1, 'भाई': 3}

>>> print(vector_space_model.tf_matrix())
[[0 1 0 0 1]
 [1 0 1 0 0]
 [0 1 0 1 0]]

>>> print(vector_space_model.idf_matrix())
[ 1.69314718  1.28768207  1.69314718  1.69314718  1.69314718]

>>> print(vector_space_model.tf_idf_matrix())
[[ 0.          0.60534851  0.          0.          0.79596054]
 [ 0.70710678  0.          0.70710678  0.          0.        ]
 [ 0.          0.60534851  0.          0.79596054  0.        ]]

>>> print(vector_space_model.document_similarity())
[[ 1.          0.          0.36644682]
 [ 0.          1.          0.        ]
 [ 0.36644682  0.          1.        ]]
"""


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer that stems each token produced by the base analyzer."""

    def build_analyzer(self):
        stemmer = IterativeStemmer()
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))


class VectorSpaceModel:
    def __init__(self, documents):
        self.documents = documents
        self.vectorizer = StemmedCountVectorizer()
        self.tf = None
        self.tf_idf = None
        self.__tfidf = None

    def __repr__(self):
        return ""

    def __compute_tf(self):
        if not self.documents:
            raise Exception("Documents cannot be empty")
        self.vectorizer = StemmedCountVectorizer(stop_words=StopWordRemover().get_stopwords(),
                                                 tokenizer=lambda text: Tokenizer().word_tokenize(text=text),
                                                 analyzer='word')
        self.tf = self.vectorizer.fit_transform(self.documents)

    def compute_tf_idf(self):
        self.__compute_tf()
        self.__tfidf = TfidfTransformer(norm="l2")
        self.__tfidf.fit(self.tf)
        self.tf_idf = self.__tfidf.transform(self.tf)

    def vocabulary(self):
        try:
            return self.vectorizer.vocabulary_
        except Exception:
            return {}

    def tf_matrix(self):
        if self.tf is not None:
            return self.tf.todense()
        return []

    def idf_matrix(self):
        if self.__tfidf is not None:
            return self.__tfidf.idf_
        return []

    def tf_idf_matrix(self):
        if self.tf_idf is not None:
            return self.tf_idf.todense()
        return []

    def document_similarity(self):
        if self.tf_idf is not None:
            return cosine_similarity(self.tf_idf)
        raise Exception("Compute tf idf first")
--------------------------------------------------------------------------------
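The matrix returned by document_similarity() can be queried directly for the closest document. The sketch below is not part of np_lang and assumes the stopword and stemmer rule files the modules read (stopwords.txt, category_1.txt, category_2.txt, category_3.txt) are available in the working directory; most_similar is a hypothetical helper.

import numpy as np

from np_lang.vectorizer import VectorSpaceModel


def most_similar(model, index):
    # Hypothetical helper: index of the document closest to documents[index].
    sims = model.document_similarity()[index].copy()
    sims[index] = -1.0  # ignore the document's similarity to itself
    return int(np.argmax(sims))


documents = ['त्यो घर रातो छ', 'यो निलो कलम हो', 'भाईको घर हो']
model = VectorSpaceModel(documents)
model.compute_tf_idf()

print(most_similar(model, 0))  # 2 -> 'भाईको घर हो', per the docstring example above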