├── util.py
├── VERSION
├── tokenize
│   ├── __init__.py
│   └── tokenizer.py
├── README.md
├── stem
│   ├── __init__.py
│   ├── api.py
│   └── itrstem.py
├── __init__.py
├── rmvstopwords.py
└── vectorizer.py

/util.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1.0
--------------------------------------------------------------------------------
/tokenize/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# np_lang
Modules for Nepali language processing.
--------------------------------------------------------------------------------
/stem/__init__.py:
--------------------------------------------------------------------------------
# Natural Language Toolkit: Stemmers

from np_lang.stem.api import Stemmer
from np_lang.stem.itrstem import IterativeStemmer
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
# Nepali Language Processing
###########################################################
# PACKAGES
###########################################################

from np_lang import stem
from np_lang import tokenize
from np_lang import util, rmvstopwords, vectorizer
--------------------------------------------------------------------------------
/rmvstopwords.py:
--------------------------------------------------------------------------------
# coding: utf-8


class StopWordRemover:
    def __init__(self, stopwords=None):
        # Use None instead of a mutable default; fall back to the bundled list.
        self.stopwords = stopwords or self.get_stopwords()

    def get_stopwords(self):
        # The stopword list is read relative to the current working directory.
        with open('stopwords.txt', 'r', encoding='utf-8') as f:
            stopwords = f.read().splitlines()
        return stopwords

    def remove_stopwords(self, text):
        result = []
        for word in text.split():
            if word not in self.stopwords:
                result.append(word)
        return ' '.join(result)
--------------------------------------------------------------------------------
/stem/api.py:
--------------------------------------------------------------------------------
# Nepali Language Processing: Stemmer Interface

from abc import ABCMeta, abstractmethod
from six import add_metaclass


@add_metaclass(ABCMeta)
class Stemmer(object):
    """
    A processing interface for removing morphological affixes from
    words. This process is known as stemming.
    """

    @abstractmethod
    def stem(self, token):
        """
        Strip affixes from the token and return the stem.

        :param token: The token that should be stemmed.
        :type token: str
        """
--------------------------------------------------------------------------------
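A minimal usage sketch for the stop-word remover (not part of the repository). It passes the stopword list explicitly, so it does not depend on a stopwords.txt file being present in the working directory; the two stopwords are chosen only for illustration.

from np_lang.rmvstopwords import StopWordRemover

# Illustrative stopword list; the packaged list normally comes from stopwords.txt.
remover = StopWordRemover(stopwords=['छ', 'हो'])
print(remover.remove_stopwords('त्यो घर रातो छ'))  # -> त्यो घर रातो

--------------------------------------------------------------------------------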
/tokenize/tokenizer.py:
--------------------------------------------------------------------------------
# coding: utf-8
import re


class Tokenizer:
    """Base class for all tokenizers."""

    def __init__(self):
        # We need this for the build_repr to work properly in py2.7
        pass

    def sentence_tokenize(self, text):
        """
        :param text: text to split into sentences
        :return: a list of sentences from the text
        """
        return re.split(r'(?<=[।?!]) +', text)

    def word_tokenize(self, text):
        """Tokenizes text into words.

        Parameters
        ----------
        text: text to split into words

        Returns
        -------
        words: list of non-ASCII words
        """
        # Words that legitimately end in a colon and must keep it.
        colon_lexicon = ['अंशत:', 'मूलत:', 'सर्वत:', 'प्रथमत:', 'सम्भवत:', 'सामान्यत:', 'विशेषत:', 'प्रत्यक्षत:',
                         'मुख्यत:', 'स्वरुपत:', 'अन्तत:', 'पूर्णत:', 'फलत:', 'क्रमश:', 'अक्षरश:', 'प्रायश:',
                         'कोटिश:', 'शतश:', 'शब्दश:']

        # Handling punctuations: , " ' ) ( { } [ ] ! ‘ ’ “ ” :- ? । / —
        text = re.sub(r'\,|\"|\'| \)|\(|\)| \{| \}| \[| \]|!|‘|’|“|”| \:-|\?|।|/|\—', ' ', text)
        words_original = text.split()

        words = []
        for word in words_original:
            if word[-1:] == '-':
                # Strip a trailing hyphen; drop a bare hyphen altogether.
                if word != '-':
                    words.append(word[:-1])
            elif word[-1:] == ':' and word not in colon_lexicon:
                # Strip a trailing colon unless the word is in the colon lexicon.
                words.append(word[:-1])
            else:
                words.append(word)

        return words
--------------------------------------------------------------------------------
/stem/itrstem.py:
--------------------------------------------------------------------------------
# encoding: utf-8
from np_lang.stem.api import Stemmer


class IterativeStemmer(Stemmer):
    def __init__(self):
        self.category_1 = self.read_file('category_1.txt')
        self.category_2 = self.read_file('category_2.txt')
        self.category_3 = self.read_file('category_3.txt')

    def read_file(self, filename):
        # Rule files are read relative to the current working directory.
        with open(filename, 'r', encoding='utf-8') as f:
            rule = f.read().splitlines()
        return rule

    def stem(self, word):
        result = self.remove_category_1(word)
        return self.remove_category_2(result)

    def remove_category_1(self, word):
        # Strip the first matching category-1 suffix, if any.
        for rule in self.category_1:
            if word.endswith(rule):
                return word[:-len(rule)]
        return word

    def remove_category_3(self, word):
        # Strip the first matching category-3 suffix and report whether one matched.
        for rule in self.category_3:
            if word.endswith(rule):
                return word[:-len(rule)], True
        return word, False

    def remove_category_2(self, word):
        if word.endswith(tuple(self.category_2)):
            if word.endswith("ँ") or word.endswith("ं"):
                if word[:-1].endswith(("ो", "ु", "उ", "े", "ोै")):
                    return self.remove_category_2(word[:-1])
                else:
                    return word
            elif word.endswith("ै"):
                if word[:-1].endswith("त्र"):
                    return self.remove_category_2(word[:-1])
                else:
                    return word
            else:
                return self.remove_category_2(word[:-1])

        result, success = self.remove_category_3(word)
        if success:
            return self.remove_category_2(result)
        return result
--------------------------------------------------------------------------------
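A short usage sketch for the tokenizer (not part of the repository). sentence_tokenize splits after the danda (।), question mark, or exclamation mark when followed by a space; word_tokenize strips punctuation and trailing colons outside the colon lexicon. The IterativeStemmer above expects category_1.txt, category_2.txt and category_3.txt to be readable from the working directory, so it is not exercised here.

from np_lang.tokenize.tokenizer import Tokenizer

tokenizer = Tokenizer()

print(tokenizer.sentence_tokenize('त्यो घर रातो छ। यो निलो कलम हो?'))
# -> ['त्यो घर रातो छ।', 'यो निलो कलम हो?']

print(tokenizer.word_tokenize('त्यो घर रातो छ।'))
# -> ['त्यो', 'घर', 'रातो', 'छ']

--------------------------------------------------------------------------------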
/vectorizer.py:
--------------------------------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from np_lang.stem.itrstem import IterativeStemmer
from np_lang.tokenize.tokenizer import Tokenizer
from np_lang.rmvstopwords import StopWordRemover

"""
>>> from np_lang.vectorizer import VectorSpaceModel
>>> d1 = "त्यो घर रातो छ"
>>> d2 = "यो निलो कलम हो"
>>> d3 = "भाईको घर हो"
>>> documents = [d1, d2, d3]

>>> vector_space_model = VectorSpaceModel(documents)
>>> vector_space_model.compute_tf_idf()

>>> print(vector_space_model.tf)
  (0, 4)    1
  (0, 1)    1
  (1, 0)    1
  (1, 2)    1
  (2, 3)    1
  (2, 1)    1

>>> print(vector_space_model.tf_idf)
  (0, 1)    0.605348508106
  (0, 4)    0.795960541568
  (1, 2)    0.707106781187
  (1, 0)    0.707106781187
  (2, 1)    0.605348508106
  (2, 3)    0.795960541568

>>> print(vector_space_model.vocabulary())
{'निलो': 2, 'कलम': 0, 'रातो': 4, 'घर': 1, 'भाई': 3}

>>> print(vector_space_model.tf_matrix())
[[0 1 0 0 1]
 [1 0 1 0 0]
 [0 1 0 1 0]]

>>> print(vector_space_model.idf_matrix())
[ 1.69314718  1.28768207  1.69314718  1.69314718  1.69314718]

>>> print(vector_space_model.tf_idf_matrix())
[[ 0.          0.60534851  0.          0.          0.79596054]
 [ 0.70710678  0.          0.70710678  0.          0.        ]
 [ 0.          0.60534851  0.          0.79596054  0.        ]]

>>> print(vector_space_model.document_similarity())
[[ 1.          0.          0.36644682]
 [ 0.          1.          0.        ]
 [ 0.36644682  0.          1.        ]]
"""


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer that stems each token produced by the base analyzer."""

    def build_analyzer(self):
        stemmer = IterativeStemmer()
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))


class VectorSpaceModel:
    def __init__(self, documents):
        self.documents = documents
        self.vectorizer = StemmedCountVectorizer()
        self.tf = None
        self.tf_idf = None
        self.__tfidf = None

    def __repr__(self):
        return ""

    def __compute_tf(self):
        if not self.documents:
            raise Exception("Documents cannot be empty")
        self.vectorizer = StemmedCountVectorizer(stop_words=StopWordRemover().get_stopwords(),
                                                 tokenizer=lambda text: Tokenizer().word_tokenize(text=text),
                                                 analyzer='word')
        self.tf = self.vectorizer.fit_transform(self.documents)

    def compute_tf_idf(self):
        self.__compute_tf()
        self.__tfidf = TfidfTransformer(norm="l2")
        self.__tfidf.fit(self.tf)
        self.tf_idf = self.__tfidf.transform(self.tf)

    def vocabulary(self):
        try:
            return self.vectorizer.vocabulary_
        except Exception:
            return {}

    def tf_matrix(self):
        if self.tf is not None:
            return self.tf.todense()
        return []

    def idf_matrix(self):
        if self.__tfidf is not None:
            return self.__tfidf.idf_
        return []

    def tf_idf_matrix(self):
        if self.tf_idf is not None:
            return self.tf_idf.todense()
        return []

    def document_similarity(self):
        if self.tf_idf is not None:
            return cosine_similarity(self.tf_idf)
        raise Exception("Compute tf idf first")
--------------------------------------------------------------------------------
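The matrix returned by document_similarity() can be queried directly for the closest document. The sketch below is not part of np_lang and assumes the stopword and stemmer rule files the modules read (stopwords.txt, category_1.txt, category_2.txt, category_3.txt) are available in the working directory; most_similar is a hypothetical helper.

import numpy as np

from np_lang.vectorizer import VectorSpaceModel


def most_similar(model, index):
    # Hypothetical helper: index of the document closest to documents[index].
    sims = model.document_similarity()[index].copy()
    sims[index] = -1.0  # ignore the document's similarity to itself
    return int(np.argmax(sims))


documents = ['त्यो घर रातो छ', 'यो निलो कलम हो', 'भाईको घर हो']
model = VectorSpaceModel(documents)
model.compute_tf_idf()

print(most_similar(model, 0))  # 2 -> 'भाईको घर हो', per the docstring example above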