├── .gitignore ├── LICENSE ├── README ├── polyglot │   ├── __init__.py │   ├── config.py │   ├── convert.py │   ├── detect.py │   ├── identifier.py │   ├── models │   │   └── default │   └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | polyglot - 2 | Language identifier for multilingual documents by Marco Lui April 2013 3 | 4 | Based on research by Marco Lui and Tim Baldwin. 5 | 6 | Copyright 2013 Marco Lui . All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without modification, are 9 | permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this list of 12 | conditions and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 15 | of conditions and the following disclaimer in the documentation and/or other materials 16 | provided with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 19 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 20 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 24 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 26 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | The views and conclusions contained in the software and documentation are those of the 29 | authors and should not be interpreted as representing official policies, either expressed 30 | or implied, of the copyright holder. 31 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Polyglot is a language identifier for detecting documents that contain text 2 | written in more than one language, and for identifying the languages therein. 3 | It is an experimental project. For monolingual language detection, langid.py[1] 4 | is a proven off-the-shelf solution. 5 | 6 | The theoretical motivation behind it is described in "Automatic Detection and 7 | Language Identification of Multilingual Documents. Marco Lui, Jey Han Lau, 8 | Timothy Baldwin. TACL Vol 2 (2014)" [2].
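As a quick illustration, the core identifier can also be driven directly from
Python. A minimal sketch (the code base targets Python 2, the bundled default
model is assumed to be installed, and 'document.txt' is a placeholder path):

  from polyglot.identifier import MultiLanguageIdentifier

  # Load the bundled default model. The arguments mirror the defaults in
  # polyglot/config.py: language subset (None = all languages), number of
  # Gibbs sampling iterations, maximum languages per document, acceptance
  # threshold, and an optional prior over the feature set.
  identifier = MultiLanguageIdentifier.default(None, 3, 5, 0.25, None)

  # identify() returns a dict mapping language codes to the estimated
  # proportion of the document they account for, e.g. {'en': 0.7, 'de': 0.3}.
  print(identifier.identify(open('document.txt').read()))

The command-line driver in ./polyglot/detect.py wraps this same interface.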
9 | 10 | To re-train polyglot on custom data, use the training tools for langid.py [1] 11 | to build a model, and convert it to polyglot's format using the script in 12 | ./polyglot/convert.py 13 | 14 | Marco Lui , 15 | November 2013 16 | 17 | [1] https://github.com/saffsd/langid.py 18 | [2] https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/view/86 19 | -------------------------------------------------------------------------------- /polyglot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saffsd/polyglot/afef05ec2f0eb5fb17c5b16cd07af9d24595ab69/polyglot/__init__.py -------------------------------------------------------------------------------- /polyglot/config.py: -------------------------------------------------------------------------------- 1 | N_ITERS = 3 2 | MAX_LANG = 5 3 | THRESHOLD = 0.25 4 | -------------------------------------------------------------------------------- /polyglot/convert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert a langid.py model to one suitable for use in polyglot. 3 | The main issue is that we need to renormalize P(t|C) as it is stored as 4 | log-prob in langid.py. 5 | 6 | Marco Lui, March 2013 7 | """ 8 | import argparse, os 9 | import numpy as np 10 | import bz2, base64 11 | import logging 12 | 13 | from cPickle import loads, dumps 14 | 15 | from identifier import MultiLanguageIdentifier 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | def read_nb_model(path): 20 | logger.info("reading model from {0}".format(path)) 21 | 22 | if os.path.isdir(path): 23 | path = os.path.join(path, 'model') 24 | 25 | with open(path) as f: 26 | model = loads(bz2.decompress(base64.b64decode(f.read()))) 27 | nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model 28 | nb_numfeats = len(nb_ptc) / len(nb_pc) 29 | nb_ptc = np.array(nb_ptc).reshape(len(nb_ptc)/len(nb_pc), len(nb_pc)) 30 | logger.debug("ptc shape: {0}".format(nb_ptc.shape)) 31 | 32 | # Normalize to 1 on the term axis 33 | for i in range(nb_ptc.shape[1]): 34 | logger.debug("normalizing column {0} of {1}".format(i+1, nb_ptc.shape[1])) 35 | nb_ptc[:,i] = (1/np.exp(nb_ptc[:,i][None,:] - nb_ptc[:,i][:,None]).sum(1)) 36 | 37 | return (nb_classes, nb_ptc, tk_nextmove, tk_output) 38 | 39 | def write_polyglot_model(model, path): 40 | logger.info("writing converted model to {0}".format(path)) 41 | # TODO: Validate model 42 | # nb_classes, nb_ptc, tk_nextmove, tk_output = model 43 | output = base64.b64encode(bz2.compress(dumps(model))) 44 | with open(path, 'w') as f: 45 | f.write(output) 46 | logger.info("wrote {0} bytes".format(len(output))) 47 | 48 | def read_polyglot_model(path): 49 | with open(path) as f: 50 | return MultiLanguageIdentifier.unpack_model(f.read()) 51 | 52 | 53 | def main(): 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('--verbose','-v',action="store_true") 56 | parser.add_argument('model', metavar="MODEL_DIR", help="path to langid.py training model dir") 57 | parser.add_argument('output', metavar="OUTPUT", help="path to write the converted model to") 58 | args = parser.parse_args() 59 | 60 | logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING) 61 | 62 | model = read_nb_model(args.model) 63 | write_polyglot_model(model, args.output) 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /polyglot/detect.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Multi-langid based on a pre-trained P(w|t) and a Gibbs sampler for 3 | estimating P(t|d). 4 | 5 | Marco Lui, March 2013 6 | """ 7 | import argparse, sys 8 | import multiprocessing as mp 9 | import numpy as np 10 | import logging 11 | import json, csv 12 | import tarfile 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | from identifier import MultiLanguageIdentifier 17 | from utils import Timer, MapPool 18 | import config 19 | 20 | def setup_identify(model_path, langs=None, n_iters=None, max_lang=None, thresh=None, prior=None): 21 | global _identifier 22 | 23 | n_iters = n_iters if n_iters is not None else config.N_ITERS 24 | max_lang = max_lang if max_lang is not None else config.MAX_LANG 25 | thresh = thresh if thresh is not None else config.THRESHOLD 26 | _identifier = MultiLanguageIdentifier.from_modelpath(model_path, langs, n_iters, max_lang, thresh, prior) 27 | 28 | def setup_default_identify(langs=None, n_iters = None, max_lang=None, thresh=None, prior=None): 29 | global _identifier 30 | 31 | n_iters = n_iters if n_iters is not None else config.N_ITERS 32 | max_lang = max_lang if max_lang is not None else config.MAX_LANG 33 | thresh = thresh if thresh is not None else config.THRESHOLD 34 | _identifier = MultiLanguageIdentifier.default(langs, n_iters, max_lang, thresh, prior) 35 | 36 | 37 | def explain(doc): 38 | """ 39 | Explain the document as a distribution of tokens over the full language set. 40 | """ 41 | global _identifier 42 | name, text = doc 43 | 44 | fv = _identifier.instance2fv(text) 45 | if fv.sum() == 0: 46 | # empty document 47 | return {'path':name, 'langs':{}} 48 | retval = _identifier.explain(fv) 49 | 50 | # normalize 51 | retval = retval.astype(float) / retval.sum() 52 | lang_preds = dict((k,v) for k,v in zip(_identifier.nb_classes, retval) if v > 0 ) 53 | return {'path':name, 'langs':lang_preds} 54 | 55 | def identify(doc): 56 | global _identifier 57 | name, text = doc 58 | 59 | try: 60 | pred = _identifier.identify(text) 61 | except ValueError: 62 | pred = {} 63 | 64 | return {'path':name, 'langs':pred} 65 | 66 | def tokenize(doc): 67 | name, text = doc 68 | global _identifier 69 | return _identifier.instance2fv(text) 70 | 71 | def main(): 72 | # TODO: output parameters used 73 | # TODO: output distribution 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument('--iters','-i',type=int, metavar='N', default=config.N_ITERS, 76 | help="perform N iterations of Gibbs sampling (default: {})".format(config.N_ITERS) ) 77 | parser.add_argument('--jobs','-j',type=int, metavar='N', help="use N processes", default=mp.cpu_count()) 78 | parser.add_argument('--output','-o', help="output file (json format)", type=argparse.FileType('w'), default=sys.stdout) 79 | parser.add_argument('--max_lang', type=int, default=config.MAX_LANG, 80 | help="maximum number of languages to consider per-document (default: {})".format(config.MAX_LANG)) 81 | parser.add_argument('--thresh', '-t', type=float, default=config.THRESHOLD, 82 | help="threshold for including a language (default: {})".format(config.THRESHOLD)) 83 | parser.add_argument('--model', '-m', metavar="MODEL", help="path to model") 84 | parser.add_argument('--verbose', '-v', action='store_true', help="verbose output") 85 | parser.add_argument('--explain', '-e', action='store_true', help="only explain documents as a breakdown over the full language set") 86 | parser.add_argument('-l', '--langs', dest='langs', help='comma-separated 
set of target ISO639 language codes (e.g en,de)') 87 | parser.add_argument('--prior', '-p', nargs="?", const=True, help="use prior from file PRIOR (computed if PRIOR is not specified)") 88 | 89 | docgroup = parser.add_mutually_exclusive_group(required=True) 90 | docgroup.add_argument('--tarfile', help="process documents in a tarfile") 91 | docgroup.add_argument('--bootcat', help="process a bootcat corpus") 92 | docgroup.add_argument('--docs', metavar='FILE', help='files to process (read from stdin if blank)', nargs='*') 93 | 94 | args = parser.parse_args() 95 | 96 | logging.basicConfig(level=logging.DEBUG if args.verbose else logging.WARNING) 97 | 98 | 99 | if args.langs: 100 | langs = args.langs.strip().split(',') 101 | logger.debug( "restricting language set to: {0}".format(langs)) 102 | else: 103 | langs = None 104 | 105 | if args.model: 106 | initalizer = setup_identify 107 | initargs = (args.model, langs, args.iters, args.max_lang, args.thresh) 108 | avail_langs = set(MultiLanguageIdentifier.list_langs(args.model)) 109 | else: 110 | initalizer = setup_default_identify 111 | initargs = (langs, args.iters, args.max_lang, args.thresh) 112 | avail_langs = set(MultiLanguageIdentifier.list_langs()) 113 | 114 | if langs is not None: 115 | for l in langs: 116 | if l not in avail_langs: 117 | parser.error("language {} not in the available set".format(l)) 118 | 119 | #if args.docs and args.tarfile: 120 | # parser.error("no files should be specified if tarfile is used") 121 | 122 | if args.docs: 123 | # A list of paths was provided with the invocation 124 | doclist = args.docs 125 | num_docs = len(doclist) 126 | docs = ((d, open(d).read()) for d in doclist) 127 | chunksize = max(1,num_docs / (args.jobs + 4)) 128 | if num_docs < args.jobs: 129 | args.jobs = num_docs 130 | logger.info( "processing {0} docs".format(num_docs) ) 131 | elif args.tarfile: 132 | # A tarfile is to be processed 133 | archive = tarfile.open(args.tarfile) 134 | docs = ((m.name, archive.extractfile(m).read()) for m in archive if m.isfile()) 135 | chunksize = 20 136 | logger.info( "processing a tarfile" ) 137 | elif args.bootcat: 138 | # Process a bootcat corpus 139 | def bootcat_iter(path): 140 | with open(path) as in_f: 141 | for row in in_f: 142 | if row.startswith('CURRENT URL'): 143 | docname = row.split()[-1] 144 | else: 145 | yield (docname, row) 146 | docs = bootcat_iter(args.bootcat) 147 | chunksize = 20 148 | logger.info( "processing a bootcat corpus" ) 149 | else: 150 | # A list of files is read from stdin if filenames are not provided 151 | doclist = map(str.strip, sys.stdin) 152 | num_docs = len(doclist) 153 | docs = ((d, open(d).read()) for d in doclist) 154 | chunksize = max(1,num_docs / (args.jobs + 4)) 155 | if num_docs < args.jobs: 156 | args.jobs = num_docs 157 | logger.info( "processing {0} docs".format(num_docs) ) 158 | 159 | if args.prior: 160 | if args.prior is True: 161 | logger.debug("using average document as prior") 162 | with MapPool(args.jobs, initalizer, initargs, chunksize=chunksize) as p: 163 | fvs = [ v.astype(float) / v.sum() for v in p(tokenize, docs)] 164 | prior = np.sum(fvs, axis=0) 165 | else: 166 | logger.debug("loading prior from: {0}".format(args.prior)) 167 | with open(args.prior) as f: 168 | reader = csv.reader(f) 169 | prior = map(float, reader.next()) 170 | 171 | initargs += (prior,) 172 | 173 | # Determine the type of output 174 | if args.explain: 175 | process = explain 176 | else: 177 | process = identify 178 | 179 | 180 | # Process the documents specified 181 | doc_count = 0 
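# Note: the documents are streamed through the pool created by MapPool below;
# with more than one job the results come back unordered (imap_unordered), and
# each result is written out as a single JSON object per line (JSON lines).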
182 | with MapPool(args.jobs, initalizer, initargs, chunksize=chunksize) as p, Timer() as t: 183 | for retval in p(process, docs): 184 | json.dump(retval, args.output) 185 | args.output.write('\n') 186 | doc_count += 1 187 | logger.info("processed {0} docs in {1:.2f}s ({2:.2f} r/s)".format(doc_count, t.elapsed, t.rate(doc_count) )) 188 | 189 | if __name__ == "__main__": 190 | main() 191 | -------------------------------------------------------------------------------- /polyglot/identifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of the core multi-language identifier class. 3 | 4 | Marco Lui, April 2013 5 | """ 6 | import bz2, base64 7 | import numpy as np 8 | import os 9 | import pkgutil 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | from cPickle import loads 15 | from collections import defaultdict 16 | 17 | import config 18 | from itertools import compress 19 | 20 | class MultiLanguageIdentifier(object): 21 | """ 22 | LD feature space tokenizer based on a stripped-down version of 23 | the LanguageIdentifier class of langid.py 24 | """ 25 | 26 | @classmethod 27 | def list_langs(cls, model=None): 28 | """ 29 | List the languages supported by a pre-trained model. 30 | 31 | @param model model string or path to file containing model string 32 | @returns list of languages supported 33 | """ 34 | if model is None: 35 | langs = cls.unpack_model(pkgutil.get_data('polyglot','models/default'))[0] 36 | elif os.path.exists(model): 37 | with open(model) as f: 38 | langs = cls.unpack_model(f.read())[0] 39 | else: 40 | langs = cls.unpack_model(model)[0] 41 | 42 | return langs 43 | 44 | 45 | @classmethod 46 | def unpack_model(cls, string): 47 | return loads(bz2.decompress(base64.b64decode(string))) 48 | 49 | @classmethod 50 | def default(cls, *args, **kwargs): 51 | nb_classes, nb_ptc, tk_nextmove, tk_output = cls.unpack_model(pkgutil.get_data('polyglot','models/default')) 52 | 53 | return cls( nb_classes, nb_ptc, tk_nextmove, tk_output, *args, **kwargs) 54 | 55 | @classmethod 56 | def from_modelstring(cls, string, *args, **kwargs): 57 | nb_classes, nb_ptc, tk_nextmove, tk_output = cls.unpack_model(string) 58 | 59 | return cls( nb_classes, nb_ptc, tk_nextmove, tk_output, *args, **kwargs) 60 | 61 | @classmethod 62 | def from_modelpath(cls, path, *args, **kwargs): 63 | with open(path) as f: 64 | return cls.from_modelstring(f.read(), *args, **kwargs) 65 | 66 | def __init__(self, nb_classes, nb_ptc, tk_nextmove, tk_output, langs, n_iters, max_lang, thresh, prior): 67 | self.tk_nextmove = tk_nextmove 68 | self.tk_output = tk_output 69 | self.n_iters = n_iters 70 | self.max_lang = max_lang 71 | self.thresh = thresh 72 | 73 | 74 | # Class 0 is used for the prior over the feature set 75 | if langs is None: 76 | self.nb_classes = ('PRIOR',) + tuple(nb_classes) 77 | else: 78 | self.nb_classes = ('PRIOR',) + tuple(langs) 79 | 80 | logger.debug("nb_classes: {}".format(self.nb_classes)) 81 | 82 | # Prepare prior and attach it to nb_ptc 83 | if prior is None: 84 | prior = np.ones(nb_ptc.shape[0]) 85 | 86 | if len(prior) != nb_ptc.shape[0]: 87 | raise ValueError("length of prior does not match number of terms in ptc") 88 | prior = np.array(prior, dtype=float) / np.sum(prior) # Normalize to sum 1 89 | 90 | if langs is None: 91 | self.nb_ptc = np.hstack((prior[:,None], nb_ptc)) 92 | else: 93 | self.nb_ptc = np.hstack((prior[:,None], nb_ptc[:,[nb_classes.index(l) for l in langs]])) 94 | 95 | logger.debug("initialized a 
MultiLanguageIdentifier instance") 96 | logger.debug("n_iters: {0}".format(self.n_iters)) 97 | logger.debug("max_lang: {0}".format(self.max_lang)) 98 | logger.debug("thresh: {0}".format(self.thresh)) 99 | logger.debug("ptc shape: {0}".format(self.nb_ptc.shape)) 100 | 101 | def instance2fv(self, text): 102 | """ 103 | Map an instance into the feature space of the trained model. 104 | """ 105 | if isinstance(text, unicode): 106 | text = text.encode('utf8') 107 | 108 | arr = np.zeros((self.nb_ptc.shape[0],), dtype='uint32') 109 | 110 | # Convert the text to a sequence of ascii values 111 | ords = map(ord, text) 112 | 113 | # Count the number of times we enter each state 114 | state = 0 115 | statecount = defaultdict(int) 116 | for letter in ords: 117 | state = self.tk_nextmove[(state << 8) + letter] 118 | statecount[state] += 1 119 | 120 | # Update all the productions corresponding to the state 121 | for state in statecount: 122 | for index in self.tk_output.get(state, []): 123 | arr[index] += statecount[state] 124 | 125 | return arr 126 | 127 | def explain(self, fv, iters = None, alpha = 0., subset = None): 128 | """ 129 | Explain a feature vector in terms of a set of classes. 130 | Uses a Gibbs sampler to compute the most likely class distribution 131 | over the specified class set to have generated this feature vector. 132 | 133 | @param subset specifies the subset of classes to use (defaults to all) 134 | @returns counts of how many documents have been allocated to each topic 135 | """ 136 | 137 | if iters is None: 138 | iters = self.n_iters 139 | 140 | if subset is None: 141 | ptc = self.nb_ptc 142 | else: 143 | ptc = self.nb_ptc[:,subset] 144 | 145 | # Initially random allocation of terms to topics 146 | K = ptc.shape[1] # number of topics (languages) 147 | z_n = np.random.randint(0, K, fv.sum()) 148 | n_m_z = np.bincount(z_n, minlength=K) + alpha 149 | 150 | t_nz = list(compress(enumerate(fv), fv>0)) 151 | 152 | for i in range(iters): 153 | # We have a collased representation of the document, where we 154 | # only keep the counts of terms and not their relative ordering 155 | # (which the model assumes is fully exchangeable anyway) 156 | n = 0 # keep track of the feature index 157 | for t, n_t in t_nz: 158 | for _ in xrange(n_t): 159 | # discount for n-th word t with topic z 160 | z = z_n[n] 161 | n_m_z[z] -= 1 162 | 163 | # sampling topic new_z for t 164 | dist = np.cumsum(ptc[t] * n_m_z) 165 | samp = np.random.random() * dist[-1] 166 | new_z = np.searchsorted(dist,samp) 167 | 168 | # set z the new topic and increment counters 169 | z_n[n] = new_z 170 | n_m_z[new_z] += 1 171 | 172 | n += 1 173 | 174 | # n_m_z must be projected back into the full class space 175 | retval = np.zeros((self.nb_ptc.shape[1],), dtype=int) 176 | retval[subset] = (n_m_z - alpha).astype(int) 177 | 178 | return retval 179 | 180 | 181 | def logprob(self, fv, classes, iters=None, lam_c=None): 182 | """ 183 | Compute the log-probability under our p(t|c) that the instance 184 | is composed by the given set of classes. 
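Concretely, this evaluates sum_t fv[t] * log( sum_{c in classes} lam_c[c] * P(t|c) ),
i.e. the log-likelihood of the observed token counts under a mixture of the
per-class term distributions, where the mixture weights lam_c are estimated
via explain() when not supplied.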
185 | """ 186 | if lam_c is None: 187 | # most likely distribution assuming the set of classes 188 | lam_c = self.explain(fv, iters, subset=classes) 189 | lam_c = lam_c.astype(float) / lam_c.sum() # norm to 1 190 | 191 | nz_t = fv > 0 # non-zero features 192 | prod = lam_c[classes] * self.nb_ptc[:,classes][nz_t] 193 | acc = np.sum(fv[nz_t] * np.log(np.sum(prod, axis=1))) 194 | return acc 195 | 196 | def identify(self, text): 197 | # tokenize document into a distribution over terms 198 | fv = self.instance2fv(text) 199 | doclen = np.sum(fv) 200 | if doclen == 0: 201 | # no LD tokens -> no languages present 202 | return {} 203 | 204 | dist = self.explain(fv) 205 | logger.debug("prior: {0} / {1} ({2:.1f}%)".format(dist[0], dist.sum(), dist[0]*100. / dist.sum())) 206 | cl_order = np.arange(len(dist))[dist.argsort()][::-1] 207 | 208 | # initially explain the document only in terms of the prior 209 | cl_set = [0] 210 | cl_dist = np.array([1.]) 211 | lp = self.logprob(fv, cl_set) 212 | 213 | for new_cl in [c for c in cl_order if c != 0 ][:self.max_lang]: 214 | cl_set_n = cl_set + [new_cl] 215 | # We obtain lam_c distinct from logprob as we will need it if we decide to keep. 216 | lam_c = self.explain(fv, subset=cl_set_n) 217 | lam_c = lam_c.astype(float) / lam_c.sum() # norm to 1 218 | est_lp = self.logprob(fv, cl_set_n, lam_c=lam_c) 219 | improve = (est_lp - lp) / doclen 220 | if improve > self.thresh: 221 | logger.debug(" {0} ACCEPT (improves by {1:.3f})".format(self.nb_classes[new_cl], improve)) 222 | lp = est_lp 223 | cl_set = cl_set_n 224 | cl_dist = lam_c 225 | else: 226 | logger.debug(" {0} REJECT (improves by {1:.3f})".format(self.nb_classes[new_cl], improve)) 227 | 228 | # Re-normalize the mass over the languages to 1 - ignoring the class0 mass. 229 | cl_dist[1:] /= cl_dist[1:].sum() 230 | 231 | retval = { self.nb_classes[c]:cl_dist[c] for c in cl_set[1:]} 232 | return retval 233 | -------------------------------------------------------------------------------- /polyglot/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous utilities. 
3 | 4 | Marco Lui , April 2013 5 | """ 6 | 7 | from contextlib import contextmanager, closing 8 | import multiprocessing as mp 9 | from itertools import imap 10 | 11 | @contextmanager 12 | def MapPool(processes=None, initializer=None, initargs=tuple(), maxtasksperchild=None, chunksize=1): 13 | """ 14 | Contextmanager to express the common pattern of not using multiprocessing if 15 | only 1 job is allocated (for example for debugging reasons) 16 | """ 17 | if processes is None: 18 | processes = mp.cpu_count() + 4 19 | 20 | if processes > 1: 21 | with closing( mp.Pool(processes, initializer, initargs, maxtasksperchild)) as pool: 22 | f = lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize) 23 | yield f 24 | else: 25 | if initializer is not None: 26 | initializer(*initargs) 27 | f = imap 28 | yield f 29 | 30 | if processes > 1: 31 | pool.join() 32 | from datetime import timedelta  # used by Timer.ETA below 33 | from timeit import default_timer 34 | class Timer(object): 35 | def __init__(self): 36 | self.timer = default_timer 37 | self.start = None 38 | self.end = None 39 | 40 | def __enter__(self): 41 | self.start = self.timer() 42 | self.end = None 43 | return self 44 | 45 | def __exit__(self, *args): 46 | self.end = self.timer() 47 | 48 | @property 49 | def elapsed(self): 50 | now = self.timer() 51 | if self.end is not None: 52 | return self.end - self.start 53 | else: 54 | return now - self.start 55 | 56 | def rate(self, count): 57 | now = self.timer() 58 | if self.start is None: 59 | raise ValueError("Not yet started") 60 | 61 | return count / (now - self.start) 62 | 63 | def ETA(self, count, target): 64 | """ 65 | Linearly estimate the ETA to reach target based on the current rate. 66 | """ 67 | rate = self.rate(count) 68 | time_left = timedelta(seconds=int((target-count) / rate)) 69 | return time_left 70 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import sys, os 3 | 4 | version = '0.1' 5 | 6 | setup(name='polyglot', 7 | version=version, 8 | description="polyglot is a tool for detecting multilingual documents and identifying the languages therein.", 9 | long_description= open("README").read(), 10 | classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers 11 | keywords=['language detection', 'multilingual documents', 'text classification'], 12 | author='Marco Lui', 13 | author_email='saffsd@gmail.com', 14 | url='https://github.com/saffsd/polyglot', 15 | license='BSD', 16 | packages=['polyglot'], 17 | package_data={'polyglot':['models/*']}, 18 | include_package_data=True, 19 | zip_safe=False, 20 | install_requires=[ 21 | # -*- Extra requirements: -*- 22 | 'numpy', 23 | ], 24 | entry_points= { 25 | 'console_scripts': [ 26 | 'polyglot = polyglot.detect:main', 27 | ], 28 | }, 29 | ) 30 | --------------------------------------------------------------------------------
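For reference, setup.py installs a polyglot console script that points at
polyglot.detect:main, and the re-training workflow described in the README
ends with polyglot/convert.py. Some illustrative invocations (a sketch only:
the file and directory names below are placeholders, while the flags are those
defined in polyglot/detect.py and polyglot/convert.py):

  # Identify the language mix of two documents, restricted to English, German
  # and French, writing one JSON record per document to results.json:
  polyglot --docs doc1.txt doc2.txt -l en,de,fr -o results.json

  # Process every file inside a tar archive using the bundled default model:
  polyglot --tarfile corpus.tar -o results.json

  # Convert a model built with the langid.py training tools to polyglot's
  # format, then use it via --model:
  python polyglot/convert.py path/to/langid_model_dir custom.model
  polyglot -m custom.model --docs doc1.txt doc2.txt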