├── .gitignore ├── LICENSE ├── README ├── polyglot │   ├── __init__.py │   ├── config.py │   ├── convert.py │   ├── detect.py │   ├── identifier.py │   ├── models │   │   └── default │   └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | polyglot - 2 | Language identifier for multilingual documents by Marco Lui April 2013 3 | 4 | Based on research by Marco Lui and Tim Baldwin. 5 | 6 | Copyright 2013 Marco Lui . All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without modification, are 9 | permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this list of 12 | conditions and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 15 | of conditions and the following disclaimer in the documentation and/or other materials 16 | provided with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 19 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 20 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 24 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 26 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | The views and conclusions contained in the software and documentation are those of the 29 | authors and should not be interpreted as representing official policies, either expressed 30 | or implied, of the copyright holder. 31 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Polyglot is a language identifier for detecting documents that contain text 2 | written in more than one language, and for identifying the languages therein. 3 | It is an experimental project. For monolingual language detection, langid.py[1] 4 | is a proven off-the-shelf solution. 5 | 6 | The theoretical motivation behind it is described in "Automatic Detection and 7 | Language Identification of Multilingual Documents. Marco Lui, Jey Han Lau, 8 | Timothy Baldwin. TACL Vol 2 (2014)" [2].
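As a quick illustration, the core identifier can also be driven directly from
Python. A minimal sketch (the code base targets Python 2, the bundled default
model is assumed to be installed, and 'document.txt' is a placeholder path):

  from polyglot.identifier import MultiLanguageIdentifier

  # Load the bundled default model. The arguments mirror the defaults in
  # polyglot/config.py: language subset (None = all languages), number of
  # Gibbs sampling iterations, maximum languages per document, acceptance
  # threshold, and an optional prior over the feature set.
  identifier = MultiLanguageIdentifier.default(None, 3, 5, 0.25, None)

  # identify() returns a dict mapping language codes to the estimated
  # proportion of the document they account for, e.g. {'en': 0.7, 'de': 0.3}.
  print(identifier.identify(open('document.txt').read()))

The command-line driver in ./polyglot/detect.py wraps this same interface.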
9 | 10 | To re-train polyglot on custom data, use the training tools for langid.py [1] 11 | to build a model, and convert it to polyglot's format using the script in 12 | ./polyglot/convert.py 13 | 14 | Marco Lui , 15 | November 2013 16 | 17 | [1] https://github.com/saffsd/langid.py 18 | [2] https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/view/86 19 | -------------------------------------------------------------------------------- /polyglot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saffsd/polyglot/afef05ec2f0eb5fb17c5b16cd07af9d24595ab69/polyglot/__init__.py -------------------------------------------------------------------------------- /polyglot/config.py: -------------------------------------------------------------------------------- 1 | N_ITERS = 3 2 | MAX_LANG = 5 3 | THRESHOLD = 0.25 4 | -------------------------------------------------------------------------------- /polyglot/convert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert a langid.py model to one suitable for use in polyglot. 3 | The main issue is that we need to renormalize P(t|C) as it is stored as 4 | log-prob in langid.py. 5 | 6 | Marco Lui, March 2013 7 | """ 8 | import argparse, os 9 | import numpy as np 10 | import bz2, base64 11 | import logging 12 | 13 | from cPickle import loads, dumps 14 | 15 | from identifier import MultiLanguageIdentifier 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | def read_nb_model(path): 20 | logger.info("reading model from {0}".format(path)) 21 | 22 | if os.path.isdir(path): 23 | path = os.path.join(path, 'model') 24 | 25 | with open(path) as f: 26 | model = loads(bz2.decompress(base64.b64decode(f.read()))) 27 | nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model 28 | nb_numfeats = len(nb_ptc) / len(nb_pc) 29 | nb_ptc = np.array(nb_ptc).reshape(len(nb_ptc)/len(nb_pc), len(nb_pc)) 30 | logger.debug("ptc shape: {0}".format(nb_ptc.shape)) 31 | 32 | # Normalize to 1 on the term axis 33 | for i in range(nb_ptc.shape[1]): 34 | logger.debug("normalizing column {0} of {1}".format(i+1, nb_ptc.shape[1])) 35 | nb_ptc[:,i] = (1/np.exp(nb_ptc[:,i][None,:] - nb_ptc[:,i][:,None]).sum(1)) 36 | 37 | return (nb_classes, nb_ptc, tk_nextmove, tk_output) 38 | 39 | def write_polyglot_model(model, path): 40 | logger.info("writing converted model to {0}".format(path)) 41 | # TODO: Validate model 42 | # nb_classes, nb_ptc, tk_nextmove, tk_output = model 43 | output = base64.b64encode(bz2.compress(dumps(model))) 44 | with open(path, 'w') as f: 45 | f.write(output) 46 | logger.info("wrote {0} bytes".format(len(output))) 47 | 48 | def read_polyglot_model(path): 49 | with open(path) as f: 50 | return MultiLanguageIdentifier.unpack_model(f.read()) 51 | 52 | 53 | def main(): 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('--verbose','-v',action="store_true") 56 | parser.add_argument('model', metavar="MODEL_DIR", help="path to langid.py training model dir") 57 | parser.add_argument('output', metavar="OUTPUT", help="path to write the converted model to") 58 | args = parser.parse_args() 59 | 60 | logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING) 61 | 62 | model = read_nb_model(args.model) 63 | write_polyglot_model(model, args.output) 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /polyglot/detect.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Multi-langid based on a pre-trained P(w|t) and a Gibbs sampler for 3 | estimating P(t|d). 4 | 5 | Marco Lui, March 2013 6 | """ 7 | import argparse, sys 8 | import multiprocessing as mp 9 | import numpy as np 10 | import logging 11 | import json, csv 12 | import tarfile 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | from identifier import MultiLanguageIdentifier 17 | from utils import Timer, MapPool 18 | import config 19 | 20 | def setup_identify(model_path, langs=None, n_iters=None, max_lang=None, thresh=None, prior=None): 21 | global _identifier 22 | 23 | n_iters = n_iters if n_iters is not None else config.N_ITERS 24 | max_lang = max_lang if max_lang is not None else config.MAX_LANG 25 | thresh = thresh if thresh is not None else config.THRESHOLD 26 | _identifier = MultiLanguageIdentifier.from_modelpath(model_path, langs, n_iters, max_lang, thresh, prior) 27 | 28 | def setup_default_identify(langs=None, n_iters = None, max_lang=None, thresh=None, prior=None): 29 | global _identifier 30 | 31 | n_iters = n_iters if n_iters is not None else config.N_ITERS 32 | max_lang = max_lang if max_lang is not None else config.MAX_LANG 33 | thresh = thresh if thresh is not None else config.THRESHOLD 34 | _identifier = MultiLanguageIdentifier.default(langs, n_iters, max_lang, thresh, prior) 35 | 36 | 37 | def explain(doc): 38 | """ 39 | Explain the document as a distribution of tokens over the full language set. 40 | """ 41 | global _identifier 42 | name, text = doc 43 | 44 | fv = _identifier.instance2fv(text) 45 | if fv.sum() == 0: 46 | # empty document 47 | return {'path':name, 'langs':{}} 48 | retval = _identifier.explain(fv) 49 | 50 | # normalize 51 | retval = retval.astype(float) / retval.sum() 52 | lang_preds = dict((k,v) for k,v in zip(_identifier.nb_classes, retval) if v > 0 ) 53 | return {'path':name, 'langs':lang_preds} 54 | 55 | def identify(doc): 56 | global _identifier 57 | name, text = doc 58 | 59 | try: 60 | pred = _identifier.identify(text) 61 | except ValueError: 62 | pred = {} 63 | 64 | return {'path':name, 'langs':pred} 65 | 66 | def tokenize(doc): 67 | name, text = doc 68 | global _identifier 69 | return _identifier.instance2fv(text) 70 | 71 | def main(): 72 | # TODO: output parameters used 73 | # TODO: output distribution 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument('--iters','-i',type=int, metavar='N', default=config.N_ITERS, 76 | help="perform N iterations of Gibbs sampling (default: {})".format(config.N_ITERS) ) 77 | parser.add_argument('--jobs','-j',type=int, metavar='N', help="use N processes", default=mp.cpu_count()) 78 | parser.add_argument('--output','-o', help="output file (json format)", type=argparse.FileType('w'), default=sys.stdout) 79 | parser.add_argument('--max_lang', type=int, default=config.MAX_LANG, 80 | help="maximum number of languages to consider per-document (default: {})".format(config.MAX_LANG)) 81 | parser.add_argument('--thresh', '-t', type=float, default=config.THRESHOLD, 82 | help="threshold for including a language (default: {})".format(config.THRESHOLD)) 83 | parser.add_argument('--model', '-m', metavar="MODEL", help="path to model") 84 | parser.add_argument('--verbose', '-v', action='store_true', help="verbose output") 85 | parser.add_argument('--explain', '-e', action='store_true', help="only explain documents as a breakdown over the full language set") 86 | parser.add_argument('-l', '--langs', dest='langs', help='comma-separated 
set of target ISO639 language codes (e.g en,de)') 87 | parser.add_argument('--prior', '-p', nargs="?", const=True, help="use prior from file PRIOR (computed if PRIOR is not specified)") 88 | 89 | docgroup = parser.add_mutually_exclusive_group(required=True) 90 | docgroup.add_argument('--tarfile', help="process documents in a tarfile") 91 | docgroup.add_argument('--bootcat', help="process a bootcat corpus") 92 | docgroup.add_argument('--docs', metavar='FILE', help='files to process (read from stdin if blank)', nargs='*') 93 | 94 | args = parser.parse_args() 95 | 96 | logging.basicConfig(level=logging.DEBUG if args.verbose else logging.WARNING) 97 | 98 | 99 | if args.langs: 100 | langs = args.langs.strip().split(',') 101 | logger.debug( "restricting language set to: {0}".format(langs)) 102 | else: 103 | langs = None 104 | 105 | if args.model: 106 | initalizer = setup_identify 107 | initargs = (args.model, langs, args.iters, args.max_lang, args.thresh) 108 | avail_langs = set(MultiLanguageIdentifier.list_langs(args.model)) 109 | else: 110 | initalizer = setup_default_identify 111 | initargs = (langs, args.iters, args.max_lang, args.thresh) 112 | avail_langs = set(MultiLanguageIdentifier.list_langs()) 113 | 114 | if langs is not None: 115 | for l in langs: 116 | if l not in avail_langs: 117 | parser.error("language {} not in the available set".format(l)) 118 | 119 | #if args.docs and args.tarfile: 120 | # parser.error("no files should be specified if tarfile is used") 121 | 122 | if args.docs: 123 | # A list of paths was provided with the invocation 124 | doclist = args.docs 125 | num_docs = len(doclist) 126 | docs = ((d, open(d).read()) for d in doclist) 127 | chunksize = max(1,num_docs / (args.jobs + 4)) 128 | if num_docs < args.jobs: 129 | args.jobs = num_docs 130 | logger.info( "processing {0} docs".format(num_docs) ) 131 | elif args.tarfile: 132 | # A tarfile is to be processed 133 | archive = tarfile.open(args.tarfile) 134 | docs = ((m.name, archive.extractfile(m).read()) for m in archive if m.isfile()) 135 | chunksize = 20 136 | logger.info( "processing a tarfile" ) 137 | elif args.bootcat: 138 | # Process a bootcat corpus 139 | def bootcat_iter(path): 140 | with open(path) as in_f: 141 | for row in in_f: 142 | if row.startswith('CURRENT URL'): 143 | docname = row.split()[-1] 144 | else: 145 | yield (docname, row) 146 | docs = bootcat_iter(args.bootcat) 147 | chunksize = 20 148 | logger.info( "processing a bootcat corpus" ) 149 | else: 150 | # A list of files is read from stdin if filenames are not provided 151 | doclist = map(str.strip, sys.stdin) 152 | num_docs = len(doclist) 153 | docs = ((d, open(d).read()) for d in doclist) 154 | chunksize = max(1,num_docs / (args.jobs + 4)) 155 | if num_docs < args.jobs: 156 | args.jobs = num_docs 157 | logger.info( "processing {0} docs".format(num_docs) ) 158 | 159 | if args.prior: 160 | if args.prior is True: 161 | logger.debug("using average document as prior") 162 | with MapPool(args.jobs, initalizer, initargs, chunksize=chunksize) as p: 163 | fvs = [ v.astype(float) / v.sum() for v in p(tokenize, docs)] 164 | prior = np.sum(fvs, axis=0) 165 | else: 166 | logger.debug("loading prior from: {0}".format(args.prior)) 167 | with open(args.prior) as f: 168 | reader = csv.reader(f) 169 | prior = map(float, reader.next()) 170 | 171 | initargs += (prior,) 172 | 173 | # Determine the type of output 174 | if args.explain: 175 | process = explain 176 | else: 177 | process = identify 178 | 179 | 180 | # Process the documents specified 181 | doc_count = 0 
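# Note: the documents are streamed through the pool created by MapPool below;
# with more than one job the results come back unordered (imap_unordered), and
# each result is written out as a single JSON object per line (JSON lines).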
182 | with MapPool(args.jobs, initalizer, initargs, chunksize=chunksize) as p, Timer() as t: 183 | for retval in p(process, docs): 184 | json.dump(retval, args.output) 185 | args.output.write('\n') 186 | doc_count += 1 187 | logger.info("processed {0} docs in {1:.2f}s ({2:.2f} r/s)".format(doc_count, t.elapsed, t.rate(doc_count) )) 188 | 189 | if __name__ == "__main__": 190 | main() 191 | -------------------------------------------------------------------------------- /polyglot/identifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of the core multi-language identifier class. 3 | 4 | Marco Lui, April 2013 5 | """ 6 | import bz2, base64 7 | import numpy as np 8 | import os 9 | import pkgutil 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | from cPickle import loads 15 | from collections import defaultdict 16 | 17 | import config 18 | from itertools import compress 19 | 20 | class MultiLanguageIdentifier(object): 21 | """ 22 | LD feature space tokenizer based on a stripped-down version of 23 | the LanguageIdentifier class of langid.py 24 | """ 25 | 26 | @classmethod 27 | def list_langs(cls, model=None): 28 | """ 29 | List the languages supported by a pre-trained model. 30 | 31 | @param model model string or path to file containing model string 32 | @returns list of languages supported 33 | """ 34 | if model is None: 35 | langs = cls.unpack_model(pkgutil.get_data('polyglot','models/default'))[0] 36 | elif os.path.exists(model): 37 | with open(model) as f: 38 | langs = cls.unpack_model(f.read())[0] 39 | else: 40 | langs = cls.unpack_model(model)[0] 41 | 42 | return langs 43 | 44 | 45 | @classmethod 46 | def unpack_model(cls, string): 47 | return loads(bz2.decompress(base64.b64decode(string))) 48 | 49 | @classmethod 50 | def default(cls, *args, **kwargs): 51 | nb_classes, nb_ptc, tk_nextmove, tk_output = cls.unpack_model(pkgutil.get_data('polyglot','models/default')) 52 | 53 | return cls( nb_classes, nb_ptc, tk_nextmove, tk_output, *args, **kwargs) 54 | 55 | @classmethod 56 | def from_modelstring(cls, string, *args, **kwargs): 57 | nb_classes, nb_ptc, tk_nextmove, tk_output = cls.unpack_model(string) 58 | 59 | return cls( nb_classes, nb_ptc, tk_nextmove, tk_output, *args, **kwargs) 60 | 61 | @classmethod 62 | def from_modelpath(cls, path, *args, **kwargs): 63 | with open(path) as f: 64 | return cls.from_modelstring(f.read(), *args, **kwargs) 65 | 66 | def __init__(self, nb_classes, nb_ptc, tk_nextmove, tk_output, langs, n_iters, max_lang, thresh, prior): 67 | self.tk_nextmove = tk_nextmove 68 | self.tk_output = tk_output 69 | self.n_iters = n_iters 70 | self.max_lang = max_lang 71 | self.thresh = thresh 72 | 73 | 74 | # Class 0 is used for the prior over the feature set 75 | if langs is None: 76 | self.nb_classes = ('PRIOR',) + tuple(nb_classes) 77 | else: 78 | self.nb_classes = ('PRIOR',) + tuple(langs) 79 | 80 | logger.debug("nb_classes: {}".format(self.nb_classes)) 81 | 82 | # Prepare prior and attach it to nb_ptc 83 | if prior is None: 84 | prior = np.ones(nb_ptc.shape[0]) 85 | 86 | if len(prior) != nb_ptc.shape[0]: 87 | raise ValueError("length of prior does not match number of terms in ptc") 88 | prior = np.array(prior, dtype=float) / np.sum(prior) # Normalize to sum 1 89 | 90 | if langs is None: 91 | self.nb_ptc = np.hstack((prior[:,None], nb_ptc)) 92 | else: 93 | self.nb_ptc = np.hstack((prior[:,None], nb_ptc[:,[nb_classes.index(l) for l in langs]])) 94 | 95 | logger.debug("initialized a 
MultiLanguageIdentifier instance") 96 | logger.debug("n_iters: {0}".format(self.n_iters)) 97 | logger.debug("max_lang: {0}".format(self.max_lang)) 98 | logger.debug("thresh: {0}".format(self.thresh)) 99 | logger.debug("ptc shape: {0}".format(self.nb_ptc.shape)) 100 | 101 | def instance2fv(self, text): 102 | """ 103 | Map an instance into the feature space of the trained model. 104 | """ 105 | if isinstance(text, unicode): 106 | text = text.encode('utf8') 107 | 108 | arr = np.zeros((self.nb_ptc.shape[0],), dtype='uint32') 109 | 110 | # Convert the text to a sequence of ascii values 111 | ords = map(ord, text) 112 | 113 | # Count the number of times we enter each state 114 | state = 0 115 | statecount = defaultdict(int) 116 | for letter in ords: 117 | state = self.tk_nextmove[(state << 8) + letter] 118 | statecount[state] += 1 119 | 120 | # Update all the productions corresponding to the state 121 | for state in statecount: 122 | for index in self.tk_output.get(state, []): 123 | arr[index] += statecount[state] 124 | 125 | return arr 126 | 127 | def explain(self, fv, iters = None, alpha = 0., subset = None): 128 | """ 129 | Explain a feature vector in terms of a set of classes. 130 | Uses a Gibbs sampler to compute the most likely class distribution 131 | over the specified class set to have generated this feature vector. 132 | 133 | @param subset specifies the subset of classes to use (defaults to all) 134 | @returns counts of how many documents have been allocated to each topic 135 | """ 136 | 137 | if iters is None: 138 | iters = self.n_iters 139 | 140 | if subset is None: 141 | ptc = self.nb_ptc 142 | else: 143 | ptc = self.nb_ptc[:,subset] 144 | 145 | # Initially random allocation of terms to topics 146 | K = ptc.shape[1] # number of topics (languages) 147 | z_n = np.random.randint(0, K, fv.sum()) 148 | n_m_z = np.bincount(z_n, minlength=K) + alpha 149 | 150 | t_nz = list(compress(enumerate(fv), fv>0)) 151 | 152 | for i in range(iters): 153 | # We have a collased representation of the document, where we 154 | # only keep the counts of terms and not their relative ordering 155 | # (which the model assumes is fully exchangeable anyway) 156 | n = 0 # keep track of the feature index 157 | for t, n_t in t_nz: 158 | for _ in xrange(n_t): 159 | # discount for n-th word t with topic z 160 | z = z_n[n] 161 | n_m_z[z] -= 1 162 | 163 | # sampling topic new_z for t 164 | dist = np.cumsum(ptc[t] * n_m_z) 165 | samp = np.random.random() * dist[-1] 166 | new_z = np.searchsorted(dist,samp) 167 | 168 | # set z the new topic and increment counters 169 | z_n[n] = new_z 170 | n_m_z[new_z] += 1 171 | 172 | n += 1 173 | 174 | # n_m_z must be projected back into the full class space 175 | retval = np.zeros((self.nb_ptc.shape[1],), dtype=int) 176 | retval[subset] = (n_m_z - alpha).astype(int) 177 | 178 | return retval 179 | 180 | 181 | def logprob(self, fv, classes, iters=None, lam_c=None): 182 | """ 183 | Compute the log-probability under our p(t|c) that the instance 184 | is composed by the given set of classes. 
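Concretely, this evaluates sum_t fv[t] * log( sum_{c in classes} lam_c[c] * P(t|c) ),
i.e. the log-likelihood of the observed token counts under a mixture of the
per-class term distributions, where the mixture weights lam_c are estimated
via explain() when not supplied.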
185 | """ 186 | if lam_c is None: 187 | # most likely distribution assuming the set of classes 188 | lam_c = self.explain(fv, iters, subset=classes) 189 | lam_c = lam_c.astype(float) / lam_c.sum() # norm to 1 190 | 191 | nz_t = fv > 0 # non-zero features 192 | prod = lam_c[classes] * self.nb_ptc[:,classes][nz_t] 193 | acc = np.sum(fv[nz_t] * np.log(np.sum(prod, axis=1))) 194 | return acc 195 | 196 | def identify(self, text): 197 | # tokenize document into a distribution over terms 198 | fv = self.instance2fv(text) 199 | doclen = np.sum(fv) 200 | if doclen == 0: 201 | # no LD tokens -> no languages present 202 | return {} 203 | 204 | dist = self.explain(fv) 205 | logger.debug("prior: {0} / {1} ({2:.1f}%)".format(dist[0], dist.sum(), dist[0]*100. / dist.sum())) 206 | cl_order = np.arange(len(dist))[dist.argsort()][::-1] 207 | 208 | # initially explain the document only in terms of the prior 209 | cl_set = [0] 210 | cl_dist = np.array([1.]) 211 | lp = self.logprob(fv, cl_set) 212 | 213 | for new_cl in [c for c in cl_order if c != 0 ][:self.max_lang]: 214 | cl_set_n = cl_set + [new_cl] 215 | # We obtain lam_c distinct from logprob as we will need it if we decide to keep. 216 | lam_c = self.explain(fv, subset=cl_set_n) 217 | lam_c = lam_c.astype(float) / lam_c.sum() # norm to 1 218 | est_lp = self.logprob(fv, cl_set_n, lam_c=lam_c) 219 | improve = (est_lp - lp) / doclen 220 | if improve > self.thresh: 221 | logger.debug(" {0} ACCEPT (improves by {1:.3f})".format(self.nb_classes[new_cl], improve)) 222 | lp = est_lp 223 | cl_set = cl_set_n 224 | cl_dist = lam_c 225 | else: 226 | logger.debug(" {0} REJECT (improves by {1:.3f})".format(self.nb_classes[new_cl], improve)) 227 | 228 | # Re-normalize the mass over the languages to 1 - ignoring the class0 mass. 229 | cl_dist[1:] /= cl_dist[1:].sum() 230 | 231 | retval = { self.nb_classes[c]:cl_dist[c] for c in cl_set[1:]} 232 | return retval 233 | -------------------------------------------------------------------------------- /polyglot/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous utilities. 
3 | 4 | Marco Lui , April 2013 5 | """ 6 | 7 | from contextlib import contextmanager, closing 8 | import multiprocessing as mp 9 | from itertools import imap 10 | 11 | @contextmanager 12 | def MapPool(processes=None, initializer=None, initargs=tuple(), maxtasksperchild=None, chunksize=1): 13 | """ 14 | Contextmanager to express the common pattern of not using multiprocessing if 15 | only 1 job is allocated (for example for debugging reasons) 16 | """ 17 | if processes is None: 18 | processes = mp.cpu_count() + 4 19 | 20 | if processes > 1: 21 | with closing( mp.Pool(processes, initializer, initargs, maxtasksperchild)) as pool: 22 | f = lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize) 23 | yield f 24 | else: 25 | if initializer is not None: 26 | initializer(*initargs) 27 | f = imap 28 | yield f 29 | 30 | if processes > 1: 31 | pool.join() 32 | from datetime import timedelta  # used by Timer.ETA below 33 | from timeit import default_timer 34 | class Timer(object): 35 | def __init__(self): 36 | self.timer = default_timer 37 | self.start = None 38 | self.end = None 39 | 40 | def __enter__(self): 41 | self.start = self.timer() 42 | self.end = None 43 | return self 44 | 45 | def __exit__(self, *args): 46 | self.end = self.timer() 47 | 48 | @property 49 | def elapsed(self): 50 | now = self.timer() 51 | if self.end is not None: 52 | return self.end - self.start 53 | else: 54 | return now - self.start 55 | 56 | def rate(self, count): 57 | now = self.timer() 58 | if self.start is None: 59 | raise ValueError("Not yet started") 60 | 61 | return count / (now - self.start) 62 | 63 | def ETA(self, count, target): 64 | """ 65 | Linearly estimate the ETA to reach target based on the current rate. 66 | """ 67 | rate = self.rate(count) 68 | time_left = timedelta(seconds=int((target-count) / rate)) 69 | return time_left 70 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import sys, os 3 | 4 | version = '0.1' 5 | 6 | setup(name='polyglot', 7 | version=version, 8 | description="polyglot is a tool for detecting multilingual documents and identifying the languages therein.", 9 | long_description= open("README").read(), 10 | classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers 11 | keywords=['language detection', 'multilingual documents', 'text classification'], 12 | author='Marco Lui', 13 | author_email='saffsd@gmail.com', 14 | url='https://github.com/saffsd/polyglot', 15 | license='BSD', 16 | packages=['polyglot'], 17 | package_data={'polyglot':['models/*']}, 18 | include_package_data=True, 19 | zip_safe=False, 20 | install_requires=[ 21 | # -*- Extra requirements: -*- 22 | 'numpy', 23 | ], 24 | entry_points= { 25 | 'console_scripts': [ 26 | 'polyglot = polyglot.detect:main', 27 | ], 28 | }, 29 | ) 30 | --------------------------------------------------------------------------------
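For reference, setup.py installs a polyglot console script that points at
polyglot.detect:main, and the re-training workflow described in the README
ends with polyglot/convert.py. Some illustrative invocations (a sketch only:
the file and directory names below are placeholders, while the flags are those
defined in polyglot/detect.py and polyglot/convert.py):

  # Identify the language mix of two documents, restricted to English, German
  # and French, writing one JSON record per document to results.json:
  polyglot --docs doc1.txt doc2.txt -l en,de,fr -o results.json

  # Process every file inside a tar archive using the bundled default model:
  polyglot --tarfile corpus.tar -o results.json

  # Convert a model built with the langid.py training tools to polyglot's
  # format, then use it via --model:
  python polyglot/convert.py path/to/langid_model_dir custom.model
  polyglot -m custom.model --docs doc1.txt doc2.txt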