├── .github ├── FUNDING.yml └── workflows │ ├── deploy.yml │ ├── deploy_test.yml │ ├── generate_documentation.yml │ ├── numpy_version.py │ └── pull_request_test.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.kr.rst ├── README.rst ├── benchmark.py ├── document ├── document_header.html └── pdoc_localization.sh ├── examples ├── coherence.py ├── corpus_and_inference.py ├── corpus_and_labeling.py ├── ctm_network.py ├── dmr_multi_label.py ├── dmr_plot.py ├── dtm.py ├── extract_ngram.py ├── gdmr_both_categorical_and_numerical.py ├── gdmr_plot.py ├── hdp_basic.py ├── hdp_visualization.py ├── hlda_basic.py ├── lda_basic.py ├── lda_visualization.py ├── raw_corpus_and_labeling.py └── word_prior.py ├── licenses_bundled ├── EigenRand └── MapboxVariant ├── requirements.txt ├── setup.py ├── src ├── Coherence │ ├── CoherenceModel.hpp │ ├── Common.h │ ├── ConfirmMeasurer.hpp │ ├── ProbEstimator.hpp │ └── Segmentor.hpp ├── Labeling │ ├── FoRelevance.cpp │ ├── FoRelevance.h │ ├── Labeler.h │ └── Phraser.hpp ├── TopicModel │ ├── CT.h │ ├── CTModel.cpp │ ├── CTModel.hpp │ ├── DMR.h │ ├── DMRModel.cpp │ ├── DMRModel.hpp │ ├── DT.h │ ├── DTM.h │ ├── DTModel.cpp │ ├── DTModel.hpp │ ├── GDMR.h │ ├── GDMRModel.cpp │ ├── GDMRModel.hpp │ ├── HDP.h │ ├── HDPModel.cpp │ ├── HDPModel.hpp │ ├── HLDA.h │ ├── HLDAModel.cpp │ ├── HLDAModel.hpp │ ├── HPA.h │ ├── HPAModel.cpp │ ├── HPAModel.hpp │ ├── LDA.h │ ├── LDACVB0Model.hpp │ ├── LDAModel.cpp │ ├── LDAModel.hpp │ ├── LLDA.h │ ├── LLDAModel.cpp │ ├── LLDAModel.hpp │ ├── MGLDA.h │ ├── MGLDAModel.cpp │ ├── MGLDAModel.hpp │ ├── PA.h │ ├── PAModel.cpp │ ├── PAModel.hpp │ ├── PLDA.h │ ├── PLDAModel.cpp │ ├── PLDAModel.hpp │ ├── PT.h │ ├── PTModel.cpp │ ├── PTModel.hpp │ ├── SLDA.h │ ├── SLDAModel.cpp │ ├── SLDAModel.hpp │ └── TopicModel.hpp ├── Utils │ ├── AliasMethod.hpp │ ├── Dictionary.cpp │ ├── Dictionary.h │ ├── EigenAddonOps.hpp │ ├── LBFGS.h │ ├── LBFGS │ │ ├── LineSearchBacktracking.h │ │ ├── LineSearchBracketing.h │ │ └── Param.h │ ├── LUT.hpp │ ├── Mmap.cpp │ ├── Mmap.h │ ├── MultiNormalDistribution.hpp │ ├── PolyaGamma.hpp │ ├── PolyaGammaHybrid.hpp │ ├── SharedString.cpp │ ├── SharedString.h │ ├── ThreadPool.hpp │ ├── Trie.hpp │ ├── TruncMultiNormal.hpp │ ├── Utils.hpp │ ├── avx_gamma.h │ ├── avx_mathfun.h │ ├── exception.h │ ├── math.h │ ├── neon_gamma.h │ ├── rtnorm.hpp │ ├── sample.hpp │ ├── serializer.cpp │ ├── serializer.hpp │ ├── slp.hpp │ ├── sse_gamma.h │ ├── sse_mathfun.h │ ├── text.hpp │ └── tvector.hpp └── python │ ├── dispatcher │ └── py_rt.cpp │ └── handler │ ├── PyUtils.h │ ├── coherence.h │ ├── docs.h │ ├── label.h │ ├── label_docs.h │ ├── module.h │ ├── py_CT.cpp │ ├── py_DMR.cpp │ ├── py_DT.cpp │ ├── py_GDMR.cpp │ ├── py_HDP.cpp │ ├── py_HLDA.cpp │ ├── py_HPA.cpp │ ├── py_LDA.cpp │ ├── py_LLDA.cpp │ ├── py_MGLDA.cpp │ ├── py_PA.cpp │ ├── py_PLDA.cpp │ ├── py_PT.cpp │ ├── py_SLDA.cpp │ ├── py_coherence.cpp │ ├── py_label.cpp │ ├── py_main.cpp │ ├── py_utils.cpp │ └── utils.h ├── test ├── sample.txt ├── sample_raw.txt ├── sample_tp.txt ├── sample_with_md.txt └── unit_test.py └── tomotopy ├── __init__.py ├── _call_utils.py ├── _show_progress.py ├── _summary.py ├── _version.py ├── auto_labeling_code.rst ├── auto_labeling_code_with_porter.rst ├── coherence.py ├── documentation.kr.rst ├── documentation.rst ├── label.py ├── utils.py └── viewer ├── __init__.py ├── __main__.py ├── template.html └── viewer_server.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These 
are supported funding model platforms 2 | 3 | github: [bab2min] 4 | -------------------------------------------------------------------------------- /.github/workflows/generate_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Generate the documentation 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | - '!v*.*.*d' 8 | 9 | jobs: 10 | build_manylinux: 11 | name: Build for manylinux 12 | runs-on: ubuntu-latest 13 | container: 14 | image: docker://quay.io/pypa/manylinux2014_x86_64 15 | strategy: 16 | max-parallel: 1 17 | matrix: 18 | language: [kr, en] 19 | 20 | steps: 21 | - uses: actions/checkout@v1 22 | - name: Install dependencies 23 | run: | 24 | /opt/python/cp39-cp39/bin/python -m pip install --upgrade pip 25 | yum install libffi-devel -y 26 | git clone https://gitlab.com/libeigen/eigen 27 | cd eigen 28 | git checkout tags/3.4.0 29 | cd .. 30 | mkdir include 31 | mv eigen/Eigen/ include/ 32 | git clone https://github.com/bab2min/EigenRand 33 | cd EigenRand 34 | git checkout tags/v0.4.1 35 | cd .. 36 | mv EigenRand/EigenRand include/ 37 | git clone https://github.com/mapbox/variant 38 | cd variant 39 | git checkout tags/v1.1.3 40 | cd .. 41 | mv variant/include/mapbox include/ 42 | - name: build 43 | run: | 44 | /opt/python/cp39-cp39/bin/python -m pip install numpy==`/opt/python/cp39-cp39/bin/python .github/workflows/numpy_version.py` 45 | /opt/python/cp39-cp39/bin/python -m pip install pdoc3==0.8.4 46 | export TOMOTOPY_LANG=${{ matrix.language }} 47 | /opt/python/cp39-cp39/bin/python setup.py install 48 | - name: gen doc 49 | run: | 50 | export TOMOTOPY_VER="`/opt/python/cp39-cp39/bin/python -m pip show tomotopy | grep Version | cut -d' ' -f2`" 51 | export TOMOTOPY_LANG=${{ matrix.language }} 52 | /opt/python/cp39-cp39/bin/python -m pdoc --html tomotopy 53 | sed -i -E "s/documentation<\/title>/documentation (v${TOMOTOPY_VER})<\/title>/" html/tomotopy/*.html 54 | sed -i -E 's/<\/title>/<\/title>/' html/tomotopy/*.html 55 | sed -i -E 's/(

<\/p>)/ 66 | 67 | -------------------------------------------------------------------------------- /document/pdoc_localization.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$TOMOTOPY_LANG" = "kr" ]; then 4 | sed -i -E "s/Parameters<\/h2>/파라미터<\/h2>/g" $@ 5 | sed -i -E "s/Added in version:/추가된 버전:/g" $@ 6 | sed -i -E "s/Instance variables<\/h3>/인스턴스 변수<\/h3>/g" $@ 7 | sed -i -E "s/Methods<\/h3>/메소드<\/h3>/g" $@ 8 | sed -i -E "s/Inherited members<\/h3>/상속받은 메소드 및 변수<\/h3>/g" $@ 9 | sed -i -E "s/Ancestors<\/h3>/부모 클래스<\/h3>/g" $@ 10 | sed -i -E "s/Super-module<\/h3>/상위 모듈<\/h3>/g" $@ 11 | sed -i -E "s/Sub-modules<\/a>/하위 모듈<\/a>/g" $@ 12 | sed -i -E "s/Global variables<\/a>/전역 변수<\/a>/g" $@ 13 | sed -i -E "s/Classes<\/a>/클래스<\/a>/g" $@ 14 | fi 15 | -------------------------------------------------------------------------------- /examples/coherence.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This example shows how to perform a Latent Dirichlet Allocation 3 | and calculate coherence of the results. 4 | 5 | Required Packages: 6 | nltk, sklearn 7 | ''' 8 | 9 | import tomotopy as tp 10 | import nltk 11 | from nltk.corpus import stopwords 12 | import re 13 | from sklearn.datasets import fetch_20newsgroups 14 | import itertools 15 | 16 | print('Training lda models...') 17 | try: 18 | # load if trained model exist already 19 | mdl = tp.LDAModel.load('trained_lda_model.bin') 20 | except: 21 | porter_stemmer = nltk.PorterStemmer().stem 22 | english_stops = set(porter_stemmer(w) for w in stopwords.words('english')) 23 | pat = re.compile('^[a-z]{2,}$') 24 | corpus = tp.utils.Corpus( 25 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer), 26 | stopwords=lambda x: x in english_stops or not pat.match(x) 27 | ) 28 | newsgroups_train = fetch_20newsgroups() 29 | corpus.process(d.lower() for d in newsgroups_train.data) 30 | 31 | mdl = tp.LDAModel(min_df=5, rm_top=30, k=20, corpus=corpus) 32 | mdl.train(0) 33 | 34 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 35 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 36 | )) 37 | print('Removed Top words: ', *mdl.removed_top_words) 38 | 39 | # Let's train the model 40 | mdl.train(1000, show_progress=True) 41 | mdl.summary() 42 | 43 | # save lda model for reuse 44 | mdl.save('trained_lda_model.bin') 45 | 46 | # calculate coherence using preset 47 | for preset in ('u_mass', 'c_uci', 'c_npmi', 'c_v'): 48 | coh = tp.coherence.Coherence(mdl, coherence=preset) 49 | average_coherence = coh.get_score() 50 | coherence_per_topic = [coh.get_score(topic_id=k) for k in range(mdl.k)] 51 | print('==== Coherence : {} ===='.format(preset)) 52 | print('Average:', average_coherence, '\nPer Topic:', coherence_per_topic) 53 | print() 54 | 55 | # calculate coherence using custom combination 56 | for seg, cm, im in itertools.product(tp.coherence.Segmentation, tp.coherence.ConfirmMeasure, tp.coherence.IndirectMeasure): 57 | coh = tp.coherence.Coherence(mdl, coherence=(tp.coherence.ProbEstimation.DOCUMENT, seg, cm, im)) 58 | average_coherence = coh.get_score() 59 | coherence_per_topic = [coh.get_score(topic_id=k) for k in range(mdl.k)] 60 | print('==== Coherence : {}, {}, {} ===='.format(repr(seg), repr(cm), repr(im))) 61 | print('Average:', average_coherence, '\nPer Topic:', coherence_per_topic) 62 | print() 63 | -------------------------------------------------------------------------------- /examples/corpus_and_inference.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | import tomotopy as tp 3 | 4 | # You can get the sample data file 'enwiki-stemmed-1000.txt' 5 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing 6 | 7 | def infer_new_corpus(): 8 | ''' 9 | Since version 0.10.0, inference using an instance of `Corpus` has been supported. 10 | ''' 11 | 12 | train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.']) 13 | train_corpus.process(open('enwiki-stemmed-1000.txt', encoding='utf-8')) 14 | 15 | test_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.']) 16 | test_corpus.process(open('corpus_to_be_inferred.txt', encoding='utf-8')) 17 | 18 | # make LDA model and train 19 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=train_corpus) 20 | mdl.train(0) 21 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 22 | print('Removed top words:', mdl.removed_top_words) 23 | 24 | mdl.train(1000, show_progress=True) 25 | mdl.summary() 26 | 27 | inferred_corpus, ll = mdl.infer(test_corpus) 28 | 29 | # print topic distributions of each document 30 | for doc in inferred_corpus: 31 | #print(doc.raw) # print raw string of the document 32 | #print(list(doc)) # print a list of words within the document 33 | print(doc.get_topic_dist()) 34 | 35 | def infer_new_doc(): 36 | ''' 37 | Prior to version 0.10.0, we had to make instances of `Document` using `make_doc` first 38 | and then call `infer`. 39 | ''' 40 | train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.']) 41 | train_corpus.process(open('enwiki-stemmed-1000.txt', encoding='utf-8')) 42 | 43 | # make LDA model and train 44 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=train_corpus) 45 | mdl.train(0) 46 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 47 | print('Removed top words:', mdl.removed_top_words) 48 | for i in range(0, 1000, 10): 49 | mdl.train(10) 50 | print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word)) 51 | 52 | mdl.summary() 53 | 54 | docs = [] 55 | for line in open('enwiki-stemmed-1000.txt', encoding='utf-8'): 56 | docs.append(mdl.make_doc(line.lower().split())) 57 | 58 | topic_distributions, ll = mdl.infer(docs) 59 | 60 | # print topic distributions of each document 61 | for doc, topic_dist in zip(docs, topic_distributions): 62 | #print(doc) 63 | print(topic_dist) 64 | 65 | infer_new_corpus() 66 | infer_new_doc() -------------------------------------------------------------------------------- /examples/corpus_and_labeling.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tomotopy as tp 3 | 4 | def corpus_and_labeling_example(input_file): 5 | corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.']) 6 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string) 7 | corpus.process(open(input_file, encoding='utf-8')) 8 | 9 | # make LDA model and train 10 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus) 11 | mdl.train(0) 12 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 13 | print('Removed top words:', mdl.removed_top_words) 14 | 15 | mdl.train(1000, show_progress=True) 16 | mdl.summary() 17 | 18 | # extract candidates for auto topic labeling 19 | extractor = tp.label.PMIExtractor(min_cf=10, 
min_df=5, max_len=5, max_cand=10000) 20 | cands = extractor.extract(mdl) 21 | 22 | labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25) 23 | for k in range(mdl.k): 24 | print("== Topic #{} ==".format(k)) 25 | print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5))) 26 | for word, prob in mdl.get_topic_words(k, top_n=10): 27 | print(word, prob, sep='\t') 28 | print() 29 | 30 | # You can get the sample data file 'enwiki-stemmed-1000.txt' 31 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing 32 | 33 | print('Running LDA and Labeling') 34 | corpus_and_labeling_example('enwiki-stemmed-1000.txt') 35 | -------------------------------------------------------------------------------- /examples/ctm_network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This example shows how to perform a Correlated Topic Model using tomotopy 3 | and visualize the correlation between topics. 4 | 5 | 6 | Required Packages: 7 | nltk, sklearn, pyvis 8 | ''' 9 | 10 | import tomotopy as tp 11 | import nltk 12 | from nltk.corpus import stopwords 13 | import re 14 | from sklearn.datasets import fetch_20newsgroups 15 | from pyvis.network import Network 16 | 17 | try: 18 | # load if preprocessed corpus exists 19 | corpus = tp.utils.Corpus.load('preprocessed_20news.cps') 20 | except IOError: 21 | porter_stemmer = nltk.PorterStemmer().stem 22 | english_stops = set(porter_stemmer(w) for w in stopwords.words('english')) 23 | pat = re.compile('^[a-z]{2,}$') 24 | corpus = tp.utils.Corpus( 25 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer), 26 | stopwords=lambda x: x in english_stops or not pat.match(x) 27 | ) 28 | newsgroups_train = fetch_20newsgroups() 29 | corpus.process(d.lower() for d in newsgroups_train.data) 30 | # save preprocessed corpus for reuse 31 | corpus.save('preprocessed_20news.cps') 32 | 33 | mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=40, k=30, corpus=corpus) 34 | mdl.train(0) 35 | 36 | # Since we have more than ten thousand documents, 37 | # setting `num_beta_sample` to a smaller value will not make the result inaccurate. 
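# As a rough guide (our assumption, not an official tomotopy recommendation):
# with a smaller corpus you would want more beta samples per iteration to reduce
# the variance of the estimated topic correlations, e.g.
#   mdl.num_beta_sample = 20 if len(corpus) < 1000 else 5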
38 | mdl.num_beta_sample = 5 39 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 40 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 41 | )) 42 | print('Removed Top words: ', *mdl.removed_top_words) 43 | 44 | # Let's train the model 45 | mdl.train(1000, show_progress=True) 46 | mdl.summary() 47 | 48 | # Let's visualize the result 49 | g = Network(width=800, height=800, font_color="#333") 50 | correl = mdl.get_correlations().reshape([-1]) 51 | correl.sort() 52 | top_tenth = mdl.k * (mdl.k - 1) // 10 53 | top_tenth = correl[-mdl.k - top_tenth] 54 | 55 | for k in range(mdl.k): 56 | label = "#{}".format(k) 57 | title = ' '.join(word for word, _ in mdl.get_topic_words(k, top_n=6)) 58 | print('Topic', label, title) 59 | g.add_node(k, label=label, title=title, shape='ellipse') 60 | for l, correlation in zip(range(k), mdl.get_correlations(k)): 61 | if correlation < top_tenth: continue 62 | g.add_edge(k, l, value=float(correlation), title='{:.02}'.format(correlation)) 63 | 64 | g.barnes_hut(gravity=-1000, spring_length=20) 65 | g.show_buttons() 66 | g.show("topic_network.html") 67 | -------------------------------------------------------------------------------- /examples/dmr_multi_label.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This example shows how to perform a DMR topic model with multi-metadata using tomotopy 3 | ''' 4 | import itertools 5 | 6 | import tomotopy as tp 7 | import numpy as np 8 | 9 | # You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data . 10 | corpus = tp.utils.Corpus() 11 | for line in open('text_mining_year_journal.txt', encoding='utf-8'): 12 | fd = line.strip().split('\t', maxsplit=2) 13 | corpus.add_doc(fd[2].split(), multi_metadata=['y_' + fd[0], 'j_' + fd[1]]) 14 | # We add the prefix 'y_' for year labels and 'j_' for journal labels 15 | 16 | # Unlike g-DMR, plain DMR takes only categorical metadata, 17 | # so we don't set any metadata range here. 
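# Sketch of the underlying model (our summary of DMR, not code from this example):
# every document gets its own Dirichlet prior over topics derived from its labels,
# roughly alpha[d, k] = exp(lambda[k] . x[d]), where x[d] is the indicator vector of
# the document's metadata; get_topic_prior() used below evaluates this kind of prior.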
18 | mdl = tp.DMRModel(tw=tp.TermWeight.ONE, 19 | k=20, 20 | corpus=corpus 21 | ) 22 | mdl.optim_interval = 20 23 | mdl.burn_in = 200 24 | 25 | mdl.train(0) 26 | 27 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 28 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 29 | )) 30 | 31 | # Let's train the model 32 | mdl.train(2000, show_progress=True) 33 | 34 | mdl.summary() 35 | 36 | year_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('y_')) 37 | journal_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('j_')) 38 | 39 | # calculate the topic distribution for each metadata value using get_topic_prior() 40 | print('Topic distributions by year') 41 | for l in year_labels: 42 | print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n') 43 | 44 | print('Topic distributions by journal') 45 | for l in journal_labels: 46 | print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n') 47 | 48 | # We can also estimate topic distributions conditioned on multiple metadata values 49 | print('Topic distributions by year-journal') 50 | for y, j in itertools.product(year_labels, journal_labels): 51 | print(y, ',', j, '\n', mdl.get_topic_prior(multi_metadata=[y, j]), '\n') 52 | -------------------------------------------------------------------------------- /examples/dmr_plot.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This example shows how to perform a DMR topic model using tomotopy 3 | and visualize the topic distribution for each metadata value 4 | 5 | Required Packages: 6 | matplotlib 7 | ''' 8 | 9 | import tomotopy as tp 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | 13 | ''' 14 | You can get the sample data file from https://drive.google.com/file/d/1AUHdwaPzw5qW0j8MaKqFNfw-SQDMbIzw/view?usp=sharing . 15 | ''' 16 | 17 | corpus = tp.utils.Corpus() 18 | for line in open('text_mining.txt', encoding='utf-8'): 19 | fd = line.strip().split('\t') 20 | corpus.add_doc(fd[1].lower().split(), metadata=fd[0]) 21 | 22 | # DMR takes only categorical metadata (here, the first tab-separated field), 23 | # so no metadata range needs to be specified. 
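# Preview of what happens after training (mirroring the code further down):
# mdl.lambdas has shape [k, f] (topics x metadata values), and a numerically
# stable softmax over the topic axis,
#   probs = np.exp(mdl.lambdas - mdl.lambdas.max(axis=0)); probs /= probs.sum(axis=0)
# turns the learned lambdas into one topic distribution per metadata value.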
24 | mdl = tp.DMRModel(tw=tp.TermWeight.PMI, 25 | k=15, 26 | corpus=corpus 27 | ) 28 | mdl.optim_interval = 20 29 | mdl.burn_in = 200 30 | 31 | mdl.train(0) 32 | 33 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 34 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 35 | )) 36 | 37 | # Let's train the model 38 | mdl.train(2000, show_progress=True) 39 | 40 | mdl.summary() 41 | 42 | # calculate the topic distribution for each metadata value using softmax 43 | probs = np.exp(mdl.lambdas - mdl.lambdas.max(axis=0)) 44 | probs /= probs.sum(axis=0) 45 | 46 | print('Topic distributions for each metadata') 47 | for f, metadata_name in enumerate(mdl.metadata_dict): 48 | print(metadata_name, probs[:, f], '\n') 49 | 50 | x = np.arange(mdl.k) 51 | width = 1 / (mdl.f + 2) 52 | 53 | fig, ax = plt.subplots() 54 | for f, metadata_name in enumerate(mdl.metadata_dict): 55 | ax.bar(x + width * (f - mdl.f / 2), probs[:, f], width, label=metadata_name) 56 | 57 | ax.set_ylabel('Probabilities') 58 | ax.set_yscale('log') 59 | ax.set_title('Topic distributions') 60 | ax.set_xticks(x) 61 | ax.set_xticklabels(['Topic #{}'.format(k) for k in range(mdl.k)]) 62 | ax.legend() 63 | 64 | fig.tight_layout() 65 | plt.show() 66 | -------------------------------------------------------------------------------- /examples/dtm.py: -------------------------------------------------------------------------------- 1 | import tomotopy as tp 2 | import numpy as np 3 | import nltk 4 | import pyLDAvis 5 | 6 | def data_feeder(input_file): 7 | for line in open(input_file, encoding='utf-8'): 8 | fd = line.strip().split(maxsplit=1) 9 | timepoint = int(fd[0]) 10 | yield fd[1], None, {'timepoint':timepoint} 11 | 12 | porter_stemmer = nltk.PorterStemmer().stem 13 | corpus = tp.utils.Corpus( 14 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer) 15 | ) 16 | corpus.process(data_feeder('../test/sample_tp.txt')) 17 | 18 | mdl = tp.DTModel(min_cf=3, k=10, t=13, phi_var=1e-2, corpus=corpus) 19 | mdl.train(0) 20 | 21 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 22 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 23 | )) 24 | print('Removed Top words: ', *mdl.removed_top_words) 25 | 26 | # Let's train the model 27 | mdl.train(1000, show_progress=True) 28 | mdl.summary() 29 | 30 | topic_dist_by_time = np.zeros(shape=[mdl.num_timepoints, mdl.k], dtype=np.float64) 31 | for doc in mdl.docs: 32 | topic_dist_by_time[doc.timepoint] += doc.get_topic_dist() 33 | 34 | topic_dist_by_time /= mdl.num_docs_by_timepoint[:, np.newaxis] 35 | 36 | for k in range(mdl.k): 37 | print('Topic #{}'.format(k), *(w for w, _ in mdl.get_topic_words(k, 0, top_n=5))) 38 | print(topic_dist_by_time[:, k]) 39 | 40 | for timepoint in range(mdl.num_timepoints): 41 | topic_term_dists = np.stack([mdl.get_topic_word_dist(k, timepoint=timepoint) for k in range(mdl.k)]) 42 | doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs if doc.timepoint == timepoint]) 43 | doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True) 44 | doc_lengths = np.array([len(doc.words) for doc in mdl.docs if doc.timepoint == timepoint]) 45 | vocab = list(mdl.used_vocabs) 46 | term_frequency = mdl.used_vocab_freq 47 | 48 | prepared_data = pyLDAvis.prepare( 49 | topic_term_dists, 50 | doc_topic_dists, 51 | doc_lengths, 52 | vocab, 53 | term_frequency, 54 | start_index=0, 55 | sort_topics=False 56 | ) 57 | pyLDAvis.save_html(prepared_data, 'dtmvis_{}.html'.format(timepoint)) 58 | --------------------------------------------------------------------------------
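A small companion sketch, not a file from the repository: instead of rendering one pyLDAvis page per timepoint, the `topic_dist_by_time` array computed in dtm.py above can be plotted directly as topic trends. The snippet reuses `mdl` and `topic_dist_by_time` from that example and assumes matplotlib is installed.

import matplotlib.pyplot as plt

for k in range(mdl.k):
    # label each curve with the topic's top words at the first timepoint
    top_words = ' '.join(w for w, _ in mdl.get_topic_words(k, 0, top_n=3))
    plt.plot(range(mdl.num_timepoints), topic_dist_by_time[:, k], label='#{} {}'.format(k, top_words))
plt.xlabel('timepoint')
plt.ylabel('mean topic proportion')
plt.legend(fontsize='x-small')
plt.show()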
/examples/extract_ngram.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tomotopy as tp 3 | 4 | def extract_ngrams_example(input_file): 5 | from nltk.corpus import stopwords 6 | stops = set(stopwords.words('english')) 7 | stops.update(['many', 'also', 'would', 'often', 'could']) 8 | corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), 9 | stopwords=lambda x: len(x) <= 2 or x in stops) 10 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string) 11 | corpus.process(open(input_file, encoding='utf-8')) 12 | 13 | # extract the n-gram candidates first 14 | cands = corpus.extract_ngrams(min_cf=20, min_df=10, max_len=5, max_cand=1000, normalized=False) 15 | print('==== extracted n-gram collocations (using PMI) ====') 16 | for cand in cands: 17 | print(cand) 18 | 19 | # it prints like: 20 | # tomotopy.label.Candidate(words=["academic","nobel","prize","laureate"], name="", score=23.376673) 21 | # tomotopy.label.Candidate(words=["canadian","ice","hockey","player"], name="", score=21.658447) 22 | # tomotopy.label.Candidate(words=["english","race","car","driver"], name="", score=20.356688) 23 | # tomotopy.label.Candidate(words=["australian","rugby","league","player"], name="", score=20.124966) 24 | # tomotopy.label.Candidate(words=["american","race","car","driver"], name="", score=19.717760) 25 | # tomotopy.label.Candidate(words=["new","zealand","rugby","player"], name="", score=18.866398) 26 | # tomotopy.label.Candidate(words=["american","ice","hockey","player"], name="", score=17.599983) 27 | # tomotopy.label.Candidate(words=["american","actor","director","producer"], name="", score=16.722300) 28 | # tomotopy.label.Candidate(words=["nobel","prize","laureate"], name="", score=16.635370) 29 | # tomotopy.label.Candidate(words=["eastern","orthodox","liturgics"], name="", score=16.540277) 30 | # ... 31 | 32 | cands = corpus.extract_ngrams(min_cf=20, min_df=10, max_len=5, max_cand=1000, normalized=True) 33 | print('==== extracted n-gram collocations (using Normalized PMI) ====') 34 | for cand in cands: 35 | print(cand) 36 | 37 | # it prints like: 38 | # tomotopy.label.Candidate(words=["buenos","aires"], name="", score=0.996445) 39 | # tomotopy.label.Candidate(words=["los","angeles"], name="", score=0.988719) 40 | # tomotopy.label.Candidate(words=["las","vegas"], name="", score=0.982273) 41 | # tomotopy.label.Candidate(words=["hong","kong"], name="", score=0.978606) 42 | # tomotopy.label.Candidate(words=["hip","hop"], name="", score=0.965971) 43 | # tomotopy.label.Candidate(words=["nova","scotia"], name="", score=0.957440) 44 | # tomotopy.label.Candidate(words=["ice","hockey"], name="", score=0.932300) 45 | # tomotopy.label.Candidate(words=["nobel","prize","laureate"], name="", score=0.927281) 46 | # tomotopy.label.Candidate(words=["sri","lankan"], name="", score=0.925504) 47 | # tomotopy.label.Candidate(words=["ann","arbor"], name="", score=0.921129) 48 | # ... 
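# (Background, as we understand the scores above rather than a quote from the docs:
# for an n-gram w1..wn, PMI is log p(w1..wn) - sum_i log p(wi), and the normalized
# variant divides by -log p(w1..wn), which caps the score at 1; that is why the
# normalized scores cluster near 1.0 while the raw PMI scores range over roughly 16-23.)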
49 | 50 | # before concat 51 | print(corpus[3]) 52 | 53 | # concat n-grams in the corpus 54 | corpus.concat_ngrams(cands, delimiter='_') 55 | 56 | # after concat 57 | print(corpus[3]) 58 | 59 | # You can get the sample data file 'enwiki-1000.txt' 60 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing 61 | 62 | extract_ngrams_example('enwiki-1000.txt') 63 | -------------------------------------------------------------------------------- /examples/gdmr_both_categorical_and_numerical.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This example shows how to perform a g-DMR topic model 3 | for a mixture of categorical and numerical metadata using tomotopy 4 | and visualize a topic distribution. 5 | 6 | Required Packages: 7 | matplotlib 8 | ''' 9 | 10 | import tomotopy as tp 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | import matplotlib.colors as clr 14 | import re 15 | 16 | # You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data . 17 | corpus = tp.utils.Corpus() 18 | for line in open('text_mining_year_journal.txt', encoding='utf-8'): 19 | fd = line.strip().split('\t', maxsplit=2) 20 | corpus.add_doc(fd[2].split(), numeric_metadata=[float(fd[0])], metadata=fd[1]) 21 | # Use the argument `numeric_metadata` for continuous numerical metadata (list of float type), 22 | # and the argument `metadata` for categorical metadata (str type) 23 | 24 | # We set the range of the numeric metadata to [2000, 2017]. 25 | # `decay=1.0` penalizes higher-order terms of lambdas to prevent overfitting. 26 | mdl = tp.GDMRModel(tw=tp.TermWeight.ONE, k=30, degrees=[6], 27 | alpha=1e-2, sigma=0.25, sigma0=3.0, decay=1.0, 28 | metadata_range=[(2000, 2017)], corpus=corpus 29 | ) 30 | mdl.optim_interval = 20 31 | mdl.burn_in = 200 32 | 33 | mdl.train(0) 34 | 35 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 36 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 37 | )) 38 | 39 | # Let's train the model 40 | mdl.train(1000, show_progress=True) 41 | mdl.summary() 42 | 43 | # Let's visualize the result 44 | topic_counts = mdl.get_count_by_topics() 45 | lambdas = mdl.lambdas 46 | lambdas = lambdas.reshape(lambdas.shape[:1] + (len(mdl.metadata_dict), -1)) 47 | # lambdas shape: [num_topics, num_categorical_metadata, degrees + 1] 48 | 49 | md_range = mdl.metadata_range 50 | r = np.stack([mdl.tdf_linspace( 51 | [md_range[0][0]], 52 | [md_range[0][1]], 53 | [50], # interpolation size 54 | cat 55 | ) for cat in mdl.metadata_dict]) 56 | # r shape: [num_categorical_metadata, 50, num_topics] 57 | 58 | xs = np.linspace(*md_range[0], 50) 59 | for k in (-topic_counts).argsort(): 60 | print('Topic #{} ({})'.format(k, topic_counts[k])) 61 | print(*(w for w, _ in mdl.get_topic_words(k))) 62 | print('Lambda:', lambdas[k].reshape((len(mdl.metadata_dict), -1))) 63 | 64 | for label, ys in zip(mdl.metadata_dict, r[:, :, k]): 65 | label = re.sub(r'^(Proceedings|Journal)( of)?( the)?( -)?|International Conference on', '', label).strip() 66 | if len(label) >= 35: label = label[:33] + '...' 
67 | plt.plot(xs, ys, linewidth=2, label=label) 68 | plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5)))) 69 | plt.legend() 70 | plt.show() 71 | -------------------------------------------------------------------------------- /examples/gdmr_plot.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This example shows how to perform a g-DMR topic model using tomotopy 3 | and visualize a topic distribution map. 4 | 5 | Required Packages: 6 | matplotlib 7 | ''' 8 | 9 | import tomotopy as tp 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import matplotlib.colors as clr 13 | 14 | class ExpNormalize(clr.Normalize): 15 | def __init__(self, scale): 16 | super().__init__() 17 | self.scale = scale 18 | 19 | def __call__(self, value, clip=None): 20 | if clip is None: 21 | clip = self.clip 22 | 23 | result, is_scalar = self.process_value(value) 24 | 25 | self.autoscale_None(result) 26 | (vmin,), _ = self.process_value(self.vmin) 27 | (vmax,), _ = self.process_value(self.vmax) 28 | if vmin == vmax: 29 | result.fill(0) 30 | elif vmin > vmax: 31 | raise ValueError("minvalue must be less than or equal to maxvalue") 32 | else: 33 | if clip: 34 | mask = np.ma.getmask(result) 35 | result = np.ma.array(np.clip(result.filled(vmax), vmin, vmax), 36 | mask=mask) 37 | resdat = result.data 38 | resdat = 1 - np.exp(-2 * resdat / self.scale) 39 | result = np.ma.array(resdat, mask=result.mask, copy=False) 40 | if is_scalar: 41 | result = result[0] 42 | return result 43 | 44 | heat = clr.LinearSegmentedColormap.from_list('heat', 45 | [(0, 0, 0), (0, 0, 1), (0, 1, 1), (0, 1, 0), (1, 1, 0), (1, 0, 0), (1, 1, 1)], 46 | N=1024 47 | ) 48 | 49 | ''' 50 | You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data . 51 | ''' 52 | 53 | corpus = tp.utils.Corpus() 54 | for line in open('dataset2.txt', encoding='utf-8'): 55 | fd = line.strip().split() 56 | corpus.add_doc(fd[2:], numeric_metadata=list(map(float, fd[:2]))) 57 | 58 | # We set the range of the first metadata to [2000, 2017] 59 | # and that of the second metadata to [0, 1]. 60 | mdl = tp.GDMRModel(tw=tp.TermWeight.PMI, k=30, degrees=[4, 3], 61 | alpha=1e-2, sigma=0.25, sigma0=3.0, 62 | metadata_range=[(2000, 2017), (0, 1)], corpus=corpus 63 | ) 64 | mdl.optim_interval = 20 65 | mdl.burn_in = 200 66 | 67 | mdl.train(0) 68 | 69 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 70 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 71 | )) 72 | 73 | # Let's train the model 74 | mdl.train(1000, show_progress=True) 75 | mdl.summary() 76 | 77 | # Let's visualize the result 78 | topic_counts = mdl.get_count_by_topics() 79 | lambdas = mdl.lambdas 80 | 81 | md_range = mdl.metadata_range 82 | # Our topic distribution map has 83 | # 400 pixels for the first axis and 84 | # 200 pixels for the second axis. 
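# (A note on shapes, inferred from how `r` is indexed below: tdf_linspace evaluates
# the topic density function on a 400 x 200 grid over the metadata space, so `r`
# has shape [400, 200, k] and r[:, :, k] is the distribution map of topic k.)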
85 | r = mdl.tdf_linspace( 86 | [md_range[0][0], md_range[1][0]], 87 | [md_range[0][1], md_range[1][1]], 88 | [400, 200] 89 | ) 90 | 91 | for k in (-topic_counts).argsort(): 92 | print('Topic #{} ({})'.format(k, topic_counts[k])) 93 | print(*(w for w, _ in mdl.get_topic_words(k))) 94 | print('Lambda:', lambdas[k]) 95 | 96 | imgplot = plt.imshow(r[:, :, k].transpose(), clim=(0.0, r[:, :, k].max()), 97 | origin='lower', cmap=heat, norm=ExpNormalize(scale=0.04), 98 | extent=[*md_range[0], *md_range[1]], 99 | aspect='auto' 100 | ) 101 | plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5)))) 102 | plt.colorbar() 103 | plt.show() 104 | -------------------------------------------------------------------------------- /examples/hdp_basic.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tomotopy as tp 3 | 4 | def hdp_example(input_file, save_path): 5 | mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5) 6 | for n, line in enumerate(open(input_file, encoding='utf-8')): 7 | ch = line.strip().split() 8 | mdl.add_doc(ch) 9 | mdl.burn_in = 100 10 | mdl.train(0) 11 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 12 | print('Removed top words:', mdl.removed_top_words) 13 | print('Training...', file=sys.stderr, flush=True) 14 | mdl.train(1000, show_progress=True) 15 | mdl.summary() 16 | print('Saving...', file=sys.stderr, flush=True) 17 | mdl.save(save_path, True) 18 | 19 | important_topics = [k for k, v in sorted(enumerate(mdl.get_count_by_topics()), key=lambda x:x[1], reverse=True)] 20 | for k in important_topics: 21 | if not mdl.is_live_topic(k): continue 22 | print('Topic #{}'.format(k)) 23 | for word, prob in mdl.get_topic_words(k): 24 | print('\t', word, prob, sep='\t') 25 | # You can get the sample data file 'enwiki-stemmed-1000.txt' 26 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing 27 | 28 | print('Running HDP') 29 | hdp_example('enwiki-stemmed-1000.txt', 'test.hdp.bin') 30 | -------------------------------------------------------------------------------- /examples/hdp_visualization.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This example shows how to perform a Hierarchical Dirichlet Process using tomotopy 3 | and visualize the result. 
4 | 5 | 6 | Required Packages: 7 | nltk, sklearn, pyldavis 8 | ''' 9 | 10 | import tomotopy as tp 11 | import nltk 12 | from nltk.corpus import stopwords 13 | import re 14 | from sklearn.datasets import fetch_20newsgroups 15 | import numpy as np 16 | import pyLDAvis 17 | 18 | try: 19 | # load if preprocessed corpus exists 20 | corpus = tp.utils.Corpus.load('preprocessed_20news.cps') 21 | except IOError: 22 | porter_stemmer = nltk.PorterStemmer().stem 23 | english_stops = set(porter_stemmer(w) for w in stopwords.words('english')) 24 | pat = re.compile('^[a-z]{2,}$') 25 | corpus = tp.utils.Corpus( 26 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer), 27 | stopwords=lambda x: x in english_stops or not pat.match(x) 28 | ) 29 | newsgroups_train = fetch_20newsgroups() 30 | corpus.process(d.lower() for d in newsgroups_train.data) 31 | # save preprocessed corpus for reuse 32 | corpus.save('preprocessed_20news.cps') 33 | 34 | mdl = tp.HDPModel(tw=tp.TermWeight.PMI, min_df=5, rm_top=30, alpha=1, gamma=10, initial_k=10, corpus=corpus) 35 | mdl.train(0) 36 | mdl.burn_in = 500 37 | 38 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 39 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 40 | )) 41 | print('Removed Top words: ', *mdl.removed_top_words) 42 | 43 | # Let's train the model 44 | mdl.train(5000, show_progress=True) 45 | mdl.summary() 46 | 47 | live_topics = [k for k in range(mdl.k) if mdl.is_live_topic(k)] 48 | 49 | topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)]) 50 | topic_term_dists = topic_term_dists[live_topics] 51 | topic_term_dists /= topic_term_dists.sum(axis=1, keepdims=True) 52 | 53 | doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs]) 54 | doc_topic_dists = doc_topic_dists[:, live_topics] 55 | doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True) 56 | 57 | doc_lengths = np.array([len(doc.words) for doc in mdl.docs]) 58 | vocab = list(mdl.used_vocabs) 59 | term_frequency = mdl.used_vocab_freq 60 | 61 | prepared_data = pyLDAvis.prepare( 62 | topic_term_dists, 63 | doc_topic_dists, 64 | doc_lengths, 65 | vocab, 66 | term_frequency, 67 | start_index=0, 68 | sort_topics=False 69 | ) 70 | pyLDAvis.save_html(prepared_data, 'ldavis.html') 71 | -------------------------------------------------------------------------------- /examples/hlda_basic.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tomotopy as tp 3 | import numpy as np 4 | 5 | def hlda_example(input_file, save_path): 6 | from nltk.stem.porter import PorterStemmer 7 | from nltk.corpus import stopwords 8 | try: 9 | cps = tp.utils.Corpus.load(input_file + '.cached.cps') 10 | except IOError: 11 | stemmer = PorterStemmer() 12 | stops = set(stopwords.words('english')) 13 | cps = tp.utils.Corpus( 14 | tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem), 15 | stopwords=lambda x: len(x) <= 2 or x in stops 16 | ) 17 | cps.process(open(input_file, encoding='utf-8')) 18 | cps.save(input_file + '.cached.cps') 19 | 20 | np.random.seed(42) 21 | ridcs = np.random.permutation(len(cps)) 22 | test_idcs = ridcs[:20] 23 | train_idcs = ridcs[20:] 24 | 25 | test_cps = cps[test_idcs] 26 | train_cps = cps[train_idcs] 27 | 28 | mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=4, rm_top=10, corpus=train_cps) 29 | mdl.train(0) 30 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 31 | print('Removed top words:', mdl.removed_top_words) 32 | 
print('Training...', file=sys.stderr, flush=True) 33 | for _ in range(0, 1000, 10): 34 | mdl.train(7) 35 | mdl.train(3, freeze_topics=True) 36 | print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k)) 37 | 38 | for _ in range(0, 100, 10): 39 | mdl.train(10, freeze_topics=True) 40 | print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k)) 41 | 42 | mdl.summary() 43 | print('Saving...', file=sys.stderr, flush=True) 44 | mdl.save(save_path, True) 45 | 46 | test_result_cps, ll = mdl.infer(test_cps) 47 | for doc in test_result_cps: 48 | print(doc.path, doc.get_words(top_n=10)) 49 | 50 | # You can get the sample data file 'enwiki-16000.txt' 51 | # at https://drive.google.com/file/d/1OfyJ9TqaMiqzO6Qw-c_jXL-pmSIZf5Xt/view?usp=sharing 52 | 53 | if __name__ == '__main__': 54 | hlda_example('enwiki-16000.txt', 'test.hlda.tmm') 55 | -------------------------------------------------------------------------------- /examples/lda_basic.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tomotopy as tp 3 | 4 | def lda_example(input_file, save_path): 5 | mdl = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5, k=20) 6 | for n, line in enumerate(open(input_file, encoding='utf-8')): 7 | ch = line.strip().split() 8 | mdl.add_doc(ch) 9 | mdl.burn_in = 100 10 | mdl.train(0) 11 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 12 | print('Removed top words:', mdl.removed_top_words) 13 | print('Training...', file=sys.stderr, flush=True) 14 | mdl.train(1000, show_progress=True) 15 | mdl.summary() 16 | print('Saving...', file=sys.stderr, flush=True) 17 | mdl.save(save_path, True) 18 | 19 | for k in range(mdl.k): 20 | print('Topic #{}'.format(k)) 21 | for word, prob in mdl.get_topic_words(k): 22 | print('\t', word, prob, sep='\t') 23 | 24 | # You can get the sample data file 'enwiki-stemmed-1000.txt' 25 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing 26 | 27 | print('Running LDA') 28 | lda_example('enwiki-stemmed-1000.txt', 'test.lda.bin') 29 | -------------------------------------------------------------------------------- /examples/lda_visualization.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This example shows how to perform a Latent Dirichlet Allocation using tomotopy 3 | and visualize the result. 
4 | 5 | 6 | Required Packages: 7 | nltk, sklearn, pyldavis 8 | ''' 9 | 10 | import tomotopy as tp 11 | import nltk 12 | from nltk.corpus import stopwords 13 | import re 14 | from sklearn.datasets import fetch_20newsgroups 15 | import numpy as np 16 | import pyLDAvis 17 | 18 | try: 19 | # load if preprocessed corpus exists 20 | corpus = tp.utils.Corpus.load('preprocessed_20news.cps') 21 | except IOError: 22 | porter_stemmer = nltk.PorterStemmer().stem 23 | english_stops = set(porter_stemmer(w) for w in stopwords.words('english')) 24 | pat = re.compile('^[a-z]{2,}$') 25 | corpus = tp.utils.Corpus( 26 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer), 27 | stopwords=lambda x: x in english_stops or not pat.match(x) 28 | ) 29 | newsgroups_train = fetch_20newsgroups() 30 | corpus.process(d.lower() for d in newsgroups_train.data) 31 | # save preprocessed corpus for reuse 32 | corpus.save('preprocessed_20news.cps') 33 | 34 | mdl = tp.LDAModel(min_df=5, rm_top=40, k=30, corpus=corpus) 35 | mdl.train(0) 36 | 37 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 38 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 39 | )) 40 | print('Removed Top words: ', *mdl.removed_top_words) 41 | 42 | # Let's train the model 43 | mdl.train(1000, show_progress=True) 44 | mdl.summary() 45 | 46 | topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)]) 47 | doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs]) 48 | doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True) 49 | doc_lengths = np.array([len(doc.words) for doc in mdl.docs]) 50 | vocab = list(mdl.used_vocabs) 51 | term_frequency = mdl.used_vocab_freq 52 | 53 | prepared_data = pyLDAvis.prepare( 54 | topic_term_dists, 55 | doc_topic_dists, 56 | doc_lengths, 57 | vocab, 58 | term_frequency, 59 | start_index=0, # tomotopy starts topic ids with 0, pyLDAvis with 1 60 | sort_topics=False # IMPORTANT: otherwise the topic_ids between pyLDAvis and tomotopy are not matching! 
61 | ) 62 | pyLDAvis.save_html(prepared_data, 'ldavis.html') 63 | -------------------------------------------------------------------------------- /examples/raw_corpus_and_labeling.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tomotopy as tp 3 | 4 | def raw_corpus_and_labeling_example(input_file): 5 | from nltk.stem.porter import PorterStemmer 6 | from nltk.corpus import stopwords 7 | stemmer = PorterStemmer() 8 | stops = set(stopwords.words('english')) 9 | corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem), 10 | stopwords=lambda x: len(x) <= 2 or x in stops) 11 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string) 12 | corpus.process(open(input_file, encoding='utf-8')) 13 | 14 | # make LDA model and train 15 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus) 16 | mdl.train(0) 17 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 18 | print('Removed top words:', mdl.removed_top_words) 19 | mdl.train(1000, show_progress=True) 20 | mdl.summary() 21 | 22 | # extract candidates for auto topic labeling 23 | extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000, normalized=True) 24 | cands = extractor.extract(mdl) 25 | 26 | labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25) 27 | for k in range(mdl.k): 28 | print("== Topic #{} ==".format(k)) 29 | print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5))) 30 | for word, prob in mdl.get_topic_words(k, top_n=10): 31 | print(word, prob, sep='\t') 32 | print() 33 | 34 | 35 | # You can get the sample data file 'enwiki-1000.txt' 36 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing 37 | 38 | print('Running LDA from raw corpus and Labeling') 39 | raw_corpus_and_labeling_example('enwiki-1000.txt') 40 | -------------------------------------------------------------------------------- /examples/word_prior.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tomotopy as tp 3 | 4 | def word_prior_example(input_file): 5 | corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.']) 6 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string) 7 | corpus.process(open(input_file, encoding='utf-8')) 8 | 9 | # make LDA model and train 10 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus) 11 | # The word 'church' is assigned to Topic 0 with a weight of 1.0 and to the remaining topics with a weight of 0.1. 12 | # Therefore, a topic related to 'church' can be fixed at Topic 0 . 
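# (Mechanism, as we understand it: set_word_prior rescales the word's Dirichlet
# weight per topic, so the 1.0-vs-0.1 ratio below makes topic 0 roughly ten times
# more attractive for 'church' without hard-assigning it; the sampler may still
# place some of its occurrences in other topics.)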
13 | mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)]) 14 | # Topic 1 for a topic related to 'softwar' 15 | mdl.set_word_prior('softwar', [1.0 if k == 1 else 0.1 for k in range(20)]) 16 | # Topic 2 for a topic related to 'citi' 17 | mdl.set_word_prior('citi', [1.0 if k == 2 else 0.1 for k in range(20)]) 18 | mdl.train(0) 19 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 20 | print('Removed top words:', mdl.removed_top_words) 21 | mdl.train(1000, show_progress=True) 22 | mdl.summary() 23 | 24 | for k in range(mdl.k): 25 | print("== Topic #{} ==".format(k)) 26 | for word, prob in mdl.get_topic_words(k, top_n=10): 27 | print(word, prob, sep='\t') 28 | print() 29 | 30 | 31 | # You can get the sample data file 'enwiki-stemmed-1000.txt' 32 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing 33 | 34 | print('Set Word Prior') 35 | word_prior_example('enwiki-stemmed-1000.txt') 36 | -------------------------------------------------------------------------------- /licenses_bundled/EigenRand: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, bab2min 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /licenses_bundled/MapboxVariant: -------------------------------------------------------------------------------- 1 | Copyright (c) MapBox 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | - Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | - Redistributions in binary form must reproduce the above copyright notice, this 10 | list of conditions and the following disclaimer in the documentation and/or 11 | other materials provided with the distribution. 12 | - Neither the name "MapBox" nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 
15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 20 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 23 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.10.0,<2 -------------------------------------------------------------------------------- /src/Coherence/CoherenceModel.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | * Röder, M., Both, A., & Hinneburg, A. (2015, February). Exploring the space of topic coherence measures. In Proceedings of the eighth ACM international conference on Web search and data mining (pp. 399-408). 5 | http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf 6 | https://github.com/dice-group/Palmetto 7 | 8 | */ 9 | 10 | #include "Common.h" 11 | #include "ConfirmMeasurer.hpp" 12 | #include "ProbEstimator.hpp" 13 | #include "Segmentor.hpp" 14 | 15 | namespace tomoto 16 | { 17 | namespace coherence 18 | { 19 | class CoherenceModel 20 | { 21 | std::unique_ptr<IProbEstimator> pe; 22 | ProbEstimation pe_type = ProbEstimation::none; 23 | 24 | template<ProbEstimation _pe> 25 | void init(size_t windowSize) 26 | { 27 | pe_type = _pe; 28 | pe = std::make_unique<ProbEstimator<_pe>>(windowSize); 29 | } 30 | 31 | template<ProbEstimation _pe, typename _TargetIter> 32 | void _insertTargets(_TargetIter targetFirst, _TargetIter targetLast) 33 | { 34 | ((ProbEstimator<_pe>*)pe.get())->insertTargets(targetFirst, targetLast); 35 | } 36 | 37 | template<ProbEstimation _pe, typename _TargetIter> 38 | void _insertDoc(_TargetIter wordFirst, _TargetIter wordLast) 39 | { 40 | ((ProbEstimator<_pe>*)pe.get())->insertDoc(wordFirst, wordLast); 41 | } 42 | 43 | public: 44 | CoherenceModel() = default; 45 | 46 | CoherenceModel(ProbEstimation _pe, size_t windowSize) 47 | { 48 | switch (_pe) 49 | { 50 | case ProbEstimation::document: 51 | init<ProbEstimation::document>(windowSize); 52 | break; 53 | case ProbEstimation::sliding_windows: 54 | init<ProbEstimation::sliding_windows>(windowSize); 55 | break; 56 | default: 57 | throw std::invalid_argument{ "invalid ProbEstimation `_pe`" }; 58 | } 59 | } 60 | 61 | template<typename _TargetIter> 62 | void insertTargets(_TargetIter targetFirst, _TargetIter targetLast) 63 | { 64 | switch (pe_type) 65 | { 66 | case ProbEstimation::document: 67 | return _insertTargets<ProbEstimation::document>(targetFirst, targetLast); 68 | case ProbEstimation::sliding_windows: 69 | return _insertTargets<ProbEstimation::sliding_windows>(targetFirst, targetLast); 70 | default: 71 | throw std::invalid_argument{ "invalid ProbEstimation `_pe`" }; 72 | } 73 | } 74 | 75 | template<typename _TargetIter> 76 | void insertDoc(_TargetIter wordFirst, _TargetIter wordLast) 77 | { 78 | switch (pe_type) 79 | { 80 | case ProbEstimation::document: 81 | return _insertDoc<ProbEstimation::document>(wordFirst, wordLast); 82 | case ProbEstimation::sliding_windows: 83 | return _insertDoc<ProbEstimation::sliding_windows>(wordFirst, wordLast); 84 | default: 85 | throw std::invalid_argument{ "invalid ProbEstimation `_pe`" }; 86 | } 87 | } 88 | 89 
| template<Segmentation _seg, typename _CMFunc, typename _TargetIter> 90 | double getScore(_CMFunc&& cm, _TargetIter targetFirst, _TargetIter targetLast) const 91 | { 92 | return makeSegmentor<_seg>(std::forward<_CMFunc>(cm), pe.get())(targetFirst, targetLast); 93 | } 94 | 95 | }; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/Coherence/Common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../TopicModel/TopicModel.hpp" 3 | 4 | namespace tomoto 5 | { 6 | namespace coherence 7 | { 8 | enum class Segmentation 9 | { 10 | none = 0, 11 | one_one, 12 | one_pre, 13 | one_suc, 14 | one_all, 15 | one_set, 16 | }; 17 | 18 | enum class ProbEstimation 19 | { 20 | none = 0, 21 | document, 22 | sliding_windows, 23 | }; 24 | 25 | class IProbEstimator 26 | { 27 | public: 28 | virtual double getProb(Vid word) const = 0; 29 | virtual double getProb(Vid word1, Vid word2) const = 0; 30 | virtual double getProb(const std::vector<Vid>& words) const = 0; 31 | virtual double getJointNotProb(Vid word1, Vid word2) const = 0; 32 | virtual double getJointNotProb(Vid word1, const std::vector<Vid>& word2) const = 0; 33 | virtual ~IProbEstimator() {} 34 | 35 | double getProb(Vid word1, const std::vector<Vid>& word2) const 36 | { 37 | auto words = word2; 38 | if(std::find(words.begin(), words.end(), word1) == words.end()) words.emplace_back(word1); 39 | return getProb(words); 40 | } 41 | }; 42 | 43 | enum class ConfirmMeasure 44 | { 45 | none = 0, 46 | difference, 47 | ratio, 48 | likelihood, 49 | loglikelihood, 50 | pmi, 51 | npmi, 52 | logcond, 53 | }; 54 | 55 | enum class IndirectMeasure 56 | { 57 | none = 0, 58 | cosine, 59 | dice, 60 | jaccard, 61 | }; 62 | 63 | /*enum class Aggregation 64 | { 65 | none = 0, 66 | amean, 67 | median, 68 | gmean, 69 | hmean, 70 | qmean, 71 | };*/ 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/Coherence/Segmentor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Common.h" 4 | 5 | namespace tomoto 6 | { 7 | namespace coherence 8 | { 9 | template<Segmentation _seg, typename _CMFunc> 10 | class Segmentor; 11 | 12 | template<Segmentation _seg, typename _CMFunc> 13 | Segmentor<_seg, typename std::remove_reference<_CMFunc>::type> 14 | makeSegmentor(_CMFunc&& cm, const IProbEstimator* pe) 15 | { 16 | return { std::forward<_CMFunc>(cm), pe }; 17 | } 18 | 19 | template<typename _CMFunc> 20 | class Segmentor<Segmentation::one_one, _CMFunc> 21 | { 22 | const IProbEstimator* pe; 23 | _CMFunc cm; 24 | public: 25 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe } 26 | { 27 | } 28 | 29 | template<typename _TargetIter> 30 | double operator()(_TargetIter wordFirst, _TargetIter wordLast) 31 | { 32 | double ret = 0; 33 | double n = 0; 34 | for (auto it1 = wordFirst; it1 != wordLast; ++it1) 35 | { 36 | for (auto it2 = wordFirst; it2 != wordLast; ++it2) 37 | { 38 | if (it1 == it2) continue; 39 | ret += cm(pe, *it1, *it2); 40 | n += 1; 41 | } 42 | } 43 | return ret / n; 44 | } 45 | }; 46 | 47 | template<typename _CMFunc> 48 | class Segmentor<Segmentation::one_pre, _CMFunc> 49 | { 50 | const IProbEstimator* pe; 51 | _CMFunc cm; 52 | public: 53 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe } 54 | { 55 | } 56 | 57 | template<typename _TargetIter> 58 | double operator()(_TargetIter wordFirst, _TargetIter wordLast) 59 | { 60 | double ret = 0; 61 | double n = 0; 62 | for (auto it1 = wordFirst; it1 != wordLast; ++it1) 63 | { 64 | for (auto it2 = wordFirst; it2 != it1; ++it2) 65 | { 66 | ret += cm(pe, *it1, *it2); 67 | n += 1; 68 | } 69 | } 70 | return ret / n; 71 | } 72 | }; 73 | 74 | 
template<typename _CMFunc> 75 | class Segmentor<Segmentation::one_suc, _CMFunc> 76 | { 77 | const IProbEstimator* pe; 78 | _CMFunc cm; 79 | public: 80 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe } 81 | { 82 | } 83 | 84 | template<typename _TargetIter> 85 | double operator()(_TargetIter wordFirst, _TargetIter wordLast) 86 | { 87 | double ret = 0; 88 | double n = 0; 89 | for (auto it1 = wordFirst; it1 != wordLast; ++it1) 90 | { 91 | for (auto it2 = it1 + 1; it2 != wordLast; ++it2) 92 | { 93 | ret += cm(pe, *it1, *it2); 94 | n += 1; 95 | } 96 | } 97 | return ret / n; 98 | } 99 | }; 100 | 101 | template<typename _CMFunc> 102 | class Segmentor<Segmentation::one_all, _CMFunc> 103 | { 104 | const IProbEstimator* pe; 105 | _CMFunc cm; 106 | public: 107 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe } 108 | { 109 | } 110 | 111 | template<typename _TargetIter> 112 | double operator()(_TargetIter wordFirst, _TargetIter wordLast) 113 | { 114 | double ret = 0; 115 | double n = 0; 116 | for (auto it1 = wordFirst; it1 != wordLast; ++it1) 117 | { 118 | ret += cm(pe, *it1, std::vector<Vid>{ wordFirst, wordLast }); 119 | n += 1; 120 | } 121 | return ret / n; 122 | } 123 | }; 124 | 125 | 126 | template<typename _CMFunc> 127 | class Segmentor<Segmentation::one_set, _CMFunc> 128 | { 129 | const IProbEstimator* pe; 130 | _CMFunc cm; 131 | public: 132 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe } 133 | { 134 | } 135 | 136 | template<typename _TargetIter> 137 | double operator()(_TargetIter wordFirst, _TargetIter wordLast) 138 | { 139 | double ret = 0; 140 | double n = 0; 141 | for (auto it1 = wordFirst; it1 != wordLast; ++it1) 142 | { 143 | std::vector<Vid> rest; 144 | rest.insert(rest.end(), wordFirst, it1); 145 | rest.insert(rest.end(), it1 + 1, wordLast); 146 | ret += cm(pe, *it1, rest); 147 | n += 1; 148 | } 149 | return ret / n; 150 | } 151 | }; 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/Labeling/FoRelevance.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <mutex> 4 | #include "Labeler.h" 5 | #include "../Utils/EigenAddonOps.hpp" 6 | #include "../Utils/Trie.hpp" 7 | #include "../Utils/ThreadPool.hpp" 8 | 9 | /* 10 | Implementation of First-order Relevance for topic labeling by bab2min 11 | 12 | * Mei, Q., Shen, X., & Zhai, C. (2007, August). Automatic labeling of multinomial topic models. In Proceedings of the 13th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 490-499). 
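	In brief (our paraphrase of the paper, not a quotation): a candidate label l is
	scored for topic t by its first-order relevance, the expectation of PMI(w, l)
	under the topic's word distribution p(w|t), penalized by mu times the same
	expectation taken under the other topics, so labels that fit every topic
	equally well are discounted.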
13 | 14 | */ 15 | 16 | namespace tomoto 17 | { 18 | namespace label 19 | { 20 | class PMIExtractor : public IExtractor 21 | { 22 | size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates; 23 | bool normalized; 24 | public: 25 | PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, 26 | size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000, 27 | bool _normalized = false 28 | ) 29 | : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, 30 | minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen }, 31 | maxCandidates{ _maxCandidates }, normalized{ _normalized } 32 | { 33 | } 34 | 35 | std::vector extract(const ITopicModel* tm) const override; 36 | }; 37 | 38 | class PMIBEExtractor : public IExtractor 39 | { 40 | size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates; 41 | public: 42 | PMIBEExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2, 43 | size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000 44 | ) 45 | : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates } 46 | { 47 | } 48 | 49 | std::vector extract(const ITopicModel* tm) const override; 50 | }; 51 | 52 | class FoRelevance : public ILabeler 53 | { 54 | struct CandidateEx : public Candidate 55 | { 56 | std::unordered_map names; 57 | std::set docIds; 58 | Eigen::Array scores; 59 | 60 | CandidateEx() 61 | { 62 | } 63 | 64 | CandidateEx(const Candidate& c) 65 | : Candidate{ c } 66 | { 67 | } 68 | }; 69 | 70 | const ITopicModel* tm; 71 | size_t candMinDf; 72 | float smoothing, lambda, mu; 73 | size_t windowSize; 74 | std::unique_ptr pool; 75 | std::unique_ptr mtx; 76 | std::vector candidates; 77 | 78 | template 79 | const Eigen::ArrayXi& updateContext(size_t docId, const tomoto::DocumentBase* doc, const Trie* root); 80 | 81 | void estimateContexts(); 82 | 83 | public: 84 | template 85 | FoRelevance(const ITopicModel* _tm, 86 | _Iter candFirst, _Iter candEnd, 87 | size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f, 88 | size_t _windowSize = (size_t)-1, 89 | size_t numWorkers = 0) 90 | : tm{ _tm }, candMinDf{ _candMinDf }, 91 | smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }, windowSize{ _windowSize } 92 | { 93 | if (!numWorkers) numWorkers = std::thread::hardware_concurrency(); 94 | if (numWorkers > 1) 95 | { 96 | pool = std::make_unique(numWorkers); 97 | mtx = std::make_unique(numWorkers); 98 | } 99 | 100 | for (; candFirst != candEnd; ++candFirst) 101 | { 102 | candidates.emplace_back(*candFirst); 103 | } 104 | 105 | estimateContexts(); 106 | } 107 | 108 | std::vector> getLabels(Tid tid, size_t topK = 10) const override; 109 | }; 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/Labeling/Labeler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "../TopicModel/TopicModel.hpp" 5 | 6 | namespace tomoto 7 | { 8 | namespace label 9 | { 10 | struct Candidate 11 | { 12 | float score = 0; 13 | size_t cf = 0, df = 0; 14 | std::vector w; 15 | std::string name; 16 | 17 | Candidate() 18 | { 19 | } 20 | 21 | Candidate(float _score, Vid w1) 22 | : score{ _score }, w{ w1 } 23 | { 24 | } 25 | 26 | Candidate(float _score, Vid w1, Vid w2) 27 | : score{ _score }, w{ w1, w2 } 28 | { 29 | } 30 | 31 | Candidate(float _score, const std::vector& _w) 32 | : score{ _score }, w{ _w } 33 | { 34 | } 35 
| }; 36 | 37 | class IExtractor 38 | { 39 | public: 40 | 41 | virtual std::vector extract(const ITopicModel* tm) const = 0; 42 | virtual ~IExtractor() {} 43 | }; 44 | 45 | class ILabeler 46 | { 47 | public: 48 | virtual std::vector> getLabels(Tid tid, size_t topK = 10) const = 0; 49 | virtual ~ILabeler() {} 50 | }; 51 | } 52 | } -------------------------------------------------------------------------------- /src/TopicModel/CT.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDA.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentCTM : public DocumentLDA<_tw> 8 | { 9 | using BaseDocument = DocumentLDA<_tw>; 10 | using DocumentLDA<_tw>::DocumentLDA; 11 | Matrix beta; // Dim: (K, betaSample) 12 | Vector smBeta; // Dim: K 13 | 14 | DECLARE_SERIALIZER_WITH_VERSION(0); 15 | DECLARE_SERIALIZER_WITH_VERSION(1); 16 | }; 17 | 18 | struct CTArgs : public LDAArgs 19 | { 20 | 21 | }; 22 | 23 | class ICTModel : public ILDAModel 24 | { 25 | public: 26 | using DefaultDocType = DocumentCTM; 27 | static ICTModel* create(TermWeight _weight, const CTArgs& args, 28 | bool scalarRng = false); 29 | 30 | virtual void setNumBetaSample(size_t numSample) = 0; 31 | virtual size_t getNumBetaSample() const = 0; 32 | virtual void setNumTMNSample(size_t numSample) = 0; 33 | virtual size_t getNumTMNSample() const = 0; 34 | virtual void setNumDocBetaSample(size_t numSample) = 0; 35 | virtual size_t getNumDocBetaSample() const = 0; 36 | virtual std::vector getPriorMean() const = 0; 37 | virtual std::vector getPriorCov() const = 0; 38 | virtual std::vector getCorrelationTopic(Tid k) const = 0; 39 | }; 40 | } 41 | -------------------------------------------------------------------------------- /src/TopicModel/CTModel.cpp: -------------------------------------------------------------------------------- 1 | #include "CTModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 0, smBeta); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 1, 0x00010001, smBeta); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentCTM); 9 | 10 | ICTModel* ICTModel::create(TermWeight _weight, const CTArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, CTModel, args); 13 | } 14 | } -------------------------------------------------------------------------------- /src/TopicModel/DMR.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDA.h" 3 | 4 | namespace tomoto 5 | { 6 | class IDMRModel; 7 | 8 | template 9 | struct DocumentDMR : public DocumentLDA<_tw> 10 | { 11 | using BaseDocument = DocumentLDA<_tw>; 12 | using DocumentLDA<_tw>::DocumentLDA; 13 | uint64_t metadata = 0; 14 | std::vector multiMetadata; 15 | Vector mdVec; 16 | size_t mdHash = (size_t)-1; 17 | mutable Matrix cachedAlpha; 18 | 19 | RawDoc::MiscType makeMisc(const ITopicModel* tm) const override; 20 | 21 | DECLARE_SERIALIZER_WITH_VERSION(0); 22 | DECLARE_SERIALIZER_WITH_VERSION(1); 23 | }; 24 | 25 | struct DMRArgs : public LDAArgs 26 | { 27 | Float alphaEps = 1e-10; 28 | Float sigma = 1.0; 29 | }; 30 | 31 | class IDMRModel : public ILDAModel 32 | { 33 | public: 34 | using DefaultDocType = DocumentDMR; 35 | static IDMRModel* create(TermWeight _weight, const DMRArgs& args, 36 | bool scalarRng = false); 37 | 38 | virtual void setAlphaEps(Float _alphaEps) = 0; 39 | virtual Float getAlphaEps() const = 0; 40 | virtual void setOptimRepeat(size_t 
repeat) = 0; 41 | virtual size_t getOptimRepeat() const = 0; 42 | virtual size_t getF() const = 0; 43 | virtual size_t getMdVecSize() const = 0; 44 | virtual Float getSigma() const = 0; 45 | virtual const Dictionary& getMetadataDict() const = 0; 46 | virtual const Dictionary& getMultiMetadataDict() const = 0; 47 | virtual std::vector getLambdaByMetadata(size_t metadataId) const = 0; 48 | virtual std::vector getLambdaByTopic(Tid tid) const = 0; 49 | 50 | virtual std::vector getTopicPrior( 51 | const std::string& metadata, 52 | const std::vector& multiMetadata, 53 | bool raw = false 54 | ) const = 0; 55 | }; 56 | 57 | template 58 | RawDoc::MiscType DocumentDMR<_tw>::makeMisc(const ITopicModel* tm) const 59 | { 60 | RawDoc::MiscType ret = DocumentLDA<_tw>::makeMisc(tm); 61 | auto inst = static_cast(tm); 62 | ret["metadata"] = inst->getMetadataDict().toWord(metadata); 63 | return ret; 64 | } 65 | } -------------------------------------------------------------------------------- /src/TopicModel/DMRModel.cpp: -------------------------------------------------------------------------------- 1 | #include "DMRModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 0, metadata); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 1, 0x00010001, metadata, multiMetadata); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentDMR); 9 | 10 | IDMRModel* IDMRModel::create(TermWeight _weight, const DMRArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, DMRModel, args); 13 | } 14 | } -------------------------------------------------------------------------------- /src/TopicModel/DT.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDAModel.hpp" 3 | #include "LDA.h" 4 | 5 | namespace tomoto 6 | { 7 | template 8 | struct DocumentDTM : public DocumentLDA<_tw> 9 | { 10 | using BaseDocument = DocumentLDA<_tw>; 11 | using DocumentLDA<_tw>::DocumentLDA; 12 | 13 | uint64_t timepoint = 0; 14 | ShareableMatrix eta; 15 | sample::AliasMethod<> aliasTable; 16 | 17 | RawDoc::MiscType makeMisc(const ITopicModel* tm) const override 18 | { 19 | RawDoc::MiscType ret = DocumentLDA<_tw>::makeMisc(tm); 20 | ret["timepoint"] = (uint32_t)timepoint; 21 | return ret; 22 | } 23 | 24 | DECLARE_SERIALIZER_WITH_VERSION(0); 25 | DECLARE_SERIALIZER_WITH_VERSION(1); 26 | }; 27 | 28 | struct DTArgs : public LDAArgs 29 | { 30 | size_t t = 1; 31 | Float phi = 0.1; 32 | Float shapeA = 0.01; 33 | Float shapeB = 0.1; 34 | Float shapeC = 0.55; 35 | Float etaL2Reg = 0; 36 | 37 | DTArgs() 38 | { 39 | alpha[0] = 0.1; 40 | eta = 0.1; 41 | } 42 | }; 43 | 44 | class IDTModel : public ILDAModel 45 | { 46 | public: 47 | using DefaultDocType = DocumentDTM; 48 | static IDTModel* create(TermWeight _weight, const DTArgs& args, 49 | bool scalarRng = false); 50 | 51 | virtual size_t getT() const = 0; 52 | virtual std::vector getNumDocsByT() const = 0; 53 | 54 | virtual Float getAlphaVar() const = 0; 55 | virtual Float getEtaVar() const = 0; 56 | virtual Float getPhiVar() const = 0; 57 | 58 | virtual Float getShapeA() const = 0; 59 | virtual Float getShapeB() const = 0; 60 | virtual Float getShapeC() const = 0; 61 | 62 | virtual void setShapeA(Float a) = 0; 63 | virtual void setShapeB(Float a) = 0; 64 | virtual void setShapeC(Float a) = 0; 65 | 66 | virtual Float getAlpha(size_t k, size_t t) const = 0; 67 | virtual std::vector getPhi(size_t k, size_t t) const = 0; 68 | }; 69 | } 70 | 
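// ---- Illustrative sketch (editorial, not part of this file) ----
// Hypothetical usage of the IDTModel interface above: print how the
// per-timepoint prior alpha(k, t) evolves for one topic. It assumes a model
// created via IDTModel::create() and trained elsewhere, and that
// getNumDocsByT() returns one document count per time slice; both are
// assumptions of this sketch, not guarantees of the header.
#include <cstdio>

void printTopicTrajectory(const tomoto::IDTModel* model, size_t k)
{
    const size_t T = model->getT();              // number of time slices
    const auto docsByT = model->getNumDocsByT(); // documents per slice
    for (size_t t = 0; t < T; ++t)
    {
        std::printf("t=%zu docs=%zu alpha(k,t)=%g\n",
            t, (size_t)docsByT[t], (double)model->getAlpha(k, t));
    }
}
// ----------------------------------------------------------------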
-------------------------------------------------------------------------------- /src/TopicModel/DTM.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDAModel.hpp" 3 | #include "LDA.h" 4 | 5 | namespace tomoto 6 | { 7 | template 8 | struct DocumentDTM : public DocumentLDA<_tw, _Flags> 9 | { 10 | using BaseDocument = DocumentLDA<_tw, _Flags>; 11 | using DocumentLDA<_tw, _Flags>::DocumentLDA; 12 | using WeightType = typename std::conditional<_tw == TermWeight::one, int32_t, float>::type; 13 | }; 14 | 15 | class IDTModel : public ILDAModel 16 | { 17 | public: 18 | using DefaultDocType = DocumentDTM; 19 | static IDTModel* create(TermWeight _weight, size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }); 20 | 21 | }; 22 | } 23 | -------------------------------------------------------------------------------- /src/TopicModel/DTModel.cpp: -------------------------------------------------------------------------------- 1 | #include "DTModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 0, timepoint); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 1, 0x00010001, timepoint); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentDTM); 9 | 10 | IDTModel* IDTModel::create(TermWeight _weight, const DTArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, DTModel, args); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/TopicModel/GDMR.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "DMR.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentGDMR : public DocumentDMR<_tw> 8 | { 9 | using BaseDocument = DocumentDMR<_tw>; 10 | using DocumentDMR<_tw>::DocumentDMR; 11 | std::vector metadataOrg, metadataNormalized; 12 | 13 | RawDoc::MiscType makeMisc(const ITopicModel* tm) const override 14 | { 15 | RawDoc::MiscType ret = DocumentDMR<_tw>::makeMisc(tm); 16 | ret["numeric_metadata"] = metadataOrg; 17 | return ret; 18 | } 19 | 20 | DECLARE_SERIALIZER_WITH_VERSION(0); 21 | DECLARE_SERIALIZER_WITH_VERSION(1); 22 | }; 23 | 24 | struct GDMRArgs : public DMRArgs 25 | { 26 | std::vector degrees; 27 | Float sigma0 = 3.0; 28 | Float orderDecay = 0; 29 | }; 30 | 31 | class IGDMRModel : public IDMRModel 32 | { 33 | public: 34 | using DefaultDocType = DocumentDMR; 35 | static IGDMRModel* create(TermWeight _weight, const GDMRArgs& args, 36 | bool scalarRng = false); 37 | 38 | virtual Float getSigma0() const = 0; 39 | virtual Float getOrderDecay() const = 0; 40 | virtual void setSigma0(Float) = 0; 41 | virtual const std::vector& getFs() const = 0; 42 | virtual std::vector getLambdaByTopic(Tid tid) const = 0; 43 | 44 | virtual std::vector getTDF( 45 | const Float* metadata, 46 | const std::string& metadataCat, 47 | const std::vector& multiMetadataCat, 48 | bool normalize 49 | ) const = 0; 50 | 51 | virtual std::vector getTDFBatch( 52 | const Float* metadata, 53 | const std::string& metadataCat, 54 | const std::vector& multiMetadataCat, 55 | size_t stride, 56 | size_t cnt, 57 | bool normalize 58 | ) const = 0; 59 | 60 | virtual void setMdRange(const std::vector& vMin, const std::vector& vMax) = 0; 61 | virtual void getMdRange(std::vector& vMin, std::vector& vMax) const = 0; 62 | }; 63 | } 64 | -------------------------------------------------------------------------------- 
/src/TopicModel/GDMRModel.cpp: -------------------------------------------------------------------------------- 1 | #include "GDMRModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 0, metadataOrg); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentGDMR); 9 | 10 | IGDMRModel* IGDMRModel::create(TermWeight _weight, const GDMRArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, GDMRModel, args); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/TopicModel/HDP.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDA.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentHDP : public DocumentLDA<_tw> 8 | { 9 | /* 10 | For DocumentHDP, the topic in numByTopic, Zs indicates 'table id', not 'topic id'. 11 | To get real 'topic id', check the topic field of numTopicByTable. 12 | */ 13 | using BaseDocument = DocumentLDA<_tw>; 14 | using DocumentLDA<_tw>::DocumentLDA; 15 | using WeightType = typename DocumentLDA<_tw>::WeightType; 16 | struct TableTopicInfo 17 | { 18 | WeightType num; 19 | Tid topic; 20 | 21 | TableTopicInfo(WeightType _num = 0, Tid _topic = 0) : num(_num), topic(_topic) 22 | { 23 | } 24 | 25 | operator const bool() const 26 | { 27 | return num > (WeightType)1e-2; 28 | } 29 | 30 | void serializerWrite(std::ostream& writer) const 31 | { 32 | serializer::writeMany(writer, topic); 33 | } 34 | 35 | void serializerRead(std::istream& reader) 36 | { 37 | serializer::readMany(reader, topic); 38 | } 39 | }; 40 | std::vector numTopicByTable; 41 | 42 | DECLARE_SERIALIZER_WITH_VERSION(0); 43 | DECLARE_SERIALIZER_WITH_VERSION(1); 44 | 45 | size_t getNumTable() const 46 | { 47 | return std::count_if(numTopicByTable.begin(), numTopicByTable.end(), [](const TableTopicInfo& e) { return (bool)e; }); 48 | } 49 | 50 | // add a new table into doc and return the new table's idx 51 | size_t addNewTable(Tid tid) 52 | { 53 | return insertIntoEmpty(numTopicByTable, TableTopicInfo( 0, tid )); 54 | } 55 | 56 | template void update(WeightType* ptr, const _TopicModel& mdl); 57 | }; 58 | 59 | struct HDPArgs : public LDAArgs 60 | { 61 | Float gamma = 0.1; 62 | 63 | HDPArgs() 64 | { 65 | k = 2; 66 | } 67 | }; 68 | 69 | class IHDPModel : public ILDAModel 70 | { 71 | public: 72 | using DefaultDocType = DocumentHDP; 73 | static IHDPModel* create(TermWeight _weight, const HDPArgs& args, 74 | bool scalarRng = false); 75 | 76 | virtual Float getGamma() const = 0; 77 | virtual size_t getTotalTables() const = 0; 78 | virtual size_t getLiveK() const = 0; 79 | virtual bool isLiveTopic(Tid tid) const = 0; 80 | 81 | virtual std::unique_ptr convertToLDA(float topicThreshold, std::vector& newK) const = 0; 82 | virtual std::vector purgeDeadTopics() = 0; 83 | }; 84 | } -------------------------------------------------------------------------------- /src/TopicModel/HDPModel.cpp: -------------------------------------------------------------------------------- 1 | #include "HDPModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 0, numTopicByTable); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 1, 0x00010001, numTopicByTable); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentHDP); 9 | 10 | IHDPModel* 
IHDPModel::create(TermWeight _weight, const HDPArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, HDPModel, args); 13 | } 14 | } -------------------------------------------------------------------------------- /src/TopicModel/HLDA.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDA.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentHLDA : public DocumentLDA<_tw> 8 | { 9 | using BaseDocument = DocumentLDA<_tw>; 10 | using WeightType = typename DocumentLDA<_tw>::WeightType; 11 | using DocumentLDA<_tw>::DocumentLDA; 12 | 13 | // numByTopic indicates numByLevel in HLDAModel. 14 | // Zs indicates level in HLDAModel. 15 | std::vector path; 16 | 17 | template void update(WeightType* ptr, const _TopicModel& mdl); 18 | 19 | DECLARE_SERIALIZER_WITH_VERSION(0); 20 | DECLARE_SERIALIZER_WITH_VERSION(1); 21 | }; 22 | 23 | struct HLDAArgs : public LDAArgs 24 | { 25 | Float gamma = 0.1; 26 | 27 | HLDAArgs() 28 | { 29 | k = 2; 30 | } 31 | }; 32 | 33 | class IHLDAModel : public ILDAModel 34 | { 35 | public: 36 | using DefaultDocType = DocumentHLDA; 37 | static IHLDAModel* create(TermWeight _weight, const HLDAArgs& args, 38 | bool scalarRng = false); 39 | 40 | virtual Float getGamma() const = 0; 41 | virtual size_t getLiveK() const = 0; 42 | virtual size_t getLevelDepth() const = 0; 43 | virtual bool isLiveTopic(Tid tid) const = 0; 44 | virtual size_t getNumDocsOfTopic(Tid tid) const = 0; 45 | virtual size_t getLevelOfTopic(Tid tid) const = 0; 46 | virtual size_t getParentTopicId(Tid tid) const = 0; 47 | virtual std::vector getChildTopicId(Tid tid) const = 0; 48 | }; 49 | } 50 | -------------------------------------------------------------------------------- /src/TopicModel/HLDAModel.cpp: -------------------------------------------------------------------------------- 1 | #include "HLDAModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 0, path); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 1, 0x00010001, path); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentHLDA); 9 | 10 | IHLDAModel* IHLDAModel::create(TermWeight _weight, const HLDAArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, HLDAModel, args); 13 | } 14 | } -------------------------------------------------------------------------------- /src/TopicModel/HPA.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "PA.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentHPA : public DocumentPA<_tw> 8 | { 9 | using BaseDocument = DocumentPA<_tw>; 10 | using DocumentPA<_tw>::DocumentPA; 11 | using WeightType = typename DocumentPA<_tw>::WeightType; 12 | 13 | template void update(WeightType* ptr, const _TopicModel& mdl); 14 | 15 | DECLARE_SERIALIZER_WITH_VERSION(0); 16 | DECLARE_SERIALIZER_WITH_VERSION(1); 17 | }; 18 | 19 | struct HPAArgs : public PAArgs 20 | { 21 | }; 22 | 23 | class IHPAModel : public IPAModel 24 | { 25 | public: 26 | using DefaultDocType = DocumentHPA; 27 | static IHPAModel* create(TermWeight _weight, bool _exclusive, const HPAArgs& args, 28 | bool scalarRng = false); 29 | }; 30 | } 31 | -------------------------------------------------------------------------------- /src/TopicModel/HPAModel.cpp: -------------------------------------------------------------------------------- 1 | #include "HPAModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | 
DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 0); 6 | DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 1); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentHPA); 9 | 10 | IHPAModel* IHPAModel::create(TermWeight _weight, bool _exclusive, const HPAArgs& args, bool scalarRng) 11 | { 12 | if (_exclusive) 13 | { 14 | //TMT_SWITCH_TW(_weight, HPAModelExclusive, _K, _K2, _alphaSum, _eta, seed); 15 | } 16 | else 17 | { 18 | TMT_SWITCH_TW(_weight, scalarRng, HPAModel, args); 19 | } 20 | return nullptr; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/TopicModel/LDA.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "TopicModel.hpp" 3 | 4 | namespace tomoto 5 | { 6 | enum class TermWeight { one, idf, pmi, size }; 7 | 8 | template 9 | struct ShareableMatrix : Eigen::Map> 10 | { 11 | using BaseType = Eigen::Map>; 12 | Eigen::Matrix<_Scalar, _rows, _cols> ownData; 13 | 14 | ShareableMatrix(_Scalar* ptr = nullptr, Eigen::Index rows = 0, Eigen::Index cols = 0) 15 | : BaseType(nullptr, _rows != -1 ? _rows : 0, _cols != -1 ? _cols : 0) 16 | { 17 | init(ptr, rows, cols); 18 | } 19 | 20 | ShareableMatrix(const ShareableMatrix& o) 21 | : BaseType(nullptr, _rows != -1 ? _rows : 0, _cols != -1 ? _cols : 0), ownData{ o.ownData } 22 | { 23 | if (o.ownData.data()) 24 | { 25 | new (this) BaseType(ownData.data(), ownData.rows(), ownData.cols()); 26 | } 27 | else 28 | { 29 | new (this) BaseType((_Scalar*)o.data(), o.rows(), o.cols()); 30 | } 31 | } 32 | 33 | ShareableMatrix(ShareableMatrix&& o) = default; 34 | 35 | ShareableMatrix& operator=(const ShareableMatrix& o) 36 | { 37 | if (o.ownData.data()) 38 | { 39 | ownData = o.ownData; 40 | new (this) BaseType(ownData.data(), ownData.rows(), ownData.cols()); 41 | } 42 | else 43 | { 44 | new (this) BaseType((_Scalar*)o.data(), o.rows(), o.cols()); 45 | } 46 | return *this; 47 | } 48 | 49 | ShareableMatrix& operator=(ShareableMatrix&& o) = default; 50 | 51 | void init(_Scalar* ptr, Eigen::Index rows, Eigen::Index cols) 52 | { 53 | if (!ptr && rows && cols) 54 | { 55 | ownData = Eigen::Matrix<_Scalar, _rows, _cols>::Zero(_rows != -1 ? _rows : rows, _cols != -1 ? _cols : cols); 56 | ptr = ownData.data(); 57 | } 58 | else 59 | { 60 | ownData = Eigen::Matrix<_Scalar, _rows, _cols>{}; 61 | } 62 | new (this) BaseType(ptr, _rows != -1 ? _rows : rows, _cols != -1 ? _cols : cols); 63 | } 64 | 65 | void conservativeResize(size_t newRows, size_t newCols) 66 | { 67 | ownData.conservativeResize(_rows != -1 ? _rows : newRows, _cols != -1 ? 
_cols : newCols); 68 | new (this) BaseType(ownData.data(), ownData.rows(), ownData.cols()); 69 | } 70 | 71 | void becomeOwner() 72 | { 73 | if (ownData.data() != this->m_data) 74 | { 75 | ownData = *this; 76 | new (this) BaseType(ownData.data(), ownData.rows(), ownData.cols()); 77 | } 78 | } 79 | 80 | void serializerRead(std::istream& istr) 81 | { 82 | uint32_t rows = serializer::readFromStream(istr); 83 | uint32_t cols = serializer::readFromStream(istr); 84 | init(nullptr, rows, cols); 85 | if (!istr.read((char*)this->data(), sizeof(_Scalar) * this->size())) 86 | throw std::ios_base::failure(std::string("reading type '") + typeid(_Scalar).name() + std::string("' is failed")); 87 | } 88 | 89 | void serializerWrite(std::ostream& ostr) const 90 | { 91 | serializer::writeToStream(ostr, (uint32_t)this->rows()); 92 | serializer::writeToStream(ostr, (uint32_t)this->cols()); 93 | if (!ostr.write((const char*)this->data(), sizeof(_Scalar) * this->size())) 94 | throw std::ios_base::failure(std::string("writing type '") + typeid(_Scalar).name() + std::string("' is failed")); 95 | } 96 | 97 | uint64_t computeHash(uint64_t seed) const 98 | { 99 | seed = serializer::computeHashMany(seed, (uint32_t)this->rows(), (uint32_t)this->cols()); 100 | return serializer::computeFastHash(this->data(), sizeof(_Scalar) * this->size(), seed); 101 | } 102 | }; 103 | 104 | template 105 | struct SumWordWeight 106 | { 107 | Float sumWordWeight = 0; 108 | Float getSumWordWeight() const 109 | { 110 | return sumWordWeight; 111 | } 112 | 113 | void updateSumWordWeight(size_t realV) 114 | { 115 | sumWordWeight = std::accumulate(static_cast<_Base*>(this)->wordWeights.begin(), static_cast<_Base*>(this)->wordWeights.end(), 0.f); 116 | } 117 | }; 118 | 119 | template 120 | struct SumWordWeight<_Base, TermWeight::one> 121 | { 122 | int32_t sumWordWeight = 0; 123 | int32_t getSumWordWeight() const 124 | { 125 | return sumWordWeight; 126 | } 127 | 128 | void updateSumWordWeight(size_t realV) 129 | { 130 | sumWordWeight = (int32_t)std::count_if(static_cast<_Base*>(this)->words.begin(), static_cast<_Base*>(this)->words.end(), [realV](Vid w) 131 | { 132 | return w < realV; 133 | }); 134 | } 135 | }; 136 | 137 | template 138 | struct DocumentLDA : public DocumentBase, SumWordWeight, _tw> 139 | { 140 | public: 141 | using DocumentBase::DocumentBase; 142 | using WeightType = typename std::conditional<_tw == TermWeight::one, int32_t, float>::type; 143 | 144 | tvector Zs; 145 | tvector wordWeights; 146 | ShareableMatrix numByTopic; 147 | 148 | DECLARE_SERIALIZER_WITH_VERSION(0); 149 | DECLARE_SERIALIZER_WITH_VERSION(1); 150 | 151 | template void update(WeightType* ptr, const _TopicModel& mdl); 152 | 153 | WeightType getWordWeight(size_t idx) const 154 | { 155 | return _tw == TermWeight::one ? 1 : wordWeights[idx]; 156 | } 157 | 158 | std::vector getCountVector(size_t V) const 159 | { 160 | std::vector vs(V); 161 | for (size_t i = 0; i < words.size(); ++i) 162 | { 163 | if (words[i] >= V) continue; 164 | vs[words[i]] += wordWeights.empty() ? 
1.f : wordWeights[i]; 165 | } 166 | return vs; 167 | } 168 | }; 169 | 170 | struct LDAArgs 171 | { 172 | size_t k = 1; 173 | std::vector alpha = { (Float)0.1 }; 174 | Float eta = (Float)0.01; 175 | size_t seed = std::random_device{}(); 176 | }; 177 | 178 | class ILDAModel : public ITopicModel 179 | { 180 | public: 181 | using DefaultDocType = DocumentLDA; 182 | static ILDAModel* create(TermWeight _weight, const LDAArgs& args, 183 | bool scalarRng = false); 184 | 185 | virtual TermWeight getTermWeight() const = 0; 186 | virtual size_t getOptimInterval() const = 0; 187 | virtual void setOptimInterval(size_t) = 0; 188 | virtual size_t getBurnInIteration() const = 0; 189 | virtual void setBurnInIteration(size_t) = 0; 190 | virtual std::vector getCountByTopic() const = 0; 191 | virtual Float getAlpha() const = 0; 192 | virtual Float getAlpha(size_t k) const = 0; 193 | virtual Float getEta() const = 0; 194 | 195 | virtual std::vector getWordPrior(const std::string& word) const = 0; 196 | virtual void setWordPrior(const std::string& word, const std::vector& priors) = 0; 197 | }; 198 | } 199 | -------------------------------------------------------------------------------- /src/TopicModel/LDAModel.cpp: -------------------------------------------------------------------------------- 1 | #include "LDAModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 0, Zs, wordWeights); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 1, 0x00010001, Zs, wordWeights); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentLDA); 9 | 10 | ILDAModel* ILDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, LDAModel, args); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/TopicModel/LLDA.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDA.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentLLDA : public DocumentLDA<_tw> 8 | { 9 | using BaseDocument = DocumentLDA<_tw>; 10 | using DocumentLDA<_tw>::DocumentLDA; 11 | using WeightType = typename DocumentLDA<_tw>::WeightType; 12 | Eigen::Matrix labelMask; 13 | 14 | DECLARE_SERIALIZER_WITH_VERSION(0); 15 | DECLARE_SERIALIZER_WITH_VERSION(1); 16 | }; 17 | 18 | class ILLDAModel : public ILDAModel 19 | { 20 | public: 21 | using DefaultDocType = DocumentLLDA; 22 | static ILLDAModel* create(TermWeight _weight, const LDAArgs& args, 23 | bool scalarRng = false); 24 | 25 | virtual const Dictionary& getTopicLabelDict() const = 0; 26 | 27 | virtual size_t getNumTopicsPerLabel() const = 0; 28 | }; 29 | } -------------------------------------------------------------------------------- /src/TopicModel/LLDAModel.cpp: -------------------------------------------------------------------------------- 1 | #include "LLDAModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 0, labelMask); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 1, 0x00010001, labelMask); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentLLDA); 9 | 10 | ILLDAModel* ILLDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, LLDAModel, args); 13 | } 14 | } -------------------------------------------------------------------------------- /src/TopicModel/MGLDA.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDA.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentMGLDA : public DocumentLDA<_tw> 8 | { 9 | using BaseDocument = DocumentLDA<_tw>; 10 | using DocumentLDA<_tw>::DocumentLDA; 11 | using WeightType = typename DocumentLDA<_tw>::WeightType; 12 | 13 | std::vector sents; // sentence id of each word (const) 14 | std::vector numBySent; // number of words in the sentence (const) 15 | 16 | //std::vector Zs; // gl./loc. and topic assignment 17 | std::vector Vs; // window assignment 18 | WeightType numGl = 0; // number of words assigned as gl. 19 | //std::vector numByTopic; // len = K + KL 20 | Eigen::Matrix numBySentWin; // len = S * T 21 | Eigen::Matrix numByWinL; // number of words assigned as loc. in the window (len = S + T - 1) 22 | Eigen::Matrix numByWin; // number of words in the window (len = S + T - 1) 23 | Eigen::Matrix numByWinTopicL; // number of words in the loc. topic in the window (len = KL * (S + T - 1)) 24 | 25 | DECLARE_SERIALIZER_WITH_VERSION(0); 26 | DECLARE_SERIALIZER_WITH_VERSION(1); 27 | 28 | template void update(WeightType* ptr, const _TopicModel& mdl); 29 | }; 30 | 31 | struct MGLDAArgs : public LDAArgs 32 | { 33 | size_t kL = 1; 34 | size_t t = 3; 35 | std::vector alphaL = { 0.1 }; 36 | Float alphaMG = 0.1; 37 | Float alphaML = 0.1; 38 | Float etaL = 0.01; 39 | Float gamma = 0.1; 40 | }; 41 | 42 | class IMGLDAModel : public ILDAModel 43 | { 44 | public: 45 | using DefaultDocType = DocumentMGLDA; 46 | static IMGLDAModel* create(TermWeight _weight, const MGLDAArgs& args, 47 | bool scalarRng = false); 48 | 49 | virtual size_t getKL() const = 0; 50 | virtual size_t getT() const = 0; 51 | virtual Float getAlphaL() const = 0; 52 | virtual Float getEtaL() const = 0; 53 | virtual Float getGamma() const = 0; 54 | virtual Float getAlphaM() const = 0; 55 | virtual Float getAlphaML() const = 0; 56 | }; 57 | } -------------------------------------------------------------------------------- /src/TopicModel/MGLDAModel.cpp: -------------------------------------------------------------------------------- 1 | #include "MGLDAModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentMGLDA); 9 | 10 | IMGLDAModel* IMGLDAModel::create(TermWeight _weight, const MGLDAArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, MGLDAModel, args); 13 | } 14 | } -------------------------------------------------------------------------------- /src/TopicModel/PA.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDA.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentPA : public DocumentLDA<_tw> 8 | { 9 | using BaseDocument = DocumentLDA<_tw>; 10 | using DocumentLDA<_tw>::DocumentLDA; 11 | using WeightType = typename DocumentLDA<_tw>::WeightType; 12 | 13 | tvector Z2s; 14 | Eigen::Matrix numByTopic1_2; 15 | 16 | template void update(WeightType* ptr, const _TopicModel& mdl); 17 | 18 | DECLARE_SERIALIZER_WITH_VERSION(0); 19 | DECLARE_SERIALIZER_WITH_VERSION(1); 20 | }; 21 | 22 | struct PAArgs : public LDAArgs 23 | { 24 | size_t k2 = 1; 25 | std::vector 
subalpha = { 0.1 }; 26 | }; 27 | 28 | class IPAModel : public ILDAModel 29 | { 30 | public: 31 | using DefaultDocType = DocumentPA; 32 | static IPAModel* create(TermWeight _weight, const PAArgs& args, 33 | bool scalarRng = false); 34 | 35 | virtual size_t getDirichletEstIteration() const = 0; 36 | virtual void setDirichletEstIteration(size_t iter) = 0; 37 | virtual size_t getK2() const = 0; 38 | virtual Float getSubAlpha(Tid k1, Tid k2) const = 0; 39 | virtual std::vector getSubAlpha(Tid k1) const = 0; 40 | virtual std::vector getSubTopicBySuperTopic(Tid k, bool normalize = true) const = 0; 41 | virtual std::vector> getSubTopicBySuperTopicSorted(Tid k, size_t topN) const = 0; 42 | 43 | virtual std::vector getSubTopicsByDoc(const DocumentBase* doc, bool normalize = true) const = 0; 44 | virtual std::vector> getSubTopicsByDocSorted(const DocumentBase* doc, size_t topN) const = 0; 45 | 46 | virtual std::vector getCountBySuperTopic() const = 0; 47 | }; 48 | } 49 | -------------------------------------------------------------------------------- /src/TopicModel/PAModel.cpp: -------------------------------------------------------------------------------- 1 | #include "PAModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 0, Z2s); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 1, 0x00010001, Z2s); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentPA); 9 | 10 | IPAModel* IPAModel::create(TermWeight _weight, const PAArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, PAModel, args); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/TopicModel/PLDA.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LLDA.h" 3 | 4 | namespace tomoto 5 | { 6 | struct PLDAArgs : public LDAArgs 7 | { 8 | size_t numLatentTopics = 0; 9 | size_t numTopicsPerLabel = 1; 10 | 11 | PLDAArgs setK(size_t _k = 1) const 12 | { 13 | PLDAArgs ret = *this; 14 | ret.k = _k; 15 | return ret; 16 | } 17 | }; 18 | 19 | class IPLDAModel : public ILLDAModel 20 | { 21 | public: 22 | using DefaultDocType = DocumentLLDA; 23 | static IPLDAModel* create(TermWeight _weight, const PLDAArgs& args, 24 | bool scalarRng = false); 25 | 26 | virtual size_t getNumLatentTopics() const = 0; 27 | }; 28 | } -------------------------------------------------------------------------------- /src/TopicModel/PLDAModel.cpp: -------------------------------------------------------------------------------- 1 | #include "PLDAModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | IPLDAModel* IPLDAModel::create(TermWeight _weight, const PLDAArgs& args, bool scalarRng) 6 | { 7 | TMT_SWITCH_TW(_weight, scalarRng, PLDAModel, args); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/TopicModel/PT.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDA.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentPT : public DocumentLDA<_tw> 8 | { 9 | using BaseDocument = DocumentLDA<_tw>; 10 | using DocumentLDA<_tw>::DocumentLDA; 11 | using WeightType = typename DocumentLDA<_tw>::WeightType; 12 | 13 | uint64_t pseudoDoc = 0; 14 | 15 | DECLARE_SERIALIZER_WITH_VERSION(0); 16 | DECLARE_SERIALIZER_WITH_VERSION(1); 17 | }; 18 | 19 | struct PTArgs : public LDAArgs 20 | { 21 | size_t p = 0; 22 | Float lambda = 0.01; 23 | }; 24 | 25 | class IPTModel : 
public ILDAModel 26 | { 27 | public: 28 | using DefaultDocType = DocumentPT; 29 | static IPTModel* create(TermWeight _weight, const PTArgs& args, 30 | bool scalarRng = false); 31 | 32 | virtual size_t getP() const = 0; 33 | virtual std::vector getTopicsFromPseudoDoc(const DocumentBase* doc, bool normalize = true) const = 0; 34 | virtual std::vector> getTopicsFromPseudoDocSorted(const DocumentBase* doc, size_t topN) const = 0; 35 | }; 36 | } 37 | -------------------------------------------------------------------------------- /src/TopicModel/PTModel.cpp: -------------------------------------------------------------------------------- 1 | #include "PTModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 0, pseudoDoc); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 1, 0x00010001, pseudoDoc); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentPT); 9 | 10 | IPTModel* IPTModel::create(TermWeight _weight, const PTArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, PTModel, args); 13 | } 14 | } -------------------------------------------------------------------------------- /src/TopicModel/SLDA.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "LDA.h" 3 | 4 | namespace tomoto 5 | { 6 | template 7 | struct DocumentSLDA : public DocumentLDA<_tw> 8 | { 9 | using BaseDocument = DocumentLDA<_tw>; 10 | using DocumentLDA<_tw>::DocumentLDA; 11 | std::vector y; 12 | 13 | RawDoc::MiscType makeMisc(const ITopicModel* tm) const override 14 | { 15 | RawDoc::MiscType ret = DocumentLDA<_tw>::makeMisc(tm); 16 | ret["y"] = y; 17 | return ret; 18 | } 19 | 20 | DECLARE_SERIALIZER_WITH_VERSION(0); 21 | DECLARE_SERIALIZER_WITH_VERSION(1); 22 | }; 23 | 24 | struct SLDAArgs; 25 | 26 | class ISLDAModel : public ILDAModel 27 | { 28 | public: 29 | enum class GLM 30 | { 31 | linear = 0, 32 | binary_logistic = 1, 33 | }; 34 | 35 | using DefaultDocType = DocumentSLDA; 36 | static ISLDAModel* create(TermWeight _weight, const SLDAArgs& args, 37 | bool scalarRng = false); 38 | 39 | virtual size_t getF() const = 0; 40 | virtual std::vector getRegressionCoef(size_t f) const = 0; 41 | virtual GLM getTypeOfVar(size_t f) const = 0; 42 | virtual std::vector estimateVars(const DocumentBase* doc) const = 0; 43 | }; 44 | 45 | struct SLDAArgs : public LDAArgs 46 | { 47 | std::vector vars; 48 | std::vector mu; 49 | std::vector nuSq; 50 | std::vector glmParam; 51 | }; 52 | } -------------------------------------------------------------------------------- /src/TopicModel/SLDAModel.cpp: -------------------------------------------------------------------------------- 1 | #include "SLDAModel.hpp" 2 | 3 | namespace tomoto 4 | { 5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 0, y); 6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 1, 0x00010001, y); 7 | 8 | TMT_INSTANTIATE_DOC(DocumentSLDA); 9 | 10 | ISLDAModel* ISLDAModel::create(TermWeight _weight, const SLDAArgs& args, bool scalarRng) 11 | { 12 | TMT_SWITCH_TW(_weight, scalarRng, SLDAModel, args); 13 | } 14 | } -------------------------------------------------------------------------------- /src/Utils/AliasMethod.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | namespace tomoto 7 | { 8 | namespace sample 9 | { 10 | template 11 | class AliasMethod 12 | { 13 | 
std::unique_ptr<_Precision[]> arr; 14 | std::unique_ptr alias; 15 | size_t msize = 0, bitsize = 0; 16 | 17 | public: 18 | AliasMethod() 19 | { 20 | } 21 | 22 | AliasMethod(const AliasMethod& o) 23 | { 24 | operator=(o); 25 | } 26 | 27 | AliasMethod(AliasMethod&& o) 28 | { 29 | operator=(o); 30 | } 31 | 32 | AliasMethod& operator=(const AliasMethod& o) 33 | { 34 | msize = o.msize; 35 | bitsize = o.bitsize; 36 | if (msize) 37 | { 38 | size_t n = (size_t)1 << bitsize; 39 | arr = std::make_unique<_Precision[]>(n); 40 | alias = std::make_unique(n); 41 | 42 | std::copy(o.arr.get(), o.arr.get() + n, arr.get()); 43 | std::copy(o.alias.get(), o.alias.get() + n, alias.get()); 44 | } 45 | return *this; 46 | } 47 | 48 | AliasMethod& operator=(AliasMethod&& o) 49 | { 50 | msize = o.msize; 51 | bitsize = o.bitsize; 52 | std::swap(arr, o.arr); 53 | std::swap(alias, o.alias); 54 | return *this; 55 | } 56 | 57 | template 58 | AliasMethod(_Iter first, _Iter last) 59 | { 60 | buildTable(first, last); 61 | } 62 | 63 | template 64 | void buildTable(_Iter first, _Iter last) 65 | { 66 | size_t psize, nbsize; 67 | msize = 0; 68 | double sum = 0; 69 | for (auto it = first; it != last; ++it, ++msize) 70 | { 71 | sum += *it; 72 | } 73 | 74 | if (!std::isfinite(sum)) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "cannot build NaN value distribution"); 75 | 76 | // ceil to power of 2 77 | nbsize = log2_ceil(msize); 78 | psize = (size_t)1 << nbsize; 79 | 80 | if (nbsize != bitsize) 81 | { 82 | arr = std::make_unique<_Precision[]>(psize); 83 | std::fill(arr.get(), arr.get() + psize, 0); 84 | alias = std::make_unique(psize); 85 | bitsize = nbsize; 86 | } 87 | 88 | sum /= psize; 89 | 90 | auto f = std::make_unique(psize); 91 | auto pf = f.get(); 92 | for (auto it = first; it != last; ++it, ++pf) 93 | { 94 | *pf = *it / sum; 95 | } 96 | std::fill(pf, pf + psize - msize, 0); 97 | 98 | size_t over = 0, under = 0, mm; 99 | while (over < psize && f[over] < 1) ++over; 100 | while (under < psize && f[under] >= 1) ++under; 101 | mm = under + 1; 102 | 103 | while (over < psize && under < psize) 104 | { 105 | arr[under] = f[under] * (std::numeric_limits<_Precision>::max() + 1.0); 106 | alias[under] = over; 107 | f[over] += f[under] - 1; 108 | if (f[over] >= 1 || mm <= over) 109 | { 110 | for (under = mm; under < psize && f[under] >= 1; ++under); 111 | mm = under + 1; 112 | } 113 | else 114 | { 115 | under = over; 116 | } 117 | 118 | while (over < psize && f[over] < 1) ++over; 119 | } 120 | 121 | for (; over < psize; ++over) 122 | { 123 | if (f[over] >= 1) 124 | { 125 | arr[over] = std::numeric_limits<_Precision>::max(); 126 | alias[over] = over; 127 | } 128 | } 129 | 130 | if (under < psize) 131 | { 132 | arr[under] = std::numeric_limits<_Precision>::max(); 133 | alias[under] = under; 134 | for (under = mm; under < msize; ++under) 135 | { 136 | if (f[under] < 1) 137 | { 138 | arr[under] = std::numeric_limits<_Precision>::max(); 139 | alias[under] = under; 140 | } 141 | } 142 | } 143 | } 144 | 145 | template 146 | size_t operator()(_Rng& rng) const 147 | { 148 | auto x = rng(); 149 | size_t a; 150 | if (sizeof(_Precision) < sizeof(typename _Rng::result_type)) 151 | { 152 | a = x >> (sizeof(x) * 8 - bitsize); 153 | } 154 | else 155 | { 156 | a = rng() & ((1 << bitsize) - 1); 157 | } 158 | 159 | _Precision b = (_Precision)x; 160 | if (b < arr[a]) 161 | { 162 | assert(a < msize); 163 | return a; 164 | } 165 | assert(alias[a] < msize); 166 | return alias[a]; 167 | } 168 | }; 169 | } 170 | } 171 | 
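// ---- Illustrative sketch (editorial, not part of this file) ----
// Usage of sample::AliasMethod above: build the table once from unnormalized
// weights, then draw category samples in O(1) each. The RNG choice and the
// empirical frequency check are illustrative; AliasMethod<> with default
// template arguments is assumed usable, as DT.h does with `sample::AliasMethod<>`.
#include <cstdio>
#include <random>
#include <vector>

int main()
{
    std::vector<double> weights{ 1.0, 2.0, 7.0 }; // unnormalized probabilities
    tomoto::sample::AliasMethod<> table{ weights.begin(), weights.end() };

    std::mt19937_64 rng{ 42 };
    std::vector<size_t> counts(weights.size());
    for (int i = 0; i < 100000; ++i) ++counts[table(rng)]; // O(1) per draw
    for (size_t i = 0; i < counts.size(); ++i)
    {
        std::printf("i=%zu freq=%.3f\n", i, counts[i] / 100000.0); // ~0.1, 0.2, 0.7
    }
    return 0;
}
// ----------------------------------------------------------------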
-------------------------------------------------------------------------------- /src/Utils/Dictionary.cpp: -------------------------------------------------------------------------------- 1 | #include "Dictionary.h" 2 | 3 | namespace tomoto 4 | { 5 | Dictionary::Dictionary() = default; 6 | Dictionary::~Dictionary() = default; 7 | 8 | Dictionary::Dictionary(const Dictionary&) = default; 9 | Dictionary& Dictionary::operator=(const Dictionary&) = default; 10 | 11 | Dictionary::Dictionary(Dictionary&&) noexcept = default; 12 | Dictionary& Dictionary::operator=(Dictionary&&) noexcept = default; 13 | 14 | Vid Dictionary::add(const std::string& word) 15 | { 16 | auto it = dict.find(word); 17 | if (it == dict.end()) 18 | { 19 | dict.emplace(word, (Vid)dict.size()); 20 | id2word.emplace_back(word); 21 | return (Vid)(dict.size() - 1); 22 | } 23 | return it->second; 24 | } 25 | 26 | const std::string& Dictionary::toWord(Vid vid) const 27 | { 28 | assert(vid < id2word.size()); 29 | return id2word[vid]; 30 | } 31 | 32 | Vid Dictionary::toWid(const std::string& word) const 33 | { 34 | auto it = dict.find(word); 35 | if (it == dict.end()) return non_vocab_id; 36 | return it->second; 37 | } 38 | 39 | void Dictionary::serializerWrite(std::ostream& writer) const 40 | { 41 | serializer::writeMany(writer, serializer::to_key("Dict"), id2word); 42 | } 43 | 44 | void Dictionary::serializerRead(std::istream& reader) 45 | { 46 | serializer::readMany(reader, serializer::to_key("Dict"), id2word); 47 | for (size_t i = 0; i < id2word.size(); ++i) 48 | { 49 | dict.emplace(id2word[i], (Vid)i); 50 | } 51 | } 52 | 53 | uint64_t Dictionary::computeHash(uint64_t seed) const 54 | { 55 | return serializer::computeHashMany(seed, id2word); 56 | } 57 | 58 | void Dictionary::swap(Dictionary& rhs) 59 | { 60 | std::swap(dict, rhs.dict); 61 | std::swap(id2word, rhs.id2word); 62 | } 63 | 64 | void Dictionary::reorder(const std::vector& order) 65 | { 66 | for (auto& p : dict) 67 | { 68 | p.second = order[p.second]; 69 | id2word[p.second] = p.first; 70 | } 71 | } 72 | 73 | const std::vector& Dictionary::getRaw() const 74 | { 75 | return id2word; 76 | } 77 | 78 | Vid Dictionary::mapToNewDict(Vid v, const Dictionary& newDict) const 79 | { 80 | return newDict.toWid(toWord(v)); 81 | } 82 | 83 | std::vector Dictionary::mapToNewDict(const std::vector& v, const Dictionary& newDict) const 84 | { 85 | std::vector r(v.size()); 86 | for (size_t i = 0; i < v.size(); ++i) 87 | { 88 | r[i] = mapToNewDict(v[i], newDict); 89 | } 90 | return r; 91 | } 92 | 93 | std::vector Dictionary::mapToNewDictAdd(const std::vector& v, Dictionary& newDict) const 94 | { 95 | std::vector r(v.size()); 96 | for (size_t i = 0; i < v.size(); ++i) 97 | { 98 | r[i] = mapToNewDict(v[i], newDict); 99 | } 100 | return r; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/Utils/Dictionary.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "serializer.hpp" 10 | 11 | namespace tomoto 12 | { 13 | using Vid = uint32_t; 14 | static constexpr Vid non_vocab_id = (Vid)-1; 15 | static constexpr Vid rm_vocab_id = (Vid)-2; 16 | using Tid = uint16_t; 17 | static constexpr Tid non_topic_id = (Tid)-1; 18 | using Float = float; 19 | 20 | struct VidPair : public std::pair 21 | { 22 | using std::pair::pair; 23 | }; 24 | 25 | class Dictionary 26 | { 27 | protected: 28 | 
std::unordered_map dict; 29 | std::vector id2word; 30 | public: 31 | 32 | Dictionary(); 33 | ~Dictionary(); 34 | 35 | Dictionary(const Dictionary&); 36 | Dictionary& operator=(const Dictionary&); 37 | 38 | Dictionary(Dictionary&&) noexcept; 39 | Dictionary& operator=(Dictionary&&) noexcept; 40 | 41 | Vid add(const std::string& word); 42 | 43 | size_t size() const { return dict.size(); } 44 | 45 | const std::string& toWord(Vid vid) const; 46 | 47 | Vid toWid(const std::string& word) const; 48 | 49 | void serializerWrite(std::ostream& writer) const; 50 | 51 | void serializerRead(std::istream& reader); 52 | 53 | uint64_t computeHash(uint64_t seed) const; 54 | 55 | void swap(Dictionary& rhs); 56 | 57 | void reorder(const std::vector& order); 58 | 59 | const std::vector& getRaw() const; 60 | 61 | Vid mapToNewDict(Vid v, const Dictionary& newDict) const; 62 | 63 | std::vector mapToNewDict(const std::vector& v, const Dictionary& newDict) const; 64 | 65 | std::vector mapToNewDictAdd(const std::vector& v, Dictionary& newDict) const; 66 | }; 67 | 68 | } 69 | 70 | namespace std 71 | { 72 | template<> 73 | struct hash 74 | { 75 | size_t operator()(const tomoto::VidPair& p) const 76 | { 77 | return hash{}(p.first) ^ hash{}(p.second); 78 | } 79 | }; 80 | } 81 | -------------------------------------------------------------------------------- /src/Utils/LBFGS.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2016-2019 Yixuan Qiu 2 | // Under MIT license 3 | // https://github.com/yixuan/LBFGSpp 4 | // bab2min modified some features 5 | 6 | #ifndef LBFGS_H 7 | #define LBFGS_H 8 | 9 | #include 10 | #include "LBFGS/Param.h" 11 | #include "LBFGS/LineSearchBacktracking.h" 12 | #include "LBFGS/LineSearchBracketing.h" 13 | 14 | 15 | namespace LBFGSpp { 16 | 17 | 18 | /// 19 | /// LBFGS solver for unconstrained numerical optimization 20 | /// 21 | template < typename Scalar, 22 | template class LineSearch = LineSearchBacktracking > 23 | class LBFGSSolver 24 | { 25 | private: 26 | typedef Eigen::Matrix Vector; 27 | typedef Eigen::Matrix Matrix; 28 | typedef Eigen::Map MapVec; 29 | 30 | static constexpr Scalar epsilon = Scalar(0.001); // add epsilon for preventing division-by-zero 31 | 32 | LBFGSParam m_param; // Parameters to control the LBFGS algorithm 33 | Matrix m_s; // History of the s vectors 34 | Matrix m_y; // History of the y vectors 35 | Vector m_ys; // History of the s'y values 36 | Vector m_alpha; // History of the step lengths 37 | Vector m_fx; // History of the objective function values 38 | Vector m_xp; // Old x 39 | Vector m_grad; // New gradient 40 | Vector m_gradp; // Old gradient 41 | Vector m_drt; // Moving direction 42 | 43 | inline void reset(int n) 44 | { 45 | const int m = m_param.m; 46 | m_s.resize(n, m); 47 | m_y.resize(n, m); 48 | m_ys.resize(m); 49 | m_alpha.resize(m); 50 | m_xp.resize(n); 51 | m_grad.resize(n); 52 | m_gradp.resize(n); 53 | m_drt.resize(n); 54 | if (m_param.past > 0) 55 | m_fx.resize(m_param.past); 56 | } 57 | 58 | public: 59 | /// 60 | /// Constructor for LBFGS solver. 61 | /// 62 | /// \param param An object of \ref LBFGSParam to store parameters for the 63 | /// algorithm 64 | /// 65 | LBFGSSolver(const LBFGSParam& param = {}) : 66 | m_param(param) 67 | { 68 | m_param.check_param(); 69 | } 70 | 71 | /// 72 | /// Minimizing a multivariate function using LBFGS algorithm. 73 | /// Exceptions will be thrown if error occurs. 
74 | /// 75 | /// \param f A function object such that `f(x, grad)` returns the 76 | /// objective function value at `x`, and overwrites `grad` with 77 | /// the gradient. 78 | /// \param x In: An initial guess of the optimal point. Out: The best point 79 | /// found. 80 | /// \param fx Out: The objective function value at `x`. 81 | /// 82 | /// \return Number of iterations used. 83 | /// 84 | template 85 | inline int minimize(Foo&& f, Eigen::Ref x, Scalar& fx) 86 | { 87 | const int n = x.size(); 88 | const int fpast = m_param.past; 89 | reset(n); 90 | 91 | // Evaluate function and compute gradient 92 | fx = f(x, m_grad); 93 | 94 | Scalar xnorm = x.norm(); 95 | Scalar gnorm = m_grad.norm(); 96 | if (fpast > 0) 97 | m_fx[0] = fx; 98 | 99 | // Early exit if the initial x is already a minimizer 100 | if (gnorm <= m_param.epsilon * std::max(xnorm, Scalar(1.0))) 101 | { 102 | return 1; 103 | } 104 | 105 | // Initial direction 106 | m_drt.noalias() = -m_grad; 107 | // Initial step 108 | Scalar step = Scalar(1.0) / m_drt.norm(); 109 | 110 | int k = 1; 111 | int end = 0; 112 | for (; ; ) 113 | { 114 | // Save the curent x and gradient 115 | m_xp.noalias() = x; 116 | m_gradp.noalias() = m_grad; 117 | 118 | // Line search to update x, fx and gradient 119 | LineSearch::LineSearch(f, fx, x, m_grad, step, m_drt, m_xp, m_param); 120 | 121 | // New x norm and gradient norm 122 | xnorm = x.norm(); 123 | gnorm = m_grad.norm(); 124 | 125 | // Convergence test -- gradient 126 | if (gnorm <= m_param.epsilon * std::max(xnorm, Scalar(1.0))) 127 | { 128 | return k; 129 | } 130 | // Convergence test -- objective function value 131 | if (fpast > 0) 132 | { 133 | if (k >= fpast && std::abs((m_fx[k % fpast] - fx) / fx) < m_param.delta) 134 | return k; 135 | 136 | m_fx[k % fpast] = fx; 137 | } 138 | // Maximum number of iterations 139 | if (m_param.max_iterations != 0 && k >= m_param.max_iterations) 140 | { 141 | return k; 142 | } 143 | 144 | // Update s and y 145 | // s_{k+1} = x_{k+1} - x_k 146 | // y_{k+1} = g_{k+1} - g_k 147 | MapVec svec(&m_s(0, end), n); 148 | MapVec yvec(&m_y(0, end), n); 149 | svec.noalias() = x - m_xp; 150 | yvec.noalias() = m_grad - m_gradp; 151 | 152 | // ys = y's = 1/rho 153 | // yy = y'y 154 | Scalar ys = yvec.dot(svec); 155 | Scalar yy = yvec.squaredNorm(); 156 | 157 | /* prevent division-by-zero */ 158 | if (yy == 0 || ys == 0) 159 | { 160 | ys += epsilon; 161 | yy += epsilon; 162 | } 163 | m_ys[end] = ys; 164 | 165 | // Recursive formula to compute d = -H * g 166 | m_drt.noalias() = -m_grad; 167 | int bound = std::min(m_param.m, k); 168 | end = (end + 1) % m_param.m; 169 | int j = end; 170 | for (int i = 0; i < bound; i++) 171 | { 172 | j = (j + m_param.m - 1) % m_param.m; 173 | MapVec sj(&m_s(0, j), n); 174 | MapVec yj(&m_y(0, j), n); 175 | m_alpha[j] = sj.dot(m_drt) / m_ys[j]; 176 | m_drt.noalias() -= m_alpha[j] * yj; 177 | } 178 | 179 | m_drt *= (ys / yy); 180 | 181 | for (int i = 0; i < bound; i++) 182 | { 183 | MapVec sj(&m_s(0, j), n); 184 | MapVec yj(&m_y(0, j), n); 185 | Scalar beta = yj.dot(m_drt) / m_ys[j]; 186 | m_drt.noalias() += (m_alpha[j] - beta) * sj; 187 | j = (j + 1) % m_param.m; 188 | } 189 | 190 | // step = 1.0 as initial guess 191 | step = Scalar(1.0); 192 | k++; 193 | } 194 | 195 | return k; 196 | } 197 | }; 198 | 199 | 200 | } // namespace LBFGSpp 201 | 202 | #endif // LBFGS_H -------------------------------------------------------------------------------- /src/Utils/LBFGS/LineSearchBacktracking.h: 
-------------------------------------------------------------------------------- 1 | // Copyright (C) 2016-2019 Yixuan Qiu 2 | // Under MIT license 3 | 4 | #ifndef LINE_SEARCH_BACKTRACKING_H 5 | #define LINE_SEARCH_BACKTRACKING_H 6 | 7 | #include <Eigen/Core> 8 | #include <stdexcept> // std::runtime_error 9 | 10 | 11 | namespace LBFGSpp { 12 | 13 | 14 | /// 15 | /// The backtracking line search algorithm for LBFGS. Mainly for internal use. 16 | /// 17 | template <typename Scalar> 18 | class LineSearchBacktracking 19 | { 20 | private: 21 | typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector; 22 | 23 | public: 24 | /// 25 | /// Line search by backtracking. 26 | /// 27 | /// \param f A function object such that `f(x, grad)` returns the 28 | /// objective function value at `x`, and overwrites `grad` with 29 | /// the gradient. 30 | /// \param fx In: The objective function value at the current point. 31 | /// Out: The function value at the new point. 32 | /// \param x Out: The new point moved to. 33 | /// \param grad In: The current gradient vector. Out: The gradient at the 34 | /// new point. 35 | /// \param step In: The initial step length. Out: The calculated step length. 36 | /// \param drt The current moving direction. 37 | /// \param xp The current point. 38 | /// \param param Parameters for the LBFGS algorithm 39 | /// 40 | template <typename Foo> 41 | static void LineSearch(Foo& f, Scalar& fx, Eigen::Ref<Vector> x, Vector& grad, 42 | Scalar& step, 43 | const Vector& drt, const Vector& xp, 44 | const LBFGSParam<Scalar>& param) 45 | { 46 | // Decreasing and increasing factors 47 | const Scalar dec = 0.5; 48 | const Scalar inc = 2.1; 49 | 50 | // Check the value of step 51 | if (step <= Scalar(0)) 52 | throw std::invalid_argument("'step' must be positive"); 53 | 54 | // Save the function value at the current x 55 | const Scalar fx_init = fx; 56 | // Projection of gradient on the search direction 57 | const Scalar dg_init = grad.dot(drt); 58 | // Make sure d points to a descent direction 59 | if (dg_init > 0) 60 | throw std::logic_error("the moving direction increases the objective function value"); 61 | 62 | const Scalar dg_test = param.ftol * dg_init; 63 | Scalar width; 64 | 65 | int iter; 66 | for (iter = 0; iter < param.max_linesearch; iter++) 67 | { 68 | // x_{k+1} = x_k + step * d_k 69 | x.noalias() = xp + step * drt; 70 | // Evaluate this candidate 71 | fx = f(x, grad); 72 | 73 | if (fx > fx_init + step * dg_test) 74 | { 75 | width = dec; 76 | } 77 | else { 78 | // Armijo condition is met 79 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) 80 | break; 81 | 82 | const Scalar dg = grad.dot(drt); 83 | if (dg < param.wolfe * dg_init) 84 | { 85 | width = inc; 86 | } 87 | else { 88 | // Regular Wolfe condition is met 89 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) 90 | break; 91 | 92 | if (dg > -param.wolfe * dg_init) 93 | { 94 | width = dec; 95 | } 96 | else { 97 | // Strong Wolfe condition is met 98 | break; 99 | } 100 | } 101 | } 102 | 103 | if (iter >= param.max_linesearch) 104 | throw std::runtime_error("the line search routine reached the maximum number of iterations"); 105 | 106 | if (step < param.min_step) 107 | throw std::runtime_error("the line search step became smaller than the minimum value allowed"); 108 | 109 | if (step > param.max_step) 110 | throw std::runtime_error("the line search step became larger than the maximum value allowed"); 111 | 112 | step *= width; 113 | } 114 | } 115 | }; 116 | 117 | 118 | } // namespace LBFGSpp 119 | 120 | #endif // LINE_SEARCH_BACKTRACKING_H
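// ---- Illustrative sketch (editorial, not part of this file) ----
// Minimal use of LBFGSSolver with the backtracking line search above:
// minimize the convex quadratic f(x) = ||x - c||^2. It assumes Eigen is on
// the include path and that the default LBFGSParam values are acceptable.
#include <cstdio>
#include <Eigen/Core>
#include "LBFGS.h" // the solver defined earlier in src/Utils

int main()
{
    using Vector = Eigen::VectorXf;
    const Vector c = Vector::Constant(4, 3.0f); // expected minimizer

    // f(x, grad) returns the objective value and overwrites grad, which is
    // exactly the contract documented for minimize() above.
    auto f = [&](const Vector& x, Vector& grad) -> float
    {
        grad = 2.0f * (x - c);
        return (x - c).squaredNorm();
    };

    LBFGSpp::LBFGSSolver<float> solver; // default parameters
    Vector x = Vector::Zero(4);
    float fx = 0;
    const int iters = solver.minimize(f, x, fx);
    std::printf("iters=%d fx=%g x[0]=%g\n", iters, (double)fx, (double)x[0]); // x ~ c
    return 0;
}
// ----------------------------------------------------------------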
--------------------------------------------------------------------------------
/src/Utils/LBFGS/LineSearchBracketing.h:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2016-2019 Yixuan Qiu & Dirk Toewe
2 | // Under MIT license
3 |
4 | #ifndef LINE_SEARCH_BRACKETING_H
5 | #define LINE_SEARCH_BRACKETING_H
6 |
7 | #include <Eigen/Core>
8 | #include <stdexcept> // std::runtime_error
9 |
10 | namespace LBFGSpp {
11 |
12 |
13 | ///
14 | /// The bracketing line search algorithm for LBFGS. Mainly for internal use.
15 | ///
16 | template <typename Scalar>
17 | class LineSearchBracketing
18 | {
19 | private:
20 |     typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
21 |
22 | public:
23 |     ///
24 |     /// Line search by bracketing. Similar to the backtracking line search
25 |     /// except that it actively maintains an upper and lower bound of the
26 |     /// current search range.
27 |     ///
28 |     /// \param f A function object such that `f(x, grad)` returns the
29 |     ///          objective function value at `x`, and overwrites `grad` with
30 |     ///          the gradient.
31 |     /// \param fx In: The objective function value at the current point.
32 |     ///           Out: The function value at the new point.
33 |     /// \param x Out: The new point moved to.
34 |     /// \param grad In: The current gradient vector. Out: The gradient at the
35 |     ///             new point.
36 |     /// \param step In: The initial step length. Out: The calculated step length.
37 |     /// \param drt The current moving direction.
38 |     /// \param xp The current point.
39 |     /// \param param Parameters for the LBFGS algorithm.
40 |     ///
41 |     template <typename Foo>
42 |     static void LineSearch(Foo&& f, Scalar& fx, Eigen::Ref<Vector> x, Vector& grad,
43 |         Scalar& step,
44 |         const Vector& drt, const Vector& xp,
45 |         const LBFGSParam<Scalar>& param)
46 |     {
47 |         // Check the value of step
48 |         if (step <= Scalar(0))
49 |             throw std::invalid_argument("'step' must be positive");
50 |
51 |         // Save the function value at the current x
52 |         const Scalar fx_init = fx;
53 |         // Projection of gradient on the search direction
54 |         const Scalar dg_init = grad.dot(drt);
55 |         // Make sure d points to a descent direction
56 |         if (dg_init > 0)
57 |             throw std::logic_error("the moving direction increases the objective function value");
58 |
59 |         const Scalar dg_test = param.ftol * dg_init;
60 |
61 |         // Upper and lower end of the current line search range
62 |         Scalar step_lo = 0,
63 |                step_hi = std::numeric_limits<Scalar>::infinity();
64 |
65 |         for (int iter = 0; iter < param.max_linesearch; iter++)
66 |         {
67 |             // x_{k+1} = x_k + step * d_k
68 |             x.noalias() = xp + step * drt;
69 |             // Evaluate this candidate
70 |             fx = f(x, grad);
71 |
72 |             if (fx > fx_init + step * dg_test)
73 |             {
74 |                 step_hi = step;
75 |             }
76 |             else {
77 |                 // Armijo condition is met
78 |                 if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO)
79 |                     break;
80 |
81 |                 const Scalar dg = grad.dot(drt);
82 |                 if (dg < param.wolfe * dg_init)
83 |                 {
84 |                     step_lo = step;
85 |                 }
86 |                 else {
87 |                     // Regular Wolfe condition is met
88 |                     if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE)
89 |                         break;
90 |
91 |                     if (dg > -param.wolfe * dg_init)
92 |                     {
93 |                         step_hi = step;
94 |                     }
95 |                     else {
96 |                         // Strong Wolfe condition is met
97 |                         break;
98 |                     }
99 |                 }
100 |             }
101 |
102 |             assert(step_lo < step_hi);
103 |
104 |             if (iter + 1 >= param.max_linesearch) // no acceptable step within max_linesearch passes
105 |                 throw std::runtime_error("the line search routine reached the maximum number of iterations");
106 |
107 |             if (step < param.min_step)
108 |                 throw std::runtime_error("the line search step became smaller than the minimum value allowed");
109 |
110 |             if (step > param.max_step)
111 | throw std::runtime_error("the line search step became larger than the maximum value allowed"); 112 | 113 | // continue search in mid of current search range 114 | step = std::isinf(step_hi) ? 2 * step : step_lo / 2 + step_hi / 2; 115 | } 116 | } 117 | }; 118 | 119 | 120 | } // namespace LBFGSpp 121 | 122 | #endif // LINE_SEARCH_BRACKETING_H 123 | -------------------------------------------------------------------------------- /src/Utils/LUT.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace tomoto 5 | { 6 | namespace math 7 | { 8 | namespace detail 9 | { 10 | template 11 | class LUT3 12 | { 13 | protected: 14 | std::array<_Prec, N + M + L> points = {}; 15 | static constexpr _Prec P = (_Prec)(1. / S); 16 | static constexpr _Prec Q = (_Prec)(1. / T); 17 | static constexpr _Prec R = (_Prec)(1. / U); 18 | LUT3() 19 | { 20 | _Func fun; 21 | for (size_t i = 0; i < N; i++) 22 | { 23 | points[i] = fun(i ? i * P : (_Prec)0.0001); 24 | } 25 | for (size_t i = 0; i < M; i++) 26 | { 27 | points[i + N] = fun(i*Q + N * P); 28 | } 29 | for (size_t i = 0; i < L; i++) 30 | { 31 | points[i + N + M] = fun(i*R + N * P + M * Q); 32 | } 33 | } 34 | 35 | _Prec _get(_Prec x) const 36 | { 37 | if (!std::isfinite(x)) return _Func{}.forNonFinite(x); 38 | if (x < 0) return NAN; 39 | if (x < _Func::smallThreshold) return _Func{}.forSmall(x); 40 | if (x >= N * P + M * Q + (L - 1) * R) return _Func{}.forLarge(x); 41 | size_t idx; 42 | _Prec a; 43 | _Prec nx = x; 44 | if (x < N*P) 45 | { 46 | idx = (size_t)(nx / P); 47 | a = (nx - idx * P) / P; 48 | } 49 | else 50 | { 51 | nx -= N * P; 52 | if (nx < M*Q) 53 | { 54 | idx = (size_t)(nx / Q); 55 | a = (nx - idx * Q) / Q; 56 | idx += N; 57 | } 58 | else 59 | { 60 | nx -= M * Q; 61 | idx = (size_t)(nx / R); 62 | a = (nx - idx * R) / R; 63 | idx += N + M; 64 | } 65 | } 66 | return points[idx] + a * (points[idx + 1] - points[idx]); 67 | } 68 | public: 69 | static const LUT3& getInst() 70 | { 71 | static LUT3 lg; 72 | return lg; 73 | } 74 | 75 | static _Prec get(_Prec x) 76 | { 77 | return getInst()._get(x); 78 | } 79 | }; 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /src/Utils/Mmap.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Mmap.h" 3 | 4 | namespace tomoto 5 | { 6 | namespace utils 7 | { 8 | static std::u16string utf8To16(const std::string& str) 9 | { 10 | std::u16string ret; 11 | for (auto it = str.begin(); it != str.end(); ++it) 12 | { 13 | uint32_t code = 0; 14 | uint32_t byte = (uint8_t)*it; 15 | if ((byte & 0xF8) == 0xF0) 16 | { 17 | code = (uint32_t)((byte & 0x07) << 18); 18 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" }; 19 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" }; 20 | code |= (uint32_t)((byte & 0x3F) << 12); 21 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" }; 22 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" }; 23 | code |= (uint32_t)((byte & 0x3F) << 6); 24 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" }; 25 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" }; 26 | code |= (byte & 0x3F); 27 | } 28 | else if ((byte & 0xF0) == 0xE0) 29 | { 30 | code = (uint32_t)((byte & 0x0F) << 12); 31 | if (++it == str.end()) throw 
std::invalid_argument{ "unexpected ending" }; 32 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" }; 33 | code |= (uint32_t)((byte & 0x3F) << 6); 34 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" }; 35 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" }; 36 | code |= (byte & 0x3F); 37 | } 38 | else if ((byte & 0xE0) == 0xC0) 39 | { 40 | code = (uint32_t)((byte & 0x1F) << 6); 41 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" }; 42 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" }; 43 | code |= (byte & 0x3F); 44 | } 45 | else if ((byte & 0x80) == 0x00) 46 | { 47 | code = byte; 48 | } 49 | else 50 | { 51 | throw std::invalid_argument{ "unicode error" }; 52 | } 53 | 54 | if (code < 0x10000) 55 | { 56 | ret.push_back((char16_t)code); 57 | } 58 | else if (code < 0x10FFFF) 59 | { 60 | code -= 0x10000; 61 | ret.push_back((char16_t)(0xD800 | (code >> 10))); 62 | ret.push_back((char16_t)(0xDC00 | (code & 0x3FF))); 63 | } 64 | else 65 | { 66 | throw std::invalid_argument{ "unicode error" }; 67 | } 68 | } 69 | return ret; 70 | } 71 | } 72 | } 73 | 74 | namespace tomoto 75 | { 76 | namespace utils 77 | { 78 | MMap::MMap(const std::string& filepath) 79 | { 80 | #ifdef _WIN32 81 | hFile = CreateFileW((const wchar_t*)utf8To16(filepath).c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, nullptr); 82 | if (hFile == INVALID_HANDLE_VALUE) throw std::ios_base::failure("Cannot open '" + filepath + "'"); 83 | hFileMap = CreateFileMapping(hFile, nullptr, PAGE_READONLY, 0, 0, nullptr); 84 | if (hFileMap == nullptr) throw std::ios_base::failure("Cannot open '" + filepath + "' Code:" + std::to_string(GetLastError())); 85 | view = (const char*)MapViewOfFile(hFileMap, FILE_MAP_READ, 0, 0, 0); 86 | if (!view) throw std::ios_base::failure("Cannot MapViewOfFile() Code:" + std::to_string(GetLastError())); 87 | DWORD high; 88 | len = GetFileSize(hFile, &high); 89 | len |= (uint64_t)high << 32; 90 | #else 91 | fd = open(filepath.c_str(), O_RDONLY); 92 | if (fd == -1) throw std::ios_base::failure("Cannot open '" + filepath + "'"); 93 | struct stat sb; 94 | if (fstat(fd, &sb) < 0) throw std::ios_base::failure("Cannot open '" + filepath + "'"); 95 | len = sb.st_size; 96 | view = (const char*)mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0); 97 | if (view == MAP_FAILED) throw std::ios_base::failure("Mapping failed"); 98 | #endif 99 | } 100 | 101 | #ifdef _WIN32 102 | MMap::MMap(MMap&& o) noexcept 103 | : view{ o.view }, len{ o.len } 104 | { 105 | o.view = nullptr; 106 | std::swap(hFile, o.hFile); 107 | std::swap(hFileMap, o.hFileMap); 108 | } 109 | #else 110 | MMap::MMap(MMap&& o) noexcept 111 | : len{ o.len }, fd{ std::move(o.fd) } 112 | { 113 | std::swap(view, o.view); 114 | } 115 | #endif 116 | 117 | MMap& MMap::operator=(MMap&& o) noexcept 118 | { 119 | std::swap(view, o.view); 120 | std::swap(len, o.len); 121 | #ifdef _WIN32 122 | std::swap(hFile, o.hFile); 123 | std::swap(hFileMap, o.hFileMap); 124 | #else 125 | std::swap(fd, o.fd); 126 | #endif 127 | return *this; 128 | } 129 | 130 | MMap::~MMap() 131 | { 132 | #ifdef _WIN32 133 | if (hFileMap) 134 | { 135 | UnmapViewOfFile(view); 136 | view = nullptr; 137 | } 138 | #else 139 | if (view) 140 | { 141 | munmap((void*)view, len); 142 | } 143 | #endif 144 | } 145 | } 146 | } 147 | 
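
MMap above wraps the two platform APIs behind one RAII type: CreateFileW/CreateFileMapping/MapViewOfFile on Windows (with the UTF-8 path converted to UTF-16 by utf8To16) and open/fstat/mmap elsewhere; the destructor and the move operations keep exactly one owner of the mapping. A minimal read-only usage sketch; the file name is hypothetical:

#include <iostream>
#include "Mmap.h"

int main()
{
    // Throws std::ios_base::failure if the file cannot be opened or mapped.
    tomoto::utils::MMap mm{ "model.bin" };   // hypothetical path
    std::cout << "mapped " << mm.size() << " bytes\n";
    const char* view = mm.get();             // valid until mm is destroyed
    if (mm.size() > 0)
        std::cout << "first byte: " << (int)view[0] << '\n';
}
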
-------------------------------------------------------------------------------- /src/Utils/Mmap.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #ifdef _WIN32 6 | #define NOMINMAX 7 | #include 8 | namespace tomoto 9 | { 10 | namespace utils 11 | { 12 | namespace detail 13 | { 14 | class HandleGuard 15 | { 16 | HANDLE handle = nullptr; 17 | public: 18 | HandleGuard(HANDLE _handle = nullptr) : handle(_handle) 19 | { 20 | } 21 | 22 | HandleGuard(const HandleGuard&) = delete; 23 | HandleGuard& operator =(const HandleGuard&) = delete; 24 | 25 | HandleGuard(HandleGuard&& o) noexcept 26 | { 27 | std::swap(handle, o.handle); 28 | } 29 | 30 | HandleGuard& operator=(HandleGuard&& o) noexcept 31 | { 32 | std::swap(handle, o.handle); 33 | return *this; 34 | } 35 | 36 | ~HandleGuard() 37 | { 38 | if (handle && handle != INVALID_HANDLE_VALUE) 39 | { 40 | CloseHandle(handle); 41 | handle = nullptr; 42 | } 43 | } 44 | 45 | operator HANDLE() const 46 | { 47 | return handle; 48 | } 49 | }; 50 | } 51 | 52 | class MMap 53 | { 54 | const char* view = nullptr; 55 | uint64_t len = 0; 56 | detail::HandleGuard hFile, hFileMap; 57 | public: 58 | MMap(const std::string& filepath); 59 | MMap(const MMap&) = delete; 60 | MMap& operator=(const MMap&) = delete; 61 | MMap(MMap&& o) noexcept; 62 | MMap& operator=(MMap&& o) noexcept; 63 | ~MMap(); 64 | 65 | const char* get() const { return view; } 66 | size_t size() const { return len; } 67 | }; 68 | } 69 | } 70 | #else 71 | #include 72 | #include 73 | #include 74 | #include 75 | #include 76 | 77 | namespace tomoto 78 | { 79 | namespace utils 80 | { 81 | namespace detail 82 | { 83 | class FDGuard 84 | { 85 | int fd = 0; 86 | public: 87 | FDGuard(int _fd = 0) : fd(_fd) 88 | { 89 | } 90 | 91 | FDGuard(const FDGuard&) = delete; 92 | FDGuard& operator =(const FDGuard&) = delete; 93 | 94 | FDGuard(FDGuard&& o) 95 | { 96 | std::swap(fd, o.fd); 97 | } 98 | 99 | FDGuard& operator=(FDGuard&& o) 100 | { 101 | std::swap(fd, o.fd); 102 | return *this; 103 | } 104 | 105 | ~FDGuard() 106 | { 107 | if (fd && fd != -1) 108 | { 109 | close(fd); 110 | fd = 0; 111 | } 112 | } 113 | 114 | operator int() const 115 | { 116 | return fd; 117 | } 118 | }; 119 | } 120 | 121 | class MMap 122 | { 123 | const char* view = nullptr; 124 | size_t len = 0; 125 | detail::FDGuard fd; 126 | public: 127 | MMap(const std::string& filepath); 128 | MMap(const MMap&) = delete; 129 | MMap& operator=(const MMap&) = delete; 130 | MMap(MMap&& o) noexcept; 131 | MMap& operator=(MMap&& o) noexcept; 132 | ~MMap(); 133 | 134 | const char* get() const { return view; } 135 | size_t size() const { return len; } 136 | }; 137 | } 138 | } 139 | #endif 140 | -------------------------------------------------------------------------------- /src/Utils/MultiNormalDistribution.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "serializer.hpp" 5 | 6 | namespace tomoto 7 | { 8 | namespace math 9 | { 10 | template 11 | struct MultiNormalDistribution 12 | { 13 | static constexpr _Ty log2pi = (_Ty)1.83787706641; 14 | Eigen::Matrix<_Ty, -1, 1> mean; 15 | Eigen::Matrix<_Ty, -1, -1> cov, l; 16 | _Ty logDet = 0; 17 | 18 | MultiNormalDistribution(size_t k = 0) : 19 | mean{ Eigen::Matrix<_Ty, -1, 1>::Zero(k) }, 20 | cov{ Eigen::Matrix<_Ty, -1, -1>::Identity(k, k) }, 21 | l{ Eigen::Matrix<_Ty, -1, -1>::Identity(k, k) } 22 | { 23 | } 24 | 25 | _Ty getLL(const 
Eigen::Matrix<_Ty, -1, 1>& x) const 26 | { 27 | _Ty ll = -((x - mean).transpose() * cov.inverse() * (x - mean))[0] / 2; 28 | ll -= log2pi * mean.size() / 2 + logDet; 29 | return ll; 30 | } 31 | 32 | const Eigen::Matrix<_Ty, -1, -1>& getCovL() const 33 | { 34 | return l; 35 | } 36 | 37 | template 38 | static MultiNormalDistribution<_Ty> estimate(_List list, size_t len) 39 | { 40 | MultiNormalDistribution<_Ty> newDist; 41 | if (len) 42 | { 43 | newDist.mean = list(0); 44 | for (size_t i = 1; i < len; ++i) newDist.mean += list(i); 45 | newDist.mean /= len; 46 | newDist.cov = Eigen::Matrix<_Ty, -1, -1>::Identity(newDist.mean.size(), newDist.mean.size()); 47 | for (size_t i = 0; i < len; ++i) 48 | { 49 | Eigen::Matrix<_Ty, -1, 1> o = list(i) - newDist.mean; 50 | newDist.cov += o * o.transpose(); 51 | } 52 | if (len > 1) newDist.cov /= len - 1; 53 | } 54 | Eigen::MatrixXd l = newDist.cov.template cast().llt().matrixL(); 55 | newDist.l = l.template cast(); 56 | newDist.logDet = l.diagonal().array().log().sum(); 57 | return newDist; 58 | } 59 | 60 | DEFINE_SERIALIZER_CALLBACK(onRead, mean, cov); 61 | DEFINE_HASHER(mean, cov); 62 | private: 63 | void onRead() 64 | { 65 | l = cov.llt().matrixL(); 66 | logDet = l.diagonal().array().log().sum(); 67 | } 68 | }; 69 | 70 | } 71 | } -------------------------------------------------------------------------------- /src/Utils/PolyaGamma.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace tomoto 8 | { 9 | namespace math 10 | { 11 | template 12 | class PolyaGamma 13 | { 14 | static constexpr _Real __PI = 3.141592653589793238462643383279502884197; 15 | static constexpr _Real HALFPISQ = 0.5 * __PI * __PI; 16 | static constexpr _Real FOURPISQ = 4 * __PI * __PI; 17 | static constexpr _Real __TRUNC = 0.64; 18 | static constexpr _Real __TRUNC_RECIP = 1.0 / __TRUNC; 19 | 20 | static _Real p_norm(_Real x) 21 | { 22 | return std::erf(x / std::sqrt((_Real)2)) / 2 + 0.5f; 23 | } 24 | 25 | static _Real draw_like_devroye(_Real Z, _RNG& r) 26 | { 27 | // Change the parameter. 28 | Z = std::fabs(Z) * 0.5; 29 | 30 | // Now sample 0.25 * J^*(1, Z := Z/2). 31 | _Real fz = 0.125 * __PI*__PI + 0.5 * Z*Z; 32 | // ... Problems with large Z? Try using q_over_p. 33 | // double p = 0.5 * __PI * exp(-1.0 * fz * __TRUNC) / fz; 34 | // double q = 2 * exp(-1.0 * Z) * pigauss(__TRUNC, Z); 35 | 36 | _Real X = 0.0; 37 | _Real S = 1.0; 38 | _Real Y = 0.0; 39 | // int iter = 0; If you want to keep track of iterations. 40 | 41 | while (true) 42 | { 43 | 44 | // if (r.unif() < p/(p+q)) 45 | if (std::generate_canonical<_Real, sizeof(_Real) * 8>(r) < mass_texpon(Z)) 46 | X = __TRUNC + std::exponential_distribution<_Real>()(r) / fz; 47 | else 48 | X = rtigauss(Z, r); 49 | 50 | S = a(0, X); 51 | Y = std::generate_canonical<_Real, sizeof(_Real) * 8>(r) * S; 52 | int n = 0; 53 | bool go = true; 54 | 55 | // Cap the number of iterations? 56 | while (go) 57 | { 58 | ++n; 59 | if (n % 2 == 1) 60 | { 61 | S = S - a(n, X); 62 | if (Y <= S) return 0.25 * X; 63 | } 64 | else 65 | { 66 | S = S + a(n, X); 67 | if (Y > S) go = false; 68 | } 69 | 70 | } 71 | // Need Y <= S in event that Y = S, e.g. when X = 0. 
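// The series loop above is the alternating-series acceptance step of
// Devroye's method: the target density at X is the alternating sum
// a(0,X) - a(1,X) + a(2,X) - ..., and because the terms a(n,X) shrink,
// the partial sums bracket the true value. After subtracting an odd term,
// S is a lower bound, so Y <= S safely accepts (returning 0.25 * X);
// after adding an even term, S is an upper bound, so Y > S safely rejects
// and a fresh X is proposed.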
72 | 73 | } 74 | } 75 | 76 | static _Real a(int n, _Real x) 77 | { 78 | _Real K = (n + 0.5) * __PI; 79 | _Real y = 0; 80 | if (x > __TRUNC) { 81 | y = K * std::exp(-0.5 * K*K * x); 82 | } 83 | else if (x > 0) { 84 | _Real expnt = -1.5 * (std::log(0.5 * __PI) + std::log(x)) + std::log(K) - 2.0 * (n + 0.5)*(n + 0.5) / x; 85 | y = std::exp(expnt); 86 | // y = pow(0.5 * __PI * x, -1.5) * K * exp( -2.0 * (n+0.5)*(n+0.5) / x); 87 | // ^- unstable for small x? 88 | } 89 | return y; 90 | } 91 | 92 | static _Real mass_texpon(_Real Z) 93 | { 94 | _Real t = __TRUNC; 95 | 96 | _Real fz = 0.125 * __PI*__PI + 0.5 * Z*Z; 97 | _Real b = std::sqrt(1.0 / t) * (t * Z - 1); 98 | _Real a = std::sqrt(1.0 / t) * (t * Z + 1) * -1.0; 99 | 100 | _Real x0 = log(fz) + fz * t; 101 | _Real xb = x0 - Z + log(p_norm(b)); 102 | _Real xa = x0 + Z + log(p_norm(a)); 103 | 104 | _Real qdivp = 4 / __PI * (exp(xb) + exp(xa)); 105 | 106 | return 1.0 / (1.0 + qdivp); 107 | } 108 | 109 | static _Real rtigauss(_Real Z, _RNG& r) 110 | { 111 | Z = std::fabs(Z); 112 | _Real t = __TRUNC; 113 | _Real X = t + 1.0; 114 | if (__TRUNC_RECIP > Z) 115 | { // mu > t 116 | _Real alpha = 0.0; 117 | while (std::generate_canonical<_Real, sizeof(_Real) * 8>(r) > alpha) 118 | { 119 | // X = t + 1.0; 120 | // while (X > t) 121 | // X = 1.0 / r.gamma_rate(0.5, 0.5); 122 | // Slightly faster to use truncated normal. 123 | _Real E1 = std::exponential_distribution<_Real>()(r); 124 | _Real E2 = std::exponential_distribution<_Real>()(r); 125 | while (E1*E1 > 2 * E2 / t) 126 | { 127 | E1 = std::exponential_distribution<_Real>()(r); 128 | E2 = std::exponential_distribution<_Real>()(r); 129 | } 130 | X = 1 + E1 * t; 131 | X = t / (X * X); 132 | alpha = std::exp(-0.5 * Z*Z * X); 133 | } 134 | } 135 | else 136 | { 137 | _Real mu = 1.0 / Z; 138 | while (X > t) 139 | { 140 | _Real Y = std::normal_distribution<_Real>()(r); Y *= Y; 141 | _Real half_mu = 0.5 * mu; 142 | _Real mu_Y = mu * Y; 143 | X = mu + half_mu * mu_Y - half_mu * sqrt(4 * mu_Y + mu_Y * mu_Y); 144 | if (std::generate_canonical<_Real, sizeof(_Real) * 8>(r) > mu / (mu + X)) 145 | X = mu * mu / X; 146 | } 147 | } 148 | return X; 149 | } 150 | 151 | static _Real jj_m1(_Real b, _Real z) 152 | { 153 | z = std::fabs(z); 154 | _Real m1 = 0.0; 155 | if (z > 1e-12) 156 | m1 = b * std::tanh(z) / z; 157 | else 158 | m1 = b * (1 - (1.0 / 3) * std::pow(z, 2) + (2.0 / 15) * std::pow(z, 4) - (17.0 / 315) * std::pow(z, 6)); 159 | return m1; 160 | } 161 | 162 | static _Real jj_m2(_Real b, _Real z) 163 | { 164 | z = std::fabs(z); 165 | double m2 = 0.0; 166 | if (z > 1e-12) 167 | m2 = (b + 1) * b * std::pow(tanh(z) / z, 2) + b * ((std::tanh(z) - z) / std::pow(z, 3)); 168 | else 169 | m2 = (b + 1) * b * std::pow(1 - (1.0 / 3) * std::pow(z, 2) + (2.0 / 15) * std::pow(z, 4) - (17.0 / 315) * std::pow(z, 6), 2) + 170 | b * ((-1.0 / 3) + (2.0 / 15) * std::pow(z, 2) - (17.0 / 315) * std::pow(z, 4)); 171 | return m2; 172 | } 173 | 174 | public: 175 | static _Real draw(size_t n, _Real z, _RNG& r) 176 | { 177 | _Real sum = 0.0; 178 | for (size_t i = 0; i < n; ++i) 179 | sum += draw_like_devroye(z, r); 180 | return sum; 181 | } 182 | 183 | static _Real pg_m1(_Real b, _Real z) 184 | { 185 | return jj_m1(b, 0.5 * z) * 0.25; 186 | } 187 | 188 | static _Real pg_m2(_Real b, _Real z) 189 | { 190 | return jj_m2(b, 0.5 * z) * 0.0625; 191 | } 192 | 193 | }; 194 | 195 | template _Real drawPolyaGamma(size_t n, _Real z, _RNG& r) 196 | { 197 | return PolyaGamma<_Real, _RNG>::draw(n, z, r); 198 | } 199 | } 200 | } 
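
The class exposes closed-form moments alongside the sampler, which makes a quick Monte Carlo sanity check possible: E[PG(b, z)] = b * tanh(z/2) / (2z), which is exactly what pg_m1 computes. A minimal sketch, assuming any standard RNG type such as std::mt19937 (the sampler only relies on std::generate_canonical and the standard distributions):

#include <iostream>
#include <random>
#include "PolyaGamma.hpp"

int main()
{
    std::mt19937 rng{ 42 };
    const float z = 2.0f;
    const int trials = 20000;

    double sum = 0.0;
    for (int i = 0; i < trials; ++i)
        sum += tomoto::math::drawPolyaGamma((size_t)1, z, rng);  // one PG(1, z) draw

    // pg_m1(1, z) is the analytic mean tanh(z/2) / (2z); the empirical
    // average should be close to it for this many draws.
    const float analytic = tomoto::math::PolyaGamma<float, std::mt19937>::pg_m1(1.0f, z);
    std::cout << "empirical " << sum / trials << " vs analytic " << analytic << '\n';
}
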
-------------------------------------------------------------------------------- /src/Utils/SharedString.cpp: -------------------------------------------------------------------------------- 1 | #include "SharedString.h" 2 | 3 | namespace tomoto 4 | { 5 | void SharedString::incref() 6 | { 7 | if (ptr) 8 | { 9 | ++*(size_t*)ptr; 10 | } 11 | } 12 | 13 | void SharedString::decref() 14 | { 15 | if (ptr) 16 | { 17 | if (--*(size_t*)ptr == 0) 18 | { 19 | delete[] ptr; 20 | ptr = nullptr; 21 | } 22 | } 23 | } 24 | 25 | void SharedString::init(const char* _begin, const char* _end) 26 | { 27 | ptr = new char[_end - _begin + 9]; 28 | *(size_t*)ptr = 1; 29 | len = _end - _begin; 30 | std::memcpy((void*)(ptr + 8), _begin, _end - _begin); 31 | ((char*)ptr)[_end - _begin + 8] = 0; 32 | } 33 | 34 | SharedString::SharedString() 35 | { 36 | } 37 | 38 | SharedString::SharedString(const char* _begin, const char* _end) 39 | { 40 | init(_begin, _end); 41 | } 42 | 43 | SharedString::SharedString(const char* _ptr) 44 | { 45 | if (_ptr) 46 | { 47 | init(_ptr, _ptr + std::strlen(_ptr)); 48 | } 49 | } 50 | 51 | SharedString::SharedString(const std::string& str) 52 | { 53 | if (!str.empty()) 54 | { 55 | init(str.data(), str.data() + str.size()); 56 | } 57 | } 58 | 59 | SharedString::SharedString(const SharedString& o) noexcept 60 | : ptr{ o.ptr }, len{ o.len } 61 | { 62 | incref(); 63 | } 64 | 65 | SharedString::SharedString(SharedString&& o) noexcept 66 | { 67 | std::swap(ptr, o.ptr); 68 | std::swap(len, o.len); 69 | } 70 | 71 | SharedString::~SharedString() 72 | { 73 | decref(); 74 | } 75 | 76 | SharedString& SharedString::operator=(const SharedString& o) 77 | { 78 | if (this != &o) 79 | { 80 | decref(); 81 | ptr = o.ptr; 82 | len = o.len; 83 | incref(); 84 | } 85 | return *this; 86 | } 87 | 88 | SharedString& SharedString::operator=(SharedString&& o) noexcept 89 | { 90 | std::swap(ptr, o.ptr); 91 | std::swap(len, o.len); 92 | return *this; 93 | } 94 | 95 | SharedString::operator std::string() const 96 | { 97 | if (!ptr) return {}; 98 | return { ptr + 8, ptr + 8 + len }; 99 | } 100 | 101 | const char* SharedString::c_str() const 102 | { 103 | if (!ptr) return ""; 104 | return ptr + 8; 105 | } 106 | 107 | std::string SharedString::substr(size_t start, size_t len) const 108 | { 109 | return { c_str() + start, c_str() + start + len }; 110 | } 111 | 112 | bool SharedString::operator==(const SharedString& o) const 113 | { 114 | if (ptr == o.ptr) return true; 115 | if (size() != o.size()) return false; 116 | return std::equal(begin(), end(), o.begin()); 117 | } 118 | 119 | bool SharedString::operator==(const std::string& o) const 120 | { 121 | if (size() != o.size()) return false; 122 | return std::equal(begin(), end(), o.begin()); 123 | } 124 | 125 | bool SharedString::operator!=(const SharedString& o) const 126 | { 127 | return !operator==(o); 128 | } 129 | 130 | bool SharedString::operator!=(const std::string& o) const 131 | { 132 | return !operator==(o); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/Utils/SharedString.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "serializer.hpp" 5 | 6 | namespace tomoto 7 | { 8 | class SharedString 9 | { 10 | const char* ptr = nullptr; 11 | size_t len = 0; 12 | 13 | void incref(); 14 | 15 | void decref(); 16 | 17 | void init(const char* _begin, const char* _end); 18 | 19 | public: 20 | 21 | SharedString(); 22 | explicit 
SharedString(const char* _begin, const char* _end); 23 | explicit SharedString(const char* _ptr); 24 | explicit SharedString(const std::string& str); 25 | SharedString(const SharedString& o) noexcept; 26 | SharedString(SharedString&& o) noexcept; 27 | ~SharedString(); 28 | SharedString& operator=(const SharedString& o); 29 | SharedString& operator=(SharedString&& o) noexcept; 30 | 31 | size_t size() const 32 | { 33 | if (ptr) return len; 34 | return 0; 35 | } 36 | 37 | bool empty() const 38 | { 39 | return size() == 0; 40 | } 41 | 42 | operator std::string() const; 43 | 44 | const char* c_str() const; 45 | 46 | const char* data() const 47 | { 48 | return c_str(); 49 | } 50 | 51 | const char* begin() const 52 | { 53 | return data(); 54 | } 55 | 56 | const char* end() const 57 | { 58 | return data() + size(); 59 | } 60 | 61 | std::string substr(size_t start, size_t len) const; 62 | 63 | bool operator==(const SharedString& o) const; 64 | bool operator==(const std::string& o) const; 65 | 66 | bool operator!=(const SharedString& o) const; 67 | bool operator!=(const std::string& o) const; 68 | }; 69 | 70 | namespace serializer 71 | { 72 | template<> 73 | struct Serializer 74 | { 75 | using VTy = SharedString; 76 | void write(std::ostream& ostr, const VTy& v) 77 | { 78 | writeToStream(ostr, (uint32_t)v.size()); 79 | if (!ostr.write((const char*)v.data(), v.size())) 80 | throw std::ios_base::failure(std::string("writing type 'SharedString' is failed")); 81 | } 82 | 83 | void read(std::istream& istr, VTy& v) 84 | { 85 | auto size = readFromStream(istr); 86 | std::vector t(size); 87 | if (!istr.read((char*)t.data(), t.size())) 88 | throw std::ios_base::failure(std::string("reading type 'SharedString' is failed")); 89 | v = SharedString{ t.data(), t.data() + t.size() }; 90 | } 91 | }; 92 | } 93 | } 94 | 95 | namespace std 96 | { 97 | template <> struct hash 98 | { 99 | size_t operator()(const tomoto::SharedString& x) const 100 | { 101 | return hash{}(x); 102 | } 103 | }; 104 | } 105 | -------------------------------------------------------------------------------- /src/Utils/ThreadPool.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | A simple C++11 Thread Pool implementation(https://github.com/progschj/ThreadPool) 5 | modified by bab2min to have additional parameter threadId 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace tomoto 19 | { 20 | class ThreadPool { 21 | public: 22 | ThreadPool(size_t threads = 0, size_t maxQueued = 0); 23 | template 24 | auto enqueue(F&& f, Args&&... args) 25 | ->std::future::type>; 26 | 27 | template 28 | auto enqueueToAll(F&& f, Args&&... 
args) 29 | ->std::vector::type>>; 30 | 31 | ~ThreadPool(); 32 | 33 | size_t getNumWorkers() const { return workers.size(); } 34 | size_t getNumEnqued() const { return tasks.size(); } 35 | private: 36 | // need to keep track of threads so we can join them 37 | std::vector< std::thread > workers; 38 | // the task queue 39 | std::queue< std::function > shared_task; 40 | std::vector< std::queue< std::function > > tasks; 41 | // synchronization 42 | std::mutex queue_mutex; 43 | std::condition_variable condition, inputCnd; 44 | size_t maxQueued; 45 | bool stop; 46 | }; 47 | 48 | 49 | // the constructor just launches some amount of workers 50 | inline ThreadPool::ThreadPool(size_t threads, size_t _maxQueued) 51 | : tasks(threads), maxQueued(_maxQueued), stop(false) 52 | { 53 | for (size_t i = 0; i < threads; ++i) 54 | { 55 | workers.emplace_back([this, i] 56 | { 57 | while (1) 58 | { 59 | std::function task; 60 | 61 | { 62 | std::unique_lock lock(this->queue_mutex); 63 | this->condition.wait(lock, 64 | [this, i] { return this->stop || !this->shared_task.empty() || !this->tasks[i].empty(); }); 65 | if (this->stop && this->shared_task.empty() && this->tasks[i].empty()) return; 66 | if (this->tasks[i].empty()) 67 | { 68 | task = std::move(this->shared_task.front()); 69 | this->shared_task.pop(); 70 | } 71 | else 72 | { 73 | task = std::move(this->tasks[i].front()); 74 | this->tasks[i].pop(); 75 | } 76 | 77 | if (this->maxQueued) this->inputCnd.notify_all(); 78 | } 79 | 80 | //std::cout << "Start #" << i << std::endl; 81 | task(i); 82 | //std::cout << "End #" << i << std::endl; 83 | } 84 | }); 85 | } 86 | } 87 | 88 | // add new work item to the pool 89 | template 90 | auto ThreadPool::enqueue(F&& f, Args&&... args) 91 | -> std::future::type> 92 | { 93 | using return_type = typename std::result_of::type; 94 | 95 | auto task = std::make_shared< std::packaged_task >( 96 | std::bind(std::forward(f), std::placeholders::_1, std::forward(args)...)); 97 | 98 | std::future res = task->get_future(); 99 | { 100 | std::unique_lock lock(queue_mutex); 101 | 102 | // don't allow enqueueing after stopping the pool 103 | if (stop) throw std::runtime_error("enqueue on stopped ThreadPool"); 104 | if (maxQueued && shared_task.size() >= maxQueued) 105 | { 106 | inputCnd.wait(lock, [&]() { return shared_task.size() < maxQueued; }); 107 | } 108 | shared_task.emplace([task](size_t id) { (*task)(id); }); 109 | } 110 | condition.notify_one(); 111 | return res; 112 | } 113 | 114 | template 115 | auto ThreadPool::enqueueToAll(F&& f, Args&&... 
args) 116 | ->std::vector::type> > 117 | { 118 | using return_type = typename std::result_of::type; 119 | 120 | std::vector > ret; 121 | std::unique_lock lock(queue_mutex); 122 | for (size_t i = 0; i < workers.size(); ++i) 123 | { 124 | auto task = std::make_shared< std::packaged_task >( 125 | std::bind(f, std::placeholders::_1, args...)); 126 | 127 | ret.emplace_back(task->get_future()); 128 | 129 | { 130 | // don't allow enqueueing after stopping the pool 131 | if (stop) throw std::runtime_error("enqueue on stopped ThreadPool"); 132 | tasks[i].emplace([task](size_t id) { (*task)(id); }); 133 | } 134 | } 135 | condition.notify_all(); 136 | return ret; 137 | } 138 | 139 | // the destructor joins all threads 140 | inline ThreadPool::~ThreadPool() 141 | { 142 | { 143 | std::unique_lock lock(queue_mutex); 144 | stop = true; 145 | } 146 | condition.notify_all(); 147 | for (std::thread &worker : workers) 148 | worker.join(); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/Utils/TruncMultiNormal.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "MultiNormalDistribution.hpp" 4 | #include "rtnorm.hpp" 5 | 6 | namespace tomoto 7 | { 8 | namespace math 9 | { 10 | template 11 | _Out sampleFromTruncatedMultiNormal( 12 | _Out ret, 13 | const MultiNormalDistribution<_Ty>& multiNormal, 14 | const Eigen::Matrix<_Ty, -1, 1>& lowerBound, 15 | const Eigen::Matrix<_Ty, -1, 1>& upperBound, 16 | _Rng& rng, 17 | size_t burnIn 18 | ) 19 | { 20 | const size_t K = ret.size(); 21 | Eigen::Matrix<_Ty, -1, -1> l = multiNormal.getCovL(); 22 | ret = (lowerBound + upperBound) / 2; 23 | Eigen::Matrix<_Ty, -1, 1> z = l.template triangularView().solve(ret - multiNormal.mean), 24 | a = lowerBound - multiNormal.mean, 25 | b = upperBound - multiNormal.mean, 26 | t, at, bt; 27 | for (size_t i = 0; i < burnIn; ++i) 28 | { 29 | for (size_t j = 0; j < K; ++j) 30 | { 31 | auto lj = l.col(j); 32 | z[j] = 0; 33 | t = l * z; 34 | _Ty lower_pos = -INFINITY, upper_pos = INFINITY, 35 | lower_neg = -INFINITY, upper_neg = INFINITY; 36 | at = ((a - t).array() / lj.array()).matrix(); 37 | bt = ((b - t).array() / lj.array()).matrix(); 38 | for (size_t k = 0; k < K; ++k) 39 | { 40 | if (lj[k] > 0) 41 | { 42 | lower_pos = std::max(lower_pos, at[k]); 43 | upper_pos = std::min(upper_pos, bt[k]); 44 | } 45 | else if (lj[k] < 0) 46 | { 47 | lower_neg = std::max(lower_neg, bt[k]); 48 | upper_neg = std::min(upper_neg, at[k]); 49 | } 50 | } 51 | lower_pos = std::max(lower_pos, lower_neg); 52 | upper_pos = std::min(upper_pos, upper_neg); 53 | // this is due to numerical instability 54 | if (lower_pos >= upper_pos) 55 | { 56 | std::cerr << __FILE__ << "(" << __LINE__ << "): wrong truncation range [" << lower_pos << ", " << upper_pos << "]" << std::endl; 57 | z[j] = (lower_pos + upper_pos) / 2; 58 | } 59 | else 60 | { 61 | z[j] = (_Ty)rtnorm::rtnorm(rng, lower_pos, upper_pos); 62 | } 63 | } 64 | } 65 | ret = (l * z) + multiNormal.mean; 66 | return ret; 67 | } 68 | 69 | template 70 | _Out sampleFromTruncatedMultiNormalRejection( 71 | _Out ret, 72 | const MultiNormalDistribution<_Ty>& multiNormal, 73 | const Eigen::Matrix<_Ty, -1, 1>& lowerBound, 74 | const Eigen::Matrix<_Ty, -1, 1>& upperBound, 75 | _Rng& rng) 76 | { 77 | const size_t K = ret.size(); 78 | auto& l = multiNormal.getCovL(); 79 | std::normal_distribution<_Ty> normal{}; 80 | while (1) 81 | { 82 | for (size_t k = 0; k < K; ++k) ret[k] = normal(rng); 83 | ret 
= l * ret; 84 | ret += multiNormal.mean; 85 | if ((lowerBound.array() <= ret.array()).all() && (ret.array() <= upperBound.array()).all()) 86 | { 87 | return ret; 88 | } 89 | } 90 | } 91 | } 92 | } -------------------------------------------------------------------------------- /src/Utils/avx_gamma.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "avx_mathfun.h" 3 | 4 | // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2)) 5 | inline __m256 lgamma_ps(__m256 x) 6 | { 7 | __m256 x_3 = _mm256_add_ps(x, _mm256_set1_ps(3)); 8 | __m256 ret = _mm256_mul_ps(_mm256_add_ps(x_3, _mm256_set1_ps(-0.5f)), log_ps(x_3)); 9 | ret = _mm256_sub_ps(ret, x_3); 10 | ret = _mm256_add_ps(ret, _mm256_set1_ps(0.91893853f)); 11 | ret = _mm256_add_ps(ret, _mm256_div_ps(_mm256_set1_ps(1 / 12.f), x_3)); 12 | ret = _mm256_sub_ps(ret, log_ps(_mm256_mul_ps( 13 | _mm256_mul_ps(_mm256_sub_ps(x_3, _mm256_set1_ps(1)), _mm256_sub_ps(x_3, _mm256_set1_ps(2))), x))); 14 | return ret; 15 | } 16 | 17 | // approximation : lgamma(z + a) - lgamma(z) = (z + a + 1.5) * log(z + a + 2) - (z + 1.5) * log(z + 2) - a + (1. / (z + a + 2) - 1. / (z + 2)) / 12. - log(((z + a) * (z + a + 1)) / (z * (z + 1))) 18 | inline __m256 lgamma_subt(__m256 z, __m256 a) 19 | { 20 | __m256 _1p5 = _mm256_set1_ps(1.5); 21 | __m256 _2 = _mm256_set1_ps(2); 22 | __m256 za = _mm256_add_ps(z, a); 23 | __m256 ret = _mm256_mul_ps(_mm256_add_ps(za, _1p5), log_ps(_mm256_add_ps(za, _2))); 24 | ret = _mm256_sub_ps(ret, _mm256_mul_ps(_mm256_add_ps(z, _1p5), log_ps(_mm256_add_ps(z, _2)))); 25 | ret = _mm256_sub_ps(ret, a); 26 | __m256 _1 = _mm256_set1_ps(1); 27 | __m256 _1_12 = _mm256_set1_ps(1 / 12.f); 28 | ret = _mm256_add_ps(ret, _mm256_sub_ps(_mm256_div_ps(_1_12, _mm256_add_ps(za, _2)), _mm256_div_ps(_1_12, _mm256_add_ps(z, _2)))); 29 | ret = _mm256_sub_ps(ret, log_ps(_mm256_div_ps(_mm256_div_ps(_mm256_mul_ps(za, _mm256_add_ps(za, _1)), z), _mm256_add_ps(z, _1)))); 30 | return ret; 31 | } 32 | 33 | 34 | // approximation : digamma(z) ~= ln(z+4) - 1/2/(z+4) - 1/12/(z+4)^2 - 1/z - 1/(z+1) - 1/(z+2) - 1/(z+3) 35 | inline __m256 digamma_ps(__m256 x) 36 | { 37 | __m256 x_4 = _mm256_add_ps(x, _mm256_set1_ps(4)); 38 | __m256 ret = log_ps(x_4); 39 | ret = _mm256_sub_ps(ret, _mm256_div_ps(_mm256_set1_ps(1 / 2.f), x_4)); 40 | ret = _mm256_sub_ps(ret, _mm256_div_ps(_mm256_div_ps(_mm256_set1_ps(1 / 12.f), x_4), x_4)); 41 | ret = _mm256_sub_ps(ret, _mm256_rcp_ps(_mm256_sub_ps(x_4, _mm256_set1_ps(1)))); 42 | ret = _mm256_sub_ps(ret, _mm256_rcp_ps(_mm256_sub_ps(x_4, _mm256_set1_ps(2)))); 43 | ret = _mm256_sub_ps(ret, _mm256_rcp_ps(_mm256_sub_ps(x_4, _mm256_set1_ps(3)))); 44 | ret = _mm256_sub_ps(ret, _mm256_rcp_ps(_mm256_sub_ps(x_4, _mm256_set1_ps(4)))); 45 | return ret; 46 | } 47 | -------------------------------------------------------------------------------- /src/Utils/exception.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "text.hpp" 4 | namespace tomoto 5 | { 6 | namespace exc 7 | { 8 | class TrainingError : public std::runtime_error 9 | { 10 | public: 11 | using std::runtime_error::runtime_error; 12 | }; 13 | 14 | class Unimplemented : public std::runtime_error 15 | { 16 | public: 17 | using std::runtime_error::runtime_error; 18 | }; 19 | 20 | class InvalidArgument : public std::invalid_argument 21 | { 22 | public: 23 | using std::invalid_argument::invalid_argument; 24 | }; 25 | 26 | 
class EmptyWordArgument : public InvalidArgument 27 | { 28 | public: 29 | using InvalidArgument::InvalidArgument; 30 | }; 31 | } 32 | } 33 | 34 | #define THROW_ERROR_WITH_INFO(exec, msg) do {throw exec(tomoto::text::format("%s (%d): ", __FILE__, __LINE__) + msg); } while(0) 35 | -------------------------------------------------------------------------------- /src/Utils/neon_gamma.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | inline float32x4_t accurate_rcp(float32x4_t x) 4 | { 5 | float32x4_t r = vrecpeq_f32(x); 6 | return vmulq_f32(vrecpsq_f32(x, r), r); 7 | } 8 | 9 | // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2)) 10 | inline float32x4_t lgamma_ps(float32x4_t x) 11 | { 12 | float32x4_t x_3 = vaddq_f32(x, vmovq_n_f32(3)); 13 | float32x4_t ret = vmulq_f32(vaddq_f32(x_3, vmovq_n_f32(-0.5f)), Eigen::internal::plog(x_3)); 14 | ret = vsubq_f32(ret, x_3); 15 | ret = vaddq_f32(ret, vmovq_n_f32(0.91893853f)); 16 | ret = vaddq_f32(ret, vdivq_f32(vmovq_n_f32(1 / 12.f), x_3)); 17 | ret = vsubq_f32(ret, Eigen::internal::plog(vmulq_f32( 18 | vmulq_f32(vsubq_f32(x_3, vmovq_n_f32(1)), vsubq_f32(x_3, vmovq_n_f32(2))), x))); 19 | return ret; 20 | } 21 | 22 | // approximation : lgamma(z + a) - lgamma(z) = (z + a + 1.5) * log(z + a + 2) - (z + 1.5) * log(z + 2) - a + (1. / (z + a + 2) - 1. / (z + 2)) / 12. - log(((z + a) * (z + a + 1)) / (z * (z + 1))) 23 | inline float32x4_t lgamma_subt(float32x4_t z, float32x4_t a) 24 | { 25 | float32x4_t _1p5 = vmovq_n_f32(1.5); 26 | float32x4_t _2 = vmovq_n_f32(2); 27 | float32x4_t za = vaddq_f32(z, a); 28 | float32x4_t ret = vmulq_f32(vaddq_f32(za, _1p5), Eigen::internal::plog(vaddq_f32(za, _2))); 29 | ret = vsubq_f32(ret, vmulq_f32(vaddq_f32(z, _1p5), Eigen::internal::plog(vaddq_f32(z, _2)))); 30 | ret = vsubq_f32(ret, a); 31 | float32x4_t _1 = vmovq_n_f32(1); 32 | float32x4_t _1_12 = vmovq_n_f32(1 / 12.f); 33 | ret = vaddq_f32(ret, vsubq_f32(vdivq_f32(_1_12, vaddq_f32(za, _2)), vdivq_f32(_1_12, vaddq_f32(z, _2)))); 34 | ret = vsubq_f32(ret, Eigen::internal::plog(vdivq_f32(vdivq_f32(vmulq_f32(za, vaddq_f32(za, _1)), z), vaddq_f32(z, _1)))); 35 | return ret; 36 | } 37 | 38 | 39 | // approximation : digamma(z) ~= ln(z+4) - 1/2/(z+4) - 1/12/(z+4)^2 - 1/z - 1/(z+1) - 1/(z+2) - 1/(z+3) 40 | inline float32x4_t digamma_ps(float32x4_t x) 41 | { 42 | float32x4_t x_4 = vaddq_f32(x, vmovq_n_f32(4)); 43 | float32x4_t ret = Eigen::internal::plog(x_4); 44 | ret = vsubq_f32(ret, vdivq_f32(vmovq_n_f32(1 / 2.f), x_4)); 45 | ret = vsubq_f32(ret, vdivq_f32(vdivq_f32(vmovq_n_f32(1 / 12.f), x_4), x_4)); 46 | ret = vsubq_f32(ret, accurate_rcp(vsubq_f32(x_4, vmovq_n_f32(1)))); 47 | ret = vsubq_f32(ret, accurate_rcp(vsubq_f32(x_4, vmovq_n_f32(2)))); 48 | ret = vsubq_f32(ret, accurate_rcp(vsubq_f32(x_4, vmovq_n_f32(3)))); 49 | ret = vsubq_f32(ret, accurate_rcp(vsubq_f32(x_4, vmovq_n_f32(4)))); 50 | return ret; 51 | } 52 | -------------------------------------------------------------------------------- /src/Utils/sample.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #if defined(__AVX__) || defined(__SSE2__) 5 | #include 6 | #else 7 | 8 | #endif 9 | 10 | #ifdef _WIN32 11 | #include 12 | #endif 13 | 14 | namespace tomoto 15 | { 16 | namespace sample 17 | { 18 | #ifdef _WIN32 19 | inline uint32_t popcnt(uint32_t i) 20 | { 21 | return __popcnt(i); 22 | } 23 | 24 | #ifdef _WIN64 25 | inline uint64_t 
log2_ceil(uint64_t i) 26 | { 27 | unsigned long idx; 28 | if (!_BitScanReverse64(&idx, i)) return 0; 29 | return idx + 1 - ((i & (i - 1)) == 0 ? 1 : 0); 30 | } 31 | #else 32 | inline uint32_t log2_ceil(uint32_t i) 33 | { 34 | unsigned long idx; 35 | if (!_BitScanReverse(&idx, i)) return 0; 36 | return idx + 1 - ((i & (i - 1)) == 0 ? 1 : 0); 37 | } 38 | #endif 39 | 40 | #else 41 | inline uint32_t popcnt(uint32_t i) 42 | { 43 | return __builtin_popcount(i); 44 | } 45 | 46 | #ifdef __x86_64 47 | inline uint64_t log2_ceil(uint64_t i) 48 | { 49 | return 64 - __builtin_clzll(i) - ((i & (i - 1)) == 0 ? 1 : 0); 50 | } 51 | #else 52 | inline uint32_t log2_ceil(uint32_t i) 53 | { 54 | return 32 - __builtin_clz(i) - ((i & (i - 1)) == 0 ? 1 : 0); 55 | } 56 | #endif 57 | 58 | #endif 59 | 60 | 61 | #if defined(__SSE2__) 62 | inline __m128 scan_SSE(__m128 x) 63 | { 64 | x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4))); 65 | x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 8))); 66 | return x; 67 | } 68 | 69 | inline void prefixSum(float* arr, int n) 70 | { 71 | int n4 = n & ~3; 72 | __m128 offset = _mm_setzero_ps(); 73 | for (int i = 0; i < n4; i += 4) 74 | { 75 | __m128 x = _mm_load_ps(&arr[i]); 76 | __m128 out = scan_SSE(x); 77 | out = _mm_add_ps(out, offset); 78 | _mm_store_ps(&arr[i], out); 79 | offset = _mm_shuffle_ps(out, out, _MM_SHUFFLE(3, 3, 3, 3)); 80 | } 81 | if (!n4) n4 = 1; 82 | for (int i = n4; i < n; ++i) 83 | { 84 | arr[i] += arr[i - 1]; 85 | } 86 | } 87 | #else 88 | inline void prefixSum(float* arr, int n) 89 | { 90 | int n4 = n & ~3; 91 | float acc = 0; 92 | for (int i = 0; i < n4; i += 4) 93 | { 94 | // first accumulation 95 | arr[i + 3] += arr[i + 2]; 96 | arr[i + 2] += arr[i + 1]; 97 | arr[i + 1] += arr[i]; 98 | 99 | // second accumulation 100 | arr[i + 3] += arr[i + 1]; 101 | arr[i + 2] += arr[i]; 102 | 103 | // accumulate offset 104 | arr[i] += acc; 105 | arr[i + 1] += acc; 106 | arr[i + 2] += acc; 107 | arr[i + 3] += acc; 108 | 109 | acc = arr[i + 3]; 110 | } 111 | 112 | if (!n4) n4 = 1; 113 | for (size_t i = n4; i < n; ++i) 114 | { 115 | arr[i] += arr[i - 1]; 116 | } 117 | } 118 | #endif 119 | 120 | template 121 | inline size_t sampleFromDiscrete(RealIt begin, RealIt end, Random& rg) 122 | { 123 | auto r = rg.uniform_real() * std::accumulate(begin, end, 0.f); 124 | size_t K = std::distance(begin, end); 125 | size_t z = 0; 126 | for (; r > *begin && z < K - 1; ++z, ++begin) 127 | { 128 | r -= *begin; 129 | } 130 | return z; 131 | } 132 | 133 | template 134 | inline size_t sampleFromDiscreteAcc(RealIt begin, RealIt end, Random& rg) 135 | { 136 | auto r = rg.uniform_real() * *(end - 1); 137 | size_t K = std::distance(begin, end); 138 | size_t z = 0; 139 | #ifdef __AVX__ 140 | __m256 mr = _mm256_set1_ps(r), mz; 141 | int mask; 142 | for (; z < (K >> 5) << 5; z += 32) 143 | { 144 | mz = _mm256_load_ps(&begin[z]); 145 | mask = _mm256_movemask_ps(_mm256_cmp_ps(mr, mz, _CMP_LT_OQ)); 146 | if (mask) return z + 8 - popcnt(mask); 147 | mz = _mm256_load_ps(&begin[z + 8]); 148 | mask = _mm256_movemask_ps(_mm256_cmp_ps(mr, mz, _CMP_LT_OQ)); 149 | if (mask) return z + 16 - popcnt(mask); 150 | mz = _mm256_load_ps(&begin[z + 16]); 151 | mask = _mm256_movemask_ps(_mm256_cmp_ps(mr, mz, _CMP_LT_OQ)); 152 | if (mask) return z + 24 - popcnt(mask); 153 | mz = _mm256_load_ps(&begin[z + 24]); 154 | mask = _mm256_movemask_ps(_mm256_cmp_ps(mr, mz, _CMP_LT_OQ)); 155 | if (mask) return z + 32 - popcnt(mask); 156 | } 157 | for (; z < (K >> 3) << 3; z += 8) 158 
| { 159 | __m256 mz = _mm256_load_ps(&begin[z]); 160 | int mask = _mm256_movemask_ps(_mm256_cmp_ps(mr, mz, _CMP_LT_OQ)); 161 | if (mask) return z + 8 - popcnt(mask); 162 | } 163 | #elif defined(__SSE2__) 164 | __m128 mr = _mm_set1_ps(r); 165 | for (; z < (K >> 2) << 2; z += 4) 166 | { 167 | __m128 mz = _mm_load_ps(&begin[z]); 168 | int mask = _mm_movemask_ps(_mm_cmplt_ps(mr, mz)); 169 | if (mask) return z + 4 - popcnt(mask); 170 | } 171 | #else 172 | for (; z < (K >> 3) << 3; z += 8) 173 | { 174 | if (r < begin[z]) return z; 175 | if (r < begin[z + 1]) return z + 1; 176 | if (r < begin[z + 2]) return z + 2; 177 | if (r < begin[z + 3]) return z + 3; 178 | if (r < begin[z + 4]) return z + 4; 179 | if (r < begin[z + 5]) return z + 5; 180 | if (r < begin[z + 6]) return z + 6; 181 | if (r < begin[z + 7]) return z + 7; 182 | } 183 | #endif 184 | for (; z < K; ++z) 185 | { 186 | if (r < begin[z]) return z; 187 | } 188 | return K - 1; 189 | } 190 | } 191 | } 192 | 193 | #include "AliasMethod.hpp" 194 | 195 | -------------------------------------------------------------------------------- /src/Utils/serializer.cpp: -------------------------------------------------------------------------------- 1 | #include "serializer.hpp" 2 | 3 | namespace tomoto 4 | { 5 | namespace serializer 6 | { 7 | membuf::membuf(bool read, bool write, char* base, std::ptrdiff_t n) 8 | { 9 | if (read) 10 | { 11 | this->setg(base, base, base + n); 12 | } 13 | 14 | if (write) 15 | { 16 | this->setp(base, base + n); 17 | } 18 | } 19 | 20 | membuf::~membuf() = default; 21 | 22 | std::streampos membuf::seekpos(pos_type sp, std::ios_base::openmode which) 23 | { 24 | return seekoff(sp - pos_type(off_type(0)), std::ios_base::beg, which); 25 | } 26 | 27 | std::streampos membuf::seekoff(off_type off, 28 | std::ios_base::seekdir dir, 29 | std::ios_base::openmode which 30 | ) 31 | { 32 | if (which & std::ios_base::in) 33 | { 34 | if (dir == std::ios_base::cur) 35 | gbump(off); 36 | else if (dir == std::ios_base::end) 37 | setg(eback(), egptr() + off, egptr()); 38 | else if (dir == std::ios_base::beg) 39 | setg(eback(), eback() + off, egptr()); 40 | } 41 | if (which & std::ios_base::out) 42 | { 43 | if (dir == std::ios_base::cur) 44 | pbump(off); 45 | else if (dir == std::ios_base::end) 46 | setp(epptr() + off, epptr()); 47 | else if (dir == std::ios_base::beg) 48 | setp(pbase() + off, epptr()); 49 | 50 | if (!(which & std::ios_base::in)) 51 | { 52 | return pptr() - pbase(); 53 | } 54 | } 55 | return gptr() - eback(); 56 | } 57 | 58 | imstream::imstream(const char* base, std::ptrdiff_t n) 59 | : std::istream(&buf), buf(true, false, (char*)base, n) 60 | { 61 | } 62 | 63 | imstream::~imstream() = default; 64 | 65 | omstream::omstream(char* base, std::ptrdiff_t n) 66 | : std::ostream(&buf), buf(false, true, (char*)base, n) 67 | { 68 | } 69 | 70 | omstream::~omstream() = default; 71 | 72 | 73 | BlockStreamBuffer::BlockStreamBuffer(size_t _block_size) : block_size{ _block_size } 74 | { 75 | buffers.emplace_back(std::make_unique(block_size)); 76 | this->setp((char*)buffers.back().get(), (char*)buffers.back().get() + block_size); 77 | } 78 | 79 | BlockStreamBuffer::~BlockStreamBuffer() = default; 80 | 81 | int BlockStreamBuffer::overflow(int c) 82 | { 83 | if (this->pptr() == this->epptr()) 84 | { 85 | buffers.emplace_back(std::make_unique(block_size)); 86 | this->setp((char*)buffers.back().get(), (char*)buffers.back().get() + block_size); 87 | } 88 | else 89 | { 90 | *(this->pptr()) = c; 91 | this->pbump(1); 92 | } 93 | return c; 94 | } 95 | 
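// BlockStreamBuffer grows by whole fixed-size blocks and never relocates
// bytes that were already written: overflow() above starts a fresh block once
// the put area is full, xsputn() below copies across block boundaries chunk
// by chunk, and totalSize() is (completed blocks) * block_size plus the fill
// of the current block. (Note that, as written, the full-buffer path of
// overflow() allocates the next block without storing c itself.)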
96 | std::streamsize BlockStreamBuffer::xsputn(const char* s, std::streamsize n) 97 | { 98 | auto rest = n; 99 | auto buf_remain = this->epptr() - this->pptr(); 100 | while (rest > buf_remain) 101 | { 102 | std::copy(s, s + buf_remain, this->pptr()); 103 | this->pbump(buf_remain); 104 | buffers.emplace_back(std::make_unique(block_size)); 105 | this->setp((char*)buffers.back().get(), (char*)buffers.back().get() + block_size); 106 | rest -= buf_remain; 107 | s += buf_remain; 108 | buf_remain = block_size; 109 | } 110 | std::copy(s, s + rest, this->pptr()); 111 | this->pbump(rest); 112 | return n; 113 | } 114 | 115 | size_t BlockStreamBuffer::totalSize() const 116 | { 117 | return (buffers.size() - 1) * block_size + (this->pptr() - this->pbase()); 118 | } 119 | 120 | TaggedDataMap readTaggedDataMap(std::istream& istr, uint32_t version) 121 | { 122 | std::unordered_map> ret; 123 | TaggedDataHeader h; 124 | do 125 | { 126 | istr.read((char*)&h, sizeof(h)); 127 | if (h.key != taggedDataKeyUint) 128 | { 129 | throw UnfitException("tagged data key is not found"); 130 | } 131 | const std::streampos totsize_pos = istr.tellg() - (std::streamoff)16; 132 | std::array key; 133 | istr.read(key.data(), h.keysize); 134 | const std::streampos start_pos = istr.tellg(); 135 | const std::streampos end_pos = totsize_pos + (std::streamoff)h.totsize; 136 | ret.emplace(std::string{ key.data(), h.keysize }, std::make_pair(start_pos, end_pos)); 137 | ret[""] = std::make_pair(start_pos, end_pos); 138 | istr.seekg(end_pos); 139 | } while (h.trailing_cnt); 140 | return ret; 141 | } 142 | 143 | uint64_t computeFastHash(const void* data, size_t size, uint64_t seed) 144 | { 145 | for (size_t i = 0; i < size / 4; ++i) 146 | { 147 | uint32_t x = ((const uint32_t*)data)[i]; 148 | x = ((x >> 16) ^ x) * 0x45d9f3b; 149 | x = ((x >> 16) ^ x) * 0x45d9f3b; 150 | x = (x >> 16) ^ x; 151 | seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2); 152 | } 153 | 154 | if (size % 4) 155 | { 156 | uint32_t x = 0; 157 | memcpy(&x, (const char*)data + (size / 4) * 4, size % 4); 158 | x = ((x >> 16) ^ x) * 0x45d9f3b; 159 | x = ((x >> 16) ^ x) * 0x45d9f3b; 160 | x = (x >> 16) ^ x; 161 | seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2); 162 | } 163 | return seed; 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/Utils/slp.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace tomoto 5 | { 6 | namespace slp 7 | { 8 | template 9 | struct combination 10 | { 11 | static constexpr int64_t value = combination::value + combination::value; 12 | }; 13 | 14 | template 15 | struct combination 16 | { 17 | static constexpr int64_t value = 1; 18 | }; 19 | 20 | template 21 | struct combination<0, n> 22 | { 23 | static constexpr int64_t value = 1; 24 | }; 25 | 26 | template 27 | struct combination 28 | { 29 | static constexpr int64_t value = 1; 30 | }; 31 | 32 | template<> 33 | struct combination<0, 0> 34 | { 35 | static constexpr int64_t value = 1; 36 | }; 37 | 38 | template 39 | struct shiftedLegendre 40 | { 41 | inline static _Type at(_Type x) 42 | { 43 | return shiftedLegendre<_Order, _Type, n + 1>::at(x) * x + combination<_Order, n>::value * combination<_Order + n, n>::value * ((_Order + n) % 2 ? 
-1 : 1); 44 | } 45 | 46 | inline static _Type atDerivative(_Type x) 47 | { 48 | return shiftedLegendre<_Order, _Type, n + 1>::atDerivation(x) * x + combination<_Order, n>::value * combination<_Order + n, n>::value * ((_Order + n) % 2 ? -1 : 1) * (int)n; 49 | } 50 | }; 51 | 52 | template 53 | struct shiftedLegendre<_Order, _Type, _Order> 54 | { 55 | inline static _Type at(_Type x) 56 | { 57 | return combination<_Order + _Order, _Order>::value; 58 | } 59 | 60 | inline static _Type atDerivative(_Type x) 61 | { 62 | return combination<_Order + _Order, _Order>::value * _Order; 63 | } 64 | }; 65 | 66 | template 67 | struct shiftedLegendre<0, _Type, 0> 68 | { 69 | inline static _Type at(_Type x) 70 | { 71 | return 1; 72 | } 73 | }; 74 | 75 | template 76 | struct shiftedLegendre<0, _Type, 1> 77 | { 78 | inline static _Type atDerivative(_Type x) 79 | { 80 | return 0; 81 | } 82 | }; 83 | 84 | template inline _Type shiftedLegendreFunc(_Type x) 85 | { 86 | return shiftedLegendre<_Order, _Type, 0>::at(x); 87 | } 88 | 89 | template inline _Type shiftedLegendreFuncDerivative(_Type x) 90 | { 91 | return shiftedLegendre<_Order, _Type, 1>::atDerivative(x); 92 | } 93 | 94 | 95 | template _Type slpGet(size_t order, _Type x) 96 | { 97 | switch (order) 98 | { 99 | case 0: return shiftedLegendreFunc<0>(x); 100 | case 1: return shiftedLegendreFunc<1>(x); 101 | case 2: return shiftedLegendreFunc<2>(x); 102 | case 3: return shiftedLegendreFunc<3>(x); 103 | case 4: return shiftedLegendreFunc<4>(x); 104 | case 5: return shiftedLegendreFunc<5>(x); 105 | case 6: return shiftedLegendreFunc<6>(x); 106 | case 7: return shiftedLegendreFunc<7>(x); 107 | case 8: return shiftedLegendreFunc<8>(x); 108 | case 9: return shiftedLegendreFunc<9>(x); 109 | case 10: return shiftedLegendreFunc<10>(x); 110 | case 11: return shiftedLegendreFunc<11>(x); 111 | case 12: return shiftedLegendreFunc<12>(x); 112 | case 13: return shiftedLegendreFunc<13>(x); 113 | case 14: return shiftedLegendreFunc<14>(x); 114 | case 15: return shiftedLegendreFunc<15>(x); 115 | } 116 | return _Type{}; 117 | } 118 | 119 | inline size_t partialProductDown(size_t n, size_t k) 120 | { 121 | size_t ret = 1; 122 | for (size_t i = 0; i < k; ++i) ret *= n--; 123 | return ret; 124 | } 125 | 126 | inline int slpGetCoef(size_t n, size_t k) 127 | { 128 | return ((n + k) & 1 ? -1 : 1) * (int)(partialProductDown(n, k) / partialProductDown(k, k) * partialProductDown(n + k, k) / partialProductDown(k, k)); 129 | } 130 | } 131 | } -------------------------------------------------------------------------------- /src/Utils/sse_gamma.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "sse_mathfun.h" 3 | 4 | // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2)) 5 | inline __m128 lgamma_ps(__m128 x) 6 | { 7 | __m128 x_3 = _mm_add_ps(x, _mm_set1_ps(3)); 8 | __m128 ret = _mm_mul_ps(_mm_add_ps(x_3, _mm_set1_ps(-0.5f)), log_ps(x_3)); 9 | ret = _mm_sub_ps(ret, x_3); 10 | ret = _mm_add_ps(ret, _mm_set1_ps(0.91893853f)); 11 | ret = _mm_add_ps(ret, _mm_div_ps(_mm_set1_ps(1 / 12.f), x_3)); 12 | ret = _mm_sub_ps(ret, log_ps(_mm_mul_ps( 13 | _mm_mul_ps(_mm_sub_ps(x_3, _mm_set1_ps(1)), _mm_sub_ps(x_3, _mm_set1_ps(2))), x))); 14 | return ret; 15 | } 16 | 17 | // approximation : lgamma(z + a) - lgamma(z) = (z + a + 1.5) * log(z + a + 2) - (z + 1.5) * log(z + 2) - a + (1. / (z + a + 2) - 1. / (z + 2)) / 12. 
- log(((z + a) * (z + a + 1)) / (z * (z + 1))) 18 | inline __m128 lgamma_subt(__m128 z, __m128 a) 19 | { 20 | __m128 _1 = _mm_set1_ps(1); 21 | __m128 _1p5 = _mm_set1_ps(1.5); 22 | __m128 _2 = _mm_set1_ps(2); 23 | __m128 _1_12 = _mm_set1_ps(1 / 12.f); 24 | __m128 za = _mm_add_ps(z, a); 25 | __m128 ret = _mm_mul_ps(_mm_add_ps(za, _1p5), log_ps(_mm_add_ps(za, _2))); 26 | ret = _mm_sub_ps(ret, _mm_mul_ps(_mm_add_ps(z, _1p5), log_ps(_mm_add_ps(z, _2)))); 27 | ret = _mm_sub_ps(ret, a); 28 | ret = _mm_add_ps(ret, _mm_sub_ps(_mm_div_ps(_1_12, _mm_add_ps(za, _2)), _mm_div_ps(_1_12, _mm_add_ps(z, _2)))); 29 | ret = _mm_sub_ps(ret, log_ps(_mm_div_ps(_mm_div_ps(_mm_mul_ps(za, _mm_add_ps(za, _1)), z), _mm_add_ps(z, _1)))); 30 | return ret; 31 | } 32 | 33 | // approximation : digamma(z) ~= ln(z+4) - 1/2/(z+4) - 1/12/(z+4)^2 - 1/z - 1/(z+1) - 1/(z+2) - 1/(z+3) 34 | inline __m128 digamma_ps(__m128 x) 35 | { 36 | __m128 x_4 = _mm_add_ps(x, _mm_set1_ps(4)); 37 | __m128 ret = log_ps(x_4); 38 | ret = _mm_sub_ps(ret, _mm_div_ps(_mm_set1_ps(1 / 2.f), x_4)); 39 | ret = _mm_sub_ps(ret, _mm_div_ps(_mm_div_ps(_mm_set1_ps(1 / 12.f), x_4), x_4)); 40 | ret = _mm_sub_ps(ret, _mm_rcp_ps(_mm_sub_ps(x_4, _mm_set1_ps(1)))); 41 | ret = _mm_sub_ps(ret, _mm_rcp_ps(_mm_sub_ps(x_4, _mm_set1_ps(2)))); 42 | ret = _mm_sub_ps(ret, _mm_rcp_ps(_mm_sub_ps(x_4, _mm_set1_ps(3)))); 43 | ret = _mm_sub_ps(ret, _mm_rcp_ps(_mm_sub_ps(x_4, _mm_set1_ps(4)))); 44 | return ret; 45 | } 46 | -------------------------------------------------------------------------------- /src/Utils/text.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace tomoto 12 | { 13 | namespace text 14 | { 15 | template 16 | std::string format(const std::string& format, _Args ... args) 17 | { 18 | size_t size = snprintf(nullptr, 0, format.c_str(), args ...) 
+ 1; 19 | std::vector buf(size); 20 | snprintf(buf.data(), size, format.c_str(), args ...); 21 | return std::string{ buf.data(), buf.data() + size - 1 }; 22 | } 23 | 24 | template 25 | std::string join(_Iter first, _Iter last, const std::string& delimiter = ",") 26 | { 27 | if (first == last) return ""; 28 | std::ostringstream stream; 29 | std::copy(first, last, std::ostream_iterator<_Target>(stream, delimiter.c_str())); 30 | std::string s = stream.str(); 31 | s.erase(s.end() - delimiter.size(), s.end()); 32 | return s; 33 | } 34 | 35 | inline std::string quote(const std::string& s) 36 | { 37 | std::ostringstream stream; 38 | stream << std::quoted(s); 39 | return stream.str(); 40 | } 41 | 42 | inline std::vector split(const std::string& str, const std::string& delim) 43 | { 44 | std::vector tokens; 45 | size_t prev = 0, pos = 0; 46 | do 47 | { 48 | pos = str.find(delim, prev); 49 | if (pos == std::string::npos) pos = str.length(); 50 | std::string token = str.substr(prev, pos - prev); 51 | if (!token.empty()) tokens.push_back(token); 52 | prev = pos + delim.length(); 53 | } while (pos < str.length() && prev < str.length()); 54 | return tokens; 55 | } 56 | } 57 | } -------------------------------------------------------------------------------- /src/python/dispatcher/py_rt.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef _DEBUG 9 | #undef _DEBUG 10 | #include 11 | #define _DEBUG 12 | #else 13 | #include 14 | #endif 15 | 16 | #if defined(_WIN32) || defined(_WIN64) 17 | #include 18 | #include 19 | 20 | #define cpuid(info, x) __cpuidex(info, x, 0) 21 | 22 | #elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX) 23 | #include 24 | 25 | //#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 26 | static inline unsigned long long _xgetbv(unsigned int index) { 27 | unsigned int eax, edx; 28 | __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); 29 | return ((unsigned long long)edx << 32) | eax; 30 | } 31 | /*#else 32 | #define _xgetbv(x) 0 33 | #endif*/ 34 | 35 | void cpuid(int info[4], int InfoType) { 36 | __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); 37 | } 38 | #endif 39 | 40 | PyMODINIT_FUNC PyInit__tomotopy() 41 | { 42 | using namespace std; 43 | 44 | bool sse2 = false, avx = false, avx2 = false; 45 | bool env_sse2 = false, env_avx = false, env_avx2 = false; 46 | 47 | string isaEnv; 48 | const char* p = getenv("TOMOTOPY_ISA"); 49 | if (p) isaEnv = p; 50 | transform(isaEnv.begin(), isaEnv.end(), isaEnv.begin(), ::tolower); 51 | 52 | istringstream iss{ isaEnv }; 53 | string item; 54 | 55 | while (getline(iss, item, ',')) 56 | { 57 | if (item == "avx2") env_avx2 = true; 58 | else if (item == "avx") env_avx = true; 59 | else if (item == "sse2") env_sse2 = true; 60 | else if (item == "none"); 61 | else fprintf(stderr, "Unknown ISA option '%s' ignored.\n", item.c_str()); 62 | } 63 | 64 | if (!env_sse2 && !env_avx && !env_avx2) 65 | { 66 | env_sse2 = true; 67 | env_avx = true; 68 | env_avx2 = true; 69 | } 70 | 71 | int info[4]; 72 | cpuid(info, 0); 73 | int nIds = info[0]; 74 | 75 | cpuid(info, 0x80000000); 76 | unsigned nExIds = info[0]; 77 | 78 | if (nIds >= 1) { 79 | cpuid(info, 1); 80 | sse2 = (info[3] & ((int)1 << 26)) != 0; 81 | if ((info[2] & (1 << 27)) && ((info[2] & ((int)1 << 28)) != 0)) 82 | { 83 | unsigned long long xcrFeatureMask = _xgetbv(0); 84 | avx = (xcrFeatureMask & 0x6) == 0x6; 85 | } 86 | } 87 | if (nIds 
87 | if (nIds >= 7) { 88 | cpuid(info, 7); 89 | avx2 = (info[1] & ((int)1 << 5)) != 0; 90 | } 91 | 92 | PyObject* module = nullptr; 93 | vector<string> triedModules; 94 | if (!module && avx2 && env_avx2) 95 | { 96 | module = PyImport_ImportModule("_tomotopy_avx2"); 97 | if (!module) 98 | { 99 | PyErr_Clear(); 100 | triedModules.emplace_back("avx2"); 101 | } 102 | } 103 | if (!module && avx && env_avx) 104 | { 105 | module = PyImport_ImportModule("_tomotopy_avx"); 106 | if (!module) 107 | { 108 | PyErr_Clear(); 109 | triedModules.emplace_back("avx"); 110 | } 111 | } 112 | if (!module && sse2 && env_sse2) 113 | { 114 | module = PyImport_ImportModule("_tomotopy_sse2"); 115 | if (!module) 116 | { 117 | PyErr_Clear(); 118 | triedModules.emplace_back("sse2"); 119 | } 120 | } 121 | if (!module) 122 | { 123 | module = PyImport_ImportModule("_tomotopy_none"); 124 | if (!module) 125 | { 126 | PyErr_Clear(); 127 | triedModules.emplace_back("none"); 128 | } 129 | } 130 | 131 | if (!module) 132 | { 133 | string err = "No module named any of "; 134 | for (auto& s : triedModules) err += "'_tomotopy_" + s + "', "; 135 | err.pop_back(); err.pop_back(); 136 | #if PY_MINOR_VERSION < 6 137 | PyErr_SetString(PyExc_RuntimeError, err.c_str()); 138 | #else 139 | PyErr_SetString(PyExc_ModuleNotFoundError, err.c_str()); 140 | #endif 141 | } 142 | return module; 143 | } 144 |
-------------------------------------------------------------------------------- /src/python/handler/coherence.h: --------------------------------------------------------------------------------
1 | #pragma once 2 | 3 | #include "module.h" 4 | #include "utils.h" 5 | #include "../../Coherence/CoherenceModel.hpp" 6 | 7 | struct CoherenceObject 8 | { 9 | using ProbEstimation = tomoto::coherence::ProbEstimation; 10 | using Segmentation = tomoto::coherence::Segmentation; 11 | using ConfirmMeasure = tomoto::coherence::ConfirmMeasure; 12 | using IndirectMeasure = tomoto::coherence::IndirectMeasure; 13 | 14 | PyObject_HEAD; 15 | CorpusObject* corpus; 16 | Segmentation seg; 17 | union { tomoto::coherence::CoherenceModel model; }; 18 | union { tomoto::coherence::AnyConfirmMeasurer cm; }; 19 | static int init(CoherenceObject* self, PyObject* args, PyObject* kwargs); 20 | static PyObject* repr(CoherenceObject* self); 21 | static void dealloc(CoherenceObject* self); 22 | 23 | static PyObject* getScore(CoherenceObject* self, PyObject* args, PyObject* kwargs); 24 | }; 25 | 26 | void addCoherenceTypes(PyObject* gModule); 27 |
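/* Design note: `model` and `cm` live inside anonymous unions so that this struct, which CPython allocates as raw memory via tp_alloc, can defer their construction; init() builds them with placement new and dealloc() invokes the destructors explicitly (see py_coherence.cpp). A minimal standalone sketch of the same idiom, assuming nothing beyond the standard library: struct Holder { union { std::string s; }; Holder() {} ~Holder() {} void construct() { new (&s) std::string{ "hello" }; } void destruct() { s.~basic_string(); } }; */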
-------------------------------------------------------------------------------- /src/python/handler/label.h: --------------------------------------------------------------------------------
1 | #pragma once 2 | 3 | #ifdef _DEBUG 4 | //#undef _DEBUG 5 | #define DEBUG_LOG(t) do{ cerr << t << endl; }while(0) 6 | #include "PyUtils.h" 7 | //#define _DEBUG 8 | #else 9 | #define DEBUG_LOG(t) 10 | #include "PyUtils.h" 11 | #endif 12 | 13 | #include "../../Labeling/FoRelevance.h" 14 | 15 | struct CorpusObject; 16 | 17 | struct CandidateObject; 18 | 19 | class CandWordIterator 20 | { 21 | const CandidateObject* co = nullptr; 22 | size_t idx = 0; 23 | public: 24 | using difference_type = ptrdiff_t; 25 | using value_type = const std::string; 26 | using reference = const std::string&; 27 | using pointer = const std::string*; 28 | using iterator_category = std::random_access_iterator_tag; 29 | 30 | CandWordIterator(const CandidateObject* _co = nullptr, size_t _idx = 0) 31 | : co{ _co }, idx{ _idx } 32 | { 33 | } 34 | 35 | CandWordIterator& operator++() 36 | { 37 | idx++; 38 | return *this; 39 | } 40 | 41 | const std::string& operator *() const; 42 | 43 | bool operator==(const CandWordIterator& o) const 44 | { 45 | return co == o.co && idx == o.idx; 46 | } 47 | 48 | bool operator!=(const CandWordIterator& o) const 49 | { 50 | return co != o.co || idx != o.idx; 51 | } 52 | 53 | std::ptrdiff_t operator-(const CandWordIterator& o) const 54 | { 55 | return (std::ptrdiff_t)idx - (std::ptrdiff_t)o.idx; 56 | } 57 | }; 58 | 59 | struct CandidateObject 60 | { 61 | PyObject_HEAD; 62 | TopicModelObject* tm; 63 | CorpusObject* corpus; 64 | tomoto::label::Candidate cand; 65 | 66 | static int init(CandidateObject* self, PyObject* args, PyObject* kwargs); 67 | static void dealloc(CandidateObject* self); 68 | static PyObject* repr(CandidateObject* self); 69 | 70 | CandWordIterator begin() const 71 | { 72 | return { this, 0 }; 73 | } 74 | 75 | CandWordIterator end() const 76 | { 77 | return { this, cand.w.size() }; 78 | } 79 | }; 80 | 81 | extern PyTypeObject Candidate_type; 82 | 83 | void addLabelTypes(PyObject* mModule);
-------------------------------------------------------------------------------- /src/python/handler/py_PT.cpp: --------------------------------------------------------------------------------
1 | #include "../../TopicModel/PT.h" 2 | 3 | #include "module.h" 4 | #include "utils.h" 5 | 6 | using namespace std; 7 | 8 | static int PT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) 9 | { 10 | size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; 11 | tomoto::PTArgs margs; 12 | 13 | PyObject* objCorpus = nullptr, *objTransform = nullptr; 14 | PyObject* objAlpha = nullptr, *objSeed = nullptr; 15 | static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "p", "alpha", "eta", 16 | "seed", "corpus", "transform", nullptr }; 17 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnOfOOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, 18 | &margs.k, &margs.p, &objAlpha, &margs.eta, &objSeed, &objCorpus, &objTransform)) return -1; 19 | return py::handleExc([&]() 20 | { 21 | if (objAlpha) margs.alpha = broadcastObj<tomoto::Float>(objAlpha, margs.k, 22 | [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; } 23 | ); 24 | if (objSeed) margs.seed = py::toCpp<size_t>(objSeed, "`seed` must be an integer or None."); 25 | 26 | if (margs.p == 0) margs.p = margs.k * 10; 27 | 28 | tomoto::ITopicModel* inst = tomoto::IPTModel::create((tomoto::TermWeight)tw, margs); 29 | if (!inst) throw py::ValueError{ "unknown `tw` value" }; 30 | self->inst = inst; 31 | self->isPrepared = false; 32 | self->seedGiven = !!objSeed; 33 | self->minWordCnt = minCnt; 34 | self->minWordDf = minDf; 35 | self->removeTopWord = rmTop; 36 | self->initParams = py::buildPyDict(kwlist, 37 | tw, minCnt, minDf, rmTop, margs.k, margs.p, margs.alpha, margs.eta, margs.seed 38 | ); 39 | py::setPyDictItem(self->initParams, "version", getVersion()); 40 | 41 | insertCorpus(self, objCorpus, objTransform); 42 | return 0; 43 | }); 44 | } 45 | 46 | DEFINE_GETTER(tomoto::IPTModel, PT, getP); 47 | 48 | DEFINE_LOADER(PT, PT_type); 49 | 50 | static PyMethodDef PT_methods[] = 51 | { 52 | { "load", (PyCFunction)PT_load, METH_STATIC | METH_VARARGS | METH_KEYWORDS, LDA_load__doc__ }, 53 | { "loads", (PyCFunction)PT_loads, METH_STATIC | METH_VARARGS | METH_KEYWORDS, LDA_loads__doc__ }, 54 | { nullptr } 55 | }; 56 | 57 | 58 | static PyGetSetDef PT_getseters[] = { 59 | { (char*)"p", (getter)PT_getP, nullptr, PT_p__doc__, nullptr }, 60 | { nullptr }, 61 | };
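/* Note: `p` is the number of pseudo documents used by the PT model. When it is omitted or passed as 0, PT_init above falls back to p = k * 10, so constructing the model with k = 20 and no explicit `p` yields p == 200 (illustrative default only; any positive `p` can be supplied through the keyword argument). */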
62 | 63 | DEFINE_DOCUMENT_GETTER(tomoto::DocumentPT, pseudo_doc_id, pseudoDoc); 64 | 65 | 66 | TopicModelTypeObject PT_type = { { 67 | PyVarObject_HEAD_INIT(nullptr, 0) 68 | "tomotopy.PTModel", /* tp_name */ 69 | sizeof(TopicModelObject), /* tp_basicsize */ 70 | 0, /* tp_itemsize */ 71 | (destructor)TopicModelObject::dealloc, /* tp_dealloc */ 72 | 0, /* tp_print */ 73 | 0, /* tp_getattr */ 74 | 0, /* tp_setattr */ 75 | 0, /* tp_reserved */ 76 | 0, /* tp_repr */ 77 | 0, /* tp_as_number */ 78 | 0, /* tp_as_sequence */ 79 | 0, /* tp_as_mapping */ 80 | 0, /* tp_hash */ 81 | 0, /* tp_call */ 82 | 0, /* tp_str */ 83 | 0, /* tp_getattro */ 84 | 0, /* tp_setattro */ 85 | 0, /* tp_as_buffer */ 86 | Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ 87 | PT___init____doc__, /* tp_doc */ 88 | 0, /* tp_traverse */ 89 | 0, /* tp_clear */ 90 | 0, /* tp_richcompare */ 91 | 0, /* tp_weaklistoffset */ 92 | 0, /* tp_iter */ 93 | 0, /* tp_iternext */ 94 | PT_methods, /* tp_methods */ 95 | 0, /* tp_members */ 96 | PT_getseters, /* tp_getset */ 97 | &LDA_type, /* tp_base */ 98 | 0, /* tp_dict */ 99 | 0, /* tp_descr_get */ 100 | 0, /* tp_descr_set */ 101 | 0, /* tp_dictoffset */ 102 | (initproc)PT_init, /* tp_init */ 103 | PyType_GenericAlloc, 104 | PyType_GenericNew, 105 | }}; 106 | 107 | 108 | PyObject* Document_getTopicsFromPseudoDoc(DocumentObject* self, size_t topN) 109 | { 110 | tomoto::IPTModel* mdl = dynamic_cast<tomoto::IPTModel*>(self->corpus->tm->inst); 111 | if (!mdl) throw py::ValueError{ "`from_pseudo_doc` is valid for only `tomotopy.PTModel`." }; 112 | return py::buildPyValue(self->corpus->tm->inst->getTopicsByDocSorted(self->getBoundDoc(), topN)); 113 | } 114 | 115 | PyObject* Document_getTopicDistFromPseudoDoc(DocumentObject* self, bool normalize) 116 | { 117 | tomoto::IPTModel* mdl = dynamic_cast<tomoto::IPTModel*>(self->corpus->tm->inst); 118 | if (!mdl) throw py::ValueError{ "`from_pseudo_doc` is valid for only `tomotopy.PTModel`." }; 119 | return py::buildPyValue(self->corpus->tm->inst->getTopicsByDoc(self->getBoundDoc(), !!normalize)); 120 | }
-------------------------------------------------------------------------------- /src/python/handler/py_coherence.cpp: --------------------------------------------------------------------------------
1 | 2 | #include "module.h" 3 | #include "coherence.h" 4 | 5 | using namespace std; 6 | 7 | int CoherenceObject::init(CoherenceObject* self, PyObject* args, PyObject* kwargs) 8 | { 9 | new (&self->model) tomoto::coherence::CoherenceModel; 10 | new (&self->cm) tomoto::coherence::AnyConfirmMeasurer; 11 | 12 | CorpusObject* corpus; 13 | PyObject* targets = nullptr; 14 | size_t windowSize = 0; 15 | double eps = 1e-12; 16 | double gamma = 1; 17 | ProbEstimation pe = ProbEstimation::none; 18 | Segmentation seg = Segmentation::none; 19 | ConfirmMeasure cm = ConfirmMeasure::none; 20 | IndirectMeasure im = IndirectMeasure::none; 21 | static const char* kwlist[] = { "corpus", "pe", "seg", "cm", "im", "window_size", "eps", "gamma", "targets", nullptr }; 22 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|iiiinddO", (char**)kwlist, 23 | &corpus, &pe, &seg, &cm, &im, &windowSize, &eps, &gamma, &targets)) return -1; 24 | return py::handleExc([&]() 25 | { 26 | if (!PyObject_TypeCheck(corpus, &UtilsCorpus_type)) 27 | { 28 | throw py::ValueError{ "`corpus` must be an instance of `tomotopy.utils.Corpus`." }; 29 | }
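/* The two statements below tear down the default-constructed model and rebuild it in place with the requested probability-estimation method and window size; this is the placement-new counterpart of the anonymous-union members declared in coherence.h. */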
30 | self->model.~CoherenceModel(); 31 | new (&self->model) tomoto::coherence::CoherenceModel{ pe, windowSize }; 32 | 33 | self->corpus = corpus; 34 | Py_INCREF(corpus); 35 | 36 | vector<tomoto::Vid> targetIds; 37 | py::foreach<string>(targets, [&](const string& w) 38 | { 39 | auto wid = corpus->getVocabDict().toWid(w); 40 | if (wid != tomoto::non_vocab_id) targetIds.emplace_back(wid); 41 | }, "`targets` must be an iterable of `str`."); 42 | 43 | self->model.insertTargets(targetIds.begin(), targetIds.end()); 44 | 45 | for (size_t i = 0; i < CorpusObject::len(corpus); ++i) 46 | { 47 | auto* doc = corpus->getDoc(i); 48 | self->model.insertDoc( 49 | wordBegin(doc, corpus->isIndependent()), 50 | wordEnd(doc, corpus->isIndependent()) 51 | ); 52 | } 53 | 54 | self->seg = seg; 55 | self->cm = tomoto::coherence::AnyConfirmMeasurer::getInstance(cm, im, targetIds.begin(), targetIds.end(), eps, gamma); 56 | return 0; 57 | }); 58 | } 59 | 60 | PyObject* CoherenceObject::repr(CoherenceObject* self) 61 | { 62 | return py::buildPyValue(string{ }); 63 | } 64 | 65 | void CoherenceObject::dealloc(CoherenceObject* self) 66 | { 67 | self->model.~CoherenceModel(); 68 | self->cm.~AnyConfirmMeasurer(); 69 | Py_XDECREF(self->corpus); 70 | Py_TYPE(self)->tp_free((PyObject*)self); 71 | } 72 | 73 | PyObject* CoherenceObject::getScore(CoherenceObject* self, PyObject* args, PyObject* kwargs) 74 | { 75 | PyObject* words; 76 | static const char* kwlist[] = { "words", nullptr }; 77 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, 78 | &words)) return nullptr; 79 | 80 | return py::handleExc([&]() 81 | { 82 | vector<tomoto::Vid> wordIds; 83 | py::foreach<string>(words, [&](const string& w) 84 | { 85 | auto wid = self->corpus->getVocabDict().toWid(w); 86 | if (wid != tomoto::non_vocab_id) wordIds.emplace_back(wid); 87 | }, "`words` must be an iterable of `str`."); 88 | 89 | switch (self->seg) 90 | { 91 | case Segmentation::one_one: 92 | return py::buildPyValue(self->model.template getScore<Segmentation::one_one>(self->cm, wordIds.begin(), wordIds.end())); 93 | case Segmentation::one_pre: 94 | return py::buildPyValue(self->model.template getScore<Segmentation::one_pre>(self->cm, wordIds.begin(), wordIds.end())); 95 | case Segmentation::one_suc: 96 | return py::buildPyValue(self->model.template getScore<Segmentation::one_suc>(self->cm, wordIds.begin(), wordIds.end())); 97 | case Segmentation::one_all: 98 | return py::buildPyValue(self->model.template getScore<Segmentation::one_all>(self->cm, wordIds.begin(), wordIds.end())); 99 | case Segmentation::one_set: 100 | return py::buildPyValue(self->model.template getScore<Segmentation::one_set>(self->cm, wordIds.begin(), wordIds.end())); 101 | default: 102 | throw py::ValueError{ "invalid Segmentation `seg`" }; 103 | } 104 | }); 105 | } 106 | 107 | static PyMethodDef Coherence_methods[] = 108 | { 109 | { "get_score", (PyCFunction)CoherenceObject::getScore, METH_VARARGS | METH_KEYWORDS, "" }, 110 | { nullptr } 111 | }; 112 | 113 | 114 | static PyGetSetDef Coherence_getseters[] = { 115 | //{ (char*)"words", (getter)CoherenceObject::getWords, nullptr, Document_words__doc__, nullptr }, 116 | { nullptr } 117 | }; 118 |
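/* Dispatch note: getScore above converts the runtime `seg` value into a compile-time template argument; each case instantiates getScore<Segmentation::...> once, so the segmentation strategy is resolved statically inside the scoring loop instead of being re-branched for every word pair. */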
119 | PyTypeObject Coherence_type = { 120 | PyVarObject_HEAD_INIT(nullptr, 0) 121 | "tomotopy._Coherence", /* tp_name */ 122 | sizeof(CoherenceObject), /* tp_basicsize */ 123 | 0, /* tp_itemsize */ 124 | (destructor)CoherenceObject::dealloc, /* tp_dealloc */ 125 | 0, /* tp_print */ 126 | 0, /* tp_getattr */ 127 | 0, /* tp_setattr */ 128 | 0, /* tp_reserved */ 129 | (reprfunc)CoherenceObject::repr, /* tp_repr */ 130 | 0, /* tp_as_number */ 131 | 0, /* tp_as_sequence */ 132 | 0, /* tp_as_mapping */ 133 | 0, /* tp_hash */ 134 | 0, /* tp_call */ 135 | 0, /* tp_str */ 136 | 0, /* tp_getattro */ 137 | 0, /* tp_setattro */ 138 | 0, /* tp_as_buffer */ 139 | Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ 140 | "", /* tp_doc */ 141 | 0, /* tp_traverse */ 142 | 0, /* tp_clear */ 143 | 0, /* tp_richcompare */ 144 | 0, /* tp_weaklistoffset */ 145 | 0, /* tp_iter */ 146 | 0, /* tp_iternext */ 147 | Coherence_methods, /* tp_methods */ 148 | 0, /* tp_members */ 149 | Coherence_getseters, /* tp_getset */ 150 | 0, /* tp_base */ 151 | 0, /* tp_dict */ 152 | 0, /* tp_descr_get */ 153 | 0, /* tp_descr_set */ 154 | 0, /* tp_dictoffset */ 155 | (initproc)CoherenceObject::init, /* tp_init */ 156 | PyType_GenericAlloc, 157 | PyType_GenericNew, 158 | }; 159 | 160 | 161 | void addCoherenceTypes(PyObject* gModule) 162 | { 163 | if (PyType_Ready(&Coherence_type) < 0) throw runtime_error{ "Coherence_type is not ready." }; 164 | Py_INCREF(&Coherence_type); 165 | PyModule_AddObject(gModule, "_Coherence", (PyObject*)&Coherence_type); 166 | } 167 |
-------------------------------------------------------------------------------- /src/python/handler/py_main.cpp: --------------------------------------------------------------------------------
1 | #define MAIN_MODULE 2 | #include "module.h" 3 | #include "label.h" 4 | #include "utils.h" 5 | #include "coherence.h" 6 | 7 | using namespace std; 8 | 9 | PyObject* gModule; 10 | 11 | #ifdef TOMOTOPY_ISA 12 | #define TO_STR(name) #name 13 | #define TO_STR_WRAP(name) TO_STR(name) 14 | #define TOMOTOPY_ISA_STR TO_STR_WRAP(TOMOTOPY_ISA) 15 | static const char* isa_str = TOMOTOPY_ISA_STR; 16 | #else 17 | static const char* isa_str = "none"; 18 | #endif 19 | 20 | void char2Byte(const char* strBegin, const char* strEnd, vector<uint32_t>& startPos, vector<uint16_t>& length) 21 | { 22 | if (strBegin == strEnd) return; 23 | vector<size_t> charPos; 24 | auto it = strBegin; 25 | for (; it != strEnd; ) 26 | { 27 | charPos.emplace_back(it - strBegin); 28 | uint8_t c = *it; 29 | if ((c & 0xF8) == 0xF0) 30 | { 31 | it += 4; 32 | } 33 | else if ((c & 0xF0) == 0xE0) 34 | { 35 | it += 3; 36 | } 37 | else if ((c & 0xE0) == 0xC0) 38 | { 39 | it += 2; 40 | } 41 | else if ((c & 0x80)) 42 | { 43 | throw std::runtime_error{ "utf-8 decoding error" }; 44 | } 45 | else it += 1; 46 | } 47 | charPos.emplace_back(strEnd - strBegin); 48 | 49 | for (size_t i = 0; i < startPos.size(); ++i) 50 | { 51 | size_t s = startPos[i], e = (size_t)startPos[i] + length[i]; 52 | startPos[i] = charPos[s]; 53 | length[i] = charPos[e] - charPos[s]; 54 | } 55 | } 56 | 57 | void char2Byte(const string& str, vector<uint32_t>& startPos, vector<uint16_t>& length) 58 | { 59 | return char2Byte(&str[0], &str[0] + str.size(), startPos, length); 60 | } 61 | 62 | void char2Byte(const tomoto::SharedString& str, vector<uint32_t>& startPos, vector<uint16_t>& length) 63 | { 64 | return char2Byte(str.begin(), str.end(), startPos, length); 65 | } 66 |
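/* UTF-8 note for the char2Byte scanner above and the byte2Char scanner below: the lead byte of a UTF-8 sequence encodes its length (0xxxxxxx = 1 byte, 110xxxxx = 2, 1110xxxx = 3, 11110xxx = 4), which is exactly what the masked comparisons such as (c & 0xF8) == 0xF0 test; a continuation byte (10xxxxxx) in lead position is invalid input, hence the decoding error. For example, the string "a€" produces charPos = {0, 1, 4}, since the euro sign occupies three bytes (E2 82 AC). */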
67 | void byte2Char(const char* strBegin, const char* strEnd, vector<uint32_t>& startPos, vector<uint16_t>& length) 68 | { 69 | if (strBegin == strEnd) return; 70 | vector<size_t> charPos; 71 | auto it = strBegin; 72 | for (; it != strEnd; ) 73 | { 74 | charPos.emplace_back(it - strBegin); 75 | uint8_t c = *it; 76 | if ((c & 0xF8) == 0xF0) 77 | { 78 | it += 4; 79 | } 80 | else if ((c & 0xF0) == 0xE0) 81 | { 82 | it += 3; 83 | } 84 | else if ((c & 0xE0) == 0xC0) 85 | { 86 | it += 2; 87 | } 88 | else if ((c & 0x80)) 89 | { 90 | throw std::runtime_error{ "utf-8 decoding error" }; 91 | } 92 | else it += 1; 93 | } 94 | charPos.emplace_back(strEnd - strBegin); 95 | 96 | for (size_t i = 0; i < startPos.size(); ++i) 97 | { 98 | size_t s = startPos[i], e = (size_t)startPos[i] + length[i]; 99 | startPos[i] = std::lower_bound(charPos.begin(), charPos.end(), s) - charPos.begin(); 100 | length[i] = std::lower_bound(charPos.begin(), charPos.end(), e) - charPos.begin() - startPos[i]; 101 | } 102 | } 103 | 104 | void byte2Char(const string& str, vector<uint32_t>& startPos, vector<uint16_t>& length) 105 | { 106 | return byte2Char(&str[0], &str[0] + str.size(), startPos, length); 107 | } 108 | 109 | void byte2Char(const tomoto::SharedString& str, vector<uint32_t>& startPos, vector<uint16_t>& length) 110 | { 111 | return byte2Char(str.begin(), str.end(), startPos, length); 112 | } 113 | 114 | void TopicModelObject::dealloc(TopicModelObject* self) 115 | { 116 | DEBUG_LOG("TopicModelObject Dealloc " << self); 117 | if (self->inst) 118 | { 119 | delete self->inst; 120 | } 121 | Py_XDECREF(self->initParams); 122 | Py_TYPE(self)->tp_free((PyObject*)self); 123 | } 124 | 125 | PyMODINIT_FUNC MODULE_NAME() 126 | { 127 | import_array(); 128 | 129 | static PyModuleDef mod = 130 | { 131 | PyModuleDef_HEAD_INIT, 132 | "tomotopy", 133 | "Tomoto Module for Python", 134 | -1, 135 | nullptr, 136 | }; 137 | 138 | gModule = PyModule_Create(&mod); 139 | if (!gModule) return nullptr; 140 | 141 | if (PyType_Ready(&LDA_type) < 0) return nullptr; 142 | Py_INCREF(&LDA_type); 143 | PyModule_AddObject(gModule, "LDAModel", (PyObject*)&LDA_type); 144 | 145 | #ifdef TM_DMR 146 | if (PyType_Ready(&DMR_type) < 0) return nullptr; 147 | Py_INCREF(&DMR_type); 148 | PyModule_AddObject(gModule, "DMRModel", (PyObject*)&DMR_type); 149 | #endif 150 | #ifdef TM_HDP 151 | if (PyType_Ready(&HDP_type) < 0) return nullptr; 152 | Py_INCREF(&HDP_type); 153 | PyModule_AddObject(gModule, "HDPModel", (PyObject*)&HDP_type); 154 | #endif 155 | #ifdef TM_MGLDA 156 | if (PyType_Ready(&MGLDA_type) < 0) return nullptr; 157 | Py_INCREF(&MGLDA_type); 158 | PyModule_AddObject(gModule, "MGLDAModel", (PyObject*)&MGLDA_type); 159 | #endif 160 | #ifdef TM_PA 161 | if (PyType_Ready(&PA_type) < 0) return nullptr; 162 | Py_INCREF(&PA_type); 163 | PyModule_AddObject(gModule, "PAModel", (PyObject*)&PA_type); 164 | #endif 165 | #ifdef TM_HPA 166 | if (PyType_Ready(&HPA_type) < 0) return nullptr; 167 | Py_INCREF(&HPA_type); 168 | PyModule_AddObject(gModule, "HPAModel", (PyObject*)&HPA_type); 169 | #endif 170 | #ifdef TM_HLDA 171 | if (PyType_Ready(&HLDA_type) < 0) return nullptr; 172 | Py_INCREF(&HLDA_type); 173 | PyModule_AddObject(gModule, "HLDAModel", (PyObject*)&HLDA_type); 174 | #endif 175 | #ifdef TM_CT 176 | if (PyType_Ready(&CT_type) < 0) return nullptr; 177 | Py_INCREF(&CT_type); 178 | PyModule_AddObject(gModule, "CTModel", (PyObject*)&CT_type); 179 | #endif 180 | #ifdef TM_SLDA 181 | if (PyType_Ready(&SLDA_type) < 0) return nullptr; 182 | Py_INCREF(&SLDA_type); 183 | PyModule_AddObject(gModule, "SLDAModel", (PyObject*)&SLDA_type); 184 | #endif 185 | #ifdef TM_LLDA 186 | if (PyType_Ready(&LLDA_type) < 0) return nullptr; 187 | Py_INCREF(&LLDA_type); 188 | PyModule_AddObject(gModule, "LLDAModel", (PyObject*)&LLDA_type); 189 | #endif 190 | #ifdef TM_PLDA 191 | if (PyType_Ready(&PLDA_type) < 0) return nullptr; 192 | Py_INCREF(&PLDA_type); 193 | PyModule_AddObject(gModule, "PLDAModel", (PyObject*)&PLDA_type); 194 | #endif 195 | #ifdef TM_DT 196 | if (PyType_Ready(&DT_type) < 0) return nullptr; 197 | Py_INCREF(&DT_type); 198 | PyModule_AddObject(gModule, "DTModel", (PyObject*)&DT_type); 199 | #endif
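/* Registration idiom used for every model type in this function: PyType_Ready finalizes the type object (filling in inherited slots), and PyModule_AddObject steals a reference to the object it is given, so each type is Py_INCREF'd first to keep one reference owned by this extension alive alongside the one the module consumes. */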
200 | #ifdef TM_GDMR 201 | if (PyType_Ready(&GDMR_type) < 0) return nullptr; 202 | Py_INCREF(&GDMR_type); 203 | PyModule_AddObject(gModule, "GDMRModel", (PyObject*)&GDMR_type); 204 | #endif 205 | #ifdef TM_PT 206 | if (PyType_Ready(&PT_type) < 0) return nullptr; 207 | Py_INCREF(&PT_type); 208 | PyModule_AddObject(gModule, "PTModel", (PyObject*)&PT_type); 209 | #endif 210 | 211 | #ifdef __AVX2__ 212 | PyModule_AddStringConstant(gModule, "isa", "avx2"); 213 | #elif defined(__AVX__) 214 | PyModule_AddStringConstant(gModule, "isa", "avx"); 215 | #elif defined(__SSE2__) || defined(__x86_64__) || defined(_WIN64) 216 | PyModule_AddStringConstant(gModule, "isa", "sse2"); 217 | #else 218 | PyModule_AddStringConstant(gModule, "isa", isa_str); 219 | #endif 220 | addLabelTypes(gModule); 221 | addUtilsTypes(gModule); 222 | addCoherenceTypes(gModule); 223 | 224 | return gModule; 225 | }
-------------------------------------------------------------------------------- /tomotopy/_call_utils.py: --------------------------------------------------------------------------------
1 | 2 | def call_method_bound(mdl, method:str, global_methods:dict, *args, **kwargs): 3 | for c in type(mdl).mro()[:-1]: 4 | cname = c.__name__ 5 | try: 6 | func = global_methods[method + '_' + cname] 7 | except KeyError: 8 | continue 9 | return func(mdl, *args, **kwargs) 10 | raise KeyError(method + '_' + cname) 11 |
-------------------------------------------------------------------------------- /tomotopy/_show_progress.py: --------------------------------------------------------------------------------
1 | from tqdm import tqdm 2 | from ._call_utils import call_method_bound 3 | 4 | _tqdm_objs = {} 5 | 6 | def init_tqdm_LDAModel(mdl, current_iteration:int, total_iteration:int): 7 | _tqdm_objs[mdl] = tqdm(total=total_iteration, desc='Iteration') 8 | 9 | def close_tqdm_LDAModel(mdl, current_iteration:int, total_iteration:int): 10 | obj:tqdm = _tqdm_objs[mdl] 11 | obj.update(current_iteration - obj.n) 12 | obj.close() 13 | del _tqdm_objs[mdl] 14 | 15 | def progress_LDAModel(mdl, current_iteration:int, total_iteration:int): 16 | obj:tqdm = _tqdm_objs[mdl] 17 | obj.set_postfix_str(f'LLPW: {mdl.ll_per_word:.6f}') 18 | obj.update(current_iteration - obj.n) 19 | 20 | def progress_HDPModel(mdl, current_iteration:int, total_iteration:int): 21 | obj:tqdm = _tqdm_objs[mdl] 22 | obj.set_postfix_str(f'# Topics: {mdl.live_k}, LLPW: {mdl.ll_per_word:.6f}') 23 | obj.update(current_iteration - obj.n) 24 | 25 | def progress_HLDAModel(mdl, current_iteration:int, total_iteration:int): 26 | obj:tqdm = _tqdm_objs[mdl] 27 | obj.set_postfix_str(f'# Topics: {mdl.live_k}, LLPW: {mdl.ll_per_word:.6f}') 28 | obj.update(current_iteration - obj.n) 29 | 30 | def show_progress(mdl, current_iteration:int, total_iteration:int): 31 | if current_iteration == 0: 32 | call_method_bound(mdl, 'init_tqdm', globals(), current_iteration, total_iteration) 33 | elif current_iteration == total_iteration: 34 | call_method_bound(mdl, 'close_tqdm', globals(), current_iteration, total_iteration) 35 | else: 36 | call_method_bound(mdl, 'progress', globals(), current_iteration, total_iteration) 37 |
-------------------------------------------------------------------------------- /tomotopy/_version.py: --------------------------------------------------------------------------------
1 | __version__ = '0.13.0' 2 |
-------------------------------------------------------------------------------- /tomotopy/auto_labeling_code.rst: --------------------------------------------------------------------------------
1 | :: 2 | 3 | import tomotopy as tp 4 | 5 | corpus = 
tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.']) 6 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string) 7 | corpus.process(open(input_file, encoding='utf-8')) 8 | 9 | # make LDA model and train 10 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus) 11 | mdl.train(0) 12 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 13 | print('Removed top words:', mdl.removed_top_words) 14 | for i in range(0, 1000, 10): 15 | mdl.train(10) 16 | print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word)) 17 | 18 | # extract candidates for auto topic labeling 19 | extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000) 20 | cands = extractor.extract(mdl) 21 | 22 | # ranking the candidates of labels for a specific topic 23 | labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25) 24 | for k in range(mdl.k): 25 | print("== Topic #{} ==".format(k)) 26 | print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5))) 27 | for word, prob in mdl.get_topic_words(k, top_n=10): 28 | print(word, prob, sep='\t') 29 | print() 30 | 31 | # Example of Results 32 | # ----------------- 33 | # == Topic #13 == 34 | # Labels: american basebal, american actress, lawyer politician, race car driver, brown american 35 | # american 0.061747949570417404 36 | # english 0.02476435713469982 37 | # player 0.02357063814997673 38 | # politician 0.020087148994207382 39 | # footbal 0.016364915296435356 40 | # author 0.014303036034107208 41 | # actor 0.01202411763370037 42 | # french 0.009745198301970959 43 | # academ 0.009701790288090706 44 | # produc 0.008822779171168804 45 | # 46 | # == Topic #16 == 47 | # Labels: lunar, saturn, orbit moon, nasa report, orbit around 48 | # apollo 0.03052366152405739 49 | # star 0.017564402893185616 50 | # mission 0.015656694769859314 51 | # earth 0.01532777864485979 52 | # lunar 0.015130429528653622 53 | # moon 0.013683202676475048 54 | # orbit 0.011315013282001019 55 | # crew 0.01092031504958868 56 | # space 0.010821640491485596 57 | # nasa 0.009999352507293224 58 | -------------------------------------------------------------------------------- /tomotopy/auto_labeling_code_with_porter.rst: -------------------------------------------------------------------------------- 1 | :: 2 | 3 | import tomotopy as tp 4 | 5 | # This code requires nltk package for stemming. 
6 | from nltk.stem.porter import PorterStemmer 7 | from nltk.corpus import stopwords 8 | 9 | stemmer = PorterStemmer() 10 | stopwords = set(stopwords.words('english')) 11 | corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem), 12 | stopwords=lambda x: len(x) <= 2 or x in stopwords) 13 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string) 14 | corpus.process(open(input_file, encoding='utf-8')) 15 | 16 | # make LDA model and train 17 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus) 18 | mdl.train(0) 19 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words) 20 | print('Removed top words:', mdl.removed_top_words) 21 | for i in range(0, 1000, 10): 22 | mdl.train(10) 23 | print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word)) 24 | 25 | # extract candidates for auto topic labeling 26 | extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000) 27 | cands = extractor.extract(mdl) 28 | 29 | labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25) 30 | for k in range(mdl.k): 31 | print("== Topic #{} ==".format(k)) 32 | print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5))) 33 | for word, prob in mdl.get_topic_words(k, top_n=10): 34 | print(word, prob, sep='\t') 35 | print() 36 | 37 | # Example of Results 38 | # ----------------- 39 | # == Topic #13 == 40 | # Labels: weapon systems, weaponry, anti-aircraft, towed, long-range 41 | # aircraft 0.020458335056900978 42 | # use 0.019993379712104797 43 | # airlin 0.012523100711405277 44 | # car 0.012058146297931671 45 | # vehicl 0.01165518444031477 46 | # carrier 0.011531196534633636 47 | # tank 0.011221226304769516 48 | # design 0.010694277472794056 49 | # audi 0.010322313755750656 50 | # martin 0.009981346316635609 51 | # 52 | # == Topic #17 == 53 | # Labels: American baseball player, American baseball, American actress, singer-songwriter and guitarist, American actor, director, producer, and screenwriter 54 | # american 0.04471408948302269 55 | # english 0.01746685802936554 56 | # player 0.01714528724551201 57 | # politician 0.014698212035000324 58 | # footbal 0.012313882820308208 59 | # author 0.010909952223300934 60 | # actor 0.008949155919253826 61 | # french 0.007647186517715454 62 | # academ 0.0073020863346755505 63 | # produc 0.006815808825194836 64 | # 65 | -------------------------------------------------------------------------------- /tomotopy/label.py: -------------------------------------------------------------------------------- 1 | """ 2 | Submodule `tomotopy.label` provides automatic topic labeling techniques. 3 | You can label topics automatically with simple code like below. The results are attached to the bottom of the code. 4 | 5 | .. include:: ./auto_labeling_code.rst 6 | """ 7 | 8 | from _tomotopy import (_LabelCandidate, _LabelPMIExtractor, _LabelFoRelevance) 9 | 10 | Candidate = _LabelCandidate 11 | PMIExtractor = _LabelPMIExtractor 12 | FoRelevance = _LabelFoRelevance 13 | '''end of copy from pyc''' 14 | 15 | import os 16 | if os.environ.get('TOMOTOPY_LANG') == 'kr': 17 | __doc__ = """ 18 | `tomotopy.label` 서브모듈은 자동 토픽 라벨링 기법을 제공합니다. 19 | 아래에 나온 코드처럼 간단한 작업을 통해 토픽 모델의 결과에 이름을 붙일 수 있습니다. 그 결과는 코드 하단에 첨부되어 있습니다. 20 | 21 | .. 
include:: ./auto_labeling_code.rst 22 | """ 23 | del os 24 |
-------------------------------------------------------------------------------- /tomotopy/viewer/__init__.py: --------------------------------------------------------------------------------
1 | ''' 2 | .. versionadded:: 0.13.0 3 | 4 | `tomotopy.viewer` is a module for visualizing tomotopy's topic models in a web browser. It provides a simple way to explore a trained topic model and its results interactively. 5 | It is not recommended for production web services, however, because it serves pages with Python's built-in `http.server` module. 6 | 7 | The following is a simple example of how to use the viewer: 8 | :: 9 | 10 | import tomotopy as tp 11 | mdl = tp.load_model('a_trained_model.bin') 12 | tp.viewer.open_viewer(mdl, port=9999) 13 | # open http://localhost:9999 in your web browser 14 | 15 | Or you can run the viewer from the command line: 16 | :: 17 | 18 | python -m tomotopy.viewer a_trained_model.bin --host localhost --port 9999 19 | # open http://localhost:9999 in your web browser 20 | 21 | For more details, please refer to the `tomotopy.viewer.viewer_server.open_viewer` function. 22 | ''' 23 | 24 | from .viewer_server import open_viewer 25 |
-------------------------------------------------------------------------------- /tomotopy/viewer/__main__.py: --------------------------------------------------------------------------------
1 | import tomotopy as tp 2 | from tomotopy.viewer import open_viewer 3 | 4 | def main(args): 5 | model = tp.load_model(args.model) 6 | open_viewer(model, args.host, args.port, args.root_path, args.browser_title, args.model + '.json', read_only=args.read_only) 7 | 8 | if __name__ == '__main__': 9 | import argparse 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('model') 12 | parser.add_argument('--root-path', default='/') 13 | parser.add_argument('--browser-title') 14 | parser.add_argument('--host', default='localhost') 15 | parser.add_argument('-p', '--port', type=int, default=9999) 16 | parser.add_argument('-r', '--read-only', action='store_true') 17 | main(parser.parse_args()) 18 | --------------------------------------------------------------------------------