├── .github
│   ├── FUNDING.yml
│   └── workflows
│       ├── deploy.yml
│       ├── deploy_test.yml
│       ├── generate_documentation.yml
│       ├── numpy_version.py
│       └── pull_request_test.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── MANIFEST.in
├── README.kr.rst
├── README.rst
├── benchmark.py
├── document
│   ├── document_header.html
│   └── pdoc_localization.sh
├── examples
│   ├── coherence.py
│   ├── corpus_and_inference.py
│   ├── corpus_and_labeling.py
│   ├── ctm_network.py
│   ├── dmr_multi_label.py
│   ├── dmr_plot.py
│   ├── dtm.py
│   ├── extract_ngram.py
│   ├── gdmr_both_categorical_and_numerical.py
│   ├── gdmr_plot.py
│   ├── hdp_basic.py
│   ├── hdp_visualization.py
│   ├── hlda_basic.py
│   ├── lda_basic.py
│   ├── lda_visualization.py
│   ├── raw_corpus_and_labeling.py
│   └── word_prior.py
├── licenses_bundled
│   ├── EigenRand
│   └── MapboxVariant
├── requirements.txt
├── setup.py
├── src
│   ├── Coherence
│   │   ├── CoherenceModel.hpp
│   │   ├── Common.h
│   │   ├── ConfirmMeasurer.hpp
│   │   ├── ProbEstimator.hpp
│   │   └── Segmentor.hpp
│   ├── Labeling
│   │   ├── FoRelevance.cpp
│   │   ├── FoRelevance.h
│   │   ├── Labeler.h
│   │   └── Phraser.hpp
│   ├── TopicModel
│   │   ├── CT.h
│   │   ├── CTModel.cpp
│   │   ├── CTModel.hpp
│   │   ├── DMR.h
│   │   ├── DMRModel.cpp
│   │   ├── DMRModel.hpp
│   │   ├── DT.h
│   │   ├── DTM.h
│   │   ├── DTModel.cpp
│   │   ├── DTModel.hpp
│   │   ├── GDMR.h
│   │   ├── GDMRModel.cpp
│   │   ├── GDMRModel.hpp
│   │   ├── HDP.h
│   │   ├── HDPModel.cpp
│   │   ├── HDPModel.hpp
│   │   ├── HLDA.h
│   │   ├── HLDAModel.cpp
│   │   ├── HLDAModel.hpp
│   │   ├── HPA.h
│   │   ├── HPAModel.cpp
│   │   ├── HPAModel.hpp
│   │   ├── LDA.h
│   │   ├── LDACVB0Model.hpp
│   │   ├── LDAModel.cpp
│   │   ├── LDAModel.hpp
│   │   ├── LLDA.h
│   │   ├── LLDAModel.cpp
│   │   ├── LLDAModel.hpp
│   │   ├── MGLDA.h
│   │   ├── MGLDAModel.cpp
│   │   ├── MGLDAModel.hpp
│   │   ├── PA.h
│   │   ├── PAModel.cpp
│   │   ├── PAModel.hpp
│   │   ├── PLDA.h
│   │   ├── PLDAModel.cpp
│   │   ├── PLDAModel.hpp
│   │   ├── PT.h
│   │   ├── PTModel.cpp
│   │   ├── PTModel.hpp
│   │   ├── SLDA.h
│   │   ├── SLDAModel.cpp
│   │   ├── SLDAModel.hpp
│   │   └── TopicModel.hpp
│   ├── Utils
│   │   ├── AliasMethod.hpp
│   │   ├── Dictionary.cpp
│   │   ├── Dictionary.h
│   │   ├── EigenAddonOps.hpp
│   │   ├── LBFGS.h
│   │   ├── LBFGS
│   │   │   ├── LineSearchBacktracking.h
│   │   │   ├── LineSearchBracketing.h
│   │   │   └── Param.h
│   │   ├── LUT.hpp
│   │   ├── Mmap.cpp
│   │   ├── Mmap.h
│   │   ├── MultiNormalDistribution.hpp
│   │   ├── PolyaGamma.hpp
│   │   ├── PolyaGammaHybrid.hpp
│   │   ├── SharedString.cpp
│   │   ├── SharedString.h
│   │   ├── ThreadPool.hpp
│   │   ├── Trie.hpp
│   │   ├── TruncMultiNormal.hpp
│   │   ├── Utils.hpp
│   │   ├── avx_gamma.h
│   │   ├── avx_mathfun.h
│   │   ├── exception.h
│   │   ├── math.h
│   │   ├── neon_gamma.h
│   │   ├── rtnorm.hpp
│   │   ├── sample.hpp
│   │   ├── serializer.cpp
│   │   ├── serializer.hpp
│   │   ├── slp.hpp
│   │   ├── sse_gamma.h
│   │   ├── sse_mathfun.h
│   │   ├── text.hpp
│   │   └── tvector.hpp
│   └── python
│       ├── dispatcher
│       │   └── py_rt.cpp
│       └── handler
│           ├── PyUtils.h
│           ├── coherence.h
│           ├── docs.h
│           ├── label.h
│           ├── label_docs.h
│           ├── module.h
│           ├── py_CT.cpp
│           ├── py_DMR.cpp
│           ├── py_DT.cpp
│           ├── py_GDMR.cpp
│           ├── py_HDP.cpp
│           ├── py_HLDA.cpp
│           ├── py_HPA.cpp
│           ├── py_LDA.cpp
│           ├── py_LLDA.cpp
│           ├── py_MGLDA.cpp
│           ├── py_PA.cpp
│           ├── py_PLDA.cpp
│           ├── py_PT.cpp
│           ├── py_SLDA.cpp
│           ├── py_coherence.cpp
│           ├── py_label.cpp
│           ├── py_main.cpp
│           ├── py_utils.cpp
│           └── utils.h
├── test
│   ├── sample.txt
│   ├── sample_raw.txt
│   ├── sample_tp.txt
│   ├── sample_with_md.txt
│   └── unit_test.py
└── tomotopy
    ├── __init__.py
    ├── _call_utils.py
    ├── _show_progress.py
    ├── _summary.py
    ├── _version.py
    ├── auto_labeling_code.rst
    ├── auto_labeling_code_with_porter.rst
    ├── coherence.py
    ├── documentation.kr.rst
    ├── documentation.rst
    ├── label.py
    ├── utils.py
    └── viewer
        ├── __init__.py
        ├── __main__.py
        ├── template.html
        └── viewer_server.py
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [bab2min]
4 |
--------------------------------------------------------------------------------
/.github/workflows/generate_documentation.yml:
--------------------------------------------------------------------------------
1 | name: Generate the documentation
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*.*.*'
7 | - '!v*.*.*d'
8 |
9 | jobs:
10 | build_manylinux:
11 | name: Build for manylinux
12 | runs-on: ubuntu-latest
13 | container:
14 | image: docker://quay.io/pypa/manylinux2014_x86_64
15 | strategy:
16 | max-parallel: 1
17 | matrix:
18 | language: [kr, en]
19 |
20 | steps:
21 | - uses: actions/checkout@v1
22 | - name: Install dependencies
23 | run: |
24 | /opt/python/cp39-cp39/bin/python -m pip install --upgrade pip
25 | yum install libffi-devel -y
26 | git clone https://gitlab.com/libeigen/eigen
27 | cd eigen
28 | git checkout tags/3.4.0
29 | cd ..
30 | mkdir include
31 | mv eigen/Eigen/ include/
32 | git clone https://github.com/bab2min/EigenRand
33 | cd EigenRand
34 | git checkout tags/v0.4.1
35 | cd ..
36 | mv EigenRand/EigenRand include/
37 | git clone https://github.com/mapbox/variant
38 | cd variant
39 | git checkout tags/v1.1.3
40 | cd ..
41 | mv variant/include/mapbox include/
42 | - name: build
43 | run: |
44 | /opt/python/cp39-cp39/bin/python -m pip install numpy==`/opt/python/cp39-cp39/bin/python .github/workflows/numpy_version.py`
45 | /opt/python/cp39-cp39/bin/python -m pip install pdoc3==0.8.4
46 | export TOMOTOPY_LANG=${{ matrix.language }}
47 | /opt/python/cp39-cp39/bin/python setup.py install
48 | - name: gen doc
49 | run: |
50 | export TOMOTOPY_VER="`/opt/python/cp39-cp39/bin/python -m pip show tomotopy | grep Version | cut -d' ' -f2`"
51 | export TOMOTOPY_LANG=${{ matrix.language }}
52 | /opt/python/cp39-cp39/bin/python -m pdoc --html tomotopy
53 | sed -i -E "s/documentation<\/title>/documentation (v${TOMOTOPY_VER})<\/title>/" html/tomotopy/*.html
66 |
67 |
--------------------------------------------------------------------------------
/document/pdoc_localization.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ "$TOMOTOPY_LANG" = "kr" ]; then
4 | sed -i -E "s/Parameters<\/h2>/파라미터<\/h2>/g" $@
5 | sed -i -E "s/Added in version:/추가된 버전:/g" $@
6 | sed -i -E "s/Instance variables<\/h3>/인스턴스 변수<\/h3>/g" $@
7 | sed -i -E "s/Methods<\/h3>/메소드<\/h3>/g" $@
8 | sed -i -E "s/Inherited members<\/h3>/상속받은 메소드 및 변수<\/h3>/g" $@
9 | sed -i -E "s/Ancestors<\/h3>/부모 클래스<\/h3>/g" $@
10 | sed -i -E "s/Super-module<\/h3>/상위 모듈<\/h3>/g" $@
11 | sed -i -E "s/Sub-modules<\/a>/하위 모듈<\/a>/g" $@
12 | sed -i -E "s/Global variables<\/a>/전역 변수<\/a>/g" $@
13 | sed -i -E "s/Classes<\/a>/클래스<\/a>/g" $@
14 | fi
15 |
--------------------------------------------------------------------------------
/examples/coherence.py:
--------------------------------------------------------------------------------
1 | '''
2 | This example shows how to perform a Latent Dirichlet Allocation
3 | and calculate coherence of the results.
4 |
5 | Required Packages:
6 | nltk, sklearn
7 | '''
8 |
9 | import tomotopy as tp
10 | import nltk
11 | from nltk.corpus import stopwords
12 | import re
13 | from sklearn.datasets import fetch_20newsgroups
14 | import itertools
15 |
16 | print('Training lda models...')
17 | try:
18 | # load the trained model if it already exists
19 | mdl = tp.LDAModel.load('trained_lda_model.bin')
20 | except:
21 | porter_stemmer = nltk.PorterStemmer().stem
22 | english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
23 | pat = re.compile('^[a-z]{2,}$')
24 | corpus = tp.utils.Corpus(
25 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
26 | stopwords=lambda x: x in english_stops or not pat.match(x)
27 | )
28 | newsgroups_train = fetch_20newsgroups()
29 | corpus.process(d.lower() for d in newsgroups_train.data)
30 |
31 | mdl = tp.LDAModel(min_df=5, rm_top=30, k=20, corpus=corpus)
32 | mdl.train(0)
33 |
34 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
35 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
36 | ))
37 | print('Removed Top words: ', *mdl.removed_top_words)
38 |
39 | # Let's train the model
40 | mdl.train(1000, show_progress=True)
41 | mdl.summary()
42 |
43 | # save lda model for reuse
44 | mdl.save('trained_lda_model.bin')
45 |
46 | # calculate coherence using preset
47 | for preset in ('u_mass', 'c_uci', 'c_npmi', 'c_v'):
48 | coh = tp.coherence.Coherence(mdl, coherence=preset)
49 | average_coherence = coh.get_score()
50 | coherence_per_topic = [coh.get_score(topic_id=k) for k in range(mdl.k)]
51 | print('==== Coherence : {} ===='.format(preset))
52 | print('Average:', average_coherence, '\nPer Topic:', coherence_per_topic)
53 | print()
54 |
55 | # calculate coherence using custom combination
56 | for seg, cm, im in itertools.product(tp.coherence.Segmentation, tp.coherence.ConfirmMeasure, tp.coherence.IndirectMeasure):
57 | coh = tp.coherence.Coherence(mdl, coherence=(tp.coherence.ProbEstimation.DOCUMENT, seg, cm, im))
58 | average_coherence = coh.get_score()
59 | coherence_per_topic = [coh.get_score(topic_id=k) for k in range(mdl.k)]
60 | print('==== Coherence : {}, {}, {} ===='.format(repr(seg), repr(cm), repr(im)))
61 | print('Average:', average_coherence, '\nPer Topic:', coherence_per_topic)
62 | print()
63 |
--------------------------------------------------------------------------------
/examples/corpus_and_inference.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tomotopy as tp
3 |
4 | # You can get the sample data file 'enwiki-stemmed-1000.txt'
5 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
6 |
7 | def infer_new_corpus():
8 | '''
9 | Since version 0.10.0, inference using an instance of `Corpus` has been supported.
10 | '''
11 |
12 | train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
13 | train_corpus.process(open('enwiki-stemmed-1000.txt', encoding='utf-8'))
14 |
15 | test_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
16 | test_corpus.process(open('corpus_to_be_inferred.txt', encoding='utf-8'))
17 |
18 | # make LDA model and train
19 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=train_corpus)
20 | mdl.train(0)
21 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
22 | print('Removed top words:', mdl.removed_top_words)
23 |
24 | mdl.train(1000, show_progress=True)
25 | mdl.summary()
26 |
27 | inferred_corpus, ll = mdl.infer(test_corpus)
28 |
29 | # print topic distributions of each document
30 | for doc in inferred_corpus:
31 | #print(doc.raw) # print raw string of the document
32 | #print(list(doc)) # print a list of words within the document
33 | print(doc.get_topic_dist())
34 |
35 | def infer_new_doc():
36 | '''
37 | Prior to version 0.10.0, we had to make instances of `Document` using `make_doc` first
38 | and call `infer`.
39 | '''
40 | train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
41 | train_corpus.process(open('enwiki-stemmed-1000.txt', encoding='utf-8'))
42 |
43 | # make LDA model and train
44 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=train_corpus)
45 | mdl.train(0)
46 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
47 | print('Removed top words:', mdl.removed_top_words)
48 | for i in range(0, 1000, 10):
49 | mdl.train(10)
50 | print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
51 |
52 | mdl.summary()
53 |
54 | docs = []
55 | for line in open('enwiki-stemmed-1000.txt', encoding='utf-8'):
56 | docs.append(mdl.make_doc(line.lower().split()))
57 |
58 | topic_distributions, ll = mdl.infer(docs)
59 |
60 | # print topic distributions of each document
61 | for doc, topic_dist in zip(docs, topic_distributions):
62 | #print(doc)
63 | print(topic_dist)
64 |
65 | infer_new_corpus()
66 | infer_new_doc()
--------------------------------------------------------------------------------
/examples/corpus_and_labeling.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tomotopy as tp
3 |
4 | def corpus_and_labeling_example(input_file):
5 | corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
6 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
7 | corpus.process(open(input_file, encoding='utf-8'))
8 |
9 | # make LDA model and train
10 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
11 | mdl.train(0)
12 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
13 | print('Removed top words:', mdl.removed_top_words)
14 |
15 | mdl.train(1000, show_progress=True)
16 | mdl.summary()
17 |
18 | # extract candidates for auto topic labeling
19 | extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
20 | cands = extractor.extract(mdl)
21 |
22 | labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
23 | for k in range(mdl.k):
24 | print("== Topic #{} ==".format(k))
25 | print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
26 | for word, prob in mdl.get_topic_words(k, top_n=10):
27 | print(word, prob, sep='\t')
28 | print()
29 |
30 | # You can get the sample data file 'enwiki-stemmed-1000.txt'
31 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
32 |
33 | print('Running LDA and Labeling')
34 | corpus_and_labeling_example('enwiki-stemmed-1000.txt')
35 |
--------------------------------------------------------------------------------
/examples/ctm_network.py:
--------------------------------------------------------------------------------
1 | '''
2 | This example shows how to perform a Correlated Topic Model using tomotopy
3 | and visualize the correlation between topics.
4 |
5 |
6 | Required Packages:
7 | nltk, sklearn, pyvis
8 | '''
9 |
10 | import tomotopy as tp
11 | import nltk
12 | from nltk.corpus import stopwords
13 | import re
14 | from sklearn.datasets import fetch_20newsgroups
15 | from pyvis.network import Network
16 |
17 | try:
18 | # load if preprocessed corpus exists
19 | corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
20 | except IOError:
21 | porter_stemmer = nltk.PorterStemmer().stem
22 | english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
23 | pat = re.compile('^[a-z]{2,}$')
24 | corpus = tp.utils.Corpus(
25 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
26 | stopwords=lambda x: x in english_stops or not pat.match(x)
27 | )
28 | newsgroups_train = fetch_20newsgroups()
29 | corpus.process(d.lower() for d in newsgroups_train.data)
30 | # save preprocessed corpus for reuse
31 | corpus.save('preprocessed_20news.cps')
32 |
33 | mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=40, k=30, corpus=corpus)
34 | mdl.train(0)
35 |
36 | # Since we have more than ten thousand documents,
37 | # setting `num_beta_sample` to a smaller value will not make the result inaccurate.
38 | mdl.num_beta_sample = 5
39 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
40 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
41 | ))
42 | print('Removed Top words: ', *mdl.removed_top_words)
43 |
44 | # Let's train the model
45 | mdl.train(1000, show_progress=True)
46 | mdl.summary()
47 |
48 | # Let's visualize the result
49 | g = Network(width=800, height=800, font_color="#333")
50 | correl = mdl.get_correlations().reshape([-1])
51 | correl.sort()
52 | top_tenth = mdl.k * (mdl.k - 1) // 10
53 | top_tenth = correl[-mdl.k - top_tenth]
54 |
55 | for k in range(mdl.k):
56 | label = "#{}".format(k)
57 | title= ' '.join(word for word, _ in mdl.get_topic_words(k, top_n=6))
58 | print('Topic', label, title)
59 | g.add_node(k, label=label, title=title, shape='ellipse')
60 | for l, correlation in zip(range(k - 1), mdl.get_correlations(k)):
61 | if correlation < top_tenth: continue
62 | g.add_edge(k, l, value=float(correlation), title='{:.02}'.format(correlation))
63 |
64 | g.barnes_hut(gravity=-1000, spring_length=20)
65 | g.show_buttons()
66 | g.show("topic_network.html")
67 |
--------------------------------------------------------------------------------
/examples/dmr_multi_label.py:
--------------------------------------------------------------------------------
1 | '''
2 | This example shows how to perform a DMR topic model with multi-label metadata using tomotopy
3 | '''
4 | import itertools
5 |
6 | import tomotopy as tp
7 | import numpy as np
8 |
9 | # You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data .
10 | corpus = tp.utils.Corpus()
11 | for line in open('text_mining_year_journal.txt', encoding='utf-8'):
12 | fd = line.strip().split('\t', maxsplit=2)
13 | corpus.add_doc(fd[2].split(), multi_metadata=['y_' + fd[0], 'j_' + fd[1]])
14 | # We add the prefix 'y_' for year labels and 'j_' for journal labels
15 |
16 | # Build a DMR model over the year and journal labels attached to each
17 | # document above via `multi_metadata`.
18 | mdl = tp.DMRModel(tw=tp.TermWeight.ONE,
19 | k=20,
20 | corpus=corpus
21 | )
22 | mdl.optim_interval = 20
23 | mdl.burn_in = 200
24 |
25 | mdl.train(0)
26 |
27 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
28 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
29 | ))
30 |
31 | # Let's train the model
32 | mdl.train(2000, show_progress=True)
33 |
34 | mdl.summary()
35 |
36 | year_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('y_'))
37 | journal_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('j_'))
38 |
39 | # calculate topic distribution with each metadata using get_topic_prior()
40 | print('Topic distributions by year')
41 | for l in year_labels:
42 | print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n')
43 |
44 | print('Topic distributions by journal')
45 | for l in journal_labels:
46 | print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n')
47 |
48 | # Also we can estimate topic distributions with multiple metadata
49 | print('Topic distributions by year-journal')
50 | for y, j in itertools.product(year_labels, journal_labels):
51 | print(y, ',', j, '\n', mdl.get_topic_prior(multi_metadata=[y, j]), '\n')
52 |
--------------------------------------------------------------------------------
/examples/dmr_plot.py:
--------------------------------------------------------------------------------
1 | '''
2 | This example shows how to perform a DMR topic model using tomotopy
3 | and visualize the topic distribution for each metadata value
4 |
5 | Required Packages:
6 | matplotlib
7 | '''
8 |
9 | import tomotopy as tp
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 |
13 | '''
14 | You can get the sample data file from https://drive.google.com/file/d/1AUHdwaPzw5qW0j8MaKqFNfw-SQDMbIzw/view?usp=sharing .
15 | '''
16 |
17 | corpus = tp.utils.Corpus()
18 | for line in open('text_mining.txt', encoding='utf-8'):
19 | fd = line.strip().split('\t')
20 | corpus.add_doc(fd[1].lower().split(), metadata=fd[0])
21 |
22 | # The first column of each line was attached to the document as its
23 | # categorical `metadata` label; DMR estimates a topic prior per label.
24 | mdl = tp.DMRModel(tw=tp.TermWeight.PMI,
25 | k=15,
26 | corpus=corpus
27 | )
28 | mdl.optim_interval = 20
29 | mdl.burn_in = 200
30 |
31 | mdl.train(0)
32 |
33 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
34 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
35 | ))
36 |
37 | # Let's train the model
38 | mdl.train(2000, show_progress=True)
39 |
40 | mdl.summary()
41 |
42 | # calculate topic distribution for each metadata using softmax
43 | probs = np.exp(mdl.lambdas - mdl.lambdas.max(axis=0))
44 | probs /= probs.sum(axis=0)
45 |
46 | print('Topic distributions for each metadata')
47 | for f, metadata_name in enumerate(mdl.metadata_dict):
48 | print(metadata_name, probs[:, f], '\n')
49 |
50 | x = np.arange(mdl.k)
51 | width = 1 / (mdl.f + 2)
52 |
53 | fig, ax = plt.subplots()
54 | for f, metadata_name in enumerate(mdl.metadata_dict):
55 | ax.bar(x + width * (f - mdl.f / 2), probs[:, f], width, label=mdl.metadata_dict[f])
56 |
57 | ax.set_ylabel('Probabilities')
58 | ax.set_yscale('log')
59 | ax.set_title('Topic distributions')
60 | ax.set_xticks(x)
61 | ax.set_xticklabels(['Topic #{}'.format(k) for k in range(mdl.k)])
62 | ax.legend()
63 |
64 | fig.tight_layout()
65 | plt.show()
66 |
--------------------------------------------------------------------------------
/examples/dtm.py:
--------------------------------------------------------------------------------
1 | import tomotopy as tp
2 | import numpy as np
3 | import nltk
4 | import pyLDAvis
5 |
6 | def data_feeder(input_file):
7 | for line in open(input_file, encoding='utf-8'):
8 | fd = line.strip().split(maxsplit=1)
9 | timepoint = int(fd[0])
10 | yield fd[1], None, {'timepoint':timepoint}
11 |
12 | porter_stemmer = nltk.PorterStemmer().stem
13 | corpus = tp.utils.Corpus(
14 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer)
15 | )
16 | corpus.process(data_feeder('../test/sample_tp.txt'))
17 |
18 | mdl = tp.DTModel(min_cf=3, k=10, t=13, phi_var=1e-2, corpus=corpus)
19 | mdl.train(0)
20 |
21 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
22 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
23 | ))
24 | print('Removed Top words: ', *mdl.removed_top_words)
25 |
26 | # Let's train the model
27 | mdl.train(1000, show_progress=True)
28 | mdl.summary()
29 |
30 | topic_dist_by_time = np.zeros(shape=[mdl.num_timepoints, mdl.k], dtype=np.float64)
31 | for doc in mdl.docs:
32 | topic_dist_by_time[doc.timepoint] += doc.get_topic_dist()
33 |
34 | topic_dist_by_time /= mdl.num_docs_by_timepoint[:, np.newaxis]
35 |
36 | for k in range(mdl.k):
37 | print('Topic #{}'.format(k), *(w for w, _ in mdl.get_topic_words(k, 0, top_n=5)))
38 | print(topic_dist_by_time[:, k])
39 |
40 | for timepoint in range(mdl.num_timepoints):
41 | topic_term_dists = np.stack([mdl.get_topic_word_dist(k, timepoint=timepoint) for k in range(mdl.k)])
42 | doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs if doc.timepoint == timepoint])
43 | doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
44 | doc_lengths = np.array([len(doc.words) for doc in mdl.docs if doc.timepoint == timepoint])
45 | vocab = list(mdl.used_vocabs)
46 | term_frequency = mdl.used_vocab_freq
47 |
48 | prepared_data = pyLDAvis.prepare(
49 | topic_term_dists,
50 | doc_topic_dists,
51 | doc_lengths,
52 | vocab,
53 | term_frequency,
54 | start_index=0,
55 | sort_topics=False
56 | )
57 | pyLDAvis.save_html(prepared_data, 'dtmvis_{}.html'.format(timepoint))
58 |
--------------------------------------------------------------------------------
/examples/extract_ngram.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tomotopy as tp
3 |
4 | def extract_ngrams_example(input_file):
5 | from nltk.corpus import stopwords
6 | stops = set(stopwords.words('english'))
7 | stops.update(['many', 'also', 'would', 'often', 'could'])
8 | corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(),
9 | stopwords=lambda x: len(x) <= 2 or x in stops)
10 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
11 | corpus.process(open(input_file, encoding='utf-8'))
12 |
13 | # extract the n-gram candidates first
14 | cands = corpus.extract_ngrams(min_cf=20, min_df=10, max_len=5, max_cand=1000, normalized=False)
15 | print('==== extracted n-gram collocations (using PMI) ====')
16 | for cand in cands:
17 | print(cand)
18 |
19 | # it prints like:
20 | # tomotopy.label.Candidate(words=["academic","nobel","prize","laureate"], name="", score=23.376673)
21 | # tomotopy.label.Candidate(words=["canadian","ice","hockey","player"], name="", score=21.658447)
22 | # tomotopy.label.Candidate(words=["english","race","car","driver"], name="", score=20.356688)
23 | # tomotopy.label.Candidate(words=["australian","rugby","league","player"], name="", score=20.124966)
24 | # tomotopy.label.Candidate(words=["american","race","car","driver"], name="", score=19.717760)
25 | # tomotopy.label.Candidate(words=["new","zealand","rugby","player"], name="", score=18.866398)
26 | # tomotopy.label.Candidate(words=["american","ice","hockey","player"], name="", score=17.599983)
27 | # tomotopy.label.Candidate(words=["american","actor","director","producer"], name="", score=16.722300)
28 | # tomotopy.label.Candidate(words=["nobel","prize","laureate"], name="", score=16.635370)
29 | # tomotopy.label.Candidate(words=["eastern","orthodox","liturgics"], name="", score=16.540277)
30 | # ...
31 |
32 | cands = corpus.extract_ngrams(min_cf=20, min_df=10, max_len=5, max_cand=1000, normalized=True)
33 | print('==== extracted n-gram collocations (using Normalized PMI) ====')
34 | for cand in cands:
35 | print(cand)
36 |
37 | # it prints like:
38 | # tomotopy.label.Candidate(words=["buenos","aires"], name="", score=0.996445)
39 | # tomotopy.label.Candidate(words=["los","angeles"], name="", score=0.988719)
40 | # tomotopy.label.Candidate(words=["las","vegas"], name="", score=0.982273)
41 | # tomotopy.label.Candidate(words=["hong","kong"], name="", score=0.978606)
42 | # tomotopy.label.Candidate(words=["hip","hop"], name="", score=0.965971)
43 | # tomotopy.label.Candidate(words=["nova","scotia"], name="", score=0.957440)
44 | # tomotopy.label.Candidate(words=["ice","hockey"], name="", score=0.932300)
45 | # tomotopy.label.Candidate(words=["nobel","prize","laureate"], name="", score=0.927281)
46 | # tomotopy.label.Candidate(words=["sri","lankan"], name="", score=0.925504)
47 | # tomotopy.label.Candidate(words=["ann","arbor"], name="", score=0.921129)
48 | # ...
49 |
50 | # before concat
51 | print(corpus[3])
52 |
53 | # concat n-grams in the corpus
54 | corpus.concat_ngrams(cands, delimiter='_')
55 |
56 | # after concat
57 | print(corpus[3])
58 |
59 | # You can get the sample data file 'enwiki-1000.txt'
60 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
61 |
62 | extract_ngrams_example('enwiki-1000.txt')
63 |
--------------------------------------------------------------------------------
/examples/gdmr_both_categorical_and_numerical.py:
--------------------------------------------------------------------------------
1 | '''
2 | This example shows how to perform a g-DMR topic model
3 | for a mixture of categorical and numerical metadata using tomotopy
4 | and visualize a topic distribution.
5 |
6 | Required Packages:
7 | matplotlib
8 | '''
9 |
10 | import tomotopy as tp
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | import matplotlib.colors as clr
14 | import re
15 |
16 | #You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data .
17 | corpus = tp.utils.Corpus()
18 | for line in open('text_mining_year_journal.txt', encoding='utf-8'):
19 | fd = line.strip().split('\t', maxsplit=2)
20 | corpus.add_doc(fd[2].split(), numeric_metadata=[float(fd[0])], metadata=fd[1])
21 | # Use the argument `numeric_metadata` for continuous numerical metadata (list of float type),
22 | # and the argument `metadata` for categorical metadata (str type)
23 |
24 | # We set a range of the numeric metadata as [2000, 2017].
25 | # `decay=1.0` penalizes higher-order terms of lambdas to prevent overfitting.
26 | mdl = tp.GDMRModel(tw=tp.TermWeight.ONE, k=30, degrees=[6],
27 | alpha=1e-2, sigma=0.25, sigma0=3.0, decay=1.0,
28 | metadata_range=[(2000, 2017)], corpus=corpus
29 | )
30 | mdl.optim_interval = 20
31 | mdl.burn_in = 200
32 |
33 | mdl.train(0)
34 |
35 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
36 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
37 | ))
38 |
39 | # Let's train the model
40 | mdl.train(1000, show_progress=True)
41 | mdl.summary()
42 |
43 | # Let's visualize the result
44 | topic_counts = mdl.get_count_by_topics()
45 | lambdas = mdl.lambdas
46 | lambdas = lambdas.reshape(lambdas.shape[:1] + (len(mdl.metadata_dict), -1))
47 | # lambdas shape: [num_topics, num_categorical_metadata, degrees + 1]
48 |
49 | md_range = mdl.metadata_range
50 | r = np.stack([mdl.tdf_linspace(
51 | [md_range[0][0]],
52 | [md_range[0][1]],
53 | [50], # interpolation size
54 | cat
55 | ) for cat in mdl.metadata_dict])
56 | # r shape: [num_categorical_metadata, 50, num_topics]
57 |
58 | xs = np.linspace(*md_range[0], 50)
59 | for k in (-topic_counts).argsort():
60 | print('Topic #{} ({})'.format(k, topic_counts[k]))
61 | print(*(w for w, _ in mdl.get_topic_words(k)))
62 | print('Lambda:', lambdas[k].reshape((len(mdl.metadata_dict), -1)))
63 |
64 | for label, ys in zip(mdl.metadata_dict, r[:, :, k]):
65 | label = re.sub(r'^(Proceedings|Journal)( of)?( the)?( -)?|International Conference on', '', label).strip()
66 | if len(label) >= 35: label = label[:33] + '...'
67 | plt.plot(xs, ys, linewidth=2, label=label)
68 | plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5))))
69 | plt.legend()
70 | plt.show()
71 |
--------------------------------------------------------------------------------
/examples/gdmr_plot.py:
--------------------------------------------------------------------------------
1 | '''
2 | This example shows how to perform a g-DMR topic model using tomotopy
3 | and visualize a topic distribution map.
4 |
5 | Required Packages:
6 | matplotlib
7 | '''
8 |
9 | import tomotopy as tp
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | import matplotlib.colors as clr
13 |
14 | class ExpNormalize(clr.Normalize):
15 | def __init__(self, scale):
16 | super().__init__()
17 | self.scale = scale
18 |
19 | def __call__(self, value, clip=None):
20 | if clip is None:
21 | clip = self.clip
22 |
23 | result, is_scalar = self.process_value(value)
24 |
25 | self.autoscale_None(result)
26 | (vmin,), _ = self.process_value(self.vmin)
27 | (vmax,), _ = self.process_value(self.vmax)
28 | if vmin == vmax:
29 | result.fill(0)
30 | elif vmin > vmax:
31 | raise ValueError("minvalue must be less than or equal to maxvalue")
32 | else:
33 | if clip:
34 | mask = np.ma.getmask(result)
35 | result = np.ma.array(np.clip(result.filled(vmax), vmin, vmax),
36 | mask=mask)
37 | resdat = result.data
38 | resdat = 1 - np.exp(-2 * resdat / self.scale)
39 | result = np.ma.array(resdat, mask=result.mask, copy=False)
40 | if is_scalar:
41 | result = result[0]
42 | return result
43 |
44 | heat = clr.LinearSegmentedColormap.from_list('heat',
45 | [(0, 0, 0), (0, 0, 1), (0, 1, 1), (0, 1, 0), (1, 1, 0), (1, 0, 0), (1, 1, 1)],
46 | N=1024
47 | )
48 |
49 | '''
50 | You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data .
51 | '''
52 |
53 | corpus = tp.utils.Corpus()
54 | for line in open('dataset2.txt', encoding='utf-8'):
55 | fd = line.strip().split()
56 | corpus.add_doc(fd[2:], numeric_metadata=list(map(float, fd[:2])))
57 |
58 | # We set a range of the first metadata as [2000, 2017]
59 | # and one of the second metadata as [0, 1].
60 | mdl = tp.GDMRModel(tw=tp.TermWeight.PMI, k=30, degrees=[4, 3],
61 | alpha=1e-2, sigma=0.25, sigma0=3.0,
62 | metadata_range=[(2000, 2017), (0, 1)], corpus=corpus
63 | )
64 | mdl.optim_interval = 20
65 | mdl.burn_in = 200
66 |
67 | mdl.train(0)
68 |
69 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
70 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
71 | ))
72 |
73 | # Let's train the model
74 | mdl.train(1000, show_progress=True)
75 | mdl.summary()
76 |
77 | # Let's visualize the result
78 | topic_counts = mdl.get_count_by_topics()
79 | lambdas = mdl.lambdas
80 |
81 | md_range = mdl.metadata_range
82 | # Our topic distribution map has
83 | # 400 pixels for the first axis and
84 | # 200 pixels for the second axis.
85 | r = mdl.tdf_linspace(
86 | [md_range[0][0], md_range[1][0]],
87 | [md_range[0][1], md_range[1][1]],
88 | [400, 200]
89 | )
90 |
91 | for k in (-topic_counts).argsort():
92 | print('Topic #{} ({})'.format(k, topic_counts[k]))
93 | print(*(w for w, _ in mdl.get_topic_words(k)))
94 | print('Lambda:', lambdas[k])
95 |
96 | imgplot = plt.imshow(r[:, :, k].transpose(), clim=(0.0, r[:, :, k].max()),
97 | origin='lower', cmap=heat, norm=ExpNormalize(scale=0.04),
98 | extent=[*md_range[0], *md_range[1]],
99 | aspect='auto'
100 | )
101 | plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5))))
102 | plt.colorbar()
103 | plt.show()
104 |
--------------------------------------------------------------------------------
/examples/hdp_basic.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tomotopy as tp
3 |
4 | def hdp_example(input_file, save_path):
5 | mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)
6 | for n, line in enumerate(open(input_file, encoding='utf-8')):
7 | ch = line.strip().split()
8 | mdl.add_doc(ch)
9 | mdl.burn_in = 100
10 | mdl.train(0)
11 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
12 | print('Removed top words:', mdl.removed_top_words)
13 | print('Training...', file=sys.stderr, flush=True)
14 | mdl.train(1000, show_progress=True)
15 | mdl.summary()
16 | print('Saving...', file=sys.stderr, flush=True)
17 | mdl.save(save_path, True)
18 |
19 | important_topics = [k for k, v in sorted(enumerate(mdl.get_count_by_topics()), key=lambda x:x[1], reverse=True)]
20 | for k in important_topics:
21 | if not mdl.is_live_topic(k): continue
22 | print('Topic #{}'.format(k))
23 | for word, prob in mdl.get_topic_words(k):
24 | print('\t', word, prob, sep='\t')
25 | # You can get the sample data file 'enwiki-stemmed-1000.txt'
26 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
27 |
28 | print('Running HDP')
29 | hdp_example('enwiki-stemmed-1000.txt', 'test.hdp.bin')
30 |
--------------------------------------------------------------------------------
/examples/hdp_visualization.py:
--------------------------------------------------------------------------------
1 | '''
2 | This example shows how to perform a Hierarchical Dirichlet Process using tomotopy
3 | and visualize the result.
4 |
5 |
6 | Required Packages:
7 | nltk, sklearn, pyldavis
8 | '''
9 |
10 | import tomotopy as tp
11 | import nltk
12 | from nltk.corpus import stopwords
13 | import re
14 | from sklearn.datasets import fetch_20newsgroups
15 | import numpy as np
16 | import pyLDAvis
17 |
18 | try:
19 | # load if preprocessed corpus exists
20 | corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
21 | except IOError:
22 | porter_stemmer = nltk.PorterStemmer().stem
23 | english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
24 | pat = re.compile('^[a-z]{2,}$')
25 | corpus = tp.utils.Corpus(
26 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
27 | stopwords=lambda x: x in english_stops or not pat.match(x)
28 | )
29 | newsgroups_train = fetch_20newsgroups()
30 | corpus.process(d.lower() for d in newsgroups_train.data)
31 | # save preprocessed corpus for reuse
32 | corpus.save('preprocessed_20news.cps')
33 |
34 | mdl = tp.HDPModel(tw=tp.TermWeight.PMI, min_df=5, rm_top=30, alpha=1, gamma=10, initial_k=10, corpus=corpus)
35 | mdl.train(0)
36 | mdl.burn_in = 500
37 |
38 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
39 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
40 | ))
41 | print('Removed Top words: ', *mdl.removed_top_words)
42 |
43 | # Let's train the model
44 | mdl.train(5000, show_progress=True)
45 | mdl.summary()
46 |
47 | live_topics = [k for k in range(mdl.k) if mdl.is_live_topic(k)]
48 |
49 | topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
50 | topic_term_dists = topic_term_dists[live_topics]
51 | topic_term_dists /= topic_term_dists.sum(axis=1, keepdims=True)
52 |
53 | doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
54 | doc_topic_dists = doc_topic_dists[:, live_topics]
55 | doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
56 |
57 | doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
58 | vocab = list(mdl.used_vocabs)
59 | term_frequency = mdl.used_vocab_freq
60 |
61 | prepared_data = pyLDAvis.prepare(
62 | topic_term_dists,
63 | doc_topic_dists,
64 | doc_lengths,
65 | vocab,
66 | term_frequency,
67 | start_index=0,
68 | sort_topics=False
69 | )
70 | pyLDAvis.save_html(prepared_data, 'ldavis.html')
71 |
--------------------------------------------------------------------------------
/examples/hlda_basic.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tomotopy as tp
3 | import numpy as np
4 |
5 | def hlda_example(input_file, save_path):
6 | from nltk.stem.porter import PorterStemmer
7 | from nltk.corpus import stopwords
8 | try:
9 | cps = tp.utils.Corpus.load(input_file + '.cached.cps')
10 | except IOError:
11 | stemmer = PorterStemmer()
12 | stops = set(stopwords.words('english'))
13 | cps = tp.utils.Corpus(
14 | tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem),
15 | stopwords=lambda x: len(x) <= 2 or x in stops
16 | )
17 | cps.process(open(input_file, encoding='utf-8'))
18 | cps.save(input_file + '.cached.cps')
19 |
20 | np.random.seed(42)
21 | ridcs = np.random.permutation(len(cps))
22 | test_idcs = ridcs[:20]
23 | train_idcs = ridcs[20:]
24 |
25 | test_cps = cps[test_idcs]
26 | train_cps = cps[train_idcs]
27 |
28 | mdl = tp.HLDAModel(tw=tp.TermWeight.ONE, min_df=10, depth=4, rm_top=10, corpus=train_cps)
29 | mdl.train(0)
30 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
31 | print('Removed top words:', mdl.removed_top_words)
32 | print('Training...', file=sys.stderr, flush=True)
33 | for _ in range(0, 1000, 10):
34 | mdl.train(7)
35 | mdl.train(3, freeze_topics=True)
36 | print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k))
37 |
38 | for _ in range(0, 100, 10):
39 | mdl.train(10, freeze_topics=True)
40 | print('Iteration: {:05}\tll per word: {:.5f}\tNum. of topics: {}'.format(mdl.global_step, mdl.ll_per_word, mdl.live_k))
41 |
42 | mdl.summary()
43 | print('Saving...', file=sys.stderr, flush=True)
44 | mdl.save(save_path, True)
45 |
46 | test_result_cps, ll = mdl.infer(test_cps)
47 | for doc in test_result_cps:
48 | print(doc.path, doc.get_words(top_n=10))
49 |
50 | # You can get the sample data file 'enwiki-16000.txt'
51 | # at https://drive.google.com/file/d/1OfyJ9TqaMiqzO6Qw-c_jXL-pmSIZf5Xt/view?usp=sharing
52 |
53 | if __name__ == '__main__':
54 | hlda_example('enwiki-16000.txt', 'test.hlda.tmm')
55 |
--------------------------------------------------------------------------------
/examples/lda_basic.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tomotopy as tp
3 |
4 | def lda_example(input_file, save_path):
5 | mdl = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5, k=20)
6 | for n, line in enumerate(open(input_file, encoding='utf-8')):
7 | ch = line.strip().split()
8 | mdl.add_doc(ch)
9 | mdl.burn_in = 100
10 | mdl.train(0)
11 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
12 | print('Removed top words:', mdl.removed_top_words)
13 | print('Training...', file=sys.stderr, flush=True)
14 | mdl.train(1000, show_progress=True)
15 | mdl.summary()
16 | print('Saving...', file=sys.stderr, flush=True)
17 | mdl.save(save_path, True)
18 |
19 | for k in range(mdl.k):
20 | print('Topic #{}'.format(k))
21 | for word, prob in mdl.get_topic_words(k):
22 | print('\t', word, prob, sep='\t')
23 |
24 | # You can get the sample data file 'enwiki-stemmed-1000.txt'
25 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
26 |
27 | print('Running LDA')
28 | lda_example('enwiki-stemmed-1000.txt', 'test.lda.bin')
29 |
--------------------------------------------------------------------------------
/examples/lda_visualization.py:
--------------------------------------------------------------------------------
1 | '''
2 | This example shows how to perform a Latent Dirichlet Allocation using tomotopy
3 | and visualize the result.
4 |
5 |
6 | Required Packages:
7 | nltk, sklearn, pyldavis
8 | '''
9 |
10 | import tomotopy as tp
11 | import nltk
12 | from nltk.corpus import stopwords
13 | import re
14 | from sklearn.datasets import fetch_20newsgroups
15 | import numpy as np
16 | import pyLDAvis
17 |
18 | try:
19 | # load if preprocessed corpus exists
20 | corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
21 | except IOError:
22 | porter_stemmer = nltk.PorterStemmer().stem
23 | english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
24 | pat = re.compile('^[a-z]{2,}$')
25 | corpus = tp.utils.Corpus(
26 | tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
27 | stopwords=lambda x: x in english_stops or not pat.match(x)
28 | )
29 | newsgroups_train = fetch_20newsgroups()
30 | corpus.process(d.lower() for d in newsgroups_train.data)
31 | # save preprocessed corpus for reuse
32 | corpus.save('preprocessed_20news.cps')
33 |
34 | mdl = tp.LDAModel(min_df=5, rm_top=40, k=30, corpus=corpus)
35 | mdl.train(0)
36 |
37 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
38 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
39 | ))
40 | print('Removed Top words: ', *mdl.removed_top_words)
41 |
42 | # Let's train the model
43 | mdl.train(1000, show_progress=True)
44 | mdl.summary()
45 |
46 | topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
47 | doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
48 | doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
49 | doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
50 | vocab = list(mdl.used_vocabs)
51 | term_frequency = mdl.used_vocab_freq
52 |
53 | prepared_data = pyLDAvis.prepare(
54 | topic_term_dists,
55 | doc_topic_dists,
56 | doc_lengths,
57 | vocab,
58 | term_frequency,
59 | start_index=0, # tomotopy starts topic ids with 0, pyLDAvis with 1
60 | sort_topics=False # IMPORTANT: otherwise the topic_ids between pyLDAvis and tomotopy are not matching!
61 | )
62 | pyLDAvis.save_html(prepared_data, 'ldavis.html')
63 |
--------------------------------------------------------------------------------
/examples/raw_corpus_and_labeling.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tomotopy as tp
3 |
4 | def raw_corpus_and_labeling_example(input_file):
5 | from nltk.stem.porter import PorterStemmer
6 | from nltk.corpus import stopwords
7 | stemmer = PorterStemmer()
8 | stops = set(stopwords.words('english'))
9 | corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem),
10 | stopwords=lambda x: len(x) <= 2 or x in stops)
11 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
12 | corpus.process(open(input_file, encoding='utf-8'))
13 |
14 | # make LDA model and train
15 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
16 | mdl.train(0)
17 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
18 | print('Removed top words:', mdl.removed_top_words)
19 | mdl.train(1000, show_progress=True)
20 | mdl.summary()
21 |
22 | # extract candidates for auto topic labeling
23 | extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000, normalized=True)
24 | cands = extractor.extract(mdl)
25 |
26 | labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
27 | for k in range(mdl.k):
28 | print("== Topic #{} ==".format(k))
29 | print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
30 | for word, prob in mdl.get_topic_words(k, top_n=10):
31 | print(word, prob, sep='\t')
32 | print()
33 |
34 |
35 | # You can get the sample data file 'enwiki-1000.txt'
36 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
37 |
38 | print('Running LDA from raw corpus and Labeling')
39 | raw_corpus_and_labeling_example('enwiki-1000.txt')
40 |
--------------------------------------------------------------------------------
/examples/word_prior.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tomotopy as tp
3 |
4 | def word_prior_example(input_file):
5 | corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
6 | # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
7 | corpus.process(open(input_file, encoding='utf-8'))
8 |
9 | # make LDA model and train
10 | mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
11 | # The word 'church' is assigned to Topic 0 with a weight of 1.0 and to the remaining topics with a weight of 0.1.
12 | # Therefore, a topic related to 'church' can be fixed at Topic 0.
13 | mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)])
14 | # Topic 1 for a topic related to 'softwar'
15 | mdl.set_word_prior('softwar', [1.0 if k == 1 else 0.1 for k in range(20)])
16 | # Topic 2 for a topic related to 'citi'
17 | mdl.set_word_prior('citi', [1.0 if k == 2 else 0.1 for k in range(20)])
18 | mdl.train(0)
19 | print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
20 | print('Removed top words:', mdl.removed_top_words)
21 | mdl.train(1000, show_progress=True)
22 | mdl.summary()
23 |
24 | for k in range(mdl.k):
25 | print("== Topic #{} ==".format(k))
26 | for word, prob in mdl.get_topic_words(k, top_n=10):
27 | print(word, prob, sep='\t')
28 | print()
29 |
30 |
31 | # You can get the sample data file 'enwiki-stemmed-1000.txt'
32 | # at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
33 |
34 | print('Set Word Prior')
35 | word_prior_example('enwiki-stemmed-1000.txt')
36 |
--------------------------------------------------------------------------------
/licenses_bundled/EigenRand:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020, bab2min
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/licenses_bundled/MapboxVariant:
--------------------------------------------------------------------------------
1 | Copyright (c) MapBox
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification,
5 | are permitted provided that the following conditions are met:
6 |
7 | - Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 | - Redistributions in binary form must reproduce the above copyright notice, this
10 | list of conditions and the following disclaimer in the documentation and/or
11 | other materials provided with the distribution.
12 | - Neither the name "MapBox" nor the names of its contributors may be
13 | used to endorse or promote products derived from this software without
14 | specific prior written permission.
15 |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
20 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
23 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.10.0,<2
--------------------------------------------------------------------------------
/src/Coherence/CoherenceModel.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | /*
4 | * Röder, M., Both, A., & Hinneburg, A. (2015, February). Exploring the space of topic coherence measures. In Proceedings of the eighth ACM international conference on Web search and data mining (pp. 399-408).
5 | http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
6 | https://github.com/dice-group/Palmetto
7 |
8 | */
9 |
10 | #include "Common.h"
11 | #include "ConfirmMeasurer.hpp"
12 | #include "ProbEstimator.hpp"
13 | #include "Segmentor.hpp"
14 |
15 | namespace tomoto
16 | {
17 | namespace coherence
18 | {
19 | class CoherenceModel
20 | {
21 | std::unique_ptr<IProbEstimator> pe;
22 | ProbEstimation pe_type = ProbEstimation::none;
23 |
24 | template<ProbEstimation _pe>
25 | void init(size_t windowSize)
26 | {
27 | pe_type = _pe;
28 | pe = std::make_unique<ProbEstimator<_pe>>(windowSize);
29 | }
30 |
31 | template<ProbEstimation _pe, typename _TargetIter>
32 | void _insertTargets(_TargetIter targetFirst, _TargetIter targetLast)
33 | {
34 | ((ProbEstimator<_pe>*)pe.get())->insertTargets(targetFirst, targetLast);
35 | }
36 |
37 | template<ProbEstimation _pe, typename _TargetIter>
38 | void _insertDoc(_TargetIter wordFirst, _TargetIter wordLast)
39 | {
40 | ((ProbEstimator<_pe>*)pe.get())->insertDoc(wordFirst, wordLast);
41 | }
42 |
43 | public:
44 | CoherenceModel() = default;
45 |
46 | CoherenceModel(ProbEstimation _pe, size_t windowSize)
47 | {
48 | switch (_pe)
49 | {
50 | case ProbEstimation::document:
51 | init<ProbEstimation::document>(windowSize);
52 | break;
53 | case ProbEstimation::sliding_windows:
54 | init<ProbEstimation::sliding_windows>(windowSize);
55 | break;
56 | default:
57 | throw std::invalid_argument{ "invalid ProbEstimation `_pe`" };
58 | }
59 | }
60 |
61 | template<typename _TargetIter>
62 | void insertTargets(_TargetIter targetFirst, _TargetIter targetLast)
63 | {
64 | switch (pe_type)
65 | {
66 | case ProbEstimation::document:
67 | return _insertTargets<ProbEstimation::document>(targetFirst, targetLast);
68 | case ProbEstimation::sliding_windows:
69 | return _insertTargets<ProbEstimation::sliding_windows>(targetFirst, targetLast);
70 | default:
71 | throw std::invalid_argument{ "invalid ProbEstimation `_pe`" };
72 | }
73 | }
74 |
75 | template<typename _TargetIter>
76 | void insertDoc(_TargetIter wordFirst, _TargetIter wordLast)
77 | {
78 | switch (pe_type)
79 | {
80 | case ProbEstimation::document:
81 | return _insertDoc<ProbEstimation::document>(wordFirst, wordLast);
82 | case ProbEstimation::sliding_windows:
83 | return _insertDoc<ProbEstimation::sliding_windows>(wordFirst, wordLast);
84 | default:
85 | throw std::invalid_argument{ "invalid ProbEstimation `_pe`" };
86 | }
87 | }
88 |
89 | template<Segmentation _seg, typename _CMFunc, typename _TargetIter>
90 | double getScore(_CMFunc&& cm, _TargetIter targetFirst, _TargetIter targetLast) const
91 | {
92 | return makeSegmentor<_seg>(std::forward<_CMFunc>(cm), pe.get())(targetFirst, targetLast);
93 | }
94 |
95 | };
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
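Note on the coherence pipeline: CoherenceModel above simply wires together a probability estimator (ProbEstimation::document or sliding_windows), a segmentation scheme, and a confirmation measure, following Röder et al. (2015). The following is a minimal Python sketch of the same idea for a single topic, using document-level counts and a PMI confirmation measure; it only illustrates the framework and is not tomotopy's API (the names docs, top_words, and eps are made up for this example).

import math
from itertools import combinations

def pmi_coherence(docs, top_words, eps=1e-12):
    # docs: list of token lists; top_words: the top-N words of one topic
    doc_sets = [set(d) for d in docs]
    n = len(doc_sets)
    # boolean document frequencies, analogous to ProbEstimation::document
    df = {w: sum(w in d for d in doc_sets) for w in top_words}
    scores = []
    # one_one-style segmentation: every pair of distinct top words
    for a, b in combinations(top_words, 2):
        co = sum((a in d) and (b in d) for d in doc_sets)
        p_a, p_b, p_ab = df[a] / n, df[b] / n, co / n
        # PMI confirmation measure with a small smoothing constant
        scores.append(math.log((p_ab + eps) / (p_a * p_b + eps)))
    # arithmetic-mean aggregation over all segment pairs
    return sum(scores) / len(scores)
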
/src/Coherence/Common.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "../TopicModel/TopicModel.hpp"
3 |
4 | namespace tomoto
5 | {
6 | namespace coherence
7 | {
8 | enum class Segmentation
9 | {
10 | none = 0,
11 | one_one,
12 | one_pre,
13 | one_suc,
14 | one_all,
15 | one_set,
16 | };
17 |
18 | enum class ProbEstimation
19 | {
20 | none = 0,
21 | document,
22 | sliding_windows,
23 | };
24 |
25 | class IProbEstimator
26 | {
27 | public:
28 | virtual double getProb(Vid word) const = 0;
29 | virtual double getProb(Vid word1, Vid word2) const = 0;
30 | virtual double getProb(const std::vector<Vid>& words) const = 0;
31 | virtual double getJointNotProb(Vid word1, Vid word2) const = 0;
32 | virtual double getJointNotProb(Vid word1, const std::vector<Vid>& word2) const = 0;
33 | virtual ~IProbEstimator() {}
34 |
35 | double getProb(Vid word1, const std::vector<Vid>& word2) const
36 | {
37 | auto words = word2;
38 | if(std::find(words.begin(), words.end(), word1) == words.end()) words.emplace_back(word1);
39 | return getProb(words);
40 | }
41 | };
42 |
43 | enum class ConfirmMeasure
44 | {
45 | none = 0,
46 | difference,
47 | ratio,
48 | likelihood,
49 | loglikelihood,
50 | pmi,
51 | npmi,
52 | logcond,
53 | };
54 |
55 | enum class IndirectMeasure
56 | {
57 | none = 0,
58 | cosine,
59 | dice,
60 | jaccard,
61 | };
62 |
63 | /*enum class Aggregation
64 | {
65 | none = 0,
66 | amean,
67 | median,
68 | gmean,
69 | hmean,
70 | qmean,
71 | };*/
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/Coherence/Segmentor.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "Common.h"
4 |
5 | namespace tomoto
6 | {
7 | namespace coherence
8 | {
9 | template<Segmentation _seg, typename _CMFunc>
10 | class Segmentor;
11 |
12 | template<Segmentation _seg, typename _CMFunc>
13 | Segmentor<_seg, typename std::remove_reference<_CMFunc>::type>
14 | makeSegmentor(_CMFunc&& cm, const IProbEstimator* pe)
15 | {
16 | return { std::forward<_CMFunc>(cm), pe };
17 | }
18 |
19 | template<typename _CMFunc>
20 | class Segmentor<Segmentation::one_one, _CMFunc>
21 | {
22 | const IProbEstimator* pe;
23 | _CMFunc cm;
24 | public:
25 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe }
26 | {
27 | }
28 |
29 | template<typename _TargetIter>
30 | double operator()(_TargetIter wordFirst, _TargetIter wordLast)
31 | {
32 | double ret = 0;
33 | double n = 0;
34 | for (auto it1 = wordFirst; it1 != wordLast; ++it1)
35 | {
36 | for (auto it2 = wordFirst; it2 != wordLast; ++it2)
37 | {
38 | if (it1 == it2) continue;
39 | ret += cm(pe, *it1, *it2);
40 | n += 1;
41 | }
42 | }
43 | return ret / n;
44 | }
45 | };
46 |
47 | template<typename _CMFunc>
48 | class Segmentor<Segmentation::one_pre, _CMFunc>
49 | {
50 | const IProbEstimator* pe;
51 | _CMFunc cm;
52 | public:
53 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe }
54 | {
55 | }
56 |
57 | template<typename _TargetIter>
58 | double operator()(_TargetIter wordFirst, _TargetIter wordLast)
59 | {
60 | double ret = 0;
61 | double n = 0;
62 | for (auto it1 = wordFirst; it1 != wordLast; ++it1)
63 | {
64 | for (auto it2 = wordFirst; it2 != it1; ++it2)
65 | {
66 | ret += cm(pe, *it1, *it2);
67 | n += 1;
68 | }
69 | }
70 | return ret / n;
71 | }
72 | };
73 |
74 | template<typename _CMFunc>
75 | class Segmentor<Segmentation::one_suc, _CMFunc>
76 | {
77 | const IProbEstimator* pe;
78 | _CMFunc cm;
79 | public:
80 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe }
81 | {
82 | }
83 |
84 | template<typename _TargetIter>
85 | double operator()(_TargetIter wordFirst, _TargetIter wordLast)
86 | {
87 | double ret = 0;
88 | double n = 0;
89 | for (auto it1 = wordFirst; it1 != wordLast; ++it1)
90 | {
91 | for (auto it2 = it1 + 1; it2 != wordLast; ++it2)
92 | {
93 | ret += cm(pe, *it1, *it2);
94 | n += 1;
95 | }
96 | }
97 | return ret / n;
98 | }
99 | };
100 |
101 | template<typename _CMFunc>
102 | class Segmentor<Segmentation::one_set, _CMFunc>
103 | {
104 | const IProbEstimator* pe;
105 | _CMFunc cm;
106 | public:
107 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe }
108 | {
109 | }
110 |
111 | template<typename _TargetIter>
112 | double operator()(_TargetIter wordFirst, _TargetIter wordLast)
113 | {
114 | double ret = 0;
115 | double n = 0;
116 | for (auto it1 = wordFirst; it1 != wordLast; ++it1)
117 | {
118 | ret += cm(pe, *it1, std::vector<Vid>{ wordFirst, wordLast });
119 | n += 1;
120 | }
121 | return ret / n;
122 | }
123 | };
124 |
125 |
126 | template<typename _CMFunc>
127 | class Segmentor<Segmentation::one_all, _CMFunc>
128 | {
129 | const IProbEstimator* pe;
130 | _CMFunc cm;
131 | public:
132 | Segmentor(const _CMFunc& _cm, const IProbEstimator* _pe) : cm{ _cm }, pe{ _pe }
133 | {
134 | }
135 |
136 | template<typename _TargetIter>
137 | double operator()(_TargetIter wordFirst, _TargetIter wordLast)
138 | {
139 | double ret = 0;
140 | double n = 0;
141 | for (auto it1 = wordFirst; it1 != wordLast; ++it1)
142 | {
143 | std::vector<Vid> rest;
144 | rest.insert(rest.end(), wordFirst, it1);
145 | rest.insert(rest.end(), it1 + 1, wordLast);
146 | ret += cm(pe, *it1, rest);
147 | n += 1;
148 | }
149 | return ret / n;
150 | }
151 | };
152 | }
153 | }
154 |
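
For orientation, a minimal standalone sketch of the averaging that the first (pairwise) segmentor above performs, using plain doubles and a toy measure in place of the repository's IProbEstimator and confirm-measure functor:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Toy confirmation measure over two word ids (purely illustrative).
    double toyMeasure(int w1, int w2)
    {
        return std::log((w1 + 1.0) / (w2 + 1.0));
    }

    // Average the measure over all ordered pairs (i, j) with i != j --
    // the same loop structure as the pairwise Segmentor specialization above.
    double averageOverPairs(const std::vector<int>& words)
    {
        double ret = 0, n = 0;
        for (size_t i = 0; i < words.size(); ++i)
            for (size_t j = 0; j < words.size(); ++j)
            {
                if (i == j) continue;
                ret += toyMeasure(words[i], words[j]);
                n += 1;
            }
        return ret / n;
    }

    int main()
    {
        std::printf("%f\n", averageOverPairs({ 3, 1, 4, 1, 5 }));
    }
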
--------------------------------------------------------------------------------
/src/Labeling/FoRelevance.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include "Labeler.h"
5 | #include "../Utils/EigenAddonOps.hpp"
6 | #include "../Utils/Trie.hpp"
7 | #include "../Utils/ThreadPool.hpp"
8 |
9 | /*
10 | Implementation of First-order Relevance for topic labeling by bab2min
11 |
12 | * Mei, Q., Shen, X., & Zhai, C. (2007, August). Automatic labeling of multinomial topic models. In Proceedings of the 13th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 490-499).
13 |
14 | */
15 |
16 | namespace tomoto
17 | {
18 | namespace label
19 | {
20 | class PMIExtractor : public IExtractor
21 | {
22 | size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
23 | bool normalized;
24 | public:
25 | PMIExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
26 | size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000,
27 | bool _normalized = false
28 | )
29 | : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf },
30 | minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen },
31 | maxCandidates{ _maxCandidates }, normalized{ _normalized }
32 | {
33 | }
34 |
35 | std::vector<Candidate> extract(const ITopicModel* tm) const override;
36 | };
37 |
38 | class PMIBEExtractor : public IExtractor
39 | {
40 | size_t candMinCnt, candMinDf, minLabelLen, maxLabelLen, maxCandidates;
41 | public:
42 | PMIBEExtractor(size_t _candMinCnt = 10, size_t _candMinDf = 2,
43 | size_t _minLabelLen = 1, size_t _maxLabelLen = 5, size_t _maxCandidates = 1000
44 | )
45 | : candMinCnt{ _candMinCnt }, candMinDf{ _candMinDf }, minLabelLen{ _minLabelLen }, maxLabelLen{ _maxLabelLen }, maxCandidates{ _maxCandidates }
46 | {
47 | }
48 |
49 | std::vector<Candidate> extract(const ITopicModel* tm) const override;
50 | };
51 |
52 | class FoRelevance : public ILabeler
53 | {
54 | struct CandidateEx : public Candidate
55 | {
56 | std::unordered_map names;
57 | std::set docIds;
58 | Eigen::Array scores;
59 |
60 | CandidateEx()
61 | {
62 | }
63 |
64 | CandidateEx(const Candidate& c)
65 | : Candidate{ c }
66 | {
67 | }
68 | };
69 |
70 | const ITopicModel* tm;
71 | size_t candMinDf;
72 | float smoothing, lambda, mu;
73 | size_t windowSize;
74 | std::unique_ptr pool;
75 | std::unique_ptr mtx;
76 | std::vector candidates;
77 |
78 | template
79 | const Eigen::ArrayXi& updateContext(size_t docId, const tomoto::DocumentBase* doc, const Trie* root);
80 |
81 | void estimateContexts();
82 |
83 | public:
84 | template<typename _Iter>
85 | FoRelevance(const ITopicModel* _tm,
86 | _Iter candFirst, _Iter candEnd,
87 | size_t _candMinDf = 2, float _smoothing = 0.1f, float _lambda = 0.1f, float _mu = 0.1f,
88 | size_t _windowSize = (size_t)-1,
89 | size_t numWorkers = 0)
90 | : tm{ _tm }, candMinDf{ _candMinDf },
91 | smoothing{ _smoothing }, lambda{ _lambda }, mu{ _mu }, windowSize{ _windowSize }
92 | {
93 | if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
94 | if (numWorkers > 1)
95 | {
96 | pool = std::make_unique(numWorkers);
97 | mtx = std::make_unique(numWorkers);
98 | }
99 |
100 | for (; candFirst != candEnd; ++candFirst)
101 | {
102 | candidates.emplace_back(*candFirst);
103 | }
104 |
105 | estimateContexts();
106 | }
107 |
108 | std::vector<std::pair<std::string, float>> getLabels(Tid tid, size_t topK = 10) const override;
109 | };
110 | }
111 | }
112 |
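
A sketch of how the extractor and labeler declared above are meant to be wired together, assuming tm points to an already trained topic model; the parameter values are illustrative only:

    #include <memory>

    std::unique_ptr<tomoto::label::ILabeler> buildLabeler(const tomoto::ITopicModel* tm)
    {
        using namespace tomoto::label;
        // Collect n-gram candidates by PMI, then score them with first-order relevance.
        PMIExtractor extractor{ /*candMinCnt*/ 10, /*candMinDf*/ 2,
            /*minLabelLen*/ 1, /*maxLabelLen*/ 5, /*maxCandidates*/ 1000 };
        auto cands = extractor.extract(tm);
        return std::make_unique<FoRelevance>(tm, cands.begin(), cands.end(),
            /*candMinDf*/ 2, /*smoothing*/ 0.1f, /*lambda*/ 0.1f, /*mu*/ 0.1f);
    }

    // auto labeler = buildLabeler(tm);
    // auto labels  = labeler->getLabels(/*tid*/ 0, /*topK*/ 10);
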
--------------------------------------------------------------------------------
/src/Labeling/Labeler.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 | #include
4 | #include "../TopicModel/TopicModel.hpp"
5 |
6 | namespace tomoto
7 | {
8 | namespace label
9 | {
10 | struct Candidate
11 | {
12 | float score = 0;
13 | size_t cf = 0, df = 0;
14 | std::vector w;
15 | std::string name;
16 |
17 | Candidate()
18 | {
19 | }
20 |
21 | Candidate(float _score, Vid w1)
22 | : score{ _score }, w{ w1 }
23 | {
24 | }
25 |
26 | Candidate(float _score, Vid w1, Vid w2)
27 | : score{ _score }, w{ w1, w2 }
28 | {
29 | }
30 |
31 | Candidate(float _score, const std::vector& _w)
32 | : score{ _score }, w{ _w }
33 | {
34 | }
35 | };
36 |
37 | class IExtractor
38 | {
39 | public:
40 |
41 | virtual std::vector<Candidate> extract(const ITopicModel* tm) const = 0;
42 | virtual ~IExtractor() {}
43 | };
44 |
45 | class ILabeler
46 | {
47 | public:
48 | virtual std::vector<std::pair<std::string, float>> getLabels(Tid tid, size_t topK = 10) const = 0;
49 | virtual ~ILabeler() {}
50 | };
51 | }
52 | }
--------------------------------------------------------------------------------
/src/TopicModel/CT.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDA.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentCTM : public DocumentLDA<_tw>
8 | {
9 | using BaseDocument = DocumentLDA<_tw>;
10 | using DocumentLDA<_tw>::DocumentLDA;
11 | Matrix beta; // Dim: (K, betaSample)
12 | Vector smBeta; // Dim: K
13 |
14 | DECLARE_SERIALIZER_WITH_VERSION(0);
15 | DECLARE_SERIALIZER_WITH_VERSION(1);
16 | };
17 |
18 | struct CTArgs : public LDAArgs
19 | {
20 |
21 | };
22 |
23 | class ICTModel : public ILDAModel
24 | {
25 | public:
26 | using DefaultDocType = DocumentCTM;
27 | static ICTModel* create(TermWeight _weight, const CTArgs& args,
28 | bool scalarRng = false);
29 |
30 | virtual void setNumBetaSample(size_t numSample) = 0;
31 | virtual size_t getNumBetaSample() const = 0;
32 | virtual void setNumTMNSample(size_t numSample) = 0;
33 | virtual size_t getNumTMNSample() const = 0;
34 | virtual void setNumDocBetaSample(size_t numSample) = 0;
35 | virtual size_t getNumDocBetaSample() const = 0;
36 | virtual std::vector getPriorMean() const = 0;
37 | virtual std::vector getPriorCov() const = 0;
38 | virtual std::vector getCorrelationTopic(Tid k) const = 0;
39 | };
40 | }
41 |
--------------------------------------------------------------------------------
/src/TopicModel/CTModel.cpp:
--------------------------------------------------------------------------------
1 | #include "CTModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 0, smBeta);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 1, 0x00010001, smBeta);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentCTM);
9 |
10 | ICTModel* ICTModel::create(TermWeight _weight, const CTArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, CTModel, args);
13 | }
14 | }
--------------------------------------------------------------------------------
/src/TopicModel/DMR.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDA.h"
3 |
4 | namespace tomoto
5 | {
6 | class IDMRModel;
7 |
8 | template<TermWeight _tw>
9 | struct DocumentDMR : public DocumentLDA<_tw>
10 | {
11 | using BaseDocument = DocumentLDA<_tw>;
12 | using DocumentLDA<_tw>::DocumentLDA;
13 | uint64_t metadata = 0;
14 | std::vector multiMetadata;
15 | Vector mdVec;
16 | size_t mdHash = (size_t)-1;
17 | mutable Matrix cachedAlpha;
18 |
19 | RawDoc::MiscType makeMisc(const ITopicModel* tm) const override;
20 |
21 | DECLARE_SERIALIZER_WITH_VERSION(0);
22 | DECLARE_SERIALIZER_WITH_VERSION(1);
23 | };
24 |
25 | struct DMRArgs : public LDAArgs
26 | {
27 | Float alphaEps = 1e-10;
28 | Float sigma = 1.0;
29 | };
30 |
31 | class IDMRModel : public ILDAModel
32 | {
33 | public:
34 | using DefaultDocType = DocumentDMR;
35 | static IDMRModel* create(TermWeight _weight, const DMRArgs& args,
36 | bool scalarRng = false);
37 |
38 | virtual void setAlphaEps(Float _alphaEps) = 0;
39 | virtual Float getAlphaEps() const = 0;
40 | virtual void setOptimRepeat(size_t repeat) = 0;
41 | virtual size_t getOptimRepeat() const = 0;
42 | virtual size_t getF() const = 0;
43 | virtual size_t getMdVecSize() const = 0;
44 | virtual Float getSigma() const = 0;
45 | virtual const Dictionary& getMetadataDict() const = 0;
46 | virtual const Dictionary& getMultiMetadataDict() const = 0;
47 | virtual std::vector getLambdaByMetadata(size_t metadataId) const = 0;
48 | virtual std::vector getLambdaByTopic(Tid tid) const = 0;
49 |
50 | virtual std::vector getTopicPrior(
51 | const std::string& metadata,
52 | const std::vector& multiMetadata,
53 | bool raw = false
54 | ) const = 0;
55 | };
56 |
57 | template
58 | RawDoc::MiscType DocumentDMR<_tw>::makeMisc(const ITopicModel* tm) const
59 | {
60 | RawDoc::MiscType ret = DocumentLDA<_tw>::makeMisc(tm);
61 | auto inst = static_cast<const IDMRModel*>(tm);
62 | ret["metadata"] = inst->getMetadataDict().toWord(metadata);
63 | return ret;
64 | }
65 | }
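
A one-call sketch of querying the metadata-conditioned topic prior declared above; the metadata value "2020" is hypothetical (it assumes that value appeared in training), and the return is taken to be a std::vector of tomoto::Float:

    // Returns the Dirichlet prior over topics for documents tagged "2020".
    std::vector<tomoto::Float> priorFor2020(const tomoto::IDMRModel* model)
    {
        return model->getTopicPrior("2020", /*multiMetadata*/ {}, /*raw*/ false);
    }
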
--------------------------------------------------------------------------------
/src/TopicModel/DMRModel.cpp:
--------------------------------------------------------------------------------
1 | #include "DMRModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 0, metadata);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 1, 0x00010001, metadata, multiMetadata);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentDMR);
9 |
10 | IDMRModel* IDMRModel::create(TermWeight _weight, const DMRArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, DMRModel, args);
13 | }
14 | }
--------------------------------------------------------------------------------
/src/TopicModel/DT.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDAModel.hpp"
3 | #include "LDA.h"
4 |
5 | namespace tomoto
6 | {
7 | template<TermWeight _tw>
8 | struct DocumentDTM : public DocumentLDA<_tw>
9 | {
10 | using BaseDocument = DocumentLDA<_tw>;
11 | using DocumentLDA<_tw>::DocumentLDA;
12 |
13 | uint64_t timepoint = 0;
14 | ShareableMatrix eta;
15 | sample::AliasMethod<> aliasTable;
16 |
17 | RawDoc::MiscType makeMisc(const ITopicModel* tm) const override
18 | {
19 | RawDoc::MiscType ret = DocumentLDA<_tw>::makeMisc(tm);
20 | ret["timepoint"] = (uint32_t)timepoint;
21 | return ret;
22 | }
23 |
24 | DECLARE_SERIALIZER_WITH_VERSION(0);
25 | DECLARE_SERIALIZER_WITH_VERSION(1);
26 | };
27 |
28 | struct DTArgs : public LDAArgs
29 | {
30 | size_t t = 1;
31 | Float phi = 0.1;
32 | Float shapeA = 0.01;
33 | Float shapeB = 0.1;
34 | Float shapeC = 0.55;
35 | Float etaL2Reg = 0;
36 |
37 | DTArgs()
38 | {
39 | alpha[0] = 0.1;
40 | eta = 0.1;
41 | }
42 | };
43 |
44 | class IDTModel : public ILDAModel
45 | {
46 | public:
47 | using DefaultDocType = DocumentDTM;
48 | static IDTModel* create(TermWeight _weight, const DTArgs& args,
49 | bool scalarRng = false);
50 |
51 | virtual size_t getT() const = 0;
52 | virtual std::vector getNumDocsByT() const = 0;
53 |
54 | virtual Float getAlphaVar() const = 0;
55 | virtual Float getEtaVar() const = 0;
56 | virtual Float getPhiVar() const = 0;
57 |
58 | virtual Float getShapeA() const = 0;
59 | virtual Float getShapeB() const = 0;
60 | virtual Float getShapeC() const = 0;
61 |
62 | virtual void setShapeA(Float a) = 0;
63 | virtual void setShapeB(Float a) = 0;
64 | virtual void setShapeC(Float a) = 0;
65 |
66 | virtual Float getAlpha(size_t k, size_t t) const = 0;
67 | virtual std::vector getPhi(size_t k, size_t t) const = 0;
68 | };
69 | }
70 |
--------------------------------------------------------------------------------
/src/TopicModel/DTM.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDAModel.hpp"
3 | #include "LDA.h"
4 |
5 | namespace tomoto
6 | {
7 | template
8 | struct DocumentDTM : public DocumentLDA<_tw, _Flags>
9 | {
10 | using BaseDocument = DocumentLDA<_tw, _Flags>;
11 | using DocumentLDA<_tw, _Flags>::DocumentLDA;
12 | using WeightType = typename std::conditional<_tw == TermWeight::one, int32_t, float>::type;
13 | };
14 |
15 | class IDTModel : public ILDAModel
16 | {
17 | public:
18 | using DefaultDocType = DocumentDTM;
19 | static IDTModel* create(TermWeight _weight, size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() });
20 |
21 | };
22 | }
23 |
--------------------------------------------------------------------------------
/src/TopicModel/DTModel.cpp:
--------------------------------------------------------------------------------
1 | #include "DTModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 0, timepoint);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 1, 0x00010001, timepoint);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentDTM);
9 |
10 | IDTModel* IDTModel::create(TermWeight _weight, const DTArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, DTModel, args);
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/TopicModel/GDMR.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "DMR.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentGDMR : public DocumentDMR<_tw>
8 | {
9 | using BaseDocument = DocumentDMR<_tw>;
10 | using DocumentDMR<_tw>::DocumentDMR;
11 | std::vector metadataOrg, metadataNormalized;
12 |
13 | RawDoc::MiscType makeMisc(const ITopicModel* tm) const override
14 | {
15 | RawDoc::MiscType ret = DocumentDMR<_tw>::makeMisc(tm);
16 | ret["numeric_metadata"] = metadataOrg;
17 | return ret;
18 | }
19 |
20 | DECLARE_SERIALIZER_WITH_VERSION(0);
21 | DECLARE_SERIALIZER_WITH_VERSION(1);
22 | };
23 |
24 | struct GDMRArgs : public DMRArgs
25 | {
26 | std::vector degrees;
27 | Float sigma0 = 3.0;
28 | Float orderDecay = 0;
29 | };
30 |
31 | class IGDMRModel : public IDMRModel
32 | {
33 | public:
34 | using DefaultDocType = DocumentDMR;
35 | static IGDMRModel* create(TermWeight _weight, const GDMRArgs& args,
36 | bool scalarRng = false);
37 |
38 | virtual Float getSigma0() const = 0;
39 | virtual Float getOrderDecay() const = 0;
40 | virtual void setSigma0(Float) = 0;
41 | virtual const std::vector& getFs() const = 0;
42 | virtual std::vector getLambdaByTopic(Tid tid) const = 0;
43 |
44 | virtual std::vector getTDF(
45 | const Float* metadata,
46 | const std::string& metadataCat,
47 | const std::vector& multiMetadataCat,
48 | bool normalize
49 | ) const = 0;
50 |
51 | virtual std::vector getTDFBatch(
52 | const Float* metadata,
53 | const std::string& metadataCat,
54 | const std::vector& multiMetadataCat,
55 | size_t stride,
56 | size_t cnt,
57 | bool normalize
58 | ) const = 0;
59 |
60 | virtual void setMdRange(const std::vector& vMin, const std::vector& vMax) = 0;
61 | virtual void getMdRange(std::vector& vMin, std::vector& vMax) const = 0;
62 | };
63 | }
64 |
--------------------------------------------------------------------------------
/src/TopicModel/GDMRModel.cpp:
--------------------------------------------------------------------------------
1 | #include "GDMRModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 0, metadataOrg);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentGDMR);
9 |
10 | IGDMRModel* IGDMRModel::create(TermWeight _weight, const GDMRArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, GDMRModel, args);
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/TopicModel/HDP.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDA.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentHDP : public DocumentLDA<_tw>
8 | {
9 | /*
10 | For DocumentHDP, the topic in numByTopic, Zs indicates 'table id', not 'topic id'.
11 | To get real 'topic id', check the topic field of numTopicByTable.
12 | */
13 | using BaseDocument = DocumentLDA<_tw>;
14 | using DocumentLDA<_tw>::DocumentLDA;
15 | using WeightType = typename DocumentLDA<_tw>::WeightType;
16 | struct TableTopicInfo
17 | {
18 | WeightType num;
19 | Tid topic;
20 |
21 | TableTopicInfo(WeightType _num = 0, Tid _topic = 0) : num(_num), topic(_topic)
22 | {
23 | }
24 |
25 | operator const bool() const
26 | {
27 | return num > (WeightType)1e-2;
28 | }
29 |
30 | void serializerWrite(std::ostream& writer) const
31 | {
32 | serializer::writeMany(writer, topic);
33 | }
34 |
35 | void serializerRead(std::istream& reader)
36 | {
37 | serializer::readMany(reader, topic);
38 | }
39 | };
40 | std::vector numTopicByTable;
41 |
42 | DECLARE_SERIALIZER_WITH_VERSION(0);
43 | DECLARE_SERIALIZER_WITH_VERSION(1);
44 |
45 | size_t getNumTable() const
46 | {
47 | return std::count_if(numTopicByTable.begin(), numTopicByTable.end(), [](const TableTopicInfo& e) { return (bool)e; });
48 | }
49 |
50 | // add a new table into doc and return the new table's idx
51 | size_t addNewTable(Tid tid)
52 | {
53 | return insertIntoEmpty(numTopicByTable, TableTopicInfo( 0, tid ));
54 | }
55 |
56 | template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
57 | };
58 |
59 | struct HDPArgs : public LDAArgs
60 | {
61 | Float gamma = 0.1;
62 |
63 | HDPArgs()
64 | {
65 | k = 2;
66 | }
67 | };
68 |
69 | class IHDPModel : public ILDAModel
70 | {
71 | public:
72 | using DefaultDocType = DocumentHDP;
73 | static IHDPModel* create(TermWeight _weight, const HDPArgs& args,
74 | bool scalarRng = false);
75 |
76 | virtual Float getGamma() const = 0;
77 | virtual size_t getTotalTables() const = 0;
78 | virtual size_t getLiveK() const = 0;
79 | virtual bool isLiveTopic(Tid tid) const = 0;
80 |
81 | virtual std::unique_ptr<ILDAModel> convertToLDA(float topicThreshold, std::vector<Tid>& newK) const = 0;
82 | virtual std::vector purgeDeadTopics() = 0;
83 | };
84 | }
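
A short sketch of the topic-pruning conversion declared above, assuming hdp is a trained model; the 0.01 threshold is illustrative only:

    // Drop low-weight topics and obtain an LDA view of the HDP model.
    // newK maps each original HDP topic id to its id in the converted model.
    std::unique_ptr<tomoto::ILDAModel> toLda(const tomoto::IHDPModel* hdp)
    {
        std::vector<tomoto::Tid> newK;
        return hdp->convertToLDA(0.01f, newK);
    }
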
--------------------------------------------------------------------------------
/src/TopicModel/HDPModel.cpp:
--------------------------------------------------------------------------------
1 | #include "HDPModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 0, numTopicByTable);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 1, 0x00010001, numTopicByTable);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentHDP);
9 |
10 | IHDPModel* IHDPModel::create(TermWeight _weight, const HDPArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, HDPModel, args);
13 | }
14 | }
--------------------------------------------------------------------------------
/src/TopicModel/HLDA.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDA.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentHLDA : public DocumentLDA<_tw>
8 | {
9 | using BaseDocument = DocumentLDA<_tw>;
10 | using WeightType = typename DocumentLDA<_tw>::WeightType;
11 | using DocumentLDA<_tw>::DocumentLDA;
12 |
13 | // numByTopic indicates numByLevel in HLDAModel.
14 | // Zs indicates level in HLDAModel.
15 | std::vector path;
16 |
17 | template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
18 |
19 | DECLARE_SERIALIZER_WITH_VERSION(0);
20 | DECLARE_SERIALIZER_WITH_VERSION(1);
21 | };
22 |
23 | struct HLDAArgs : public LDAArgs
24 | {
25 | Float gamma = 0.1;
26 |
27 | HLDAArgs()
28 | {
29 | k = 2;
30 | }
31 | };
32 |
33 | class IHLDAModel : public ILDAModel
34 | {
35 | public:
36 | using DefaultDocType = DocumentHLDA;
37 | static IHLDAModel* create(TermWeight _weight, const HLDAArgs& args,
38 | bool scalarRng = false);
39 |
40 | virtual Float getGamma() const = 0;
41 | virtual size_t getLiveK() const = 0;
42 | virtual size_t getLevelDepth() const = 0;
43 | virtual bool isLiveTopic(Tid tid) const = 0;
44 | virtual size_t getNumDocsOfTopic(Tid tid) const = 0;
45 | virtual size_t getLevelOfTopic(Tid tid) const = 0;
46 | virtual size_t getParentTopicId(Tid tid) const = 0;
47 | virtual std::vector getChildTopicId(Tid tid) const = 0;
48 | };
49 | }
50 |
--------------------------------------------------------------------------------
/src/TopicModel/HLDAModel.cpp:
--------------------------------------------------------------------------------
1 | #include "HLDAModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 0, path);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 1, 0x00010001, path);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentHLDA);
9 |
10 | IHLDAModel* IHLDAModel::create(TermWeight _weight, const HLDAArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, HLDAModel, args);
13 | }
14 | }
--------------------------------------------------------------------------------
/src/TopicModel/HPA.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "PA.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentHPA : public DocumentPA<_tw>
8 | {
9 | using BaseDocument = DocumentPA<_tw>;
10 | using DocumentPA<_tw>::DocumentPA;
11 | using WeightType = typename DocumentPA<_tw>::WeightType;
12 |
13 | template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
14 |
15 | DECLARE_SERIALIZER_WITH_VERSION(0);
16 | DECLARE_SERIALIZER_WITH_VERSION(1);
17 | };
18 |
19 | struct HPAArgs : public PAArgs
20 | {
21 | };
22 |
23 | class IHPAModel : public IPAModel
24 | {
25 | public:
26 | using DefaultDocType = DocumentHPA;
27 | static IHPAModel* create(TermWeight _weight, bool _exclusive, const HPAArgs& args,
28 | bool scalarRng = false);
29 | };
30 | }
31 |
--------------------------------------------------------------------------------
/src/TopicModel/HPAModel.cpp:
--------------------------------------------------------------------------------
1 | #include "HPAModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 0);
6 | DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 1);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentHPA);
9 |
10 | IHPAModel* IHPAModel::create(TermWeight _weight, bool _exclusive, const HPAArgs& args, bool scalarRng)
11 | {
12 | if (_exclusive)
13 | {
14 | //TMT_SWITCH_TW(_weight, HPAModelExclusive, _K, _K2, _alphaSum, _eta, seed);
15 | }
16 | else
17 | {
18 | TMT_SWITCH_TW(_weight, scalarRng, HPAModel, args);
19 | }
20 | return nullptr;
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/TopicModel/LDA.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "TopicModel.hpp"
3 |
4 | namespace tomoto
5 | {
6 | enum class TermWeight { one, idf, pmi, size };
7 |
8 | template<typename _Scalar, Eigen::Index _rows, Eigen::Index _cols>
9 | struct ShareableMatrix : Eigen::Map<Eigen::Matrix<_Scalar, _rows, _cols>>
10 | {
11 | using BaseType = Eigen::Map<Eigen::Matrix<_Scalar, _rows, _cols>>;
12 | Eigen::Matrix<_Scalar, _rows, _cols> ownData;
13 |
14 | ShareableMatrix(_Scalar* ptr = nullptr, Eigen::Index rows = 0, Eigen::Index cols = 0)
15 | : BaseType(nullptr, _rows != -1 ? _rows : 0, _cols != -1 ? _cols : 0)
16 | {
17 | init(ptr, rows, cols);
18 | }
19 |
20 | ShareableMatrix(const ShareableMatrix& o)
21 | : BaseType(nullptr, _rows != -1 ? _rows : 0, _cols != -1 ? _cols : 0), ownData{ o.ownData }
22 | {
23 | if (o.ownData.data())
24 | {
25 | new (this) BaseType(ownData.data(), ownData.rows(), ownData.cols());
26 | }
27 | else
28 | {
29 | new (this) BaseType((_Scalar*)o.data(), o.rows(), o.cols());
30 | }
31 | }
32 |
33 | ShareableMatrix(ShareableMatrix&& o) = default;
34 |
35 | ShareableMatrix& operator=(const ShareableMatrix& o)
36 | {
37 | if (o.ownData.data())
38 | {
39 | ownData = o.ownData;
40 | new (this) BaseType(ownData.data(), ownData.rows(), ownData.cols());
41 | }
42 | else
43 | {
44 | new (this) BaseType((_Scalar*)o.data(), o.rows(), o.cols());
45 | }
46 | return *this;
47 | }
48 |
49 | ShareableMatrix& operator=(ShareableMatrix&& o) = default;
50 |
51 | void init(_Scalar* ptr, Eigen::Index rows, Eigen::Index cols)
52 | {
53 | if (!ptr && rows && cols)
54 | {
55 | ownData = Eigen::Matrix<_Scalar, _rows, _cols>::Zero(_rows != -1 ? _rows : rows, _cols != -1 ? _cols : cols);
56 | ptr = ownData.data();
57 | }
58 | else
59 | {
60 | ownData = Eigen::Matrix<_Scalar, _rows, _cols>{};
61 | }
62 | new (this) BaseType(ptr, _rows != -1 ? _rows : rows, _cols != -1 ? _cols : cols);
63 | }
64 |
65 | void conservativeResize(size_t newRows, size_t newCols)
66 | {
67 | ownData.conservativeResize(_rows != -1 ? _rows : newRows, _cols != -1 ? _cols : newCols);
68 | new (this) BaseType(ownData.data(), ownData.rows(), ownData.cols());
69 | }
70 |
71 | void becomeOwner()
72 | {
73 | if (ownData.data() != this->m_data)
74 | {
75 | ownData = *this;
76 | new (this) BaseType(ownData.data(), ownData.rows(), ownData.cols());
77 | }
78 | }
79 |
80 | void serializerRead(std::istream& istr)
81 | {
82 | uint32_t rows = serializer::readFromStream<uint32_t>(istr);
83 | uint32_t cols = serializer::readFromStream<uint32_t>(istr);
84 | init(nullptr, rows, cols);
85 | if (!istr.read((char*)this->data(), sizeof(_Scalar) * this->size()))
86 | throw std::ios_base::failure(std::string("reading type '") + typeid(_Scalar).name() + std::string("' failed"));
87 | }
88 |
89 | void serializerWrite(std::ostream& ostr) const
90 | {
91 | serializer::writeToStream(ostr, (uint32_t)this->rows());
92 | serializer::writeToStream(ostr, (uint32_t)this->cols());
93 | if (!ostr.write((const char*)this->data(), sizeof(_Scalar) * this->size()))
94 | throw std::ios_base::failure(std::string("writing type '") + typeid(_Scalar).name() + std::string("' failed"));
95 | }
96 |
97 | uint64_t computeHash(uint64_t seed) const
98 | {
99 | seed = serializer::computeHashMany(seed, (uint32_t)this->rows(), (uint32_t)this->cols());
100 | return serializer::computeFastHash(this->data(), sizeof(_Scalar) * this->size(), seed);
101 | }
102 | };
103 |
104 | template<typename _Base, TermWeight _tw>
105 | struct SumWordWeight
106 | {
107 | Float sumWordWeight = 0;
108 | Float getSumWordWeight() const
109 | {
110 | return sumWordWeight;
111 | }
112 |
113 | void updateSumWordWeight(size_t realV)
114 | {
115 | sumWordWeight = std::accumulate(static_cast<_Base*>(this)->wordWeights.begin(), static_cast<_Base*>(this)->wordWeights.end(), 0.f);
116 | }
117 | };
118 |
119 | template<typename _Base>
120 | struct SumWordWeight<_Base, TermWeight::one>
121 | {
122 | int32_t sumWordWeight = 0;
123 | int32_t getSumWordWeight() const
124 | {
125 | return sumWordWeight;
126 | }
127 |
128 | void updateSumWordWeight(size_t realV)
129 | {
130 | sumWordWeight = (int32_t)std::count_if(static_cast<_Base*>(this)->words.begin(), static_cast<_Base*>(this)->words.end(), [realV](Vid w)
131 | {
132 | return w < realV;
133 | });
134 | }
135 | };
136 |
137 | template<TermWeight _tw>
138 | struct DocumentLDA : public DocumentBase, SumWordWeight<DocumentLDA<_tw>, _tw>
139 | {
140 | public:
141 | using DocumentBase::DocumentBase;
142 | using WeightType = typename std::conditional<_tw == TermWeight::one, int32_t, float>::type;
143 |
144 | tvector Zs;
145 | tvector wordWeights;
146 | ShareableMatrix numByTopic;
147 |
148 | DECLARE_SERIALIZER_WITH_VERSION(0);
149 | DECLARE_SERIALIZER_WITH_VERSION(1);
150 |
151 | template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
152 |
153 | WeightType getWordWeight(size_t idx) const
154 | {
155 | return _tw == TermWeight::one ? 1 : wordWeights[idx];
156 | }
157 |
158 | std::vector getCountVector(size_t V) const
159 | {
160 | std::vector vs(V);
161 | for (size_t i = 0; i < words.size(); ++i)
162 | {
163 | if (words[i] >= V) continue;
164 | vs[words[i]] += wordWeights.empty() ? 1.f : wordWeights[i];
165 | }
166 | return vs;
167 | }
168 | };
169 |
170 | struct LDAArgs
171 | {
172 | size_t k = 1;
173 | std::vector alpha = { (Float)0.1 };
174 | Float eta = (Float)0.01;
175 | size_t seed = std::random_device{}();
176 | };
177 |
178 | class ILDAModel : public ITopicModel
179 | {
180 | public:
181 | using DefaultDocType = DocumentLDA;
182 | static ILDAModel* create(TermWeight _weight, const LDAArgs& args,
183 | bool scalarRng = false);
184 |
185 | virtual TermWeight getTermWeight() const = 0;
186 | virtual size_t getOptimInterval() const = 0;
187 | virtual void setOptimInterval(size_t) = 0;
188 | virtual size_t getBurnInIteration() const = 0;
189 | virtual void setBurnInIteration(size_t) = 0;
190 | virtual std::vector getCountByTopic() const = 0;
191 | virtual Float getAlpha() const = 0;
192 | virtual Float getAlpha(size_t k) const = 0;
193 | virtual Float getEta() const = 0;
194 |
195 | virtual std::vector getWordPrior(const std::string& word) const = 0;
196 | virtual void setWordPrior(const std::string& word, const std::vector& priors) = 0;
197 | };
198 | }
199 |
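
A short sketch of constructing a model through the factory declared above; the hyperparameter values are illustrative, not recommendations:

    #include <memory>

    // Build an LDA model instance with 20 topics via the interface factory.
    std::unique_ptr<tomoto::ILDAModel> makeLda()
    {
        tomoto::LDAArgs args;
        args.k = 20;                          // number of topics
        args.alpha = { (tomoto::Float)0.1 };  // symmetric document-topic prior
        args.eta = (tomoto::Float)0.01;       // topic-word prior
        return std::unique_ptr<tomoto::ILDAModel>(
            tomoto::ILDAModel::create(tomoto::TermWeight::one, args));
    }
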
--------------------------------------------------------------------------------
/src/TopicModel/LDAModel.cpp:
--------------------------------------------------------------------------------
1 | #include "LDAModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 0, Zs, wordWeights);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 1, 0x00010001, Zs, wordWeights);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentLDA);
9 |
10 | ILDAModel* ILDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, LDAModel, args);
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/TopicModel/LLDA.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDA.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentLLDA : public DocumentLDA<_tw>
8 | {
9 | using BaseDocument = DocumentLDA<_tw>;
10 | using DocumentLDA<_tw>::DocumentLDA;
11 | using WeightType = typename DocumentLDA<_tw>::WeightType;
12 | Eigen::Matrix labelMask;
13 |
14 | DECLARE_SERIALIZER_WITH_VERSION(0);
15 | DECLARE_SERIALIZER_WITH_VERSION(1);
16 | };
17 |
18 | class ILLDAModel : public ILDAModel
19 | {
20 | public:
21 | using DefaultDocType = DocumentLLDA;
22 | static ILLDAModel* create(TermWeight _weight, const LDAArgs& args,
23 | bool scalarRng = false);
24 |
25 | virtual const Dictionary& getTopicLabelDict() const = 0;
26 |
27 | virtual size_t getNumTopicsPerLabel() const = 0;
28 | };
29 | }
--------------------------------------------------------------------------------
/src/TopicModel/LLDAModel.cpp:
--------------------------------------------------------------------------------
1 | #include "LLDAModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 0, labelMask);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 1, 0x00010001, labelMask);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentLLDA);
9 |
10 | ILLDAModel* ILLDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, LLDAModel, args);
13 | }
14 | }
--------------------------------------------------------------------------------
/src/TopicModel/MGLDA.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDA.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentMGLDA : public DocumentLDA<_tw>
8 | {
9 | using BaseDocument = DocumentLDA<_tw>;
10 | using DocumentLDA<_tw>::DocumentLDA;
11 | using WeightType = typename DocumentLDA<_tw>::WeightType;
12 |
13 | std::vector sents; // sentence id of each word (const)
14 | std::vector numBySent; // number of words in the sentence (const)
15 |
16 | //std::vector Zs; // gl./loc. and topic assignment
17 | std::vector Vs; // window assignment
18 | WeightType numGl = 0; // number of words assigned as gl.
19 | //std::vector numByTopic; // len = K + KL
20 | Eigen::Matrix numBySentWin; // len = S * T
21 | Eigen::Matrix numByWinL; // number of words assigned as loc. in the window (len = S + T - 1)
22 | Eigen::Matrix numByWin; // number of words in the window (len = S + T - 1)
23 | Eigen::Matrix numByWinTopicL; // number of words in the loc. topic in the window (len = KL * (S + T - 1))
24 |
25 | DECLARE_SERIALIZER_WITH_VERSION(0);
26 | DECLARE_SERIALIZER_WITH_VERSION(1);
27 |
28 | template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
29 | };
30 |
31 | struct MGLDAArgs : public LDAArgs
32 | {
33 | size_t kL = 1;
34 | size_t t = 3;
35 | std::vector alphaL = { 0.1 };
36 | Float alphaMG = 0.1;
37 | Float alphaML = 0.1;
38 | Float etaL = 0.01;
39 | Float gamma = 0.1;
40 | };
41 |
42 | class IMGLDAModel : public ILDAModel
43 | {
44 | public:
45 | using DefaultDocType = DocumentMGLDA;
46 | static IMGLDAModel* create(TermWeight _weight, const MGLDAArgs& args,
47 | bool scalarRng = false);
48 |
49 | virtual size_t getKL() const = 0;
50 | virtual size_t getT() const = 0;
51 | virtual Float getAlphaL() const = 0;
52 | virtual Float getEtaL() const = 0;
53 | virtual Float getGamma() const = 0;
54 | virtual Float getAlphaM() const = 0;
55 | virtual Float getAlphaML() const = 0;
56 | };
57 | }
--------------------------------------------------------------------------------
/src/TopicModel/MGLDAModel.cpp:
--------------------------------------------------------------------------------
1 | #include "MGLDAModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentMGLDA);
9 |
10 | IMGLDAModel* IMGLDAModel::create(TermWeight _weight, const MGLDAArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, MGLDAModel, args);
13 | }
14 | }
--------------------------------------------------------------------------------
/src/TopicModel/PA.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDA.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentPA : public DocumentLDA<_tw>
8 | {
9 | using BaseDocument = DocumentLDA<_tw>;
10 | using DocumentLDA<_tw>::DocumentLDA;
11 | using WeightType = typename DocumentLDA<_tw>::WeightType;
12 |
13 | tvector Z2s;
14 | Eigen::Matrix numByTopic1_2;
15 |
16 | template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
17 |
18 | DECLARE_SERIALIZER_WITH_VERSION(0);
19 | DECLARE_SERIALIZER_WITH_VERSION(1);
20 | };
21 |
22 | struct PAArgs : public LDAArgs
23 | {
24 | size_t k2 = 1;
25 | std::vector subalpha = { 0.1 };
26 | };
27 |
28 | class IPAModel : public ILDAModel
29 | {
30 | public:
31 | using DefaultDocType = DocumentPA;
32 | static IPAModel* create(TermWeight _weight, const PAArgs& args,
33 | bool scalarRng = false);
34 |
35 | virtual size_t getDirichletEstIteration() const = 0;
36 | virtual void setDirichletEstIteration(size_t iter) = 0;
37 | virtual size_t getK2() const = 0;
38 | virtual Float getSubAlpha(Tid k1, Tid k2) const = 0;
39 | virtual std::vector getSubAlpha(Tid k1) const = 0;
40 | virtual std::vector getSubTopicBySuperTopic(Tid k, bool normalize = true) const = 0;
41 | virtual std::vector> getSubTopicBySuperTopicSorted(Tid k, size_t topN) const = 0;
42 |
43 | virtual std::vector getSubTopicsByDoc(const DocumentBase* doc, bool normalize = true) const = 0;
44 | virtual std::vector> getSubTopicsByDocSorted(const DocumentBase* doc, size_t topN) const = 0;
45 |
46 | virtual std::vector getCountBySuperTopic() const = 0;
47 | };
48 | }
49 |
--------------------------------------------------------------------------------
/src/TopicModel/PAModel.cpp:
--------------------------------------------------------------------------------
1 | #include "PAModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 0, Z2s);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 1, 0x00010001, Z2s);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentPA);
9 |
10 | IPAModel* IPAModel::create(TermWeight _weight, const PAArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, PAModel, args);
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/TopicModel/PLDA.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LLDA.h"
3 |
4 | namespace tomoto
5 | {
6 | struct PLDAArgs : public LDAArgs
7 | {
8 | size_t numLatentTopics = 0;
9 | size_t numTopicsPerLabel = 1;
10 |
11 | PLDAArgs setK(size_t _k = 1) const
12 | {
13 | PLDAArgs ret = *this;
14 | ret.k = _k;
15 | return ret;
16 | }
17 | };
18 |
19 | class IPLDAModel : public ILLDAModel
20 | {
21 | public:
22 | using DefaultDocType = DocumentLLDA;
23 | static IPLDAModel* create(TermWeight _weight, const PLDAArgs& args,
24 | bool scalarRng = false);
25 |
26 | virtual size_t getNumLatentTopics() const = 0;
27 | };
28 | }
--------------------------------------------------------------------------------
/src/TopicModel/PLDAModel.cpp:
--------------------------------------------------------------------------------
1 | #include "PLDAModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | IPLDAModel* IPLDAModel::create(TermWeight _weight, const PLDAArgs& args, bool scalarRng)
6 | {
7 | TMT_SWITCH_TW(_weight, scalarRng, PLDAModel, args);
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/src/TopicModel/PT.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDA.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentPT : public DocumentLDA<_tw>
8 | {
9 | using BaseDocument = DocumentLDA<_tw>;
10 | using DocumentLDA<_tw>::DocumentLDA;
11 | using WeightType = typename DocumentLDA<_tw>::WeightType;
12 |
13 | uint64_t pseudoDoc = 0;
14 |
15 | DECLARE_SERIALIZER_WITH_VERSION(0);
16 | DECLARE_SERIALIZER_WITH_VERSION(1);
17 | };
18 |
19 | struct PTArgs : public LDAArgs
20 | {
21 | size_t p = 0;
22 | Float lambda = 0.01;
23 | };
24 |
25 | class IPTModel : public ILDAModel
26 | {
27 | public:
28 | using DefaultDocType = DocumentPT;
29 | static IPTModel* create(TermWeight _weight, const PTArgs& args,
30 | bool scalarRng = false);
31 |
32 | virtual size_t getP() const = 0;
33 | virtual std::vector getTopicsFromPseudoDoc(const DocumentBase* doc, bool normalize = true) const = 0;
34 | virtual std::vector> getTopicsFromPseudoDocSorted(const DocumentBase* doc, size_t topN) const = 0;
35 | };
36 | }
37 |
--------------------------------------------------------------------------------
/src/TopicModel/PTModel.cpp:
--------------------------------------------------------------------------------
1 | #include "PTModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 0, pseudoDoc);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 1, 0x00010001, pseudoDoc);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentPT);
9 |
10 | IPTModel* IPTModel::create(TermWeight _weight, const PTArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, PTModel, args);
13 | }
14 | }
--------------------------------------------------------------------------------
/src/TopicModel/SLDA.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "LDA.h"
3 |
4 | namespace tomoto
5 | {
6 | template<TermWeight _tw>
7 | struct DocumentSLDA : public DocumentLDA<_tw>
8 | {
9 | using BaseDocument = DocumentLDA<_tw>;
10 | using DocumentLDA<_tw>::DocumentLDA;
11 | std::vector y;
12 |
13 | RawDoc::MiscType makeMisc(const ITopicModel* tm) const override
14 | {
15 | RawDoc::MiscType ret = DocumentLDA<_tw>::makeMisc(tm);
16 | ret["y"] = y;
17 | return ret;
18 | }
19 |
20 | DECLARE_SERIALIZER_WITH_VERSION(0);
21 | DECLARE_SERIALIZER_WITH_VERSION(1);
22 | };
23 |
24 | struct SLDAArgs;
25 |
26 | class ISLDAModel : public ILDAModel
27 | {
28 | public:
29 | enum class GLM
30 | {
31 | linear = 0,
32 | binary_logistic = 1,
33 | };
34 |
35 | using DefaultDocType = DocumentSLDA;
36 | static ISLDAModel* create(TermWeight _weight, const SLDAArgs& args,
37 | bool scalarRng = false);
38 |
39 | virtual size_t getF() const = 0;
40 | virtual std::vector getRegressionCoef(size_t f) const = 0;
41 | virtual GLM getTypeOfVar(size_t f) const = 0;
42 | virtual std::vector estimateVars(const DocumentBase* doc) const = 0;
43 | };
44 |
45 | struct SLDAArgs : public LDAArgs
46 | {
47 | std::vector vars;
48 | std::vector mu;
49 | std::vector nuSq;
50 | std::vector glmParam;
51 | };
52 | }
--------------------------------------------------------------------------------
/src/TopicModel/SLDAModel.cpp:
--------------------------------------------------------------------------------
1 | #include "SLDAModel.hpp"
2 |
3 | namespace tomoto
4 | {
5 | DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 0, y);
6 | DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 1, 0x00010001, y);
7 |
8 | TMT_INSTANTIATE_DOC(DocumentSLDA);
9 |
10 | ISLDAModel* ISLDAModel::create(TermWeight _weight, const SLDAArgs& args, bool scalarRng)
11 | {
12 | TMT_SWITCH_TW(_weight, scalarRng, SLDAModel, args);
13 | }
14 | }
--------------------------------------------------------------------------------
/src/Utils/AliasMethod.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 | #include
4 | #include
5 |
6 | namespace tomoto
7 | {
8 | namespace sample
9 | {
10 | template
11 | class AliasMethod
12 | {
13 | std::unique_ptr<_Precision[]> arr;
14 | std::unique_ptr alias;
15 | size_t msize = 0, bitsize = 0;
16 |
17 | public:
18 | AliasMethod()
19 | {
20 | }
21 |
22 | AliasMethod(const AliasMethod& o)
23 | {
24 | operator=(o);
25 | }
26 |
27 | AliasMethod(AliasMethod&& o)
28 | {
29 | operator=(o);
30 | }
31 |
32 | AliasMethod& operator=(const AliasMethod& o)
33 | {
34 | msize = o.msize;
35 | bitsize = o.bitsize;
36 | if (msize)
37 | {
38 | size_t n = (size_t)1 << bitsize;
39 | arr = std::make_unique<_Precision[]>(n);
40 | alias = std::make_unique(n);
41 |
42 | std::copy(o.arr.get(), o.arr.get() + n, arr.get());
43 | std::copy(o.alias.get(), o.alias.get() + n, alias.get());
44 | }
45 | return *this;
46 | }
47 |
48 | AliasMethod& operator=(AliasMethod&& o)
49 | {
50 | msize = o.msize;
51 | bitsize = o.bitsize;
52 | std::swap(arr, o.arr);
53 | std::swap(alias, o.alias);
54 | return *this;
55 | }
56 |
57 | template<typename _Iter>
58 | AliasMethod(_Iter first, _Iter last)
59 | {
60 | buildTable(first, last);
61 | }
62 |
63 | template<typename _Iter>
64 | void buildTable(_Iter first, _Iter last)
65 | {
66 | size_t psize, nbsize;
67 | msize = 0;
68 | double sum = 0;
69 | for (auto it = first; it != last; ++it, ++msize)
70 | {
71 | sum += *it;
72 | }
73 |
74 | if (!std::isfinite(sum)) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "cannot build NaN value distribution");
75 |
76 | // ceil to power of 2
77 | nbsize = log2_ceil(msize);
78 | psize = (size_t)1 << nbsize;
79 |
80 | if (nbsize != bitsize)
81 | {
82 | arr = std::make_unique<_Precision[]>(psize);
83 | std::fill(arr.get(), arr.get() + psize, 0);
84 | alias = std::make_unique(psize);
85 | bitsize = nbsize;
86 | }
87 |
88 | sum /= psize;
89 |
90 | auto f = std::make_unique(psize);
91 | auto pf = f.get();
92 | for (auto it = first; it != last; ++it, ++pf)
93 | {
94 | *pf = *it / sum;
95 | }
96 | std::fill(pf, pf + psize - msize, 0);
97 |
98 | size_t over = 0, under = 0, mm;
99 | while (over < psize && f[over] < 1) ++over;
100 | while (under < psize && f[under] >= 1) ++under;
101 | mm = under + 1;
102 |
103 | while (over < psize && under < psize)
104 | {
105 | arr[under] = f[under] * (std::numeric_limits<_Precision>::max() + 1.0);
106 | alias[under] = over;
107 | f[over] += f[under] - 1;
108 | if (f[over] >= 1 || mm <= over)
109 | {
110 | for (under = mm; under < psize && f[under] >= 1; ++under);
111 | mm = under + 1;
112 | }
113 | else
114 | {
115 | under = over;
116 | }
117 |
118 | while (over < psize && f[over] < 1) ++over;
119 | }
120 |
121 | for (; over < psize; ++over)
122 | {
123 | if (f[over] >= 1)
124 | {
125 | arr[over] = std::numeric_limits<_Precision>::max();
126 | alias[over] = over;
127 | }
128 | }
129 |
130 | if (under < psize)
131 | {
132 | arr[under] = std::numeric_limits<_Precision>::max();
133 | alias[under] = under;
134 | for (under = mm; under < msize; ++under)
135 | {
136 | if (f[under] < 1)
137 | {
138 | arr[under] = std::numeric_limits<_Precision>::max();
139 | alias[under] = under;
140 | }
141 | }
142 | }
143 | }
144 |
145 | template<typename _Rng>
146 | size_t operator()(_Rng& rng) const
147 | {
148 | auto x = rng();
149 | size_t a;
150 | if (sizeof(_Precision) < sizeof(typename _Rng::result_type))
151 | {
152 | a = x >> (sizeof(x) * 8 - bitsize);
153 | }
154 | else
155 | {
156 | a = rng() & ((1 << bitsize) - 1);
157 | }
158 |
159 | _Precision b = (_Precision)x;
160 | if (b < arr[a])
161 | {
162 | assert(a < msize);
163 | return a;
164 | }
165 | assert(alias[a] < msize);
166 | return alias[a];
167 | }
168 | };
169 | }
170 | }
171 |
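
A standalone sketch of the alias table above in use, assuming the repository's headers are on the include path: build the table from a small weight vector, draw samples, and check that the empirical counts are roughly proportional to the weights.

    #include <cstdio>
    #include <random>
    #include <vector>
    #include "AliasMethod.hpp"

    int main()
    {
        std::vector<double> weights{ 1.0, 2.0, 3.0, 4.0 };
        tomoto::sample::AliasMethod<> table{ weights.begin(), weights.end() };

        std::mt19937_64 rng{ 42 };
        std::vector<size_t> counts(weights.size());
        for (size_t i = 0; i < 100000; ++i) ++counts[table(rng)];

        // Expect counts roughly in the ratio 1 : 2 : 3 : 4.
        for (size_t i = 0; i < counts.size(); ++i)
            std::printf("%zu: %zu\n", i, counts[i]);
    }
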
--------------------------------------------------------------------------------
/src/Utils/Dictionary.cpp:
--------------------------------------------------------------------------------
1 | #include "Dictionary.h"
2 |
3 | namespace tomoto
4 | {
5 | Dictionary::Dictionary() = default;
6 | Dictionary::~Dictionary() = default;
7 |
8 | Dictionary::Dictionary(const Dictionary&) = default;
9 | Dictionary& Dictionary::operator=(const Dictionary&) = default;
10 |
11 | Dictionary::Dictionary(Dictionary&&) noexcept = default;
12 | Dictionary& Dictionary::operator=(Dictionary&&) noexcept = default;
13 |
14 | Vid Dictionary::add(const std::string& word)
15 | {
16 | auto it = dict.find(word);
17 | if (it == dict.end())
18 | {
19 | dict.emplace(word, (Vid)dict.size());
20 | id2word.emplace_back(word);
21 | return (Vid)(dict.size() - 1);
22 | }
23 | return it->second;
24 | }
25 |
26 | const std::string& Dictionary::toWord(Vid vid) const
27 | {
28 | assert(vid < id2word.size());
29 | return id2word[vid];
30 | }
31 |
32 | Vid Dictionary::toWid(const std::string& word) const
33 | {
34 | auto it = dict.find(word);
35 | if (it == dict.end()) return non_vocab_id;
36 | return it->second;
37 | }
38 |
39 | void Dictionary::serializerWrite(std::ostream& writer) const
40 | {
41 | serializer::writeMany(writer, serializer::to_key("Dict"), id2word);
42 | }
43 |
44 | void Dictionary::serializerRead(std::istream& reader)
45 | {
46 | serializer::readMany(reader, serializer::to_key("Dict"), id2word);
47 | for (size_t i = 0; i < id2word.size(); ++i)
48 | {
49 | dict.emplace(id2word[i], (Vid)i);
50 | }
51 | }
52 |
53 | uint64_t Dictionary::computeHash(uint64_t seed) const
54 | {
55 | return serializer::computeHashMany(seed, id2word);
56 | }
57 |
58 | void Dictionary::swap(Dictionary& rhs)
59 | {
60 | std::swap(dict, rhs.dict);
61 | std::swap(id2word, rhs.id2word);
62 | }
63 |
64 | void Dictionary::reorder(const std::vector& order)
65 | {
66 | for (auto& p : dict)
67 | {
68 | p.second = order[p.second];
69 | id2word[p.second] = p.first;
70 | }
71 | }
72 |
73 | const std::vector& Dictionary::getRaw() const
74 | {
75 | return id2word;
76 | }
77 |
78 | Vid Dictionary::mapToNewDict(Vid v, const Dictionary& newDict) const
79 | {
80 | return newDict.toWid(toWord(v));
81 | }
82 |
83 | std::vector Dictionary::mapToNewDict(const std::vector& v, const Dictionary& newDict) const
84 | {
85 | std::vector r(v.size());
86 | for (size_t i = 0; i < v.size(); ++i)
87 | {
88 | r[i] = mapToNewDict(v[i], newDict);
89 | }
90 | return r;
91 | }
92 |
93 | std::vector Dictionary::mapToNewDictAdd(const std::vector& v, Dictionary& newDict) const
94 | {
95 | std::vector r(v.size());
96 | for (size_t i = 0; i < v.size(); ++i)
97 | {
98 | r[i] = newDict.add(toWord(v[i]));
99 | }
100 | return r;
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
/src/Utils/Dictionary.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include "serializer.hpp"
10 |
11 | namespace tomoto
12 | {
13 | using Vid = uint32_t;
14 | static constexpr Vid non_vocab_id = (Vid)-1;
15 | static constexpr Vid rm_vocab_id = (Vid)-2;
16 | using Tid = uint16_t;
17 | static constexpr Tid non_topic_id = (Tid)-1;
18 | using Float = float;
19 |
20 | struct VidPair : public std::pair
21 | {
22 | using std::pair::pair;
23 | };
24 |
25 | class Dictionary
26 | {
27 | protected:
28 | std::unordered_map<std::string, Vid> dict;
29 | std::vector<std::string> id2word;
30 | public:
31 |
32 | Dictionary();
33 | ~Dictionary();
34 |
35 | Dictionary(const Dictionary&);
36 | Dictionary& operator=(const Dictionary&);
37 |
38 | Dictionary(Dictionary&&) noexcept;
39 | Dictionary& operator=(Dictionary&&) noexcept;
40 |
41 | Vid add(const std::string& word);
42 |
43 | size_t size() const { return dict.size(); }
44 |
45 | const std::string& toWord(Vid vid) const;
46 |
47 | Vid toWid(const std::string& word) const;
48 |
49 | void serializerWrite(std::ostream& writer) const;
50 |
51 | void serializerRead(std::istream& reader);
52 |
53 | uint64_t computeHash(uint64_t seed) const;
54 |
55 | void swap(Dictionary& rhs);
56 |
57 | void reorder(const std::vector& order);
58 |
59 | const std::vector& getRaw() const;
60 |
61 | Vid mapToNewDict(Vid v, const Dictionary& newDict) const;
62 |
63 | std::vector mapToNewDict(const std::vector& v, const Dictionary& newDict) const;
64 |
65 | std::vector mapToNewDictAdd(const std::vector& v, Dictionary& newDict) const;
66 | };
67 |
68 | }
69 |
70 | namespace std
71 | {
72 | template<>
73 | struct hash
74 | {
75 | size_t operator()(const tomoto::VidPair& p) const
76 | {
77 | return hash{}(p.first) ^ hash{}(p.second);
78 | }
79 | };
80 | }
81 |
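
A short round-trip sketch (word to id and back) using only the members declared above:

    #include <cstdio>
    #include "Dictionary.h"

    int main()
    {
        tomoto::Dictionary dict;
        tomoto::Vid a = dict.add("alpha");  // new word -> new id
        dict.add("beta");
        dict.add("alpha");                  // duplicate -> returns the existing id

        std::printf("%zu words\n", dict.size());
        std::printf("beta -> %u\n", (unsigned)dict.toWid("beta"));
        std::printf("%u -> %s\n", (unsigned)a, dict.toWord(a).c_str());
        // Unknown words map to the non_vocab_id sentinel.
        std::printf("gamma known: %d\n", (int)(dict.toWid("gamma") != tomoto::non_vocab_id));
    }
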
--------------------------------------------------------------------------------
/src/Utils/LBFGS.h:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2016-2019 Yixuan Qiu
2 | // Under MIT license
3 | // https://github.com/yixuan/LBFGSpp
4 | // bab2min modified some features
5 |
6 | #ifndef LBFGS_H
7 | #define LBFGS_H
8 |
9 | #include
10 | #include "LBFGS/Param.h"
11 | #include "LBFGS/LineSearchBacktracking.h"
12 | #include "LBFGS/LineSearchBracketing.h"
13 |
14 |
15 | namespace LBFGSpp {
16 |
17 |
18 | ///
19 | /// LBFGS solver for unconstrained numerical optimization
20 | ///
21 | template < typename Scalar,
22 | template<class> class LineSearch = LineSearchBacktracking >
23 | class LBFGSSolver
24 | {
25 | private:
26 | typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
27 | typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> Matrix;
28 | typedef Eigen::Map<Vector> MapVec;
29 |
30 | static constexpr Scalar epsilon = Scalar(0.001); // add epsilon for preventing division-by-zero
31 |
32 | LBFGSParam<Scalar> m_param; // Parameters to control the LBFGS algorithm
33 | Matrix m_s; // History of the s vectors
34 | Matrix m_y; // History of the y vectors
35 | Vector m_ys; // History of the s'y values
36 | Vector m_alpha; // History of the step lengths
37 | Vector m_fx; // History of the objective function values
38 | Vector m_xp; // Old x
39 | Vector m_grad; // New gradient
40 | Vector m_gradp; // Old gradient
41 | Vector m_drt; // Moving direction
42 |
43 | inline void reset(int n)
44 | {
45 | const int m = m_param.m;
46 | m_s.resize(n, m);
47 | m_y.resize(n, m);
48 | m_ys.resize(m);
49 | m_alpha.resize(m);
50 | m_xp.resize(n);
51 | m_grad.resize(n);
52 | m_gradp.resize(n);
53 | m_drt.resize(n);
54 | if (m_param.past > 0)
55 | m_fx.resize(m_param.past);
56 | }
57 |
58 | public:
59 | ///
60 | /// Constructor for LBFGS solver.
61 | ///
62 | /// \param param An object of \ref LBFGSParam to store parameters for the
63 | /// algorithm
64 | ///
65 | LBFGSSolver(const LBFGSParam<Scalar>& param = {}) :
66 | m_param(param)
67 | {
68 | m_param.check_param();
69 | }
70 |
71 | ///
72 | /// Minimizing a multivariate function using LBFGS algorithm.
73 | /// Exceptions will be thrown if error occurs.
74 | ///
75 | /// \param f A function object such that `f(x, grad)` returns the
76 | /// objective function value at `x`, and overwrites `grad` with
77 | /// the gradient.
78 | /// \param x In: An initial guess of the optimal point. Out: The best point
79 | /// found.
80 | /// \param fx Out: The objective function value at `x`.
81 | ///
82 | /// \return Number of iterations used.
83 | ///
84 | template<typename Foo>
85 | inline int minimize(Foo&& f, Eigen::Ref<Vector> x, Scalar& fx)
86 | {
87 | const int n = x.size();
88 | const int fpast = m_param.past;
89 | reset(n);
90 |
91 | // Evaluate function and compute gradient
92 | fx = f(x, m_grad);
93 |
94 | Scalar xnorm = x.norm();
95 | Scalar gnorm = m_grad.norm();
96 | if (fpast > 0)
97 | m_fx[0] = fx;
98 |
99 | // Early exit if the initial x is already a minimizer
100 | if (gnorm <= m_param.epsilon * std::max(xnorm, Scalar(1.0)))
101 | {
102 | return 1;
103 | }
104 |
105 | // Initial direction
106 | m_drt.noalias() = -m_grad;
107 | // Initial step
108 | Scalar step = Scalar(1.0) / m_drt.norm();
109 |
110 | int k = 1;
111 | int end = 0;
112 | for (; ; )
113 | {
114 | // Save the current x and gradient
115 | m_xp.noalias() = x;
116 | m_gradp.noalias() = m_grad;
117 |
118 | // Line search to update x, fx and gradient
119 | LineSearch<Scalar>::LineSearch(f, fx, x, m_grad, step, m_drt, m_xp, m_param);
120 |
121 | // New x norm and gradient norm
122 | xnorm = x.norm();
123 | gnorm = m_grad.norm();
124 |
125 | // Convergence test -- gradient
126 | if (gnorm <= m_param.epsilon * std::max(xnorm, Scalar(1.0)))
127 | {
128 | return k;
129 | }
130 | // Convergence test -- objective function value
131 | if (fpast > 0)
132 | {
133 | if (k >= fpast && std::abs((m_fx[k % fpast] - fx) / fx) < m_param.delta)
134 | return k;
135 |
136 | m_fx[k % fpast] = fx;
137 | }
138 | // Maximum number of iterations
139 | if (m_param.max_iterations != 0 && k >= m_param.max_iterations)
140 | {
141 | return k;
142 | }
143 |
144 | // Update s and y
145 | // s_{k+1} = x_{k+1} - x_k
146 | // y_{k+1} = g_{k+1} - g_k
147 | MapVec svec(&m_s(0, end), n);
148 | MapVec yvec(&m_y(0, end), n);
149 | svec.noalias() = x - m_xp;
150 | yvec.noalias() = m_grad - m_gradp;
151 |
152 | // ys = y's = 1/rho
153 | // yy = y'y
154 | Scalar ys = yvec.dot(svec);
155 | Scalar yy = yvec.squaredNorm();
156 |
157 | /* prevent division-by-zero */
158 | if (yy == 0 || ys == 0)
159 | {
160 | ys += epsilon;
161 | yy += epsilon;
162 | }
163 | m_ys[end] = ys;
164 |
165 | // Recursive formula to compute d = -H * g
166 | m_drt.noalias() = -m_grad;
167 | int bound = std::min(m_param.m, k);
168 | end = (end + 1) % m_param.m;
169 | int j = end;
170 | for (int i = 0; i < bound; i++)
171 | {
172 | j = (j + m_param.m - 1) % m_param.m;
173 | MapVec sj(&m_s(0, j), n);
174 | MapVec yj(&m_y(0, j), n);
175 | m_alpha[j] = sj.dot(m_drt) / m_ys[j];
176 | m_drt.noalias() -= m_alpha[j] * yj;
177 | }
178 |
179 | m_drt *= (ys / yy);
180 |
181 | for (int i = 0; i < bound; i++)
182 | {
183 | MapVec sj(&m_s(0, j), n);
184 | MapVec yj(&m_y(0, j), n);
185 | Scalar beta = yj.dot(m_drt) / m_ys[j];
186 | m_drt.noalias() += (m_alpha[j] - beta) * sj;
187 | j = (j + 1) % m_param.m;
188 | }
189 |
190 | // step = 1.0 as initial guess
191 | step = Scalar(1.0);
192 | k++;
193 | }
194 |
195 | return k;
196 | }
197 | };
198 |
199 |
200 | } // namespace LBFGSpp
201 |
202 | #endif // LBFGS_H
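
A minimal standalone sketch of the solver on a convex quadratic; as documented above, a single callable returns the objective value and fills in the gradient:

    #include <cstdio>
    #include <Eigen/Core>
    #include "LBFGS.h"

    int main()
    {
        // f(x) = sum_i (x_i - i)^2 with gradient 2 * (x_i - i); minimum at x_i = i.
        auto f = [](const Eigen::VectorXd& x, Eigen::VectorXd& grad) -> double
        {
            double fx = 0;
            for (int i = 0; i < x.size(); ++i)
            {
                fx += (x[i] - i) * (x[i] - i);
                grad[i] = 2 * (x[i] - i);
            }
            return fx;
        };

        LBFGSpp::LBFGSSolver<double> solver;  // default LBFGSParam
        Eigen::VectorXd x = Eigen::VectorXd::Zero(4);
        double fx = 0;
        int iters = solver.minimize(f, x, fx);
        std::printf("converged in %d iterations, fx = %g\n", iters, fx);
    }
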
--------------------------------------------------------------------------------
/src/Utils/LBFGS/LineSearchBacktracking.h:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2016-2019 Yixuan Qiu
2 | // Under MIT license
3 |
4 | #ifndef LINE_SEARCH_BACKTRACKING_H
5 | #define LINE_SEARCH_BACKTRACKING_H
6 |
7 | #include
8 | #include // std::runtime_error
9 |
10 |
11 | namespace LBFGSpp {
12 |
13 |
14 | ///
15 | /// The backtracking line search algorithm for LBFGS. Mainly for internal use.
16 | ///
17 | template<typename Scalar>
18 | class LineSearchBacktracking
19 | {
20 | private:
21 | typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
22 |
23 | public:
24 | ///
25 | /// Line search by backtracking.
26 | ///
27 | /// \param f A function object such that `f(x, grad)` returns the
28 | /// objective function value at `x`, and overwrites `grad` with
29 | /// the gradient.
30 | /// \param fx In: The objective function value at the current point.
31 | /// Out: The function value at the new point.
32 | /// \param x Out: The new point moved to.
33 | /// \param grad In: The current gradient vector. Out: The gradient at the
34 | /// new point.
35 | /// \param step In: The initial step length. Out: The calculated step length.
36 | /// \param drt The current moving direction.
37 | /// \param xp The current point.
38 | /// \param param Parameters for the LBFGS algorithm
39 | ///
40 | template <typename Foo>
41 | static void LineSearch(Foo& f, Scalar& fx, Eigen::Ref<Vector> x, Vector& grad,
42 | Scalar& step,
43 | const Vector& drt, const Vector& xp,
44 | const LBFGSParam<Scalar>& param)
45 | {
46 | // Decreasing and increasing factors
47 | const Scalar dec = 0.5;
48 | const Scalar inc = 2.1;
49 |
50 | // Check the value of step
51 | if (step <= Scalar(0))
52 | throw std::invalid_argument("'step' must be positive");
53 |
54 | // Save the function value at the current x
55 | const Scalar fx_init = fx;
56 | // Projection of gradient on the search direction
57 | const Scalar dg_init = grad.dot(drt);
58 | // Make sure d points to a descent direction
59 | if (dg_init > 0)
60 | throw std::logic_error("the moving direction increases the objective function value");
61 |
62 | const Scalar dg_test = param.ftol * dg_init;
63 | Scalar width;
64 |
65 | int iter;
66 | for (iter = 0; iter < param.max_linesearch; iter++)
67 | {
68 | // x_{k+1} = x_k + step * d_k
69 | x.noalias() = xp + step * drt;
70 | // Evaluate this candidate
71 | fx = f(x, grad);
72 |
73 | if (fx > fx_init + step * dg_test)
74 | {
75 | width = dec;
76 | }
77 | else {
78 | // Armijo condition is met
79 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO)
80 | break;
81 |
82 | const Scalar dg = grad.dot(drt);
83 | if (dg < param.wolfe * dg_init)
84 | {
85 | width = inc;
86 | }
87 | else {
88 | // Regular Wolfe condition is met
89 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE)
90 | break;
91 |
92 | if (dg > -param.wolfe * dg_init)
93 | {
94 | width = dec;
95 | }
96 | else {
97 | // Strong Wolfe condition is met
98 | break;
99 | }
100 | }
101 | }
102 |
103 | if (iter >= param.max_linesearch)
104 | throw std::runtime_error("the line search routine reached the maximum number of iterations");
105 |
106 | if (step < param.min_step)
107 | throw std::runtime_error("the line search step became smaller than the minimum value allowed");
108 |
109 | if (step > param.max_step)
110 | throw std::runtime_error("the line search step became larger than the maximum value allowed");
111 |
112 | step *= width;
113 | }
114 | }
115 | };
116 |
117 |
118 | } // namespace LBFGSpp
119 |
120 | #endif // LINE_SEARCH_BACKTRACKING_H
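
In the loop above, `step` is repeatedly scaled by `width` (0.5 to shrink, 2.1 to grow) until the sufficient-decrease test and, depending on `param.linesearch`, the Wolfe or strong Wolfe curvature test is satisfied. The sketch below calls the routine directly on a one-dimensional quadratic; normally only the solver in LBFGS.h invokes it, the objective is purely illustrative, and the include paths assume compilation next to these headers.

// Sketch of calling the backtracking routine directly on f(x) = x^2.
#include <Eigen/Core>
#include "Param.h"
#include "LineSearchBacktracking.h"

int main()
{
    using Vec = Eigen::Matrix<double, Eigen::Dynamic, 1>;
    auto f = [](const Vec& x, Vec& grad) -> double
    {
        grad = 2.0 * x;          // f'(x) = 2x
        return x.squaredNorm();  // f(x) = x^2
    };

    LBFGSpp::LBFGSParam<double> param;       // defaults from Param.h
    Vec xp = Vec::Constant(1, 3.0);          // current point x_k = 3
    Vec x = xp, grad(1);
    double fx = f(xp, grad);                 // fx = 9, grad = (6)
    Vec drt = -grad;                         // steepest-descent direction
    double step = 1.0 / drt.norm();          // same initial guess the solver uses

    LBFGSpp::LineSearchBacktracking<double>::LineSearch(f, fx, x, grad, step, drt, xp, param);
    // On return, x = xp + step * drt and fx holds the (smaller) objective value there.
    return fx < 9.0 ? 0 : 1;
}
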
--------------------------------------------------------------------------------
/src/Utils/LBFGS/LineSearchBracketing.h:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2016-2019 Yixuan Qiu & Dirk Toewe
2 | // Under MIT license
3 |
4 | #ifndef LINE_SEARCH_BRACKETING_H
5 | #define LINE_SEARCH_BRACKETING_H
6 |
7 | #include <Eigen/Core>
8 | #include <stdexcept>  // std::runtime_error
9 |
10 | namespace LBFGSpp {
11 |
12 |
13 | ///
14 | /// The bracketing line search algorithm for LBFGS. Mainly for internal use.
15 | ///
16 | template <typename Scalar>
17 | class LineSearchBracketing
18 | {
19 | private:
20 | typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;
21 |
22 | public:
23 | ///
24 | /// Line search by bracketing. Similar to the backtracking line search
25 | /// except that it actively maintains an upper and lower bound of the
26 | /// current search range.
27 | ///
28 | /// \param f A function object such that `f(x, grad)` returns the
29 | /// objective function value at `x`, and overwrites `grad` with
30 | /// the gradient.
31 | /// \param fx In: The objective function value at the current point.
32 | /// Out: The function value at the new point.
33 | /// \param x Out: The new point moved to.
34 | /// \param grad In: The current gradient vector. Out: The gradient at the
35 | /// new point.
36 | /// \param step In: The initial step length. Out: The calculated step length.
37 | /// \param drt The current moving direction.
38 | /// \param xp The current point.
39 | /// \param param Parameters for the LBFGS algorithm
40 | ///
41 | template <typename Foo>
42 | static void LineSearch(Foo&& f, Scalar& fx, Eigen::Ref<Vector> x, Vector& grad,
43 | Scalar& step,
44 | const Vector& drt, const Vector& xp,
45 | const LBFGSParam<Scalar>& param)
46 | {
47 | // Check the value of step
48 | if (step <= Scalar(0))
49 | throw std::invalid_argument("'step' must be positive");
50 |
51 | // Save the function value at the current x
52 | const Scalar fx_init = fx;
53 | // Projection of gradient on the search direction
54 | const Scalar dg_init = grad.dot(drt);
55 | // Make sure d points to a descent direction
56 | if (dg_init > 0)
57 | throw std::logic_error("the moving direction increases the objective function value");
58 |
59 | const Scalar dg_test = param.ftol * dg_init;
60 |
61 | // Upper and lower end of the current line search range
62 | Scalar step_lo = 0,
63 | step_hi = std::numeric_limits<Scalar>::infinity();
64 |
65 | for (int iter = 0; iter < param.max_linesearch; iter++)
66 | {
67 | // x_{k+1} = x_k + step * d_k
68 | x.noalias() = xp + step * drt;
69 | // Evaluate this candidate
70 | fx = f(x, grad);
71 |
72 | if (fx > fx_init + step * dg_test)
73 | {
74 | step_hi = step;
75 | }
76 | else {
77 | // Armijo condition is met
78 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO)
79 | break;
80 |
81 | const Scalar dg = grad.dot(drt);
82 | if (dg < param.wolfe * dg_init)
83 | {
84 | step_lo = step;
85 | }
86 | else {
87 | // Regular Wolfe condition is met
88 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE)
89 | break;
90 |
91 | if (dg > -param.wolfe * dg_init)
92 | {
93 | step_hi = step;
94 | }
95 | else {
96 | // Strong Wolfe condition is met
97 | break;
98 | }
99 | }
100 | }
101 |
102 | assert(step_lo < step_hi);
103 |
104 | if (iter >= param.max_linesearch)
105 | throw std::runtime_error("the line search routine reached the maximum number of iterations");
106 |
107 | if (step < param.min_step)
108 | throw std::runtime_error("the line search step became smaller than the minimum value allowed");
109 |
110 | if (step > param.max_step)
111 | throw std::runtime_error("the line search step became larger than the maximum value allowed");
112 |
113 | // continue the search in the middle of the current range (double the step while no upper bound is known)
114 | step = std::isinf(step_hi) ? 2 * step : step_lo / 2 + step_hi / 2;
115 | }
116 | }
117 | };
118 |
119 |
120 | } // namespace LBFGSpp
121 |
122 | #endif // LINE_SEARCH_BRACKETING_H
123 |
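
The only difference from the backtracking variant is the last line of the loop: while no upper bound on the step has been found the step doubles, and once both `step_lo` and `step_hi` are known the next candidate is the midpoint of the bracket. The standalone snippet below just traces that update rule; the acceptance test is faked with a fixed threshold and is not part of the library.

// Standalone trace of the bracket update (not library code): any step above 1.0
// is treated as "too long" purely to drive the example.
#include <cmath>
#include <cstdio>
#include <limits>

int main()
{
    double step = 0.1;
    double step_lo = 0, step_hi = std::numeric_limits<double>::infinity();
    for (int iter = 0; iter < 8; iter++)
    {
        if (step > 1.0) step_hi = step;  // candidate rejected: tighten the upper bound
        else            step_lo = step;  // candidate accepted but too conservative: raise the lower bound
        step = std::isinf(step_hi) ? 2 * step : step_lo / 2 + step_hi / 2;
        std::printf("iter %d: bracket [%g, %g], next step %g\n", iter, step_lo, step_hi, step);
    }
    return 0;
}
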
--------------------------------------------------------------------------------
/src/Utils/LUT.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <array>
3 |
4 | namespace tomoto
5 | {
6 | namespace math
7 | {
8 | namespace detail
9 | {
10 | template<class _Func, class _Prec, size_t N, size_t S, size_t M, size_t T, size_t L, size_t U>
11 | class LUT3
12 | {
13 | protected:
14 | std::array<_Prec, N + M + L> points = {};
15 | static constexpr _Prec P = (_Prec)(1. / S);
16 | static constexpr _Prec Q = (_Prec)(1. / T);
17 | static constexpr _Prec R = (_Prec)(1. / U);
18 | LUT3()
19 | {
20 | _Func fun;
21 | for (size_t i = 0; i < N; i++)
22 | {
23 | points[i] = fun(i ? i * P : (_Prec)0.0001);
24 | }
25 | for (size_t i = 0; i < M; i++)
26 | {
27 | points[i + N] = fun(i*Q + N * P);
28 | }
29 | for (size_t i = 0; i < L; i++)
30 | {
31 | points[i + N + M] = fun(i*R + N * P + M * Q);
32 | }
33 | }
34 |
35 | _Prec _get(_Prec x) const
36 | {
37 | if (!std::isfinite(x)) return _Func{}.forNonFinite(x);
38 | if (x < 0) return NAN;
39 | if (x < _Func::smallThreshold) return _Func{}.forSmall(x);
40 | if (x >= N * P + M * Q + (L - 1) * R) return _Func{}.forLarge(x);
41 | size_t idx;
42 | _Prec a;
43 | _Prec nx = x;
44 | if (x < N*P)
45 | {
46 | idx = (size_t)(nx / P);
47 | a = (nx - idx * P) / P;
48 | }
49 | else
50 | {
51 | nx -= N * P;
52 | if (nx < M*Q)
53 | {
54 | idx = (size_t)(nx / Q);
55 | a = (nx - idx * Q) / Q;
56 | idx += N;
57 | }
58 | else
59 | {
60 | nx -= M * Q;
61 | idx = (size_t)(nx / R);
62 | a = (nx - idx * R) / R;
63 | idx += N + M;
64 | }
65 | }
66 | return points[idx] + a * (points[idx + 1] - points[idx]);
67 | }
68 | public:
69 | static const LUT3& getInst()
70 | {
71 | static LUT3 lg;
72 | return lg;
73 | }
74 |
75 | static _Prec get(_Prec x)
76 | {
77 | return getInst()._get(x);
78 | }
79 | };
80 | }
81 | }
82 | }
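
`LUT3` builds a three-segment lookup table once at static-initialization time (N points of width 1/S, then M of width 1/T, then L of width 1/U) and `_get` answers queries by linear interpolation between the two neighbouring table entries, falling back to the `_Func` policy for non-finite, very small, or out-of-range arguments. The functor below is a hypothetical example of the interface that policy must provide; tomotopy's real tables wrap gamma-function helpers elsewhere in src/Utils.

// Hypothetical policy functor showing the interface LUT3 expects from _Func
// (operator(), smallThreshold, forSmall, forLarge, forNonFinite).
#include <cmath>

struct LogFunc
{
    static constexpr float smallThreshold = 1e-4f;            // below this, bypass the table
    float operator()(float x) const { return std::log(x); }   // value tabulated at build time
    float forSmall(float x) const { return std::log(x); }     // exact value near zero
    float forLarge(float x) const { return std::log(x); }     // exact value past the last segment
    float forNonFinite(float x) const { return x; }           // propagate inf/NaN unchanged
};

// Assuming the parameter order <_Func, _Prec, N, S, M, T, L, U>:
// 100 points of width 1/100 on [0, 1), 90 of width 1/10 on [1, 10), 90 of width 1 on [10, 100).
// using LogLUT = tomoto::math::detail::LUT3<LogFunc, float, 100, 100, 90, 10, 90, 1>;
// float y = LogLUT::get(2.5f);  // linear interpolation between the two nearest entries
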
--------------------------------------------------------------------------------
/src/Utils/Mmap.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include "Mmap.h"
3 |
4 | namespace tomoto
5 | {
6 | namespace utils
7 | {
8 | static std::u16string utf8To16(const std::string& str)
9 | {
10 | std::u16string ret;
11 | for (auto it = str.begin(); it != str.end(); ++it)
12 | {
13 | uint32_t code = 0;
14 | uint32_t byte = (uint8_t)*it;
15 | if ((byte & 0xF8) == 0xF0)
16 | {
17 | code = (uint32_t)((byte & 0x07) << 18);
18 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" };
19 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" };
20 | code |= (uint32_t)((byte & 0x3F) << 12);
21 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" };
22 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" };
23 | code |= (uint32_t)((byte & 0x3F) << 6);
24 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" };
25 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" };
26 | code |= (byte & 0x3F);
27 | }
28 | else if ((byte & 0xF0) == 0xE0)
29 | {
30 | code = (uint32_t)((byte & 0x0F) << 12);
31 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" };
32 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" };
33 | code |= (uint32_t)((byte & 0x3F) << 6);
34 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" };
35 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" };
36 | code |= (byte & 0x3F);
37 | }
38 | else if ((byte & 0xE0) == 0xC0)
39 | {
40 | code = (uint32_t)((byte & 0x1F) << 6);
41 | if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" };
42 | if (((byte = *it) & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" };
43 | code |= (byte & 0x3F);
44 | }
45 | else if ((byte & 0x80) == 0x00)
46 | {
47 | code = byte;
48 | }
49 | else
50 | {
51 | throw std::invalid_argument{ "unicode error" };
52 | }
53 |
54 | if (code < 0x10000)
55 | {
56 | ret.push_back((char16_t)code);
57 | }
58 | else if (code <= 0x10FFFF)
59 | {
60 | code -= 0x10000;
61 | ret.push_back((char16_t)(0xD800 | (code >> 10)));
62 | ret.push_back((char16_t)(0xDC00 | (code & 0x3FF)));
63 | }
64 | else
65 | {
66 | throw std::invalid_argument{ "unicode error" };
67 | }
68 | }
69 | return ret;
70 | }
71 | }
72 | }
73 |
74 | namespace tomoto
75 | {
76 | namespace utils
77 | {
78 | MMap::MMap(const std::string& filepath)
79 | {
80 | #ifdef _WIN32
81 | hFile = CreateFileW((const wchar_t*)utf8To16(filepath).c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, nullptr);
82 | if (hFile == INVALID_HANDLE_VALUE) throw std::ios_base::failure("Cannot open '" + filepath + "'");
83 | hFileMap = CreateFileMapping(hFile, nullptr, PAGE_READONLY, 0, 0, nullptr);
84 | if (hFileMap == nullptr) throw std::ios_base::failure("Cannot open '" + filepath + "' Code:" + std::to_string(GetLastError()));
85 | view = (const char*)MapViewOfFile(hFileMap, FILE_MAP_READ, 0, 0, 0);
86 | if (!view) throw std::ios_base::failure("Cannot MapViewOfFile() Code:" + std::to_string(GetLastError()));
87 | DWORD high;
88 | len = GetFileSize(hFile, &high);
89 | len |= (uint64_t)high << 32;
90 | #else
91 | fd = open(filepath.c_str(), O_RDONLY);
92 | if (fd == -1) throw std::ios_base::failure("Cannot open '" + filepath + "'");
93 | struct stat sb;
94 | if (fstat(fd, &sb) < 0) throw std::ios_base::failure("Cannot open '" + filepath + "'");
95 | len = sb.st_size;
96 | view = (const char*)mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0);
97 | if (view == MAP_FAILED) throw std::ios_base::failure("Mapping failed");
98 | #endif
99 | }
100 |
101 | #ifdef _WIN32
102 | MMap::MMap(MMap&& o) noexcept
103 | : view{ o.view }, len{ o.len }
104 | {
105 | o.view = nullptr;
106 | std::swap(hFile, o.hFile);
107 | std::swap(hFileMap, o.hFileMap);
108 | }
109 | #else
110 | MMap::MMap(MMap&& o) noexcept
111 | : len{ o.len }, fd{ std::move(o.fd) }
112 | {
113 | std::swap(view, o.view);
114 | }
115 | #endif
116 |
117 | MMap& MMap::operator=(MMap&& o) noexcept
118 | {
119 | std::swap(view, o.view);
120 | std::swap(len, o.len);
121 | #ifdef _WIN32
122 | std::swap(hFile, o.hFile);
123 | std::swap(hFileMap, o.hFileMap);
124 | #else
125 | std::swap(fd, o.fd);
126 | #endif
127 | return *this;
128 | }
129 |
130 | MMap::~MMap()
131 | {
132 | #ifdef _WIN32
133 | if (hFileMap)
134 | {
135 | UnmapViewOfFile(view);
136 | view = nullptr;
137 | }
138 | #else
139 | if (view)
140 | {
141 | munmap((void*)view, len);
142 | }
143 | #endif
144 | }
145 | }
146 | }
147 |
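
`utf8To16` decodes 1- to 4-byte UTF-8 sequences into UTF-16 (emitting surrogate pairs for code points above U+FFFF) so the path can be handed to `CreateFileW`; the constructor then maps the file read-only via `CreateFileMapping`/`MapViewOfFile` on Windows and `mmap` elsewhere. A minimal usage sketch follows, with a placeholder file name.

// Minimal usage sketch; "trained_model.bin" is a placeholder file name. The view
// stays valid for the lifetime of the MMap object, and the destructor unmaps it.
#include <cstdio>
#include <exception>
#include "Mmap.h"

int main()
{
    try
    {
        tomoto::utils::MMap map{ "trained_model.bin" };  // throws std::ios_base::failure on error
        const char* data = map.get();                    // read-only view of the whole file
        std::printf("mapped %zu bytes, first byte = %d\n",
            map.size(), (data && map.size()) ? (int)(unsigned char)data[0] : -1);
    }
    catch (const std::exception& e)
    {
        std::printf("mapping failed: %s\n", e.what());
    }
    return 0;
}
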
--------------------------------------------------------------------------------
/src/Utils/Mmap.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <string>
3 | #include <cstdint>
4 |
5 | #ifdef _WIN32
6 | #define NOMINMAX
7 | #include <Windows.h>
8 | namespace tomoto
9 | {
10 | namespace utils
11 | {
12 | namespace detail
13 | {
14 | class HandleGuard
15 | {
16 | HANDLE handle = nullptr;
17 | public:
18 | HandleGuard(HANDLE _handle = nullptr) : handle(_handle)
19 | {
20 | }
21 |
22 | HandleGuard(const HandleGuard&) = delete;
23 | HandleGuard& operator =(const HandleGuard&) = delete;
24 |
25 | HandleGuard(HandleGuard&& o) noexcept
26 | {
27 | std::swap(handle, o.handle);
28 | }
29 |
30 | HandleGuard& operator=(HandleGuard&& o) noexcept
31 | {
32 | std::swap(handle, o.handle);
33 | return *this;
34 | }
35 |
36 | ~HandleGuard()
37 | {
38 | if (handle && handle != INVALID_HANDLE_VALUE)
39 | {
40 | CloseHandle(handle);
41 | handle = nullptr;
42 | }
43 | }
44 |
45 | operator HANDLE() const
46 | {
47 | return handle;
48 | }
49 | };
50 | }
51 |
52 | class MMap
53 | {
54 | const char* view = nullptr;
55 | uint64_t len = 0;
56 | detail::HandleGuard hFile, hFileMap;
57 | public:
58 | MMap(const std::string& filepath);
59 | MMap(const MMap&) = delete;
60 | MMap& operator=(const MMap&) = delete;
61 | MMap(MMap&& o) noexcept;
62 | MMap& operator=(MMap&& o) noexcept;
63 | ~MMap();
64 |
65 | const char* get() const { return view; }
66 | size_t size() const { return len; }
67 | };
68 | }
69 | }
70 | #else
71 | #include <sys/types.h>
72 | #include <sys/stat.h>
73 | #include <sys/mman.h>
74 | #include <fcntl.h>
75 | #include <unistd.h>
76 |
77 | namespace tomoto
78 | {
79 | namespace utils
80 | {
81 | namespace detail
82 | {
83 | class FDGuard
84 | {
85 | int fd = 0;
86 | public:
87 | FDGuard(int _fd = 0) : fd(_fd)
88 | {
89 | }
90 |
91 | FDGuard(const FDGuard&) = delete;
92 | FDGuard& operator =(const FDGuard&) = delete;
93 |
94 | FDGuard(FDGuard&& o)
95 | {
96 | std::swap(fd, o.fd);
97 | }
98 |
99 | FDGuard& operator=(FDGuard&& o)
100 | {
101 | std::swap(fd, o.fd);
102 | return *this;
103 | }
104 |
105 | ~FDGuard()
106 | {
107 | if (fd && fd != -1)
108 | {
109 | close(fd);
110 | fd = 0;
111 | }
112 | }
113 |
114 | operator int() const
115 | {
116 | return fd;
117 | }
118 | };
119 | }
120 |
121 | class MMap
122 | {
123 | const char* view = nullptr;
124 | size_t len = 0;
125 | detail::FDGuard fd;
126 | public:
127 | MMap(const std::string& filepath);
128 | MMap(const MMap&) = delete;
129 | MMap& operator=(const MMap&) = delete;
130 | MMap(MMap&& o) noexcept;
131 | MMap& operator=(MMap&& o) noexcept;
132 | ~MMap();
133 |
134 | const char* get() const { return view; }
135 | size_t size() const { return len; }
136 | };
137 | }
138 | }
139 | #endif
140 |
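
`HandleGuard` and `FDGuard` are small move-only RAII wrappers: the destructor closes the OS handle or descriptor, and copying is deleted so the same resource can never be released twice. A POSIX-only illustration with a placeholder path:

// POSIX-only sketch of the guard semantics; "example.txt" is a placeholder path.
#include <fcntl.h>
#include <utility>
#include "Mmap.h"

int main()
{
    tomoto::utils::detail::FDGuard a{ open("example.txt", O_RDONLY) };
    // tomoto::utils::detail::FDGuard b = a;          // does not compile: copy is deleted
    tomoto::utils::detail::FDGuard b = std::move(a);  // b now owns the descriptor; a holds 0
    return b == -1 ? 1 : 0;                           // implicit conversion via operator int()
}
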
--------------------------------------------------------------------------------
/src/Utils/MultiNormalDistribution.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 | #include
4 | #include "serializer.hpp"
5 |
6 | namespace tomoto
7 | {
8 | namespace math
9 | {
10 | template