├── DocumentFeatureSelection ├── bns │ ├── __init__.py │ ├── bns.py │ ├── bns_cython.pyx │ └── bns_python3.py ├── soa │ ├── __init__.py │ ├── soa.py │ ├── soa_cython.pyx │ └── soa_python3.py ├── common │ ├── __init__.py │ ├── utils.py │ ├── func_data_converter.py │ ├── crs_matrix_constructor.py │ └── data_converter.py ├── tf_idf │ ├── __init__.py │ └── tf_idf.py ├── pmi │ ├── __init__.py │ ├── PMI.py │ ├── pmi_cython.pyx │ └── PMI_python3.py ├── init_logger.py ├── __init__.py ├── interface.py └── models.py ├── examples ├── __init__.py ├── check_performance.py ├── huge_data_example.py ├── basic_example.py └── advanced_example.py ├── doc ├── source │ ├── tutorial.rst │ ├── installation.rst │ ├── index.rst │ └── conf.py ├── make.bat └── Makefile ├── MANIFEST.in ├── tests ├── __init__.py ├── docker-compose.yml ├── all_tests.py ├── Dockerfile ├── test_PMI_python3.py ├── test_tf_idf.py ├── test_soa_python3.py ├── check_code_pmi.py ├── test_data_models.py ├── test_interface.py ├── test_bns_python3.py └── test_data_converter.py ├── .travis.yml ├── .gitignore ├── setup.py ├── README.md └── LICENSE /DocumentFeatureSelection/bns/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/soa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'kensuke-mi' 2 | -------------------------------------------------------------------------------- /doc/source/tutorial.rst: -------------------------------------------------------------------------------- 1 | Quick tutorial and examples 2 | =========================== 3 | 4 | 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | recursive-include examples * 4 | recursive-include tests * 5 | recursive-include DocumentFeatureSelection * -------------------------------------------------------------------------------- /DocumentFeatureSelection/common/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | from __future__ import division 5 | __author__ = 'kensuke-mi' -------------------------------------------------------------------------------- /DocumentFeatureSelection/tf_idf/__init__.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | from __future__ import division -------------------------------------------------------------------------------- /DocumentFeatureSelection/pmi/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | __author__ = 'kensuke-mi'
7 |
8 |
--------------------------------------------------------------------------------
/DocumentFeatureSelection/bns/bns.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | import sys
7 | python_version = sys.version_info
8 |
9 | if python_version >= (3, 0, 0):
10 |     from DocumentFeatureSelection.bns.bns_python3 import BNS
11 | else:
12 |     raise NotImplementedError('Python 2 is not supported')
--------------------------------------------------------------------------------
/DocumentFeatureSelection/pmi/PMI.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | import sys
7 | python_version = sys.version_info
8 |
9 | if python_version >= (3, 0, 0):
10 |     from DocumentFeatureSelection.pmi.PMI_python3 import PMI
11 | else:
12 |     raise NotImplementedError('Python 2 is not supported')
--------------------------------------------------------------------------------
/DocumentFeatureSelection/soa/soa.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | import sys
7 | python_version = sys.version_info
8 |
9 | if python_version >= (3, 0, 0):
10 |     from DocumentFeatureSelection.soa.soa_python3 import SOA
11 | else:
12 |     raise NotImplementedError('Python 2 is not supported')
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Package gathering all unit tests for document-feature-selection.
4 | Module names must start with `test_` to be taken into account.
5 |
6 | You should consider installing :mod:`Distribute` to run all tests with::
7 |
8 |     $ python setup.py test
9 |
10 | """
11 | from __future__ import unicode_literals
12 | __author__ = 'kensuke-mi'
13 | import unittest
14 |
15 | if __name__ == '__main__':
16 |     import doctest
17 |     doctest.testmod()
18 |     unittest.main()
19 |
--------------------------------------------------------------------------------
/DocumentFeatureSelection/init_logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from logging import Formatter, StreamHandler
4 |
5 | # Formatter
6 | custom_formatter = Formatter(
7 |     fmt='[%(asctime)s]%(levelname)s - %(filename)s#%(funcName)s:%(lineno)d: %(message)s',
8 |     datefmt='%Y/%m/%d %H:%M:%S'
9 | )
10 |
11 | handler = logging.StreamHandler(sys.stderr)
12 | handler.setFormatter(custom_formatter)
13 |
14 | LOGGER_NAME = 'DocumentFeatureSelection'
15 | logger = logging.getLogger(LOGGER_NAME)
16 | logger.setLevel(logging.INFO)
17 | logger.addHandler(handler)
18 | logger.propagate = False
19 |
--------------------------------------------------------------------------------
/DocumentFeatureSelection/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 | from __future__ import absolute_import
4 | from __future__ import print_function
5 | from __future__ import division
6 | import sys
7 | python_version = sys.version_info
8 |
9 | #from DocumentFeatureSelection.common.data_converter import DataConverter, DataCsrMatrix
10 | from DocumentFeatureSelection.pmi.PMI import PMI
11 | from DocumentFeatureSelection.tf_idf.tf_idf import TFIDF
12 | from DocumentFeatureSelection.soa.soa import SOA
13 | from DocumentFeatureSelection.bns.bns import BNS
14 |
--------------------------------------------------------------------------------
/tests/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # docker-compose file that sets up the development / test environments in one place
2 | version: '3'
3 | services:
4 |   dev_env:
5 |     build:
6 |       context: ./
7 |       dockerfile: Dockerfile
8 |     volumes:
9 |       - ..:/codes/
10 |     working_dir: /codes
11 |     stdin_open: true
12 |     tty: true
13 |   test_env:
14 |     build:
15 |       context: ./
16 |       dockerfile: Dockerfile
17 |     volumes:
18 |       - ..:/codes/
19 |     working_dir: /codes
20 |     command: >
21 |       bash -c "echo 'Python3.6 Test' &&
22 |       source activate p36 &&
23 |       python setup.py test &&
24 |       deactivate &&
25 |       echo 'Python3.7 Test' &&
26 |       source activate p37 &&
27 |       python setup.py test"
28 |     stdin_open: true
29 |     tty: true
--------------------------------------------------------------------------------
/tests/all_tests.py:
--------------------------------------------------------------------------------
1 | __author__ = 'kensuke-mi'
2 |
3 | import sys
4 | import unittest
5 | python_version = sys.version_info
6 |
7 |
8 | def suite():
9 |     suite = unittest.TestSuite()
10 |     if python_version >= (3, 0, 0):
11 |         from .test_data_converter import TestDataConverter
12 |         from .test_PMI_python3 import TestPmiPython3
13 |         from .test_tf_idf import TestTfIdf
14 |         from .test_soa_python3 import TestSoaPython3
15 |         from .test_bns_python3 import TestBnsPython3
16 |         suite.addTest(unittest.makeSuite(TestDataConverter))
17 |         suite.addTest(unittest.makeSuite(TestPmiPython3))
18 |         suite.addTest(unittest.makeSuite(TestTfIdf))
19 | 
suite.addTest(unittest.makeSuite(TestSoaPython3)) 20 | suite.addTest(unittest.makeSuite(TestBnsPython3)) 21 | else: 22 | pass 23 | 24 | 25 | return suite -------------------------------------------------------------------------------- /doc/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installing / Upgrading 2 | ====================== 3 | 4 | Installing from source 5 | ---------------------- 6 | 7 | If you prefer install directly from the source:: 8 | 9 | $ cd document-feature-selection 10 | $ sudo python setup.py install 11 | 12 | Creating packages 13 | ----------------- 14 | 15 | You can easily create documentation and packages:: 16 | 17 | $ cd document-feature-selection 18 | $ python setup.py sdist # generate source .tar.gz file 19 | $ python setup.py bdist_deb # require python-all and python-stdeb packages 20 | $ python setup.py bdist_rpm # 21 | $ python setup.py bdist_msi # generate a Windows installer 22 | $ python setup.py bdist # generate a binary .tar.gz 23 | $ python setup.py py2exe # generate a portable Windows application 24 | $ python setup.py py2app # generate a portable Mac OS X application 25 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. document-feature-selection documentation master file, created by 2 | sphinx-quickstart on Wed Feb 13 11:51:12 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to document-feature-selection's documentation! 7 | ====================================================== 8 | 9 | Overview: 10 | 11 | :doc:`installation` 12 | Instruction on how to get the distribution 13 | 14 | :doc:`tutorial` 15 | Start here for a quick overview 16 | 17 | :doc:`api/index` 18 | The complete API documentation, organized by modules 19 | 20 | 21 | Full table of contents 22 | ====================== 23 | 24 | .. 
toctree:: 25 | :maxdepth: 4 26 | 27 | installation 28 | tutorial 29 | api/index 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | - "3.6" 5 | addons: 6 | apt: 7 | packages: 8 | - git 9 | - make 10 | - curl 11 | - xz-utils 12 | - file 13 | - pandoc 14 | - gcc-5 15 | - g++-5 16 | - build-essential 17 | sources: 18 | - ubuntu-toolchain-r-test 19 | before_install: 20 | - sudo apt-get update -qq 21 | - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 22 | - export CC="gcc-5" 23 | - export CXX="g++-5" 24 | - export CFLAGS=-std=c++11 25 | - export CXXFLAGS=-std=c++11 26 | - pip install numpy scipy scikit-learn cython sqlitedict 27 | install: 28 | - python --version 29 | - python setup.py install 30 | - pip install coveralls coverage nose 31 | script: 32 | - coverage run --source=DocumentFeatureSelection setup.py test 33 | after_success: 34 | - sudo coveralls 35 | notifications: 36 | email: 37 | recipients: 38 | - kensuke.mit@gmail.com 39 | on_success: always 40 | on_failure: always -------------------------------------------------------------------------------- /tests/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM frolvlad/alpine-glibc:alpine-3.6 2 | MAINTAINER kensuke-mi 3 | 4 | # apk update 5 | RUN apk update 6 | # general 7 | RUN apk --no-cache add vim \ 8 | wget \ 9 | lsof \ 10 | curl \ 11 | bash \ 12 | swig \ 13 | gcc \ 14 | build-base \ 15 | make \ 16 | python-dev \ 17 | py-pip \ 18 | jpeg-dev \ 19 | zlib-dev \ 20 | git \ 21 | linux-headers 22 | ENV LIBRARY_PATH=/lib:/usr/lib 23 | 24 | ENV PATH=/opt/conda/bin:$PATH \ 25 | LANG=C.UTF-8 \ 26 | MINICONDA=Miniconda3-latest-Linux-x86_64.sh 27 | 28 | # Python 29 | RUN apk add --no-cache bash wget && \ 30 | wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \ 31 | bash /Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \ 32 | ln -s /opt/conda/bin/* /usr/local/bin/ && \ 33 | rm -rf /root/.[acpw]* /$MINICONDA /opt/conda/pkgs/* 34 | 35 | RUN conda config --add channels conda-forge --system 36 | RUN conda install Cython \ 37 | scikit-learn \ 38 | scipy \ 39 | numpy 40 | 41 | RUN pip install more_itertools joblib nltk pypandoc sqlitedict nose 42 | RUN conda create -y -n p36 python=3.6 43 | RUN conda create -y -n p37 python=3.7 44 | 45 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .gitignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | 7 | # C extensions 8 | *.so 9 | *.c 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | .eggs/ 27 | 28 | 29 | # Standard IDEs 30 | .idea/ 31 | .project/ 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | # custom 66 | envs/ 67 | package/ 68 | .python-version 69 | web_api/tests/matrix.memmap 70 | web_api/tests/backend.sqlite3 71 | -------------------------------------------------------------------------------- /examples/check_performance.py: -------------------------------------------------------------------------------- 1 | from DocumentFeatureSelection.init_logger import logger 2 | from DocumentFeatureSelection import interface 3 | import logging 4 | import time 5 | import nltk 6 | 7 | nltk.download('abc') 8 | nltk.download('genesis') 9 | nltk.download('webtext') 10 | nltk.download('gutenberg') 11 | nltk.download('punkt') 12 | 13 | """This script shows the difference of computing speed between cython and multi-processing""" 14 | 15 | 16 | def pmi_with_parallel(input_corpus): 17 | logging.debug(msg='With multiprocessing backend') 18 | start = time.time() 19 | scored_matrix_obj = interface.run_feature_selection( 20 | input_dict=input_corpus, 21 | method='pmi', 22 | n_jobs=-1, 23 | ) 24 | elapsed_time = time.time() - start 25 | logger.info("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) 26 | 27 | 28 | def pmi_with_cython(input_corpus): 29 | logging.debug(msg='With cython is True') 30 | start = time.time() 31 | scored_matrix_obj = interface.run_feature_selection( 32 | input_dict=input_corpus, 33 | method='pmi', 34 | use_cython=True 35 | ) 36 | elapsed_time = time.time() - start 37 | logger.info("elapsed_time with cython:{} [sec]".format(elapsed_time)) 38 | 39 | 40 | from nltk.corpus import gutenberg 41 | from nltk.corpus import webtext 42 | from nltk.corpus import genesis 43 | from nltk.corpus import abc 44 | 45 | abs_corpus = abc.sents() 46 | genesis_corpus = genesis.sents() 47 | web_corpus = webtext.sents() 48 | gutenberg_corpus = gutenberg.sents() 49 | 50 | input_corpus = { 51 | 'abs': list(abs_corpus), 52 | 'genesis': list(genesis_corpus), 53 | 'web': list(web_corpus), 54 | 'gutenberg': list(gutenberg_corpus) 55 | } 56 | 57 | pmi_with_cython(input_corpus) 58 | pmi_with_parallel(input_corpus) 59 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/tf_idf/tf_idf.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | from sklearn.feature_extraction.text import TfidfTransformer
7 | from scipy.sparse.csr import csr_matrix
8 | from numpy import ndarray
9 | __author__ = 'kensuke-mi'
10 |
11 |
12 | class TFIDF(object):
13 |     def __init__(self, ngram=1, norm_metric='l2', use_idf_bool=True, smooth_idf_bool=True, sublinear_tf_bool=False):
14 |         assert isinstance(ngram, int)
15 |
16 |         self.n_gram = ngram
17 |         self.norm_metric = norm_metric
18 |         self.use_idf_bool = use_idf_bool
19 |         self.smooth_idf_bool = smooth_idf_bool
20 |         self.sublinear_tf_bool = sublinear_tf_bool
21 |
22 |     def fit_transform(self, X):
23 |         if isinstance(X, csr_matrix):
24 |             X = X.toarray()
25 |         else:
26 |             X = X
27 |
28 |         tf_idf_matrix = self.call_sklearn_tfidf(
29 |             X=X,
30 |             norm=self.norm_metric,
31 |             use_idf=self.use_idf_bool,
32 |             smooth_idf=self.smooth_idf_bool,
33 |             sublinear_tf=self.sublinear_tf_bool
34 |         )
35 |         self.weighed_matrix = tf_idf_matrix
36 |
37 |         return tf_idf_matrix
38 |
39 |     def call_sklearn_tfidf(self, X, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):
40 |         assert isinstance(X, (csr_matrix, ndarray))
41 |
42 |         tf_idf_generator = TfidfTransformer(
43 |             norm=norm,
44 |             use_idf=use_idf,
45 |             smooth_idf=smooth_idf,
46 |             sublinear_tf=sublinear_tf
47 |         )
48 |         if isinstance(X, csr_matrix):
49 |             feat_matrix = X.toarray()
50 |         else:
51 |             feat_matrix = X
52 |
53 |         tf_idf_weight_matrix = tf_idf_generator.fit_transform(
54 |             X=feat_matrix
55 |         )
56 |         assert isinstance(tf_idf_weight_matrix, (csr_matrix, ndarray))
57 |
58 |         return tf_idf_weight_matrix
59 |
60 |
--------------------------------------------------------------------------------
/DocumentFeatureSelection/common/utils.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | from scipy.sparse.csr import csr_matrix
7 | from typing import Union
8 | from DocumentFeatureSelection import models
9 | import sqlitedict
10 | import sys
11 | import tempfile
12 | import os
13 | python_version = sys.version_info
14 |
15 | __author__ = 'kensuke-mi'
16 |
17 |
18 | def flatten(lis):
19 |     for item in lis:
20 |         if isinstance(item, list) and not isinstance(item, str):
21 |             for x in flatten(item):
22 |                 yield x
23 |         else:
24 |             yield item
25 |
26 |
27 | def __conv_into_dict_format(pmi_word_score_items):
28 |     out_format_structure = {}
29 |     for item in pmi_word_score_items:
30 |         if item['label'] not in out_format_structure:
31 |             out_format_structure[item['label']] = [{'word': item['word'], 'score': item['score']}]
32 |         else:
33 |             out_format_structure[item['label']].append({'word': item['word'], 'score': item['score']})
34 |     return out_format_structure
35 |
36 |
37 | def extract_from_csr_matrix(weight_csr_matrix, vocabulary, label_id, row_id, col_id):
38 |     assert isinstance(weight_csr_matrix, csr_matrix)
39 |     assert isinstance(vocabulary, dict)
40 |     assert isinstance(label_id, dict)
41 |
42 |
43 | def init_cache_object(file_name:str,
44 |                       path_work_dir:str=tempfile.mkdtemp(),
45 |                       cache_backend:str='PersistentDict')->Union[sqlitedict.SqliteDict, models.PersistentDict]:
46 |     """* What you can do
47 |     - You initialize a cache object.
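    - `cache_backend` selects the persistence layer: 'PersistentDict' (the default) stores the cache with models.PersistentDict, while 'SqliteDict' stores it in sqlitedict.SqliteDict(autocommit=True); any other value raises an Exception.
    - The cache file is created at os.path.join(path_work_dir, file_name); path_work_dir defaults to a temporary directory made by tempfile.mkdtemp().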
48 | """ 49 | if cache_backend == 'PersistentDict': 50 | cached_obj = models.PersistentDict(os.path.join(path_work_dir, file_name)) 51 | elif cache_backend == 'SqliteDict': 52 | cached_obj = sqlitedict.SqliteDict(os.path.join(path_work_dir, file_name), autocommit=True) 53 | else: 54 | raise Exception('No cache backend named {}'.format(cache_backend)) 55 | 56 | return cached_obj -------------------------------------------------------------------------------- /examples/huge_data_example.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | from DocumentFeatureSelection import interface 3 | from DocumentFeatureSelection.models import PersistentDict 4 | from DocumentFeatureSelection.init_logger import logger 5 | import logging 6 | import time 7 | import os 8 | import nltk 9 | nltk.download('wordnet') 10 | from collections import Counter 11 | from nltk import stem 12 | from typing import List 13 | # make download 20news group file 14 | from sklearn.datasets import fetch_20newsgroups 15 | newsgroups_train = fetch_20newsgroups(subset='train') 16 | lemmatizer = stem.WordNetLemmatizer() 17 | logger.setLevel(logging.DEBUG) 18 | 19 | """This example shows you how to work on huge dataset. 20 | For persisted-dict object you can choose PersistentDict or SqliteDict 21 | """ 22 | 23 | DATA_LIMIT = 100000 24 | 25 | 26 | def run_nltk_lemma(subject_name: str)->List[str]: 27 | return [lemmatizer.lemmatize(t).strip(':?!><') for t in subject_name.lower().split()] 28 | 29 | 30 | category_names = newsgroups_train.target_names 31 | logger.debug("20-news has {} categories".format(len(category_names))) 32 | logger.debug("Now pre-processing on subject text...") 33 | news_lemma = [run_nltk_lemma(d) for d in newsgroups_train.data[:DATA_LIMIT]] 34 | 35 | index2category = {i: t for i, t in enumerate(newsgroups_train.target_names)} 36 | dict_index2label = {i: index2category[t_no] for i, t_no in enumerate(newsgroups_train.target[:DATA_LIMIT])} 37 | logger.info("Subject distribution") 38 | for k, v in dict(Counter(dict_index2label.values())).items(): 39 | logger.info("{} is {}, {}%".format(k, v, v / len(dict_index2label) * 100)) 40 | 41 | # Case of PersistentDict 42 | logger.info("Putting documents into dict object...") 43 | persistent_dict_obj = PersistentDict('demo.json', 'c', format='json') 44 | for i, label in dict_index2label.items(): 45 | if label in persistent_dict_obj: 46 | persistent_dict_obj[label].append(news_lemma[i]) 47 | else: 48 | persistent_dict_obj[label] = [news_lemma[i]] 49 | else: 50 | persistent_dict_obj.sync() 51 | 52 | start = time.time() 53 | # If you put is_use_cache=True, it uses cache object for keeping huge objects during computation 54 | # If you put is_use_memmap=True, it uses memmap for keeping matrix during computation 55 | scored_matrix_obj = interface.run_feature_selection( 56 | input_dict=persistent_dict_obj, 57 | method='pmi', 58 | use_cython=True, 59 | is_use_cache=True, 60 | is_use_memmap=True 61 | ) 62 | elapsed_time = time.time() - start 63 | logger.info("elapsed_time with cython: {} [sec]".format(elapsed_time)) 64 | os.remove('./demo.json') 65 | -------------------------------------------------------------------------------- /tests/test_PMI_python3.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from DocumentFeatureSelection.common import data_converter 3 | from DocumentFeatureSelection.common.data_converter import DataCsrMatrix 4 | from 
DocumentFeatureSelection.models import ScoredResultObject 5 | from DocumentFeatureSelection.pmi import PMI_python3 6 | from scipy.sparse import csr_matrix 7 | 8 | 9 | class TestPmiPython3(unittest.TestCase): 10 | def setUp(self): 11 | input_dict = { 12 | "label_a": [ 13 | ["I", "aa", "aa", "aa", "aa", "aa"], 14 | ["bb", "aa", "aa", "aa", "aa", "aa"], 15 | ["I", "aa", "hero", "some", "ok", "aa"] 16 | ], 17 | "label_b": [ 18 | ["bb", "bb", "bb"], 19 | ["bb", "bb", "bb"], 20 | ["hero", "ok", "bb"], 21 | ["hero", "cc", "bb"], 22 | ], 23 | "label_c": [ 24 | ["cc", "cc", "cc"], 25 | ["cc", "cc", "bb"], 26 | ["xx", "xx", "cc"], 27 | ["aa", "xx", "cc"], 28 | ] 29 | } 30 | 31 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 32 | labeled_documents=input_dict, 33 | n_jobs=5 34 | ) 35 | assert isinstance(data_csr_matrix, DataCsrMatrix) 36 | self.label2id_dict = data_csr_matrix.label2id_dict 37 | self.csr_matrix_ = data_csr_matrix.csr_matrix_ 38 | self.n_docs_distribution = data_csr_matrix.n_docs_distribution 39 | self.vocabulary = data_csr_matrix.vocabulary 40 | 41 | def test_normal_fit_transform(self): 42 | pmi_object = PMI_python3.PMI() 43 | scored_matrix = pmi_object.fit_transform( 44 | X=self.csr_matrix_, 45 | n_jobs=1, 46 | n_docs_distribution=self.n_docs_distribution 47 | ) 48 | assert isinstance(scored_matrix, csr_matrix) 49 | 50 | def test_multi_process_fit_transform(self): 51 | pmi_object = PMI_python3.PMI() 52 | scored_matrix = pmi_object.fit_transform( 53 | X=self.csr_matrix_, 54 | n_jobs=5, 55 | n_docs_distribution=self.n_docs_distribution, 56 | verbose=True 57 | ) 58 | assert isinstance(scored_matrix, csr_matrix) 59 | 60 | def test_output_result_pmi(self): 61 | pmi_object = PMI_python3.PMI() 62 | scored_matrix = pmi_object.fit_transform( 63 | X=self.csr_matrix_, 64 | n_jobs=5, 65 | n_docs_distribution=self.n_docs_distribution 66 | ) 67 | assert isinstance(scored_matrix, csr_matrix) 68 | 69 | pmi_scored_dict = ScoredResultObject( 70 | scored_matrix=scored_matrix, 71 | label2id_dict=self.label2id_dict, 72 | feature2id_dict=self.vocabulary 73 | ).convert_score_matrix2score_record(outformat='items') 74 | self.assertTrue(isinstance(pmi_scored_dict, list)) 75 | 76 | 77 | if __name__ == '__main__': 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/soa/soa_cython.pyx: -------------------------------------------------------------------------------- 1 | import math 2 | import scipy 3 | cimport numpy as np 4 | from cpython cimport bool 5 | 6 | cdef float soa( 7 | np.ndarray[np.float64_t, ndim=2] X, 8 | np.ndarray[np.int64_t, ndim=1] unit_distribution, 9 | int n_total_docs, 10 | int feature_index, 11 | int sample_index, 12 | bool verbose): 13 | # X is either of term-frequency matrix per label or document-frequency per label 14 | 15 | matrix_size = X.shape 16 | NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] 17 | 18 | # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e 19 | cdef float freq_w_e = X[sample_index, feature_index] 20 | # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e 21 | cdef float freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() 22 | # freq_e is the number of the unit having specific label e 23 | cdef float freq_e = unit_distribution[sample_index] 24 | # freq_not_e is the number of the unit NOT having the specific label e 25 | 
cdef float freq_not_e = n_total_docs - freq_e
26 |     cdef float numerator, denominator, ans, soa_val
27 |
28 |     if verbose:
29 |         print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index))
30 |         print('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format(
31 |             freq_w_e,
32 |             freq_w_not_e,
33 |             freq_e,
34 |             freq_not_e
35 |         ))
36 |
37 |     if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0:
38 |         return 0.0
39 |     else:
40 |         numerator = (float(freq_w_e) * freq_not_e)
41 |         denominator = (float(freq_e) * freq_w_not_e)
42 |         ans = numerator / denominator
43 |         soa_val = math.log(ans, 2)
44 |         return soa_val
45 |
46 |
47 | def main(X,
48 |          np.ndarray[np.int64_t, ndim=1] n_docs_distribution,
49 |          int n_total_doc,
50 |          sample_range,
51 |          feature_range,
52 |          bool verbose=False):
53 |     """What you can do
54 |     - calculate SOA score based on given data.
55 |     - The function returns list of tuple, whose element is (sample_index, feature_index, score)
56 |     - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature.
57 |     """
58 |
59 |     cdef int n_samples = X.shape[0]
60 |
61 |     if isinstance(X, scipy.sparse.csr_matrix):
62 |         X = X.toarray()
63 |
64 |     cdef int sample_index, feature_index
65 |     soa_score_csr_source = [
66 |         (
67 |             sample_index,
68 |             feature_index,
69 |             soa(X, n_docs_distribution, n_total_doc, feature_index, sample_index, verbose)
70 |         )
71 |         for sample_index in sample_range
72 |         for feature_index in feature_range
73 |     ]
74 |     non_zero_soa_score_csr_source = [score_tuple for score_tuple in soa_score_csr_source if not score_tuple[2]==0]
75 |
76 |     return non_zero_soa_score_csr_source
--------------------------------------------------------------------------------
/tests/test_tf_idf.py:
--------------------------------------------------------------------------------
1 | #! 
-*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | from __future__ import division 6 | from scipy.sparse import csr_matrix 7 | from DocumentFeatureSelection.common import data_converter 8 | from DocumentFeatureSelection.common.data_converter import DataCsrMatrix 9 | from DocumentFeatureSelection.tf_idf import tf_idf 10 | from DocumentFeatureSelection.models import ScoredResultObject 11 | import logging 12 | import unittest 13 | import numpy 14 | logging.basicConfig(level=logging.DEBUG) 15 | logger = logging.getLogger(__name__) 16 | __author__ = 'kensuke-mi' 17 | 18 | 19 | class TestTfIdf(unittest.TestCase): 20 | def setUp(self): 21 | input_dict = { 22 | "label_a": [ 23 | ["I", "aa", "aa", "aa", "aa", "aa"], 24 | ["bb", "aa", "aa", "aa", "aa", "aa"], 25 | ["I", "aa", "hero", "some", "ok", "aa"] 26 | ], 27 | "label_b": [ 28 | ["bb", "bb", "bb"], 29 | ["bb", "bb", "bb"], 30 | ["hero", "ok", "bb"], 31 | ["hero", "cc", "bb"], 32 | ], 33 | "label_c": [ 34 | ["cc", "cc", "cc"], 35 | ["cc", "cc", "bb"], 36 | ["xx", "xx", "cc"], 37 | ["aa", "xx", "cc"], 38 | ] 39 | } 40 | 41 | tf_matrix = numpy.array( 42 | [ 43 | [2, 12, 1, 0, 1, 1, 1, 0], 44 | [0, 0, 8, 1, 2, 1, 0, 0], 45 | [0, 1, 1, 7, 0, 0, 0, 3] 46 | ] 47 | ) 48 | 49 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 50 | labeled_documents=input_dict, 51 | n_jobs=-1 52 | ) 53 | assert isinstance(data_csr_matrix, DataCsrMatrix) 54 | self.label2id_dict = data_csr_matrix.label2id_dict 55 | self.csr_matrix_ = data_csr_matrix.csr_matrix_ 56 | self.n_docs_distribution = data_csr_matrix.n_docs_distribution 57 | self.vocabulary = data_csr_matrix.vocabulary 58 | 59 | numpy.array_equal(data_csr_matrix.csr_matrix_.toarray(), tf_matrix) 60 | 61 | def test_normal_fit_transform(self): 62 | tf_idf_weighted_matrix = tf_idf.TFIDF().fit_transform( 63 | X=self.csr_matrix_, 64 | ) 65 | assert isinstance(tf_idf_weighted_matrix, csr_matrix) 66 | 67 | def test_output_result_pmi(self): 68 | tf_idf_weighted_matrix = tf_idf.TFIDF().fit_transform( 69 | X=self.csr_matrix_, 70 | ) 71 | assert isinstance(tf_idf_weighted_matrix, csr_matrix) 72 | 73 | tf_idf_scored_dict = ScoredResultObject( 74 | scored_matrix=tf_idf_weighted_matrix, 75 | label2id_dict=self.label2id_dict, 76 | feature2id_dict=self.vocabulary, 77 | ).convert_score_matrix2score_record(outformat='items') 78 | self.assertTrue(isinstance(tf_idf_scored_dict, list)) 79 | assert isinstance(tf_idf_scored_dict, list) 80 | 81 | 82 | if __name__ == '__main__': 83 | unittest.main() 84 | 85 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/bns/bns_cython.pyx: -------------------------------------------------------------------------------- 1 | import math 2 | import scipy 3 | from cpython cimport bool 4 | from scipy.stats import norm 5 | cimport numpy 6 | import numpy 7 | 8 | 9 | cdef float bns( 10 | numpy.ndarray[numpy.float64_t, ndim=2] X, 11 | numpy.ndarray[numpy.int64_t, ndim=1] unit_distribution, 12 | int feature_index, 13 | int sample_index, 14 | int true_index, 15 | bool verbose): 16 | # X is either of term-frequency matrix per label or document-frequency per label 17 | 18 | cdef int false_index 19 | if true_index == 0: 20 | false_index = 1 21 | elif true_index == 1: 22 | false_index = 0 23 | else: 24 | raise Exception('true index must be either of 0 or 1') 25 | 26 | # trueラベルで出現した回数 27 | # tp is frequency of 
features in the specified positive label 28 | cdef float tp = X[true_index, feature_index] 29 | # trueラベルで出現しなかった回数 30 | # fp is frequency of NON-features(expect specified feature) in the specified positive label 31 | cdef float fp = unit_distribution[true_index] - tp 32 | 33 | # negativeラベルで出現した回数 34 | # fn is frequency of features in the specified negative label 35 | cdef float fn = X[false_index, feature_index] 36 | # negativeラベルで出現しなかった回数 37 | # fp is frequency of NON-features(expect specified feature) in the specified negative label 38 | cdef float tn = unit_distribution[false_index] - fn 39 | 40 | if tn < 0.0: 41 | print('Something wrong') 42 | 43 | cdef float pos = tp + fn 44 | cdef float neg = fp + tn 45 | 46 | cdef float tpr = tp / pos 47 | cdef float fpr = fp / neg 48 | 49 | if verbose: 50 | print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) 51 | print('tp:{} fp:{} fn:{} tn:{} pos:{} neg:{} tpr:{} fpr:{}'.format( 52 | tp, 53 | fp, 54 | fn, 55 | tn, 56 | pos, 57 | neg, 58 | tpr, 59 | fpr 60 | )) 61 | cdef float bns_score = numpy.abs(norm.ppf(tpr) - norm.ppf(fpr)) 62 | # cdef float bns_score = numpy.abs(norm.ppf(norm.cdf(tpr)) - norm.ppf(norm.cdf(fpr))) 63 | return bns_score 64 | 65 | 66 | def main(X, 67 | numpy.ndarray[numpy.int64_t, ndim=1] unit_distribution, 68 | sample_range, 69 | feature_range, 70 | int true_index, 71 | bool verbose=False): 72 | """What you can do 73 | - calculate BNS score based on given data. 74 | - The function returns list of tuple, whose element is (sample_index, feature_index, score) 75 | - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature. 76 | """ 77 | if isinstance(X, scipy.sparse.csr_matrix): 78 | X = X.toarray() 79 | 80 | cdef int sample_index, feature_index 81 | soa_score_csr_source = [ 82 | ( 83 | sample_index, 84 | feature_index, 85 | bns(X, unit_distribution, feature_index, sample_index, true_index, verbose) 86 | ) 87 | for sample_index in sample_range 88 | for feature_index in feature_range 89 | ] 90 | 91 | return soa_score_csr_source -------------------------------------------------------------------------------- /DocumentFeatureSelection/pmi/pmi_cython.pyx: -------------------------------------------------------------------------------- 1 | import math 2 | import scipy 3 | cimport numpy as np 4 | from cpython cimport bool 5 | 6 | cdef float pmi(np.ndarray[np.float64_t, ndim=2] X, 7 | int n_samples, 8 | np.ndarray[np.int64_t, ndim=1] n_docs_distribution, 9 | int n_total_doc, 10 | int feature_index, 11 | int sample_index, 12 | bool verbose): 13 | """get PMI score for given feature & sample index 14 | """ 15 | cdef i 16 | sample_indexes = [i for i in range(0, n_samples) if i != sample_index] 17 | 18 | # n_11 is #docs having feature(i.e. 
word) in the specified index(label) 19 | cdef float n_11 = X[sample_index, feature_index] 20 | # n_01 is #docs NOT having feature in the specified index(label) 21 | cdef float n_01 = n_docs_distribution[sample_index] - n_11 22 | # n_10 is #docs having feature in NOT specified index(indexes except specified index) 23 | cdef float n_10 = X[sample_indexes, feature_index].sum() 24 | # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) 25 | cdef float n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) 26 | 27 | cdef float temp1, temp2, temp3, temp4, score 28 | 29 | if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: 30 | return 0 31 | else: 32 | temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) 33 | temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) 34 | temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) 35 | temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) 36 | score = temp1 + temp2 + temp3 + temp4 37 | 38 | if verbose: 39 | print('score={}, temp1={}, temp2={}, temp3={}, temp4={}, n11={}, n10={}, n01={}, n00={}, n_total_docs={}'.format(score, temp1, temp2, temp3, temp4, n_11, n_10, n_01, n_00, n_total_doc)) 40 | 41 | return score 42 | 43 | 44 | def main(X, 45 | np.ndarray[np.int64_t, ndim=1] n_docs_distribution, 46 | int n_total_doc, 47 | sample_range, 48 | feature_range, 49 | bool verbose=False): 50 | """What you can do 51 | - calculate PMI score based on given data. 52 | - The function returns list of tuple, whose element is (sample_index, feature_index, score) 53 | - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature. 
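    - The score computed by pmi() above is the mutual information of the 2x2 document-count table for one (sample, feature) pair:
          MI = sum over the four cells n_ij of (n_ij / N) * log2( N * n_ij / (row_total_i * column_total_j) )
      where n_11 = docs of the label containing the feature, n_01 = docs of the label without it,
      n_10 = docs of the other labels containing it, n_00 = the remainder, and N = n_total_doc.
      Pairs with any empty cell get score 0 and are filtered out of the returned list.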
54 | """ 55 | 56 | cdef int n_samples = X.shape[0] 57 | 58 | if isinstance(X, scipy.sparse.csr_matrix): 59 | X = X.toarray() 60 | 61 | cdef int sample_index, feature_index 62 | pmi_score_csr_source = [ 63 | ( 64 | sample_index, 65 | feature_index, 66 | pmi(X, n_samples, n_docs_distribution, n_total_doc, feature_index, sample_index, verbose) 67 | ) 68 | for sample_index in sample_range 69 | for feature_index in feature_range 70 | ] 71 | non_zero_pmi_score_csr_source = [score_tuple for score_tuple in pmi_score_csr_source if not score_tuple[2]==0] 72 | 73 | return non_zero_pmi_score_csr_source -------------------------------------------------------------------------------- /tests/test_soa_python3.py: -------------------------------------------------------------------------------- 1 | from DocumentFeatureSelection.soa import soa_python3 2 | from DocumentFeatureSelection.common import data_converter 3 | from DocumentFeatureSelection.models import ScoredResultObject 4 | import unittest 5 | 6 | 7 | class TestSoaPython3(unittest.TestCase): 8 | def setUp(self): 9 | self.input_dict = { 10 | "label_a": [ 11 | ["I", "aa", "aa", "aa", "aa", "aa"], 12 | ["bb", "aa", "aa", "aa", "aa", "aa"], 13 | ["I", "aa", "hero", "some", "ok", "aa"] 14 | ], 15 | "label_b": [ 16 | ["bb", "bb", "bb"], 17 | ["bb", "bb", "bb"], 18 | ["hero", "ok", "bb"], 19 | ["hero", "cc", "bb"], 20 | ], 21 | "label_c": [ 22 | ["cc", "cc", "cc"], 23 | ["cc", "cc", "bb"], 24 | ["xx", "xx", "cc"], 25 | ["aa", "xx", "cc"], 26 | ] 27 | } 28 | 29 | def test_soa_with_term_freq(self): 30 | data_csr_matrix = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix( 31 | labeled_documents=self.input_dict, 32 | n_jobs=5 33 | ) 34 | assert isinstance(data_csr_matrix, data_converter.DataCsrMatrix) 35 | label2id_dict = data_csr_matrix.label2id_dict 36 | csr_matrix_ = data_csr_matrix.csr_matrix_ 37 | n_docs_distribution = data_csr_matrix.n_docs_distribution 38 | vocabulary = data_csr_matrix.vocabulary 39 | 40 | scored_matrix_term_freq = soa_python3.SOA().fit_transform( 41 | X=csr_matrix_, 42 | unit_distribution=n_docs_distribution, 43 | verbose=True 44 | ) 45 | 46 | soa_scores_term_freq = ScoredResultObject( 47 | scored_matrix=scored_matrix_term_freq, 48 | label2id_dict=label2id_dict, 49 | feature2id_dict=vocabulary 50 | ).convert_score_matrix2score_record() 51 | self.assertTrue(isinstance(soa_scores_term_freq, list)) 52 | 53 | #import pprint 54 | #print('term freq based soa') 55 | #pprint.pprint(soa_scores_term_freq) 56 | 57 | def test_soa_doc_freq(self): 58 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 59 | labeled_documents=self.input_dict, 60 | n_jobs=5 61 | ) 62 | assert isinstance(data_csr_matrix, data_converter.DataCsrMatrix) 63 | label2id_dict = data_csr_matrix.label2id_dict 64 | csr_matrix_ = data_csr_matrix.csr_matrix_ 65 | n_docs_distribution = data_csr_matrix.n_docs_distribution 66 | vocabulary = data_csr_matrix.vocabulary 67 | 68 | scored_matrix_doc_freq = soa_python3.SOA().fit_transform( 69 | X=csr_matrix_, 70 | unit_distribution=n_docs_distribution, 71 | verbose=True 72 | ) 73 | 74 | soa_scores_doc_freq = ScoredResultObject( 75 | scored_matrix=scored_matrix_doc_freq, 76 | label2id_dict=label2id_dict, 77 | feature2id_dict=vocabulary 78 | ).convert_score_matrix2score_record() 79 | self.assertTrue(isinstance(soa_scores_doc_freq, list)) 80 | 81 | #import pprint 82 | #print('doc freq based soa') 83 | #pprint.pprint(soa_scores_doc_freq) 84 | 85 | 86 | if __name__ == '__main__': 87 | 
unittest.main() 88 | -------------------------------------------------------------------------------- /examples/basic_example.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | __author__ = 'kensuke-mi' 3 | 4 | from DocumentFeatureSelection import interface 5 | import logging 6 | import pprint 7 | logger = logging.getLogger('sample usage') 8 | logger.level = logging.ERROR 9 | 10 | 11 | # ====================================================================================================== 12 | # basic usage 13 | 14 | input_dict = { 15 | "label_a": [ 16 | ["I", "aa", "aa", "aa", "aa", "aa"], 17 | ["bb", "aa", "aa", "aa", "aa", "aa"], 18 | ["I", "aa", "hero", "some", "ok", "aa"] 19 | ], 20 | "label_b": [ 21 | ["bb", "bb", "bb"], 22 | ["bb", "bb", "bb"], 23 | ["hero", "ok", "bb"], 24 | ["hero", "cc", "bb"], 25 | ], 26 | "label_c": [ 27 | ["cc", "cc", "cc"], 28 | ["cc", "cc", "bb"], 29 | ["xx", "xx", "cc"], 30 | ["aa", "xx", "cc"], 31 | ] 32 | } 33 | 34 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | # tf idf 36 | 37 | tf_idf_scored_object = interface.run_feature_selection( 38 | input_dict=input_dict, 39 | method='tf_idf', 40 | n_jobs=5 41 | ) 42 | 43 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 44 | # pmi 45 | pmi_scored_object = interface.run_feature_selection( 46 | input_dict=input_dict, 47 | method='pmi', 48 | n_jobs=1, 49 | use_cython=False 50 | ) 51 | pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) 52 | 53 | # you can use cython version pmi also 54 | # !Warning! The output value with "use_cython=True" is veeeery little different such as the 10th decimal place. 
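# (A difference of that size is floating-point rounding noise between the pure-Python and Cython code paths; it should not change which features rank highest.)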
55 | pmi_scored_object_cython = interface.run_feature_selection( 56 | input_dict=input_dict, 57 | method='pmi', 58 | n_jobs=1, 59 | use_cython=True 60 | ) 61 | pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) 62 | 63 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | # soa 65 | soa_scored_object = interface.run_feature_selection( 66 | input_dict=input_dict, 67 | method='soa', 68 | n_jobs=5 69 | ) 70 | pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary()) 71 | 72 | soa_scored_object_cython = interface.run_feature_selection( 73 | input_dict=input_dict, 74 | method='soa', 75 | n_jobs=1, 76 | use_cython=True 77 | ) 78 | pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary()) 79 | 80 | 81 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 82 | # bns 83 | input_dict = { 84 | "label1": [ 85 | ["I", "aa", "aa", "aa", "aa", "aa"], 86 | ["bb", "aa", "aa", "aa", "aa", "aa"], 87 | ["I", "aa", "hero", "some", "ok", "aa"] 88 | ], 89 | "label2": [ 90 | ["bb", "bb", "bb"], 91 | ["bb", "bb", "bb"], 92 | ["hero", "ok", "bb"], 93 | ["hero", "cc", "bb"], 94 | ] 95 | } 96 | bns_scored_object = interface.run_feature_selection( 97 | input_dict=input_dict, 98 | method='bns', 99 | n_jobs=1 100 | ) 101 | pprint.pprint(bns_scored_object.ScoreMatrix2ScoreDictionary()) 102 | 103 | bns_scored_object = interface.run_feature_selection( 104 | input_dict=input_dict, 105 | method='bns', 106 | use_cython=True 107 | ) 108 | -------------------------------------------------------------------------------- /tests/check_code_pmi.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import codecs 3 | import math 4 | import sys 5 | from collections import defaultdict 6 | # this code is from http://aidiary.hatenablog.com/entry/20100619/1276950312 7 | # checked to work under python2.5 8 | 9 | def mutual_information(target, data, k=5): 10 | # comment inputはlist 0th indexにカテゴリラベルがある。 1-th indexはすべてfeature word 11 | 12 | 13 | """カテゴリtargetにおける相互情報量が高い上位k件の単語を返す""" 14 | # 上位k件を指定しないときはすべて返す 15 | 16 | V = set() 17 | N11 = defaultdict(float) # N11[word] -> wordを含むtargetの文書数 18 | N10 = defaultdict(float) # N10[word] -> wordを含むtarget以外の文書数 19 | N01 = defaultdict(float) # N01[word] -> wordを含まないtargetの文書数 20 | N00 = defaultdict(float) # N00[word] -> wordを含まないtarget以外の文書数 21 | Np = 0.0 # targetの文書数 22 | Nn = 0.0 # target以外の文書す 23 | 24 | # N11とN10をカウント 25 | for d in data: 26 | cat, words = d[0], d[1:] 27 | if cat == target: 28 | Np += 1 29 | for wc in words: 30 | if ':' in wc: word, count = wc.split(":") 31 | else: word = wc 32 | 33 | V.add(word) 34 | N11[word] += 1 # 文書数をカウントするので+1すればOK 35 | elif cat != target: 36 | Nn += 1 37 | for wc in words: 38 | if ':' in wc: word, count = wc.split(":") 39 | else: word = wc 40 | 41 | V.add(word) 42 | N10[word] += 1 43 | 44 | # N01とN00は簡単に求められる 45 | for word in V: 46 | N01[word] = Np - N11[word] 47 | N00[word] = Nn - N10[word] 48 | 49 | for w, c in N01.items(): 50 | if c < 0: N01[w] = 0.0 51 | 52 | for w, c in N00.items(): 53 | if c < 0: N00[w] = 0.0 54 | 55 | # 総文書数 56 | N = Np + Nn 57 | 58 | 59 | # 各単語の相互情報量を計算 60 | MI = [] 61 | for word in V: 62 | n11, n10, n01, n00 = N11[word], N10[word], N01[word], N00[word] 63 | # いずれかの出現頻度が0.0となる単語はlog2(0)となってしまうのでスコア0とする 64 | if n11 == 0.0 or n10 == 0.0 or n01 == 0.0 or n00 == 0.0: 65 | MI.append( (0.0, word) ) 66 | 
continue 67 | # 相互情報量の定義の各項を計算 68 | temp1 = n11/N * math.log((N*n11)/((n10+n11)*(n01+n11)), 2) 69 | temp2 = n01/N * math.log((N*n01)/((n00+n01)*(n01+n11)), 2) 70 | temp3 = n10/N * math.log((N*n10)/((n10+n11)*(n00+n10)), 2) 71 | temp4 = n00/N * math.log((N*n00)/((n00+n01)*(n00+n10)), 2) 72 | score = temp1 + temp2 + temp3 + temp4 73 | MI.append( (score, word) ) 74 | 75 | # 相互情報量の降順にソートして上位k個を返す 76 | MI.sort(reverse=True) 77 | return MI[0:k] 78 | 79 | 80 | if __name__ == '__main__': 81 | 82 | input_data = [ 83 | ['label_a', "I", "aa", "aa", "aa", "aa", "aa"], 84 | ['label_a', "bb", "aa", "aa", "aa", "aa", "aa"], 85 | ['label_a', "I", "aa", "hero", "some", "ok", "aa"], 86 | ['label_b', "bb", "bb", "bb"], 87 | ['label_b', "bb", "bb", "bb"], 88 | ['label_b', "hero", "ok", "bb"], 89 | ['label_b', "hero", "cc", "bb"], 90 | ['label_c', "cc", "cc", "cc"], 91 | ['label_c', "cc", "cc", "bb"], 92 | ['label_c', "xx", "xx", "cc"], 93 | ['label_c', "aa", "xx", "cc"], 94 | ] 95 | res = mutual_information(target='label_a', data=input_data, k=30) 96 | import pprint 97 | print('label_a') 98 | pprint.pprint(res) 99 | 100 | print('label_b') 101 | pprint.pprint(mutual_information(target='label_b', data=input_data, k=30)) 102 | 103 | print('label_c') 104 | pprint.pprint(mutual_information(target='label_c', data=input_data, k=30)) -------------------------------------------------------------------------------- /examples/advanced_example.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | __author__ = 'kensuke-mi' 3 | 4 | from DocumentFeatureSelection import interface 5 | from DocumentFeatureSelection.init_logger import logger 6 | import logging 7 | import pprint 8 | 9 | # ====================================================================================================== 10 | # expert usage 11 | # you can put complex-structure-feature as feature. 12 | # One feature is tuple of tuple. Concretely (("he", "N"), ("is", "V")) is one feature. 
13 | # You can NOT use ngram argument for expert input 14 | input_dict_tuple_feature = { 15 | "label_a": [ 16 | [ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ], 17 | [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("guy", "N"),) ], 18 | [ (("i", "N"), ("am", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] 19 | ], 20 | "label_b": [ 21 | [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("girl", "N"),) ], 22 | [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("girl", "N"),) ], 23 | [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] 24 | ] 25 | } 26 | 27 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 28 | # tf idf 29 | tf_idf_scored_object = interface.run_feature_selection( 30 | input_dict=input_dict_tuple_feature, 31 | method='tf_idf', 32 | n_jobs=5 33 | ) 34 | pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) 35 | 36 | 37 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 38 | # pmi 39 | pmi_scored_object = interface.run_feature_selection( 40 | input_dict=input_dict_tuple_feature, 41 | method='pmi', 42 | n_jobs=5 43 | ) 44 | pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) 45 | 46 | 47 | pmi_scored_object_cython = interface.run_feature_selection( 48 | input_dict=input_dict_tuple_feature, 49 | method='pmi', 50 | use_cython=True 51 | ) 52 | pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) 53 | 54 | 55 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 56 | # soa 57 | soa_scored_object = interface.run_feature_selection( 58 | input_dict=input_dict_tuple_feature, 59 | method='soa', 60 | n_jobs=5 61 | ) 62 | pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary()) 63 | 64 | 65 | soa_scored_object_cython = interface.run_feature_selection( 66 | input_dict=input_dict_tuple_feature, 67 | method='soa', 68 | use_cython=True 69 | ) 70 | pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary()) 71 | 72 | 73 | 74 | 75 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 76 | # bns 77 | input_dict_tuple_feature = { 78 | "positive": [ 79 | [ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ], 80 | [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("guy", "N"),) ], 81 | [ (("i", "N"), ("am", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] 82 | ], 83 | "negative": [ 84 | [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("girl", "N"),) ], 85 | [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("girl", "N"),) ], 86 | [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] 87 | ] 88 | } 89 | 90 | 91 | bns_scored_object = interface.run_feature_selection( 92 | input_dict=input_dict_tuple_feature, 93 | method='bns', 94 | n_jobs=5 95 | ) 96 | pprint.pprint(bns_scored_object.ScoreMatrix2ScoreDictionary()) -------------------------------------------------------------------------------- /tests/test_data_models.py: -------------------------------------------------------------------------------- 1 | from DocumentFeatureSelection.common import data_converter 2 | from 
DocumentFeatureSelection.pmi import PMI_python3 3 | from DocumentFeatureSelection.models import ScoredResultObject 4 | from scipy.sparse import csr_matrix 5 | import unittest 6 | import numpy 7 | import logging 8 | 9 | 10 | class TestDataModels(unittest.TestCase): 11 | def setUp(self): 12 | self.input_dict = { 13 | "label_a": [ 14 | ["I", "aa", "aa", "aa", "aa", "aa"], 15 | ["bb", "aa", "aa", "aa", "aa", "aa"], 16 | ["I", "aa", "hero", "some", "ok", "aa"] 17 | ], 18 | "label_b": [ 19 | ["bb", "bb", "bb"], 20 | ["bb", "bb", "bb"], 21 | ["hero", "ok", "bb"], 22 | ["hero", "cc", "bb"], 23 | ], 24 | "label_c": [ 25 | ["cc", "cc", "cc"], 26 | ["cc", "cc", "bb"], 27 | ["xx", "xx", "cc"], 28 | ["aa", "xx", "cc"], 29 | ] 30 | } 31 | 32 | 33 | def test_get_pmi_feature_dictionary(self): 34 | """checks if it works or not, that getting scored dictionary object from scored_matrix 35 | 36 | :return: 37 | """ 38 | data_csr_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 39 | labeled_documents=self.input_dict, 40 | ngram=1, 41 | n_jobs=5 42 | ) 43 | 44 | assert isinstance(data_csr_object.csr_matrix_, csr_matrix) 45 | assert isinstance(data_csr_object.label2id_dict, dict) 46 | assert isinstance(data_csr_object.vocabulary, dict) 47 | 48 | pmi_scored_matrix = PMI_python3.PMI().fit_transform(X=data_csr_object.csr_matrix_, n_jobs=5, 49 | n_docs_distribution=data_csr_object.n_docs_distribution) 50 | 51 | # main part of test 52 | # when sort is True, cut_zero is True, outformat is dict 53 | pmi_scored_dictionary_objects = ScoredResultObject( 54 | scored_matrix=pmi_scored_matrix, 55 | label2id_dict=data_csr_object.label2id_dict, 56 | feature2id_dict=data_csr_object.vocabulary 57 | ).ScoreMatrix2ScoreDictionary( 58 | outformat='dict', 59 | sort_desc=True, 60 | n_jobs=5 61 | ) 62 | assert isinstance(pmi_scored_dictionary_objects, dict) 63 | logging.debug(pmi_scored_dictionary_objects) 64 | 65 | # when sort is True, cut_zero is True, outformat is items 66 | pmi_scored_dictionary_objects = ScoredResultObject( 67 | scored_matrix=pmi_scored_matrix, 68 | label2id_dict=data_csr_object.label2id_dict, 69 | feature2id_dict=data_csr_object.vocabulary).ScoreMatrix2ScoreDictionary( 70 | outformat='items', 71 | sort_desc=True, 72 | n_jobs=5 73 | ) 74 | assert isinstance(pmi_scored_dictionary_objects, list) 75 | for d in pmi_scored_dictionary_objects: 76 | assert isinstance(d, dict) 77 | 78 | # when sort is True, cut_zero is False, outformat is dict 79 | pmi_scored_dictionary_objects = ScoredResultObject( 80 | scored_matrix=pmi_scored_matrix, 81 | label2id_dict=data_csr_object.label2id_dict, 82 | feature2id_dict=data_csr_object.vocabulary 83 | ).ScoreMatrix2ScoreDictionary( 84 | outformat='dict', 85 | sort_desc=True, 86 | n_jobs=5 87 | ) 88 | assert isinstance(pmi_scored_dictionary_objects, dict) 89 | logging.debug(pmi_scored_dictionary_objects) 90 | 91 | # when sort is True, cut_zero is False, outformat is items 92 | pmi_scored_dictionary_objects = ScoredResultObject( 93 | scored_matrix=pmi_scored_matrix, 94 | label2id_dict=data_csr_object.label2id_dict, 95 | feature2id_dict=data_csr_object.vocabulary 96 | ).ScoreMatrix2ScoreDictionary( 97 | outformat='items', 98 | sort_desc=True, 99 | n_jobs=5 100 | ) 101 | assert isinstance(pmi_scored_dictionary_objects, list) 102 | for d in pmi_scored_dictionary_objects: 103 | assert isinstance(d, dict) 104 | 105 | 106 | if __name__ == '__main__': 107 | unittest.main() -------------------------------------------------------------------------------- 
/tests/test_interface.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from DocumentFeatureSelection import interface 3 | from DocumentFeatureSelection.models import ScoredResultObject 4 | from DocumentFeatureSelection.models import PersistentDict 5 | from sqlitedict import SqliteDict 6 | import os 7 | import numpy 8 | 9 | 10 | class TestInterface(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | cls.input_dict = { 14 | "label_a": [ 15 | ["I", "aa", "aa", "aa", "aa", "aa"], 16 | ["bb", "aa", "aa", "aa", "aa", "aa"], 17 | ["I", "aa", "hero", "some", "ok", "aa"] 18 | ], 19 | "label_b": [ 20 | ["bb", "bb", "bb"], 21 | ["bb", "bb", "bb"], 22 | ["hero", "ok", "bb"], 23 | ["hero", "cc", "bb"], 24 | ], 25 | "label_c": [ 26 | ["cc", "cc", "cc"], 27 | ["cc", "cc", "bb"], 28 | ["xx", "xx", "cc"], 29 | ["aa", "xx", "cc"], 30 | ] 31 | } 32 | cls.method = ['pmi', 'tf_idf', 'soa'] 33 | cls.bool_cython = [True, False] 34 | cls.is_use_cache = [True, False] 35 | cls.is_use_memmap = [True, False] 36 | cls.joblib_range = range(0, 2) 37 | cls.path_shelve_file = './shelve' 38 | cls.path_sqlite3_persistent = './temp_db.sqlite3' 39 | 40 | @classmethod 41 | def tearDownClass(cls): 42 | os.remove(cls.path_sqlite3_persistent) 43 | 44 | def test_interface_shelve(self): 45 | """パラメタ条件を組み合わせてテストを実行する  46 | - cythonモード使う or not 47 | - cacheモード使う or not 48 | - memmapモード使う or not 49 | """ 50 | shelve_obj = PersistentDict(self.path_shelve_file, 'c', 'json') 51 | for key, value in self.input_dict.items(): shelve_obj[key] = value 52 | 53 | sqlite3_dict_obj = SqliteDict(filename=self.path_sqlite3_persistent, autocommit=True) 54 | for key, value in self.input_dict.items(): sqlite3_dict_obj[key] = value 55 | 56 | for method_name in self.method: 57 | for cython_flag in self.bool_cython: 58 | for cache_flag in self.is_use_cache: 59 | for memmap_flag in self.is_use_memmap: 60 | scored_result_persisted = interface.run_feature_selection( 61 | input_dict=shelve_obj, 62 | method=method_name, 63 | use_cython=cython_flag, 64 | is_use_cache=cache_flag, 65 | is_use_memmap=memmap_flag 66 | ) # type: ScoredResultObject 67 | self.assertIsInstance(scored_result_persisted, ScoredResultObject) 68 | self.assertIsInstance(scored_result_persisted.ScoreMatrix2ScoreDictionary(), list) 69 | 70 | scored_result_sqlite3_persisted = interface.run_feature_selection( 71 | input_dict=sqlite3_dict_obj, 72 | method=method_name, use_cython=cython_flag, is_use_cache=cache_flag) # type: ScoredResultObject 73 | self.assertIsInstance(scored_result_sqlite3_persisted, ScoredResultObject) 74 | self.assertIsInstance(scored_result_sqlite3_persisted.ScoreMatrix2ScoreDictionary(), list) 75 | 76 | # You check if result is same between data-source = shelve_obj and data-source = dict-object 77 | scored_result_dict = interface.run_feature_selection( 78 | input_dict=self.input_dict, 79 | method=method_name, use_cython=cython_flag, is_use_cache=cache_flag) # type: ScoredResultObject 80 | self.assertIsInstance(scored_result_dict, ScoredResultObject) 81 | self.assertIsInstance(scored_result_dict.ScoreMatrix2ScoreDictionary(), list) 82 | 83 | numpy.testing.assert_array_equal(scored_result_persisted.scored_matrix.toarray(), 84 | scored_result_dict.scored_matrix.toarray()) 85 | numpy.testing.assert_array_equal(scored_result_sqlite3_persisted.scored_matrix.toarray(), 86 | scored_result_dict.scored_matrix.toarray()) 87 | 88 | 89 | if __name__ == '__main__': 90 | unittest.main() 91 | 
-------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | if NOT "%PAPER%" == "" ( 11 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 12 | ) 13 | 14 | if "%1" == "" goto help 15 | 16 | if "%1" == "help" ( 17 | :help 18 | echo.Please use `make ^` where ^ is one of 19 | echo. html to make standalone HTML files 20 | echo. dirhtml to make HTML files named index.html in directories 21 | echo. singlehtml to make a single large HTML file 22 | echo. pickle to make pickle files 23 | echo. json to make JSON files 24 | echo. htmlhelp to make HTML files and a HTML help project 25 | echo. qthelp to make HTML files and a qthelp project 26 | echo. devhelp to make HTML files and a Devhelp project 27 | echo. epub to make an epub 28 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 29 | echo. text to make text files 30 | echo. man to make manual pages 31 | echo. changes to make an overview over all changed/added/deprecated items 32 | echo. linkcheck to check all external links for integrity 33 | echo. doctest to run all doctests embedded in the documentation if enabled 34 | goto end 35 | ) 36 | 37 | if "%1" == "clean" ( 38 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 39 | del /q /s %BUILDDIR%\* 40 | goto end 41 | ) 42 | 43 | if "%1" == "html" ( 44 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 45 | echo. 46 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 47 | goto end 48 | ) 49 | 50 | if "%1" == "dirhtml" ( 51 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 52 | echo. 53 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 54 | goto end 55 | ) 56 | 57 | if "%1" == "singlehtml" ( 58 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 59 | echo. 60 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 61 | goto end 62 | ) 63 | 64 | if "%1" == "pickle" ( 65 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 66 | echo. 67 | echo.Build finished; now you can process the pickle files. 68 | goto end 69 | ) 70 | 71 | if "%1" == "json" ( 72 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 73 | echo. 74 | echo.Build finished; now you can process the JSON files. 75 | goto end 76 | ) 77 | 78 | if "%1" == "htmlhelp" ( 79 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 80 | echo. 81 | echo.Build finished; now you can run HTML Help Workshop with the ^ 82 | .hhp project file in %BUILDDIR%/htmlhelp. 83 | goto end 84 | ) 85 | 86 | if "%1" == "qthelp" ( 87 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 88 | echo. 89 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 90 | .qhcp project file in %BUILDDIR%/qthelp, like this: 91 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\document-feature-selection.qhcp 92 | echo.To view the help file: 93 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\document-feature-selection.ghc 94 | goto end 95 | ) 96 | 97 | if "%1" == "devhelp" ( 98 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 99 | echo. 100 | echo.Build finished. 
101 | goto end 102 | ) 103 | 104 | if "%1" == "epub" ( 105 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 106 | echo. 107 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 108 | goto end 109 | ) 110 | 111 | if "%1" == "latex" ( 112 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 113 | echo. 114 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 115 | goto end 116 | ) 117 | 118 | if "%1" == "text" ( 119 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 120 | echo. 121 | echo.Build finished. The text files are in %BUILDDIR%/text. 122 | goto end 123 | ) 124 | 125 | if "%1" == "man" ( 126 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 127 | echo. 128 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 129 | goto end 130 | ) 131 | 132 | if "%1" == "changes" ( 133 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 134 | echo. 135 | echo.The overview file is in %BUILDDIR%/changes. 136 | goto end 137 | ) 138 | 139 | if "%1" == "linkcheck" ( 140 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 141 | echo. 142 | echo.Link check complete; look for any errors in the above output ^ 143 | or in %BUILDDIR%/linkcheck/output.txt. 144 | goto end 145 | ) 146 | 147 | if "%1" == "doctest" ( 148 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 149 | echo. 150 | echo.Testing of doctests in the sources finished, look at the ^ 151 | results in %BUILDDIR%/doctest/output.txt. 152 | goto end 153 | ) 154 | 155 | :end -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Setup file for the document-feature-selection project. 3 | """ 4 | 5 | __author__ = 'kensuke-mi' 6 | __version__ = '1.5' 7 | 8 | import sys 9 | import subprocess 10 | from setuptools import setup, find_packages 11 | from distutils.extension import Extension 12 | python_version = sys.version_info 13 | print('python version {}'.format(python_version)) 14 | 15 | 16 | # -------------------------------------------------------------------------------------------------------- 17 | # Flags to compile Cython code or use already compiled code 18 | try: 19 | import Cython 20 | except ImportError: 21 | subprocess.check_call(["python", '-m', 'pip', 'install', 'cython']) 22 | import Cython 23 | 24 | if sys.version_info >= (3, 7): 25 | # if python >= 3.7, Cython must regenerate C++ code again. 26 | import os 27 | if os.path.exists('DocumentFeatureSelection/pmi/pmi_cython.c'): 28 | os.remove('DocumentFeatureSelection/pmi/pmi_cython.c') 29 | if os.path.exists('DocumentFeatureSelection/bns/bns_cython.c'): 30 | os.remove('DocumentFeatureSelection/bns/bns_cython.c') 31 | if os.path.exists('DocumentFeatureSelection/soa/soa_cython.c'): 32 | os.remove('DocumentFeatureSelection/soa/soa_cython.c') 33 | # if python >= 3.7, typing should be installed again. 
34 | subprocess.check_call(["python", '-m', 'pip', 'install', 'typing']) 35 | 36 | cmdclass = {} 37 | ext_modules = [] 38 | from Cython.Distutils import build_ext 39 | 40 | ext_modules += [ 41 | Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.pyx" ],), 42 | Extension("DocumentFeatureSelection.soa.soa_cython", [ "DocumentFeatureSelection/soa/soa_cython.pyx" ],), 43 | Extension("DocumentFeatureSelection.bns.bns_cython", [ "DocumentFeatureSelection/bns/bns_cython.pyx" ],) 44 | ] 45 | cmdclass.update({'build_ext': build_ext}) 46 | 47 | 48 | # -------------------------------------------------------------------------------------------------------- 49 | # try to install numpy automatically because sklearn requires the status where numpy is already installed 50 | try: 51 | import numpy 52 | except ImportError: 53 | use_numpy_include_dirs = False 54 | try: 55 | subprocess.check_call(["python", '-m', 'pip', 'install', 'numpy']) 56 | import numpy 57 | except Exception as e: 58 | raise Exception(e.__str__() + 'We failed to install numpy automatically. \ 59 | Try installing numpy manually or Try anaconda distribution.') 60 | 61 | # -------------------------------------------------------------------------------------------------------- 62 | # try to install scipy automatically because sklearn requires the status where scipy is already installed 63 | try: 64 | import scipy 65 | except ImportError: 66 | try: 67 | subprocess.check_call(["python", '-m', 'pip', 'install', 'scipy']) 68 | import scipy 69 | except Exception as e: 70 | raise Exception(e.__str__() + 'We failed to install scipy automatically. \ 71 | Try installing scipy manually or Try anaconda distribution.') 72 | # -------------------------------------------------------------------------------------------------------- 73 | 74 | 75 | install_requires = ['six', 'setuptools>=1.0', 'joblib', 'numpy', 76 | 'scipy', 'nltk', 'scikit-learn', 'pypandoc', 'cython', 'sqlitedict', 'nose', 77 | 'typing'] 78 | 79 | try: 80 | import pypandoc 81 | long_description = pypandoc.convert('README.md', 'rst') 82 | except(IOError, ImportError): 83 | long_description = open('README.md').read() 84 | 85 | 86 | description = 'Various methods of feature selection from Text Data' 87 | 88 | classifiers = [ 89 | "Development Status :: 5 - Production/Stable", 90 | "License :: OSI Approved :: MIT License", 91 | "Programming Language :: Python", 92 | "Natural Language :: Japanese", 93 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 94 | "Programming Language :: Python :: 3.5" 95 | ] 96 | 97 | setup( 98 | name='DocumentFeatureSelection', 99 | version=__version__, 100 | description=description, 101 | long_description=long_description, 102 | author=__author__, 103 | author_email='kensuke.mit@gmail.com', 104 | license='CeCILL-B', 105 | url='https://github.com/Kensuke-Mitsuzawa/DocumentFeatureSelection', 106 | packages=find_packages(), 107 | include_package_data=True, 108 | zip_safe=False, 109 | test_suite='tests.all_tests.suite', 110 | install_requires=install_requires, 111 | tests_require=install_requires, 112 | setup_requires=['six', 'setuptools>=1.0', 'pip', 'typing', 'cython'], 113 | classifiers=classifiers, 114 | cmdclass=cmdclass, 115 | ext_modules=ext_modules, 116 | include_dirs=[numpy.get_include()] 117 | ) 118 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/common/func_data_converter.py: 
-------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from DocumentFeatureSelection.models import SetDocumentInformation, AvailableInputTypes 3 | from DocumentFeatureSelection.common.utils import init_cache_object 4 | from sklearn.feature_extraction import DictVectorizer 5 | from typing import Dict, List, Tuple, Any, Union 6 | from sqlitedict import SqliteDict 7 | import joblib 8 | import itertools 9 | import tempfile 10 | N_FEATURE_SWITCH_STRATEGY = 1000000 11 | 12 | 13 | 14 | def generate_document_dict(document_key:str, 15 | documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Counter]: 16 | """This function gets Document-frequency count in given list of documents 17 | """ 18 | assert isinstance(documents, list) 19 | feature_frequencies = [Counter(document) for document in documents] 20 | document_frequencies = Counter() 21 | for feat_freq in feature_frequencies: document_frequencies.update(feat_freq.keys()) 22 | 23 | return (document_key, document_frequencies) 24 | 25 | 26 | def make_multi_docs2term_freq_info(labeled_documents:AvailableInputTypes, 27 | is_use_cache:bool=True, 28 | path_work_dir:str=tempfile.mkdtemp()): 29 | """* What u can do 30 | - This function generates information to construct term-frequency matrix 31 | """ 32 | assert isinstance(labeled_documents, (SqliteDict, dict)) 33 | 34 | counted_frequency = [(label, Counter(list(itertools.chain.from_iterable(documents)))) 35 | for label, documents in labeled_documents.items()] 36 | feature_documents = [dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency] 37 | 38 | 39 | if is_use_cache: 40 | dict_matrix_index = init_cache_object('matrix_element_objects', path_work_dir=path_work_dir) 41 | else: 42 | dict_matrix_index = {} 43 | 44 | # use sklearn feature-extraction 45 | vec = DictVectorizer() 46 | dict_matrix_index['matrix_object'] = vec.fit_transform(feature_documents).tocsr() 47 | dict_matrix_index['feature2id'] = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())} 48 | dict_matrix_index['label2id'] = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)} 49 | 50 | return SetDocumentInformation(dict_matrix_index) 51 | 52 | ''' 53 | def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str: 54 | type_flag = None 55 | for document_list in docs: 56 | assert isinstance(document_list, list) 57 | for feature in document_list: 58 | if isinstance(feature, str): 59 | type_flag = 'str' 60 | elif isinstance(feature, tuple): 61 | type_flag = 'tuple' 62 | else: 63 | logger.error(msg=docs) 64 | raise TypeError('Feature object should be either of str or tuple') 65 | return type_flag''' 66 | 67 | 68 | def make_multi_docs2doc_freq_info(labeled_documents:AvailableInputTypes, 69 | n_jobs:int=-1, 70 | path_working_dir:str=tempfile.mkdtemp(), 71 | is_use_cache: bool = True)->SetDocumentInformation: 72 | """* What u can do 73 | - This function generates information for constructing document-frequency matrix. 
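        * Example (added sketch, not from the original docstring)
            - The per-label document-frequency counting this function relies on is done by `generate_document_dict` defined above; the toy input below is made up.
            >>> generate_document_dict("label_a", [["aa", "aa", "bb"], ["aa", "cc"]])
            ('label_a', Counter({'aa': 2, 'bb': 1, 'cc': 1}))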
74 | """ 75 | assert isinstance(labeled_documents, (SqliteDict, dict)) 76 | #type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()]) 77 | #assert len(type_flag)==1 78 | 79 | # todo 高速化を検討すること 80 | counted_frequency = joblib.Parallel(n_jobs=n_jobs)( 81 | joblib.delayed(generate_document_dict)(key, docs) 82 | for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0])) 83 | 84 | ### construct [{}] structure for input of DictVectorizer() ### 85 | seq_feature_documents = (dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency) 86 | 87 | ### Save index-string dictionary 88 | if is_use_cache: 89 | dict_matrix_index = init_cache_object('matrix_element_object', path_working_dir) 90 | else: 91 | dict_matrix_index = {} 92 | 93 | # use sklearn feature-extraction 94 | vec = DictVectorizer() 95 | dict_matrix_index['matrix_object'] = vec.fit_transform(seq_feature_documents).tocsr() 96 | dict_matrix_index['feature2id'] = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())} 97 | dict_matrix_index['label2id'] = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)} 98 | 99 | return SetDocumentInformation(dict_matrix_index) 100 | 101 | 102 | # alias for old versions 103 | multiDocs2TermFreqInfo = make_multi_docs2term_freq_info 104 | multiDocs2DocFreqInfo = make_multi_docs2doc_freq_info 105 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/common/crs_matrix_constructor.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger, StreamHandler 2 | import joblib 3 | import sys 4 | import logging 5 | import numpy 6 | from typing import List, Tuple, Dict 7 | from scipy.sparse import csr_matrix 8 | 9 | logging.basicConfig(format='%(asctime)s %(message)s', 10 | datefmt='%m/%d/%Y %I:%M:%S %p', 11 | level=logging.DEBUG) 12 | logger = getLogger(__name__) 13 | handler = StreamHandler() 14 | logger.addHandler(handler) 15 | 16 | python_version = sys.version_info 17 | __author__ = 'kensuke-mi' 18 | 19 | 20 | class PosTuple(object): 21 | __slots__ = ['doc_id', 'word_id', 'document_frequency'] 22 | def __init__(self, doc_id, word_id, document_frequency): 23 | self.doc_id = doc_id 24 | self.word_id = word_id 25 | self.document_frequency = document_frequency 26 | 27 | 28 | PARAM_JOBLIB_BACKEND = ['multiprocessing', 'threading'] 29 | 30 | def get_data_col_row_values(doc_id:int, word:int, doc_freq:int, vocaburary:numpy.ndarray)->numpy.array: 31 | """* what you can do 32 | - You get array of [document_id, feature_id, value(frequency)] 33 | """ 34 | assert isinstance(vocaburary, numpy.ndarray) 35 | col_element = vocaburary[numpy.where(vocaburary['key']==word)] 36 | assert len(col_element) == 1 37 | col_value = col_element[0]['value'] 38 | # df value is word frequency in documents 39 | df_value = doc_freq 40 | 41 | return numpy.array([doc_id, col_value, df_value]) 42 | 43 | def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj:numpy.ndarray, vocabulary:numpy.ndarray)->numpy.ndarray: 44 | 45 | value_pairs = numpy.array([ 46 | get_data_col_row_values(doc_id=doc_id, word=key_value_tuple['key'], doc_freq=key_value_tuple['value'], vocaburary=vocabulary) 47 | for key_value_tuple 48 | in doc_freq_obj]) 49 | 50 | return value_pairs 51 | 52 | 53 | def make_csr_list(value_position_list:List[numpy.ndarray])->Tuple[List[int], List[int], List[int]]: 54 | data = [] 
55 | row = [] 56 | col = [] 57 | for position_tuple in value_position_list: 58 | row.append(position_tuple[0]) 59 | col.append(position_tuple[1]) 60 | data.append(position_tuple[2]) 61 | 62 | return row, col, data 63 | 64 | 65 | def preprocess_csr_matrix(feature_frequency, vocabulary, n_jobs:int, joblib_backend:str='Parallel'): 66 | """This function makes information to make csr matrix. Data-list/Row-list/Col-list 67 | 68 | :param feature_frequency list: list having dictionary of {feature: frequency} 69 | :param label2id_dict dict: dictionary of {feature: feature_id} 70 | :return: tuple having lists to construct csr matrix 71 | :rtype tuple: 72 | 73 | Example, 74 | 75 | feature_frequency is 76 | >>> [{'some': 1, 'bb': 1, 'hero': 1, 'aa': 3, 'I': 2, 'ok': 1}, {'cc': 1, 'bb': 4, 'ok': 1, 'hero': 2}, {'cc': 4, 'bb': 1, 'xx': 2, 'aa': 1}] 77 | 78 | vocaburary is 79 | >>> {'some': 6, 'bb': 2, 'xx': 7, 'hero': 4, 'aa': 1, 'cc': 3, 'I': 0, 'ok': 5} 80 | 81 | return value is 82 | >>> ([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], [0, 1, 2, 4, 5, 6, 2, 3, 4, 5, 1, 2, 3, 7], [2, 3, 1, 1, 1, 1, 4, 1, 2, 1, 1, 1, 4, 2]) 83 | 84 | """ 85 | if not joblib_backend in PARAM_JOBLIB_BACKEND: 86 | assert Exception('joblib_backend parameter must be either of {}. However your input is {}.'.format(PARAM_JOBLIB_BACKEND, joblib_backend)) 87 | 88 | assert isinstance(feature_frequency, list) 89 | assert isinstance(vocabulary, numpy.ndarray) 90 | assert isinstance(n_jobs, int) 91 | 92 | logger.debug(msg='making tuple pairs for csr matrix with n(process)={}'.format(n_jobs)) 93 | 94 | set_value_position_list = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( 95 | joblib.delayed(SUB_FUNC_make_value_pairs)( 96 | doc_id, 97 | doc_freq_obj, 98 | vocabulary 99 | ) 100 | for doc_id, doc_freq_obj in enumerate(feature_frequency) 101 | ) # type: List[numpy.ndarray] 102 | 103 | # make 2-d list into 1-d list 104 | value_position_list = sorted( 105 | [l for set in set_value_position_list for l in set], 106 | key=lambda pos_tuple: (pos_tuple[0], pos_tuple[1], pos_tuple[2])) 107 | 108 | row, col, data = make_csr_list(value_position_list) 109 | 110 | return row, col, data 111 | 112 | 113 | def make_csr_objects(row, col, data, n_feature, n_docs): 114 | """This is main function of making csr_matrix from given data 115 | 116 | :param row: 117 | :param col: 118 | :param data: 119 | :param n_feature: 120 | :param n_docs: 121 | :return: 122 | """ 123 | assert isinstance(row, list) 124 | assert isinstance(col, list) 125 | assert isinstance(data, list) 126 | assert isinstance(n_feature, int) 127 | assert isinstance(n_docs, int) 128 | 129 | return csr_matrix((data, (row, col)), shape=(n_docs, n_feature)) -------------------------------------------------------------------------------- /tests/test_bns_python3.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from DocumentFeatureSelection.common import data_converter 3 | from DocumentFeatureSelection.common.data_converter import DataCsrMatrix 4 | from DocumentFeatureSelection.bns import bns_python3 5 | from DocumentFeatureSelection.models import ScoredResultObject 6 | from scipy.sparse import csr_matrix 7 | 8 | 9 | class TestBnsPython3(unittest.TestCase): 10 | def setUp(self): 11 | self.correct_input = { 12 | "label_a": [ 13 | ["I", "aa", "aa", "aa", "aa", "aa"], 14 | ["bb", "aa", "aa", "aa", "aa", "aa"], 15 | ["I", "aa", "hero", "some", "ok", "aa"] 16 | ], 17 | "label_b": [ 18 | ["bb", "bb", "bb"], 19 | ["bb", "bb", "bb"], 20 | 
["hero", "ok", "bb"], 21 | ["hero", "cc", "bb"], 22 | ] 23 | } 24 | 25 | def test_fit_transform(self): 26 | 27 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 28 | labeled_documents=self.correct_input, 29 | n_jobs=5 30 | ) 31 | assert isinstance(data_csr_matrix, DataCsrMatrix) 32 | label2id_dict = data_csr_matrix.label2id_dict 33 | csr_matrix_ = data_csr_matrix.csr_matrix_ 34 | n_docs_distribution = data_csr_matrix.n_docs_distribution 35 | vocabulary = data_csr_matrix.vocabulary 36 | 37 | bns_score_csr_matrix = bns_python3.BNS().fit_transform(X=csr_matrix_, 38 | y=None, 39 | unit_distribution=n_docs_distribution, 40 | verbose=True) 41 | assert isinstance(bns_score_csr_matrix, csr_matrix) 42 | 43 | bns_scores_dict = ScoredResultObject( 44 | scored_matrix=bns_score_csr_matrix, 45 | label2id_dict=label2id_dict, 46 | feature2id_dict=vocabulary 47 | ).convert_score_matrix2score_record() 48 | self.assertTrue(bns_scores_dict, list) 49 | #assert isinstance(bns_scores_dict, list) 50 | #import pprint 51 | #pprint.pprint(bns_scores_dict) 52 | 53 | 54 | def test_check_input_error(self): 55 | incorrect_input_dict = { 56 | "label_a": [ 57 | ["I", "aa", "aa", "aa", "aa", "aa"], 58 | ["bb", "aa", "aa", "aa", "aa", "aa"], 59 | ["I", "aa", "hero", "some", "ok", "aa"] 60 | ], 61 | "label_b": [ 62 | ["bb", "bb", "bb"], 63 | ["bb", "bb", "bb"], 64 | ["hero", "ok", "bb"], 65 | ["hero", "cc", "bb"], 66 | ["cc", "cc", "cc"], 67 | ["cc", "cc", "bb"], 68 | ["xx", "xx", "cc"], 69 | ["aa", "xx", "cc"], 70 | ], 71 | "label_c":[ 72 | ["aa", "xx", "cc"] 73 | ] 74 | } 75 | 76 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 77 | labeled_documents=incorrect_input_dict, 78 | n_jobs=5 79 | ) 80 | assert isinstance(data_csr_matrix, DataCsrMatrix) 81 | csr_matrix_ = data_csr_matrix.csr_matrix_ 82 | n_docs_distribution = data_csr_matrix.n_docs_distribution 83 | try: 84 | bns_python3.BNS().fit_transform(X=csr_matrix_, y=None, unit_distribution=n_docs_distribution) 85 | except: 86 | pass 87 | 88 | def test_bns_cython(self): 89 | incorrect_input_dict = { 90 | "label_a": [ 91 | ["I", "aa", "aa", "aa", "aa", "aa"], 92 | ["bb", "aa", "aa", "aa", "aa", "aa"], 93 | ["I", "aa", "hero", "some", "ok", "aa"] 94 | ], 95 | "label_b": [ 96 | ["bb", "bb", "bb"], 97 | ["bb", "bb", "bb"], 98 | ["hero", "ok", "bb"], 99 | ["hero", "cc", "bb"], 100 | ["cc", "cc", "cc"], 101 | ["cc", "cc", "bb"], 102 | ["xx", "xx", "cc"], 103 | ["aa", "xx", "cc"], 104 | ] 105 | } 106 | 107 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 108 | labeled_documents=incorrect_input_dict, 109 | n_jobs=5 110 | ) 111 | assert isinstance(data_csr_matrix, DataCsrMatrix) 112 | csr_matrix_ = data_csr_matrix.csr_matrix_ 113 | n_docs_distribution = data_csr_matrix.n_docs_distribution 114 | 115 | result_bns = bns_python3.BNS().fit_transform(X=csr_matrix_, 116 | y=None, 117 | unit_distribution=n_docs_distribution, 118 | use_cython=True) 119 | 120 | 121 | if __name__ == '__main__': 122 | unittest.main() -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " singlehtml to make a single large HTML file" 22 | @echo " pickle to make pickle files" 23 | @echo " json to make JSON files" 24 | @echo " htmlhelp to make HTML files and a HTML help project" 25 | @echo " qthelp to make HTML files and a qthelp project" 26 | @echo " devhelp to make HTML files and a Devhelp project" 27 | @echo " epub to make an epub" 28 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 29 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 30 | @echo " text to make text files" 31 | @echo " man to make manual pages" 32 | @echo " changes to make an overview of all changed/added/deprecated items" 33 | @echo " linkcheck to check all external links for integrity" 34 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 35 | 36 | clean: 37 | -rm -rf $(BUILDDIR)/* 38 | 39 | html: 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | @echo 42 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 43 | 44 | dirhtml: 45 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 48 | 49 | singlehtml: 50 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 51 | @echo 52 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 53 | 54 | pickle: 55 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 56 | @echo 57 | @echo "Build finished; now you can process the pickle files." 58 | 59 | json: 60 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 61 | @echo 62 | @echo "Build finished; now you can process the JSON files." 63 | 64 | htmlhelp: 65 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 66 | @echo 67 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 68 | ".hhp project file in $(BUILDDIR)/htmlhelp." 69 | 70 | qthelp: 71 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 72 | @echo 73 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 74 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 75 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/document-feature-selection.qhcp" 76 | @echo "To view the help file:" 77 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/document-feature-selection.qhc" 78 | 79 | devhelp: 80 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 81 | @echo 82 | @echo "Build finished." 83 | @echo "To view the help file:" 84 | @echo "# mkdir -p $$HOME/.local/share/devhelp/document-feature-selection" 85 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/document-feature-selection" 86 | @echo "# devhelp" 87 | 88 | epub: 89 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 90 | @echo 91 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 92 | 93 | latex: 94 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 95 | @echo 96 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
97 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 98 | "(use \`make latexpdf' here to do that automatically)." 99 | 100 | latexpdf: 101 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 102 | @echo "Running LaTeX files through pdflatex..." 103 | make -C $(BUILDDIR)/latex all-pdf 104 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 105 | 106 | text: 107 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 108 | @echo 109 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 110 | 111 | man: 112 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 113 | @echo 114 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 115 | 116 | changes: 117 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 118 | @echo 119 | @echo "The overview file is in $(BUILDDIR)/changes." 120 | 121 | linkcheck: 122 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 123 | @echo 124 | @echo "Link check complete; look for any errors in the above output " \ 125 | "or in $(BUILDDIR)/linkcheck/output.txt." 126 | 127 | doctest: 128 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 129 | @echo "Testing of doctests in the sources finished, look at the " \ 130 | "results in $(BUILDDIR)/doctest/output.txt." -------------------------------------------------------------------------------- /DocumentFeatureSelection/soa/soa_python3.py: -------------------------------------------------------------------------------- 1 | from scipy.sparse import csr_matrix 2 | from numpy import memmap 3 | from typing import Union 4 | from DocumentFeatureSelection.init_logger import logger 5 | import logging 6 | import joblib 7 | import math 8 | import numpy 9 | 10 | __author__ = 'kensuke-mi' 11 | 12 | 13 | def soa(X:Union[memmap, csr_matrix], 14 | unit_distribution:numpy.ndarray, 15 | n_total_docs:int, 16 | feature_index:int, 17 | sample_index:int, verbose=False): 18 | # X is either of term-frequency matrix per label or document-frequency per label 19 | assert isinstance(X, (memmap, csr_matrix)) 20 | assert isinstance(unit_distribution, numpy.ndarray) 21 | assert isinstance(feature_index, int) 22 | assert isinstance(sample_index, int) 23 | 24 | matrix_size = X.shape 25 | NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] 26 | 27 | # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e 28 | freq_w_e = X[sample_index, feature_index] 29 | # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e 30 | freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() 31 | # freq_e is the number of the unit having specific label e 32 | freq_e = unit_distribution[sample_index] 33 | # freq_not_e is the number of the unit NOT having the specific label e 34 | freq_not_e = n_total_docs - freq_e 35 | 36 | if verbose: 37 | logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) 38 | logging.debug('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format( 39 | freq_w_e, 40 | freq_w_not_e, 41 | freq_e, 42 | freq_not_e 43 | )) 44 | 45 | if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0: 46 | return 0 47 | else: 48 | nominator = (float(freq_w_e) * freq_not_e) 49 | denominator = (float(freq_e) * freq_w_not_e) 50 | ans = nominator / denominator 51 | assert isinstance(ans, float) 52 | soa_val = math.log(ans, 2) 53 | return soa_val 54 | 55 | 56 | class SOA(object): 57 | def 
__init__(self): 58 | pass 59 | 60 | def fit_transform(self, 61 | X: Union[memmap, csr_matrix], 62 | unit_distribution: numpy.ndarray, 63 | n_jobs: int=1, 64 | verbose=False, 65 | joblib_backend: str='multiprocessing', 66 | use_cython: bool=False): 67 | """* What you can do 68 | - Get SOA weighted-score matrix. 69 | - You can get fast-speed with Cython 70 | """ 71 | assert isinstance(X, (memmap, csr_matrix)) 72 | assert isinstance(unit_distribution, numpy.ndarray) 73 | 74 | matrix_size = X.shape 75 | sample_range = list(range(0, matrix_size[0])) 76 | feature_range = list(range(0, matrix_size[1])) 77 | n_total_document = sum(unit_distribution) 78 | 79 | logger.debug(msg='Start calculating SOA') 80 | logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) 81 | 82 | if use_cython: 83 | import pyximport; pyximport.install() 84 | from DocumentFeatureSelection.soa.soa_cython import main 85 | logger.warning(msg='n_jobs parameter is invalid when use_cython=True') 86 | soa_score_csr_source = main(X=X, 87 | n_docs_distribution=unit_distribution, 88 | n_total_doc=n_total_document, 89 | sample_range=sample_range, 90 | feature_range=feature_range, 91 | verbose=False) 92 | else: 93 | self.soa = soa 94 | soa_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( 95 | joblib.delayed(self.docId_word_soa)( 96 | X=X, 97 | unit_distribution=unit_distribution, 98 | feature_index=feature_index, 99 | sample_index=sample_index, 100 | n_total_doc=n_total_document, 101 | verbose=verbose 102 | ) 103 | for sample_index in sample_range 104 | for feature_index in feature_range 105 | ) 106 | 107 | row_list = [t[0] for t in soa_score_csr_source] 108 | col_list = [t[1] for t in soa_score_csr_source] 109 | data_list = [t[2] for t in soa_score_csr_source] 110 | 111 | soa_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)), 112 | shape=(X.shape[0], 113 | X.shape[1])) 114 | 115 | logging.debug(msg='End calculating SOA') 116 | 117 | return soa_featured_csr_matrix 118 | 119 | def docId_word_soa(self, 120 | X: Union[memmap, csr_matrix], 121 | unit_distribution: numpy.ndarray, 122 | n_total_doc: int, 123 | feature_index: int, 124 | sample_index: int, verbose=False): 125 | """ 126 | """ 127 | assert isinstance(X, (memmap, csr_matrix)) 128 | assert isinstance(unit_distribution, numpy.ndarray) 129 | assert isinstance(feature_index, int) 130 | assert isinstance(sample_index, int) 131 | 132 | soa_score = self.soa( 133 | X=X, 134 | unit_distribution=unit_distribution, 135 | feature_index=feature_index, 136 | sample_index=sample_index, 137 | n_total_docs=n_total_doc, 138 | verbose=verbose 139 | ) 140 | return sample_index, feature_index, soa_score 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DocumentFeatureSelection 2 | ========================== 3 | 4 | # what's this? 5 | 6 | This is set of feature selection codes from text data. 7 | (About feature selection, see [here](http://nlp.stanford.edu/IR-book/html/htmledition/feature-selection-1.html) or [here](http://stackoverflow.com/questions/13603882/feature-selection-and-reduction-for-text-classification)) 8 | 9 | The Feature selection is really important when you use machine learning metrics on natural language data. 10 | The natural language data usually contains a lot of noise information, thus machine learning metrics are weak if you don't process any feature selection. 
11 | (There are some exceptions, such as _Decision Tree_ or _Random Forest_, which have a feature-selection mechanism inside the algorithm itself.)
12 | 
13 | Feature selection is also useful when you inspect your text data.
14 | With feature selection, you can see which features really contribute to specific labels.
15 | 
16 | Please visit the [project page on github](https://github.com/Kensuke-Mitsuzawa/DocumentFeatureSelection).
17 | 
18 | If you find any bugs, please report them on the github issue tracker.
19 | 
20 | Pull requests are welcome.
21 | 
22 | ## Supported methods
23 | 
24 | This package provides several feature-selection metrics.
25 | Currently, it supports the following feature-selection methods:
26 | 
27 | * TF-IDF
28 | * Pointwise mutual information (PMI)
29 | * Strength of Association (SOA)
30 | * Bi-Normal Separation (BNS)
31 | 
32 | ## Contributions of this package
33 | 
34 | * Easy interface for pre-processing
35 | * Easy interface for accessing feature-selection methods
36 | * Fast computation thanks to sparse matrices and multi-processing
37 | 
38 | # Overview of methods
39 | 
40 | ## TF-IDF
41 | 
42 | This method simply calls scikit-learn's `TfidfTransformer`.
43 | 
44 | See the [scikit-learn documentation](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html) for detailed information.
45 | 
46 | ## PMI
47 | 
48 | PMI measures the association between a _feature_ (i.e. token) and a _category_ (i.e. label).
49 | Concretely, it builds a _cross table_ (also called a _contingency table_) and computes the joint and marginal probabilities on it.
50 | 
51 | To learn more, see this [reference](https://www.eecis.udel.edu/~trnka/CISC889-11S/lectures/philip-pmi.pdf).
52 | 
53 | In the Python world, [NLTK](http://www.nltk.org/howto/collocations.html) and [another package](https://github.com/Bollegala/svdmi) also provide PMI.
54 | Check them and choose according to your preference and use case.
55 | 
56 | 
57 | ## SOA
58 | 
59 | SOA is a feature-selection method that improves on PMI.
60 | PMI is weak when a feature has a low word frequency.
61 | SOA is based on the PMI computation, but it remains usable on such low-frequency features.
62 | Moreover, it can capture anti-correlation between features and categories.
63 | 
64 | In this package, the SOA formula is taken from the following paper:
65 | 
66 | `Saif Mohammad and Svetlana Kiritchenko, "Using Hashtags to Capture Fine Emotion Categories from Tweets", Computational Intelligence, 01/2014; 31(2).`
67 | 
68 | ```
69 | SOA(w, e) = log_2 \frac{freq(w, e) * freq(\neg e)}{freq(e) * freq(w, \neg e)}
70 | ```
71 | 
72 | where
73 | 
74 | * freq(w, e) is the number of times _w_ occurs in a unit (sentence or document) with label _e_
75 | * freq(w, ¬e) is the number of times _w_ occurs in units that do not have the label _e_
76 | * freq(e) is the number of units having the label _e_
77 | * freq(¬e) is the number of units NOT having the label _e_
78 | 
79 | ## BNS
80 | 
81 | BNS is a feature-selection method for binary-class data.
82 | Several other methods are available for binary-class data, such as _information gain (IG)_, _chi-squared (CHI)_,
83 | and _odds ratio (Odds)_.
84 | 
85 | The problem arises when you run feature selection on skewed data.
86 | Those methods are weak on such skewed data; _BNS_, in contrast, remains effective even when the data is skewed.
87 | The following papers show how BNS behaves on skewed data.
88 | 
89 | ```Lei Tang and Huan Liu, "Bias Analysis in Text Classification for Highly Skewed Data", 2005```
90 | 
91 | or
92 | 
93 | ```George Forman, "An Extensive Empirical Study of Feature Selection Metrics for Text Classification", Journal of Machine Learning Research 3 (2003) 1289-1305```
94 | 
95 | 
96 | # Requirement
97 | 
98 | * Python 3.x (checked under Python 3.5)
99 | 
100 | 
101 | # Setting up
102 | 
103 | ## Install
104 | 
105 | `python setup.py install`
106 | 
107 | ### Note
108 | 
109 | You might see an error message while running this command, such as
110 | 
111 | ```
112 | We failed to install numpy automatically. Try installing numpy manually or Try anaconda distribution.
113 | ```
114 | 
115 | This happens because `setup.py` tries to install numpy and scipy with `pip` but fails.
116 | numpy and scipy must already be present before `scikit-learn` can be installed.
117 | 
118 | In this case, take one of the following options:
119 | 
120 | * Install `numpy` and `scipy` manually
121 | * Use the `anaconda` Python distribution. Please visit [their site](https://www.continuum.io/downloads).
122 | 
123 | # Example
124 | 
125 | ```python
126 | input_dict = {
127 |     "label_a": [
128 |         ["I", "aa", "aa", "aa", "aa", "aa"],
129 |         ["bb", "aa", "aa", "aa", "aa", "aa"],
130 |         ["I", "aa", "hero", "some", "ok", "aa"]
131 |     ],
132 |     "label_b": [
133 |         ["bb", "bb", "bb"],
134 |         ["bb", "bb", "bb"],
135 |         ["hero", "ok", "bb"],
136 |         ["hero", "cc", "bb"],
137 |     ],
138 |     "label_c": [
139 |         ["cc", "cc", "cc"],
140 |         ["cc", "cc", "bb"],
141 |         ["xx", "xx", "cc"],
142 |         ["aa", "xx", "cc"],
143 |     ]
144 | }
145 | 
146 | from DocumentFeatureSelection import interface
147 | interface.run_feature_selection(input_dict, method='pmi', use_cython=True).convert_score_matrix2score_record()
148 | ```
149 | Then you get the result:
150 | 
151 | ```python
152 | [{'score': 0.14976146817207336, 'label': 'label_c', 'feature': 'bb', 'frequency': 1.0}, ...]
153 | ```
154 | 
155 | See the scripts in `examples/`.
156 | 
157 | # For developers
158 | 
159 | You can set up a dev environment with docker-compose.
160 | 
161 | These commands run the tests inside a docker container.
162 | 
163 | ```bash
164 | $ cd tests/
165 | $ docker-compose build
166 | $ docker-compose up
167 | ```
168 | 
-------------------------------------------------------------------------------- /DocumentFeatureSelection/pmi/PMI_python3.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | from __future__ import division 6 | from scipy.sparse import csr_matrix 7 | from numpy import memmap 8 | from typing import Union 9 | from DocumentFeatureSelection.init_logger import logger 10 | 11 | import logging 12 | import joblib 13 | import math 14 | import numpy 15 | 16 | __author__ = 'kensuke-mi' 17 | 18 | 19 | # TODO normzalized pmiの導入 20 | # http://sucrose.hatenablog.com/entry/2014/12/02/235959 21 | 22 | 23 | def pmi(X: Union[csr_matrix, memmap], 24 | n_docs_distribution: numpy.ndarray, 25 | n_total_doc: int, 26 | feature_index: int, 27 | sample_index: int, 28 | verbose=False): 29 | """get PMI score for given feature & sample index 30 | 31 | :param X: 32 | :param feature_index: 33 | :param sample_index: 34 | :return: 35 | """ 36 | assert isinstance(X, (memmap, csr_matrix)) 37 | assert isinstance(n_docs_distribution, numpy.ndarray) 38 | assert isinstance(feature_index, int) 39 | assert isinstance(sample_index, int) 40 | 41 | matrix_size = X.shape 42 | sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] 43 | 44 | # n_11 is #docs having feature(i.e. word) in the specified index(label) 45 | n_11 = X[sample_index, feature_index] 46 | # n_01 is #docs NOT having feature in the specified index(label) 47 | n_01 = n_docs_distribution[sample_index] - n_11 48 | # n_10 is #docs having feature in NOT specified index(indexes except specified index) 49 | n_10 = X[sample_indexes, feature_index].sum() 50 | # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) 51 | n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) 52 | 53 | if verbose: 54 | logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) 55 | logging.debug('n_11:{} n_01:{} n_10:{} n_00:{}'.format( 56 | n_11, 57 | n_01, 58 | n_10, 59 | n_00 60 | )) 61 | 62 | if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: 63 | return 0 64 | else: 65 | temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) 66 | temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) 67 | temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) 68 | temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) 69 | score = temp1 + temp2 + temp3 + temp4 70 | 71 | return score 72 | 73 | 74 | class PMI(object): 75 | def __init__(self): 76 | pass 77 | 78 | def fit_transform(self, 79 | X: Union[csr_matrix, memmap], 80 | n_docs_distribution, 81 | n_jobs=1, 82 | verbose=False, 83 | joblib_backend='multiprocessing', 84 | use_cython:bool=False): 85 | """Main method of PMI class. 
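        * Args (as used elsewhere in this package)
            - X: label-by-feature document-frequency matrix (csr_matrix or numpy.memmap)
            - n_docs_distribution: numpy.ndarray holding the number of documents per label
        * A rough usage sketch added for illustration; the toy counts below are made up.
            >>> import numpy
            >>> from scipy.sparse import csr_matrix
            >>> X = csr_matrix(numpy.array([[2, 1], [1, 3]]))
            >>> scores = PMI().fit_transform(X=X, n_docs_distribution=numpy.array([3, 4]), n_jobs=1)
            >>> scores.shape
            (2, 2)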
86 | """ 87 | assert isinstance(X, (memmap, csr_matrix)) 88 | assert isinstance(n_docs_distribution, numpy.ndarray) 89 | 90 | matrix_size = X.shape 91 | sample_range = list(range(0, matrix_size[0])) 92 | feature_range = list(range(0, matrix_size[1])) 93 | n_total_document = sum(n_docs_distribution) 94 | 95 | logger.debug(msg='Start calculating PMI') 96 | logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) 97 | 98 | if use_cython: 99 | import pyximport; pyximport.install() 100 | from DocumentFeatureSelection.pmi.pmi_cython import main 101 | logger.warning(msg='n_jobs parameter is invalid when use_cython=True') 102 | pmi_score_csr_source = main(X=X, 103 | n_docs_distribution=n_docs_distribution, 104 | sample_range=sample_range, 105 | feature_range=feature_range, 106 | n_total_doc=n_total_document, 107 | verbose=False) 108 | 109 | else: 110 | self.pmi = pmi 111 | pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( 112 | joblib.delayed(self.docId_word_PMI)( 113 | X=X, 114 | n_docs_distribution=n_docs_distribution, 115 | feature_index=feature_index, 116 | sample_index=sample_index, 117 | n_total_doc=n_total_document, 118 | verbose=verbose 119 | ) 120 | for sample_index in sample_range 121 | for feature_index in feature_range 122 | ) 123 | 124 | row_list = [t[0] for t in pmi_score_csr_source] 125 | col_list = [t[1] for t in pmi_score_csr_source] 126 | data_list = [t[2] for t in pmi_score_csr_source] 127 | 128 | pmi_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)), 129 | shape=(X.shape[0], 130 | X.shape[1])) 131 | 132 | logging.debug(msg='End calculating PMI') 133 | 134 | return pmi_featured_csr_matrix 135 | 136 | def docId_word_PMI(self, 137 | X:Union[csr_matrix, memmap], 138 | n_docs_distribution:numpy.ndarray, 139 | n_total_doc:int, 140 | feature_index:int, 141 | sample_index:int, 142 | verbose=False, 143 | use_cython:bool=False): 144 | """Calculate PMI score for fit_format() 145 | 146 | :param X: 147 | :param vocabulary: 148 | :param label_id: 149 | :param word: 150 | :param label: 151 | :return: 152 | """ 153 | pmi_score = self.pmi( 154 | X=X, 155 | n_docs_distribution=n_docs_distribution, 156 | feature_index=feature_index, 157 | sample_index=sample_index, 158 | n_total_doc=n_total_doc, 159 | verbose=verbose 160 | ) 161 | return sample_index, feature_index, pmi_score 162 | 163 | 164 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/bns/bns_python3.py: -------------------------------------------------------------------------------- 1 | from scipy.sparse import csr_matrix 2 | from sklearn.base import TransformerMixin 3 | from scipy.stats import norm 4 | from numpy import ndarray, memmap 5 | from typing import Union 6 | from DocumentFeatureSelection.init_logger import logger 7 | import numpy as np 8 | import joblib 9 | import logging 10 | 11 | 12 | def bns(X:Union[memmap, csr_matrix], 13 | feature_index: int, 14 | sample_index: int, 15 | unit_distribution: np.ndarray, 16 | true_index: int = 0, 17 | verbose: bool = False): 18 | if true_index == 0: 19 | false_index = 1 20 | elif true_index == 1: 21 | false_index = 0 22 | else: 23 | raise Exception('true index must be either of 0 or 1') 24 | 25 | # trueラベルで出現した回数 26 | # tp is frequency of features in the specified positive label 27 | tp = X[true_index, feature_index] 28 | # trueラベルで出現しなかった回数 29 | # fp is frequency of NON-features(expect specified feature) in the specified positive label 30 | fp = 
unit_distribution[true_index] - tp 31 | 32 | # negativeラベルで出現した回数 33 | # fn is frequency of features in the specified negative label 34 | fn = X[false_index, feature_index] 35 | # negativeラベルで出現しなかった回数 36 | # fp is frequency of NON-features(expect specified feature) in the specified negative label 37 | tn = unit_distribution[false_index] - fn 38 | 39 | if tn < 0.0: 40 | print('aaaa') 41 | 42 | pos = tp + fn 43 | neg = fp + tn 44 | 45 | tpr = tp / pos 46 | fpr = fp / neg 47 | 48 | if verbose: 49 | logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) 50 | logging.debug('tp:{} fp:{} fn:{} tn:{} pos:{} neg:{} tpr:{} fpr:{}'.format( 51 | tp, 52 | fp, 53 | fn, 54 | tn, 55 | pos, 56 | neg, 57 | tpr, 58 | fpr 59 | )) 60 | 61 | #bns_score = np.abs(norm.ppf(norm.cdf(tpr)) - norm.ppf(norm.cdf(fpr))) 62 | bns_score = abs(norm.ppf(tpr) - norm.ppf(fpr)) 63 | return bns_score 64 | 65 | 66 | 67 | class BNS(TransformerMixin): 68 | def __init__(self): 69 | pass 70 | 71 | def __check_matrix_form(self, X): 72 | assert isinstance(X, csr_matrix) 73 | matrix_size = X.shape 74 | n_categories = matrix_size[0] 75 | if n_categories != 2: 76 | raise Exception('BNS input must be of 2 categories') 77 | 78 | def fit_transform(self, 79 | X: Union[memmap, csr_matrix], 80 | y=None, 81 | **fit_params): 82 | """* What you can do 83 | 84 | * Args 85 | - X; scipy.csr_matrix or numpy.memmap: Matrix object 86 | 87 | * Params 88 | - unit_distribution; list or ndarray: The number of document frequency per label. Ex. [10, 20] 89 | - n_jobs: The number of cores when you use joblib. 90 | - joblib_backend: "multiprocessing" or "multithreding" 91 | - true_index: The index number of True label. 92 | - use_cython; boolean: True, then Use Cython for computation. False, not. 
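        * A rough usage sketch added for illustration; the toy counts below are made up.
          BNS expects exactly two labels, so X has two rows and unit_distribution has two entries.
            >>> import numpy
            >>> from scipy.sparse import csr_matrix
            >>> X = csr_matrix(numpy.array([[3, 1, 2], [1, 4, 2]]))
            >>> scores = BNS().fit_transform(X, unit_distribution=numpy.array([4, 5]), true_index=0)
            >>> scores.shape
            (2, 3)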
93 | """ 94 | assert isinstance(X, csr_matrix) 95 | 96 | # -------------------------------------------------------- 97 | # Check parameters 98 | if not 'unit_distribution' in fit_params: 99 | raise Exception('You must put unit_distribution parameter') 100 | assert isinstance(fit_params['unit_distribution'], (list, ndarray)) 101 | self.__check_matrix_form(X) 102 | 103 | unit_distribution = fit_params['unit_distribution'] 104 | 105 | if 'n_jobs' in fit_params: 106 | n_jobs = fit_params['n_jobs'] 107 | else: 108 | n_jobs = 1 109 | 110 | if 'true_index' in fit_params: 111 | true_index = fit_params['true_index'] 112 | else: 113 | true_index = 0 114 | 115 | if 'verbose' in fit_params: 116 | verbose = True 117 | else: 118 | verbose = False 119 | 120 | if 'joblib_backend' in fit_params: 121 | joblib_backend = fit_params['joblib_backend'] 122 | else: 123 | joblib_backend = 'multiprocessing' 124 | 125 | if 'use_cython' in fit_params: 126 | is_use_cython = True 127 | else: 128 | is_use_cython = False 129 | # -------------------------------------------------------- 130 | 131 | matrix_size = X.shape 132 | sample_range = list(range(0, matrix_size[0])) 133 | feature_range = list(range(0, matrix_size[1])) 134 | 135 | logger.debug(msg='Start calculating BNS with n(process)={}'.format(n_jobs)) 136 | logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) 137 | 138 | if is_use_cython: 139 | import pyximport; pyximport.install() 140 | from DocumentFeatureSelection.bns.bns_cython import main 141 | logger.warning(msg='n_jobs parameter is invalid when use_cython=True') 142 | bns_score_csr_source = main( 143 | X=X, 144 | unit_distribution=unit_distribution, 145 | sample_range=sample_range, 146 | feature_range=feature_range, 147 | true_index=true_index, 148 | verbose=verbose 149 | ) 150 | else: 151 | bns_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( 152 | joblib.delayed(self.docId_word_BNS)( 153 | X=X, 154 | feature_index=feature_index, 155 | sample_index=sample_index, 156 | true_index=true_index, 157 | unit_distribution=unit_distribution, 158 | verbose=verbose 159 | ) 160 | for sample_index in sample_range 161 | for feature_index in feature_range) 162 | 163 | row_list = [t[0] for t in bns_score_csr_source] 164 | col_list = [t[1] for t in bns_score_csr_source] 165 | data_list = [t[2] for t in bns_score_csr_source] 166 | 167 | bns_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)), 168 | shape=(X.shape[0], 169 | X.shape[1])) 170 | 171 | logging.debug(msg='End calculating BNS') 172 | 173 | return bns_featured_csr_matrix 174 | 175 | def docId_word_BNS(self, X:csr_matrix, 176 | feature_index:int, 177 | sample_index:int, 178 | unit_distribution:np.ndarray, 179 | true_index:int, 180 | verbose=False): 181 | 182 | assert isinstance(X, csr_matrix) 183 | assert isinstance(feature_index, int) 184 | assert isinstance(sample_index, int) 185 | 186 | bns_score = bns( 187 | X=X, 188 | feature_index=feature_index, 189 | sample_index=sample_index, 190 | true_index=true_index, 191 | unit_distribution=unit_distribution, 192 | verbose=verbose 193 | ) 194 | return sample_index, feature_index, bns_score -------------------------------------------------------------------------------- /tests/test_data_converter.py: -------------------------------------------------------------------------------- 1 | from DocumentFeatureSelection.common import data_converter 2 | from DocumentFeatureSelection.common.data_converter import DataCsrMatrix 3 | from scipy.sparse import csr_matrix 4 
| import unittest 5 | import numpy 6 | 7 | 8 | class TestDataConverter(unittest.TestCase): 9 | def setUp(self): 10 | self.input_dict = { 11 | "label_a": [ 12 | ["I", "aa", "aa", "aa", "aa", "aa"], 13 | ["bb", "aa", "aa", "aa", "aa", "aa"], 14 | ["I", "aa", "hero", "some", "ok", "aa"] 15 | ], 16 | "label_b": [ 17 | ["bb", "bb", "bb"], 18 | ["bb", "bb", "bb"], 19 | ["hero", "ok", "bb"], 20 | ["hero", "cc", "bb"], 21 | ], 22 | "label_c": [ 23 | ["cc", "cc", "cc"], 24 | ["cc", "cc", "bb"], 25 | ["xx", "xx", "cc"], 26 | ["aa", "xx", "cc"], 27 | ] 28 | } 29 | 30 | self.input_dict_complex_feature = { 31 | "label_a": [ 32 | [['a', 'b'], ['b', 'c'], ['a', 'b', 'c']], 33 | [['a', 'b'], ['b', 'c']], 34 | [['a', 'b']] 35 | ], 36 | "label_b": [ 37 | [['b', 'c'], ['c', 'd']], 38 | [['b', 'c']], 39 | [['b', 'c', 'd']], 40 | [['b', 'c']], 41 | ], 42 | "label_c": [ 43 | [['c', 'd'], ['a', 'b']], 44 | [['b', 'c'], ['a', 'b', 'c']], 45 | [['b', 'c']] 46 | ] 47 | } 48 | 49 | 50 | def test_check_same_csr_matrix(self): 51 | """複数回の変換を実施して、同一のcsr_matrixになることを確認する 52 | """ 53 | n_joblib_tasks = 2 54 | 55 | data_csr_matrix1 = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 56 | labeled_documents=self.input_dict, 57 | n_jobs=n_joblib_tasks 58 | ) 59 | assert isinstance(data_csr_matrix1, DataCsrMatrix) 60 | csr_matrix_1 = data_csr_matrix1.csr_matrix_ 61 | label_group_dict_1 = data_csr_matrix1.label2id_dict 62 | vocabulary_1 = data_csr_matrix1.vocabulary 63 | n_doc_distri_1 = data_csr_matrix1.n_docs_distribution 64 | n_term_distir_1 = data_csr_matrix1.n_term_freq_distribution 65 | dense_matrix_1 = csr_matrix_1.toarray() 66 | 67 | data_csr_matrix2 = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 68 | labeled_documents=self.input_dict, 69 | n_jobs=n_joblib_tasks 70 | ) 71 | assert isinstance(data_csr_matrix2, DataCsrMatrix) 72 | csr_matrix_2 = data_csr_matrix2.csr_matrix_ 73 | label_group_dict_2 = data_csr_matrix2.label2id_dict 74 | vocabulary_2 = data_csr_matrix2.vocabulary 75 | n_doc_distri_2 = data_csr_matrix2.n_docs_distribution 76 | n_term_distir_2 = data_csr_matrix2.n_term_freq_distribution 77 | dense_matrix_2 = data_csr_matrix2.csr_matrix_.toarray() 78 | 79 | data_csr_matrix3 = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 80 | labeled_documents=self.input_dict, 81 | n_jobs=n_joblib_tasks 82 | ) 83 | assert isinstance(data_csr_matrix3, DataCsrMatrix) 84 | csr_matrix_3 = data_csr_matrix3.csr_matrix_ 85 | label_group_dict_3 = data_csr_matrix3.label2id_dict 86 | vocabulary_3 = data_csr_matrix3.vocabulary 87 | n_doc_distri_3 = data_csr_matrix3.n_docs_distribution 88 | n_term_distir_3 = data_csr_matrix3.n_term_freq_distribution 89 | dense_matrix_3 = data_csr_matrix3.csr_matrix_.toarray() 90 | 91 | assert numpy.array_equal(dense_matrix_1, dense_matrix_2) 92 | assert numpy.array_equal(dense_matrix_2, dense_matrix_3) 93 | assert numpy.array_equal(dense_matrix_1, dense_matrix_3) 94 | 95 | assert vocabulary_1 == vocabulary_2 96 | assert vocabulary_2 == vocabulary_3 97 | assert vocabulary_1 == vocabulary_3 98 | 99 | 100 | 101 | def test_basic_convert_data(self): 102 | """checks it works of not when n_jobs=1, n_process=1 103 | 104 | data convert過程のミスが疑われるので、整合性のチェックをする 105 | 106 | :return: 107 | """ 108 | 109 | csr_matrix_information = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 110 | labeled_documents=self.input_dict, 111 | n_jobs=5 112 | ) 113 | assert isinstance(csr_matrix_information, DataCsrMatrix) 114 | csr_matrix_ = csr_matrix_information.csr_matrix_ 
115 | label_group_dict = csr_matrix_information.label2id_dict 116 | vocabulary = csr_matrix_information.vocabulary 117 | 118 | assert isinstance(csr_matrix_, csr_matrix) 119 | assert isinstance(label_group_dict, dict) 120 | assert isinstance(vocabulary, dict) 121 | 122 | n_correct_sample = 3 123 | n_correct_featute = 8 124 | 125 | assert csr_matrix_.shape[0] == n_correct_sample 126 | assert csr_matrix_.shape[1] == n_correct_featute 127 | 128 | dense_matrix_constructed_matrix = csr_matrix_.toarray() 129 | 130 | # vocaburary id of correct matrix is {'cc': 3, 'aa': 1, 'some': 6, 'xx': 7, 'I': 0, 'ok': 5, 'hero': 4, 'bb': 2} 131 | # label id of correct matrix is {'label_c': 2, 'label_a': 0, 'label_b': 1} 132 | correct_array_numpy = numpy.array( 133 | [[2, 3, 1, 0, 1, 1, 1, 0], 134 | [0, 0, 4, 1, 2, 1, 0, 0], 135 | [0, 1, 1, 4, 0, 0, 0, 2] 136 | ]).astype(numpy.int64) 137 | assert numpy.array_equal(correct_array_numpy, dense_matrix_constructed_matrix) 138 | 139 | 140 | 141 | def test_multi_process_convert_data(self): 142 | """checks if it works or not when n_process is more than 1 143 | 144 | :return: 145 | """ 146 | 147 | data_csr_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 148 | labeled_documents=self.input_dict, 149 | n_jobs=5 150 | ) 151 | 152 | assert isinstance(data_csr_object.csr_matrix_, csr_matrix) 153 | assert isinstance(data_csr_object.label2id_dict, dict) 154 | assert isinstance(data_csr_object.vocabulary, dict) 155 | 156 | def test_complex_feature_convertion(self): 157 | """""" 158 | csr_matrix_information = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 159 | labeled_documents=self.input_dict_complex_feature, 160 | n_jobs=1 161 | ) 162 | assert isinstance(csr_matrix_information, DataCsrMatrix) 163 | csr_matrix_ = csr_matrix_information.csr_matrix_ 164 | label_group_dict = csr_matrix_information.label2id_dict 165 | vocabulary = csr_matrix_information.vocabulary 166 | 167 | assert isinstance(csr_matrix_, csr_matrix) 168 | assert isinstance(label_group_dict, dict) 169 | assert isinstance(vocabulary, dict) 170 | 171 | n_correct_sample = 3 172 | n_correct_feature = 5 173 | 174 | assert csr_matrix_.shape[0] == n_correct_sample 175 | assert csr_matrix_.shape[1] == n_correct_feature 176 | 177 | dense_matrix_constructed_matrix = csr_matrix_.toarray() 178 | 179 | # vocaburary id of correct matrix is {'cc': 3, 'aa': 1, 'some': 6, 'xx': 7, 'I': 0, 'ok': 5, 'hero': 4, 'bb': 2} 180 | # label id of correct matrix is {'label_c': 2, 'label_a': 0, 'label_b': 1} 181 | correct_array_numpy = numpy.array( 182 | [[1.0, 3.0, 0.0, 2.0, 0.0], 183 | [0.0, 0.0, 1.0, 3.0, 1.0], 184 | [1.0, 1.0, 0.0, 2.0, 1.0], 185 | ]).astype(numpy.int64) 186 | assert numpy.array_equal(correct_array_numpy, dense_matrix_constructed_matrix) 187 | 188 | 189 | if __name__ == '__main__': 190 | unittest.main() 191 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/interface.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | from DocumentFeatureSelection.models import DataCsrMatrix, ScoredResultObject, AvailableInputTypes 3 | from DocumentFeatureSelection.common import data_converter 4 | from DocumentFeatureSelection.soa.soa_python3 import SOA 5 | from DocumentFeatureSelection.pmi.PMI_python3 import PMI 6 | from DocumentFeatureSelection.tf_idf.tf_idf import TFIDF 7 | from DocumentFeatureSelection.bns.bns_python3 import BNS 8 | from DocumentFeatureSelection.init_logger import logger 9 | from tempfile import mkdtemp 10 | from typing import Dict 11 | from scipy.sparse.csr import csr_matrix 12 | import shutil 13 | METHOD_NAMES = ['soa', 'pmi', 'tf_idf', 'bns'] 14 | N_FEATURE_SWITCH_STRATEGY = 1000000 15 | 16 | 17 | def decide_joblib_strategy(feature2id_dict: Dict[str, int])->str: 18 | if len(feature2id_dict) > N_FEATURE_SWITCH_STRATEGY: 19 | return 'threading' 20 | else: 21 | return 'multiprocessing' 22 | 23 | 24 | def run_feature_selection(input_dict: AvailableInputTypes, 25 | method: str, 26 | use_cython: bool=False, 27 | is_use_cache: bool=False, 28 | is_use_memmap: bool=False, 29 | cache_backend: str='PersistentDict', 30 | path_working_dir: str=None, 31 | matrix_form=None, 32 | n_jobs: int=1)->ScoredResultObject: 33 | """An interface function of the DocumentFeatureSelection package. 34 | 35 | * Args 36 | - input_dict: Dict-object which has category-name as key and list of features as value. 37 | - You can put dict or sqlitedict.SqliteDict, or DocumentFeatureSelection.models.PersistentDict 38 | - method: A method name of the feature selection metric 39 | - use_cython: boolean flag to use cython code for computation. 40 | It's much faster to use cython than native-python code 41 | - is_use_cache: boolean flag to use the disk-drive for keeping objects which tend to be huge. 42 | - is_use_memmap: boolean flag to use memmap for keeping the matrix object. 43 | - path_working_dir: str object. 44 | - The path to a directory where cache files or the memmap matrix object are saved. If you leave it None, 45 | a temporary directory is created and files are saved there. 46 | - cache_backend 47 | - Name of the cache backend used when is_use_cache is True. [PersistentDict, SqliteDict] 48 | - matrix_form: if 'term_freq' is given together with method='soa', SOA is computed on a term-frequency matrix instead of a document-frequency matrix. 49 | """ 50 | if method not in METHOD_NAMES: 51 | raise Exception('method name must be either of {}. Yours: {}'.format(METHOD_NAMES, method)) 52 | 53 | if (is_use_cache or is_use_memmap) and path_working_dir is None: 54 | path_working_dir = mkdtemp() 55 | logger.info("Temporary files are created under {}".format(path_working_dir)) 56 | 57 | if method == 'tf_idf': 58 | """You get scored-matrix with term-frequency. 59 | ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix 60 | """ 61 | matrix_data_object = data_converter.DataConverter().convert_multi_docs2term_frequency_matrix( 62 | labeled_documents=input_dict, 63 | n_jobs=n_jobs, 64 | is_use_cache=is_use_cache, 65 | is_use_memmap=is_use_memmap, 66 | path_working_dir=path_working_dir, 67 | cache_backend=cache_backend 68 | ) 69 | assert isinstance(matrix_data_object, DataCsrMatrix) 70 | 71 | scored_sparse_matrix = TFIDF().fit_transform(X=matrix_data_object.csr_matrix_) 72 | assert isinstance(scored_sparse_matrix, csr_matrix) 73 | 74 | elif method in ['soa', 'pmi'] and matrix_form is None: 75 | """You get scored-matrix with either of soa or pmi. 
76 | """ 77 | matrix_data_object = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 78 | labeled_documents=input_dict, 79 | n_jobs=n_jobs, 80 | is_use_cache=is_use_cache, 81 | is_use_memmap=is_use_memmap, 82 | path_working_dir=path_working_dir 83 | ) 84 | assert isinstance(matrix_data_object, DataCsrMatrix) 85 | if method == 'pmi': 86 | backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) 87 | scored_sparse_matrix = PMI().fit_transform(X=matrix_data_object.csr_matrix_, 88 | n_docs_distribution=matrix_data_object.n_docs_distribution, 89 | n_jobs=n_jobs, 90 | joblib_backend=backend_strategy, 91 | use_cython=use_cython) 92 | assert isinstance(scored_sparse_matrix, csr_matrix) 93 | elif method == 'soa': 94 | backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) 95 | scored_sparse_matrix = SOA().fit_transform(X=matrix_data_object.csr_matrix_, 96 | unit_distribution=matrix_data_object.n_docs_distribution, 97 | n_jobs=n_jobs, 98 | joblib_backend=backend_strategy, 99 | use_cython=use_cython) 100 | assert isinstance(scored_sparse_matrix, csr_matrix) 101 | else: 102 | raise Exception() 103 | 104 | elif method == 'soa' and matrix_form == 'term_freq': 105 | # You get score-matrix with soa from term-frequency matrix. 106 | # ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix 107 | matrix_data_object = data_converter.DataConverter().convert_multi_docs2term_frequency_matrix( 108 | labeled_documents=input_dict, 109 | n_jobs=n_jobs, 110 | is_use_cache=is_use_cache, 111 | is_use_memmap=is_use_memmap, 112 | path_working_dir=path_working_dir 113 | ) 114 | assert isinstance(matrix_data_object, DataCsrMatrix) 115 | 116 | backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) 117 | scored_sparse_matrix = SOA().fit_transform(X=matrix_data_object.csr_matrix_, 118 | unit_distribution=matrix_data_object.n_docs_distribution, 119 | n_jobs=n_jobs, 120 | joblib_backend=backend_strategy) 121 | assert isinstance(scored_sparse_matrix, csr_matrix) 122 | 123 | elif method == 'bns': 124 | # You get scored-matrix with bns. 125 | # ATTENTION: #label should be 2 always. 
126 | # Consider shorter label name as positive label 127 | # (positive and negative do NOT have any meaning in this context) # 128 | positive_label_name = sorted(input_dict.keys(), key=lambda x: len(x))[0] 129 | 130 | if len(input_dict.keys()) >= 3: 131 | raise KeyError('input_dict must not have more than 2 keys if you would like to use BNS.') 132 | 133 | matrix_data_object = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 134 | labeled_documents=input_dict, 135 | n_jobs=n_jobs, 136 | is_use_cache=is_use_cache, 137 | is_use_memmap=is_use_memmap, 138 | path_working_dir=path_working_dir 139 | ) 140 | assert isinstance(matrix_data_object, DataCsrMatrix) 141 | 142 | true_class_index = matrix_data_object.label2id_dict[positive_label_name] 143 | backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) 144 | scored_sparse_matrix = BNS().fit_transform( 145 | X=matrix_data_object.csr_matrix_, 146 | unit_distribution=matrix_data_object.n_term_freq_distribution, 147 | n_jobs=n_jobs, 148 | true_index=true_class_index, 149 | joblib_backend=backend_strategy, 150 | use_cython=use_cython 151 | ) 152 | assert isinstance(scored_sparse_matrix, csr_matrix) 153 | else: 154 | raise Exception() 155 | logger.info('Done computation.') 156 | 157 | # delete tmp file directory 158 | if is_use_cache or is_use_memmap: 159 | logger.debug("Delete temporary files {}".format(path_working_dir)) 160 | shutil.rmtree(path_working_dir) 161 | 162 | return ScoredResultObject( 163 | scored_matrix=scored_sparse_matrix, 164 | label2id_dict=matrix_data_object.label2id_dict, 165 | feature2id_dict=matrix_data_object.vocabulary, 166 | method=method, 167 | matrix_form=matrix_form, 168 | frequency_matrix=matrix_data_object.csr_matrix_) 169 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """document-feature-selection documentation build configuration file, created by 3 | sphinx-quickstart on Wed Feb 13 11:51:12 2013. 4 | 5 | This file is execfile()d with the current directory set to its containing dir. 6 | 7 | Note that not all possible configuration values are present in this 8 | autogenerated file. 9 | 10 | All configuration values have a default; values that are commented out 11 | serve to show the default. """ 12 | from __future__ import unicode_literals 13 | __author__ = 'kensuke-mi' 14 | # 15 | import sys 16 | import os.path 17 | 18 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) 19 | from DocumentFeatureSelection import __version__ as version_orig 20 | # If extensions (or modules to document with autodoc) are in another directory, 21 | # add these directories to sys.path here. If the directory is relative to the 22 | # documentation root, use os.path.abspath to make it absolute, like shown here. 23 | # sys.path.insert(0, os.path.abspath('.')) 24 | 25 | # -- General configuration ----------------------------------------------------- 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be extensions 31 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
32 | extensions = [ 33 | 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo', 34 | 'sphinx.ext.coverage', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode', 35 | 'sphinx.ext.intersphinx', 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix of source filenames. 42 | source_suffix = '.rst' 43 | 44 | # The encoding of source files. 45 | # source_encoding = 'utf-8-sig' 46 | 47 | # The master toctree document. 48 | master_doc = 'index' 49 | 50 | # General information about the project. 51 | project = 'document-feature-selection' 52 | copyright = '2015, kensuke-mi' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = version_orig 60 | # The full version, including alpha/beta/rc tags. 61 | release = version_orig 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # language = None 66 | 67 | # There are two options for replacing |today|: either, you set today to some 68 | # non-false value, then it is used: 69 | # today = '' 70 | # Else, today_fmt is used as the format for a strftime call. 71 | # today_fmt = '%B %d, %Y' 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | exclude_patterns = [] 76 | 77 | # The reST default role (used for this markup: `text`) to use for all documents. 78 | # default_role = None 79 | 80 | # If true, '()' will be appended to :func: etc. cross-reference text. 81 | # add_function_parentheses = True 82 | 83 | # If true, the current module name will be prepended to all description 84 | # unit titles (such as .. function::). 85 | # add_module_names = True 86 | 87 | # If true, sectionauthor and moduleauthor directives will be shown in the 88 | # output. They are ignored by default. 89 | # show_authors = False 90 | 91 | # The name of the Pygments (syntax highlighting) style to use. 92 | pygments_style = 'sphinx' 93 | 94 | # A list of ignored prefixes for module index sorting. 95 | # modindex_common_prefix = [] 96 | 97 | 98 | # -- Options for HTML output --------------------------------------------------- 99 | 100 | # The theme to use for HTML and HTML Help pages. See the documentation for 101 | # a list of builtin themes. 102 | html_theme = 'sphinx_rtd_theme' 103 | 104 | # Theme options are theme-specific and customize the look and feel of a theme 105 | # further. For a list of options available for each theme, see the 106 | # documentation. 107 | # html_theme_options = {'bgcolor': '# FCFCFC; background: # FCFCFC url("fond_article.png") no-repeat center 10em; ', } 108 | 109 | # Add any paths that contain custom themes here, relative to this directory. 110 | # html_theme_path = [] 111 | 112 | # The name for this set of Sphinx documents. If None, it defaults to 113 | # " v documentation". 114 | # html_title = None 115 | 116 | # A shorter title for the navigation bar. Default is the same as html_title. 117 | # html_short_title = None 118 | 119 | # The name of an image file (relative to this directory) to place at the top 120 | # of the sidebar. 121 | html_logo = 'logo_principal.png' 122 | 123 | # The name of an image file (within the static path) to use as favicon of the 124 | # docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 125 | # pixels large. 126 | # html_favicon = None 127 | 128 | # Add any paths that contain custom static files (such as style sheets) here, 129 | # relative to this directory. They are copied after the builtin static files, 130 | # so a file named "default.css" will overwrite the builtin "default.css". 131 | html_static_path = ['_static'] 132 | 133 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 134 | # using the given strftime format. 135 | # html_last_updated_fmt = '%b %d, %Y' 136 | 137 | # If true, SmartyPants will be used to convert quotes and dashes to 138 | # typographically correct entities. 139 | # html_use_smartypants = True 140 | 141 | # Custom sidebar templates, maps document names to template names. 142 | # html_sidebars = {} 143 | 144 | # Additional templates that should be rendered to pages, maps page names to 145 | # template names. 146 | # html_additional_pages = {} 147 | 148 | # If false, no module index is generated. 149 | # html_domain_indices = True 150 | 151 | # If false, no index is generated. 152 | # html_use_index = True 153 | 154 | # If true, the index is split into individual pages for each letter. 155 | # html_split_index = False 156 | 157 | # If true, links to the reST sources are added to the pages. 158 | # html_show_sourcelink = True 159 | 160 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 161 | # html_show_sphinx = True 162 | 163 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 164 | # html_show_copyright = True 165 | 166 | # If true, an OpenSearch description file will be output, and all pages will 167 | # contain a tag referring to it. The value of this option must be the 168 | # base URL from which the finished HTML is served. 169 | # html_use_opensearch = '' 170 | 171 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 172 | # html_file_suffix = None 173 | 174 | # Output file base name for HTML help builder. 175 | htmlhelp_basename = 'document-feature-selectiondoc' 176 | 177 | 178 | # -- Options for LaTeX output -------------------------------------------------- 179 | 180 | # The paper size ('letter' or 'a4'). 181 | # latex_paper_size = 'letter' 182 | 183 | # The font size ('10pt', '11pt' or '12pt'). 184 | # latex_font_size = '10pt' 185 | 186 | # Grouping the document tree into LaTeX files. List of tuples 187 | # (source start file, target name, title, author, documentclass [howto/manual]). 188 | latex_documents = [ 189 | ('index', 'document-feature-selection.tex', 'document-feature-selection Documentation', 190 | 'kensuke-mi', 'manual'), 191 | ] 192 | 193 | # The name of an image file (relative to this directory) to place at the top of 194 | # the title page. 195 | # latex_logo = None 196 | 197 | # For "manual" documents, if this is true, then toplevel headings are parts, 198 | # not chapters. 199 | # latex_use_parts = False 200 | 201 | # If true, show page references after internal links. 202 | # latex_show_pagerefs = False 203 | 204 | # If true, show URL addresses after external links. 205 | # latex_show_urls = False 206 | 207 | # Additional stuff for the LaTeX preamble. 208 | # latex_preamble = '' 209 | 210 | # Documents to append as an appendix to all manuals. 211 | # latex_appendices = [] 212 | 213 | # If false, no module index is generated. 
214 | # latex_domain_indices = True 215 | 216 | 217 | # -- Options for manual page output -------------------------------------------- 218 | 219 | # One entry per manual page. List of tuples 220 | # (source start file, name, description, authors, manual section). 221 | man_pages = [ 222 | ('index', 'document-feature-selection', 'document-feature-selection Documentation', 223 | ['kensuke-mi'], 1) 224 | ] 225 | 226 | 227 | # -- Options for Epub output --------------------------------------------------- 228 | 229 | # Bibliographic Dublin Core info. 230 | epub_title = 'document-feature-selection' 231 | epub_author = 'kensuke-mi' 232 | epub_publisher = 'unknown' 233 | epub_copyright = '2015, kensuke-mi' 234 | 235 | # The language of the text. It defaults to the language option 236 | # or en if the language is not set. 237 | # epub_language = '' 238 | 239 | # The scheme of the identifier. Typical schemes are ISBN or URL. 240 | # epub_scheme = '' 241 | 242 | # The unique identifier of the text. This can be a ISBN number 243 | # or the project homepage. 244 | # epub_identifier = '' 245 | 246 | # A unique identification for the text. 247 | # epub_uid = '' 248 | 249 | # HTML files that should be inserted before the pages created by sphinx. 250 | # The format is a list of tuples containing the path and title. 251 | # epub_pre_files = [] 252 | 253 | # HTML files shat should be inserted after the pages created by sphinx. 254 | # The format is a list of tuples containing the path and title. 255 | # epub_post_files = [] 256 | 257 | # A list of files that should not be packed into the epub file. 258 | # epub_exclude_files = [] 259 | 260 | # The depth of the table of contents in toc.ncx. 261 | # epub_tocdepth = 3 262 | 263 | # Allow duplicate toc entries. 264 | # epub_tocdup = True 265 | # 266 | intersphinx_mapping = {} 267 | 268 | 269 | 270 | if __name__ == '__main__': 271 | import doctest 272 | doctest.testmod() 273 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/common/data_converter.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | from __future__ import division 6 | from DocumentFeatureSelection.common import utils, func_data_converter 7 | from DocumentFeatureSelection.models import DataCsrMatrix, AvailableInputTypes, PersistentDict 8 | from DocumentFeatureSelection.init_logger import logger 9 | from sqlitedict import SqliteDict 10 | import sys 11 | import numpy 12 | import tempfile 13 | import json 14 | from typing import Dict, List, Tuple, Any, Union 15 | python_version = sys.version_info 16 | 17 | __author__ = 'kensuke-mi' 18 | 19 | 20 | class DataConverter(object): 21 | """This class is for converting data type from dict-object into DataCsrMatrix-object which saves information of matrix. 22 | """ 23 | def __init__(self): 24 | # for keeping old version 25 | self.labeledMultiDocs2TermFreqMatrix = self.convert_multi_docs2term_frequency_matrix 26 | self.labeledMultiDocs2DocFreqMatrix = self.convert_multi_docs2document_frequency_matrix 27 | 28 | def count_term_frequency_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int]): 29 | """Count term-distribution per label. 
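For illustration, with a hypothetical input (not taken from this package's test data):
>>> # labeled_documents = {'label_a': [['a', 'b'], ['a']], 'label_b': [['c']]}, label2id = {'label_a': 0, 'label_b': 1}
>>> # -> numpy.array([3, 1]), because label_a holds 3 tokens in total and label_b holds 1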
30 | """ 31 | assert isinstance(labeled_documents, (SqliteDict, dict)) 32 | assert isinstance(label2id, dict) 33 | 34 | # count total term-frequency per label 35 | term_frequency_distribution = { 36 | label: len(list(utils.flatten(document_lists))) 37 | for label, document_lists 38 | in labeled_documents.items() 39 | } 40 | 41 | # make list of distribution 42 | term_frequency_distribution_list = [0] * len(labeled_documents) 43 | 44 | for label_string, n_doc in term_frequency_distribution.items(): 45 | #term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] 46 | term_index = label2id[label_string] 47 | term_frequency_distribution_list[term_index] = n_doc 48 | 49 | return numpy.array(term_frequency_distribution_list, dtype='i8') 50 | 51 | def count_document_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int])->numpy.ndarray: 52 | """This method count n(docs) per label. 53 | """ 54 | assert isinstance(labeled_documents, (SqliteDict, dict)) 55 | assert isinstance(label2id, dict) 56 | 57 | # count n(docs) per label 58 | n_doc_distribution = { 59 | label: len(document_lists) 60 | for label, document_lists 61 | in labeled_documents.items() 62 | } 63 | 64 | # make list of distribution 65 | n_doc_distribution_list = [0] * len(labeled_documents) 66 | 67 | for label_string, n_doc in n_doc_distribution.items(): 68 | #docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] 69 | docs_index = label2id[label_string] 70 | n_doc_distribution_list[docs_index] = n_doc 71 | 72 | return numpy.array(n_doc_distribution_list, dtype='i8') 73 | 74 | def __make_feature_object2json_string(self, seq_feature_in_doc:List[Union[str,List[str],Tuple[str,...]]])->List[str]: 75 | """Sub-method of make_feature_object2json_string()""" 76 | replaced_seq_feature_in_doc = [None] * len(seq_feature_in_doc) # type: List[str] 77 | for i, feature_object in enumerate(seq_feature_in_doc): 78 | if isinstance(feature_object, str): 79 | replaced_seq_feature_in_doc[i] = json.dumps(tuple([feature_object]), ensure_ascii=False) 80 | elif isinstance(feature_object, (tuple, list)): 81 | replaced_seq_feature_in_doc[i] = json.dumps(feature_object, ensure_ascii=False) 82 | else: 83 | raise Exception("feature type must be either of str,list,tuple. Detected={}".format(type(feature_object))) 84 | else: 85 | return replaced_seq_feature_in_doc 86 | 87 | def make_feature_object2json_string(self, labeled_document:AvailableInputTypes)->Dict[str,AvailableInputTypes]: 88 | """* What u can do 89 | - This function converts feature-object in sequence object into json string. 90 | - This function make every object into json string. 91 | - string object -> json array which has one string. Ex. "feature" -> '["feature"]' 92 | - list object -> json array. Ex. ["feature", "feature"] -> '["feature", "feature"]' 93 | - tuple object -> json array. Ex. ("feature", "feature") -> '["feature", "feature"]' 94 | * Parameters 95 | - labeled_document: dict object which has key of 'label-name', and value is 2-dim list of features. 
96 | 97 | """ 98 | assert isinstance(labeled_document, (dict,PersistentDict,SqliteDict)) 99 | replaced_labeled_document = {key: [] for key in labeled_document} 100 | for key, docs_in_label in labeled_document.items(): 101 | assert isinstance(docs_in_label, list) 102 | replaced_docs_in_label = [None] * len(docs_in_label) 103 | for i, doc_label in enumerate(docs_in_label): 104 | replaced_docs_in_label[i] = self.__make_feature_object2json_string(doc_label) 105 | else: 106 | replaced_labeled_document[key] = replaced_docs_in_label 107 | else: 108 | return replaced_labeled_document 109 | 110 | def convert_multi_docs2term_frequency_matrix(self, 111 | labeled_documents: AvailableInputTypes, 112 | is_use_cache: bool = False, 113 | is_use_memmap: bool = False, 114 | path_working_dir: str = tempfile.mkdtemp(), 115 | cache_backend: str = 'PersistentDict', 116 | n_jobs: int = 1): 117 | """* What you can do 118 | - This function makes TERM-frequency matrix for TF-IDF calculation. 119 | - TERM-frequency matrix is scipy.csr_matrix. 120 | 121 | * Params 122 | - labeled_documents: Dict object which has category-name as key, and list of features as value 123 | - is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge. 124 | - path_working_dir: path to directory for saving cache files 125 | """ 126 | labeled_documents = self.make_feature_object2json_string(labeled_documents) 127 | 128 | logger.debug(msg='Now pre-processing before CSR matrix') 129 | # convert data structure 130 | set_document_information = func_data_converter.make_multi_docs2term_freq_info(labeled_documents) 131 | 132 | # count n(docs) per label 133 | n_docs_distribution = self.count_document_distribution( 134 | labeled_documents=labeled_documents, 135 | label2id=set_document_information.label2id 136 | ) 137 | # count term-frequency per label 138 | term_frequency_distribution = self.count_term_frequency_distribution( 139 | labeled_documents=labeled_documents, 140 | label2id=set_document_information.label2id 141 | ) 142 | 143 | return DataCsrMatrix( 144 | csr_matrix_=set_document_information.matrix_object, 145 | label2id_dict=set_document_information.label2id, 146 | vocabulary=set_document_information.feature2id, 147 | n_docs_distribution=n_docs_distribution, 148 | n_term_freq_distribution=term_frequency_distribution, 149 | is_use_cache=is_use_cache, 150 | is_use_memmap=is_use_memmap, 151 | path_working_dir=path_working_dir, 152 | cache_backend=cache_backend 153 | ) 154 | 155 | def convert_multi_docs2document_frequency_matrix(self, 156 | labeled_documents:AvailableInputTypes, 157 | is_use_cache:bool=False, 158 | is_use_memmap:bool=False, 159 | path_working_dir:str=None, 160 | n_jobs:int=1)->DataCsrMatrix: 161 | """This function makes document-frequency matrix. Document-frequency matrix is scipy.csr_matrix. 162 | 163 | * Input object 164 | - "labeled_structure" is either of Dict object or shelve.DbfilenameShelf. The example format is below 165 | >>> {"label_a": [["I", "aa", "aa", "aa", "aa", "aa"],["bb", "aa", "aa", "aa", "aa", "aa"],["I", "aa", "hero", "some", "ok", "aa"]], 166 | >>> "label_b": [["bb", "bb", "bb"],["bb", "bb", "bb"],["hero", "ok", "bb"],["hero", "cc", "bb"],], 167 | >>> "label_c": [["cc", "cc", "cc"],["cc", "cc", "bb"],["xx", "xx", "cc"],["aa", "xx", "cc"],]} 168 | 169 | * Output 170 | - DataCsrMatrix object. 
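* Usage sketch (illustrative; input_dict is a labeled-documents dict as in the example above)
>>> # matrix_obj = DataConverter().labeledMultiDocs2DocFreqMatrix(labeled_documents=input_dict, n_jobs=1)
>>> # matrix_obj.csr_matrix_ # scipy csr_matrix, one row per label, one column per feature
>>> # matrix_obj.label2id_dict # e.g. {'label_a': 0, 'label_b': 1, 'label_c': 2}
>>> # matrix_obj.vocabulary # feature -> column index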
171 | """ 172 | labeled_documents = self.make_feature_object2json_string(labeled_documents) 173 | 174 | logger.debug(msg='Now pre-processing before CSR matrix') 175 | # convert data structure 176 | set_document_information = func_data_converter.make_multi_docs2doc_freq_info(labeled_documents,n_jobs=n_jobs) 177 | assert isinstance(set_document_information, func_data_converter.SetDocumentInformation) 178 | 179 | # count n(docs) per label 180 | n_docs_distribution = self.count_document_distribution( 181 | labeled_documents=labeled_documents, 182 | label2id=set_document_information.label2id 183 | ) 184 | # count term-frequency per label 185 | term_frequency_distribution = self.count_term_frequency_distribution( 186 | labeled_documents=labeled_documents, 187 | label2id=set_document_information.label2id 188 | ) 189 | return DataCsrMatrix( 190 | csr_matrix_=set_document_information.matrix_object, 191 | label2id_dict=set_document_information.label2id, 192 | vocabulary=set_document_information.feature2id, 193 | n_docs_distribution=n_docs_distribution, 194 | n_term_freq_distribution=term_frequency_distribution, 195 | is_use_cache=is_use_cache, 196 | is_use_memmap=is_use_memmap, 197 | path_working_dir=path_working_dir 198 | ) 199 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/models.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Union, Any, TypeVar 2 | from scipy.sparse.csr import csr_matrix 3 | from numpy import memmap 4 | from sqlitedict import SqliteDict 5 | from tempfile import mkdtemp 6 | from DocumentFeatureSelection.init_logger import logger 7 | from numpy import ndarray, int32, int64 8 | import pickle 9 | import json 10 | import csv 11 | import os 12 | import shutil 13 | 14 | 15 | # this class is from https://code.activestate.com/recipes/576642/ 16 | class PersistentDict(dict): 17 | ''' Persistent dictionary with an API compatible with shelve and anydbm. 18 | 19 | The dict is kept in memory, so the dictionary operations run as fast as 20 | a regular dictionary. 21 | 22 | Write to disk is delayed until close or sync (similar to gdbm's fast mode). 23 | 24 | Input file format is automatically discovered. 25 | Output file format is selectable between pickle, json, and csv. 26 | All three serialization formats are backed by fast C implementations. 
27 | 28 | ''' 29 | 30 | def __init__(self, filename, flag='c', mode=None, format='pickle', *args, **kwds): 31 | self.flag = flag # r=readonly, c=create, or n=new 32 | self.mode = mode # None or an octal triple like 0644 33 | self.format = format # 'csv', 'json', or 'pickle' 34 | self.filename = filename 35 | if flag != 'n' and os.access(filename, os.R_OK): 36 | fileobj = open(filename, 'rb' if format=='pickle' else 'r') 37 | with fileobj: 38 | self.load(fileobj) 39 | dict.__init__(self, *args, **kwds) 40 | 41 | def sync(self): 42 | 'Write dict to disk' 43 | if self.flag == 'r': 44 | return 45 | filename = self.filename 46 | tempname = filename + '.tmp' 47 | fileobj = open(tempname, 'wb' if self.format=='pickle' else 'w') 48 | try: 49 | self.dump(fileobj) 50 | except Exception: 51 | os.remove(tempname) 52 | raise 53 | finally: 54 | fileobj.close() 55 | shutil.move(tempname, self.filename) # atomic commit 56 | if self.mode is not None: 57 | os.chmod(self.filename, self.mode) 58 | 59 | def close(self): 60 | self.sync() 61 | 62 | def __enter__(self): 63 | return self 64 | 65 | def __exit__(self, *exc_info): 66 | self.close() 67 | 68 | def dump(self, fileobj): 69 | if self.format == 'csv': 70 | csv.writer(fileobj).writerows(self.items()) 71 | elif self.format == 'json': 72 | json.dump(self, fileobj, separators=(',', ':')) 73 | elif self.format == 'pickle': 74 | pickle.dump(dict(self), fileobj, 2) 75 | else: 76 | raise NotImplementedError('Unknown format: ' + repr(self.format)) 77 | 78 | def load(self, fileobj): 79 | # try formats from most restrictive to least restrictive 80 | for loader in (pickle.load, json.load, csv.reader): 81 | fileobj.seek(0) 82 | try: 83 | return self.update(loader(fileobj)) 84 | except Exception: 85 | pass 86 | raise ValueError('File not in a supported format') 87 | 88 | 89 | class SetDocumentInformation(object): 90 | __slots__ = ['matrix_object', 'label2id', 'feature2id'] 91 | 92 | def __init__(self, dict_matrix_index:Union[Dict[str,Any], SqliteDict, PersistentDict]): 93 | """ 94 | * Keys 95 | - matrix_object:Union[csr_matrix, ndarray] 96 | - label2id: Dict[str, str] 97 | - feature2id: Dict[str, str] 98 | """ 99 | if not "matrix_object" in dict_matrix_index: 100 | raise Exception("dict_matrix_index must have key='matrix_object'") 101 | if not "label2id" in dict_matrix_index: 102 | raise Exception("dict_matrix_index must have key='label2id'") 103 | if not "feature2id" in dict_matrix_index: 104 | raise Exception("dict_matrix_index must have key='feature2id'") 105 | 106 | self.matrix_object = dict_matrix_index['matrix_object'] 107 | self.label2id = dict_matrix_index['label2id'] 108 | self.feature2id = dict_matrix_index['feature2id'] 109 | 110 | if isinstance(dict_matrix_index, dict): 111 | pass 112 | elif isinstance(dict_matrix_index, PersistentDict): 113 | dict_matrix_index.sync() 114 | elif isinstance(dict_matrix_index, SqliteDict): 115 | dict_matrix_index.sync() 116 | else: 117 | raise Exception() 118 | 119 | 120 | class DataCsrMatrix(object): 121 | """* What you can do 122 | - You can keep information for keeping matrix object. 
123 | """ 124 | __slots__ = ['cache_backend', 'csr_matrix_', 125 | 'label2id_dict', 'vocabulary', 126 | 'n_docs_distribution', 'n_term_freq_distribution', 'path_working_dir'] 127 | 128 | def __init__(self, 129 | csr_matrix_: csr_matrix, 130 | label2id_dict: Dict[str, int], 131 | vocabulary: Dict[str, int], 132 | n_docs_distribution: ndarray, 133 | n_term_freq_distribution: ndarray, 134 | is_use_cache: bool=False, 135 | is_use_memmap: bool=False, 136 | cache_backend: str='PersistentDict', 137 | path_working_dir: str=None): 138 | """* Parameters 139 | ----------------- 140 | - csr_matrix_: Matrix object which saves term frequency or document frequency 141 | - label2id_dict: Dict object whose key is label-name, value is row-index of the given matrix. 142 | >>> {'label_b': 0, 'label_c': 1, 'label_a': 2} 143 | - vocabulary: Dict object whose key is feature-name, value is column-index of the given matrix. 144 | >>> {'label_b': 0, 'label_c': 1, 'label_a': 2} 145 | - n_docs_distribution: Sequence object(list,ndarray). It saves a distribution of N(docs) in each label. 146 | - n_term_freq_distribution: Sequence object(list,ndarray). It saves a distribution of N(all terms) in each label. 147 | - is_use_cache: boolean. It True; the matrix object is saved on the disk. It saves memory of your machine. 148 | - is_use_memmap: boolean. It True; the matrix object is saved on the disk. It saves memory of your machine. 149 | - cache_backend: str. {PersistentDict, SqliteDict}, backend to save this object on the disk. 150 | - path_working_dir: str. Path to save temporary cache objects. 151 | """ 152 | 153 | self.n_docs_distribution = n_docs_distribution 154 | self.n_term_freq_distribution = n_term_freq_distribution 155 | self.cache_backend = cache_backend 156 | 157 | if (is_use_memmap or is_use_cache) and path_working_dir is None: 158 | self.path_working_dir = mkdtemp() 159 | logger.info("Temporary files are at {}".format(self.path_working_dir)) 160 | else: 161 | self.path_working_dir = path_working_dir 162 | 163 | if is_use_cache: 164 | """You use disk-drive for keeping object. 
165 | """ 166 | path_vocabulary_cache_obj = os.path.join(self.path_working_dir, 'vocabulary.cache') 167 | path_label_2_dict_cache_obj = os.path.join(self.path_working_dir, 'label_2_dict.cache') 168 | self.vocabulary = self.initialize_cache_dict_object(path_vocabulary_cache_obj) 169 | self.vocabulary = vocabulary 170 | 171 | self.label2id_dict = self.initialize_cache_dict_object(path_label_2_dict_cache_obj) 172 | logger.info("Now saving into local file...") 173 | for k, v in label2id_dict.items(): 174 | self.label2id_dict[k] = v 175 | if isinstance(self.label2id_dict, PersistentDict): 176 | self.label2id_dict.sync() 177 | 178 | else: 179 | """Keep everything on memory 180 | """ 181 | self.label2id_dict = label2id_dict 182 | self.vocabulary = vocabulary 183 | 184 | if is_use_memmap: 185 | """You use disk-drive for keeping object 186 | """ 187 | path_memmap_obj = os.path.join(self.path_working_dir, 'matrix.memmap') 188 | self.csr_matrix_ = self.initialize_memmap_object(csr_matrix_, path_memmap_object=path_memmap_obj) 189 | else: 190 | self.csr_matrix_ = csr_matrix_ 191 | 192 | def initialize_cache_dict_object(self, path_cache_file): 193 | if self.cache_backend == 'PersistentDict': 194 | return PersistentDict(path_cache_file, flag='c', format='json') 195 | elif self.cache_backend == 'SqliteDict': 196 | return SqliteDict(path_cache_file, autocommit=True) 197 | else: 198 | raise Exception('No such cache_backend option named {}'.format(self.cache_backend)) 199 | 200 | def initialize_memmap_object(self, matrix_object: csr_matrix, path_memmap_object: str)->memmap: 201 | fp = memmap(path_memmap_object, dtype='float64', mode='w+', shape=matrix_object.shape) 202 | fp[:] = matrix_object.todense()[:] 203 | return fp 204 | 205 | def __str__(self): 206 | return """matrix-type={}, matrix-size={}, path_working_dir={}""".format(type(self.csr_matrix_), 207 | self.csr_matrix_.shape, 208 | self.path_working_dir) 209 | 210 | 211 | class ROW_COL_VAL(object): 212 | """Data class to keep value of one item in CSR-matrix""" 213 | __slots__ = ('row', 'col', 'val') 214 | def __init__(self, row: int, col:int, val:int): 215 | self.row = row 216 | self.col = col 217 | self.val = val 218 | 219 | 220 | class ScoredResultObject(object): 221 | """""" 222 | 223 | def __init__(self, 224 | scored_matrix:csr_matrix, 225 | label2id_dict:Union[Dict[str,Any], ndarray], 226 | feature2id_dict=Union[Dict[str,Any], ndarray], 227 | method:str=None, 228 | matrix_form:str=None, 229 | frequency_matrix:csr_matrix=None): 230 | """*Parameters 231 | ------------ 232 | - scored_matrix: Matrix object which saves result of feature-extraction 233 | - label2id_dict: Dict object whose key is label-name, value is row-index of the matrix. 234 | - feature2id_dict: Dict object whose key is feature-name, value is column-index of the matrix. 235 | - method: a name of feature-extraction method. 236 | - matrix_form: a type of the given matrix for feature-extraction computation. {term_freq, doc_freq} 237 | - frequency_matrix: Matrix object(term-frequency or document-frequency). The matrix is data-source of feature-extraction computation. 
238 | """ 239 | self.scored_matrix = scored_matrix 240 | self.label2id_dict = label2id_dict 241 | self.feature2id_dict = feature2id_dict 242 | self.method = method 243 | self.matrix_form = matrix_form 244 | self.frequency_matrix = frequency_matrix 245 | # For keeping old version 246 | self.ScoreMatrix2ScoreDictionary = self.convert_score_matrix2score_record 247 | 248 | def __conv_into_dict_format(self, word_score_items): 249 | out_format_structure = {} 250 | for item in word_score_items: 251 | if item['label'] not in out_format_structure : 252 | out_format_structure[item['label']] = [{'feature': item['word'], 'score': item['score']}] 253 | else: 254 | out_format_structure[item['label']].append({'feature': item['word'], 'score': item['score']}) 255 | return out_format_structure 256 | 257 | def convert_score_matrix2score_record(self, 258 | outformat:str='items', 259 | sort_desc:bool=True): 260 | """* What you can do 261 | - Get dictionary structure from weighted-featured scores. 262 | - You can choose 'dict' or 'items' for ```outformat``` parameter. 263 | 264 | * Output 265 | --------------------- 266 | - If outformat='dict', you get 267 | >>> {label_name:{feature: score}} 268 | Else if outformat='items', you get 269 | >>> [{feature: score}] 270 | 271 | """ 272 | scored_objects = self.get_feature_dictionary( 273 | weighted_matrix=self.scored_matrix, 274 | vocabulary=self.feature2id_dict, 275 | label_group_dict=self.label2id_dict, 276 | frequency_matrix=self.frequency_matrix 277 | ) 278 | 279 | if sort_desc: scored_objects = \ 280 | sorted(scored_objects, key=lambda x: x['score'], reverse=True) 281 | 282 | if outformat=='dict': 283 | out_format_structure = self.__conv_into_dict_format(scored_objects) 284 | elif outformat=='items': 285 | out_format_structure = scored_objects 286 | else: 287 | raise ValueError('outformat must be either of {dict, items}') 288 | 289 | return out_format_structure 290 | 291 | def __get_value_index(self, row_index, column_index, weight_csr_matrix, verbose=False): 292 | assert isinstance(row_index, (int, int32, int64)) 293 | assert isinstance(column_index, (int, int32, int64)) 294 | assert isinstance(weight_csr_matrix, (ndarray,csr_matrix)) 295 | 296 | value = weight_csr_matrix[row_index, column_index] 297 | 298 | return value 299 | 300 | def make_non_zero_information(self, weight_csr_matrix: csr_matrix)->List[ROW_COL_VAL]: 301 | """Construct Tuple of matrix value. Return value is array of ROW_COL_VAL namedtuple. 302 | 303 | :param weight_csr_matrix: 304 | :return: 305 | """ 306 | assert isinstance(weight_csr_matrix, (csr_matrix, ndarray)) 307 | 308 | row_col_index_array = weight_csr_matrix.nonzero() 309 | row_indexes = row_col_index_array[0] 310 | column_indexes = row_col_index_array[1] 311 | assert len(row_indexes) == len(column_indexes) 312 | 313 | value_index_items = [None] * len(row_indexes) # type: List[ROW_COL_VAL] 314 | for i in range(0, len(row_indexes)): 315 | value_index_items[i] = ROW_COL_VAL(row_indexes[i], 316 | column_indexes[i], 317 | self.__get_value_index(row_indexes[i], column_indexes[i], weight_csr_matrix)) 318 | return value_index_items 319 | 320 | def SUB_FUNC_feature_extraction(self, 321 | weight_row_col_val_obj: ROW_COL_VAL, 322 | dict_index_information: Dict[str, Dict[str, str]], 323 | dict_position2value: Dict[Tuple[int, int], float]=None)->Dict[str, Any]: 324 | """This function returns weighted score between label and words. 
325 | 326 | Input csr matrix must be 'document-frequency' matrix, where records #document that word appears in document set. 327 | [NOTE] This is not TERM-FREQUENCY. 328 | 329 | For example, 330 | If 'iPhone' appears in 5 documents of 'IT' category document set, value must be 5. 331 | Even if 10 'iPhone' words in 'IT' category document set, value is still 5. 332 | """ 333 | assert isinstance(weight_row_col_val_obj, ROW_COL_VAL) 334 | feature_score_record = { 335 | 'score': weight_row_col_val_obj.val, 336 | 'label': self.get_label(weight_row_col_val_obj, dict_index_information['id2label']), 337 | 'feature': self.get_word(weight_row_col_val_obj, dict_index_information['id2vocab']) 338 | } 339 | if not dict_position2value is None: 340 | if (weight_row_col_val_obj.col,weight_row_col_val_obj.row) in dict_position2value: 341 | frequency = dict_position2value[tuple([weight_row_col_val_obj.col,weight_row_col_val_obj.row])] 342 | else: 343 | """When a feature-extraction method is BNS, frequency=0 is possible.""" 344 | frequency = 0 345 | 346 | feature_score_record.update({"frequency": frequency}) 347 | 348 | return feature_score_record 349 | 350 | def get_feature_dictionary(self, 351 | weighted_matrix: csr_matrix, 352 | vocabulary:Dict[str, int], 353 | label_group_dict:Dict[str, int], 354 | cache_backend: str = 'PersistentDict', 355 | is_use_cache: bool=True, 356 | frequency_matrix: csr_matrix=None)->List[Dict[str, Any]]: 357 | """* What you can do 358 | - Get dictionary structure from weighted-featured scores. 359 | """ 360 | assert isinstance(weighted_matrix, csr_matrix) 361 | assert isinstance(vocabulary, dict) 362 | assert isinstance(label_group_dict, dict) 363 | 364 | logger.debug(msg='Start making scored dictionary object from scored matrix') 365 | logger.debug(msg='Input matrix size= {} * {}'.format(weighted_matrix.shape[0], weighted_matrix.shape[1])) 366 | 367 | weight_value_index_items = self.make_non_zero_information(weighted_matrix) 368 | if not frequency_matrix is None: 369 | frequency_value_index_items = self.make_non_zero_information(frequency_matrix) 370 | dict_position2value = {(t_col_row.col,t_col_row.row): t_col_row.val for t_col_row in frequency_value_index_items} 371 | else: 372 | dict_position2value = None 373 | 374 | if is_use_cache: 375 | dict_index_information = self.initialize_cache_dict_object(cache_backend, file_name='dict_index_information') 376 | else: 377 | dict_index_information = {} 378 | 379 | dict_index_information['id2label'] = {value:key for key, value in label_group_dict.items()} 380 | dict_index_information['id2vocab'] = {value:key for key, value in vocabulary.items()} 381 | if isinstance(dict_index_information, SqliteDict): 382 | dict_index_information.commit() 383 | elif isinstance(dict_index_information, PersistentDict): 384 | dict_index_information.sync() 385 | else: 386 | pass 387 | 388 | # TODO may be this func takes too much time. consider cython. 
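# Build one score record per non-zero cell of the weighted matrix; each record carries the label name, the feature and its score (plus the raw frequency when a frequency matrix is supplied).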
389 | seq_score_objects = [None] * len(weight_value_index_items) # type: List[Dict[str,Any]] 390 | for i, weight_row_col_val_tuple in enumerate(weight_value_index_items): 391 | seq_score_objects[i] = self.SUB_FUNC_feature_extraction( 392 | weight_row_col_val_tuple, 393 | dict_index_information, 394 | dict_position2value) 395 | 396 | logger.debug(msg='Finished making scored dictionary') 397 | 398 | return seq_score_objects 399 | 400 | def get_label(self, row_col_val_tuple, label_id)->str: 401 | assert isinstance(row_col_val_tuple, ROW_COL_VAL) 402 | assert isinstance(label_id, dict) 403 | 404 | label = label_id[row_col_val_tuple.row] 405 | 406 | return label 407 | 408 | def get_word(self, row_col_val_tuple:ROW_COL_VAL, vocabulary:Dict[int,str])->Union[str,List[str],Tuple[str,...]]: 409 | """* what u can do 410 | - It gets feature name from the given matrix object. 411 | - A feature is json serialized, thus this method tries to de-serialize json string into python object. 412 | - Original feature object is possibly string(word), list of str, list of str. 413 | """ 414 | assert isinstance(row_col_val_tuple, ROW_COL_VAL) 415 | assert isinstance(vocabulary, dict) 416 | vocab = vocabulary[row_col_val_tuple.col] 417 | try: 418 | feature_object = json.loads(vocab) 419 | if len(feature_object)==1: 420 | # When feature is word, the length is 1 # 421 | feature_object = feature_object[0] 422 | except: 423 | feature_object = vocab 424 | 425 | 426 | return feature_object 427 | 428 | def initialize_cache_dict_object(self, cache_backend:str, file_name:str, path_cache_file=mkdtemp()): 429 | if cache_backend == 'PersistentDict': 430 | return PersistentDict(os.path.join(path_cache_file, file_name), flag='c', format='json') 431 | elif cache_backend == 'SqliteDict': 432 | return SqliteDict(os.path.join(path_cache_file, file_name), autocommit=True) 433 | else: 434 | raise Exception('No such cache_backend option named {}'.format(cache_backend)) 435 | 436 | 437 | FeatureType = TypeVar('T', str, Tuple[Any]) 438 | AvailableInputTypes = TypeVar('T', PersistentDict, 439 | SqliteDict, 440 | Dict[str,List[List[Union[str,Tuple[Any]]]]]) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | CeCILL-B FREE SOFTWARE LICENSE AGREEMENT 3 | 4 | 5 | Notice 6 | 7 | This Agreement is a Free Software license agreement that is the result 8 | of discussions between its authors in order to ensure compliance with 9 | the two main principles guiding its drafting: 10 | 11 | * firstly, compliance with the principles governing the distribution 12 | of Free Software: access to source code, broad rights granted to 13 | users, 14 | * secondly, the election of a governing law, French law, with which 15 | it is conformant, both as regards the law of torts and 16 | intellectual property law, and the protection that it offers to 17 | both authors and holders of the economic rights over software. 18 | 19 | The authors of the CeCILL-B (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) 20 | license are: 21 | 22 | Commissariat à l'Energie Atomique - CEA, a public scientific, technical 23 | and industrial research establishment, having its principal place of 24 | business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France. 
25 | 26 | Centre National de la Recherche Scientifique - CNRS, a public scientific 27 | and technological establishment, having its principal place of business 28 | at 3 rue Michel-Ange, 75794 Paris cedex 16, France. 29 | 30 | Institut National de Recherche en Informatique et en Automatique - 31 | INRIA, a public scientific and technological establishment, having its 32 | principal place of business at Domaine de Voluceau, Rocquencourt, BP 33 | 105, 78153 Le Chesnay cedex, France. 34 | 35 | 36 | Preamble 37 | 38 | This Agreement is an open source software license intended to give users 39 | significant freedom to modify and redistribute the software licensed 40 | hereunder. 41 | 42 | The exercising of this freedom is conditional upon a strong obligation 43 | of giving credits for everybody that distributes a software 44 | incorporating a software ruled by the current license so as all 45 | contributions to be properly identified and acknowledged. 46 | 47 | In consideration of access to the source code and the rights to copy, 48 | modify and redistribute granted by the license, users are provided only 49 | with a limited warranty and the software's author, the holder of the 50 | economic rights, and the successive licensors only have limited liability. 51 | 52 | In this respect, the risks associated with loading, using, modifying 53 | and/or developing or reproducing the software by the user are brought to 54 | the user's attention, given its Free Software status, which may make it 55 | complicated to use, with the result that its use is reserved for 56 | developers and experienced professionals having in-depth computer 57 | knowledge. Users are therefore encouraged to load and test the 58 | suitability of the software as regards their requirements in conditions 59 | enabling the security of their systems and/or data to be ensured and, 60 | more generally, to use and operate it in the same conditions of 61 | security. This Agreement may be freely reproduced and published, 62 | provided it is not altered, and that no provisions are either added or 63 | removed herefrom. 64 | 65 | This Agreement may apply to any or all software for which the holder of 66 | the economic rights decides to submit the use thereof to its provisions. 67 | 68 | 69 | Article 1 - DEFINITIONS 70 | 71 | For the purpose of this Agreement, when the following expressions 72 | commence with a capital letter, they shall have the following meaning: 73 | 74 | Agreement: means this license agreement, and its possible subsequent 75 | versions and annexes. 76 | 77 | Software: means the software in its Object Code and/or Source Code form 78 | and, where applicable, its documentation, "as is" when the Licensee 79 | accepts the Agreement. 80 | 81 | Initial Software: means the Software in its Source Code and possibly its 82 | Object Code form and, where applicable, its documentation, "as is" when 83 | it is first distributed under the terms and conditions of the Agreement. 84 | 85 | Modified Software: means the Software modified by at least one 86 | Contribution. 87 | 88 | Source Code: means all the Software's instructions and program lines to 89 | which access is required so as to modify the Software. 90 | 91 | Object Code: means the binary files originating from the compilation of 92 | the Source Code. 93 | 94 | Holder: means the holder(s) of the economic rights over the Initial 95 | Software. 96 | 97 | Licensee: means the Software user(s) having accepted the Agreement. 
98 | 99 | Contributor: means a Licensee having made at least one Contribution. 100 | 101 | Licensor: means the Holder, or any other individual or legal entity, who 102 | distributes the Software under the Agreement. 103 | 104 | Contribution: means any or all modifications, corrections, translations, 105 | adaptations and/or new functions integrated into the Software by any or 106 | all Contributors, as well as any or all Internal Modules. 107 | 108 | Module: means a set of sources files including their documentation that 109 | enables supplementary functions or services in addition to those offered 110 | by the Software. 111 | 112 | External Module: means any or all Modules, not derived from the 113 | Software, so that this Module and the Software run in separate address 114 | spaces, with one calling the other when they are run. 115 | 116 | Internal Module: means any or all Module, connected to the Software so 117 | that they both execute in the same address space. 118 | 119 | Parties: mean both the Licensee and the Licensor. 120 | 121 | These expressions may be used both in singular and plural form. 122 | 123 | 124 | Article 2 - PURPOSE 125 | 126 | The purpose of the Agreement is the grant by the Licensor to the 127 | Licensee of a non-exclusive, transferable and worldwide license for the 128 | Software as set forth in Article 5 hereinafter for the whole term of the 129 | protection granted by the rights over said Software. 130 | 131 | 132 | Article 3 - ACCEPTANCE 133 | 134 | 3.1 The Licensee shall be deemed as having accepted the terms and 135 | conditions of this Agreement upon the occurrence of the first of the 136 | following events: 137 | 138 | * (i) loading the Software by any or all means, notably, by 139 | downloading from a remote server, or by loading from a physical 140 | medium; 141 | * (ii) the first time the Licensee exercises any of the rights 142 | granted hereunder. 143 | 144 | 3.2 One copy of the Agreement, containing a notice relating to the 145 | characteristics of the Software, to the limited warranty, and to the 146 | fact that its use is restricted to experienced users has been provided 147 | to the Licensee prior to its acceptance as set forth in Article 3.1 148 | hereinabove, and the Licensee hereby acknowledges that it has read and 149 | understood it. 150 | 151 | 152 | Article 4 - EFFECTIVE DATE AND TERM 153 | 154 | 155 | 4.1 EFFECTIVE DATE 156 | 157 | The Agreement shall become effective on the date when it is accepted by 158 | the Licensee as set forth in Article 3.1. 159 | 160 | 161 | 4.2 TERM 162 | 163 | The Agreement shall remain in force for the entire legal term of 164 | protection of the economic rights over the Software. 165 | 166 | 167 | Article 5 - SCOPE OF RIGHTS GRANTED 168 | 169 | The Licensor hereby grants to the Licensee, who accepts, the following 170 | rights over the Software for any or all use, and for the term of the 171 | Agreement, on the basis of the terms and conditions set forth hereinafter. 172 | 173 | Besides, if the Licensor owns or comes to own one or more patents 174 | protecting all or part of the functions of the Software or of its 175 | components, the Licensor undertakes not to enforce the rights granted by 176 | these patents against successive Licensees using, exploiting or 177 | modifying the Software. If these patents are transferred, the Licensor 178 | undertakes to have the transferees subscribe to the obligations set 179 | forth in this paragraph. 
180 | 181 | 182 | 5.1 RIGHT OF USE 183 | 184 | The Licensee is authorized to use the Software, without any limitation 185 | as to its fields of application, with it being hereinafter specified 186 | that this comprises: 187 | 188 | 1. permanent or temporary reproduction of all or part of the Software 189 | by any or all means and in any or all form. 190 | 191 | 2. loading, displaying, running, or storing the Software on any or 192 | all medium. 193 | 194 | 3. entitlement to observe, study or test its operation so as to 195 | determine the ideas and principles behind any or all constituent 196 | elements of said Software. This shall apply when the Licensee 197 | carries out any or all loading, displaying, running, transmission 198 | or storage operation as regards the Software, that it is entitled 199 | to carry out hereunder. 200 | 201 | 202 | 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS 203 | 204 | The right to make Contributions includes the right to translate, adapt, 205 | arrange, or make any or all modifications to the Software, and the right 206 | to reproduce the resulting software. 207 | 208 | The Licensee is authorized to make any or all Contributions to the 209 | Software provided that it includes an explicit notice that it is the 210 | author of said Contribution and indicates the date of the creation thereof. 211 | 212 | 213 | 5.3 RIGHT OF DISTRIBUTION 214 | 215 | In particular, the right of distribution includes the right to publish, 216 | transmit and communicate the Software to the general public on any or 217 | all medium, and by any or all means, and the right to market, either in 218 | consideration of a fee, or free of charge, one or more copies of the 219 | Software by any means. 220 | 221 | The Licensee is further authorized to distribute copies of the modified 222 | or unmodified Software to third parties according to the terms and 223 | conditions set forth hereinafter. 224 | 225 | 226 | 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION 227 | 228 | The Licensee is authorized to distribute true copies of the Software in 229 | Source Code or Object Code form, provided that said distribution 230 | complies with all the provisions of the Agreement and is accompanied by: 231 | 232 | 1. a copy of the Agreement, 233 | 234 | 2. a notice relating to the limitation of both the Licensor's 235 | warranty and liability as set forth in Articles 8 and 9, 236 | 237 | and that, in the event that only the Object Code of the Software is 238 | redistributed, the Licensee allows effective access to the full Source 239 | Code of the Software at a minimum during the entire period of its 240 | distribution of the Software, it being understood that the additional 241 | cost of acquiring the Source Code shall not exceed the cost of 242 | transferring the data. 243 | 244 | 245 | 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE 246 | 247 | If the Licensee makes any Contribution to the Software, the resulting 248 | Modified Software may be distributed under a license agreement other 249 | than this Agreement subject to compliance with the provisions of Article 250 | 5.3.4. 251 | 252 | 253 | 5.3.3 DISTRIBUTION OF EXTERNAL MODULES 254 | 255 | When the Licensee has developed an External Module, the terms and 256 | conditions of this Agreement do not apply to said External Module, that 257 | may be distributed under a separate license agreement. 258 | 259 | 260 | 5.3.4 CREDITS 261 | 262 | Any Licensee who may distribute a Modified Software hereby expressly 263 | agrees to: 264 | 265 | 1. 
indicate in the related documentation that it is based on the 266 | Software licensed hereunder, and reproduce the intellectual 267 | property notice for the Software, 268 | 269 | 2. ensure that written indications of the Software intended use, 270 | intellectual property notice and license hereunder are included in 271 | easily accessible format from the Modified Software interface, 272 | 273 | 3. mention, on a freely accessible website describing the Modified 274 | Software, at least throughout the distribution term thereof, that 275 | it is based on the Software licensed hereunder, and reproduce the 276 | Software intellectual property notice, 277 | 278 | 4. where it is distributed to a third party that may distribute a 279 | Modified Software without having to make its source code 280 | available, make its best efforts to ensure that said third party 281 | agrees to comply with the obligations set forth in this Article . 282 | 283 | If the Software, whether or not modified, is distributed with an 284 | External Module designed for use in connection with the Software, the 285 | Licensee shall submit said External Module to the foregoing obligations. 286 | 287 | 288 | 5.3.5 COMPATIBILITY WITH THE CeCILL AND CeCILL-C LICENSES 289 | 290 | Where a Modified Software contains a Contribution subject to the CeCILL 291 | license, the provisions set forth in Article 5.3.4 shall be optional. 292 | 293 | A Modified Software may be distributed under the CeCILL-C license. In 294 | such a case the provisions set forth in Article 5.3.4 shall be optional. 295 | 296 | 297 | Article 6 - INTELLECTUAL PROPERTY 298 | 299 | 300 | 6.1 OVER THE INITIAL SOFTWARE 301 | 302 | The Holder owns the economic rights over the Initial Software. Any or 303 | all use of the Initial Software is subject to compliance with the terms 304 | and conditions under which the Holder has elected to distribute its work 305 | and no one shall be entitled to modify the terms and conditions for the 306 | distribution of said Initial Software. 307 | 308 | The Holder undertakes that the Initial Software will remain ruled at 309 | least by this Agreement, for the duration set forth in Article 4.2. 310 | 311 | 312 | 6.2 OVER THE CONTRIBUTIONS 313 | 314 | The Licensee who develops a Contribution is the owner of the 315 | intellectual property rights over this Contribution as defined by 316 | applicable law. 317 | 318 | 319 | 6.3 OVER THE EXTERNAL MODULES 320 | 321 | The Licensee who develops an External Module is the owner of the 322 | intellectual property rights over this External Module as defined by 323 | applicable law and is free to choose the type of agreement that shall 324 | govern its distribution. 325 | 326 | 327 | 6.4 JOINT PROVISIONS 328 | 329 | The Licensee expressly undertakes: 330 | 331 | 1. not to remove, or modify, in any manner, the intellectual property 332 | notices attached to the Software; 333 | 334 | 2. to reproduce said notices, in an identical manner, in the copies 335 | of the Software modified or not. 336 | 337 | The Licensee undertakes not to directly or indirectly infringe the 338 | intellectual property rights of the Holder and/or Contributors on the 339 | Software and to take, where applicable, vis-à-vis its staff, any and all 340 | measures required to ensure respect of said intellectual property rights 341 | of the Holder and/or Contributors. 
342 | 343 | 344 | Article 7 - RELATED SERVICES 345 | 346 | 7.1 Under no circumstances shall the Agreement oblige the Licensor to 347 | provide technical assistance or maintenance services for the Software. 348 | 349 | However, the Licensor is entitled to offer this type of services. The 350 | terms and conditions of such technical assistance, and/or such 351 | maintenance, shall be set forth in a separate instrument. Only the 352 | Licensor offering said maintenance and/or technical assistance services 353 | shall incur liability therefor. 354 | 355 | 7.2 Similarly, any Licensor is entitled to offer to its licensees, under 356 | its sole responsibility, a warranty, that shall only be binding upon 357 | itself, for the redistribution of the Software and/or the Modified 358 | Software, under terms and conditions that it is free to decide. Said 359 | warranty, and the financial terms and conditions of its application, 360 | shall be subject of a separate instrument executed between the Licensor 361 | and the Licensee. 362 | 363 | 364 | Article 8 - LIABILITY 365 | 366 | 8.1 Subject to the provisions of Article 8.2, the Licensee shall be 367 | entitled to claim compensation for any direct loss it may have suffered 368 | from the Software as a result of a fault on the part of the relevant 369 | Licensor, subject to providing evidence thereof. 370 | 371 | 8.2 The Licensor's liability is limited to the commitments made under 372 | this Agreement and shall not be incurred as a result of in particular: 373 | (i) loss due the Licensee's total or partial failure to fulfill its 374 | obligations, (ii) direct or consequential loss that is suffered by the 375 | Licensee due to the use or performance of the Software, and (iii) more 376 | generally, any consequential loss. In particular the Parties expressly 377 | agree that any or all pecuniary or business loss (i.e. loss of data, 378 | loss of profits, operating loss, loss of customers or orders, 379 | opportunity cost, any disturbance to business activities) or any or all 380 | legal proceedings instituted against the Licensee by a third party, 381 | shall constitute consequential loss and shall not provide entitlement to 382 | any or all compensation from the Licensor. 383 | 384 | 385 | Article 9 - WARRANTY 386 | 387 | 9.1 The Licensee acknowledges that the scientific and technical 388 | state-of-the-art when the Software was distributed did not enable all 389 | possible uses to be tested and verified, nor for the presence of 390 | possible defects to be detected. In this respect, the Licensee's 391 | attention has been drawn to the risks associated with loading, using, 392 | modifying and/or developing and reproducing the Software which are 393 | reserved for experienced users. 394 | 395 | The Licensee shall be responsible for verifying, by any or all means, 396 | the suitability of the product for its requirements, its good working 397 | order, and for ensuring that it shall not cause damage to either persons 398 | or properties. 399 | 400 | 9.2 The Licensor hereby represents, in good faith, that it is entitled 401 | to grant all the rights over the Software (including in particular the 402 | rights set forth in Article 5). 403 | 404 | 9.3 The Licensee acknowledges that the Software is supplied "as is" by 405 | the Licensor without any other express or tacit warranty, other than 406 | that provided for in Article 9.2 and, in particular, without any warranty 407 | as to its commercial value, its secured, safe, innovative or relevant 408 | nature. 
409 | 410 | Specifically, the Licensor does not warrant that the Software is free 411 | from any error, that it will operate without interruption, that it will 412 | be compatible with the Licensee's own equipment and software 413 | configuration, nor that it will meet the Licensee's requirements. 414 | 415 | 9.4 The Licensor does not either expressly or tacitly warrant that the 416 | Software does not infringe any third party intellectual property right 417 | relating to a patent, software or any other property right. Therefore, 418 | the Licensor disclaims any and all liability towards the Licensee 419 | arising out of any or all proceedings for infringement that may be 420 | instituted in respect of the use, modification and redistribution of the 421 | Software. Nevertheless, should such proceedings be instituted against 422 | the Licensee, the Licensor shall provide it with technical and legal 423 | assistance for its defense. Such technical and legal assistance shall be 424 | decided on a case-by-case basis between the relevant Licensor and the 425 | Licensee pursuant to a memorandum of understanding. The Licensor 426 | disclaims any and all liability as regards the Licensee's use of the 427 | name of the Software. No warranty is given as regards the existence of 428 | prior rights over the name of the Software or as regards the existence 429 | of a trademark. 430 | 431 | 432 | Article 10 - TERMINATION 433 | 434 | 10.1 In the event of a breach by the Licensee of its obligations 435 | hereunder, the Licensor may automatically terminate this Agreement 436 | thirty (30) days after notice has been sent to the Licensee and has 437 | remained ineffective. 438 | 439 | 10.2 A Licensee whose Agreement is terminated shall no longer be 440 | authorized to use, modify or distribute the Software. However, any 441 | licenses that it may have granted prior to termination of the Agreement 442 | shall remain valid subject to their having been granted in compliance 443 | with the terms and conditions hereof. 444 | 445 | 446 | Article 11 - MISCELLANEOUS 447 | 448 | 449 | 11.1 EXCUSABLE EVENTS 450 | 451 | Neither Party shall be liable for any or all delay, or failure to 452 | perform the Agreement, that may be attributable to an event of force 453 | majeure, an act of God or an outside cause, such as defective 454 | functioning or interruptions of the electricity or telecommunications 455 | networks, network paralysis following a virus attack, intervention by 456 | government authorities, natural disasters, water damage, earthquakes, 457 | fire, explosions, strikes and labor unrest, war, etc. 458 | 459 | 11.2 Any failure by either Party, on one or more occasions, to invoke 460 | one or more of the provisions hereof, shall under no circumstances be 461 | interpreted as being a waiver by the interested Party of its right to 462 | invoke said provision(s) subsequently. 463 | 464 | 11.3 The Agreement cancels and replaces any or all previous agreements, 465 | whether written or oral, between the Parties and having the same 466 | purpose, and constitutes the entirety of the agreement between said 467 | Parties concerning said purpose. No supplement or modification to the 468 | terms and conditions hereof shall be effective as between the Parties 469 | unless it is made in writing and signed by their duly authorized 470 | representatives. 
471 | 472 | 11.4 In the event that one or more of the provisions hereof were to 473 | conflict with a current or future applicable act or legislative text, 474 | said act or legislative text shall prevail, and the Parties shall make 475 | the necessary amendments so as to comply with said act or legislative 476 | text. All other provisions shall remain effective. Similarly, invalidity 477 | of a provision of the Agreement, for any reason whatsoever, shall not 478 | cause the Agreement as a whole to be invalid. 479 | 480 | 481 | 11.5 LANGUAGE 482 | 483 | The Agreement is drafted in both French and English and both versions 484 | are deemed authentic. 485 | 486 | 487 | Article 12 - NEW VERSIONS OF THE AGREEMENT 488 | 489 | 12.1 Any person is authorized to duplicate and distribute copies of this 490 | Agreement. 491 | 492 | 12.2 So as to ensure coherence, the wording of this Agreement is 493 | protected and may only be modified by the authors of the License, who 494 | reserve the right to periodically publish updates or new versions of the 495 | Agreement, each with a separate number. These subsequent versions may 496 | address new issues encountered by Free Software. 497 | 498 | 12.3 Any Software distributed under a given version of the Agreement may 499 | only be subsequently distributed under the same version of the Agreement 500 | or a subsequent version. 501 | 502 | 503 | Article 13 - GOVERNING LAW AND JURISDICTION 504 | 505 | 13.1 The Agreement is governed by French law. The Parties agree to 506 | endeavor to seek an amicable solution to any disagreements or disputes 507 | that may arise during the performance of the Agreement. 508 | 509 | 13.2 Failing an amicable solution within two (2) months as from their 510 | occurrence, and unless emergency proceedings are necessary, the 511 | disagreements or disputes shall be referred to the Paris Courts having 512 | jurisdiction, by the more diligent Party. 513 | 514 | 515 | Version 1.0 dated 2006-09-05. 516 | --------------------------------------------------------------------------------