├── DocumentFeatureSelection ├── bns │ ├── __init__.py │ ├── bns.py │ ├── bns_cython.pyx │ └── bns_python3.py ├── soa │ ├── __init__.py │ ├── soa.py │ ├── soa_cython.pyx │ └── soa_python3.py ├── common │ ├── __init__.py │ ├── utils.py │ ├── func_data_converter.py │ ├── crs_matrix_constructor.py │ └── data_converter.py ├── tf_idf │ ├── __init__.py │ └── tf_idf.py ├── pmi │ ├── __init__.py │ ├── PMI.py │ ├── pmi_cython.pyx │ └── PMI_python3.py ├── init_logger.py ├── __init__.py ├── interface.py └── models.py ├── examples ├── __init__.py ├── check_performance.py ├── huge_data_example.py ├── basic_example.py └── advanced_example.py ├── doc ├── source │ ├── tutorial.rst │ ├── installation.rst │ ├── index.rst │ └── conf.py ├── make.bat └── Makefile ├── MANIFEST.in ├── tests ├── __init__.py ├── docker-compose.yml ├── all_tests.py ├── Dockerfile ├── test_PMI_python3.py ├── test_tf_idf.py ├── test_soa_python3.py ├── check_code_pmi.py ├── test_data_models.py ├── test_interface.py ├── test_bns_python3.py └── test_data_converter.py ├── .travis.yml ├── .gitignore ├── setup.py ├── README.md └── LICENSE /DocumentFeatureSelection/bns/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/soa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'kensuke-mi' 2 | -------------------------------------------------------------------------------- /doc/source/tutorial.rst: -------------------------------------------------------------------------------- 1 | Quick tutorial and examples 2 | =========================== 3 | 4 | 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | recursive-include examples * 4 | recursive-include tests * 5 | recursive-include DocumentFeatureSelection * -------------------------------------------------------------------------------- /DocumentFeatureSelection/common/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | from __future__ import division 5 | __author__ = 'kensuke-mi' -------------------------------------------------------------------------------- /DocumentFeatureSelection/tf_idf/__init__.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | from __future__ import division -------------------------------------------------------------------------------- /DocumentFeatureSelection/pmi/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | __author__ = 'kensuke-mi'
7 |
8 |
--------------------------------------------------------------------------------
/DocumentFeatureSelection/bns/bns.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | import sys
7 | python_version = sys.version_info
8 |
9 | if python_version >= (3, 0, 0):
10 |     from DocumentFeatureSelection.bns.bns_python3 import BNS
11 | else:
12 |     raise NotImplementedError('Python 2 is not supported')
--------------------------------------------------------------------------------
/DocumentFeatureSelection/pmi/PMI.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | import sys
7 | python_version = sys.version_info
8 |
9 | if python_version >= (3, 0, 0):
10 |     from DocumentFeatureSelection.pmi.PMI_python3 import PMI
11 | else:
12 |     raise NotImplementedError('Python 2 is not supported')
--------------------------------------------------------------------------------
/DocumentFeatureSelection/soa/soa.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | import sys
7 | python_version = sys.version_info
8 |
9 | if python_version >= (3, 0, 0):
10 |     from DocumentFeatureSelection.soa.soa_python3 import SOA
11 | else:
12 |     raise NotImplementedError('Python 2 is not supported')
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Package gathering all unit tests for document-feature-selection.
4 | Module names must start with `test_` to be taken into account.
5 |
6 | You should consider installing :mod:`Distribute` to run all tests with::
7 |
8 |     $ python setup.py test
9 |
10 | """
11 | from __future__ import unicode_literals
12 | __author__ = 'kensuke-mi'
13 | import unittest
14 |
15 | if __name__ == '__main__':
16 |     import doctest
17 |     doctest.testmod()
18 |     unittest.main()
19 |
--------------------------------------------------------------------------------
/DocumentFeatureSelection/init_logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from logging import Formatter, StreamHandler
4 |
5 | # Formatter
6 | custom_formatter = Formatter(
7 |     fmt='[%(asctime)s]%(levelname)s - %(filename)s#%(funcName)s:%(lineno)d: %(message)s',
8 |     datefmt='%Y/%m/%d %H:%M:%S'
9 | )
10 |
11 | handler = logging.StreamHandler(sys.stderr)
12 | handler.setFormatter(custom_formatter)
13 |
14 | LOGGER_NAME = 'DocumentFeatureSelection'
15 | logger = logging.getLogger(LOGGER_NAME)
16 | logger.setLevel(logging.INFO)
17 | logger.addHandler(handler)
18 | logger.propagate = False
19 |
--------------------------------------------------------------------------------
/DocumentFeatureSelection/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 | from __future__ import absolute_import
4 | from __future__ import print_function
5 | from __future__ import division
6 | import sys
7 | python_version = sys.version_info
8 |
9 | #from DocumentFeatureSelection.common.data_converter import DataConverter, DataCsrMatrix
10 | from DocumentFeatureSelection.pmi.PMI import PMI
11 | from DocumentFeatureSelection.tf_idf.tf_idf import TFIDF
12 | from DocumentFeatureSelection.soa.soa import SOA
13 | from DocumentFeatureSelection.bns.bns import BNS
14 |
--------------------------------------------------------------------------------
/tests/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # docker-compose file that sets up the development / test environments in one place
2 | version: '3'
3 | services:
4 |   dev_env:
5 |     build:
6 |       context: ./
7 |       dockerfile: Dockerfile
8 |     volumes:
9 |       - ..:/codes/
10 |     working_dir: /codes
11 |     stdin_open: true
12 |     tty: true
13 |   test_env:
14 |     build:
15 |       context: ./
16 |       dockerfile: Dockerfile
17 |     volumes:
18 |       - ..:/codes/
19 |     working_dir: /codes
20 |     command: >
21 |       bash -c "echo 'Python3.6 Test' &&
22 |       source activate p36 &&
23 |       python setup.py test &&
24 |       deactivate &&
25 |       echo 'Python3.7 Test' &&
26 |       source activate p37 &&
27 |       python setup.py test"
28 |     stdin_open: true
29 |     tty: true
--------------------------------------------------------------------------------
/tests/all_tests.py:
--------------------------------------------------------------------------------
1 | __author__ = 'kensuke-mi'
2 |
3 | import sys
4 | import unittest
5 | python_version = sys.version_info
6 |
7 |
8 | def suite():
9 |     suite = unittest.TestSuite()
10 |     if python_version >= (3, 0, 0):
11 |         from .test_data_converter import TestDataConverter
12 |         from .test_PMI_python3 import TestPmiPython3
13 |         from .test_tf_idf import TestTfIdf
14 |         from .test_soa_python3 import TestSoaPython3
15 |         from .test_bns_python3 import TestBnsPython3
16 |         suite.addTest(unittest.makeSuite(TestDataConverter))
17 |         suite.addTest(unittest.makeSuite(TestPmiPython3))
18 |         suite.addTest(unittest.makeSuite(TestTfIdf))
19 | 
suite.addTest(unittest.makeSuite(TestSoaPython3)) 20 | suite.addTest(unittest.makeSuite(TestBnsPython3)) 21 | else: 22 | pass 23 | 24 | 25 | return suite -------------------------------------------------------------------------------- /doc/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installing / Upgrading 2 | ====================== 3 | 4 | Installing from source 5 | ---------------------- 6 | 7 | If you prefer install directly from the source:: 8 | 9 | $ cd document-feature-selection 10 | $ sudo python setup.py install 11 | 12 | Creating packages 13 | ----------------- 14 | 15 | You can easily create documentation and packages:: 16 | 17 | $ cd document-feature-selection 18 | $ python setup.py sdist # generate source .tar.gz file 19 | $ python setup.py bdist_deb # require python-all and python-stdeb packages 20 | $ python setup.py bdist_rpm # 21 | $ python setup.py bdist_msi # generate a Windows installer 22 | $ python setup.py bdist # generate a binary .tar.gz 23 | $ python setup.py py2exe # generate a portable Windows application 24 | $ python setup.py py2app # generate a portable Mac OS X application 25 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. document-feature-selection documentation master file, created by 2 | sphinx-quickstart on Wed Feb 13 11:51:12 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to document-feature-selection's documentation! 7 | ====================================================== 8 | 9 | Overview: 10 | 11 | :doc:`installation` 12 | Instruction on how to get the distribution 13 | 14 | :doc:`tutorial` 15 | Start here for a quick overview 16 | 17 | :doc:`api/index` 18 | The complete API documentation, organized by modules 19 | 20 | 21 | Full table of contents 22 | ====================== 23 | 24 | .. 
toctree:: 25 | :maxdepth: 4 26 | 27 | installation 28 | tutorial 29 | api/index 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | - "3.6" 5 | addons: 6 | apt: 7 | packages: 8 | - git 9 | - make 10 | - curl 11 | - xz-utils 12 | - file 13 | - pandoc 14 | - gcc-5 15 | - g++-5 16 | - build-essential 17 | sources: 18 | - ubuntu-toolchain-r-test 19 | before_install: 20 | - sudo apt-get update -qq 21 | - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 22 | - export CC="gcc-5" 23 | - export CXX="g++-5" 24 | - export CFLAGS=-std=c++11 25 | - export CXXFLAGS=-std=c++11 26 | - pip install numpy scipy scikit-learn cython sqlitedict 27 | install: 28 | - python --version 29 | - python setup.py install 30 | - pip install coveralls coverage nose 31 | script: 32 | - coverage run --source=DocumentFeatureSelection setup.py test 33 | after_success: 34 | - sudo coveralls 35 | notifications: 36 | email: 37 | recipients: 38 | - kensuke.mit@gmail.com 39 | on_success: always 40 | on_failure: always -------------------------------------------------------------------------------- /tests/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM frolvlad/alpine-glibc:alpine-3.6 2 | MAINTAINER kensuke-mi 3 | 4 | # apk update 5 | RUN apk update 6 | # general 7 | RUN apk --no-cache add vim \ 8 | wget \ 9 | lsof \ 10 | curl \ 11 | bash \ 12 | swig \ 13 | gcc \ 14 | build-base \ 15 | make \ 16 | python-dev \ 17 | py-pip \ 18 | jpeg-dev \ 19 | zlib-dev \ 20 | git \ 21 | linux-headers 22 | ENV LIBRARY_PATH=/lib:/usr/lib 23 | 24 | ENV PATH=/opt/conda/bin:$PATH \ 25 | LANG=C.UTF-8 \ 26 | MINICONDA=Miniconda3-latest-Linux-x86_64.sh 27 | 28 | # Python 29 | RUN apk add --no-cache bash wget && \ 30 | wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \ 31 | bash /Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \ 32 | ln -s /opt/conda/bin/* /usr/local/bin/ && \ 33 | rm -rf /root/.[acpw]* /$MINICONDA /opt/conda/pkgs/* 34 | 35 | RUN conda config --add channels conda-forge --system 36 | RUN conda install Cython \ 37 | scikit-learn \ 38 | scipy \ 39 | numpy 40 | 41 | RUN pip install more_itertools joblib nltk pypandoc sqlitedict nose 42 | RUN conda create -y -n p36 python=3.6 43 | RUN conda create -y -n p37 python=3.7 44 | 45 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .gitignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | 7 | # C extensions 8 | *.so 9 | *.c 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | .eggs/ 27 | 28 | 29 | # Standard IDEs 30 | .idea/ 31 | .project/ 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | # custom 66 | envs/ 67 | package/ 68 | .python-version 69 | web_api/tests/matrix.memmap 70 | web_api/tests/backend.sqlite3 71 | -------------------------------------------------------------------------------- /examples/check_performance.py: -------------------------------------------------------------------------------- 1 | from DocumentFeatureSelection.init_logger import logger 2 | from DocumentFeatureSelection import interface 3 | import logging 4 | import time 5 | import nltk 6 | 7 | nltk.download('abc') 8 | nltk.download('genesis') 9 | nltk.download('webtext') 10 | nltk.download('gutenberg') 11 | nltk.download('punkt') 12 | 13 | """This script shows the difference of computing speed between cython and multi-processing""" 14 | 15 | 16 | def pmi_with_parallel(input_corpus): 17 | logging.debug(msg='With multiprocessing backend') 18 | start = time.time() 19 | scored_matrix_obj = interface.run_feature_selection( 20 | input_dict=input_corpus, 21 | method='pmi', 22 | n_jobs=-1, 23 | ) 24 | elapsed_time = time.time() - start 25 | logger.info("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) 26 | 27 | 28 | def pmi_with_cython(input_corpus): 29 | logging.debug(msg='With cython is True') 30 | start = time.time() 31 | scored_matrix_obj = interface.run_feature_selection( 32 | input_dict=input_corpus, 33 | method='pmi', 34 | use_cython=True 35 | ) 36 | elapsed_time = time.time() - start 37 | logger.info("elapsed_time with cython:{} [sec]".format(elapsed_time)) 38 | 39 | 40 | from nltk.corpus import gutenberg 41 | from nltk.corpus import webtext 42 | from nltk.corpus import genesis 43 | from nltk.corpus import abc 44 | 45 | abs_corpus = abc.sents() 46 | genesis_corpus = genesis.sents() 47 | web_corpus = webtext.sents() 48 | gutenberg_corpus = gutenberg.sents() 49 | 50 | input_corpus = { 51 | 'abs': list(abs_corpus), 52 | 'genesis': list(genesis_corpus), 53 | 'web': list(web_corpus), 54 | 'gutenberg': list(gutenberg_corpus) 55 | } 56 | 57 | pmi_with_cython(input_corpus) 58 | pmi_with_parallel(input_corpus) 59 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/tf_idf/tf_idf.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | from sklearn.feature_extraction.text import TfidfTransformer
7 | from scipy.sparse.csr import csr_matrix
8 | from numpy import ndarray
9 | __author__ = 'kensuke-mi'
10 |
11 |
12 | class TFIDF(object):
13 |     def __init__(self, ngram=1, norm_metric='l2', use_idf_bool=True, smooth_idf_bool=True, sublinear_tf_bool=False):
14 |         assert isinstance(ngram, int)
15 |
16 |         self.n_gram = ngram
17 |         self.norm_metric = norm_metric
18 |         self.use_idf_bool = use_idf_bool
19 |         self.smooth_idf_bool = smooth_idf_bool
20 |         self.sublinear_tf_bool = sublinear_tf_bool
21 |
22 |     def fit_transform(self, X):
23 |         if isinstance(X, csr_matrix):
24 |             X = X.toarray()
25 |         else:
26 |             X = X
27 |
28 |         tf_idf_matrix = self.call_sklearn_tfidf(
29 |             X=X,
30 |             norm=self.norm_metric,
31 |             use_idf=self.use_idf_bool,
32 |             smooth_idf=self.smooth_idf_bool,
33 |             sublinear_tf=self.sublinear_tf_bool
34 |         )
35 |         self.weighed_matrix = tf_idf_matrix
36 |
37 |         return tf_idf_matrix
38 |
39 |     def call_sklearn_tfidf(self, X, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):
40 |         assert isinstance(X, (csr_matrix, ndarray))
41 |
42 |         tf_idf_generator = TfidfTransformer(
43 |             norm=norm,
44 |             use_idf=use_idf,
45 |             smooth_idf=smooth_idf,
46 |             sublinear_tf=sublinear_tf
47 |         )
48 |         if isinstance(X, csr_matrix):
49 |             feat_matrix = X.toarray()
50 |         else:
51 |             feat_matrix = X
52 |
53 |         tf_idf_weight_matrix = tf_idf_generator.fit_transform(
54 |             X=feat_matrix
55 |         )
56 |         assert isinstance(tf_idf_weight_matrix, (csr_matrix, ndarray))
57 |
58 |         return tf_idf_weight_matrix
59 |
60 |
--------------------------------------------------------------------------------
/DocumentFeatureSelection/common/utils.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 | from __future__ import division
6 | from scipy.sparse.csr import csr_matrix
7 | from typing import Union
8 | from DocumentFeatureSelection import models
9 | import sqlitedict
10 | import sys
11 | import tempfile
12 | import os
13 | python_version = sys.version_info
14 |
15 | __author__ = 'kensuke-mi'
16 |
17 |
18 | def flatten(lis):
19 |     for item in lis:
20 |         if isinstance(item, list) and not isinstance(item, str):
21 |             for x in flatten(item):
22 |                 yield x
23 |         else:
24 |             yield item
25 |
26 |
27 | def __conv_into_dict_format(pmi_word_score_items):
28 |     out_format_structure = {}
29 |     for item in pmi_word_score_items:
30 |         if item['label'] not in out_format_structure:
31 |             out_format_structure[item['label']] = [{'word': item['word'], 'score': item['score']}]
32 |         else:
33 |             out_format_structure[item['label']].append({'word': item['word'], 'score': item['score']})
34 |     return out_format_structure
35 |
36 |
37 | def extract_from_csr_matrix(weight_csr_matrix, vocabulary, label_id, row_id, col_id):
38 |     assert isinstance(weight_csr_matrix, csr_matrix)
39 |     assert isinstance(vocabulary, dict)
40 |     assert isinstance(label_id, dict)
41 |
42 |
43 | def init_cache_object(file_name:str,
44 |                       path_work_dir:str=tempfile.mkdtemp(),
45 |                       cache_backend:str='PersistentDict')->Union[sqlitedict.SqliteDict, models.PersistentDict]:
46 |     """* What you can do
47 |     - You initialize a cache object.
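    - `cache_backend` selects the persistence layer: 'PersistentDict' (the default) stores the cache with models.PersistentDict, while 'SqliteDict' stores it in sqlitedict.SqliteDict(autocommit=True); any other value raises an Exception.
    - The cache file is created at os.path.join(path_work_dir, file_name); path_work_dir defaults to a temporary directory made by tempfile.mkdtemp().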
48 | """ 49 | if cache_backend == 'PersistentDict': 50 | cached_obj = models.PersistentDict(os.path.join(path_work_dir, file_name)) 51 | elif cache_backend == 'SqliteDict': 52 | cached_obj = sqlitedict.SqliteDict(os.path.join(path_work_dir, file_name), autocommit=True) 53 | else: 54 | raise Exception('No cache backend named {}'.format(cache_backend)) 55 | 56 | return cached_obj -------------------------------------------------------------------------------- /examples/huge_data_example.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | from DocumentFeatureSelection import interface 3 | from DocumentFeatureSelection.models import PersistentDict 4 | from DocumentFeatureSelection.init_logger import logger 5 | import logging 6 | import time 7 | import os 8 | import nltk 9 | nltk.download('wordnet') 10 | from collections import Counter 11 | from nltk import stem 12 | from typing import List 13 | # make download 20news group file 14 | from sklearn.datasets import fetch_20newsgroups 15 | newsgroups_train = fetch_20newsgroups(subset='train') 16 | lemmatizer = stem.WordNetLemmatizer() 17 | logger.setLevel(logging.DEBUG) 18 | 19 | """This example shows you how to work on huge dataset. 20 | For persisted-dict object you can choose PersistentDict or SqliteDict 21 | """ 22 | 23 | DATA_LIMIT = 100000 24 | 25 | 26 | def run_nltk_lemma(subject_name: str)->List[str]: 27 | return [lemmatizer.lemmatize(t).strip(':?!><') for t in subject_name.lower().split()] 28 | 29 | 30 | category_names = newsgroups_train.target_names 31 | logger.debug("20-news has {} categories".format(len(category_names))) 32 | logger.debug("Now pre-processing on subject text...") 33 | news_lemma = [run_nltk_lemma(d) for d in newsgroups_train.data[:DATA_LIMIT]] 34 | 35 | index2category = {i: t for i, t in enumerate(newsgroups_train.target_names)} 36 | dict_index2label = {i: index2category[t_no] for i, t_no in enumerate(newsgroups_train.target[:DATA_LIMIT])} 37 | logger.info("Subject distribution") 38 | for k, v in dict(Counter(dict_index2label.values())).items(): 39 | logger.info("{} is {}, {}%".format(k, v, v / len(dict_index2label) * 100)) 40 | 41 | # Case of PersistentDict 42 | logger.info("Putting documents into dict object...") 43 | persistent_dict_obj = PersistentDict('demo.json', 'c', format='json') 44 | for i, label in dict_index2label.items(): 45 | if label in persistent_dict_obj: 46 | persistent_dict_obj[label].append(news_lemma[i]) 47 | else: 48 | persistent_dict_obj[label] = [news_lemma[i]] 49 | else: 50 | persistent_dict_obj.sync() 51 | 52 | start = time.time() 53 | # If you put is_use_cache=True, it uses cache object for keeping huge objects during computation 54 | # If you put is_use_memmap=True, it uses memmap for keeping matrix during computation 55 | scored_matrix_obj = interface.run_feature_selection( 56 | input_dict=persistent_dict_obj, 57 | method='pmi', 58 | use_cython=True, 59 | is_use_cache=True, 60 | is_use_memmap=True 61 | ) 62 | elapsed_time = time.time() - start 63 | logger.info("elapsed_time with cython: {} [sec]".format(elapsed_time)) 64 | os.remove('./demo.json') 65 | -------------------------------------------------------------------------------- /tests/test_PMI_python3.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from DocumentFeatureSelection.common import data_converter 3 | from DocumentFeatureSelection.common.data_converter import DataCsrMatrix 4 | from 
DocumentFeatureSelection.models import ScoredResultObject 5 | from DocumentFeatureSelection.pmi import PMI_python3 6 | from scipy.sparse import csr_matrix 7 | 8 | 9 | class TestPmiPython3(unittest.TestCase): 10 | def setUp(self): 11 | input_dict = { 12 | "label_a": [ 13 | ["I", "aa", "aa", "aa", "aa", "aa"], 14 | ["bb", "aa", "aa", "aa", "aa", "aa"], 15 | ["I", "aa", "hero", "some", "ok", "aa"] 16 | ], 17 | "label_b": [ 18 | ["bb", "bb", "bb"], 19 | ["bb", "bb", "bb"], 20 | ["hero", "ok", "bb"], 21 | ["hero", "cc", "bb"], 22 | ], 23 | "label_c": [ 24 | ["cc", "cc", "cc"], 25 | ["cc", "cc", "bb"], 26 | ["xx", "xx", "cc"], 27 | ["aa", "xx", "cc"], 28 | ] 29 | } 30 | 31 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 32 | labeled_documents=input_dict, 33 | n_jobs=5 34 | ) 35 | assert isinstance(data_csr_matrix, DataCsrMatrix) 36 | self.label2id_dict = data_csr_matrix.label2id_dict 37 | self.csr_matrix_ = data_csr_matrix.csr_matrix_ 38 | self.n_docs_distribution = data_csr_matrix.n_docs_distribution 39 | self.vocabulary = data_csr_matrix.vocabulary 40 | 41 | def test_normal_fit_transform(self): 42 | pmi_object = PMI_python3.PMI() 43 | scored_matrix = pmi_object.fit_transform( 44 | X=self.csr_matrix_, 45 | n_jobs=1, 46 | n_docs_distribution=self.n_docs_distribution 47 | ) 48 | assert isinstance(scored_matrix, csr_matrix) 49 | 50 | def test_multi_process_fit_transform(self): 51 | pmi_object = PMI_python3.PMI() 52 | scored_matrix = pmi_object.fit_transform( 53 | X=self.csr_matrix_, 54 | n_jobs=5, 55 | n_docs_distribution=self.n_docs_distribution, 56 | verbose=True 57 | ) 58 | assert isinstance(scored_matrix, csr_matrix) 59 | 60 | def test_output_result_pmi(self): 61 | pmi_object = PMI_python3.PMI() 62 | scored_matrix = pmi_object.fit_transform( 63 | X=self.csr_matrix_, 64 | n_jobs=5, 65 | n_docs_distribution=self.n_docs_distribution 66 | ) 67 | assert isinstance(scored_matrix, csr_matrix) 68 | 69 | pmi_scored_dict = ScoredResultObject( 70 | scored_matrix=scored_matrix, 71 | label2id_dict=self.label2id_dict, 72 | feature2id_dict=self.vocabulary 73 | ).convert_score_matrix2score_record(outformat='items') 74 | self.assertTrue(isinstance(pmi_scored_dict, list)) 75 | 76 | 77 | if __name__ == '__main__': 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/soa/soa_cython.pyx: -------------------------------------------------------------------------------- 1 | import math 2 | import scipy 3 | cimport numpy as np 4 | from cpython cimport bool 5 | 6 | cdef float soa( 7 | np.ndarray[np.float64_t, ndim=2] X, 8 | np.ndarray[np.int64_t, ndim=1] unit_distribution, 9 | int n_total_docs, 10 | int feature_index, 11 | int sample_index, 12 | bool verbose): 13 | # X is either of term-frequency matrix per label or document-frequency per label 14 | 15 | matrix_size = X.shape 16 | NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] 17 | 18 | # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e 19 | cdef float freq_w_e = X[sample_index, feature_index] 20 | # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e 21 | cdef float freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() 22 | # freq_e is the number of the unit having specific label e 23 | cdef float freq_e = unit_distribution[sample_index] 24 | # freq_not_e is the number of the unit NOT having the specific label e 25 | 
cdef float freq_not_e = n_total_docs - freq_e
26 |     cdef float numerator, denominator, ans, soa_val
27 |
28 |     if verbose:
29 |         print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index))
30 |         print('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format(
31 |             freq_w_e,
32 |             freq_w_not_e,
33 |             freq_e,
34 |             freq_not_e
35 |         ))
36 |
37 |     if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0:
38 |         return 0.0
39 |     else:
40 |         numerator = (float(freq_w_e) * freq_not_e)
41 |         denominator = (float(freq_e) * freq_w_not_e)
42 |         ans = numerator / denominator
43 |         soa_val = math.log(ans, 2)
44 |         return soa_val
45 |
46 |
47 | def main(X,
48 |          np.ndarray[np.int64_t, ndim=1] n_docs_distribution,
49 |          int n_total_doc,
50 |          sample_range,
51 |          feature_range,
52 |          bool verbose=False):
53 |     """What you can do
54 |     - calculate SOA score based on given data.
55 |     - The function returns list of tuple, whose element is (sample_index, feature_index, score)
56 |     - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature.
57 |     """
58 |
59 |     cdef int n_samples = X.shape[0]
60 |
61 |     if isinstance(X, scipy.sparse.csr_matrix):
62 |         X = X.toarray()
63 |
64 |     cdef int sample_index, feature_index
65 |     soa_score_csr_source = [
66 |         (
67 |             sample_index,
68 |             feature_index,
69 |             soa(X, n_docs_distribution, n_total_doc, feature_index, sample_index, verbose)
70 |         )
71 |         for sample_index in sample_range
72 |         for feature_index in feature_range
73 |     ]
74 |     non_zero_soa_score_csr_source = [score_tuple for score_tuple in soa_score_csr_source if not score_tuple[2]==0]
75 |
76 |     return non_zero_soa_score_csr_source
--------------------------------------------------------------------------------
/tests/test_tf_idf.py:
--------------------------------------------------------------------------------
1 | #! 
-*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | from __future__ import division 6 | from scipy.sparse import csr_matrix 7 | from DocumentFeatureSelection.common import data_converter 8 | from DocumentFeatureSelection.common.data_converter import DataCsrMatrix 9 | from DocumentFeatureSelection.tf_idf import tf_idf 10 | from DocumentFeatureSelection.models import ScoredResultObject 11 | import logging 12 | import unittest 13 | import numpy 14 | logging.basicConfig(level=logging.DEBUG) 15 | logger = logging.getLogger(__name__) 16 | __author__ = 'kensuke-mi' 17 | 18 | 19 | class TestTfIdf(unittest.TestCase): 20 | def setUp(self): 21 | input_dict = { 22 | "label_a": [ 23 | ["I", "aa", "aa", "aa", "aa", "aa"], 24 | ["bb", "aa", "aa", "aa", "aa", "aa"], 25 | ["I", "aa", "hero", "some", "ok", "aa"] 26 | ], 27 | "label_b": [ 28 | ["bb", "bb", "bb"], 29 | ["bb", "bb", "bb"], 30 | ["hero", "ok", "bb"], 31 | ["hero", "cc", "bb"], 32 | ], 33 | "label_c": [ 34 | ["cc", "cc", "cc"], 35 | ["cc", "cc", "bb"], 36 | ["xx", "xx", "cc"], 37 | ["aa", "xx", "cc"], 38 | ] 39 | } 40 | 41 | tf_matrix = numpy.array( 42 | [ 43 | [2, 12, 1, 0, 1, 1, 1, 0], 44 | [0, 0, 8, 1, 2, 1, 0, 0], 45 | [0, 1, 1, 7, 0, 0, 0, 3] 46 | ] 47 | ) 48 | 49 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 50 | labeled_documents=input_dict, 51 | n_jobs=-1 52 | ) 53 | assert isinstance(data_csr_matrix, DataCsrMatrix) 54 | self.label2id_dict = data_csr_matrix.label2id_dict 55 | self.csr_matrix_ = data_csr_matrix.csr_matrix_ 56 | self.n_docs_distribution = data_csr_matrix.n_docs_distribution 57 | self.vocabulary = data_csr_matrix.vocabulary 58 | 59 | numpy.array_equal(data_csr_matrix.csr_matrix_.toarray(), tf_matrix) 60 | 61 | def test_normal_fit_transform(self): 62 | tf_idf_weighted_matrix = tf_idf.TFIDF().fit_transform( 63 | X=self.csr_matrix_, 64 | ) 65 | assert isinstance(tf_idf_weighted_matrix, csr_matrix) 66 | 67 | def test_output_result_pmi(self): 68 | tf_idf_weighted_matrix = tf_idf.TFIDF().fit_transform( 69 | X=self.csr_matrix_, 70 | ) 71 | assert isinstance(tf_idf_weighted_matrix, csr_matrix) 72 | 73 | tf_idf_scored_dict = ScoredResultObject( 74 | scored_matrix=tf_idf_weighted_matrix, 75 | label2id_dict=self.label2id_dict, 76 | feature2id_dict=self.vocabulary, 77 | ).convert_score_matrix2score_record(outformat='items') 78 | self.assertTrue(isinstance(tf_idf_scored_dict, list)) 79 | assert isinstance(tf_idf_scored_dict, list) 80 | 81 | 82 | if __name__ == '__main__': 83 | unittest.main() 84 | 85 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/bns/bns_cython.pyx: -------------------------------------------------------------------------------- 1 | import math 2 | import scipy 3 | from cpython cimport bool 4 | from scipy.stats import norm 5 | cimport numpy 6 | import numpy 7 | 8 | 9 | cdef float bns( 10 | numpy.ndarray[numpy.float64_t, ndim=2] X, 11 | numpy.ndarray[numpy.int64_t, ndim=1] unit_distribution, 12 | int feature_index, 13 | int sample_index, 14 | int true_index, 15 | bool verbose): 16 | # X is either of term-frequency matrix per label or document-frequency per label 17 | 18 | cdef int false_index 19 | if true_index == 0: 20 | false_index = 1 21 | elif true_index == 1: 22 | false_index = 0 23 | else: 24 | raise Exception('true index must be either of 0 or 1') 25 | 26 | # trueラベルで出現した回数 27 | # tp is frequency of 
features in the specified positive label 28 | cdef float tp = X[true_index, feature_index] 29 | # trueラベルで出現しなかった回数 30 | # fp is frequency of NON-features(expect specified feature) in the specified positive label 31 | cdef float fp = unit_distribution[true_index] - tp 32 | 33 | # negativeラベルで出現した回数 34 | # fn is frequency of features in the specified negative label 35 | cdef float fn = X[false_index, feature_index] 36 | # negativeラベルで出現しなかった回数 37 | # fp is frequency of NON-features(expect specified feature) in the specified negative label 38 | cdef float tn = unit_distribution[false_index] - fn 39 | 40 | if tn < 0.0: 41 | print('Something wrong') 42 | 43 | cdef float pos = tp + fn 44 | cdef float neg = fp + tn 45 | 46 | cdef float tpr = tp / pos 47 | cdef float fpr = fp / neg 48 | 49 | if verbose: 50 | print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) 51 | print('tp:{} fp:{} fn:{} tn:{} pos:{} neg:{} tpr:{} fpr:{}'.format( 52 | tp, 53 | fp, 54 | fn, 55 | tn, 56 | pos, 57 | neg, 58 | tpr, 59 | fpr 60 | )) 61 | cdef float bns_score = numpy.abs(norm.ppf(tpr) - norm.ppf(fpr)) 62 | # cdef float bns_score = numpy.abs(norm.ppf(norm.cdf(tpr)) - norm.ppf(norm.cdf(fpr))) 63 | return bns_score 64 | 65 | 66 | def main(X, 67 | numpy.ndarray[numpy.int64_t, ndim=1] unit_distribution, 68 | sample_range, 69 | feature_range, 70 | int true_index, 71 | bool verbose=False): 72 | """What you can do 73 | - calculate BNS score based on given data. 74 | - The function returns list of tuple, whose element is (sample_index, feature_index, score) 75 | - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature. 76 | """ 77 | if isinstance(X, scipy.sparse.csr_matrix): 78 | X = X.toarray() 79 | 80 | cdef int sample_index, feature_index 81 | soa_score_csr_source = [ 82 | ( 83 | sample_index, 84 | feature_index, 85 | bns(X, unit_distribution, feature_index, sample_index, true_index, verbose) 86 | ) 87 | for sample_index in sample_range 88 | for feature_index in feature_range 89 | ] 90 | 91 | return soa_score_csr_source -------------------------------------------------------------------------------- /DocumentFeatureSelection/pmi/pmi_cython.pyx: -------------------------------------------------------------------------------- 1 | import math 2 | import scipy 3 | cimport numpy as np 4 | from cpython cimport bool 5 | 6 | cdef float pmi(np.ndarray[np.float64_t, ndim=2] X, 7 | int n_samples, 8 | np.ndarray[np.int64_t, ndim=1] n_docs_distribution, 9 | int n_total_doc, 10 | int feature_index, 11 | int sample_index, 12 | bool verbose): 13 | """get PMI score for given feature & sample index 14 | """ 15 | cdef i 16 | sample_indexes = [i for i in range(0, n_samples) if i != sample_index] 17 | 18 | # n_11 is #docs having feature(i.e. 
word) in the specified index(label) 19 | cdef float n_11 = X[sample_index, feature_index] 20 | # n_01 is #docs NOT having feature in the specified index(label) 21 | cdef float n_01 = n_docs_distribution[sample_index] - n_11 22 | # n_10 is #docs having feature in NOT specified index(indexes except specified index) 23 | cdef float n_10 = X[sample_indexes, feature_index].sum() 24 | # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) 25 | cdef float n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) 26 | 27 | cdef float temp1, temp2, temp3, temp4, score 28 | 29 | if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: 30 | return 0 31 | else: 32 | temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) 33 | temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) 34 | temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) 35 | temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) 36 | score = temp1 + temp2 + temp3 + temp4 37 | 38 | if verbose: 39 | print('score={}, temp1={}, temp2={}, temp3={}, temp4={}, n11={}, n10={}, n01={}, n00={}, n_total_docs={}'.format(score, temp1, temp2, temp3, temp4, n_11, n_10, n_01, n_00, n_total_doc)) 40 | 41 | return score 42 | 43 | 44 | def main(X, 45 | np.ndarray[np.int64_t, ndim=1] n_docs_distribution, 46 | int n_total_doc, 47 | sample_range, 48 | feature_range, 49 | bool verbose=False): 50 | """What you can do 51 | - calculate PMI score based on given data. 52 | - The function returns list of tuple, whose element is (sample_index, feature_index, score) 53 | - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature. 
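    - The score computed by pmi() above is the mutual information of the 2x2 document-count table for one (sample, feature) pair:
          MI = sum over the four cells n_ij of (n_ij / N) * log2( N * n_ij / (row_total_i * column_total_j) )
      where n_11 = docs of the label containing the feature, n_01 = docs of the label without it,
      n_10 = docs of the other labels containing it, n_00 = the remainder, and N = n_total_doc.
      Pairs with any empty cell get score 0 and are filtered out of the returned list.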
54 | """ 55 | 56 | cdef int n_samples = X.shape[0] 57 | 58 | if isinstance(X, scipy.sparse.csr_matrix): 59 | X = X.toarray() 60 | 61 | cdef int sample_index, feature_index 62 | pmi_score_csr_source = [ 63 | ( 64 | sample_index, 65 | feature_index, 66 | pmi(X, n_samples, n_docs_distribution, n_total_doc, feature_index, sample_index, verbose) 67 | ) 68 | for sample_index in sample_range 69 | for feature_index in feature_range 70 | ] 71 | non_zero_pmi_score_csr_source = [score_tuple for score_tuple in pmi_score_csr_source if not score_tuple[2]==0] 72 | 73 | return non_zero_pmi_score_csr_source -------------------------------------------------------------------------------- /tests/test_soa_python3.py: -------------------------------------------------------------------------------- 1 | from DocumentFeatureSelection.soa import soa_python3 2 | from DocumentFeatureSelection.common import data_converter 3 | from DocumentFeatureSelection.models import ScoredResultObject 4 | import unittest 5 | 6 | 7 | class TestSoaPython3(unittest.TestCase): 8 | def setUp(self): 9 | self.input_dict = { 10 | "label_a": [ 11 | ["I", "aa", "aa", "aa", "aa", "aa"], 12 | ["bb", "aa", "aa", "aa", "aa", "aa"], 13 | ["I", "aa", "hero", "some", "ok", "aa"] 14 | ], 15 | "label_b": [ 16 | ["bb", "bb", "bb"], 17 | ["bb", "bb", "bb"], 18 | ["hero", "ok", "bb"], 19 | ["hero", "cc", "bb"], 20 | ], 21 | "label_c": [ 22 | ["cc", "cc", "cc"], 23 | ["cc", "cc", "bb"], 24 | ["xx", "xx", "cc"], 25 | ["aa", "xx", "cc"], 26 | ] 27 | } 28 | 29 | def test_soa_with_term_freq(self): 30 | data_csr_matrix = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix( 31 | labeled_documents=self.input_dict, 32 | n_jobs=5 33 | ) 34 | assert isinstance(data_csr_matrix, data_converter.DataCsrMatrix) 35 | label2id_dict = data_csr_matrix.label2id_dict 36 | csr_matrix_ = data_csr_matrix.csr_matrix_ 37 | n_docs_distribution = data_csr_matrix.n_docs_distribution 38 | vocabulary = data_csr_matrix.vocabulary 39 | 40 | scored_matrix_term_freq = soa_python3.SOA().fit_transform( 41 | X=csr_matrix_, 42 | unit_distribution=n_docs_distribution, 43 | verbose=True 44 | ) 45 | 46 | soa_scores_term_freq = ScoredResultObject( 47 | scored_matrix=scored_matrix_term_freq, 48 | label2id_dict=label2id_dict, 49 | feature2id_dict=vocabulary 50 | ).convert_score_matrix2score_record() 51 | self.assertTrue(isinstance(soa_scores_term_freq, list)) 52 | 53 | #import pprint 54 | #print('term freq based soa') 55 | #pprint.pprint(soa_scores_term_freq) 56 | 57 | def test_soa_doc_freq(self): 58 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 59 | labeled_documents=self.input_dict, 60 | n_jobs=5 61 | ) 62 | assert isinstance(data_csr_matrix, data_converter.DataCsrMatrix) 63 | label2id_dict = data_csr_matrix.label2id_dict 64 | csr_matrix_ = data_csr_matrix.csr_matrix_ 65 | n_docs_distribution = data_csr_matrix.n_docs_distribution 66 | vocabulary = data_csr_matrix.vocabulary 67 | 68 | scored_matrix_doc_freq = soa_python3.SOA().fit_transform( 69 | X=csr_matrix_, 70 | unit_distribution=n_docs_distribution, 71 | verbose=True 72 | ) 73 | 74 | soa_scores_doc_freq = ScoredResultObject( 75 | scored_matrix=scored_matrix_doc_freq, 76 | label2id_dict=label2id_dict, 77 | feature2id_dict=vocabulary 78 | ).convert_score_matrix2score_record() 79 | self.assertTrue(isinstance(soa_scores_doc_freq, list)) 80 | 81 | #import pprint 82 | #print('doc freq based soa') 83 | #pprint.pprint(soa_scores_doc_freq) 84 | 85 | 86 | if __name__ == '__main__': 87 | 
unittest.main() 88 | -------------------------------------------------------------------------------- /examples/basic_example.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | __author__ = 'kensuke-mi' 3 | 4 | from DocumentFeatureSelection import interface 5 | import logging 6 | import pprint 7 | logger = logging.getLogger('sample usage') 8 | logger.level = logging.ERROR 9 | 10 | 11 | # ====================================================================================================== 12 | # basic usage 13 | 14 | input_dict = { 15 | "label_a": [ 16 | ["I", "aa", "aa", "aa", "aa", "aa"], 17 | ["bb", "aa", "aa", "aa", "aa", "aa"], 18 | ["I", "aa", "hero", "some", "ok", "aa"] 19 | ], 20 | "label_b": [ 21 | ["bb", "bb", "bb"], 22 | ["bb", "bb", "bb"], 23 | ["hero", "ok", "bb"], 24 | ["hero", "cc", "bb"], 25 | ], 26 | "label_c": [ 27 | ["cc", "cc", "cc"], 28 | ["cc", "cc", "bb"], 29 | ["xx", "xx", "cc"], 30 | ["aa", "xx", "cc"], 31 | ] 32 | } 33 | 34 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | # tf idf 36 | 37 | tf_idf_scored_object = interface.run_feature_selection( 38 | input_dict=input_dict, 39 | method='tf_idf', 40 | n_jobs=5 41 | ) 42 | 43 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 44 | # pmi 45 | pmi_scored_object = interface.run_feature_selection( 46 | input_dict=input_dict, 47 | method='pmi', 48 | n_jobs=1, 49 | use_cython=False 50 | ) 51 | pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) 52 | 53 | # you can use cython version pmi also 54 | # !Warning! The output value with "use_cython=True" is veeeery little different such as the 10th decimal place. 
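# (A difference of that size is floating-point rounding noise between the pure-Python and Cython code paths; it should not change which features rank highest.)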
55 | pmi_scored_object_cython = interface.run_feature_selection( 56 | input_dict=input_dict, 57 | method='pmi', 58 | n_jobs=1, 59 | use_cython=True 60 | ) 61 | pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) 62 | 63 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | # soa 65 | soa_scored_object = interface.run_feature_selection( 66 | input_dict=input_dict, 67 | method='soa', 68 | n_jobs=5 69 | ) 70 | pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary()) 71 | 72 | soa_scored_object_cython = interface.run_feature_selection( 73 | input_dict=input_dict, 74 | method='soa', 75 | n_jobs=1, 76 | use_cython=True 77 | ) 78 | pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary()) 79 | 80 | 81 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 82 | # bns 83 | input_dict = { 84 | "label1": [ 85 | ["I", "aa", "aa", "aa", "aa", "aa"], 86 | ["bb", "aa", "aa", "aa", "aa", "aa"], 87 | ["I", "aa", "hero", "some", "ok", "aa"] 88 | ], 89 | "label2": [ 90 | ["bb", "bb", "bb"], 91 | ["bb", "bb", "bb"], 92 | ["hero", "ok", "bb"], 93 | ["hero", "cc", "bb"], 94 | ] 95 | } 96 | bns_scored_object = interface.run_feature_selection( 97 | input_dict=input_dict, 98 | method='bns', 99 | n_jobs=1 100 | ) 101 | pprint.pprint(bns_scored_object.ScoreMatrix2ScoreDictionary()) 102 | 103 | bns_scored_object = interface.run_feature_selection( 104 | input_dict=input_dict, 105 | method='bns', 106 | use_cython=True 107 | ) 108 | -------------------------------------------------------------------------------- /tests/check_code_pmi.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import codecs 3 | import math 4 | import sys 5 | from collections import defaultdict 6 | # this code is from http://aidiary.hatenablog.com/entry/20100619/1276950312 7 | # checked to work under python2.5 8 | 9 | def mutual_information(target, data, k=5): 10 | # comment inputはlist 0th indexにカテゴリラベルがある。 1-th indexはすべてfeature word 11 | 12 | 13 | """カテゴリtargetにおける相互情報量が高い上位k件の単語を返す""" 14 | # 上位k件を指定しないときはすべて返す 15 | 16 | V = set() 17 | N11 = defaultdict(float) # N11[word] -> wordを含むtargetの文書数 18 | N10 = defaultdict(float) # N10[word] -> wordを含むtarget以外の文書数 19 | N01 = defaultdict(float) # N01[word] -> wordを含まないtargetの文書数 20 | N00 = defaultdict(float) # N00[word] -> wordを含まないtarget以外の文書数 21 | Np = 0.0 # targetの文書数 22 | Nn = 0.0 # target以外の文書す 23 | 24 | # N11とN10をカウント 25 | for d in data: 26 | cat, words = d[0], d[1:] 27 | if cat == target: 28 | Np += 1 29 | for wc in words: 30 | if ':' in wc: word, count = wc.split(":") 31 | else: word = wc 32 | 33 | V.add(word) 34 | N11[word] += 1 # 文書数をカウントするので+1すればOK 35 | elif cat != target: 36 | Nn += 1 37 | for wc in words: 38 | if ':' in wc: word, count = wc.split(":") 39 | else: word = wc 40 | 41 | V.add(word) 42 | N10[word] += 1 43 | 44 | # N01とN00は簡単に求められる 45 | for word in V: 46 | N01[word] = Np - N11[word] 47 | N00[word] = Nn - N10[word] 48 | 49 | for w, c in N01.items(): 50 | if c < 0: N01[w] = 0.0 51 | 52 | for w, c in N00.items(): 53 | if c < 0: N00[w] = 0.0 54 | 55 | # 総文書数 56 | N = Np + Nn 57 | 58 | 59 | # 各単語の相互情報量を計算 60 | MI = [] 61 | for word in V: 62 | n11, n10, n01, n00 = N11[word], N10[word], N01[word], N00[word] 63 | # いずれかの出現頻度が0.0となる単語はlog2(0)となってしまうのでスコア0とする 64 | if n11 == 0.0 or n10 == 0.0 or n01 == 0.0 or n00 == 0.0: 65 | MI.append( (0.0, word) ) 66 | 
continue 67 | # 相互情報量の定義の各項を計算 68 | temp1 = n11/N * math.log((N*n11)/((n10+n11)*(n01+n11)), 2) 69 | temp2 = n01/N * math.log((N*n01)/((n00+n01)*(n01+n11)), 2) 70 | temp3 = n10/N * math.log((N*n10)/((n10+n11)*(n00+n10)), 2) 71 | temp4 = n00/N * math.log((N*n00)/((n00+n01)*(n00+n10)), 2) 72 | score = temp1 + temp2 + temp3 + temp4 73 | MI.append( (score, word) ) 74 | 75 | # 相互情報量の降順にソートして上位k個を返す 76 | MI.sort(reverse=True) 77 | return MI[0:k] 78 | 79 | 80 | if __name__ == '__main__': 81 | 82 | input_data = [ 83 | ['label_a', "I", "aa", "aa", "aa", "aa", "aa"], 84 | ['label_a', "bb", "aa", "aa", "aa", "aa", "aa"], 85 | ['label_a', "I", "aa", "hero", "some", "ok", "aa"], 86 | ['label_b', "bb", "bb", "bb"], 87 | ['label_b', "bb", "bb", "bb"], 88 | ['label_b', "hero", "ok", "bb"], 89 | ['label_b', "hero", "cc", "bb"], 90 | ['label_c', "cc", "cc", "cc"], 91 | ['label_c', "cc", "cc", "bb"], 92 | ['label_c', "xx", "xx", "cc"], 93 | ['label_c', "aa", "xx", "cc"], 94 | ] 95 | res = mutual_information(target='label_a', data=input_data, k=30) 96 | import pprint 97 | print('label_a') 98 | pprint.pprint(res) 99 | 100 | print('label_b') 101 | pprint.pprint(mutual_information(target='label_b', data=input_data, k=30)) 102 | 103 | print('label_c') 104 | pprint.pprint(mutual_information(target='label_c', data=input_data, k=30)) -------------------------------------------------------------------------------- /examples/advanced_example.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | __author__ = 'kensuke-mi' 3 | 4 | from DocumentFeatureSelection import interface 5 | from DocumentFeatureSelection.init_logger import logger 6 | import logging 7 | import pprint 8 | 9 | # ====================================================================================================== 10 | # expert usage 11 | # you can put complex-structure-feature as feature. 12 | # One feature is tuple of tuple. Concretely (("he", "N"), ("is", "V")) is one feature. 
13 | # You can NOT use ngram argument for expert input 14 | input_dict_tuple_feature = { 15 | "label_a": [ 16 | [ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ], 17 | [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("guy", "N"),) ], 18 | [ (("i", "N"), ("am", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] 19 | ], 20 | "label_b": [ 21 | [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("girl", "N"),) ], 22 | [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("girl", "N"),) ], 23 | [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] 24 | ] 25 | } 26 | 27 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 28 | # tf idf 29 | tf_idf_scored_object = interface.run_feature_selection( 30 | input_dict=input_dict_tuple_feature, 31 | method='tf_idf', 32 | n_jobs=5 33 | ) 34 | pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) 35 | 36 | 37 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 38 | # pmi 39 | pmi_scored_object = interface.run_feature_selection( 40 | input_dict=input_dict_tuple_feature, 41 | method='pmi', 42 | n_jobs=5 43 | ) 44 | pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) 45 | 46 | 47 | pmi_scored_object_cython = interface.run_feature_selection( 48 | input_dict=input_dict_tuple_feature, 49 | method='pmi', 50 | use_cython=True 51 | ) 52 | pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) 53 | 54 | 55 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 56 | # soa 57 | soa_scored_object = interface.run_feature_selection( 58 | input_dict=input_dict_tuple_feature, 59 | method='soa', 60 | n_jobs=5 61 | ) 62 | pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary()) 63 | 64 | 65 | soa_scored_object_cython = interface.run_feature_selection( 66 | input_dict=input_dict_tuple_feature, 67 | method='soa', 68 | use_cython=True 69 | ) 70 | pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary()) 71 | 72 | 73 | 74 | 75 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 76 | # bns 77 | input_dict_tuple_feature = { 78 | "positive": [ 79 | [ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ], 80 | [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("guy", "N"),) ], 81 | [ (("i", "N"), ("am", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] 82 | ], 83 | "negative": [ 84 | [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("girl", "N"),) ], 85 | [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("girl", "N"),) ], 86 | [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] 87 | ] 88 | } 89 | 90 | 91 | bns_scored_object = interface.run_feature_selection( 92 | input_dict=input_dict_tuple_feature, 93 | method='bns', 94 | n_jobs=5 95 | ) 96 | pprint.pprint(bns_scored_object.ScoreMatrix2ScoreDictionary()) -------------------------------------------------------------------------------- /tests/test_data_models.py: -------------------------------------------------------------------------------- 1 | from DocumentFeatureSelection.common import data_converter 2 | from 
DocumentFeatureSelection.pmi import PMI_python3 3 | from DocumentFeatureSelection.models import ScoredResultObject 4 | from scipy.sparse import csr_matrix 5 | import unittest 6 | import numpy 7 | import logging 8 | 9 | 10 | class TestDataModels(unittest.TestCase): 11 | def setUp(self): 12 | self.input_dict = { 13 | "label_a": [ 14 | ["I", "aa", "aa", "aa", "aa", "aa"], 15 | ["bb", "aa", "aa", "aa", "aa", "aa"], 16 | ["I", "aa", "hero", "some", "ok", "aa"] 17 | ], 18 | "label_b": [ 19 | ["bb", "bb", "bb"], 20 | ["bb", "bb", "bb"], 21 | ["hero", "ok", "bb"], 22 | ["hero", "cc", "bb"], 23 | ], 24 | "label_c": [ 25 | ["cc", "cc", "cc"], 26 | ["cc", "cc", "bb"], 27 | ["xx", "xx", "cc"], 28 | ["aa", "xx", "cc"], 29 | ] 30 | } 31 | 32 | 33 | def test_get_pmi_feature_dictionary(self): 34 | """checks if it works or not, that getting scored dictionary object from scored_matrix 35 | 36 | :return: 37 | """ 38 | data_csr_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 39 | labeled_documents=self.input_dict, 40 | ngram=1, 41 | n_jobs=5 42 | ) 43 | 44 | assert isinstance(data_csr_object.csr_matrix_, csr_matrix) 45 | assert isinstance(data_csr_object.label2id_dict, dict) 46 | assert isinstance(data_csr_object.vocabulary, dict) 47 | 48 | pmi_scored_matrix = PMI_python3.PMI().fit_transform(X=data_csr_object.csr_matrix_, n_jobs=5, 49 | n_docs_distribution=data_csr_object.n_docs_distribution) 50 | 51 | # main part of test 52 | # when sort is True, cut_zero is True, outformat is dict 53 | pmi_scored_dictionary_objects = ScoredResultObject( 54 | scored_matrix=pmi_scored_matrix, 55 | label2id_dict=data_csr_object.label2id_dict, 56 | feature2id_dict=data_csr_object.vocabulary 57 | ).ScoreMatrix2ScoreDictionary( 58 | outformat='dict', 59 | sort_desc=True, 60 | n_jobs=5 61 | ) 62 | assert isinstance(pmi_scored_dictionary_objects, dict) 63 | logging.debug(pmi_scored_dictionary_objects) 64 | 65 | # when sort is True, cut_zero is True, outformat is items 66 | pmi_scored_dictionary_objects = ScoredResultObject( 67 | scored_matrix=pmi_scored_matrix, 68 | label2id_dict=data_csr_object.label2id_dict, 69 | feature2id_dict=data_csr_object.vocabulary).ScoreMatrix2ScoreDictionary( 70 | outformat='items', 71 | sort_desc=True, 72 | n_jobs=5 73 | ) 74 | assert isinstance(pmi_scored_dictionary_objects, list) 75 | for d in pmi_scored_dictionary_objects: 76 | assert isinstance(d, dict) 77 | 78 | # when sort is True, cut_zero is False, outformat is dict 79 | pmi_scored_dictionary_objects = ScoredResultObject( 80 | scored_matrix=pmi_scored_matrix, 81 | label2id_dict=data_csr_object.label2id_dict, 82 | feature2id_dict=data_csr_object.vocabulary 83 | ).ScoreMatrix2ScoreDictionary( 84 | outformat='dict', 85 | sort_desc=True, 86 | n_jobs=5 87 | ) 88 | assert isinstance(pmi_scored_dictionary_objects, dict) 89 | logging.debug(pmi_scored_dictionary_objects) 90 | 91 | # when sort is True, cut_zero is False, outformat is items 92 | pmi_scored_dictionary_objects = ScoredResultObject( 93 | scored_matrix=pmi_scored_matrix, 94 | label2id_dict=data_csr_object.label2id_dict, 95 | feature2id_dict=data_csr_object.vocabulary 96 | ).ScoreMatrix2ScoreDictionary( 97 | outformat='items', 98 | sort_desc=True, 99 | n_jobs=5 100 | ) 101 | assert isinstance(pmi_scored_dictionary_objects, list) 102 | for d in pmi_scored_dictionary_objects: 103 | assert isinstance(d, dict) 104 | 105 | 106 | if __name__ == '__main__': 107 | unittest.main() -------------------------------------------------------------------------------- 
/tests/test_interface.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from DocumentFeatureSelection import interface 3 | from DocumentFeatureSelection.models import ScoredResultObject 4 | from DocumentFeatureSelection.models import PersistentDict 5 | from sqlitedict import SqliteDict 6 | import os 7 | import numpy 8 | 9 | 10 | class TestInterface(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | cls.input_dict = { 14 | "label_a": [ 15 | ["I", "aa", "aa", "aa", "aa", "aa"], 16 | ["bb", "aa", "aa", "aa", "aa", "aa"], 17 | ["I", "aa", "hero", "some", "ok", "aa"] 18 | ], 19 | "label_b": [ 20 | ["bb", "bb", "bb"], 21 | ["bb", "bb", "bb"], 22 | ["hero", "ok", "bb"], 23 | ["hero", "cc", "bb"], 24 | ], 25 | "label_c": [ 26 | ["cc", "cc", "cc"], 27 | ["cc", "cc", "bb"], 28 | ["xx", "xx", "cc"], 29 | ["aa", "xx", "cc"], 30 | ] 31 | } 32 | cls.method = ['pmi', 'tf_idf', 'soa'] 33 | cls.bool_cython = [True, False] 34 | cls.is_use_cache = [True, False] 35 | cls.is_use_memmap = [True, False] 36 | cls.joblib_range = range(0, 2) 37 | cls.path_shelve_file = './shelve' 38 | cls.path_sqlite3_persistent = './temp_db.sqlite3' 39 | 40 | @classmethod 41 | def tearDownClass(cls): 42 | os.remove(cls.path_sqlite3_persistent) 43 | 44 | def test_interface_shelve(self): 45 | """パラメタ条件を組み合わせてテストを実行する  46 | - cythonモード使う or not 47 | - cacheモード使う or not 48 | - memmapモード使う or not 49 | """ 50 | shelve_obj = PersistentDict(self.path_shelve_file, 'c', 'json') 51 | for key, value in self.input_dict.items(): shelve_obj[key] = value 52 | 53 | sqlite3_dict_obj = SqliteDict(filename=self.path_sqlite3_persistent, autocommit=True) 54 | for key, value in self.input_dict.items(): sqlite3_dict_obj[key] = value 55 | 56 | for method_name in self.method: 57 | for cython_flag in self.bool_cython: 58 | for cache_flag in self.is_use_cache: 59 | for memmap_flag in self.is_use_memmap: 60 | scored_result_persisted = interface.run_feature_selection( 61 | input_dict=shelve_obj, 62 | method=method_name, 63 | use_cython=cython_flag, 64 | is_use_cache=cache_flag, 65 | is_use_memmap=memmap_flag 66 | ) # type: ScoredResultObject 67 | self.assertIsInstance(scored_result_persisted, ScoredResultObject) 68 | self.assertIsInstance(scored_result_persisted.ScoreMatrix2ScoreDictionary(), list) 69 | 70 | scored_result_sqlite3_persisted = interface.run_feature_selection( 71 | input_dict=sqlite3_dict_obj, 72 | method=method_name, use_cython=cython_flag, is_use_cache=cache_flag) # type: ScoredResultObject 73 | self.assertIsInstance(scored_result_sqlite3_persisted, ScoredResultObject) 74 | self.assertIsInstance(scored_result_sqlite3_persisted.ScoreMatrix2ScoreDictionary(), list) 75 | 76 | # You check if result is same between data-source = shelve_obj and data-source = dict-object 77 | scored_result_dict = interface.run_feature_selection( 78 | input_dict=self.input_dict, 79 | method=method_name, use_cython=cython_flag, is_use_cache=cache_flag) # type: ScoredResultObject 80 | self.assertIsInstance(scored_result_dict, ScoredResultObject) 81 | self.assertIsInstance(scored_result_dict.ScoreMatrix2ScoreDictionary(), list) 82 | 83 | numpy.testing.assert_array_equal(scored_result_persisted.scored_matrix.toarray(), 84 | scored_result_dict.scored_matrix.toarray()) 85 | numpy.testing.assert_array_equal(scored_result_sqlite3_persisted.scored_matrix.toarray(), 86 | scored_result_dict.scored_matrix.toarray()) 87 | 88 | 89 | if __name__ == '__main__': 90 | unittest.main() 91 | 
-------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | if NOT "%PAPER%" == "" ( 11 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 12 | ) 13 | 14 | if "%1" == "" goto help 15 | 16 | if "%1" == "help" ( 17 | :help 18 | echo.Please use `make ^` where ^ is one of 19 | echo. html to make standalone HTML files 20 | echo. dirhtml to make HTML files named index.html in directories 21 | echo. singlehtml to make a single large HTML file 22 | echo. pickle to make pickle files 23 | echo. json to make JSON files 24 | echo. htmlhelp to make HTML files and a HTML help project 25 | echo. qthelp to make HTML files and a qthelp project 26 | echo. devhelp to make HTML files and a Devhelp project 27 | echo. epub to make an epub 28 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 29 | echo. text to make text files 30 | echo. man to make manual pages 31 | echo. changes to make an overview over all changed/added/deprecated items 32 | echo. linkcheck to check all external links for integrity 33 | echo. doctest to run all doctests embedded in the documentation if enabled 34 | goto end 35 | ) 36 | 37 | if "%1" == "clean" ( 38 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 39 | del /q /s %BUILDDIR%\* 40 | goto end 41 | ) 42 | 43 | if "%1" == "html" ( 44 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 45 | echo. 46 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 47 | goto end 48 | ) 49 | 50 | if "%1" == "dirhtml" ( 51 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 52 | echo. 53 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 54 | goto end 55 | ) 56 | 57 | if "%1" == "singlehtml" ( 58 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 59 | echo. 60 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 61 | goto end 62 | ) 63 | 64 | if "%1" == "pickle" ( 65 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 66 | echo. 67 | echo.Build finished; now you can process the pickle files. 68 | goto end 69 | ) 70 | 71 | if "%1" == "json" ( 72 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 73 | echo. 74 | echo.Build finished; now you can process the JSON files. 75 | goto end 76 | ) 77 | 78 | if "%1" == "htmlhelp" ( 79 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 80 | echo. 81 | echo.Build finished; now you can run HTML Help Workshop with the ^ 82 | .hhp project file in %BUILDDIR%/htmlhelp. 83 | goto end 84 | ) 85 | 86 | if "%1" == "qthelp" ( 87 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 88 | echo. 89 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 90 | .qhcp project file in %BUILDDIR%/qthelp, like this: 91 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\document-feature-selection.qhcp 92 | echo.To view the help file: 93 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\document-feature-selection.ghc 94 | goto end 95 | ) 96 | 97 | if "%1" == "devhelp" ( 98 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 99 | echo. 100 | echo.Build finished. 
101 | goto end 102 | ) 103 | 104 | if "%1" == "epub" ( 105 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 106 | echo. 107 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 108 | goto end 109 | ) 110 | 111 | if "%1" == "latex" ( 112 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 113 | echo. 114 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 115 | goto end 116 | ) 117 | 118 | if "%1" == "text" ( 119 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 120 | echo. 121 | echo.Build finished. The text files are in %BUILDDIR%/text. 122 | goto end 123 | ) 124 | 125 | if "%1" == "man" ( 126 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 127 | echo. 128 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 129 | goto end 130 | ) 131 | 132 | if "%1" == "changes" ( 133 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 134 | echo. 135 | echo.The overview file is in %BUILDDIR%/changes. 136 | goto end 137 | ) 138 | 139 | if "%1" == "linkcheck" ( 140 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 141 | echo. 142 | echo.Link check complete; look for any errors in the above output ^ 143 | or in %BUILDDIR%/linkcheck/output.txt. 144 | goto end 145 | ) 146 | 147 | if "%1" == "doctest" ( 148 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 149 | echo. 150 | echo.Testing of doctests in the sources finished, look at the ^ 151 | results in %BUILDDIR%/doctest/output.txt. 152 | goto end 153 | ) 154 | 155 | :end -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Setup file for the document-feature-selection project. 3 | """ 4 | 5 | __author__ = 'kensuke-mi' 6 | __version__ = '1.5' 7 | 8 | import sys 9 | import subprocess 10 | from setuptools import setup, find_packages 11 | from distutils.extension import Extension 12 | python_version = sys.version_info 13 | print('python version {}'.format(python_version)) 14 | 15 | 16 | # -------------------------------------------------------------------------------------------------------- 17 | # Flags to compile Cython code or use already compiled code 18 | try: 19 | import Cython 20 | except ImportError: 21 | subprocess.check_call(["python", '-m', 'pip', 'install', 'cython']) 22 | import Cython 23 | 24 | if sys.version_info >= (3, 7): 25 | # if python >= 3.7, Cython must regenerate C++ code again. 26 | import os 27 | if os.path.exists('DocumentFeatureSelection/pmi/pmi_cython.c'): 28 | os.remove('DocumentFeatureSelection/pmi/pmi_cython.c') 29 | if os.path.exists('DocumentFeatureSelection/bns/bns_cython.c'): 30 | os.remove('DocumentFeatureSelection/bns/bns_cython.c') 31 | if os.path.exists('DocumentFeatureSelection/soa/soa_cython.c'): 32 | os.remove('DocumentFeatureSelection/soa/soa_cython.c') 33 | # if python >= 3.7, typing should be installed again. 
34 | subprocess.check_call(["python", '-m', 'pip', 'install', 'typing']) 35 | 36 | cmdclass = {} 37 | ext_modules = [] 38 | from Cython.Distutils import build_ext 39 | 40 | ext_modules += [ 41 | Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.pyx" ],), 42 | Extension("DocumentFeatureSelection.soa.soa_cython", [ "DocumentFeatureSelection/soa/soa_cython.pyx" ],), 43 | Extension("DocumentFeatureSelection.bns.bns_cython", [ "DocumentFeatureSelection/bns/bns_cython.pyx" ],) 44 | ] 45 | cmdclass.update({'build_ext': build_ext}) 46 | 47 | 48 | # -------------------------------------------------------------------------------------------------------- 49 | # try to install numpy automatically because sklearn requires the status where numpy is already installed 50 | try: 51 | import numpy 52 | except ImportError: 53 | use_numpy_include_dirs = False 54 | try: 55 | subprocess.check_call(["python", '-m', 'pip', 'install', 'numpy']) 56 | import numpy 57 | except Exception as e: 58 | raise Exception(e.__str__() + 'We failed to install numpy automatically. \ 59 | Try installing numpy manually or Try anaconda distribution.') 60 | 61 | # -------------------------------------------------------------------------------------------------------- 62 | # try to install scipy automatically because sklearn requires the status where scipy is already installed 63 | try: 64 | import scipy 65 | except ImportError: 66 | try: 67 | subprocess.check_call(["python", '-m', 'pip', 'install', 'scipy']) 68 | import scipy 69 | except Exception as e: 70 | raise Exception(e.__str__() + 'We failed to install scipy automatically. \ 71 | Try installing scipy manually or Try anaconda distribution.') 72 | # -------------------------------------------------------------------------------------------------------- 73 | 74 | 75 | install_requires = ['six', 'setuptools>=1.0', 'joblib', 'numpy', 76 | 'scipy', 'nltk', 'scikit-learn', 'pypandoc', 'cython', 'sqlitedict', 'nose', 77 | 'typing'] 78 | 79 | try: 80 | import pypandoc 81 | long_description = pypandoc.convert('README.md', 'rst') 82 | except(IOError, ImportError): 83 | long_description = open('README.md').read() 84 | 85 | 86 | description = 'Various methods of feature selection from Text Data' 87 | 88 | classifiers = [ 89 | "Development Status :: 5 - Production/Stable", 90 | "License :: OSI Approved :: MIT License", 91 | "Programming Language :: Python", 92 | "Natural Language :: Japanese", 93 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 94 | "Programming Language :: Python :: 3.5" 95 | ] 96 | 97 | setup( 98 | name='DocumentFeatureSelection', 99 | version=__version__, 100 | description=description, 101 | long_description=long_description, 102 | author=__author__, 103 | author_email='kensuke.mit@gmail.com', 104 | license='CeCILL-B', 105 | url='https://github.com/Kensuke-Mitsuzawa/DocumentFeatureSelection', 106 | packages=find_packages(), 107 | include_package_data=True, 108 | zip_safe=False, 109 | test_suite='tests.all_tests.suite', 110 | install_requires=install_requires, 111 | tests_require=install_requires, 112 | setup_requires=['six', 'setuptools>=1.0', 'pip', 'typing', 'cython'], 113 | classifiers=classifiers, 114 | cmdclass=cmdclass, 115 | ext_modules=ext_modules, 116 | include_dirs=[numpy.get_include()] 117 | ) 118 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/common/func_data_converter.py: 
-------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from DocumentFeatureSelection.models import SetDocumentInformation, AvailableInputTypes 3 | from DocumentFeatureSelection.common.utils import init_cache_object 4 | from sklearn.feature_extraction import DictVectorizer 5 | from typing import Dict, List, Tuple, Any, Union 6 | from sqlitedict import SqliteDict 7 | import joblib 8 | import itertools 9 | import tempfile 10 | N_FEATURE_SWITCH_STRATEGY = 1000000 11 | 12 | 13 | 14 | def generate_document_dict(document_key:str, 15 | documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Counter]: 16 | """This function gets Document-frequency count in given list of documents 17 | """ 18 | assert isinstance(documents, list) 19 | feature_frequencies = [Counter(document) for document in documents] 20 | document_frequencies = Counter() 21 | for feat_freq in feature_frequencies: document_frequencies.update(feat_freq.keys()) 22 | 23 | return (document_key, document_frequencies) 24 | 25 | 26 | def make_multi_docs2term_freq_info(labeled_documents:AvailableInputTypes, 27 | is_use_cache:bool=True, 28 | path_work_dir:str=tempfile.mkdtemp()): 29 | """* What u can do 30 | - This function generates information to construct term-frequency matrix 31 | """ 32 | assert isinstance(labeled_documents, (SqliteDict, dict)) 33 | 34 | counted_frequency = [(label, Counter(list(itertools.chain.from_iterable(documents)))) 35 | for label, documents in labeled_documents.items()] 36 | feature_documents = [dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency] 37 | 38 | 39 | if is_use_cache: 40 | dict_matrix_index = init_cache_object('matrix_element_objects', path_work_dir=path_work_dir) 41 | else: 42 | dict_matrix_index = {} 43 | 44 | # use sklearn feature-extraction 45 | vec = DictVectorizer() 46 | dict_matrix_index['matrix_object'] = vec.fit_transform(feature_documents).tocsr() 47 | dict_matrix_index['feature2id'] = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())} 48 | dict_matrix_index['label2id'] = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)} 49 | 50 | return SetDocumentInformation(dict_matrix_index) 51 | 52 | ''' 53 | def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str: 54 | type_flag = None 55 | for document_list in docs: 56 | assert isinstance(document_list, list) 57 | for feature in document_list: 58 | if isinstance(feature, str): 59 | type_flag = 'str' 60 | elif isinstance(feature, tuple): 61 | type_flag = 'tuple' 62 | else: 63 | logger.error(msg=docs) 64 | raise TypeError('Feature object should be either of str or tuple') 65 | return type_flag''' 66 | 67 | 68 | def make_multi_docs2doc_freq_info(labeled_documents:AvailableInputTypes, 69 | n_jobs:int=-1, 70 | path_working_dir:str=tempfile.mkdtemp(), 71 | is_use_cache: bool = True)->SetDocumentInformation: 72 | """* What u can do 73 | - This function generates information for constructing document-frequency matrix. 
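        * Example (added sketch, not from the original docstring)
            - The per-label document-frequency counting this function relies on is done by `generate_document_dict` defined above; the toy input below is made up.
            >>> generate_document_dict("label_a", [["aa", "aa", "bb"], ["aa", "cc"]])
            ('label_a', Counter({'aa': 2, 'bb': 1, 'cc': 1}))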
74 | """ 75 | assert isinstance(labeled_documents, (SqliteDict, dict)) 76 | #type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()]) 77 | #assert len(type_flag)==1 78 | 79 | # todo 高速化を検討すること 80 | counted_frequency = joblib.Parallel(n_jobs=n_jobs)( 81 | joblib.delayed(generate_document_dict)(key, docs) 82 | for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0])) 83 | 84 | ### construct [{}] structure for input of DictVectorizer() ### 85 | seq_feature_documents = (dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency) 86 | 87 | ### Save index-string dictionary 88 | if is_use_cache: 89 | dict_matrix_index = init_cache_object('matrix_element_object', path_working_dir) 90 | else: 91 | dict_matrix_index = {} 92 | 93 | # use sklearn feature-extraction 94 | vec = DictVectorizer() 95 | dict_matrix_index['matrix_object'] = vec.fit_transform(seq_feature_documents).tocsr() 96 | dict_matrix_index['feature2id'] = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())} 97 | dict_matrix_index['label2id'] = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)} 98 | 99 | return SetDocumentInformation(dict_matrix_index) 100 | 101 | 102 | # alias for old versions 103 | multiDocs2TermFreqInfo = make_multi_docs2term_freq_info 104 | multiDocs2DocFreqInfo = make_multi_docs2doc_freq_info 105 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/common/crs_matrix_constructor.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger, StreamHandler 2 | import joblib 3 | import sys 4 | import logging 5 | import numpy 6 | from typing import List, Tuple, Dict 7 | from scipy.sparse import csr_matrix 8 | 9 | logging.basicConfig(format='%(asctime)s %(message)s', 10 | datefmt='%m/%d/%Y %I:%M:%S %p', 11 | level=logging.DEBUG) 12 | logger = getLogger(__name__) 13 | handler = StreamHandler() 14 | logger.addHandler(handler) 15 | 16 | python_version = sys.version_info 17 | __author__ = 'kensuke-mi' 18 | 19 | 20 | class PosTuple(object): 21 | __slots__ = ['doc_id', 'word_id', 'document_frequency'] 22 | def __init__(self, doc_id, word_id, document_frequency): 23 | self.doc_id = doc_id 24 | self.word_id = word_id 25 | self.document_frequency = document_frequency 26 | 27 | 28 | PARAM_JOBLIB_BACKEND = ['multiprocessing', 'threading'] 29 | 30 | def get_data_col_row_values(doc_id:int, word:int, doc_freq:int, vocaburary:numpy.ndarray)->numpy.array: 31 | """* what you can do 32 | - You get array of [document_id, feature_id, value(frequency)] 33 | """ 34 | assert isinstance(vocaburary, numpy.ndarray) 35 | col_element = vocaburary[numpy.where(vocaburary['key']==word)] 36 | assert len(col_element) == 1 37 | col_value = col_element[0]['value'] 38 | # df value is word frequency in documents 39 | df_value = doc_freq 40 | 41 | return numpy.array([doc_id, col_value, df_value]) 42 | 43 | def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj:numpy.ndarray, vocabulary:numpy.ndarray)->numpy.ndarray: 44 | 45 | value_pairs = numpy.array([ 46 | get_data_col_row_values(doc_id=doc_id, word=key_value_tuple['key'], doc_freq=key_value_tuple['value'], vocaburary=vocabulary) 47 | for key_value_tuple 48 | in doc_freq_obj]) 49 | 50 | return value_pairs 51 | 52 | 53 | def make_csr_list(value_position_list:List[numpy.ndarray])->Tuple[List[int], List[int], List[int]]: 54 | data = [] 
55 | row = [] 56 | col = [] 57 | for position_tuple in value_position_list: 58 | row.append(position_tuple[0]) 59 | col.append(position_tuple[1]) 60 | data.append(position_tuple[2]) 61 | 62 | return row, col, data 63 | 64 | 65 | def preprocess_csr_matrix(feature_frequency, vocabulary, n_jobs:int, joblib_backend:str='Parallel'): 66 | """This function makes information to make csr matrix. Data-list/Row-list/Col-list 67 | 68 | :param feature_frequency list: list having dictionary of {feature: frequency} 69 | :param label2id_dict dict: dictionary of {feature: feature_id} 70 | :return: tuple having lists to construct csr matrix 71 | :rtype tuple: 72 | 73 | Example, 74 | 75 | feature_frequency is 76 | >>> [{'some': 1, 'bb': 1, 'hero': 1, 'aa': 3, 'I': 2, 'ok': 1}, {'cc': 1, 'bb': 4, 'ok': 1, 'hero': 2}, {'cc': 4, 'bb': 1, 'xx': 2, 'aa': 1}] 77 | 78 | vocaburary is 79 | >>> {'some': 6, 'bb': 2, 'xx': 7, 'hero': 4, 'aa': 1, 'cc': 3, 'I': 0, 'ok': 5} 80 | 81 | return value is 82 | >>> ([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], [0, 1, 2, 4, 5, 6, 2, 3, 4, 5, 1, 2, 3, 7], [2, 3, 1, 1, 1, 1, 4, 1, 2, 1, 1, 1, 4, 2]) 83 | 84 | """ 85 | if not joblib_backend in PARAM_JOBLIB_BACKEND: 86 | assert Exception('joblib_backend parameter must be either of {}. However your input is {}.'.format(PARAM_JOBLIB_BACKEND, joblib_backend)) 87 | 88 | assert isinstance(feature_frequency, list) 89 | assert isinstance(vocabulary, numpy.ndarray) 90 | assert isinstance(n_jobs, int) 91 | 92 | logger.debug(msg='making tuple pairs for csr matrix with n(process)={}'.format(n_jobs)) 93 | 94 | set_value_position_list = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( 95 | joblib.delayed(SUB_FUNC_make_value_pairs)( 96 | doc_id, 97 | doc_freq_obj, 98 | vocabulary 99 | ) 100 | for doc_id, doc_freq_obj in enumerate(feature_frequency) 101 | ) # type: List[numpy.ndarray] 102 | 103 | # make 2-d list into 1-d list 104 | value_position_list = sorted( 105 | [l for set in set_value_position_list for l in set], 106 | key=lambda pos_tuple: (pos_tuple[0], pos_tuple[1], pos_tuple[2])) 107 | 108 | row, col, data = make_csr_list(value_position_list) 109 | 110 | return row, col, data 111 | 112 | 113 | def make_csr_objects(row, col, data, n_feature, n_docs): 114 | """This is main function of making csr_matrix from given data 115 | 116 | :param row: 117 | :param col: 118 | :param data: 119 | :param n_feature: 120 | :param n_docs: 121 | :return: 122 | """ 123 | assert isinstance(row, list) 124 | assert isinstance(col, list) 125 | assert isinstance(data, list) 126 | assert isinstance(n_feature, int) 127 | assert isinstance(n_docs, int) 128 | 129 | return csr_matrix((data, (row, col)), shape=(n_docs, n_feature)) -------------------------------------------------------------------------------- /tests/test_bns_python3.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from DocumentFeatureSelection.common import data_converter 3 | from DocumentFeatureSelection.common.data_converter import DataCsrMatrix 4 | from DocumentFeatureSelection.bns import bns_python3 5 | from DocumentFeatureSelection.models import ScoredResultObject 6 | from scipy.sparse import csr_matrix 7 | 8 | 9 | class TestBnsPython3(unittest.TestCase): 10 | def setUp(self): 11 | self.correct_input = { 12 | "label_a": [ 13 | ["I", "aa", "aa", "aa", "aa", "aa"], 14 | ["bb", "aa", "aa", "aa", "aa", "aa"], 15 | ["I", "aa", "hero", "some", "ok", "aa"] 16 | ], 17 | "label_b": [ 18 | ["bb", "bb", "bb"], 19 | ["bb", "bb", "bb"], 20 | 
["hero", "ok", "bb"], 21 | ["hero", "cc", "bb"], 22 | ] 23 | } 24 | 25 | def test_fit_transform(self): 26 | 27 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 28 | labeled_documents=self.correct_input, 29 | n_jobs=5 30 | ) 31 | assert isinstance(data_csr_matrix, DataCsrMatrix) 32 | label2id_dict = data_csr_matrix.label2id_dict 33 | csr_matrix_ = data_csr_matrix.csr_matrix_ 34 | n_docs_distribution = data_csr_matrix.n_docs_distribution 35 | vocabulary = data_csr_matrix.vocabulary 36 | 37 | bns_score_csr_matrix = bns_python3.BNS().fit_transform(X=csr_matrix_, 38 | y=None, 39 | unit_distribution=n_docs_distribution, 40 | verbose=True) 41 | assert isinstance(bns_score_csr_matrix, csr_matrix) 42 | 43 | bns_scores_dict = ScoredResultObject( 44 | scored_matrix=bns_score_csr_matrix, 45 | label2id_dict=label2id_dict, 46 | feature2id_dict=vocabulary 47 | ).convert_score_matrix2score_record() 48 | self.assertTrue(bns_scores_dict, list) 49 | #assert isinstance(bns_scores_dict, list) 50 | #import pprint 51 | #pprint.pprint(bns_scores_dict) 52 | 53 | 54 | def test_check_input_error(self): 55 | incorrect_input_dict = { 56 | "label_a": [ 57 | ["I", "aa", "aa", "aa", "aa", "aa"], 58 | ["bb", "aa", "aa", "aa", "aa", "aa"], 59 | ["I", "aa", "hero", "some", "ok", "aa"] 60 | ], 61 | "label_b": [ 62 | ["bb", "bb", "bb"], 63 | ["bb", "bb", "bb"], 64 | ["hero", "ok", "bb"], 65 | ["hero", "cc", "bb"], 66 | ["cc", "cc", "cc"], 67 | ["cc", "cc", "bb"], 68 | ["xx", "xx", "cc"], 69 | ["aa", "xx", "cc"], 70 | ], 71 | "label_c":[ 72 | ["aa", "xx", "cc"] 73 | ] 74 | } 75 | 76 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 77 | labeled_documents=incorrect_input_dict, 78 | n_jobs=5 79 | ) 80 | assert isinstance(data_csr_matrix, DataCsrMatrix) 81 | csr_matrix_ = data_csr_matrix.csr_matrix_ 82 | n_docs_distribution = data_csr_matrix.n_docs_distribution 83 | try: 84 | bns_python3.BNS().fit_transform(X=csr_matrix_, y=None, unit_distribution=n_docs_distribution) 85 | except: 86 | pass 87 | 88 | def test_bns_cython(self): 89 | incorrect_input_dict = { 90 | "label_a": [ 91 | ["I", "aa", "aa", "aa", "aa", "aa"], 92 | ["bb", "aa", "aa", "aa", "aa", "aa"], 93 | ["I", "aa", "hero", "some", "ok", "aa"] 94 | ], 95 | "label_b": [ 96 | ["bb", "bb", "bb"], 97 | ["bb", "bb", "bb"], 98 | ["hero", "ok", "bb"], 99 | ["hero", "cc", "bb"], 100 | ["cc", "cc", "cc"], 101 | ["cc", "cc", "bb"], 102 | ["xx", "xx", "cc"], 103 | ["aa", "xx", "cc"], 104 | ] 105 | } 106 | 107 | data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 108 | labeled_documents=incorrect_input_dict, 109 | n_jobs=5 110 | ) 111 | assert isinstance(data_csr_matrix, DataCsrMatrix) 112 | csr_matrix_ = data_csr_matrix.csr_matrix_ 113 | n_docs_distribution = data_csr_matrix.n_docs_distribution 114 | 115 | result_bns = bns_python3.BNS().fit_transform(X=csr_matrix_, 116 | y=None, 117 | unit_distribution=n_docs_distribution, 118 | use_cython=True) 119 | 120 | 121 | if __name__ == '__main__': 122 | unittest.main() -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " singlehtml to make a single large HTML file" 22 | @echo " pickle to make pickle files" 23 | @echo " json to make JSON files" 24 | @echo " htmlhelp to make HTML files and a HTML help project" 25 | @echo " qthelp to make HTML files and a qthelp project" 26 | @echo " devhelp to make HTML files and a Devhelp project" 27 | @echo " epub to make an epub" 28 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 29 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 30 | @echo " text to make text files" 31 | @echo " man to make manual pages" 32 | @echo " changes to make an overview of all changed/added/deprecated items" 33 | @echo " linkcheck to check all external links for integrity" 34 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 35 | 36 | clean: 37 | -rm -rf $(BUILDDIR)/* 38 | 39 | html: 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | @echo 42 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 43 | 44 | dirhtml: 45 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 48 | 49 | singlehtml: 50 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 51 | @echo 52 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 53 | 54 | pickle: 55 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 56 | @echo 57 | @echo "Build finished; now you can process the pickle files." 58 | 59 | json: 60 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 61 | @echo 62 | @echo "Build finished; now you can process the JSON files." 63 | 64 | htmlhelp: 65 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 66 | @echo 67 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 68 | ".hhp project file in $(BUILDDIR)/htmlhelp." 69 | 70 | qthelp: 71 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 72 | @echo 73 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 74 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 75 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/document-feature-selection.qhcp" 76 | @echo "To view the help file:" 77 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/document-feature-selection.qhc" 78 | 79 | devhelp: 80 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 81 | @echo 82 | @echo "Build finished." 83 | @echo "To view the help file:" 84 | @echo "# mkdir -p $$HOME/.local/share/devhelp/document-feature-selection" 85 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/document-feature-selection" 86 | @echo "# devhelp" 87 | 88 | epub: 89 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 90 | @echo 91 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 92 | 93 | latex: 94 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 95 | @echo 96 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
97 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 98 | "(use \`make latexpdf' here to do that automatically)." 99 | 100 | latexpdf: 101 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 102 | @echo "Running LaTeX files through pdflatex..." 103 | make -C $(BUILDDIR)/latex all-pdf 104 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 105 | 106 | text: 107 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 108 | @echo 109 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 110 | 111 | man: 112 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 113 | @echo 114 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 115 | 116 | changes: 117 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 118 | @echo 119 | @echo "The overview file is in $(BUILDDIR)/changes." 120 | 121 | linkcheck: 122 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 123 | @echo 124 | @echo "Link check complete; look for any errors in the above output " \ 125 | "or in $(BUILDDIR)/linkcheck/output.txt." 126 | 127 | doctest: 128 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 129 | @echo "Testing of doctests in the sources finished, look at the " \ 130 | "results in $(BUILDDIR)/doctest/output.txt." -------------------------------------------------------------------------------- /DocumentFeatureSelection/soa/soa_python3.py: -------------------------------------------------------------------------------- 1 | from scipy.sparse import csr_matrix 2 | from numpy import memmap 3 | from typing import Union 4 | from DocumentFeatureSelection.init_logger import logger 5 | import logging 6 | import joblib 7 | import math 8 | import numpy 9 | 10 | __author__ = 'kensuke-mi' 11 | 12 | 13 | def soa(X:Union[memmap, csr_matrix], 14 | unit_distribution:numpy.ndarray, 15 | n_total_docs:int, 16 | feature_index:int, 17 | sample_index:int, verbose=False): 18 | # X is either of term-frequency matrix per label or document-frequency per label 19 | assert isinstance(X, (memmap, csr_matrix)) 20 | assert isinstance(unit_distribution, numpy.ndarray) 21 | assert isinstance(feature_index, int) 22 | assert isinstance(sample_index, int) 23 | 24 | matrix_size = X.shape 25 | NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] 26 | 27 | # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e 28 | freq_w_e = X[sample_index, feature_index] 29 | # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e 30 | freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() 31 | # freq_e is the number of the unit having specific label e 32 | freq_e = unit_distribution[sample_index] 33 | # freq_not_e is the number of the unit NOT having the specific label e 34 | freq_not_e = n_total_docs - freq_e 35 | 36 | if verbose: 37 | logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) 38 | logging.debug('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format( 39 | freq_w_e, 40 | freq_w_not_e, 41 | freq_e, 42 | freq_not_e 43 | )) 44 | 45 | if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0: 46 | return 0 47 | else: 48 | nominator = (float(freq_w_e) * freq_not_e) 49 | denominator = (float(freq_e) * freq_w_not_e) 50 | ans = nominator / denominator 51 | assert isinstance(ans, float) 52 | soa_val = math.log(ans, 2) 53 | return soa_val 54 | 55 | 56 | class SOA(object): 57 | def 
__init__(self): 58 | pass 59 | 60 | def fit_transform(self, 61 | X: Union[memmap, csr_matrix], 62 | unit_distribution: numpy.ndarray, 63 | n_jobs: int=1, 64 | verbose=False, 65 | joblib_backend: str='multiprocessing', 66 | use_cython: bool=False): 67 | """* What you can do 68 | - Get SOA weighted-score matrix. 69 | - You can get fast-speed with Cython 70 | """ 71 | assert isinstance(X, (memmap, csr_matrix)) 72 | assert isinstance(unit_distribution, numpy.ndarray) 73 | 74 | matrix_size = X.shape 75 | sample_range = list(range(0, matrix_size[0])) 76 | feature_range = list(range(0, matrix_size[1])) 77 | n_total_document = sum(unit_distribution) 78 | 79 | logger.debug(msg='Start calculating SOA') 80 | logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) 81 | 82 | if use_cython: 83 | import pyximport; pyximport.install() 84 | from DocumentFeatureSelection.soa.soa_cython import main 85 | logger.warning(msg='n_jobs parameter is invalid when use_cython=True') 86 | soa_score_csr_source = main(X=X, 87 | n_docs_distribution=unit_distribution, 88 | n_total_doc=n_total_document, 89 | sample_range=sample_range, 90 | feature_range=feature_range, 91 | verbose=False) 92 | else: 93 | self.soa = soa 94 | soa_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( 95 | joblib.delayed(self.docId_word_soa)( 96 | X=X, 97 | unit_distribution=unit_distribution, 98 | feature_index=feature_index, 99 | sample_index=sample_index, 100 | n_total_doc=n_total_document, 101 | verbose=verbose 102 | ) 103 | for sample_index in sample_range 104 | for feature_index in feature_range 105 | ) 106 | 107 | row_list = [t[0] for t in soa_score_csr_source] 108 | col_list = [t[1] for t in soa_score_csr_source] 109 | data_list = [t[2] for t in soa_score_csr_source] 110 | 111 | soa_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)), 112 | shape=(X.shape[0], 113 | X.shape[1])) 114 | 115 | logging.debug(msg='End calculating SOA') 116 | 117 | return soa_featured_csr_matrix 118 | 119 | def docId_word_soa(self, 120 | X: Union[memmap, csr_matrix], 121 | unit_distribution: numpy.ndarray, 122 | n_total_doc: int, 123 | feature_index: int, 124 | sample_index: int, verbose=False): 125 | """ 126 | """ 127 | assert isinstance(X, (memmap, csr_matrix)) 128 | assert isinstance(unit_distribution, numpy.ndarray) 129 | assert isinstance(feature_index, int) 130 | assert isinstance(sample_index, int) 131 | 132 | soa_score = self.soa( 133 | X=X, 134 | unit_distribution=unit_distribution, 135 | feature_index=feature_index, 136 | sample_index=sample_index, 137 | n_total_docs=n_total_doc, 138 | verbose=verbose 139 | ) 140 | return sample_index, feature_index, soa_score 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DocumentFeatureSelection 2 | ========================== 3 | 4 | # what's this? 5 | 6 | This is set of feature selection codes from text data. 7 | (About feature selection, see [here](http://nlp.stanford.edu/IR-book/html/htmledition/feature-selection-1.html) or [here](http://stackoverflow.com/questions/13603882/feature-selection-and-reduction-for-text-classification)) 8 | 9 | The Feature selection is really important when you use machine learning metrics on natural language data. 10 | The natural language data usually contains a lot of noise information, thus machine learning metrics are weak if you don't process any feature selection. 
11 | (There are some exceptions, such as _Decision Tree_ or _Random Forest_, which have a feature-selection mechanism inside the algorithm itself.)
12 | 
13 | Feature selection is also useful when you inspect your text data.
14 | With feature selection, you can see which features really contribute to specific labels.
15 | 
16 | Please visit the [project page on github](https://github.com/Kensuke-Mitsuzawa/DocumentFeatureSelection).
17 | 
18 | If you find any bugs, please report them on the github issue tracker.
19 | 
20 | Pull requests are welcome.
21 | 
22 | ## Supported methods
23 | 
24 | This package provides several feature-selection metrics.
25 | Currently, it supports the following feature-selection methods:
26 | 
27 | * TF-IDF
28 | * Pointwise mutual information (PMI)
29 | * Strength of Association (SOA)
30 | * Bi-Normal Separation (BNS)
31 | 
32 | ## Contributions of this package
33 | 
34 | * Easy interface for pre-processing
35 | * Easy interface for accessing feature-selection methods
36 | * Fast computation thanks to sparse matrices and multi-processing
37 | 
38 | # Overview of methods
39 | 
40 | ## TF-IDF
41 | 
42 | This method simply calls scikit-learn's `TfidfTransformer`.
43 | 
44 | See the [scikit-learn documentation](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html) for detailed information.
45 | 
46 | ## PMI
47 | 
48 | PMI measures the association between a _feature_ (i.e. token) and a _category_ (i.e. label).
49 | Concretely, it builds a _cross table_ (also called a _contingency table_) and computes the joint and marginal probabilities on it.
50 | 
51 | To learn more, see this [reference](https://www.eecis.udel.edu/~trnka/CISC889-11S/lectures/philip-pmi.pdf).
52 | 
53 | In the Python world, [NLTK](http://www.nltk.org/howto/collocations.html) and [another package](https://github.com/Bollegala/svdmi) also provide PMI.
54 | Check them and choose according to your preference and use case.
55 | 
56 | 
57 | ## SOA
58 | 
59 | SOA is a feature-selection method that improves on PMI.
60 | PMI is weak when a feature has a low word frequency.
61 | SOA is based on the PMI computation, but it remains usable on such low-frequency features.
62 | Moreover, it can capture anti-correlation between features and categories.
63 | 
64 | In this package, the SOA formula is taken from the following paper:
65 | 
66 | `Saif Mohammad and Svetlana Kiritchenko, "Using Hashtags to Capture Fine Emotion Categories from Tweets", Computational Intelligence, 01/2014; 31(2).`
67 | 
68 | ```
69 | SOA(w, e) = log_2 \frac{freq(w, e) * freq(\neg e)}{freq(e) * freq(w, \neg e)}
70 | ```
71 | 
72 | where
73 | 
74 | * freq(w, e) is the number of times _w_ occurs in a unit (sentence or document) with label _e_
75 | * freq(w, ¬e) is the number of times _w_ occurs in units that do not have the label _e_
76 | * freq(e) is the number of units having the label _e_
77 | * freq(¬e) is the number of units NOT having the label _e_
78 | 
79 | ## BNS
80 | 
81 | BNS is a feature-selection method for binary-class data.
82 | Several other methods are available for binary-class data, such as _information gain (IG)_, _chi-squared (CHI)_,
83 | and _odds ratio (Odds)_.
84 | 
85 | The problem arises when you run feature selection on skewed data.
86 | Those methods are weak on such skewed data; _BNS_, in contrast, remains effective even when the data is skewed.
87 | The following papers show how BNS behaves on skewed data.
88 | 
89 | ```Lei Tang and Huan Liu, "Bias Analysis in Text Classification for Highly Skewed Data", 2005```
90 | 
91 | or
92 | 
93 | ```George Forman, "An Extensive Empirical Study of Feature Selection Metrics for Text Classification", Journal of Machine Learning Research 3 (2003) 1289-1305```
94 | 
95 | 
96 | # Requirement
97 | 
98 | * Python 3.x (checked under Python 3.5)
99 | 
100 | 
101 | # Setting up
102 | 
103 | ## Install
104 | 
105 | `python setup.py install`
106 | 
107 | ### Note
108 | 
109 | You might see an error message while running this command, such as
110 | 
111 | ```
112 | We failed to install numpy automatically. Try installing numpy manually or Try anaconda distribution.
113 | ```
114 | 
115 | This happens because `setup.py` tries to install numpy and scipy with `pip` but fails.
116 | numpy and scipy must already be present before `scikit-learn` can be installed.
117 | 
118 | In this case, take one of the following options:
119 | 
120 | * Install `numpy` and `scipy` manually
121 | * Use the `anaconda` Python distribution. Please visit [their site](https://www.continuum.io/downloads).
122 | 
123 | # Example
124 | 
125 | ```python
126 | input_dict = {
127 |     "label_a": [
128 |         ["I", "aa", "aa", "aa", "aa", "aa"],
129 |         ["bb", "aa", "aa", "aa", "aa", "aa"],
130 |         ["I", "aa", "hero", "some", "ok", "aa"]
131 |     ],
132 |     "label_b": [
133 |         ["bb", "bb", "bb"],
134 |         ["bb", "bb", "bb"],
135 |         ["hero", "ok", "bb"],
136 |         ["hero", "cc", "bb"],
137 |     ],
138 |     "label_c": [
139 |         ["cc", "cc", "cc"],
140 |         ["cc", "cc", "bb"],
141 |         ["xx", "xx", "cc"],
142 |         ["aa", "xx", "cc"],
143 |     ]
144 | }
145 | 
146 | from DocumentFeatureSelection import interface
147 | interface.run_feature_selection(input_dict, method='pmi', use_cython=True).convert_score_matrix2score_record()
148 | ```
149 | Then you get the result:
150 | 
151 | ```python
152 | [{'score': 0.14976146817207336, 'label': 'label_c', 'feature': 'bb', 'frequency': 1.0}, ...]
153 | ```
154 | 
155 | See the scripts in `examples/`.
156 | 
157 | # For developers
158 | 
159 | You can set up a dev environment with docker-compose.
160 | 
161 | These commands run the tests inside a docker container.
162 | 
163 | ```bash
164 | $ cd tests/
165 | $ docker-compose build
166 | $ docker-compose up
167 | ```
168 | 
-------------------------------------------------------------------------------- /DocumentFeatureSelection/pmi/PMI_python3.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | from __future__ import division 6 | from scipy.sparse import csr_matrix 7 | from numpy import memmap 8 | from typing import Union 9 | from DocumentFeatureSelection.init_logger import logger 10 | 11 | import logging 12 | import joblib 13 | import math 14 | import numpy 15 | 16 | __author__ = 'kensuke-mi' 17 | 18 | 19 | # TODO normzalized pmiの導入 20 | # http://sucrose.hatenablog.com/entry/2014/12/02/235959 21 | 22 | 23 | def pmi(X: Union[csr_matrix, memmap], 24 | n_docs_distribution: numpy.ndarray, 25 | n_total_doc: int, 26 | feature_index: int, 27 | sample_index: int, 28 | verbose=False): 29 | """get PMI score for given feature & sample index 30 | 31 | :param X: 32 | :param feature_index: 33 | :param sample_index: 34 | :return: 35 | """ 36 | assert isinstance(X, (memmap, csr_matrix)) 37 | assert isinstance(n_docs_distribution, numpy.ndarray) 38 | assert isinstance(feature_index, int) 39 | assert isinstance(sample_index, int) 40 | 41 | matrix_size = X.shape 42 | sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] 43 | 44 | # n_11 is #docs having feature(i.e. word) in the specified index(label) 45 | n_11 = X[sample_index, feature_index] 46 | # n_01 is #docs NOT having feature in the specified index(label) 47 | n_01 = n_docs_distribution[sample_index] - n_11 48 | # n_10 is #docs having feature in NOT specified index(indexes except specified index) 49 | n_10 = X[sample_indexes, feature_index].sum() 50 | # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) 51 | n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) 52 | 53 | if verbose: 54 | logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) 55 | logging.debug('n_11:{} n_01:{} n_10:{} n_00:{}'.format( 56 | n_11, 57 | n_01, 58 | n_10, 59 | n_00 60 | )) 61 | 62 | if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: 63 | return 0 64 | else: 65 | temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) 66 | temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) 67 | temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) 68 | temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) 69 | score = temp1 + temp2 + temp3 + temp4 70 | 71 | return score 72 | 73 | 74 | class PMI(object): 75 | def __init__(self): 76 | pass 77 | 78 | def fit_transform(self, 79 | X: Union[csr_matrix, memmap], 80 | n_docs_distribution, 81 | n_jobs=1, 82 | verbose=False, 83 | joblib_backend='multiprocessing', 84 | use_cython:bool=False): 85 | """Main method of PMI class. 
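        * Args (as used elsewhere in this package)
            - X: label-by-feature document-frequency matrix (csr_matrix or numpy.memmap)
            - n_docs_distribution: numpy.ndarray holding the number of documents per label
        * A rough usage sketch added for illustration; the toy counts below are made up.
            >>> import numpy
            >>> from scipy.sparse import csr_matrix
            >>> X = csr_matrix(numpy.array([[2, 1], [1, 3]]))
            >>> scores = PMI().fit_transform(X=X, n_docs_distribution=numpy.array([3, 4]), n_jobs=1)
            >>> scores.shape
            (2, 2)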
86 | """ 87 | assert isinstance(X, (memmap, csr_matrix)) 88 | assert isinstance(n_docs_distribution, numpy.ndarray) 89 | 90 | matrix_size = X.shape 91 | sample_range = list(range(0, matrix_size[0])) 92 | feature_range = list(range(0, matrix_size[1])) 93 | n_total_document = sum(n_docs_distribution) 94 | 95 | logger.debug(msg='Start calculating PMI') 96 | logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) 97 | 98 | if use_cython: 99 | import pyximport; pyximport.install() 100 | from DocumentFeatureSelection.pmi.pmi_cython import main 101 | logger.warning(msg='n_jobs parameter is invalid when use_cython=True') 102 | pmi_score_csr_source = main(X=X, 103 | n_docs_distribution=n_docs_distribution, 104 | sample_range=sample_range, 105 | feature_range=feature_range, 106 | n_total_doc=n_total_document, 107 | verbose=False) 108 | 109 | else: 110 | self.pmi = pmi 111 | pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( 112 | joblib.delayed(self.docId_word_PMI)( 113 | X=X, 114 | n_docs_distribution=n_docs_distribution, 115 | feature_index=feature_index, 116 | sample_index=sample_index, 117 | n_total_doc=n_total_document, 118 | verbose=verbose 119 | ) 120 | for sample_index in sample_range 121 | for feature_index in feature_range 122 | ) 123 | 124 | row_list = [t[0] for t in pmi_score_csr_source] 125 | col_list = [t[1] for t in pmi_score_csr_source] 126 | data_list = [t[2] for t in pmi_score_csr_source] 127 | 128 | pmi_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)), 129 | shape=(X.shape[0], 130 | X.shape[1])) 131 | 132 | logging.debug(msg='End calculating PMI') 133 | 134 | return pmi_featured_csr_matrix 135 | 136 | def docId_word_PMI(self, 137 | X:Union[csr_matrix, memmap], 138 | n_docs_distribution:numpy.ndarray, 139 | n_total_doc:int, 140 | feature_index:int, 141 | sample_index:int, 142 | verbose=False, 143 | use_cython:bool=False): 144 | """Calculate PMI score for fit_format() 145 | 146 | :param X: 147 | :param vocabulary: 148 | :param label_id: 149 | :param word: 150 | :param label: 151 | :return: 152 | """ 153 | pmi_score = self.pmi( 154 | X=X, 155 | n_docs_distribution=n_docs_distribution, 156 | feature_index=feature_index, 157 | sample_index=sample_index, 158 | n_total_doc=n_total_doc, 159 | verbose=verbose 160 | ) 161 | return sample_index, feature_index, pmi_score 162 | 163 | 164 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/bns/bns_python3.py: -------------------------------------------------------------------------------- 1 | from scipy.sparse import csr_matrix 2 | from sklearn.base import TransformerMixin 3 | from scipy.stats import norm 4 | from numpy import ndarray, memmap 5 | from typing import Union 6 | from DocumentFeatureSelection.init_logger import logger 7 | import numpy as np 8 | import joblib 9 | import logging 10 | 11 | 12 | def bns(X:Union[memmap, csr_matrix], 13 | feature_index: int, 14 | sample_index: int, 15 | unit_distribution: np.ndarray, 16 | true_index: int = 0, 17 | verbose: bool = False): 18 | if true_index == 0: 19 | false_index = 1 20 | elif true_index == 1: 21 | false_index = 0 22 | else: 23 | raise Exception('true index must be either of 0 or 1') 24 | 25 | # trueラベルで出現した回数 26 | # tp is frequency of features in the specified positive label 27 | tp = X[true_index, feature_index] 28 | # trueラベルで出現しなかった回数 29 | # fp is frequency of NON-features(expect specified feature) in the specified positive label 30 | fp = 
unit_distribution[true_index] - tp 31 | 32 | # negativeラベルで出現した回数 33 | # fn is frequency of features in the specified negative label 34 | fn = X[false_index, feature_index] 35 | # negativeラベルで出現しなかった回数 36 | # fp is frequency of NON-features(expect specified feature) in the specified negative label 37 | tn = unit_distribution[false_index] - fn 38 | 39 | if tn < 0.0: 40 | print('aaaa') 41 | 42 | pos = tp + fn 43 | neg = fp + tn 44 | 45 | tpr = tp / pos 46 | fpr = fp / neg 47 | 48 | if verbose: 49 | logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) 50 | logging.debug('tp:{} fp:{} fn:{} tn:{} pos:{} neg:{} tpr:{} fpr:{}'.format( 51 | tp, 52 | fp, 53 | fn, 54 | tn, 55 | pos, 56 | neg, 57 | tpr, 58 | fpr 59 | )) 60 | 61 | #bns_score = np.abs(norm.ppf(norm.cdf(tpr)) - norm.ppf(norm.cdf(fpr))) 62 | bns_score = abs(norm.ppf(tpr) - norm.ppf(fpr)) 63 | return bns_score 64 | 65 | 66 | 67 | class BNS(TransformerMixin): 68 | def __init__(self): 69 | pass 70 | 71 | def __check_matrix_form(self, X): 72 | assert isinstance(X, csr_matrix) 73 | matrix_size = X.shape 74 | n_categories = matrix_size[0] 75 | if n_categories != 2: 76 | raise Exception('BNS input must be of 2 categories') 77 | 78 | def fit_transform(self, 79 | X: Union[memmap, csr_matrix], 80 | y=None, 81 | **fit_params): 82 | """* What you can do 83 | 84 | * Args 85 | - X; scipy.csr_matrix or numpy.memmap: Matrix object 86 | 87 | * Params 88 | - unit_distribution; list or ndarray: The number of document frequency per label. Ex. [10, 20] 89 | - n_jobs: The number of cores when you use joblib. 90 | - joblib_backend: "multiprocessing" or "multithreding" 91 | - true_index: The index number of True label. 92 | - use_cython; boolean: True, then Use Cython for computation. False, not. 
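        * A rough usage sketch added for illustration; the toy counts below are made up.
          BNS expects exactly two labels, so X has two rows and unit_distribution has two entries.
            >>> import numpy
            >>> from scipy.sparse import csr_matrix
            >>> X = csr_matrix(numpy.array([[3, 1, 2], [1, 4, 2]]))
            >>> scores = BNS().fit_transform(X, unit_distribution=numpy.array([4, 5]), true_index=0)
            >>> scores.shape
            (2, 3)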
93 | """ 94 | assert isinstance(X, csr_matrix) 95 | 96 | # -------------------------------------------------------- 97 | # Check parameters 98 | if not 'unit_distribution' in fit_params: 99 | raise Exception('You must put unit_distribution parameter') 100 | assert isinstance(fit_params['unit_distribution'], (list, ndarray)) 101 | self.__check_matrix_form(X) 102 | 103 | unit_distribution = fit_params['unit_distribution'] 104 | 105 | if 'n_jobs' in fit_params: 106 | n_jobs = fit_params['n_jobs'] 107 | else: 108 | n_jobs = 1 109 | 110 | if 'true_index' in fit_params: 111 | true_index = fit_params['true_index'] 112 | else: 113 | true_index = 0 114 | 115 | if 'verbose' in fit_params: 116 | verbose = True 117 | else: 118 | verbose = False 119 | 120 | if 'joblib_backend' in fit_params: 121 | joblib_backend = fit_params['joblib_backend'] 122 | else: 123 | joblib_backend = 'multiprocessing' 124 | 125 | if 'use_cython' in fit_params: 126 | is_use_cython = True 127 | else: 128 | is_use_cython = False 129 | # -------------------------------------------------------- 130 | 131 | matrix_size = X.shape 132 | sample_range = list(range(0, matrix_size[0])) 133 | feature_range = list(range(0, matrix_size[1])) 134 | 135 | logger.debug(msg='Start calculating BNS with n(process)={}'.format(n_jobs)) 136 | logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) 137 | 138 | if is_use_cython: 139 | import pyximport; pyximport.install() 140 | from DocumentFeatureSelection.bns.bns_cython import main 141 | logger.warning(msg='n_jobs parameter is invalid when use_cython=True') 142 | bns_score_csr_source = main( 143 | X=X, 144 | unit_distribution=unit_distribution, 145 | sample_range=sample_range, 146 | feature_range=feature_range, 147 | true_index=true_index, 148 | verbose=verbose 149 | ) 150 | else: 151 | bns_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( 152 | joblib.delayed(self.docId_word_BNS)( 153 | X=X, 154 | feature_index=feature_index, 155 | sample_index=sample_index, 156 | true_index=true_index, 157 | unit_distribution=unit_distribution, 158 | verbose=verbose 159 | ) 160 | for sample_index in sample_range 161 | for feature_index in feature_range) 162 | 163 | row_list = [t[0] for t in bns_score_csr_source] 164 | col_list = [t[1] for t in bns_score_csr_source] 165 | data_list = [t[2] for t in bns_score_csr_source] 166 | 167 | bns_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)), 168 | shape=(X.shape[0], 169 | X.shape[1])) 170 | 171 | logging.debug(msg='End calculating BNS') 172 | 173 | return bns_featured_csr_matrix 174 | 175 | def docId_word_BNS(self, X:csr_matrix, 176 | feature_index:int, 177 | sample_index:int, 178 | unit_distribution:np.ndarray, 179 | true_index:int, 180 | verbose=False): 181 | 182 | assert isinstance(X, csr_matrix) 183 | assert isinstance(feature_index, int) 184 | assert isinstance(sample_index, int) 185 | 186 | bns_score = bns( 187 | X=X, 188 | feature_index=feature_index, 189 | sample_index=sample_index, 190 | true_index=true_index, 191 | unit_distribution=unit_distribution, 192 | verbose=verbose 193 | ) 194 | return sample_index, feature_index, bns_score -------------------------------------------------------------------------------- /tests/test_data_converter.py: -------------------------------------------------------------------------------- 1 | from DocumentFeatureSelection.common import data_converter 2 | from DocumentFeatureSelection.common.data_converter import DataCsrMatrix 3 | from scipy.sparse import csr_matrix 4 
| import unittest 5 | import numpy 6 | 7 | 8 | class TestDataConverter(unittest.TestCase): 9 | def setUp(self): 10 | self.input_dict = { 11 | "label_a": [ 12 | ["I", "aa", "aa", "aa", "aa", "aa"], 13 | ["bb", "aa", "aa", "aa", "aa", "aa"], 14 | ["I", "aa", "hero", "some", "ok", "aa"] 15 | ], 16 | "label_b": [ 17 | ["bb", "bb", "bb"], 18 | ["bb", "bb", "bb"], 19 | ["hero", "ok", "bb"], 20 | ["hero", "cc", "bb"], 21 | ], 22 | "label_c": [ 23 | ["cc", "cc", "cc"], 24 | ["cc", "cc", "bb"], 25 | ["xx", "xx", "cc"], 26 | ["aa", "xx", "cc"], 27 | ] 28 | } 29 | 30 | self.input_dict_complex_feature = { 31 | "label_a": [ 32 | [['a', 'b'], ['b', 'c'], ['a', 'b', 'c']], 33 | [['a', 'b'], ['b', 'c']], 34 | [['a', 'b']] 35 | ], 36 | "label_b": [ 37 | [['b', 'c'], ['c', 'd']], 38 | [['b', 'c']], 39 | [['b', 'c', 'd']], 40 | [['b', 'c']], 41 | ], 42 | "label_c": [ 43 | [['c', 'd'], ['a', 'b']], 44 | [['b', 'c'], ['a', 'b', 'c']], 45 | [['b', 'c']] 46 | ] 47 | } 48 | 49 | 50 | def test_check_same_csr_matrix(self): 51 | """複数回の変換を実施して、同一のcsr_matrixになることを確認する 52 | """ 53 | n_joblib_tasks = 2 54 | 55 | data_csr_matrix1 = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 56 | labeled_documents=self.input_dict, 57 | n_jobs=n_joblib_tasks 58 | ) 59 | assert isinstance(data_csr_matrix1, DataCsrMatrix) 60 | csr_matrix_1 = data_csr_matrix1.csr_matrix_ 61 | label_group_dict_1 = data_csr_matrix1.label2id_dict 62 | vocabulary_1 = data_csr_matrix1.vocabulary 63 | n_doc_distri_1 = data_csr_matrix1.n_docs_distribution 64 | n_term_distir_1 = data_csr_matrix1.n_term_freq_distribution 65 | dense_matrix_1 = csr_matrix_1.toarray() 66 | 67 | data_csr_matrix2 = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 68 | labeled_documents=self.input_dict, 69 | n_jobs=n_joblib_tasks 70 | ) 71 | assert isinstance(data_csr_matrix2, DataCsrMatrix) 72 | csr_matrix_2 = data_csr_matrix2.csr_matrix_ 73 | label_group_dict_2 = data_csr_matrix2.label2id_dict 74 | vocabulary_2 = data_csr_matrix2.vocabulary 75 | n_doc_distri_2 = data_csr_matrix2.n_docs_distribution 76 | n_term_distir_2 = data_csr_matrix2.n_term_freq_distribution 77 | dense_matrix_2 = data_csr_matrix2.csr_matrix_.toarray() 78 | 79 | data_csr_matrix3 = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 80 | labeled_documents=self.input_dict, 81 | n_jobs=n_joblib_tasks 82 | ) 83 | assert isinstance(data_csr_matrix3, DataCsrMatrix) 84 | csr_matrix_3 = data_csr_matrix3.csr_matrix_ 85 | label_group_dict_3 = data_csr_matrix3.label2id_dict 86 | vocabulary_3 = data_csr_matrix3.vocabulary 87 | n_doc_distri_3 = data_csr_matrix3.n_docs_distribution 88 | n_term_distir_3 = data_csr_matrix3.n_term_freq_distribution 89 | dense_matrix_3 = data_csr_matrix3.csr_matrix_.toarray() 90 | 91 | assert numpy.array_equal(dense_matrix_1, dense_matrix_2) 92 | assert numpy.array_equal(dense_matrix_2, dense_matrix_3) 93 | assert numpy.array_equal(dense_matrix_1, dense_matrix_3) 94 | 95 | assert vocabulary_1 == vocabulary_2 96 | assert vocabulary_2 == vocabulary_3 97 | assert vocabulary_1 == vocabulary_3 98 | 99 | 100 | 101 | def test_basic_convert_data(self): 102 | """checks it works of not when n_jobs=1, n_process=1 103 | 104 | data convert過程のミスが疑われるので、整合性のチェックをする 105 | 106 | :return: 107 | """ 108 | 109 | csr_matrix_information = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 110 | labeled_documents=self.input_dict, 111 | n_jobs=5 112 | ) 113 | assert isinstance(csr_matrix_information, DataCsrMatrix) 114 | csr_matrix_ = csr_matrix_information.csr_matrix_ 
115 | label_group_dict = csr_matrix_information.label2id_dict 116 | vocabulary = csr_matrix_information.vocabulary 117 | 118 | assert isinstance(csr_matrix_, csr_matrix) 119 | assert isinstance(label_group_dict, dict) 120 | assert isinstance(vocabulary, dict) 121 | 122 | n_correct_sample = 3 123 | n_correct_featute = 8 124 | 125 | assert csr_matrix_.shape[0] == n_correct_sample 126 | assert csr_matrix_.shape[1] == n_correct_featute 127 | 128 | dense_matrix_constructed_matrix = csr_matrix_.toarray() 129 | 130 | # vocaburary id of correct matrix is {'cc': 3, 'aa': 1, 'some': 6, 'xx': 7, 'I': 0, 'ok': 5, 'hero': 4, 'bb': 2} 131 | # label id of correct matrix is {'label_c': 2, 'label_a': 0, 'label_b': 1} 132 | correct_array_numpy = numpy.array( 133 | [[2, 3, 1, 0, 1, 1, 1, 0], 134 | [0, 0, 4, 1, 2, 1, 0, 0], 135 | [0, 1, 1, 4, 0, 0, 0, 2] 136 | ]).astype(numpy.int64) 137 | assert numpy.array_equal(correct_array_numpy, dense_matrix_constructed_matrix) 138 | 139 | 140 | 141 | def test_multi_process_convert_data(self): 142 | """checks if it works or not when n_process is more than 1 143 | 144 | :return: 145 | """ 146 | 147 | data_csr_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 148 | labeled_documents=self.input_dict, 149 | n_jobs=5 150 | ) 151 | 152 | assert isinstance(data_csr_object.csr_matrix_, csr_matrix) 153 | assert isinstance(data_csr_object.label2id_dict, dict) 154 | assert isinstance(data_csr_object.vocabulary, dict) 155 | 156 | def test_complex_feature_convertion(self): 157 | """""" 158 | csr_matrix_information = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( 159 | labeled_documents=self.input_dict_complex_feature, 160 | n_jobs=1 161 | ) 162 | assert isinstance(csr_matrix_information, DataCsrMatrix) 163 | csr_matrix_ = csr_matrix_information.csr_matrix_ 164 | label_group_dict = csr_matrix_information.label2id_dict 165 | vocabulary = csr_matrix_information.vocabulary 166 | 167 | assert isinstance(csr_matrix_, csr_matrix) 168 | assert isinstance(label_group_dict, dict) 169 | assert isinstance(vocabulary, dict) 170 | 171 | n_correct_sample = 3 172 | n_correct_feature = 5 173 | 174 | assert csr_matrix_.shape[0] == n_correct_sample 175 | assert csr_matrix_.shape[1] == n_correct_feature 176 | 177 | dense_matrix_constructed_matrix = csr_matrix_.toarray() 178 | 179 | # vocaburary id of correct matrix is {'cc': 3, 'aa': 1, 'some': 6, 'xx': 7, 'I': 0, 'ok': 5, 'hero': 4, 'bb': 2} 180 | # label id of correct matrix is {'label_c': 2, 'label_a': 0, 'label_b': 1} 181 | correct_array_numpy = numpy.array( 182 | [[1.0, 3.0, 0.0, 2.0, 0.0], 183 | [0.0, 0.0, 1.0, 3.0, 1.0], 184 | [1.0, 1.0, 0.0, 2.0, 1.0], 185 | ]).astype(numpy.int64) 186 | assert numpy.array_equal(correct_array_numpy, dense_matrix_constructed_matrix) 187 | 188 | 189 | if __name__ == '__main__': 190 | unittest.main() 191 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/interface.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | from DocumentFeatureSelection.models import DataCsrMatrix, ScoredResultObject, AvailableInputTypes 3 | from DocumentFeatureSelection.common import data_converter 4 | from DocumentFeatureSelection.soa.soa_python3 import SOA 5 | from DocumentFeatureSelection.pmi.PMI_python3 import PMI 6 | from DocumentFeatureSelection.tf_idf.tf_idf import TFIDF 7 | from DocumentFeatureSelection.bns.bns_python3 import BNS 8 | from DocumentFeatureSelection.init_logger import logger 9 | from tempfile import mkdtemp 10 | from typing import Dict 11 | from scipy.sparse.csr import csr_matrix 12 | import shutil 13 | METHOD_NAMES = ['soa', 'pmi', 'tf_idf', 'bns'] 14 | N_FEATURE_SWITCH_STRATEGY = 1000000 15 | 16 | 17 | def decide_joblib_strategy(feature2id_dict: Dict[str, int])->str: 18 | if len(feature2id_dict) > N_FEATURE_SWITCH_STRATEGY: 19 | return 'threading' 20 | else: 21 | return 'multiprocessing' 22 | 23 | 24 | def run_feature_selection(input_dict: AvailableInputTypes, 25 | method: str, 26 | use_cython: bool=False, 27 | is_use_cache: bool=False, 28 | is_use_memmap: bool=False, 29 | cache_backend: str='PersistentDict', 30 | path_working_dir: str=None, 31 | matrix_form=None, 32 | n_jobs: int=1)->ScoredResultObject: 33 | """An interface function of the DocumentFeatureSelection package. 34 | 35 | * Args 36 | - input_dict: Dict-object which has category-name as key and list of features as value. 37 | - You can put dict or sqlitedict.SqliteDict, or DocumentFeatureSelection.models.PersistentDict 38 | - method: A method name of the feature selection metric 39 | - use_cython: boolean flag to use cython code for computation. 40 | It's much faster to use cython than native-python code 41 | - is_use_cache: boolean flag to use the disk-drive for keeping objects which tend to be huge. 42 | - is_use_memmap: boolean flag to use memmap for keeping the matrix object. 43 | - path_working_dir: str object. 44 | - The path to a directory where cache files or the memmap matrix object are saved. If you leave it None, 45 | a temporary directory is created and files are saved there. 46 | - cache_backend 47 | - Name of the cache backend used when is_use_cache is True. [PersistentDict, SqliteDict] 48 | - matrix_form: if 'term_freq' is given together with method='soa', SOA is computed on a term-frequency matrix instead of a document-frequency matrix. 49 | """ 50 | if method not in METHOD_NAMES: 51 | raise Exception('method name must be either of {}. Yours: {}'.format(METHOD_NAMES, method)) 52 | 53 | if (is_use_cache or is_use_memmap) and path_working_dir is None: 54 | path_working_dir = mkdtemp() 55 | logger.info("Temporary files are created under {}".format(path_working_dir)) 56 | 57 | if method == 'tf_idf': 58 | """You get scored-matrix with term-frequency. 59 | ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix 60 | """ 61 | matrix_data_object = data_converter.DataConverter().convert_multi_docs2term_frequency_matrix( 62 | labeled_documents=input_dict, 63 | n_jobs=n_jobs, 64 | is_use_cache=is_use_cache, 65 | is_use_memmap=is_use_memmap, 66 | path_working_dir=path_working_dir, 67 | cache_backend=cache_backend 68 | ) 69 | assert isinstance(matrix_data_object, DataCsrMatrix) 70 | 71 | scored_sparse_matrix = TFIDF().fit_transform(X=matrix_data_object.csr_matrix_) 72 | assert isinstance(scored_sparse_matrix, csr_matrix) 73 | 74 | elif method in ['soa', 'pmi'] and matrix_form is None: 75 | """You get scored-matrix with either of soa or pmi. 
76 | """ 77 | matrix_data_object = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 78 | labeled_documents=input_dict, 79 | n_jobs=n_jobs, 80 | is_use_cache=is_use_cache, 81 | is_use_memmap=is_use_memmap, 82 | path_working_dir=path_working_dir 83 | ) 84 | assert isinstance(matrix_data_object, DataCsrMatrix) 85 | if method == 'pmi': 86 | backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) 87 | scored_sparse_matrix = PMI().fit_transform(X=matrix_data_object.csr_matrix_, 88 | n_docs_distribution=matrix_data_object.n_docs_distribution, 89 | n_jobs=n_jobs, 90 | joblib_backend=backend_strategy, 91 | use_cython=use_cython) 92 | assert isinstance(scored_sparse_matrix, csr_matrix) 93 | elif method == 'soa': 94 | backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) 95 | scored_sparse_matrix = SOA().fit_transform(X=matrix_data_object.csr_matrix_, 96 | unit_distribution=matrix_data_object.n_docs_distribution, 97 | n_jobs=n_jobs, 98 | joblib_backend=backend_strategy, 99 | use_cython=use_cython) 100 | assert isinstance(scored_sparse_matrix, csr_matrix) 101 | else: 102 | raise Exception() 103 | 104 | elif method == 'soa' and matrix_form == 'term_freq': 105 | # You get score-matrix with soa from term-frequency matrix. 106 | # ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix 107 | matrix_data_object = data_converter.DataConverter().convert_multi_docs2term_frequency_matrix( 108 | labeled_documents=input_dict, 109 | n_jobs=n_jobs, 110 | is_use_cache=is_use_cache, 111 | is_use_memmap=is_use_memmap, 112 | path_working_dir=path_working_dir 113 | ) 114 | assert isinstance(matrix_data_object, DataCsrMatrix) 115 | 116 | backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) 117 | scored_sparse_matrix = SOA().fit_transform(X=matrix_data_object.csr_matrix_, 118 | unit_distribution=matrix_data_object.n_docs_distribution, 119 | n_jobs=n_jobs, 120 | joblib_backend=backend_strategy) 121 | assert isinstance(scored_sparse_matrix, csr_matrix) 122 | 123 | elif method == 'bns': 124 | # You get scored-matrix with bns. 125 | # ATTENTION: #label should be 2 always. 
126 | # Consider shorter label name as positive label 127 | # (positive and negative do NOT have any meaning in this context) # 128 | positive_label_name = sorted(input_dict.keys(), key=lambda x: len(x))[0] 129 | 130 | if len(input_dict.keys()) >= 3: 131 | raise KeyError('input_dict must not have more than 2 keys if you would like to use BNS.') 132 | 133 | matrix_data_object = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix( 134 | labeled_documents=input_dict, 135 | n_jobs=n_jobs, 136 | is_use_cache=is_use_cache, 137 | is_use_memmap=is_use_memmap, 138 | path_working_dir=path_working_dir 139 | ) 140 | assert isinstance(matrix_data_object, DataCsrMatrix) 141 | 142 | true_class_index = matrix_data_object.label2id_dict[positive_label_name] 143 | backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) 144 | scored_sparse_matrix = BNS().fit_transform( 145 | X=matrix_data_object.csr_matrix_, 146 | unit_distribution=matrix_data_object.n_term_freq_distribution, 147 | n_jobs=n_jobs, 148 | true_index=true_class_index, 149 | joblib_backend=backend_strategy, 150 | use_cython=use_cython 151 | ) 152 | assert isinstance(scored_sparse_matrix, csr_matrix) 153 | else: 154 | raise Exception() 155 | logger.info('Done computation.') 156 | 157 | # delete tmp file directory 158 | if is_use_cache or is_use_memmap: 159 | logger.debug("Delete temporary files {}".format(path_working_dir)) 160 | shutil.rmtree(path_working_dir) 161 | 162 | return ScoredResultObject( 163 | scored_matrix=scored_sparse_matrix, 164 | label2id_dict=matrix_data_object.label2id_dict, 165 | feature2id_dict=matrix_data_object.vocabulary, 166 | method=method, 167 | matrix_form=matrix_form, 168 | frequency_matrix=matrix_data_object.csr_matrix_) 169 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """document-feature-selection documentation build configuration file, created by 3 | sphinx-quickstart on Wed Feb 13 11:51:12 2013. 4 | 5 | This file is execfile()d with the current directory set to its containing dir. 6 | 7 | Note that not all possible configuration values are present in this 8 | autogenerated file. 9 | 10 | All configuration values have a default; values that are commented out 11 | serve to show the default. """ 12 | from __future__ import unicode_literals 13 | __author__ = 'kensuke-mi' 14 | # 15 | import sys 16 | import os.path 17 | 18 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) 19 | from DocumentFeatureSelection import __version__ as version_orig 20 | # If extensions (or modules to document with autodoc) are in another directory, 21 | # add these directories to sys.path here. If the directory is relative to the 22 | # documentation root, use os.path.abspath to make it absolute, like shown here. 23 | # sys.path.insert(0, os.path.abspath('.')) 24 | 25 | # -- General configuration ----------------------------------------------------- 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be extensions 31 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
32 | extensions = [ 33 | 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo', 34 | 'sphinx.ext.coverage', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode', 35 | 'sphinx.ext.intersphinx', 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix of source filenames. 42 | source_suffix = '.rst' 43 | 44 | # The encoding of source files. 45 | # source_encoding = 'utf-8-sig' 46 | 47 | # The master toctree document. 48 | master_doc = 'index' 49 | 50 | # General information about the project. 51 | project = 'document-feature-selection' 52 | copyright = '2015, kensuke-mi' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = version_orig 60 | # The full version, including alpha/beta/rc tags. 61 | release = version_orig 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # language = None 66 | 67 | # There are two options for replacing |today|: either, you set today to some 68 | # non-false value, then it is used: 69 | # today = '' 70 | # Else, today_fmt is used as the format for a strftime call. 71 | # today_fmt = '%B %d, %Y' 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | exclude_patterns = [] 76 | 77 | # The reST default role (used for this markup: `text`) to use for all documents. 78 | # default_role = None 79 | 80 | # If true, '()' will be appended to :func: etc. cross-reference text. 81 | # add_function_parentheses = True 82 | 83 | # If true, the current module name will be prepended to all description 84 | # unit titles (such as .. function::). 85 | # add_module_names = True 86 | 87 | # If true, sectionauthor and moduleauthor directives will be shown in the 88 | # output. They are ignored by default. 89 | # show_authors = False 90 | 91 | # The name of the Pygments (syntax highlighting) style to use. 92 | pygments_style = 'sphinx' 93 | 94 | # A list of ignored prefixes for module index sorting. 95 | # modindex_common_prefix = [] 96 | 97 | 98 | # -- Options for HTML output --------------------------------------------------- 99 | 100 | # The theme to use for HTML and HTML Help pages. See the documentation for 101 | # a list of builtin themes. 102 | html_theme = 'sphinx_rtd_theme' 103 | 104 | # Theme options are theme-specific and customize the look and feel of a theme 105 | # further. For a list of options available for each theme, see the 106 | # documentation. 107 | # html_theme_options = {'bgcolor': '# FCFCFC; background: # FCFCFC url("fond_article.png") no-repeat center 10em; ', } 108 | 109 | # Add any paths that contain custom themes here, relative to this directory. 110 | # html_theme_path = [] 111 | 112 | # The name for this set of Sphinx documents. If None, it defaults to 113 | # " v documentation". 114 | # html_title = None 115 | 116 | # A shorter title for the navigation bar. Default is the same as html_title. 117 | # html_short_title = None 118 | 119 | # The name of an image file (relative to this directory) to place at the top 120 | # of the sidebar. 121 | html_logo = 'logo_principal.png' 122 | 123 | # The name of an image file (within the static path) to use as favicon of the 124 | # docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 125 | # pixels large. 126 | # html_favicon = None 127 | 128 | # Add any paths that contain custom static files (such as style sheets) here, 129 | # relative to this directory. They are copied after the builtin static files, 130 | # so a file named "default.css" will overwrite the builtin "default.css". 131 | html_static_path = ['_static'] 132 | 133 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 134 | # using the given strftime format. 135 | # html_last_updated_fmt = '%b %d, %Y' 136 | 137 | # If true, SmartyPants will be used to convert quotes and dashes to 138 | # typographically correct entities. 139 | # html_use_smartypants = True 140 | 141 | # Custom sidebar templates, maps document names to template names. 142 | # html_sidebars = {} 143 | 144 | # Additional templates that should be rendered to pages, maps page names to 145 | # template names. 146 | # html_additional_pages = {} 147 | 148 | # If false, no module index is generated. 149 | # html_domain_indices = True 150 | 151 | # If false, no index is generated. 152 | # html_use_index = True 153 | 154 | # If true, the index is split into individual pages for each letter. 155 | # html_split_index = False 156 | 157 | # If true, links to the reST sources are added to the pages. 158 | # html_show_sourcelink = True 159 | 160 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 161 | # html_show_sphinx = True 162 | 163 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 164 | # html_show_copyright = True 165 | 166 | # If true, an OpenSearch description file will be output, and all pages will 167 | # contain a tag referring to it. The value of this option must be the 168 | # base URL from which the finished HTML is served. 169 | # html_use_opensearch = '' 170 | 171 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 172 | # html_file_suffix = None 173 | 174 | # Output file base name for HTML help builder. 175 | htmlhelp_basename = 'document-feature-selectiondoc' 176 | 177 | 178 | # -- Options for LaTeX output -------------------------------------------------- 179 | 180 | # The paper size ('letter' or 'a4'). 181 | # latex_paper_size = 'letter' 182 | 183 | # The font size ('10pt', '11pt' or '12pt'). 184 | # latex_font_size = '10pt' 185 | 186 | # Grouping the document tree into LaTeX files. List of tuples 187 | # (source start file, target name, title, author, documentclass [howto/manual]). 188 | latex_documents = [ 189 | ('index', 'document-feature-selection.tex', 'document-feature-selection Documentation', 190 | 'kensuke-mi', 'manual'), 191 | ] 192 | 193 | # The name of an image file (relative to this directory) to place at the top of 194 | # the title page. 195 | # latex_logo = None 196 | 197 | # For "manual" documents, if this is true, then toplevel headings are parts, 198 | # not chapters. 199 | # latex_use_parts = False 200 | 201 | # If true, show page references after internal links. 202 | # latex_show_pagerefs = False 203 | 204 | # If true, show URL addresses after external links. 205 | # latex_show_urls = False 206 | 207 | # Additional stuff for the LaTeX preamble. 208 | # latex_preamble = '' 209 | 210 | # Documents to append as an appendix to all manuals. 211 | # latex_appendices = [] 212 | 213 | # If false, no module index is generated. 
214 | # latex_domain_indices = True 215 | 216 | 217 | # -- Options for manual page output -------------------------------------------- 218 | 219 | # One entry per manual page. List of tuples 220 | # (source start file, name, description, authors, manual section). 221 | man_pages = [ 222 | ('index', 'document-feature-selection', 'document-feature-selection Documentation', 223 | ['kensuke-mi'], 1) 224 | ] 225 | 226 | 227 | # -- Options for Epub output --------------------------------------------------- 228 | 229 | # Bibliographic Dublin Core info. 230 | epub_title = 'document-feature-selection' 231 | epub_author = 'kensuke-mi' 232 | epub_publisher = 'unknown' 233 | epub_copyright = '2015, kensuke-mi' 234 | 235 | # The language of the text. It defaults to the language option 236 | # or en if the language is not set. 237 | # epub_language = '' 238 | 239 | # The scheme of the identifier. Typical schemes are ISBN or URL. 240 | # epub_scheme = '' 241 | 242 | # The unique identifier of the text. This can be a ISBN number 243 | # or the project homepage. 244 | # epub_identifier = '' 245 | 246 | # A unique identification for the text. 247 | # epub_uid = '' 248 | 249 | # HTML files that should be inserted before the pages created by sphinx. 250 | # The format is a list of tuples containing the path and title. 251 | # epub_pre_files = [] 252 | 253 | # HTML files shat should be inserted after the pages created by sphinx. 254 | # The format is a list of tuples containing the path and title. 255 | # epub_post_files = [] 256 | 257 | # A list of files that should not be packed into the epub file. 258 | # epub_exclude_files = [] 259 | 260 | # The depth of the table of contents in toc.ncx. 261 | # epub_tocdepth = 3 262 | 263 | # Allow duplicate toc entries. 264 | # epub_tocdup = True 265 | # 266 | intersphinx_mapping = {} 267 | 268 | 269 | 270 | if __name__ == '__main__': 271 | import doctest 272 | doctest.testmod() 273 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/common/data_converter.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | from __future__ import division 6 | from DocumentFeatureSelection.common import utils, func_data_converter 7 | from DocumentFeatureSelection.models import DataCsrMatrix, AvailableInputTypes, PersistentDict 8 | from DocumentFeatureSelection.init_logger import logger 9 | from sqlitedict import SqliteDict 10 | import sys 11 | import numpy 12 | import tempfile 13 | import json 14 | from typing import Dict, List, Tuple, Any, Union 15 | python_version = sys.version_info 16 | 17 | __author__ = 'kensuke-mi' 18 | 19 | 20 | class DataConverter(object): 21 | """This class is for converting data type from dict-object into DataCsrMatrix-object which saves information of matrix. 22 | """ 23 | def __init__(self): 24 | # for keeping old version 25 | self.labeledMultiDocs2TermFreqMatrix = self.convert_multi_docs2term_frequency_matrix 26 | self.labeledMultiDocs2DocFreqMatrix = self.convert_multi_docs2document_frequency_matrix 27 | 28 | def count_term_frequency_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int]): 29 | """Count term-distribution per label. 
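For illustration, with a hypothetical input (not taken from this package's test data):
>>> # labeled_documents = {'label_a': [['a', 'b'], ['a']], 'label_b': [['c']]}, label2id = {'label_a': 0, 'label_b': 1}
>>> # -> numpy.array([3, 1]), because label_a holds 3 tokens in total and label_b holds 1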
30 | """ 31 | assert isinstance(labeled_documents, (SqliteDict, dict)) 32 | assert isinstance(label2id, dict) 33 | 34 | # count total term-frequency per label 35 | term_frequency_distribution = { 36 | label: len(list(utils.flatten(document_lists))) 37 | for label, document_lists 38 | in labeled_documents.items() 39 | } 40 | 41 | # make list of distribution 42 | term_frequency_distribution_list = [0] * len(labeled_documents) 43 | 44 | for label_string, n_doc in term_frequency_distribution.items(): 45 | #term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] 46 | term_index = label2id[label_string] 47 | term_frequency_distribution_list[term_index] = n_doc 48 | 49 | return numpy.array(term_frequency_distribution_list, dtype='i8') 50 | 51 | def count_document_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int])->numpy.ndarray: 52 | """This method count n(docs) per label. 53 | """ 54 | assert isinstance(labeled_documents, (SqliteDict, dict)) 55 | assert isinstance(label2id, dict) 56 | 57 | # count n(docs) per label 58 | n_doc_distribution = { 59 | label: len(document_lists) 60 | for label, document_lists 61 | in labeled_documents.items() 62 | } 63 | 64 | # make list of distribution 65 | n_doc_distribution_list = [0] * len(labeled_documents) 66 | 67 | for label_string, n_doc in n_doc_distribution.items(): 68 | #docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] 69 | docs_index = label2id[label_string] 70 | n_doc_distribution_list[docs_index] = n_doc 71 | 72 | return numpy.array(n_doc_distribution_list, dtype='i8') 73 | 74 | def __make_feature_object2json_string(self, seq_feature_in_doc:List[Union[str,List[str],Tuple[str,...]]])->List[str]: 75 | """Sub-method of make_feature_object2json_string()""" 76 | replaced_seq_feature_in_doc = [None] * len(seq_feature_in_doc) # type: List[str] 77 | for i, feature_object in enumerate(seq_feature_in_doc): 78 | if isinstance(feature_object, str): 79 | replaced_seq_feature_in_doc[i] = json.dumps(tuple([feature_object]), ensure_ascii=False) 80 | elif isinstance(feature_object, (tuple, list)): 81 | replaced_seq_feature_in_doc[i] = json.dumps(feature_object, ensure_ascii=False) 82 | else: 83 | raise Exception("feature type must be either of str,list,tuple. Detected={}".format(type(feature_object))) 84 | else: 85 | return replaced_seq_feature_in_doc 86 | 87 | def make_feature_object2json_string(self, labeled_document:AvailableInputTypes)->Dict[str,AvailableInputTypes]: 88 | """* What u can do 89 | - This function converts feature-object in sequence object into json string. 90 | - This function make every object into json string. 91 | - string object -> json array which has one string. Ex. "feature" -> '["feature"]' 92 | - list object -> json array. Ex. ["feature", "feature"] -> '["feature", "feature"]' 93 | - tuple object -> json array. Ex. ("feature", "feature") -> '["feature", "feature"]' 94 | * Parameters 95 | - labeled_document: dict object which has key of 'label-name', and value is 2-dim list of features. 
96 | 97 | """ 98 | assert isinstance(labeled_document, (dict,PersistentDict,SqliteDict)) 99 | replaced_labeled_document = {key: [] for key in labeled_document} 100 | for key, docs_in_label in labeled_document.items(): 101 | assert isinstance(docs_in_label, list) 102 | replaced_docs_in_label = [None] * len(docs_in_label) 103 | for i, doc_label in enumerate(docs_in_label): 104 | replaced_docs_in_label[i] = self.__make_feature_object2json_string(doc_label) 105 | else: 106 | replaced_labeled_document[key] = replaced_docs_in_label 107 | else: 108 | return replaced_labeled_document 109 | 110 | def convert_multi_docs2term_frequency_matrix(self, 111 | labeled_documents: AvailableInputTypes, 112 | is_use_cache: bool = False, 113 | is_use_memmap: bool = False, 114 | path_working_dir: str = tempfile.mkdtemp(), 115 | cache_backend: str = 'PersistentDict', 116 | n_jobs: int = 1): 117 | """* What you can do 118 | - This function makes TERM-frequency matrix for TF-IDF calculation. 119 | - TERM-frequency matrix is scipy.csr_matrix. 120 | 121 | * Params 122 | - labeled_documents: Dict object which has category-name as key, and list of features as value 123 | - is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge. 124 | - path_working_dir: path to directory for saving cache files 125 | """ 126 | labeled_documents = self.make_feature_object2json_string(labeled_documents) 127 | 128 | logger.debug(msg='Now pre-processing before CSR matrix') 129 | # convert data structure 130 | set_document_information = func_data_converter.make_multi_docs2term_freq_info(labeled_documents) 131 | 132 | # count n(docs) per label 133 | n_docs_distribution = self.count_document_distribution( 134 | labeled_documents=labeled_documents, 135 | label2id=set_document_information.label2id 136 | ) 137 | # count term-frequency per label 138 | term_frequency_distribution = self.count_term_frequency_distribution( 139 | labeled_documents=labeled_documents, 140 | label2id=set_document_information.label2id 141 | ) 142 | 143 | return DataCsrMatrix( 144 | csr_matrix_=set_document_information.matrix_object, 145 | label2id_dict=set_document_information.label2id, 146 | vocabulary=set_document_information.feature2id, 147 | n_docs_distribution=n_docs_distribution, 148 | n_term_freq_distribution=term_frequency_distribution, 149 | is_use_cache=is_use_cache, 150 | is_use_memmap=is_use_memmap, 151 | path_working_dir=path_working_dir, 152 | cache_backend=cache_backend 153 | ) 154 | 155 | def convert_multi_docs2document_frequency_matrix(self, 156 | labeled_documents:AvailableInputTypes, 157 | is_use_cache:bool=False, 158 | is_use_memmap:bool=False, 159 | path_working_dir:str=None, 160 | n_jobs:int=1)->DataCsrMatrix: 161 | """This function makes document-frequency matrix. Document-frequency matrix is scipy.csr_matrix. 162 | 163 | * Input object 164 | - "labeled_structure" is either of Dict object or shelve.DbfilenameShelf. The example format is below 165 | >>> {"label_a": [["I", "aa", "aa", "aa", "aa", "aa"],["bb", "aa", "aa", "aa", "aa", "aa"],["I", "aa", "hero", "some", "ok", "aa"]], 166 | >>> "label_b": [["bb", "bb", "bb"],["bb", "bb", "bb"],["hero", "ok", "bb"],["hero", "cc", "bb"],], 167 | >>> "label_c": [["cc", "cc", "cc"],["cc", "cc", "bb"],["xx", "xx", "cc"],["aa", "xx", "cc"],]} 168 | 169 | * Output 170 | - DataCsrMatrix object. 
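* Usage sketch (illustrative; input_dict is a labeled-documents dict as in the example above)
>>> # matrix_obj = DataConverter().labeledMultiDocs2DocFreqMatrix(labeled_documents=input_dict, n_jobs=1)
>>> # matrix_obj.csr_matrix_ # scipy csr_matrix, one row per label, one column per feature
>>> # matrix_obj.label2id_dict # e.g. {'label_a': 0, 'label_b': 1, 'label_c': 2}
>>> # matrix_obj.vocabulary # feature -> column index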
171 | """ 172 | labeled_documents = self.make_feature_object2json_string(labeled_documents) 173 | 174 | logger.debug(msg='Now pre-processing before CSR matrix') 175 | # convert data structure 176 | set_document_information = func_data_converter.make_multi_docs2doc_freq_info(labeled_documents,n_jobs=n_jobs) 177 | assert isinstance(set_document_information, func_data_converter.SetDocumentInformation) 178 | 179 | # count n(docs) per label 180 | n_docs_distribution = self.count_document_distribution( 181 | labeled_documents=labeled_documents, 182 | label2id=set_document_information.label2id 183 | ) 184 | # count term-frequency per label 185 | term_frequency_distribution = self.count_term_frequency_distribution( 186 | labeled_documents=labeled_documents, 187 | label2id=set_document_information.label2id 188 | ) 189 | return DataCsrMatrix( 190 | csr_matrix_=set_document_information.matrix_object, 191 | label2id_dict=set_document_information.label2id, 192 | vocabulary=set_document_information.feature2id, 193 | n_docs_distribution=n_docs_distribution, 194 | n_term_freq_distribution=term_frequency_distribution, 195 | is_use_cache=is_use_cache, 196 | is_use_memmap=is_use_memmap, 197 | path_working_dir=path_working_dir 198 | ) 199 | -------------------------------------------------------------------------------- /DocumentFeatureSelection/models.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Union, Any, TypeVar 2 | from scipy.sparse.csr import csr_matrix 3 | from numpy import memmap 4 | from sqlitedict import SqliteDict 5 | from tempfile import mkdtemp 6 | from DocumentFeatureSelection.init_logger import logger 7 | from numpy import ndarray, int32, int64 8 | import pickle 9 | import json 10 | import csv 11 | import os 12 | import shutil 13 | 14 | 15 | # this class is from https://code.activestate.com/recipes/576642/ 16 | class PersistentDict(dict): 17 | ''' Persistent dictionary with an API compatible with shelve and anydbm. 18 | 19 | The dict is kept in memory, so the dictionary operations run as fast as 20 | a regular dictionary. 21 | 22 | Write to disk is delayed until close or sync (similar to gdbm's fast mode). 23 | 24 | Input file format is automatically discovered. 25 | Output file format is selectable between pickle, json, and csv. 26 | All three serialization formats are backed by fast C implementations. 
27 | 28 | ''' 29 | 30 | def __init__(self, filename, flag='c', mode=None, format='pickle', *args, **kwds): 31 | self.flag = flag # r=readonly, c=create, or n=new 32 | self.mode = mode # None or an octal triple like 0644 33 | self.format = format # 'csv', 'json', or 'pickle' 34 | self.filename = filename 35 | if flag != 'n' and os.access(filename, os.R_OK): 36 | fileobj = open(filename, 'rb' if format=='pickle' else 'r') 37 | with fileobj: 38 | self.load(fileobj) 39 | dict.__init__(self, *args, **kwds) 40 | 41 | def sync(self): 42 | 'Write dict to disk' 43 | if self.flag == 'r': 44 | return 45 | filename = self.filename 46 | tempname = filename + '.tmp' 47 | fileobj = open(tempname, 'wb' if self.format=='pickle' else 'w') 48 | try: 49 | self.dump(fileobj) 50 | except Exception: 51 | os.remove(tempname) 52 | raise 53 | finally: 54 | fileobj.close() 55 | shutil.move(tempname, self.filename) # atomic commit 56 | if self.mode is not None: 57 | os.chmod(self.filename, self.mode) 58 | 59 | def close(self): 60 | self.sync() 61 | 62 | def __enter__(self): 63 | return self 64 | 65 | def __exit__(self, *exc_info): 66 | self.close() 67 | 68 | def dump(self, fileobj): 69 | if self.format == 'csv': 70 | csv.writer(fileobj).writerows(self.items()) 71 | elif self.format == 'json': 72 | json.dump(self, fileobj, separators=(',', ':')) 73 | elif self.format == 'pickle': 74 | pickle.dump(dict(self), fileobj, 2) 75 | else: 76 | raise NotImplementedError('Unknown format: ' + repr(self.format)) 77 | 78 | def load(self, fileobj): 79 | # try formats from most restrictive to least restrictive 80 | for loader in (pickle.load, json.load, csv.reader): 81 | fileobj.seek(0) 82 | try: 83 | return self.update(loader(fileobj)) 84 | except Exception: 85 | pass 86 | raise ValueError('File not in a supported format') 87 | 88 | 89 | class SetDocumentInformation(object): 90 | __slots__ = ['matrix_object', 'label2id', 'feature2id'] 91 | 92 | def __init__(self, dict_matrix_index:Union[Dict[str,Any], SqliteDict, PersistentDict]): 93 | """ 94 | * Keys 95 | - matrix_object:Union[csr_matrix, ndarray] 96 | - label2id: Dict[str, str] 97 | - feature2id: Dict[str, str] 98 | """ 99 | if not "matrix_object" in dict_matrix_index: 100 | raise Exception("dict_matrix_index must have key='matrix_object'") 101 | if not "label2id" in dict_matrix_index: 102 | raise Exception("dict_matrix_index must have key='label2id'") 103 | if not "feature2id" in dict_matrix_index: 104 | raise Exception("dict_matrix_index must have key='feature2id'") 105 | 106 | self.matrix_object = dict_matrix_index['matrix_object'] 107 | self.label2id = dict_matrix_index['label2id'] 108 | self.feature2id = dict_matrix_index['feature2id'] 109 | 110 | if isinstance(dict_matrix_index, dict): 111 | pass 112 | elif isinstance(dict_matrix_index, PersistentDict): 113 | dict_matrix_index.sync() 114 | elif isinstance(dict_matrix_index, SqliteDict): 115 | dict_matrix_index.sync() 116 | else: 117 | raise Exception() 118 | 119 | 120 | class DataCsrMatrix(object): 121 | """* What you can do 122 | - You can keep information for keeping matrix object. 
123 | """ 124 | __slots__ = ['cache_backend', 'csr_matrix_', 125 | 'label2id_dict', 'vocabulary', 126 | 'n_docs_distribution', 'n_term_freq_distribution', 'path_working_dir'] 127 | 128 | def __init__(self, 129 | csr_matrix_: csr_matrix, 130 | label2id_dict: Dict[str, int], 131 | vocabulary: Dict[str, int], 132 | n_docs_distribution: ndarray, 133 | n_term_freq_distribution: ndarray, 134 | is_use_cache: bool=False, 135 | is_use_memmap: bool=False, 136 | cache_backend: str='PersistentDict', 137 | path_working_dir: str=None): 138 | """* Parameters 139 | ----------------- 140 | - csr_matrix_: Matrix object which saves term frequency or document frequency 141 | - label2id_dict: Dict object whose key is label-name, value is row-index of the given matrix. 142 | >>> {'label_b': 0, 'label_c': 1, 'label_a': 2} 143 | - vocabulary: Dict object whose key is feature-name, value is column-index of the given matrix. 144 | >>> {'label_b': 0, 'label_c': 1, 'label_a': 2} 145 | - n_docs_distribution: Sequence object(list,ndarray). It saves a distribution of N(docs) in each label. 146 | - n_term_freq_distribution: Sequence object(list,ndarray). It saves a distribution of N(all terms) in each label. 147 | - is_use_cache: boolean. It True; the matrix object is saved on the disk. It saves memory of your machine. 148 | - is_use_memmap: boolean. It True; the matrix object is saved on the disk. It saves memory of your machine. 149 | - cache_backend: str. {PersistentDict, SqliteDict}, backend to save this object on the disk. 150 | - path_working_dir: str. Path to save temporary cache objects. 151 | """ 152 | 153 | self.n_docs_distribution = n_docs_distribution 154 | self.n_term_freq_distribution = n_term_freq_distribution 155 | self.cache_backend = cache_backend 156 | 157 | if (is_use_memmap or is_use_cache) and path_working_dir is None: 158 | self.path_working_dir = mkdtemp() 159 | logger.info("Temporary files are at {}".format(self.path_working_dir)) 160 | else: 161 | self.path_working_dir = path_working_dir 162 | 163 | if is_use_cache: 164 | """You use disk-drive for keeping object. 
165 | """ 166 | path_vocabulary_cache_obj = os.path.join(self.path_working_dir, 'vocabulary.cache') 167 | path_label_2_dict_cache_obj = os.path.join(self.path_working_dir, 'label_2_dict.cache') 168 | self.vocabulary = self.initialize_cache_dict_object(path_vocabulary_cache_obj) 169 | self.vocabulary = vocabulary 170 | 171 | self.label2id_dict = self.initialize_cache_dict_object(path_label_2_dict_cache_obj) 172 | logger.info("Now saving into local file...") 173 | for k, v in label2id_dict.items(): 174 | self.label2id_dict[k] = v 175 | if isinstance(self.label2id_dict, PersistentDict): 176 | self.label2id_dict.sync() 177 | 178 | else: 179 | """Keep everything on memory 180 | """ 181 | self.label2id_dict = label2id_dict 182 | self.vocabulary = vocabulary 183 | 184 | if is_use_memmap: 185 | """You use disk-drive for keeping object 186 | """ 187 | path_memmap_obj = os.path.join(self.path_working_dir, 'matrix.memmap') 188 | self.csr_matrix_ = self.initialize_memmap_object(csr_matrix_, path_memmap_object=path_memmap_obj) 189 | else: 190 | self.csr_matrix_ = csr_matrix_ 191 | 192 | def initialize_cache_dict_object(self, path_cache_file): 193 | if self.cache_backend == 'PersistentDict': 194 | return PersistentDict(path_cache_file, flag='c', format='json') 195 | elif self.cache_backend == 'SqliteDict': 196 | return SqliteDict(path_cache_file, autocommit=True) 197 | else: 198 | raise Exception('No such cache_backend option named {}'.format(self.cache_backend)) 199 | 200 | def initialize_memmap_object(self, matrix_object: csr_matrix, path_memmap_object: str)->memmap: 201 | fp = memmap(path_memmap_object, dtype='float64', mode='w+', shape=matrix_object.shape) 202 | fp[:] = matrix_object.todense()[:] 203 | return fp 204 | 205 | def __str__(self): 206 | return """matrix-type={}, matrix-size={}, path_working_dir={}""".format(type(self.csr_matrix_), 207 | self.csr_matrix_.shape, 208 | self.path_working_dir) 209 | 210 | 211 | class ROW_COL_VAL(object): 212 | """Data class to keep value of one item in CSR-matrix""" 213 | __slots__ = ('row', 'col', 'val') 214 | def __init__(self, row: int, col:int, val:int): 215 | self.row = row 216 | self.col = col 217 | self.val = val 218 | 219 | 220 | class ScoredResultObject(object): 221 | """""" 222 | 223 | def __init__(self, 224 | scored_matrix:csr_matrix, 225 | label2id_dict:Union[Dict[str,Any], ndarray], 226 | feature2id_dict=Union[Dict[str,Any], ndarray], 227 | method:str=None, 228 | matrix_form:str=None, 229 | frequency_matrix:csr_matrix=None): 230 | """*Parameters 231 | ------------ 232 | - scored_matrix: Matrix object which saves result of feature-extraction 233 | - label2id_dict: Dict object whose key is label-name, value is row-index of the matrix. 234 | - feature2id_dict: Dict object whose key is feature-name, value is column-index of the matrix. 235 | - method: a name of feature-extraction method. 236 | - matrix_form: a type of the given matrix for feature-extraction computation. {term_freq, doc_freq} 237 | - frequency_matrix: Matrix object(term-frequency or document-frequency). The matrix is data-source of feature-extraction computation. 
238 | """ 239 | self.scored_matrix = scored_matrix 240 | self.label2id_dict = label2id_dict 241 | self.feature2id_dict = feature2id_dict 242 | self.method = method 243 | self.matrix_form = matrix_form 244 | self.frequency_matrix = frequency_matrix 245 | # For keeping old version 246 | self.ScoreMatrix2ScoreDictionary = self.convert_score_matrix2score_record 247 | 248 | def __conv_into_dict_format(self, word_score_items): 249 | out_format_structure = {} 250 | for item in word_score_items: 251 | if item['label'] not in out_format_structure : 252 | out_format_structure[item['label']] = [{'feature': item['word'], 'score': item['score']}] 253 | else: 254 | out_format_structure[item['label']].append({'feature': item['word'], 'score': item['score']}) 255 | return out_format_structure 256 | 257 | def convert_score_matrix2score_record(self, 258 | outformat:str='items', 259 | sort_desc:bool=True): 260 | """* What you can do 261 | - Get dictionary structure from weighted-featured scores. 262 | - You can choose 'dict' or 'items' for ```outformat``` parameter. 263 | 264 | * Output 265 | --------------------- 266 | - If outformat='dict', you get 267 | >>> {label_name:{feature: score}} 268 | Else if outformat='items', you get 269 | >>> [{feature: score}] 270 | 271 | """ 272 | scored_objects = self.get_feature_dictionary( 273 | weighted_matrix=self.scored_matrix, 274 | vocabulary=self.feature2id_dict, 275 | label_group_dict=self.label2id_dict, 276 | frequency_matrix=self.frequency_matrix 277 | ) 278 | 279 | if sort_desc: scored_objects = \ 280 | sorted(scored_objects, key=lambda x: x['score'], reverse=True) 281 | 282 | if outformat=='dict': 283 | out_format_structure = self.__conv_into_dict_format(scored_objects) 284 | elif outformat=='items': 285 | out_format_structure = scored_objects 286 | else: 287 | raise ValueError('outformat must be either of {dict, items}') 288 | 289 | return out_format_structure 290 | 291 | def __get_value_index(self, row_index, column_index, weight_csr_matrix, verbose=False): 292 | assert isinstance(row_index, (int, int32, int64)) 293 | assert isinstance(column_index, (int, int32, int64)) 294 | assert isinstance(weight_csr_matrix, (ndarray,csr_matrix)) 295 | 296 | value = weight_csr_matrix[row_index, column_index] 297 | 298 | return value 299 | 300 | def make_non_zero_information(self, weight_csr_matrix: csr_matrix)->List[ROW_COL_VAL]: 301 | """Construct Tuple of matrix value. Return value is array of ROW_COL_VAL namedtuple. 302 | 303 | :param weight_csr_matrix: 304 | :return: 305 | """ 306 | assert isinstance(weight_csr_matrix, (csr_matrix, ndarray)) 307 | 308 | row_col_index_array = weight_csr_matrix.nonzero() 309 | row_indexes = row_col_index_array[0] 310 | column_indexes = row_col_index_array[1] 311 | assert len(row_indexes) == len(column_indexes) 312 | 313 | value_index_items = [None] * len(row_indexes) # type: List[ROW_COL_VAL] 314 | for i in range(0, len(row_indexes)): 315 | value_index_items[i] = ROW_COL_VAL(row_indexes[i], 316 | column_indexes[i], 317 | self.__get_value_index(row_indexes[i], column_indexes[i], weight_csr_matrix)) 318 | return value_index_items 319 | 320 | def SUB_FUNC_feature_extraction(self, 321 | weight_row_col_val_obj: ROW_COL_VAL, 322 | dict_index_information: Dict[str, Dict[str, str]], 323 | dict_position2value: Dict[Tuple[int, int], float]=None)->Dict[str, Any]: 324 | """This function returns weighted score between label and words. 
325 | 326 | Input csr matrix must be 'document-frequency' matrix, where records #document that word appears in document set. 327 | [NOTE] This is not TERM-FREQUENCY. 328 | 329 | For example, 330 | If 'iPhone' appears in 5 documents of 'IT' category document set, value must be 5. 331 | Even if 10 'iPhone' words in 'IT' category document set, value is still 5. 332 | """ 333 | assert isinstance(weight_row_col_val_obj, ROW_COL_VAL) 334 | feature_score_record = { 335 | 'score': weight_row_col_val_obj.val, 336 | 'label': self.get_label(weight_row_col_val_obj, dict_index_information['id2label']), 337 | 'feature': self.get_word(weight_row_col_val_obj, dict_index_information['id2vocab']) 338 | } 339 | if not dict_position2value is None: 340 | if (weight_row_col_val_obj.col,weight_row_col_val_obj.row) in dict_position2value: 341 | frequency = dict_position2value[tuple([weight_row_col_val_obj.col,weight_row_col_val_obj.row])] 342 | else: 343 | """When a feature-extraction method is BNS, frequency=0 is possible.""" 344 | frequency = 0 345 | 346 | feature_score_record.update({"frequency": frequency}) 347 | 348 | return feature_score_record 349 | 350 | def get_feature_dictionary(self, 351 | weighted_matrix: csr_matrix, 352 | vocabulary:Dict[str, int], 353 | label_group_dict:Dict[str, int], 354 | cache_backend: str = 'PersistentDict', 355 | is_use_cache: bool=True, 356 | frequency_matrix: csr_matrix=None)->List[Dict[str, Any]]: 357 | """* What you can do 358 | - Get dictionary structure from weighted-featured scores. 359 | """ 360 | assert isinstance(weighted_matrix, csr_matrix) 361 | assert isinstance(vocabulary, dict) 362 | assert isinstance(label_group_dict, dict) 363 | 364 | logger.debug(msg='Start making scored dictionary object from scored matrix') 365 | logger.debug(msg='Input matrix size= {} * {}'.format(weighted_matrix.shape[0], weighted_matrix.shape[1])) 366 | 367 | weight_value_index_items = self.make_non_zero_information(weighted_matrix) 368 | if not frequency_matrix is None: 369 | frequency_value_index_items = self.make_non_zero_information(frequency_matrix) 370 | dict_position2value = {(t_col_row.col,t_col_row.row): t_col_row.val for t_col_row in frequency_value_index_items} 371 | else: 372 | dict_position2value = None 373 | 374 | if is_use_cache: 375 | dict_index_information = self.initialize_cache_dict_object(cache_backend, file_name='dict_index_information') 376 | else: 377 | dict_index_information = {} 378 | 379 | dict_index_information['id2label'] = {value:key for key, value in label_group_dict.items()} 380 | dict_index_information['id2vocab'] = {value:key for key, value in vocabulary.items()} 381 | if isinstance(dict_index_information, SqliteDict): 382 | dict_index_information.commit() 383 | elif isinstance(dict_index_information, PersistentDict): 384 | dict_index_information.sync() 385 | else: 386 | pass 387 | 388 | # TODO may be this func takes too much time. consider cython. 
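# Build one score record per non-zero cell of the weighted matrix; each record carries the label name, the feature and its score (plus the raw frequency when a frequency matrix is supplied).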
389 | seq_score_objects = [None] * len(weight_value_index_items) # type: List[Dict[str,Any]] 390 | for i, weight_row_col_val_tuple in enumerate(weight_value_index_items): 391 | seq_score_objects[i] = self.SUB_FUNC_feature_extraction( 392 | weight_row_col_val_tuple, 393 | dict_index_information, 394 | dict_position2value) 395 | 396 | logger.debug(msg='Finished making scored dictionary') 397 | 398 | return seq_score_objects 399 | 400 | def get_label(self, row_col_val_tuple, label_id)->str: 401 | assert isinstance(row_col_val_tuple, ROW_COL_VAL) 402 | assert isinstance(label_id, dict) 403 | 404 | label = label_id[row_col_val_tuple.row] 405 | 406 | return label 407 | 408 | def get_word(self, row_col_val_tuple:ROW_COL_VAL, vocabulary:Dict[int,str])->Union[str,List[str],Tuple[str,...]]: 409 | """* what u can do 410 | - It gets feature name from the given matrix object. 411 | - A feature is json serialized, thus this method tries to de-serialize json string into python object. 412 | - Original feature object is possibly string(word), list of str, list of str. 413 | """ 414 | assert isinstance(row_col_val_tuple, ROW_COL_VAL) 415 | assert isinstance(vocabulary, dict) 416 | vocab = vocabulary[row_col_val_tuple.col] 417 | try: 418 | feature_object = json.loads(vocab) 419 | if len(feature_object)==1: 420 | # When feature is word, the length is 1 # 421 | feature_object = feature_object[0] 422 | except: 423 | feature_object = vocab 424 | 425 | 426 | return feature_object 427 | 428 | def initialize_cache_dict_object(self, cache_backend:str, file_name:str, path_cache_file=mkdtemp()): 429 | if cache_backend == 'PersistentDict': 430 | return PersistentDict(os.path.join(path_cache_file, file_name), flag='c', format='json') 431 | elif cache_backend == 'SqliteDict': 432 | return SqliteDict(os.path.join(path_cache_file, file_name), autocommit=True) 433 | else: 434 | raise Exception('No such cache_backend option named {}'.format(cache_backend)) 435 | 436 | 437 | FeatureType = TypeVar('T', str, Tuple[Any]) 438 | AvailableInputTypes = TypeVar('T', PersistentDict, 439 | SqliteDict, 440 | Dict[str,List[List[Union[str,Tuple[Any]]]]]) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | CeCILL-B FREE SOFTWARE LICENSE AGREEMENT 3 | 4 | 5 | Notice 6 | 7 | This Agreement is a Free Software license agreement that is the result 8 | of discussions between its authors in order to ensure compliance with 9 | the two main principles guiding its drafting: 10 | 11 | * firstly, compliance with the principles governing the distribution 12 | of Free Software: access to source code, broad rights granted to 13 | users, 14 | * secondly, the election of a governing law, French law, with which 15 | it is conformant, both as regards the law of torts and 16 | intellectual property law, and the protection that it offers to 17 | both authors and holders of the economic rights over software. 18 | 19 | The authors of the CeCILL-B (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) 20 | license are: 21 | 22 | Commissariat à l'Energie Atomique - CEA, a public scientific, technical 23 | and industrial research establishment, having its principal place of 24 | business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France. 
25 | 26 | Centre National de la Recherche Scientifique - CNRS, a public scientific 27 | and technological establishment, having its principal place of business 28 | at 3 rue Michel-Ange, 75794 Paris cedex 16, France. 29 | 30 | Institut National de Recherche en Informatique et en Automatique - 31 | INRIA, a public scientific and technological establishment, having its 32 | principal place of business at Domaine de Voluceau, Rocquencourt, BP 33 | 105, 78153 Le Chesnay cedex, France. 34 | 35 | 36 | Preamble 37 | 38 | This Agreement is an open source software license intended to give users 39 | significant freedom to modify and redistribute the software licensed 40 | hereunder. 41 | 42 | The exercising of this freedom is conditional upon a strong obligation 43 | of giving credits for everybody that distributes a software 44 | incorporating a software ruled by the current license so as all 45 | contributions to be properly identified and acknowledged. 46 | 47 | In consideration of access to the source code and the rights to copy, 48 | modify and redistribute granted by the license, users are provided only 49 | with a limited warranty and the software's author, the holder of the 50 | economic rights, and the successive licensors only have limited liability. 51 | 52 | In this respect, the risks associated with loading, using, modifying 53 | and/or developing or reproducing the software by the user are brought to 54 | the user's attention, given its Free Software status, which may make it 55 | complicated to use, with the result that its use is reserved for 56 | developers and experienced professionals having in-depth computer 57 | knowledge. Users are therefore encouraged to load and test the 58 | suitability of the software as regards their requirements in conditions 59 | enabling the security of their systems and/or data to be ensured and, 60 | more generally, to use and operate it in the same conditions of 61 | security. This Agreement may be freely reproduced and published, 62 | provided it is not altered, and that no provisions are either added or 63 | removed herefrom. 64 | 65 | This Agreement may apply to any or all software for which the holder of 66 | the economic rights decides to submit the use thereof to its provisions. 67 | 68 | 69 | Article 1 - DEFINITIONS 70 | 71 | For the purpose of this Agreement, when the following expressions 72 | commence with a capital letter, they shall have the following meaning: 73 | 74 | Agreement: means this license agreement, and its possible subsequent 75 | versions and annexes. 76 | 77 | Software: means the software in its Object Code and/or Source Code form 78 | and, where applicable, its documentation, "as is" when the Licensee 79 | accepts the Agreement. 80 | 81 | Initial Software: means the Software in its Source Code and possibly its 82 | Object Code form and, where applicable, its documentation, "as is" when 83 | it is first distributed under the terms and conditions of the Agreement. 84 | 85 | Modified Software: means the Software modified by at least one 86 | Contribution. 87 | 88 | Source Code: means all the Software's instructions and program lines to 89 | which access is required so as to modify the Software. 90 | 91 | Object Code: means the binary files originating from the compilation of 92 | the Source Code. 93 | 94 | Holder: means the holder(s) of the economic rights over the Initial 95 | Software. 96 | 97 | Licensee: means the Software user(s) having accepted the Agreement. 
98 | 99 | Contributor: means a Licensee having made at least one Contribution. 100 | 101 | Licensor: means the Holder, or any other individual or legal entity, who 102 | distributes the Software under the Agreement. 103 | 104 | Contribution: means any or all modifications, corrections, translations, 105 | adaptations and/or new functions integrated into the Software by any or 106 | all Contributors, as well as any or all Internal Modules. 107 | 108 | Module: means a set of sources files including their documentation that 109 | enables supplementary functions or services in addition to those offered 110 | by the Software. 111 | 112 | External Module: means any or all Modules, not derived from the 113 | Software, so that this Module and the Software run in separate address 114 | spaces, with one calling the other when they are run. 115 | 116 | Internal Module: means any or all Module, connected to the Software so 117 | that they both execute in the same address space. 118 | 119 | Parties: mean both the Licensee and the Licensor. 120 | 121 | These expressions may be used both in singular and plural form. 122 | 123 | 124 | Article 2 - PURPOSE 125 | 126 | The purpose of the Agreement is the grant by the Licensor to the 127 | Licensee of a non-exclusive, transferable and worldwide license for the 128 | Software as set forth in Article 5 hereinafter for the whole term of the 129 | protection granted by the rights over said Software. 130 | 131 | 132 | Article 3 - ACCEPTANCE 133 | 134 | 3.1 The Licensee shall be deemed as having accepted the terms and 135 | conditions of this Agreement upon the occurrence of the first of the 136 | following events: 137 | 138 | * (i) loading the Software by any or all means, notably, by 139 | downloading from a remote server, or by loading from a physical 140 | medium; 141 | * (ii) the first time the Licensee exercises any of the rights 142 | granted hereunder. 143 | 144 | 3.2 One copy of the Agreement, containing a notice relating to the 145 | characteristics of the Software, to the limited warranty, and to the 146 | fact that its use is restricted to experienced users has been provided 147 | to the Licensee prior to its acceptance as set forth in Article 3.1 148 | hereinabove, and the Licensee hereby acknowledges that it has read and 149 | understood it. 150 | 151 | 152 | Article 4 - EFFECTIVE DATE AND TERM 153 | 154 | 155 | 4.1 EFFECTIVE DATE 156 | 157 | The Agreement shall become effective on the date when it is accepted by 158 | the Licensee as set forth in Article 3.1. 159 | 160 | 161 | 4.2 TERM 162 | 163 | The Agreement shall remain in force for the entire legal term of 164 | protection of the economic rights over the Software. 165 | 166 | 167 | Article 5 - SCOPE OF RIGHTS GRANTED 168 | 169 | The Licensor hereby grants to the Licensee, who accepts, the following 170 | rights over the Software for any or all use, and for the term of the 171 | Agreement, on the basis of the terms and conditions set forth hereinafter. 172 | 173 | Besides, if the Licensor owns or comes to own one or more patents 174 | protecting all or part of the functions of the Software or of its 175 | components, the Licensor undertakes not to enforce the rights granted by 176 | these patents against successive Licensees using, exploiting or 177 | modifying the Software. If these patents are transferred, the Licensor 178 | undertakes to have the transferees subscribe to the obligations set 179 | forth in this paragraph. 
180 | 181 | 182 | 5.1 RIGHT OF USE 183 | 184 | The Licensee is authorized to use the Software, without any limitation 185 | as to its fields of application, with it being hereinafter specified 186 | that this comprises: 187 | 188 | 1. permanent or temporary reproduction of all or part of the Software 189 | by any or all means and in any or all form. 190 | 191 | 2. loading, displaying, running, or storing the Software on any or 192 | all medium. 193 | 194 | 3. entitlement to observe, study or test its operation so as to 195 | determine the ideas and principles behind any or all constituent 196 | elements of said Software. This shall apply when the Licensee 197 | carries out any or all loading, displaying, running, transmission 198 | or storage operation as regards the Software, that it is entitled 199 | to carry out hereunder. 200 | 201 | 202 | 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS 203 | 204 | The right to make Contributions includes the right to translate, adapt, 205 | arrange, or make any or all modifications to the Software, and the right 206 | to reproduce the resulting software. 207 | 208 | The Licensee is authorized to make any or all Contributions to the 209 | Software provided that it includes an explicit notice that it is the 210 | author of said Contribution and indicates the date of the creation thereof. 211 | 212 | 213 | 5.3 RIGHT OF DISTRIBUTION 214 | 215 | In particular, the right of distribution includes the right to publish, 216 | transmit and communicate the Software to the general public on any or 217 | all medium, and by any or all means, and the right to market, either in 218 | consideration of a fee, or free of charge, one or more copies of the 219 | Software by any means. 220 | 221 | The Licensee is further authorized to distribute copies of the modified 222 | or unmodified Software to third parties according to the terms and 223 | conditions set forth hereinafter. 224 | 225 | 226 | 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION 227 | 228 | The Licensee is authorized to distribute true copies of the Software in 229 | Source Code or Object Code form, provided that said distribution 230 | complies with all the provisions of the Agreement and is accompanied by: 231 | 232 | 1. a copy of the Agreement, 233 | 234 | 2. a notice relating to the limitation of both the Licensor's 235 | warranty and liability as set forth in Articles 8 and 9, 236 | 237 | and that, in the event that only the Object Code of the Software is 238 | redistributed, the Licensee allows effective access to the full Source 239 | Code of the Software at a minimum during the entire period of its 240 | distribution of the Software, it being understood that the additional 241 | cost of acquiring the Source Code shall not exceed the cost of 242 | transferring the data. 243 | 244 | 245 | 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE 246 | 247 | If the Licensee makes any Contribution to the Software, the resulting 248 | Modified Software may be distributed under a license agreement other 249 | than this Agreement subject to compliance with the provisions of Article 250 | 5.3.4. 251 | 252 | 253 | 5.3.3 DISTRIBUTION OF EXTERNAL MODULES 254 | 255 | When the Licensee has developed an External Module, the terms and 256 | conditions of this Agreement do not apply to said External Module, that 257 | may be distributed under a separate license agreement. 258 | 259 | 260 | 5.3.4 CREDITS 261 | 262 | Any Licensee who may distribute a Modified Software hereby expressly 263 | agrees to: 264 | 265 | 1. 
indicate in the related documentation that it is based on the 266 | Software licensed hereunder, and reproduce the intellectual 267 | property notice for the Software, 268 | 269 | 2. ensure that written indications of the Software intended use, 270 | intellectual property notice and license hereunder are included in 271 | easily accessible format from the Modified Software interface, 272 | 273 | 3. mention, on a freely accessible website describing the Modified 274 | Software, at least throughout the distribution term thereof, that 275 | it is based on the Software licensed hereunder, and reproduce the 276 | Software intellectual property notice, 277 | 278 | 4. where it is distributed to a third party that may distribute a 279 | Modified Software without having to make its source code 280 | available, make its best efforts to ensure that said third party 281 | agrees to comply with the obligations set forth in this Article . 282 | 283 | If the Software, whether or not modified, is distributed with an 284 | External Module designed for use in connection with the Software, the 285 | Licensee shall submit said External Module to the foregoing obligations. 286 | 287 | 288 | 5.3.5 COMPATIBILITY WITH THE CeCILL AND CeCILL-C LICENSES 289 | 290 | Where a Modified Software contains a Contribution subject to the CeCILL 291 | license, the provisions set forth in Article 5.3.4 shall be optional. 292 | 293 | A Modified Software may be distributed under the CeCILL-C license. In 294 | such a case the provisions set forth in Article 5.3.4 shall be optional. 295 | 296 | 297 | Article 6 - INTELLECTUAL PROPERTY 298 | 299 | 300 | 6.1 OVER THE INITIAL SOFTWARE 301 | 302 | The Holder owns the economic rights over the Initial Software. Any or 303 | all use of the Initial Software is subject to compliance with the terms 304 | and conditions under which the Holder has elected to distribute its work 305 | and no one shall be entitled to modify the terms and conditions for the 306 | distribution of said Initial Software. 307 | 308 | The Holder undertakes that the Initial Software will remain ruled at 309 | least by this Agreement, for the duration set forth in Article 4.2. 310 | 311 | 312 | 6.2 OVER THE CONTRIBUTIONS 313 | 314 | The Licensee who develops a Contribution is the owner of the 315 | intellectual property rights over this Contribution as defined by 316 | applicable law. 317 | 318 | 319 | 6.3 OVER THE EXTERNAL MODULES 320 | 321 | The Licensee who develops an External Module is the owner of the 322 | intellectual property rights over this External Module as defined by 323 | applicable law and is free to choose the type of agreement that shall 324 | govern its distribution. 325 | 326 | 327 | 6.4 JOINT PROVISIONS 328 | 329 | The Licensee expressly undertakes: 330 | 331 | 1. not to remove, or modify, in any manner, the intellectual property 332 | notices attached to the Software; 333 | 334 | 2. to reproduce said notices, in an identical manner, in the copies 335 | of the Software modified or not. 336 | 337 | The Licensee undertakes not to directly or indirectly infringe the 338 | intellectual property rights of the Holder and/or Contributors on the 339 | Software and to take, where applicable, vis-à-vis its staff, any and all 340 | measures required to ensure respect of said intellectual property rights 341 | of the Holder and/or Contributors. 
342 | 343 | 344 | Article 7 - RELATED SERVICES 345 | 346 | 7.1 Under no circumstances shall the Agreement oblige the Licensor to 347 | provide technical assistance or maintenance services for the Software. 348 | 349 | However, the Licensor is entitled to offer this type of services. The 350 | terms and conditions of such technical assistance, and/or such 351 | maintenance, shall be set forth in a separate instrument. Only the 352 | Licensor offering said maintenance and/or technical assistance services 353 | shall incur liability therefor. 354 | 355 | 7.2 Similarly, any Licensor is entitled to offer to its licensees, under 356 | its sole responsibility, a warranty, that shall only be binding upon 357 | itself, for the redistribution of the Software and/or the Modified 358 | Software, under terms and conditions that it is free to decide. Said 359 | warranty, and the financial terms and conditions of its application, 360 | shall be subject of a separate instrument executed between the Licensor 361 | and the Licensee. 362 | 363 | 364 | Article 8 - LIABILITY 365 | 366 | 8.1 Subject to the provisions of Article 8.2, the Licensee shall be 367 | entitled to claim compensation for any direct loss it may have suffered 368 | from the Software as a result of a fault on the part of the relevant 369 | Licensor, subject to providing evidence thereof. 370 | 371 | 8.2 The Licensor's liability is limited to the commitments made under 372 | this Agreement and shall not be incurred as a result of in particular: 373 | (i) loss due the Licensee's total or partial failure to fulfill its 374 | obligations, (ii) direct or consequential loss that is suffered by the 375 | Licensee due to the use or performance of the Software, and (iii) more 376 | generally, any consequential loss. In particular the Parties expressly 377 | agree that any or all pecuniary or business loss (i.e. loss of data, 378 | loss of profits, operating loss, loss of customers or orders, 379 | opportunity cost, any disturbance to business activities) or any or all 380 | legal proceedings instituted against the Licensee by a third party, 381 | shall constitute consequential loss and shall not provide entitlement to 382 | any or all compensation from the Licensor. 383 | 384 | 385 | Article 9 - WARRANTY 386 | 387 | 9.1 The Licensee acknowledges that the scientific and technical 388 | state-of-the-art when the Software was distributed did not enable all 389 | possible uses to be tested and verified, nor for the presence of 390 | possible defects to be detected. In this respect, the Licensee's 391 | attention has been drawn to the risks associated with loading, using, 392 | modifying and/or developing and reproducing the Software which are 393 | reserved for experienced users. 394 | 395 | The Licensee shall be responsible for verifying, by any or all means, 396 | the suitability of the product for its requirements, its good working 397 | order, and for ensuring that it shall not cause damage to either persons 398 | or properties. 399 | 400 | 9.2 The Licensor hereby represents, in good faith, that it is entitled 401 | to grant all the rights over the Software (including in particular the 402 | rights set forth in Article 5). 403 | 404 | 9.3 The Licensee acknowledges that the Software is supplied "as is" by 405 | the Licensor without any other express or tacit warranty, other than 406 | that provided for in Article 9.2 and, in particular, without any warranty 407 | as to its commercial value, its secured, safe, innovative or relevant 408 | nature. 
409 | 410 | Specifically, the Licensor does not warrant that the Software is free 411 | from any error, that it will operate without interruption, that it will 412 | be compatible with the Licensee's own equipment and software 413 | configuration, nor that it will meet the Licensee's requirements. 414 | 415 | 9.4 The Licensor does not either expressly or tacitly warrant that the 416 | Software does not infringe any third party intellectual property right 417 | relating to a patent, software or any other property right. Therefore, 418 | the Licensor disclaims any and all liability towards the Licensee 419 | arising out of any or all proceedings for infringement that may be 420 | instituted in respect of the use, modification and redistribution of the 421 | Software. Nevertheless, should such proceedings be instituted against 422 | the Licensee, the Licensor shall provide it with technical and legal 423 | assistance for its defense. Such technical and legal assistance shall be 424 | decided on a case-by-case basis between the relevant Licensor and the 425 | Licensee pursuant to a memorandum of understanding. The Licensor 426 | disclaims any and all liability as regards the Licensee's use of the 427 | name of the Software. No warranty is given as regards the existence of 428 | prior rights over the name of the Software or as regards the existence 429 | of a trademark. 430 | 431 | 432 | Article 10 - TERMINATION 433 | 434 | 10.1 In the event of a breach by the Licensee of its obligations 435 | hereunder, the Licensor may automatically terminate this Agreement 436 | thirty (30) days after notice has been sent to the Licensee and has 437 | remained ineffective. 438 | 439 | 10.2 A Licensee whose Agreement is terminated shall no longer be 440 | authorized to use, modify or distribute the Software. However, any 441 | licenses that it may have granted prior to termination of the Agreement 442 | shall remain valid subject to their having been granted in compliance 443 | with the terms and conditions hereof. 444 | 445 | 446 | Article 11 - MISCELLANEOUS 447 | 448 | 449 | 11.1 EXCUSABLE EVENTS 450 | 451 | Neither Party shall be liable for any or all delay, or failure to 452 | perform the Agreement, that may be attributable to an event of force 453 | majeure, an act of God or an outside cause, such as defective 454 | functioning or interruptions of the electricity or telecommunications 455 | networks, network paralysis following a virus attack, intervention by 456 | government authorities, natural disasters, water damage, earthquakes, 457 | fire, explosions, strikes and labor unrest, war, etc. 458 | 459 | 11.2 Any failure by either Party, on one or more occasions, to invoke 460 | one or more of the provisions hereof, shall under no circumstances be 461 | interpreted as being a waiver by the interested Party of its right to 462 | invoke said provision(s) subsequently. 463 | 464 | 11.3 The Agreement cancels and replaces any or all previous agreements, 465 | whether written or oral, between the Parties and having the same 466 | purpose, and constitutes the entirety of the agreement between said 467 | Parties concerning said purpose. No supplement or modification to the 468 | terms and conditions hereof shall be effective as between the Parties 469 | unless it is made in writing and signed by their duly authorized 470 | representatives. 
471 | 472 | 11.4 In the event that one or more of the provisions hereof were to 473 | conflict with a current or future applicable act or legislative text, 474 | said act or legislative text shall prevail, and the Parties shall make 475 | the necessary amendments so as to comply with said act or legislative 476 | text. All other provisions shall remain effective. Similarly, invalidity 477 | of a provision of the Agreement, for any reason whatsoever, shall not 478 | cause the Agreement as a whole to be invalid. 479 | 480 | 481 | 11.5 LANGUAGE 482 | 483 | The Agreement is drafted in both French and English and both versions 484 | are deemed authentic. 485 | 486 | 487 | Article 12 - NEW VERSIONS OF THE AGREEMENT 488 | 489 | 12.1 Any person is authorized to duplicate and distribute copies of this 490 | Agreement. 491 | 492 | 12.2 So as to ensure coherence, the wording of this Agreement is 493 | protected and may only be modified by the authors of the License, who 494 | reserve the right to periodically publish updates or new versions of the 495 | Agreement, each with a separate number. These subsequent versions may 496 | address new issues encountered by Free Software. 497 | 498 | 12.3 Any Software distributed under a given version of the Agreement may 499 | only be subsequently distributed under the same version of the Agreement 500 | or a subsequent version. 501 | 502 | 503 | Article 13 - GOVERNING LAW AND JURISDICTION 504 | 505 | 13.1 The Agreement is governed by French law. The Parties agree to 506 | endeavor to seek an amicable solution to any disagreements or disputes 507 | that may arise during the performance of the Agreement. 508 | 509 | 13.2 Failing an amicable solution within two (2) months as from their 510 | occurrence, and unless emergency proceedings are necessary, the 511 | disagreements or disputes shall be referred to the Paris Courts having 512 | jurisdiction, by the more diligent Party. 513 | 514 | 515 | Version 1.0 dated 2006-09-05. 516 | --------------------------------------------------------------------------------