├── experiments
    ├── server
    ├── parsers
    ├── .gitignore
    ├── Calculate pn and habr cooccurrences.ipynb
    ├── Count number of tokens in VW file.ipynb
    ├── Exporting Postnauka from MySQL dump.ipynb
    ├── Import of hierarchical spectrums.ipynb
    ├── Most popular-sciency Habr authors.ipynb
    ├── UCI Merger.ipynb
    ├── Sending requests to server from ARTM_model.ipynb
    ├── Comparison of different lemmatizers.ipynb
    ├── Parsing habr dataset.ipynb
    ├── Parsing elementy website.ipynb
    ├── Spectrum experiments.ipynb
    ├── Topical Similarity Measurements for ARTM RecSys.ipynb
    └── Parsing ruwiki dataset.ipynb
├── server
    ├── parsers
    ├── experiments
    ├── hartm
    ├── static
    │   ├── .gitignore
    │   ├── js
    │   │   ├── d3.min.js
    │   │   ├── jquery.min.js
    │   │   ├── bootstrap.min.js
    │   │   ├── fileinput.min.js
    │   │   ├── jquery.history.min.js
    │   │   └── hammer.min.js
    │   ├── css
    │   │   ├── fileinput.min.css
    │   │   └── index.css
    │   ├── img
    │   │   ├── pn.png
    │   │   ├── elem.png
    │   │   ├── habr.png
    │   │   ├── loading.gif
    │   │   └── loading-sm.gif
    │   ├── fonts
    │   │   ├── glyphicons-halflings-regular.woff
    │   │   └── glyphicons-halflings-regular.woff2
    │   ├── bower.json
    │   └── index.html
    ├── hierarchy_utils.py
    ├── .gitignore
    ├── package.json
    ├── artm_proxy.py
    ├── server.js
    ├── artm_bridge.py
    └── artm_lib.py
├── parsers
    ├── hierarchy_utils.py
    ├── arbitrary.py
    ├── habrahabr.ipynb
    ├── postnauka.ipynb
    └── text_utils.py
├── .gitmodules
├── .gitignore
├── bigartm.nix
├── bigartm_py.nix
├── default.nix
├── README.md
└── Dockerfile


/experiments/server:
--------------------------------------------------------------------------------
1 | ../server


--------------------------------------------------------------------------------
/server/parsers:
--------------------------------------------------------------------------------
1 | ../parsers/


--------------------------------------------------------------------------------
/experiments/parsers:
--------------------------------------------------------------------------------
1 | ../parsers


--------------------------------------------------------------------------------
/server/experiments:
--------------------------------------------------------------------------------
1 | ../experiments/


--------------------------------------------------------------------------------
/server/hartm:
--------------------------------------------------------------------------------
1 | ../experiments/hartm


--------------------------------------------------------------------------------
/server/static/.gitignore:
--------------------------------------------------------------------------------
1 | bower_components/
2 | 


--------------------------------------------------------------------------------
/server/hierarchy_utils.py:
--------------------------------------------------------------------------------
1 | ../experiments/hierarchy_utils.py


--------------------------------------------------------------------------------
/server/static/js/d3.min.js:
--------------------------------------------------------------------------------
1 | ../bower_components/d3/d3.min.js


--------------------------------------------------------------------------------
/parsers/hierarchy_utils.py:
--------------------------------------------------------------------------------
1 | ../experiments/hierarchy_utils.py


--------------------------------------------------------------------------------
/server/static/js/jquery.min.js:
--------------------------------------------------------------------------------
1 | ../bower_components/jquery/dist/jquery.min.js


--------------------------------------------------------------------------------
/server/static/js/bootstrap.min.js:
--------------------------------------------------------------------------------
1 | ../bower_components/bootstrap/dist/js/bootstrap.min.js


--------------------------------------------------------------------------------
/server/static/js/fileinput.min.js:
--------------------------------------------------------------------------------
1 | ../bower_components/bootstrap-fileinput/js/fileinput.min.js


--------------------------------------------------------------------------------
/server/static/css/fileinput.min.css:
--------------------------------------------------------------------------------
1 | ../bower_components/bootstrap-fileinput/css/fileinput.min.css


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "bigartm"]
2 | 	path = bigartm
3 | 	url = https://github.com/bigartm/bigartm
4 | 


--------------------------------------------------------------------------------
/server/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | uploads/
3 | npm-debug.log
4 | hartm.mdl
5 | bigartm.*
6 | *.batch
7 | 


--------------------------------------------------------------------------------
/server/static/img/pn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/pn.png


--------------------------------------------------------------------------------
/server/static/img/elem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/elem.png


--------------------------------------------------------------------------------
/server/static/img/habr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/habr.png


--------------------------------------------------------------------------------
/server/static/js/jquery.history.min.js:
--------------------------------------------------------------------------------
1 | ../bower_components/history.js/scripts/bundled/html4+html5/jquery.history.js


--------------------------------------------------------------------------------
/server/static/img/loading.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/loading.gif


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | datasets/
2 | .ipynb_checkpoints/
3 | __pycache__/
4 | venv/
5 | result
6 | *.DS_Store
7 | nohup.out
8 | *.dump
9 | 


--------------------------------------------------------------------------------
/server/static/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
1 | ../bower_components/bootstrap/dist/fonts/glyphicons-halflings-regular.woff


--------------------------------------------------------------------------------
/server/static/img/loading-sm.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/loading-sm.gif


--------------------------------------------------------------------------------
/server/static/fonts/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
1 | ../bower_components/bootstrap/dist/fonts/glyphicons-halflings-regular.woff2


--------------------------------------------------------------------------------
/experiments/.gitignore:
--------------------------------------------------------------------------------
 1 | pn_batches/
 2 | hartm/
 3 | transform_batches/
 4 | test_batches/
 5 | bigartm.*
 6 | *.txt
 7 | *.batch
 8 | *.tar
 9 | *.tar.gz
10 | *.tar.bz2
11 | *.csv
12 | 


--------------------------------------------------------------------------------
/server/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "Rysearch",
 3 |     "version": "0.0.1",
 4 |     "private": "true",
 5 |     "dependencies": {
 6 |         "express": ">=4.14.1",
 7 |         "zmq": ">=2.15.3",
 8 |         "uuid": ">=3.0.1",
 9 |         "multer": ">=1.3.0",
10 |         "body-parser": ">=1.17.2"
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/bigartm.nix:
--------------------------------------------------------------------------------
 1 | with import<nixpkgs> {}; {
 2 |   bigartm = stdenv.mkDerivation rec {
 3 |     name = "bigartm";
 4 | 
 5 |     buildInputs = [ cmake boost python python3 python35Packages.setuptools ];
 6 |     cmakeFlags = "-DBUILD_TESTS=OFF -DBUILD_BIGARTM_CLI=OFF";
 7 |     makeFlags = "-j4";
 8 | 
 9 |     LDFLAGS="-L${boost.dev}/lib";
10 | 
11 |     src = ./bigartm;
12 |   };
13 | }
14 | 


--------------------------------------------------------------------------------
/bigartm_py.nix:
--------------------------------------------------------------------------------
 1 | { python27Packages, python35Packages, protobuf }:
 2 | 
 3 | python35Packages.buildPythonPackage rec {
 4 |   name = "bigartm";
 5 | 
 6 |   buildInputs = [ python27Packages.protobuf3_0 ];
 7 | 
 8 |   propagatedBuildInputs = with python35Packages; [
 9 |     numpy
10 |     pandas
11 |     tqdm
12 |   ] ++ [ protobuf ];
13 | 
14 |   src = ./bigartm;
15 | 
16 |   preConfigure = ''
17 |     export PYTHONPATH="${python27Packages.protobuf3_0}/lib/python2.7/site-packages:$PYTHONPATH";
18 |     cd python
19 |   '';
20 | }
21 | 


--------------------------------------------------------------------------------
/server/static/bower.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "rysearch",
 3 |     "homepage": "https://github.com/AVBelyy/Rysearch",
 4 |     "authors": [
 5 |         "Anton Belyy <anton.belyy@gmail.com>"
 6 |     ],
 7 |     "description": "An exploratoRY SEARCH engine",
 8 |     "dependencies": {
 9 |         "d3": ">=4.10.0",
10 |         "bootstrap": ">=3.3.7",
11 |         "bootstrap-fileinput": ">=4.4.2",
12 |         "history.js": ">=1.8.0"
13 |     },
14 |     "main": "",
15 |     "license": "MIT",
16 |     "private": true,
17 |     "ignore": [
18 |         "**/.*",
19 |         "node_modules",
20 |         "bower_components",
21 |         "test",
22 |         "tests"
23 |     ]
24 | }
25 | 


--------------------------------------------------------------------------------
/default.nix:
--------------------------------------------------------------------------------
 1 | with import <nixpkgs> {};
 2 | 
 3 | let bigartm_python = callPackage ./bigartm_py.nix {
 4 |       python27Packages = python27Packages;
 5 |       python35Packages = python35Packages;
 6 |       protobuf = protobuf3_0;
 7 |     };
 8 | in {
 9 |   rysearch = stdenv.mkDerivation rec {
10 |     name = "rysearch";
11 | 
12 |     buildInputs = [
13 |       nodejs
14 |       zeromq
15 |       python35Packages.pymongo
16 |       python35Packages.pyzmq
17 |       python35Packages.numpy
18 |       python35Packages.scipy
19 |       python35Packages.pandas
20 |       python35Packages.scikitlearn
21 |       python35Packages.regex
22 |       python35Packages.virtualenv
23 |     ];
24 | 
25 |     shellHook = ''
26 |       if [ ! -d venv ]; then
27 |         virtualenv --python=python3.5 venv
28 |         venv/bin/pip install pymystem3
29 |         venv/bin/pip install tqdm
30 |         venv/bin/pip install protobuf==3.0.0
31 |       fi
32 |       export PATH="$(pwd)/venv/bin:$PATH"
33 |       export ARTM_SHARED_LIBRARY="$(pwd)/result/lib/libartm.so";
34 |       export PYTHONPATH="$PYTHONPATH:$(toPythonPath ${bigartm_python})";
35 |     '';
36 |   };
37 | }
38 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Rysearch
 2 | Rysearch is an explorato**ry search** engine and recommender system. Based on [BigARTM](http://bigartm.org), open-source library for topic modeling, it takes into account latent topical structure of texts to achieve good results in both knowledge exploration and visualization.[1]
 3 | 
 4 | ## Quick start
 5 | Use our pre-configured Docker image for quick installation. Run:
 6 | ```bash
 7 | docker run -t -p 3000:3000 tohnann/rysearch
 8 | ```
 9 | And then open [http://localhost:3000](http://localhost:3000).
10 | 
11 | ## Manual installation
12 | Everything is tested on Linux (NixOS) and Windows operating systems. If things don't work as described here — please submit us an issue.
13 | 
14 | ### Running a Rysearch server
15 | ```bash
16 | cd server/
17 | 
18 | # Install Node.js libraries 
19 | npm install
20 | ```
21 | 
22 | Rysearch server consists of two workers: ARTM_bridge and Node.js server. You have to run them as separate programs like this:
23 | ```bash
24 | # Run ARTM_bridge
25 | python3 artm_bridge.py
26 | ```
27 | 
28 | ```bash
29 | # Run Node.js server
30 | npm start
31 | ```
32 | 
33 | [1] K. V. Vorontsov et al. Non-Bayesian Additive Regularization for Multimodal Topic Modeling of Large Collections, *TM '15 Proceedings of the 2015 Workshop on Topic Models: Post-Processing and Applications*, 2014.
34 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu
 2 | 
 3 | RUN apt-get -y update && \
 4 |     apt-get -y dist-upgrade
 5 | 
 6 | RUN apt-get -y install \
 7 |         python3 \
 8 |         python3-pymongo \
 9 |         python3-zmq \
10 |         python3-numpy \
11 |         python3-scipy \
12 |         python3-sklearn \
13 |         python3-pip && \
14 |     pip3 install pandas
15 | 
16 | RUN apt-get -y install \
17 |         nodejs \
18 |         npm
19 | 
20 | RUN apt-get install -y wget libtool pkg-config build-essential autoconf automake uuid-dev && \
21 |     cd ~ && \
22 |     wget http://download.zeromq.org/zeromq-4.0.5.tar.gz && \
23 |     tar xvzf zeromq-4.0.5.tar.gz && \
24 |     cd zeromq-4.0.5 && \
25 |     ./configure && \
26 |     make install && \
27 |     ldconfig
28 | 
29 | RUN apt-get install -y git
30 | RUN ln -s /usr/bin/nodejs /usr/bin/node
31 | 
32 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0C49F3730359A14518585931BC711F9BA15703C6 && \
33 |     echo "deb [ arch=amd64,arm64 ] http://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/3.4 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-3.4.list && \
34 |     apt-get update && \
35 |     apt-get -y install mongodb-org
36 | 
37 | RUN apt-get -y install tmux
38 | 
39 | RUN cd ~ && \
40 |     wget -qO- https://www.dropbox.com/s/uy4nfqr1m4spvvu/datasets.tar.gz | tar xzv && \
41 |     wget https://www.dropbox.com/s/h75rz3hfvpzanji/hartm.mdl
42 | 
43 | EXPOSE 3000
44 | 
45 | RUN locale-gen en_US.UTF-8
46 | ENV LANG en_US.UTF-8
47 | ENV LANGUAGE en_US:en
48 | ENV LC_ALL en_US.UTF-8
49 | 
50 | CMD cd ~ && \
51 |     tmux new-session -s "rysearch" -d && \
52 |     tmux new-window -t "rysearch:1" "mongod -f /etc/mongod.conf" && \
53 |     if [ ! -d rysearch ]; then git clone -b master https://github.com/AVBelyy/Rysearch.git rysearch; fi && \
54 |     if [ -d shared ]; then rm ~/hartm.mdl; ln -s shared/hartm.mdl ~/hartm.mdl; fi && \
55 |     if [ -d shared ]; then rm -rf ~/datasets; ln -s shared/datasets ~/datasets; fi && \
56 |     cd rysearch/server && \
57 |     ln -s ~/hartm.mdl hartm.mdl && \
58 |     mongorestore -d datasets ~/datasets && \
59 |     tmux new-window -t "rysearch:2" "python3 artm_bridge.py" && \
60 |     tmux split-window -t "rysearch:2" -v "npm install >/dev/null 2>&1 && npm start" && \
61 |     tmux select-window -t "rysearch:2" && \
62 |     tmux attach-session -t "rysearch"
63 | 


--------------------------------------------------------------------------------
/parsers/arbitrary.py:
--------------------------------------------------------------------------------
 1 | # Парсер произвольного документа
 2 | 
 3 | import regex
 4 | import unicodedata
 5 | 
 6 | from sklearn.pipeline import Pipeline
 7 | from pathlib import Path
 8 | 
 9 | from parsers.text_utils import BaseSource, BaseProcessor, BaseSink
10 | from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor, DefaultCollectionProcessor
11 | from parsers.text_utils import VowpalWabbitSink, MongoDbSink
12 | 
13 | class ArbitraryFileSource(BaseSource):
14 |     def fit(self, iter_source, *args):
15 |         self.iter_source = iter_source
16 |         return self
17 | 
18 | class ArbitraryFileProcessor(BaseProcessor):
19 |     def __init__(self, stop_words):
20 |         self.doc_pipeline = Pipeline([
21 |             ("text-processor",     DefaultTextProcessor(token_pattern="(?u)\\b\\p{L}+\\b")),
22 |             ("document-processor", DefaultDocumentProcessor(stop_lemmas=stop_words)),
23 |         ])
24 | 
25 |     @staticmethod
26 |     def strip_accents(s):
27 |         unused_char = '\U00037b84'
28 |         s = s.replace("й", unused_char)
29 |         return "".join((c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")).replace(unused_char, "й")
30 | 
31 |     def transform(self, src, *args):
32 |         # Parse text file
33 |         text = src.iter_source.read()
34 |         # Get rid of accent marks
35 |         text = ArbitraryFileProcessor.strip_accents(text)
36 |         # Run inner pipeline to form modalities
37 |         modalities = self.doc_pipeline.fit_transform(text)
38 |         # Finally, make a document and return it
39 |         doc = {}
40 |         doc["modalities"] = modalities
41 |         doc["markdown"] = text
42 |         return doc
43 | 
44 | def get_pipeline():
45 |     root_path = Path("../datasets/arbitrary")
46 |     stop_words = (root_path / "stopwords.txt").open().read().split()
47 |     return Pipeline([
48 |         ("file-source", ArbitraryFileSource()),
49 |         ("file-processor", ArbitraryFileProcessor(stop_words)),
50 |     ])
51 | 
52 | if __name__ == "__main__":
53 |     import argparse
54 |     pipeline = get_pipeline()
55 |     argparser = argparse.ArgumentParser()
56 |     argparser.add_argument("source_file")
57 |     # argparser.add_argument("target_file")
58 |     args = argparser.parse_args()
59 |     with open(args.source_file) as src:
60 |         doc = pipeline.fit_transform(src)
61 |         print(doc)


--------------------------------------------------------------------------------
/server/artm_proxy.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Load-balancing proxy for ARTM backend.
 3 | """
 4 | 
 5 | import multiprocessing
 6 | import subprocess
 7 | import zmq
 8 | 
 9 | ZMQ_FRONTEND_PORT = 2411
10 | ZMQ_BACKEND_PORT = 2511
11 | 
12 | EMPTY = b""
13 | 
14 | try:
15 |     context = zmq.Context.instance()
16 |     frontend = context.socket(zmq.ROUTER)
17 |     frontend.bind("tcp://*:%d" % ZMQ_FRONTEND_PORT)
18 |     backend = context.socket(zmq.ROUTER)
19 |     backend.bind("tcp://*:%d" % ZMQ_BACKEND_PORT)
20 | 
21 |     # Initialize main loop state
22 |     available_workers = []
23 |     poller = zmq.Poller()
24 |     # Only poll for requests from backend until workers are available
25 |     poller.register(backend, zmq.POLLIN)
26 | 
27 |     print("ARTM_proxy: start serving ZeroMQ queries on ports",
28 |           ZMQ_FRONTEND_PORT, "and", ZMQ_BACKEND_PORT)
29 | 
30 |     # Main loop
31 |     # TODO: remove stale workers by time-out
32 |     while True:
33 |         sockets = dict(poller.poll())
34 |         prev_len = len(available_workers)
35 | 
36 |         if backend in sockets:
37 |             response = backend.recv_multipart()
38 |             worker, client = response[:2]
39 |             if client == b"UP":
40 |                 available_workers.append(worker)
41 |             elif client == b"DOWN":
42 |                 if worker in available_workers:
43 |                     available_workers.remove(worker)
44 |             elif len(response) > 2:
45 |                 # If worker replied, send rest back to client
46 |                 reply = response[2]
47 |                 frontend.send_multipart([client, reply])
48 |                 available_workers.append(worker)
49 | 
50 |         if frontend in sockets:
51 |             # Get next client request, route to last-used worker
52 |             # TODO: learn different routing tactics
53 |             client, request = frontend.recv_multipart()
54 |             worker = available_workers.pop(0)
55 |             backend.send_multipart([worker, client, request])
56 | 
57 |         if len(available_workers) > 0 and prev_len == 0:
58 |             # Poll for clients now that a worker is available
59 |             poller.register(frontend, zmq.POLLIN)
60 |         if len(available_workers) == 0 and frontend in poller:
61 |             # Don't poll clients if no workers are available
62 |             poller.unregister(frontend)
63 | except:
64 |     import traceback
65 |     traceback.print_exc()
66 |     print("Shutting down ARTM_proxy...")
67 | finally:
68 |     # Clean up
69 |     backend.close()
70 |     frontend.close()
71 |     context.term()
72 | 


--------------------------------------------------------------------------------
/experiments/Calculate pn and habr cooccurrences.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {
 7 |     "collapsed": true
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "%matplotlib inline\n",
12 |     "import itertools\n",
13 |     "import numpy as np\n",
14 |     "import pandas as pd\n",
15 |     "import matplotlib.pyplot as plt"
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "code",
20 |    "execution_count": null,
21 |    "metadata": {
22 |     "collapsed": true
23 |    },
24 |    "outputs": [],
25 |    "source": [
26 |     "# Загрузим словарь со словами объединенной коллекции\n",
27 |     "cooc_modality = \"text\"\n",
28 |     "\n",
29 |     "vocab_list = list(map(lambda r: r[0], filter(lambda r: len(r) > 1 and r[1] == cooc_modality,\n",
30 |     "                                             map(lambda r: r.strip().split(), open(\"merged_vocab.txt\", \"r\")))))\n",
31 |     "vocab_map = dict(zip(vocab_list, range(len(vocab_list))))"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": null,
37 |    "metadata": {
38 |     "collapsed": false
39 |    },
40 |    "outputs": [],
41 |    "source": [
42 |     "%%time\n",
43 |     "\n",
44 |     "word_count = {}\n",
45 |     "pair_count = {}\n",
46 |     "\n",
47 |     "for i, text in enumerate(open(\"batch_vw.txt\"), 1):\n",
48 |     "    text = set(text.strip().split()[2:])\n",
49 |     "    token_ids = set(filter(None, map(vocab_map.get, text)))\n",
50 |     "    for u in token_ids:\n",
51 |     "        word_count.setdefault(u, 0)\n",
52 |     "        word_count[u] += 1\n",
53 |     "    for p in itertools.combinations(token_ids, 2):\n",
54 |     "        pair_count.setdefault(p, 0)\n",
55 |     "        pair_count[p] += 1\n",
56 |     "    if i % 100 == 0:\n",
57 |     "        print(\"Processed %i documents\" % i)"
58 |    ]
59 |   },
60 |   {
61 |    "cell_type": "markdown",
62 |    "metadata": {},
63 |    "source": [
64 |     "---"
65 |    ]
66 |   }
67 |  ],
68 |  "metadata": {
69 |   "kernelspec": {
70 |    "display_name": "Python 3",
71 |    "language": "python",
72 |    "name": "python3"
73 |   },
74 |   "language_info": {
75 |    "codemirror_mode": {
76 |     "name": "ipython",
77 |     "version": 3
78 |    },
79 |    "file_extension": ".py",
80 |    "mimetype": "text/x-python",
81 |    "name": "python",
82 |    "nbconvert_exporter": "python",
83 |    "pygments_lexer": "ipython3",
84 |    "version": "3.5.3"
85 |   },
86 |   "latex_envs": {
87 |    "bibliofile": "biblio.bib",
88 |    "cite_by": "apalike",
89 |    "current_citInitial": 1,
90 |    "eqLabelWithNumbers": true,
91 |    "eqNumInitial": 0
92 |   }
93 |  },
94 |  "nbformat": 4,
95 |  "nbformat_minor": 2
96 | }
97 | 


--------------------------------------------------------------------------------
/server/static/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <meta charset="utf-8">
 4 | 
 5 | <head>
 6 |     <title>Exploratory search</title>
 7 |     <!-- CSS styles -->
 8 |     <link href="css/index.css" rel="stylesheet" type="text/css">
 9 |     <link href="css/bootstrap.min.css" rel="stylesheet" type="text/css">
10 |     <link href="css/fileinput.min.css" rel="stylesheet" type="text/css">
11 | 
12 |     <!-- External JS libraries -->
13 |     <script src="js/jquery.min.js"></script>
14 |     <script src="js/jquery.history.min.js"></script>
15 |     <script src="js/bootstrap.min.js"></script>
16 |     <script src="js/fileinput.min.js"></script>
17 |     <script src="js/d3.min.js"></script>
18 |     <script src="js/carrotsearch.foamtree.min.js"></script>
19 |     <script src="js/carrotsearch.circles.min.js"></script>
20 |     <script src="js/hammer.min.js"></script>
21 | 
22 |     <!-- The very main JS file -->
23 |     <script type="text/javascript" src="js/index.js"></script>
24 | </head>
25 | <body>
26 |     <!-- Navigation panel markup -->
27 |     <nav class="navbar navbar-default navbar-fixed-top">
28 |         <div class="container-fluid">
29 |             <div class="navbar-header">
30 |                 <a class="navbar-brand" href="https://github.com/AVBelyy/Rysearch" target="_blank">Rysearch</a>
31 |             </div>
32 |             <ul class="nav navbar-nav" id="navigation_bar">
33 |                 <li class="active" id="navbar_home_container"><a id="home_btn">Карта знаний</a></li>
34 |             </ul>
35 |             <form class="navbar-form navbar-right">
36 |                 <div id="search_text_container" class="input-group">
37 |                     <input id="search_text" type="text" class="form-control"
38 |                            placeholder="Введите запрос или загрузите документ">
39 |                     <div class="input-group-btn">
40 |                         <label id="upload_btn" class="btn btn-default btn-file">
41 |                             <i id="upload_caption" class="glyphicon glyphicon-upload"></i>
42 |                             <input id="upload_hidden" name="doc" type="file" style="display: none;"
43 |                                    accept="text/plain">
44 |                         </label>
45 |                     </div>
46 |                 </div>
47 |             </form>
48 |         </div>
49 |     </nav>
50 | 
51 |     <!-- Container for displaying either a map or an article -->
52 |     <div class="container-fluid" id="main_container">
53 |         <div id="knowledge_map_container"></div>
54 |         <div id="overview_container" class="row">
55 |             <div class="col-md-8" id="document_container"></div>
56 |             <div class="col-md-4" id="recommendations_container"></div>
57 |         </div>
58 |     </div>
59 | 
60 |     <!-- Container for displaying recommendation -->
61 | </body>
62 | </html>
63 | 


--------------------------------------------------------------------------------
/experiments/Count number of tokens in VW file.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import pandas as pd"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 2,
 17 |    "metadata": {
 18 |     "collapsed": false
 19 |    },
 20 |    "outputs": [
 21 |     {
 22 |      "name": "stdout",
 23 |      "output_type": "stream",
 24 |      "text": [
 25 |       "CPU times: user 33.2 s, sys: 271 ms, total: 33.5 s\n",
 26 |       "Wall time: 33.4 s\n"
 27 |      ]
 28 |     }
 29 |    ],
 30 |    "source": [
 31 |     "%%time\n",
 32 |     "\n",
 33 |     "stats = []\n",
 34 |     "\n",
 35 |     "for i, doc in enumerate(open(\"../datasets/habrahabr/habrahabr.txt\")):\n",
 36 |     "    tokens = doc.split()\n",
 37 |     "    doc_id = tokens[0]\n",
 38 |     "    modalities = {}\n",
 39 |     "    cur_mod = \"\"\n",
 40 |     "    for token in tokens[1:]:\n",
 41 |     "        if token.startswith(\"|\"):\n",
 42 |     "            cur_mod = token[1:]\n",
 43 |     "            modalities[cur_mod] = []\n",
 44 |     "        else:\n",
 45 |     "            modalities[cur_mod].append(token)\n",
 46 |     "    stats.append((doc_id, len(modalities[\"text\"]), len(set(modalities[\"text\"])), \\\n",
 47 |     "                  len(modalities[\"text_habr\"]), len(set(modalities[\"text_habr\"]))))"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 3,
 53 |    "metadata": {
 54 |     "collapsed": false
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "stats_df = pd.DataFrame(stats, columns=[\"doc_id\", \"n_all_common\", \"n_uniq_common\", \"n_all_spec\", \"n_uniq_spec\"])\n",
 59 |     "stats_df = stats_df.set_index(\"doc_id\")"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": 4,
 65 |    "metadata": {
 66 |     "collapsed": false
 67 |    },
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "stats_df.to_csv(\"modalities_stats.csv\")"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "---"
 78 |    ]
 79 |   }
 80 |  ],
 81 |  "metadata": {
 82 |   "kernelspec": {
 83 |    "display_name": "Python 3",
 84 |    "language": "python",
 85 |    "name": "python3"
 86 |   },
 87 |   "language_info": {
 88 |    "codemirror_mode": {
 89 |     "name": "ipython",
 90 |     "version": 3
 91 |    },
 92 |    "file_extension": ".py",
 93 |    "mimetype": "text/x-python",
 94 |    "name": "python",
 95 |    "nbconvert_exporter": "python",
 96 |    "pygments_lexer": "ipython3",
 97 |    "version": "3.5.2"
 98 |   },
 99 |   "latex_envs": {
100 |    "bibliofile": "biblio.bib",
101 |    "cite_by": "apalike",
102 |    "current_citInitial": 1,
103 |    "eqLabelWithNumbers": true,
104 |    "eqNumInitial": 0
105 |   }
106 |  },
107 |  "nbformat": 4,
108 |  "nbformat_minor": 0
109 | }
110 | 


--------------------------------------------------------------------------------
/server/static/css/index.css:
--------------------------------------------------------------------------------
  1 | body {
  2 |     padding-top: 70px;
  3 | }
  4 | 
  5 | #knowledge_map_container {
  6 |     position: absolute;
  7 |     top: 60px;
  8 |     bottom: 0px;
  9 |     left: 0px;
 10 |     right: 0px;
 11 | }
 12 | 
 13 | #doc_sunburst_container {
 14 |     position: absolute;
 15 |     top: 60px;
 16 |     bottom: 0px;
 17 |     left: 0px;
 18 |     right: 0px;
 19 | }
 20 | 
 21 | #document_container {
 22 |     padding-left: 10px;
 23 | }
 24 | 
 25 | #transform_container {
 26 |     display: none;
 27 | }
 28 | 
 29 | .polygons {
 30 |     fill: #00ff00;
 31 |     stroke: #000;
 32 | }
 33 | 
 34 | .hidden {
 35 |     display: none;
 36 | }
 37 | 
 38 | svg {
 39 |     padding-left: 60px;
 40 |     display: none;
 41 | }
 42 | 
 43 | .chosen_topic {
 44 |     font-family: 'Anton', sans-serif;
 45 |     font-size: 16px;
 46 |     font-style: normal;
 47 |     font-variant: normal;
 48 |     font-weight: normal;
 49 |     line-height: 15.4px;
 50 | }
 51 | 
 52 | .polygons #selected-polygon {
 53 |     fill: #f00;
 54 | }
 55 | 
 56 | .titles {
 57 |     font-family: 'Anton', sans-serif;
 58 |     font-size: 14px;
 59 |     font-style: normal;
 60 |     font-variant: normal;
 61 |     font-weight: normal;
 62 |     line-height: 15.4px;
 63 | }
 64 | 
 65 | .chosen_topics_label {
 66 |     font-size: 250%;
 67 | }
 68 | 
 69 | .document_title {
 70 |     font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
 71 |     font-size: 24px;
 72 |     font-style: normal;
 73 |     font-variant: normal;
 74 |     font-weight: bold;
 75 |     line-height: 26.4px;
 76 | }
 77 | 
 78 | .document_authors {
 79 |     font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
 80 |     font-size: 18px;
 81 |     font-style: normal;
 82 |     font-variant: normal;
 83 |     font-weight: bold;
 84 |     line-height: 22px;
 85 | }
 86 | 
 87 | .document_text {
 88 |     font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
 89 |     font-size: 14px;
 90 |     font-style: normal;
 91 |     font-variant: normal;
 92 |     font-weight: 400;
 93 |     line-height: 20px;
 94 | }
 95 | 
 96 | .recommendation_title {
 97 |     font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
 98 |     font-size: 16px;
 99 |     font-style: normal;
100 |     font-variant: normal;
101 |     font-weight: bold;
102 |     line-height: 26.4px;
103 | }
104 | 
105 | .recommendation_text {
106 |     font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
107 |     font-size: 12px;
108 |     font-style: normal;
109 |     font-variant: normal;
110 |     font-weight: 400;
111 |     line-height: 20px;
112 | }
113 | 
114 | h1 {
115 |     font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
116 |     font-size: 24px;
117 |     font-style: normal;
118 |     font-variant: normal;
119 |     font-weight: 500;
120 |     line-height: 26.4px;
121 | }
122 | 
123 | p {
124 |     font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
125 |     font-size: 14px;
126 |     font-style: normal;
127 |     font-variant: normal;
128 |     font-weight: 400;
129 |     line-height: 20px;
130 | }
131 | 
132 | /* Upload button spinning animation */
133 | .glyphicon-refresh-animate {
134 |     -animation: spin .7s infinite linear;
135 |     -webkit-animation: spin2 .7s infinite linear;
136 | }
137 | 
138 | @-webkit-keyframes spin2 {
139 |     from { -webkit-transform: rotate(0deg);}
140 |     to { -webkit-transform: rotate(360deg);}
141 | }
142 | 
143 | @keyframes spin {
144 |     from { transform: scale(1) rotate(0deg);}
145 |     to { transform: scale(1) rotate(360deg);}
146 | }
147 | 
148 | #search_text {
149 |     width: 400px;
150 | }
151 | 
152 | .collection_image {
153 |     vertical-align: text-top;
154 |     margin-left: 0.25em;
155 | }
156 | 


--------------------------------------------------------------------------------
/experiments/Exporting Postnauka from MySQL dump.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import os\n",
 12 |     "import collections\n",
 13 |     "import pymysql\n",
 14 |     "import pymysql.cursors"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 13,
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [
 24 |     {
 25 |      "name": "stdout",
 26 |      "output_type": "stream",
 27 |      "text": [
 28 |       "CPU times: user 96 ms, sys: 0 ns, total: 96 ms\n",
 29 |       "Wall time: 1.3 s\n"
 30 |      ]
 31 |     }
 32 |    ],
 33 |    "source": [
 34 |     "%%time\n",
 35 |     "\n",
 36 |     "# Дополним посты на ПостНауке именами авторов.\n",
 37 |     "\n",
 38 |     "conn = pymysql.connect(host=\"localhost\",\n",
 39 |     "                       user=\"root\",\n",
 40 |     "                       password=\"\",\n",
 41 |     "                       db=\"postnauka\",\n",
 42 |     "                       charset=\"utf8\",\n",
 43 |     "                       cursorclass=pymysql.cursors.DictCursor)\n",
 44 |     "\n",
 45 |     "authors_names = collections.defaultdict(list)\n",
 46 |     "\n",
 47 |     "try:\n",
 48 |     "    with conn.cursor() as cur:\n",
 49 |     "        q = \"\"\"\n",
 50 |     "        -- Получить имена авторов\n",
 51 |     "        select tr.object_id as post_id, t.term_id as author_id, tt.description as author_name\n",
 52 |     "        from pn_term_taxonomy tt\n",
 53 |     "        join pn_terms t on (t.term_id = tt.term_id)\n",
 54 |     "        join pn_term_relationships tr on (tr.term_taxonomy_id = tt.term_taxonomy_id)\n",
 55 |     "        join pn_posts p on (p.id = tr.object_id)\n",
 56 |     "        where p.post_type = 'post' and p.post_status = 'publish' and tt.taxonomy = 'author'\n",
 57 |     "        order by tr.object_id, t.term_id\n",
 58 |     "        \"\"\"\n",
 59 |     "        cur.execute(q)\n",
 60 |     "        for row in cur:\n",
 61 |     "            doc_id = row[\"post_id\"]\n",
 62 |     "            author_id = row[\"author_id\"]\n",
 63 |     "            author_str = row[\"author_name\"].split()[:-3]\n",
 64 |     "            author_name = \" \".join(author_str[:len(author_str) // 2])\n",
 65 |     "            authors_names[doc_id].append((author_id, author_name))\n",
 66 |     "finally:\n",
 67 |     "    conn.close()"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 21,
 73 |    "metadata": {
 74 |     "collapsed": false
 75 |    },
 76 |    "outputs": [
 77 |     {
 78 |      "name": "stdout",
 79 |      "output_type": "stream",
 80 |      "text": [
 81 |       "CPU times: user 265 ms, sys: 128 ms, total: 393 ms\n",
 82 |       "Wall time: 836 ms\n"
 83 |      ]
 84 |     }
 85 |    ],
 86 |    "source": [
 87 |     "%%time\n",
 88 |     "\n",
 89 |     "# Дополним существующий датасет именами авторов из MySQL дампа.\n",
 90 |     "\n",
 91 |     "for doc_id, authors in authors_names.items():\n",
 92 |     "    with open(\"../datasets/postnauka/raw_data/meta/%s_meta.txt\" % doc_id, \"a\") as meta_file:\n",
 93 |     "        for author_id, author_name in authors:\n",
 94 |     "            meta_file.write(\"author_name\\t%d\\t%s\\n\" % (author_id, author_name))"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "markdown",
 99 |    "metadata": {},
100 |    "source": [
101 |     "---"
102 |    ]
103 |   }
104 |  ],
105 |  "metadata": {
106 |   "kernelspec": {
107 |    "display_name": "Python 3",
108 |    "language": "python",
109 |    "name": "python3"
110 |   },
111 |   "language_info": {
112 |    "codemirror_mode": {
113 |     "name": "ipython",
114 |     "version": 3
115 |    },
116 |    "file_extension": ".py",
117 |    "mimetype": "text/x-python",
118 |    "name": "python",
119 |    "nbconvert_exporter": "python",
120 |    "pygments_lexer": "ipython3",
121 |    "version": "3.5.2"
122 |   },
123 |   "latex_envs": {
124 |    "bibliofile": "biblio.bib",
125 |    "cite_by": "apalike",
126 |    "current_citInitial": 1,
127 |    "eqLabelWithNumbers": true,
128 |    "eqNumInitial": 0
129 |   }
130 |  },
131 |  "nbformat": 4,
132 |  "nbformat_minor": 0
133 | }
134 | 


--------------------------------------------------------------------------------
/server/server.js:
--------------------------------------------------------------------------------
  1 | const path = require("path");
  2 | 
  3 | const express = require("express");
  4 | const multer  = require("multer");
  5 | const zmq = require("zmq");
  6 | const uuidV4 = require("uuid/v4");
  7 | const bodyParser = require("body-parser");
  8 | 
  9 | // Initialize common data structures
 10 | 
 11 | // TODO: think about using TTL (time-to-live) later
 12 | var routingQueue = {};
 13 | 
 14 | // Initialize zmq
 15 | var sock = zmq.socket("dealer");
 16 | 
 17 | function sendToSock (res, msg) {
 18 |     if (typeof msg !== "object" || msg === null) {
 19 |         return false;
 20 |     }
 21 |     var uuid = uuidV4();
 22 |     routingQueue[uuid] = res;
 23 |     msg["id"] = uuid;
 24 |     return sock.send(JSON.stringify(msg)) === 0;
 25 | }
 26 | 
 27 | sock.connect("tcp://localhost:2411");
 28 | 
 29 | var artmTopics = null;
 30 | sock.on("message", function (reply) {
 31 |     reply = JSON.parse(reply);
 32 |     if (reply.act == "get_topics") {
 33 |         artmTopics = reply.data;
 34 |     } else if (reply.act == "recommend_docs" || reply.act == "get_documents" ||
 35 |                reply.act == "get_document" || reply.act == "transform_doc" ||
 36 |                reply.act == "get_next_assessment" || reply.act == "assess_document" ||
 37 |                reply.act == "perform_search"
 38 |         ) {
 39 |         var res = routingQueue[reply.id];
 40 |         delete routingQueue[reply.id];
 41 |         res.send(reply.data);
 42 |     }
 43 | });
 44 | 
 45 | sock.send(JSON.stringify({"act": "get_topics"}));
 46 | 
 47 | // Initialize express
 48 | const app = express();
 49 | app.use(express.static("static"));
 50 | app.use(bodyParser.urlencoded({ extended: true }));
 51 | 
 52 | // TODO: temporary upload path! change later in production
 53 | var UPLOAD_PATH = path.join(__dirname, "uploads/")
 54 | var upload = multer({dest: UPLOAD_PATH})
 55 | 
 56 | app.get("/get-topics", function (req, res) {
 57 |     if (artmTopics) {
 58 |         res.send(artmTopics);
 59 |     } else {
 60 |         res.send({"error": "topics data not ready yet"});
 61 |     }
 62 | });
 63 | 
 64 | app.get("/get-documents", function (req, res) {
 65 |     var topicId = req.query.topic_id;
 66 |     var offset = parseInt(req.query.offset);
 67 |     var limit = parseInt(req.query.limit);
 68 |     sendToSock(res, { "act": "get_documents", "topic_id": topicId,
 69 |                       "offset": offset, "limit": limit });
 70 | });
 71 | 
 72 | app.get("/perform-search", function (req, res) {
 73 |     var query = req.query.query;
 74 |     var limit = parseInt(req.query.limit);
 75 |     sendToSock(res, { "act": "perform_search", "query": query, "limit": limit });
 76 | });
 77 | 
 78 | app.get("/get-document", function (req, res) {
 79 |     var docId = req.query.doc_id;
 80 |     var recommTags = req.query.recommend_tags;
 81 |     sendToSock(res, { "act": "get_document", "doc_id": docId, "recommend_tags": !!recommTags });
 82 | });
 83 | 
 84 | app.get("/recommend-docs", function (req, res) {
 85 |     var docId = req.query.doc_id;
 86 |     sendToSock(res, { "act": "recommend_docs", "doc_id": docId });
 87 | });
 88 | 
 89 | app.post("/transform-doc", upload.single("doc"), function (req, res, next) {
 90 |     var fileObj = req.file;
 91 | 
 92 |     if (fileObj.mimetype != "text/plain") {
 93 |         res.send({"error": "unknown filetype '" + fileObj.mimetype + "'"});
 94 |         return;
 95 |     }
 96 | 
 97 |     // Make request to ARTM_bridge
 98 |     sendToSock(res, { "act": "transform_doc", "doc_path": fileObj.path,
 99 |                       "filename": fileObj.originalname });
100 | });
101 | 
102 | app.get("/get-next-assessment", function (req, res) {
103 |     var assessorId = parseInt(req.query.assessor_id);
104 |     var assessorsCnt = parseInt(req.query.assessors_cnt);
105 |     var collectionName = req.query.collection_name;
106 |     sendToSock(res, {"act": "get_next_assessment",
107 |                      "collection_name": collectionName,
108 |                      "assessor_id": assessorId,
109 |                      "assessors_cnt": assessorsCnt});
110 | });
111 | 
112 | app.post("/assess-document", function (req, res) {
113 |     var docId = req.body.doc_id;
114 |     var isRelevant = req.body.is_relevant === "true";
115 |     sendToSock(res, {"act": "assess_document",
116 |                      "doc_id": docId,
117 |                      "is_relevant": isRelevant});
118 | });
119 | 
120 | var server = app.listen(3000, function () {
121 |     var host = server.address().address;
122 |     var port = server.address().port;
123 | 
124 |     console.log("Example app listening at http://%s:%s", host, port);
125 | });
126 | 


--------------------------------------------------------------------------------
/experiments/Import of hierarchical spectrums.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 14,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import warnings\n",
 12 |     "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 15,
 18 |    "metadata": {
 19 |     "collapsed": false,
 20 |     "deletable": true,
 21 |     "editable": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "import pickle\n",
 26 |     "import hierarchy_utils"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 44,
 32 |    "metadata": {
 33 |     "collapsed": true
 34 |    },
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "MODEL_PATH = \"hartm\"\n",
 38 |     "artm_extra_info = pickle.load(open(MODEL_PATH + \"/extra_info.dump\", \"rb\"))"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 38,
 44 |    "metadata": {
 45 |     "collapsed": false
 46 |    },
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "artm_extra_info[\"spectrums\"] = [dump]"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 43,
 55 |    "metadata": {
 56 |     "collapsed": false
 57 |    },
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "pickle.dump(artm_extra_info, open(MODEL_PATH + \"/extra_info.dump\", \"wb\"))"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 12,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [
 70 |     {
 71 |      "name": "stderr",
 72 |      "output_type": "stream",
 73 |      "text": [
 74 |       "/nix/store/r7qpc32yr09l9a0d5y3b8i84kw5phx4p-python3-3.5.3/lib/python3.5/json/encoder.py:198: DeprecationWarning: Interpreting naive datetime as local 2017-07-21 13:38:03.234389. Please add timezone info to timestamps.\n",
 75 |       "  chunks = self.iterencode(o, _one_shot=True)\n"
 76 |      ]
 77 |     }
 78 |    ],
 79 |    "source": [
 80 |     "artm_model = hierarchy_utils.hARTM(theta_columns_naming=\"title\",\n",
 81 |     "                                   cache_theta=True,\n",
 82 |     "                                   class_ids=artm_extra_info[\"class_ids\"])\n",
 83 |     "artm_model.load(MODEL_PATH)"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 16,
 89 |    "metadata": {
 90 |     "collapsed": false,
 91 |     "deletable": true,
 92 |     "editable": true
 93 |    },
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "dump = pickle.load(open(\"flat_spectrum.dump\", \"rb\"))"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 18,
102 |    "metadata": {
103 |     "collapsed": true
104 |    },
105 |    "outputs": [],
106 |    "source": [
107 |     "phi0 = artm_model._levels[0].get_phi()"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 29,
113 |    "metadata": {
114 |     "collapsed": false
115 |    },
116 |    "outputs": [],
117 |    "source": [
118 |     "topics_names = {}\n",
119 |     "for t in phi0.columns:\n",
120 |     "    topics_names[\"level_0_\" + t] = list(phi0[t].sort_values(ascending=False)[:3].index)"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": 33,
126 |    "metadata": {
127 |     "collapsed": false,
128 |     "deletable": true,
129 |     "editable": true
130 |    },
131 |    "outputs": [
132 |     {
133 |      "name": "stdout",
134 |      "output_type": "stream",
135 |      "text": [
136 |       "level_0_topic_13 \t химия, нанотехнологии, материаловедение\n",
137 |       "level_0_topic_5 \t астрономия, астрофизика, вселенная\n",
138 |       "level_0_topic_11 \t физика, физика_элементарных_частиц, квантовая_физика\n",
139 |       "level_0_topic_15 \t математика, информационные_технологии, технологии\n",
140 |       "level_0_topic_10 \t палеонтология, биохимия, стволовые_клетки\n",
141 |       "level_0_topic_2 \t информационная_безопасность, копирайт, будущее\n",
142 |       "level_0_topic_16 \t средневековье, мифология, биоинформатика\n",
143 |       "level_0_topic_4 \t история, история_россии, ссср\n",
144 |       "level_0_topic_8 \t общество, экономика, россия\n",
145 |       "level_0_topic_0 \t философия, россия, география\n",
146 |       "level_0_topic_3 \t культура, литература, культурология\n",
147 |       "level_0_topic_6 \t лингвистика, язык, право\n",
148 |       "level_0_topic_7 \t социология, социология_повседневности, дюркгейм_эмиль\n",
149 |       "level_0_topic_12 \t наука, управление_проектами, работа\n",
150 |       "level_0_topic_14 \t образование, университет, школа\n",
151 |       "level_0_topic_9 \t психология, люди_науки, история_науки\n",
152 |       "level_0_topic_18 \t мозг, нейробиология, искусственный_интеллект\n",
153 |       "level_0_topic_17 \t экология, зоология, этология\n",
154 |       "level_0_topic_1 \t медицина, эволюция, антропология\n"
155 |      ]
156 |     }
157 |    ],
158 |    "source": [
159 |     "for topic_id in dump:\n",
160 |     "    print(topic_id, \"\\t\", \", \".join(topics_names[topic_id]))"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "markdown",
165 |    "metadata": {
166 |     "deletable": true,
167 |     "editable": true
168 |    },
169 |    "source": [
170 |     "---"
171 |    ]
172 |   }
173 |  ],
174 |  "metadata": {
175 |   "kernelspec": {
176 |    "display_name": "Python 3",
177 |    "language": "python",
178 |    "name": "python3"
179 |   },
180 |   "language_info": {
181 |    "codemirror_mode": {
182 |     "name": "ipython",
183 |     "version": 3
184 |    },
185 |    "file_extension": ".py",
186 |    "mimetype": "text/x-python",
187 |    "name": "python",
188 |    "nbconvert_exporter": "python",
189 |    "pygments_lexer": "ipython3",
190 |    "version": "3.5.3"
191 |   },
192 |   "latex_envs": {
193 |    "bibliofile": "biblio.bib",
194 |    "cite_by": "apalike",
195 |    "current_citInitial": 1,
196 |    "eqLabelWithNumbers": true,
197 |    "eqNumInitial": 0
198 |   }
199 |  },
200 |  "nbformat": 4,
201 |  "nbformat_minor": 2
202 | }
203 | 


--------------------------------------------------------------------------------
/experiments/Most popular-sciency Habr authors.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 24,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import pymongo\n",
 12 |     "import collections\n",
 13 |     "import numpy as np\n",
 14 |     "import pandas as pd"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "db = pymongo.MongoClient()"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 10,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "def get_author(doc_id):\n",
 37 |     "    try:\n",
 38 |     "        return list(db[\"datasets\"][\"habrahabr\"].find({\"_id\": doc_id}, {\"authors_names\": 1}))[0][\"authors_names\"][0]\n",
 39 |     "    except Exception:\n",
 40 |     "        return None"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 15,
 46 |    "metadata": {
 47 |     "collapsed": false
 48 |    },
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "clf_output = pd.read_csv(\"classifier_output.csv\")\n",
 52 |     "clf_output.columns = [\"id\", \"proba\"]\n",
 53 |     "clf_output = clf_output.set_index(\"id\")[\"proba\"]"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 21,
 59 |    "metadata": {
 60 |     "collapsed": false
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "%%time\n",
 65 |     "\n",
 66 |     "authors_relevance = collections.defaultdict(list)\n",
 67 |     "\n",
 68 |     "for doc_id, p in clf_output.items():\n",
 69 |     "    author_name = get_author(doc_id)\n",
 70 |     "    authors_relevance[author_name].append(p)"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 112,
 76 |    "metadata": {
 77 |     "collapsed": true
 78 |    },
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "P = len(clf_output[clf_output >  0.5]) / len(clf_output)\n",
 82 |     "N = len(clf_output[clf_output <= 0.5]) / len(clf_output)\n",
 83 |     "\n",
 84 |     "def relevance(n, p=0):\n",
 85 |     "    return np.sqrt(p / P) - np.sqrt(n / N)\n",
 86 |     "\n",
 87 |     "def cont_relevance(ps):\n",
 88 |     "    return np.log10(len(ps)) * np.median(2 * ps - 1)"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 113,
 94 |    "metadata": {
 95 |     "collapsed": false
 96 |    },
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "authors_series = pd.Series({k: relevance(*np.bincount(np.array(v) > 0.5)) if v else 0\n",
100 |     "                            for k, v in authors_relevance.items()})"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 97,
106 |    "metadata": {
107 |     "collapsed": true
108 |    },
109 |    "outputs": [],
110 |    "source": [
111 |     "authors_series = pd.Series({k: cont_relevance(np.array(v)) if v else 0\n",
112 |     "                            for k, v in authors_relevance.items()})"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 126,
118 |    "metadata": {
119 |     "collapsed": false
120 |    },
121 |    "outputs": [
122 |     {
123 |      "name": "stdout",
124 |      "output_type": "stream",
125 |      "text": [
126 |       " 22.938461 -- https://habrahabr.ru/users/SLY_G\n",
127 |       " 18.314086 -- https://habrahabr.ru/users/lozga\n",
128 |       " 14.878677 -- https://habrahabr.ru/users/Zelenyikot\n",
129 |       " 12.548149 -- https://habrahabr.ru/users/krasandm\n",
130 |       " 12.498694 -- https://habrahabr.ru/users/Synth\n",
131 |       " 11.519339 -- https://habrahabr.ru/users/protogui\n",
132 |       " 11.377467 -- https://habrahabr.ru/users/AlexeyR\n",
133 |       " 11.285316 -- https://habrahabr.ru/users/LukinB\n",
134 |       " 10.340973 -- https://habrahabr.ru/users/PatientZero\n",
135 |       "  9.634685 -- https://habrahabr.ru/users/Boomburum\n",
136 |       "  ...\n",
137 |       " -7.757439 -- https://habrahabr.ru/users/BBSoD\n",
138 |       " -7.815363 -- https://habrahabr.ru/users/Tylerskald\n",
139 |       " -8.655005 -- https://habrahabr.ru/users/azproduction\n",
140 |       " -8.725304 -- https://habrahabr.ru/users/XaocCPS\n",
141 |       " -9.768331 -- https://habrahabr.ru/users/azazelis\n",
142 |       "-11.758759 -- https://habrahabr.ru/users/aleksandrit\n",
143 |       "-11.796221 -- https://habrahabr.ru/users/RoboForm\n",
144 |       "-13.036321 -- https://habrahabr.ru/users/marks\n",
145 |       "-15.156126 -- https://habrahabr.ru/users/alizar\n",
146 |       "-16.058914 -- https://habrahabr.ru/users/jeston\n"
147 |      ]
148 |     }
149 |    ],
150 |    "source": [
151 |     "with open(\"popular-sciency-rating.txt\", \"w\") as f:\n",
152 |     "    for i, (author_name, r) in enumerate(authors_series.sort_values(ascending=False).items()):\n",
153 |     "        s = \"% 10.6f -- https://habrahabr.ru/users/%s\\n\" % (r, author_name)\n",
154 |     "        if i < 10 or i >= len(authors_series) - 10:\n",
155 |     "            print(s, end=\"\")\n",
156 |     "        elif i == 10:\n",
157 |     "            print(\"  ...\")\n",
158 |     "        f.write(s)"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "---"
166 |    ]
167 |   }
168 |  ],
169 |  "metadata": {
170 |   "kernelspec": {
171 |    "display_name": "Python 3",
172 |    "language": "python",
173 |    "name": "python3"
174 |   },
175 |   "language_info": {
176 |    "codemirror_mode": {
177 |     "name": "ipython",
178 |     "version": 3
179 |    },
180 |    "file_extension": ".py",
181 |    "mimetype": "text/x-python",
182 |    "name": "python",
183 |    "nbconvert_exporter": "python",
184 |    "pygments_lexer": "ipython3",
185 |    "version": "3.5.3"
186 |   },
187 |   "latex_envs": {
188 |    "bibliofile": "biblio.bib",
189 |    "cite_by": "apalike",
190 |    "current_citInitial": 1,
191 |    "eqLabelWithNumbers": true,
192 |    "eqNumInitial": 0
193 |   }
194 |  },
195 |  "nbformat": 4,
196 |  "nbformat_minor": 2
197 | }
198 | 


--------------------------------------------------------------------------------
/parsers/habrahabr.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "### Парсер Постнауки"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "%reload_ext autoreload\n",
 19 |     "%autoreload 2"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {
 26 |     "collapsed": true
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import bson"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 3,
 36 |    "metadata": {
 37 |     "collapsed": true
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "from text_utils import BaseSource, BaseProcessor, BaseSink\n",
 42 |     "from text_utils import DefaultTextProcessor, DefaultDocumentProcessor, DefaultCollectionProcessor\n",
 43 |     "from text_utils import UciBowSink, MongoDbSink"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 4,
 49 |    "metadata": {
 50 |     "collapsed": true
 51 |    },
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "from sklearn.pipeline import Pipeline\n",
 55 |     "from ipywidgets import FloatProgress\n",
 56 |     "from IPython.display import display\n",
 57 |     "from pathlib import Path"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "Определим пайплайн всей коллекции Хабрахабра из BSON-дампа (`HabrahabrCollectionSource`, `HabrahabrCollectionProcessor`)."
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 5,
 70 |    "metadata": {
 71 |     "collapsed": false
 72 |    },
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "class HabrahabrCollectionSource(BaseSource):\n",
 76 |     "    def fit(self, root_path, *args):\n",
 77 |     "        stop_words = (root_path / \"stopwords.txt\").open().read().split()\n",
 78 |     "        self.root_path = root_path\n",
 79 |     "        self.bson_file = (root_path / \"habrahabr.bson\").open(\"rb\")\n",
 80 |     "        self.doc_pipeline = Pipeline([\n",
 81 |     "            (\"text-processor\",     DefaultTextProcessor()),\n",
 82 |     "            (\"document-processor\", DefaultDocumentProcessor(stop_lemmas=stop_words)),\n",
 83 |     "        ])\n",
 84 |     "        # Save source state\n",
 85 |     "        self.vocab_file = (root_path / \"vocab.pn.txt\").open(\"w\")\n",
 86 |     "        self.docword_file = (root_path / \"docword.pn.txt\").open(\"w\")\n",
 87 |     "        return self"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 10,
 93 |    "metadata": {
 94 |     "collapsed": false
 95 |    },
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "class HabrahabrCollectionProcessor(BaseProcessor):\n",
 99 |     "    def transform(self, src, *args):\n",
100 |     "        docs = []\n",
101 |     "        for doc_id, bson_doc in enumerate(bson.decode_file_iter(src.bson_file)):\n",
102 |     "            if bson_doc[\"company_blog\"] is None:\n",
103 |     "                doc = {}\n",
104 |     "                doc[\"title\"] = bson_doc[\"title\"]\n",
105 |     "                doc[\"url\"] = bson_doc[\"url\"]\n",
106 |     "                doc[\"modalities\"] = src.doc_pipeline.fit_transform(bson_doc[\"content_html\"])\n",
107 |     "                doc[\"modalities\"][\"flat_tag\"] = bson_doc[\"tags\"]\n",
108 |     "                doc[\"modalities\"][\"authors\"] = [bson_doc[\"author_user\"]]\n",
109 |     "                doc[\"modalities\"][\"hubs\"] = bson_doc[\"hubs\"]\n",
110 |     "                doc[\"markdown\"] = bson_doc[\"content_html\"]\n",
111 |     "                doc[\"doc_id\"] = doc_id + 1\n",
112 |     "                docs.append(doc)\n",
113 |     "        docs = DefaultCollectionProcessor(min_len=1, min_df=2).fit_transform(docs)\n",
114 |     "        # Save Markdown texts in MongoDB\n",
115 |     "        MongoDbSink(\"habrahabr\", id_func=lambda doc: \"habr_%d\" % doc[\"doc_id\"]).fit_transform(docs)\n",
116 |     "        # Save collection UCI BOW format\n",
117 |     "        UciBowSink(src.vocab_file, src.docword_file).fit_transform(docs)"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "Построим парсер Хабрахабра из пайплайна, определенного выше."
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 11,
130 |    "metadata": {
131 |     "collapsed": true
132 |    },
133 |    "outputs": [],
134 |    "source": [
135 |     "habrahabr_parser = Pipeline([\n",
136 |     "    (\"take-root-path\",         HabrahabrCollectionSource()),\n",
137 |     "    (\"process-the-collection\", HabrahabrCollectionProcessor()),\n",
138 |     "])"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "markdown",
143 |    "metadata": {},
144 |    "source": [
145 |     "Запустим парсер."
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": 12,
151 |    "metadata": {
152 |     "collapsed": true
153 |    },
154 |    "outputs": [],
155 |    "source": [
156 |     "root_path = Path(\"../datasets/habrahabr\")"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": null,
162 |    "metadata": {
163 |     "collapsed": false
164 |    },
165 |    "outputs": [],
166 |    "source": [
167 |     "%%time\n",
168 |     "\n",
169 |     "habrahabr_parser.fit_transform(root_path)"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "markdown",
174 |    "metadata": {},
175 |    "source": [
176 |     "---"
177 |    ]
178 |   }
179 |  ],
180 |  "metadata": {
181 |   "kernelspec": {
182 |    "display_name": "Python 3",
183 |    "language": "python",
184 |    "name": "python3"
185 |   },
186 |   "language_info": {
187 |    "codemirror_mode": {
188 |     "name": "ipython",
189 |     "version": 3
190 |    },
191 |    "file_extension": ".py",
192 |    "mimetype": "text/x-python",
193 |    "name": "python",
194 |    "nbconvert_exporter": "python",
195 |    "pygments_lexer": "ipython3",
196 |    "version": "3.5.2"
197 |   },
198 |   "latex_envs": {
199 |    "bibliofile": "biblio.bib",
200 |    "cite_by": "apalike",
201 |    "current_citInitial": 1,
202 |    "eqLabelWithNumbers": true,
203 |    "eqNumInitial": 0
204 |   }
205 |  },
206 |  "nbformat": 4,
207 |  "nbformat_minor": 0
208 | }
209 | 


--------------------------------------------------------------------------------
/experiments/UCI Merger.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 55,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import csv\n",
 12 |     "import numpy as np\n",
 13 |     "import pandas as pd\n",
 14 |     "from pathlib import Path"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "В этом блокноте будут мержиться несколько UCI-датасетов в один объединённый."
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 56,
 27 |    "metadata": {
 28 |     "collapsed": false
 29 |    },
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "uci_collections = [(Path(\"../datasets/postnauka\"), \"pn\"),\n",
 33 |     "                   (Path(\"../datasets/ruwiki\"), \"ruwiki\")]\n",
 34 |     "\n",
 35 |     "g_path, g_collection_name = Path(\"../datasets\"), \"merged\""
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "Объединим словари в один и сохраним его."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 63,
 48 |    "metadata": {
 49 |     "collapsed": false
 50 |    },
 51 |    "outputs": [
 52 |     {
 53 |      "name": "stdout",
 54 |      "output_type": "stream",
 55 |      "text": [
 56 |       "CPU times: user 2.81 s, sys: 47 ms, total: 2.86 s\n",
 57 |       "Wall time: 2.86 s\n"
 58 |      ]
 59 |     }
 60 |    ],
 61 |    "source": [
 62 |     "%%time\n",
 63 |     "\n",
 64 |     "global_dictionary = {}\n",
 65 |     "local_mappings = []\n",
 66 |     "\n",
 67 |     "for path, collection_name in uci_collections:\n",
 68 |     "    local_mapping = []\n",
 69 |     "    with (path / (\"vocab.%s.txt\" % collection_name)).open() as infile:\n",
 70 |     "        for ix, line in enumerate(infile):\n",
 71 |     "            token, modality = line.split()\n",
 72 |     "            value = global_dictionary.get((token, modality))\n",
 73 |     "            if value is None:\n",
 74 |     "                value = global_dictionary[(token, modality)] = len(global_dictionary)\n",
 75 |     "            local_mapping.append(value)\n",
 76 |     "    local_mappings.append(local_mapping)\n",
 77 |     "\n",
 78 |     "global_mapping = sorted(map(lambda p: (p[1], p[0]), global_dictionary.items()))\n",
 79 |     "\n",
 80 |     "# Добавим метки коллекций в словарь\n",
 81 |     "collection_ids = {}\n",
 82 |     "for _, collection_name in uci_collections:\n",
 83 |     "    collection_ids[collection_name] = len(global_mapping)\n",
 84 |     "    global_mapping.append((len(global_mapping), (collection_name, \"collection_id\")))\n",
 85 |     "\n",
 86 |     "with (g_path / (\"vocab.%s.txt\" % g_collection_name)).open(\"w\") as outfile:\n",
 87 |     "    for _, value in global_mapping:\n",
 88 |     "        outfile.write(\"%s %s\\n\" % value)"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "Теперь перестроим документы по объединённому словарю."
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "Сначала посчитаем суммарную длину и кол-во документов будущего `docword` файла."
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 66,
108 |    "metadata": {
109 |     "collapsed": false
110 |    },
111 |    "outputs": [],
112 |    "source": [
113 |     "g_doc_count, g_bow_length = 0, 0\n",
114 |     "\n",
115 |     "for path, collection_name in uci_collections:\n",
116 |     "    with (path / (\"docword.%s.txt\" % collection_name)).open() as infile:\n",
117 |     "        dict_length = infile.readline()\n",
118 |     "        doc_count = int(infile.readline())\n",
119 |     "        bow_length = int(infile.readline())\n",
120 |     "        g_doc_count += doc_count\n",
121 |     "        g_bow_length += bow_length"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "Теперь построим `docword` файл объединённой коллекции."
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 70,
134 |    "metadata": {
135 |     "collapsed": false
136 |    },
137 |    "outputs": [
138 |     {
139 |      "name": "stdout",
140 |      "output_type": "stream",
141 |      "text": [
142 |       "CPU times: user 13min 2s, sys: 5.42 s, total: 13min 7s\n",
143 |       "Wall time: 13min 10s\n"
144 |      ]
145 |     }
146 |    ],
147 |    "source": [
148 |     "%%time\n",
149 |     "\n",
150 |     "with (g_path / (\"docword.%s.txt\" % g_collection_name)).open(\"w\") as outfile:\n",
151 |     "    g_doc_id = 0\n",
152 |     "    outfile.write(\"%d\\n%d\\n%d\\n\" % (len(global_dictionary), g_doc_count, g_bow_length))\n",
153 |     "    for mapping, (path, collection_name) in zip(local_mappings, uci_collections):\n",
154 |     "        with (path / (\"docword.%s.txt\" % collection_name)).open() as infile:\n",
155 |     "            dict_length, doc_count, bow_length = int(infile.readline()), int(infile.readline()), int(infile.readline())\n",
156 |     "            seen_docs = set()\n",
157 |     "            for line in infile:\n",
158 |     "                doc_id, word_id, word_count = map(int, line.split())\n",
159 |     "                if doc_id not in seen_docs:\n",
160 |     "                    # Добавим метки коллекций в документы\n",
161 |     "                    collection_id = collection_ids[collection_name]\n",
162 |     "                    outfile.write(\"%d %d %d\\n\" % (g_doc_id + doc_id, collection_id + 1, 1))\n",
163 |     "                    seen_docs.add(doc_id)\n",
164 |     "                g_word_id = mapping[word_id - 1] + 1\n",
165 |     "                outfile.write(\"%d %d %d\\n\" % (g_doc_id + doc_id, g_word_id, word_count))\n",
166 |     "            g_doc_id += doc_count"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "---"
174 |    ]
175 |   }
176 |  ],
177 |  "metadata": {
178 |   "kernelspec": {
179 |    "display_name": "Python 3",
180 |    "language": "python",
181 |    "name": "python3"
182 |   },
183 |   "language_info": {
184 |    "codemirror_mode": {
185 |     "name": "ipython",
186 |     "version": 3
187 |    },
188 |    "file_extension": ".py",
189 |    "mimetype": "text/x-python",
190 |    "name": "python",
191 |    "nbconvert_exporter": "python",
192 |    "pygments_lexer": "ipython3",
193 |    "version": "3.5.2"
194 |   },
195 |   "latex_envs": {
196 |    "bibliofile": "biblio.bib",
197 |    "cite_by": "apalike",
198 |    "current_citInitial": 1,
199 |    "eqLabelWithNumbers": true,
200 |    "eqNumInitial": 0
201 |   }
202 |  },
203 |  "nbformat": 4,
204 |  "nbformat_minor": 0
205 | }
206 | 


--------------------------------------------------------------------------------
/server/artm_bridge.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import random
  4 | import pickle
  5 | import json
  6 | import zmq
  7 | import regex
  8 | import tempfile
  9 | import traceback
 10 | import glob
 11 | import os
 12 | 
 13 | from parsers import arbitrary, text_utils
 14 | from datetime import datetime
 15 | 
 16 | import artm
 17 | import artm_lib
 18 | 
 19 | 
 20 | MODEL_PATH = "hartm"
 21 | 
 22 | ZMQ_BACKEND_PORT = 2511
 23 | 
 24 | EMPTY, UP, DOWN = b"", b"UP", b"DOWN"
 25 | 
 26 | 
 27 | class BridgeParamError(ValueError):
 28 |     def __init__(self, message):
 29 |         self.message = message
 30 | 
 31 | 
 32 | def rm_flat_dir(dir_path):
 33 |     for file_path in glob.glob(os.path.join(dir_path, "*")):
 34 |         os.remove(file_path)
 35 |     os.rmdir(dir_path)
 36 | 
 37 | def process_msg(message):
 38 |     if message["act"] == "get_topics":
 39 |         response = artm_bridge.model.topics
 40 |     elif message["act"] == "get_documents":
 41 |         topic_id = message["topic_id"]
 42 |         offset = message["offset"]
 43 |         limit = message["limit"]
 44 |         if type(topic_id) is not str:
 45 |             raise BridgeParamError("incorrect param type: `topic_id`")
 46 |         if type(offset) is not int or type(limit) is not int:
 47 |             raise BridgeParamError("`limit` and `offset` fields must be integer")
 48 |         docs, weights = artm_bridge.get_documents_by_topic(topic_id, offset=offset, limit=limit)
 49 |         response = {"docs": docs, "weights": weights}
 50 |     elif message["act"] == "get_document":
 51 |         doc_id = message["doc_id"]
 52 |         if type(doc_id) is not str:
 53 |             raise BridgeParamError("incorrect param type: `doc_id`")
 54 |         docs = artm_bridge.data_source.get_documents_by_ids([doc_id], with_modalities=True)
 55 |         if len(docs) == 0:
 56 |             raise BridgeParamError("document with `doc_id` = '%s' is not found" % doc_id)
 57 |         doc = docs[0]
 58 |         if message["recommend_tags"]:
 59 |             doc["recommended_tags"] = artm_bridge.recommend_tags_by_doc(doc)
 60 |         response = doc
 61 |     elif message["act"] == "perform_search":
 62 |         query = message["query"]
 63 |         limit = message["limit"]
 64 |         if type(query) is not str:
 65 |             raise BridgeParamError("incorrect param type: `query`")
 66 |         if type(limit) is not int:
 67 |             raise BridgeParamError("incorrect param type: `limit`")
 68 |         response = dict(zip(["docs", "theta"],  artm_bridge.search_documents(query, limit=limit)))
 69 |     elif message["act"] == "recommend_docs":
 70 |         doc_id = message["doc_id"]
 71 |         if type(doc_id) is not str:
 72 |             raise BridgeParamError("incorrect param type: `doc_id`")
 73 |         sim_docs_ids = artm_bridge.recommend_docs_by_doc(doc_id)
 74 |         response = artm_bridge.data_source.get_documents_by_ids(sim_docs_ids, with_texts=False)
 75 |     elif message["act"] == "transform_doc":
 76 |         doc_path = message["doc_path"]
 77 |         filename = message["filename"]
 78 |         try:
 79 |             # Initialize file resources
 80 |             doc_file = open(doc_path)
 81 |             vw_fd,vw_path = tempfile.mkstemp(prefix="upload", text=True)
 82 |             vw_file = os.fdopen(vw_fd, "w")
 83 |             batch_path = tempfile.mkdtemp(prefix="batch")
 84 |             # Parse uploaded file
 85 |             doc = pipeline.fit_transform(doc_file)
 86 |             # Save to Vowpal Wabbit file
 87 |             text_utils.VowpalWabbitSink(vw_file, lambda x: "upload") \
 88 |                       .fit_transform([doc])
 89 |             # Transform uploaded document and return its Theta matrix
 90 |             response = {}
 91 |             response["filename"] = filename
 92 |             response["theta"] = artm_bridge.model.transform_one(vw_path, batch_path)
 93 |         except:
 94 |             raise
 95 |         finally:
 96 |             # Delete uploaded file
 97 |             doc_file.close()
 98 |             os.remove(doc_path)
 99 |             # Delete temporary files/dirs
100 |             os.remove(vw_path)
101 |             rm_flat_dir(batch_path)
102 |     elif False and message["act"] == "get_next_assessment":
103 |         ass_id = message["assessor_id"]
104 |         ass_cnt = message["assessors_cnt"]
105 |         col_name = message["collection_name"]
106 | 
107 |         if ass_id >= ass_cnt:
108 |             response = "Incorrent `assessor_id`"
109 |         else:
110 |             docs_count = db["datasets"][col_name].count()
111 |             min_id = int(ass_id * docs_count / ass_cnt)
112 |             max_id = int((ass_id + 1) * docs_count / ass_cnt)
113 |             # May take a long time for large datasets
114 |             docs_ids = db["datasets"][col_name].find({}, {"_id": 1})
115 |             docs_ids = list(map(lambda x: x["_id"],
116 |                                 docs_ids.sort([("_id", 1)])))
117 |             ass_docs_ids = docs_ids[min_id:max_id]
118 |             # Get unused documents' ids
119 |             used_docs_ids = db["assessment"][col_name].find({}, {"_id": 1})
120 |             used_docs_ids = list(map(lambda x: x["_id"], used_docs_ids))
121 |             unused_docs_ids = list(set(ass_docs_ids) - set(used_docs_ids))
122 |             # Form response
123 |             random.shuffle(unused_docs_ids)
124 |             # Use batches of 100 docs per request
125 |             response = unused_docs_ids[:100]
126 |     elif False and message["act"] == "assess_document":
127 |         doc_id = message["doc_id"]
128 |         is_relevant = message["is_relevant"]
129 |         col_names = [v for k, v in prefix_to_col_map.items()
130 |                      if doc_id.startswith(k + "_")]
131 |         if len(col_names) != 1:
132 |             response = False
133 |         else:
134 |             col_name = col_names[0]
135 |             dataset = db["assessment"][col_name]
136 |             doc = {
137 |                 "is_relevant": is_relevant,
138 |                 "assess_time": datetime.now()
139 |             }
140 |             dataset.replace_one({"_id": doc_id}, doc, upsert=True)
141 |             response = True
142 |     else:
143 |         raise BridgeParamError("unknown query")
144 | 
145 |     return response
146 | 
147 | try:
148 |     # Initialize arbitrary pipeline
149 |     pipeline = arbitrary.get_pipeline()
150 | 
151 |     # Initialize BigARTM logging
152 |     artm_log_path = tempfile.mkdtemp(prefix="artmlog")
153 |     lc = artm.messages.ConfigureLoggingArgs()
154 |     lc.log_dir = artm_log_path
155 |     lc.minloglevel = 2
156 |     artm.wrapper.LibArtm(logging_config=lc)
157 | 
158 |     # Initialize ZeroMQ
159 |     context = zmq.Context()
160 |     socket = context.socket(zmq.DEALER)
161 |     # TODO: maybe set socket identity for persistence?
162 |     socket.connect("tcp://localhost:%d" % ZMQ_BACKEND_PORT)
163 | 
164 |     # Initialize ARTM bridge
165 |     artm_bridge = artm_lib.ArtmBridge(MODEL_PATH)
166 | 
167 |     # Notify ARTM_proxy that we're up
168 |     socket.send(UP)
169 | 
170 |     print("ARTM_bridge: start serving ZeroMQ queries on port",
171 |           ZMQ_BACKEND_PORT)
172 | 
173 |     while True:
174 |         # Wait for next request from client
175 |         client, request = socket.recv_multipart()
176 |         message = json.loads(request.decode("utf-8"))
177 | 
178 |         # Debug logging
179 |         # print("> " + json.dumps(message))
180 | 
181 |         # Process message
182 |         response = {}
183 |         try:
184 |             response["ok"] = process_msg(message)
185 |         except BridgeParamError as e:
186 |             response["error"] = {"message": e.message}
187 |         except BaseException as e:
188 |             response["error"] = {"message": "server error"}
189 |             traceback.print_exc()
190 | 
191 |         socket.send_multipart([
192 |             client,
193 |             json.dumps({
194 |                 "act":  message["act"],
195 |                 "id":   message.get("id"),
196 |                 "data": response
197 |             }).encode("utf-8")
198 |         ])
199 | except:
200 |     traceback.print_exc()
201 |     print("Shutting down ARTM_bridge...")
202 | finally:
203 |     # Unregister
204 |     socket.send(DOWN)
205 |     # Clean up
206 |     rm_flat_dir(artm_log_path)
207 |     socket.close()
208 |     context.term()
209 | 


--------------------------------------------------------------------------------
/experiments/Sending requests to server from ARTM_model.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import json\n",
 12 |     "import urllib.request"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 2,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "def get_document(doc_id):\n",
 24 |     "    request_url = \"http://localhost:3000/get-document?doc_id=%s\" % doc_id\n",
 25 |     "    response = urllib.request.urlopen(request_url).read().decode(\"utf-8\")\n",
 26 |     "    return json.loads(response) if response else None\n",
 27 |     "\n",
 28 |     "def get_recommendations(doc_id):\n",
 29 |     "    # TODO: исправить с приходом гетерогенности\n",
 30 |     "    doc_id = doc_id.split(\"_\")[1]\n",
 31 |     "    request_url = \"http://localhost:3000/get-recommendations?doc_id=%s\" % doc_id\n",
 32 |     "    response = urllib.request.urlopen(request_url).read().decode(\"utf-8\")\n",
 33 |     "    return json.loads(response) if response else None    "
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 3,
 39 |    "metadata": {
 40 |     "collapsed": false
 41 |    },
 42 |    "outputs": [
 43 |     {
 44 |      "name": "stdout",
 45 |      "output_type": "stream",
 46 |      "text": [
 47 |       "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
 48 |       "Wall time: 7.63 µs\n"
 49 |      ]
 50 |     }
 51 |    ],
 52 |    "source": [
 53 |     "%time\n",
 54 |     "doc = get_document(\"pn_1490\")"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 4,
 60 |    "metadata": {
 61 |     "collapsed": false
 62 |    },
 63 |    "outputs": [
 64 |     {
 65 |      "name": "stdout",
 66 |      "output_type": "stream",
 67 |      "text": [
 68 |       "Оценка талантливости\n"
 69 |      ]
 70 |     }
 71 |    ],
 72 |    "source": [
 73 |     "print(doc[\"title\"])"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 5,
 79 |    "metadata": {
 80 |     "collapsed": false
 81 |    },
 82 |    "outputs": [
 83 |     {
 84 |      "name": "stdout",
 85 |      "output_type": "stream",
 86 |      "text": [
 87 |       "Какие источники информации использует человек при оценке способностей других людей? Почему мы так часто неверно оцениваем возможности других людей? Профессор социальной этики Гарвардского университета Махзарин Банаджи рассказывает о том, почему подбрасывание монеты может быть хорошим предсказанием успешности брака, в рамках проекта Serious Science, созданного командой ПостНауки.\n",
 88 |       "Тренированный человек, безусловно, может отличить хорошо исполненную музыку от сыгранной любителем. Такие люди обычно могут рассказать, в чем разница. Если дать им послушать записи студента старшей школы и, например, Сеговии, то они скажут, что Сеговия играет лучше. Но нас больше интересует ситуация, когда оба претендента кажутся одинаково хорошими. В таком случае может ли наш разум иметь предубеждения относительно них, из-за которых нам будет казаться, что они играют по-разному? Если правы музыканты и они всегда объективно оценивают, обращают внимание только на звук, то разницы быть не должно.\n",
 89 |       "\n",
 90 |       "\n",
 91 |       "Мы считаем, что это стало проблемой: мы теряем талантливых людей, потому что во многих областях уделяем чересчур много внимания тем, кто считается одаренным с детства. Мы не пытаемся давать указания другим, а только отмечаем интересную особенность, что даже среди экспертов существуют интересные различия между словами и поступками. Из этого вытекает следующий вопрос: можно ли это обобщить на другие области, помимо музыки? Оказывается, что да.\n",
 92 |       "Надеемся, что наука о принятии решений при найме на работу разовьется настолько, что мы сможем точно сказать, по каким критериям нужно ориентироваться во время интервью в случае каждой конкретной профессии. Я думаю, что вскоре интервью будут казаться старомодными. Я думаю, что принятие решения о взятии человека на работу в качестве коллеги или подчиненного очень похоже на принятие решения о том, кто будет вашем партнером, супругом. Я думаю, что в обоих этих случаях мы часто бываем одинаково неправы.\n",
 93 |       "\n"
 94 |      ]
 95 |     }
 96 |    ],
 97 |    "source": [
 98 |     "print(doc[\"markdown\"])"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 6,
104 |    "metadata": {
105 |     "collapsed": false
106 |    },
107 |    "outputs": [
108 |     {
109 |      "name": "stdout",
110 |      "output_type": "stream",
111 |      "text": [
112 |       "Токены: ['источник', 'информация', 'использовать', 'оценка', 'способность', 'часто', 'неверно', 'оценивать', 'возможность', 'профессор', 'социальный', 'этика', 'гарвардский', 'университет', 'махзарин', 'банаджи', 'рассказывать', 'подбрасывание', 'монета', 'хороший', 'предсказание', 'успешность', 'брак', 'рамка', 'проект', 'science', 'создавать', 'команда', 'постнаука', 'безусловно', 'отличать', 'исполнять', 'музыка', 'сыграть', 'любитель', 'обычно', 'рассказывать', 'разница', 'давать', 'послушать', 'запись', 'студент', 'старший', 'школа', 'сеговия', 'сеговия', 'играть', 'интересовать', 'ситуация', 'претендент', 'казаться', 'одинаково', 'хороший', 'разум', 'предубеждение', 'относительно', 'казаться', 'играть', 'правый', 'музыкант', 'объективно', 'оценивать', 'обращать', 'внимание', 'звук', 'разница', 'должно', 'считать', 'терять', 'талантливый', 'многий', 'область', 'уделять', 'чересчур', 'внимание', 'считаться', 'одаренный', 'детство', 'пытаться', 'давать', 'указание', 'отмечать', 'интересный', 'особенность', 'эксперт', 'интересный', 'различие', 'поступок', 'вытекать', 'следующий', 'обобщать', 'область', 'помимо', 'музыка', 'надеяться', 'принятие', 'решение', 'наем', 'развиваться', 'настолько', 'точно', 'критерий', 'ориентироваться', 'интервью', 'конкретный', 'профессия', 'думать', 'вскоре', 'интервью', 'казаться', 'старомодный', 'думать', 'принятие', 'решение', 'взятие', 'качество', 'коллега', 'подчиненный', 'похоже', 'принятие', 'решение', 'партнер', 'супруг', 'думать', 'часто', 'бывать', 'одинаково', 'неправый']\n"
113 |      ]
114 |     }
115 |    ],
116 |    "source": [
117 |     "print(\"Токены: %s\" % doc[\"modalities\"][\"text\"])"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 7,
123 |    "metadata": {
124 |     "collapsed": false
125 |    },
126 |    "outputs": [
127 |     {
128 |      "name": "stdout",
129 |      "output_type": "stream",
130 |      "text": [
131 |       "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
132 |       "Wall time: 4.53 µs\n"
133 |      ]
134 |     }
135 |    ],
136 |    "source": [
137 |     "%time\n",
138 |     "recommendations = get_recommendations(\"pn_1490\")"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 8,
144 |    "metadata": {
145 |     "collapsed": false
146 |    },
147 |    "outputs": [
148 |     {
149 |      "name": "stdout",
150 |      "output_type": "stream",
151 |      "text": [
152 |       "Рекомендации\n",
153 |       "1. Посттравматический стресс в отношениях «мать — дочь»\n",
154 |       "2. Психология телесности\n",
155 |       "3. Становление киберпсихологии\n",
156 |       "4. Явление и понятие инсайта\n",
157 |       "5. Чем объясняются оптические иллюзии?\n"
158 |      ]
159 |     }
160 |    ],
161 |    "source": [
162 |     "print(\"Рекомендации\")\n",
163 |     "for i, doc in enumerate(recommendations):\n",
164 |     "    print(\"%d. %s\" % (i + 1, doc[\"title\"]))"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "metadata": {},
170 |    "source": [
171 |     "---"
172 |    ]
173 |   }
174 |  ],
175 |  "metadata": {
176 |   "kernelspec": {
177 |    "display_name": "Python 3",
178 |    "language": "python",
179 |    "name": "python3"
180 |   },
181 |   "language_info": {
182 |    "codemirror_mode": {
183 |     "name": "ipython",
184 |     "version": 3
185 |    },
186 |    "file_extension": ".py",
187 |    "mimetype": "text/x-python",
188 |    "name": "python",
189 |    "nbconvert_exporter": "python",
190 |    "pygments_lexer": "ipython3",
191 |    "version": "3.5.2"
192 |   },
193 |   "latex_envs": {
194 |    "bibliofile": "biblio.bib",
195 |    "cite_by": "apalike",
196 |    "current_citInitial": 1,
197 |    "eqLabelWithNumbers": true,
198 |    "eqNumInitial": 0
199 |   }
200 |  },
201 |  "nbformat": 4,
202 |  "nbformat_minor": 0
203 | }
204 | 


--------------------------------------------------------------------------------
/experiments/Comparison of different lemmatizers.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true,
  8 |     "deletable": true,
  9 |     "editable": true
 10 |    },
 11 |    "outputs": [],
 12 |    "source": [
 13 |     "import re\n",
 14 |     "from itertools import chain\n",
 15 |     "from collections import Counter\n",
 16 |     "from contextlib import closing"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 2,
 22 |    "metadata": {
 23 |     "collapsed": false,
 24 |     "deletable": true,
 25 |     "editable": true
 26 |    },
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "import mystem\n",
 30 |     "import pymorphy2\n",
 31 |     "import multiprocessing"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 3,
 37 |    "metadata": {
 38 |     "collapsed": true,
 39 |     "deletable": true,
 40 |     "editable": true
 41 |    },
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "N_PROCS = 4"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {
 50 |     "deletable": true,
 51 |     "editable": true
 52 |    },
 53 |    "source": [
 54 |     "Сравнение будем проводить на большом текстовом документе (400 Кб, статья «Россия»), взятом из Википедии, и откопированным 10 раз. "
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 4,
 60 |    "metadata": {
 61 |     "collapsed": true,
 62 |     "deletable": true,
 63 |     "editable": true
 64 |    },
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "text = open(\"../datasets/arbitrary/examples/big_russia.txt\").read()"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 5,
 73 |    "metadata": {
 74 |     "collapsed": true,
 75 |     "deletable": true,
 76 |     "editable": true
 77 |    },
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "split_regexp = re.compile(\"(?u)\\\\b\\\\w+\\\\b\")\n",
 81 |     "tokens = split_regexp.findall(text)"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 6,
 87 |    "metadata": {
 88 |     "collapsed": false,
 89 |     "deletable": true,
 90 |     "editable": true
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "data": {
 95 |       "text/plain": [
 96 |        "322660"
 97 |       ]
 98 |      },
 99 |      "execution_count": 6,
100 |      "metadata": {},
101 |      "output_type": "execute_result"
102 |     }
103 |    ],
104 |    "source": [
105 |     "len(tokens)"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 7,
111 |    "metadata": {
112 |     "collapsed": false,
113 |     "deletable": true,
114 |     "editable": true
115 |    },
116 |    "outputs": [
117 |     {
118 |      "data": {
119 |       "text/plain": [
120 |        "[('в', 12340),\n",
121 |        " ('и', 10210),\n",
122 |        " ('на', 4300),\n",
123 |        " ('России', 3800),\n",
124 |        " ('В', 3090),\n",
125 |        " ('с', 2270),\n",
126 |        " ('по', 1990),\n",
127 |        " ('году', 1650),\n",
128 |        " ('года', 1590),\n",
129 |        " ('из', 1410)]"
130 |       ]
131 |      },
132 |      "execution_count": 7,
133 |      "metadata": {},
134 |      "output_type": "execute_result"
135 |     }
136 |    ],
137 |    "source": [
138 |     "Counter(tokens).most_common(10)"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "markdown",
143 |    "metadata": {
144 |     "deletable": true,
145 |     "editable": true
146 |    },
147 |    "source": [
148 |     "## python-mystem"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 8,
154 |    "metadata": {
155 |     "collapsed": true,
156 |     "deletable": true,
157 |     "editable": true
158 |    },
159 |    "outputs": [],
160 |    "source": [
161 |     "from mystem import analyze"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": 9,
167 |    "metadata": {
168 |     "collapsed": true
169 |    },
170 |    "outputs": [],
171 |    "source": [
172 |     "def mystem_nf(tok):\n",
173 |     "    with analyze(tok) as result:\n",
174 |     "        return str(result[0])"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 10,
180 |    "metadata": {
181 |     "collapsed": false
182 |    },
183 |    "outputs": [
184 |     {
185 |      "name": "stdout",
186 |      "output_type": "stream",
187 |      "text": [
188 |       "CPU times: user 13.4 s, sys: 15 ms, total: 13.4 s\n",
189 |       "Wall time: 13.4 s\n"
190 |      ]
191 |     }
192 |    ],
193 |    "source": [
194 |     "%%time\n",
195 |     "\n",
196 |     "lemmas = list(map(mystem_nf, tokens))"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 11,
202 |    "metadata": {
203 |     "collapsed": false
204 |    },
205 |    "outputs": [
206 |     {
207 |      "data": {
208 |       "text/plain": [
209 |        "322660"
210 |       ]
211 |      },
212 |      "execution_count": 11,
213 |      "metadata": {},
214 |      "output_type": "execute_result"
215 |     }
216 |    ],
217 |    "source": [
218 |     "len(lemmas)"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": 12,
224 |    "metadata": {
225 |     "collapsed": false
226 |    },
227 |    "outputs": [
228 |     {
229 |      "data": {
230 |       "text/plain": [
231 |        "[('в', 15430),\n",
232 |        " ('и', 10460),\n",
233 |        " ('год', 5080),\n",
234 |        " ('россия', 4960),\n",
235 |        " ('на', 4590),\n",
236 |        " ('с', 2740),\n",
237 |        " ('быть', 2570),\n",
238 |        " ('российский', 2440),\n",
239 |        " ('по', 2420),\n",
240 |        " ('к', 1570)]"
241 |       ]
242 |      },
243 |      "execution_count": 12,
244 |      "metadata": {},
245 |      "output_type": "execute_result"
246 |     }
247 |    ],
248 |    "source": [
249 |     "Counter(lemmas).most_common(10)"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "markdown",
254 |    "metadata": {
255 |     "deletable": true,
256 |     "editable": true
257 |    },
258 |    "source": [
259 |     "## pymorphy2[fast]"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 13,
265 |    "metadata": {
266 |     "collapsed": true,
267 |     "deletable": true,
268 |     "editable": true
269 |    },
270 |    "outputs": [],
271 |    "source": [
272 |     "morph = pymorphy2.MorphAnalyzer()"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 14,
278 |    "metadata": {
279 |     "collapsed": true,
280 |     "deletable": true,
281 |     "editable": true
282 |    },
283 |    "outputs": [],
284 |    "source": [
285 |     "def morph_nf(tok):\n",
286 |     "    return morph.parse(tok)[0].normal_form"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": 15,
292 |    "metadata": {
293 |     "collapsed": false,
294 |     "deletable": true,
295 |     "editable": true
296 |    },
297 |    "outputs": [
298 |     {
299 |      "name": "stdout",
300 |      "output_type": "stream",
301 |      "text": [
302 |       "CPU times: user 11.2 s, sys: 12 ms, total: 11.2 s\n",
303 |       "Wall time: 11.2 s\n"
304 |      ]
305 |     }
306 |    ],
307 |    "source": [
308 |     "%%time\n",
309 |     "\n",
310 |     "lemmas = list(map(morph_nf, tokens))"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 16,
316 |    "metadata": {
317 |     "collapsed": false,
318 |     "deletable": true,
319 |     "editable": true
320 |    },
321 |    "outputs": [
322 |     {
323 |      "data": {
324 |       "text/plain": [
325 |        "322660"
326 |       ]
327 |      },
328 |      "execution_count": 16,
329 |      "metadata": {},
330 |      "output_type": "execute_result"
331 |     }
332 |    ],
333 |    "source": [
334 |     "len(lemmas)"
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "code",
339 |    "execution_count": 17,
340 |    "metadata": {
341 |     "collapsed": false,
342 |     "deletable": true,
343 |     "editable": true
344 |    },
345 |    "outputs": [
346 |     {
347 |      "data": {
348 |       "text/plain": [
349 |        "[('в', 15730),\n",
350 |        " ('и', 10460),\n",
351 |        " ('год', 6080),\n",
352 |        " ('россия', 4960),\n",
353 |        " ('на', 4590),\n",
354 |        " ('с', 2920),\n",
355 |        " ('быть', 2570),\n",
356 |        " ('российский', 2440),\n",
357 |        " ('по', 2420),\n",
358 |        " ('к', 1580)]"
359 |       ]
360 |      },
361 |      "execution_count": 17,
362 |      "metadata": {},
363 |      "output_type": "execute_result"
364 |     }
365 |    ],
366 |    "source": [
367 |     "Counter(lemmas).most_common(10)"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "markdown",
372 |    "metadata": {
373 |     "deletable": true,
374 |     "editable": true
375 |    },
376 |    "source": [
377 |     "---"
378 |    ]
379 |   }
380 |  ],
381 |  "metadata": {
382 |   "kernelspec": {
383 |    "display_name": "Python 3",
384 |    "language": "python",
385 |    "name": "python3"
386 |   },
387 |   "language_info": {
388 |    "codemirror_mode": {
389 |     "name": "ipython",
390 |     "version": 3
391 |    },
392 |    "file_extension": ".py",
393 |    "mimetype": "text/x-python",
394 |    "name": "python",
395 |    "nbconvert_exporter": "python",
396 |    "pygments_lexer": "ipython3",
397 |    "version": "3.5.3"
398 |   },
399 |   "latex_envs": {
400 |    "bibliofile": "biblio.bib",
401 |    "cite_by": "apalike",
402 |    "current_citInitial": 1,
403 |    "eqLabelWithNumbers": true,
404 |    "eqNumInitial": 0
405 |   }
406 |  },
407 |  "nbformat": 4,
408 |  "nbformat_minor": 0
409 | }
410 | 


--------------------------------------------------------------------------------
/parsers/postnauka.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Парсер Постнауки"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "collapsed": false
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "%reload_ext autoreload\n",
 19 |     "%autoreload 2"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {
 26 |     "collapsed": true
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import regex"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 3,
 36 |    "metadata": {
 37 |     "collapsed": false
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "from text_utils import BaseSource, BaseProcessor, BaseSink\n",
 42 |     "from text_utils import DefaultTextProcessor, DefaultDocumentProcessor, DefaultCollectionProcessor\n",
 43 |     "from text_utils import VowpalWabbitSink, MongoDbSink"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 4,
 49 |    "metadata": {
 50 |     "collapsed": true
 51 |    },
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "from sklearn.pipeline import Pipeline\n",
 55 |     "from ipywidgets import FloatProgress\n",
 56 |     "from IPython.display import display\n",
 57 |     "from pathlib import Path"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "Определим сначала пайплайн для одного документа (`PostnaukaFileSource`, `PostnaukaFileProcessor`)."
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 5,
 70 |    "metadata": {
 71 |     "collapsed": false
 72 |    },
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "pn_tags_trim = regex.compile(\"\\[(post|pcourse) [^\\]]+\\]\")"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 6,
 81 |    "metadata": {
 82 |     "collapsed": false
 83 |    },
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "class PostnaukaFileSource(BaseSource):\n",
 87 |     "    def fit(self, params, *args):\n",
 88 |     "        (text_path, meta_path) = params\n",
 89 |     "        self.text_path = text_path\n",
 90 |     "        self.meta_path = meta_path\n",
 91 |     "        return self"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 7,
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "class PostnaukaFileProcessor(BaseProcessor):\n",
103 |     "    def __init__(self, stop_words):\n",
104 |     "        self.doc_pipeline = Pipeline([\n",
105 |     "            (\"text-processor\",     DefaultTextProcessor(token_pattern=\"(?u)\\\\b\\\\p{L}+\\\\b\")),\n",
106 |     "            (\"document-processor\", DefaultDocumentProcessor(stop_lemmas=stop_words)),\n",
107 |     "        ])\n",
108 |     "\n",
109 |     "    def transform(self, src, *args):\n",
110 |     "        # Parse text file\n",
111 |     "        with src.text_path.open() as fi:\n",
112 |     "            title = fi.readline().strip()\n",
113 |     "            fi.readline()\n",
114 |     "            description = fi.readline().strip()\n",
115 |     "            fi.readline()\n",
116 |     "            text = fi.read()\n",
117 |     "            text = pn_tags_trim.sub(\"\", text)\n",
118 |     "        # Parse meta file\n",
119 |     "        flat_tags = []\n",
120 |     "        authors = []\n",
121 |     "        authors_names = []\n",
122 |     "        with src.meta_path.open() as fi:\n",
123 |     "            for ln in fi:\n",
124 |     "                toks = regex.split(\"\\s+\", ln, 2)\n",
125 |     "                if toks[0] == \"post_tag\":\n",
126 |     "                    flat_tags.append(toks[-1].strip().lower())\n",
127 |     "                elif toks[0] == \"author\":\n",
128 |     "                    authors.append(toks[-1].strip().lower())\n",
129 |     "                elif toks[0] == \"author_name\":\n",
130 |     "                    authors_names.append(toks[-1].strip())\n",
131 |     "        # Run inner pipeline to form modalities\n",
132 |     "        modalities = self.doc_pipeline.fit_transform(text)\n",
133 |     "        # Finally, make a document and return it\n",
134 |     "        doc = {}\n",
135 |     "        doc[\"title\"] = title\n",
136 |     "        doc[\"description\"] = description\n",
137 |     "        doc[\"authors_names\"] = authors_names\n",
138 |     "        doc[\"modalities\"] = modalities\n",
139 |     "        doc[\"modalities\"][\"flat_tag\"] = flat_tags\n",
140 |     "        doc[\"modalities\"][\"authors\"] = authors\n",
141 |     "        doc[\"markdown\"] = text\n",
142 |     "        return doc"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "Теперь определим пайплайн всей коллекции файлов на диске (`PostnaukaCollectionSource`, `PostnaukaCollectionProcessor`)."
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 8,
155 |    "metadata": {
156 |     "collapsed": false
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "class PostnaukaCollectionSource(BaseSource):\n",
161 |     "    def fit(self, root_path, *args):\n",
162 |     "        stop_words = (root_path / \"stopwords.txt\").open().read().split()\n",
163 |     "        self.root_path = root_path\n",
164 |     "        # We will spawn this pipeline in parallel for each document\n",
165 |     "        self.file_parser = Pipeline([\n",
166 |     "            (\"take-file-name\",      PostnaukaFileSource()),\n",
167 |     "            (\"convert-to-document\", PostnaukaFileProcessor(stop_words)),\n",
168 |     "        ])\n",
169 |     "        # Save source state\n",
170 |     "        self.vw_file = (root_path / \"postnauka.txt\").open(\"w\")\n",
171 |     "        self.files_paths = sorted(root_path.glob(\"raw_data/*.txt\"))\n",
172 |     "        self.metas_paths = sorted(root_path.glob(\"raw_data/meta/*_meta.txt\"))\n",
173 |     "        return self"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 9,
179 |    "metadata": {
180 |     "collapsed": false
181 |    },
182 |    "outputs": [],
183 |    "source": [
184 |     "class PostnaukaCollectionProcessor(BaseProcessor):\n",
185 |     "    def transform(self, src, *args):\n",
186 |     "        docs = []\n",
187 |     "        f = FloatProgress(min=0, max=len(src.files_paths))\n",
188 |     "        display(f)\n",
189 |     "        for doc_id, (file_path, meta_path) in enumerate(zip(src.files_paths, src.metas_paths)):\n",
190 |     "            # TODO: run these in parallel threads\n",
191 |     "            doc = src.file_parser.fit_transform((file_path, meta_path))\n",
192 |     "            doc[\"doc_id\"] = doc_id + 1\n",
193 |     "            docs.append(doc)\n",
194 |     "            f.value += 1\n",
195 |     "        docs = DefaultCollectionProcessor(min_len=100, min_df=2).fit_transform(docs)\n",
196 |     "        id_func = lambda doc: \"pn_%d\" % doc[\"doc_id\"]\n",
197 |     "        # Save Markdown texts in MongoDB\n",
198 |     "        MongoDbSink(\"postnauka\", id_func=id_func).fit_transform(docs)\n",
199 |     "        # Save collection in Vowpal Wabbit format\n",
200 |     "        VowpalWabbitSink(src.vw_file, id_func).fit_transform(docs)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {},
206 |    "source": [
207 |     "Построим парсер Постнауки из пайплайна, определенного выше."
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": 10,
213 |    "metadata": {
214 |     "collapsed": false
215 |    },
216 |    "outputs": [],
217 |    "source": [
218 |     "postnauka_parser = Pipeline([\n",
219 |     "    (\"take-root-path\",         PostnaukaCollectionSource()),\n",
220 |     "    (\"process-the-collection\", PostnaukaCollectionProcessor()),\n",
221 |     "])"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "markdown",
226 |    "metadata": {},
227 |    "source": [
228 |     "Запустим парсер."
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 11,
234 |    "metadata": {
235 |     "collapsed": true
236 |    },
237 |    "outputs": [],
238 |    "source": [
239 |     "root_path = Path(\"../datasets/postnauka\")"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 12,
245 |    "metadata": {
246 |     "collapsed": false
247 |    },
248 |    "outputs": [
249 |     {
250 |      "name": "stdout",
251 |      "output_type": "stream",
252 |      "text": [
253 |       "CPU times: user 2min 42s, sys: 7.02 s, total: 2min 49s\n",
254 |       "Wall time: 5min 51s\n"
255 |      ]
256 |     }
257 |    ],
258 |    "source": [
259 |     "%%time\n",
260 |     "\n",
261 |     "postnauka_parser.fit_transform(root_path)"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "markdown",
266 |    "metadata": {},
267 |    "source": [
268 |     "---"
269 |    ]
270 |   }
271 |  ],
272 |  "metadata": {
273 |   "kernelspec": {
274 |    "display_name": "Python 3",
275 |    "language": "python",
276 |    "name": "python3"
277 |   },
278 |   "language_info": {
279 |    "codemirror_mode": {
280 |     "name": "ipython",
281 |     "version": 3
282 |    },
283 |    "file_extension": ".py",
284 |    "mimetype": "text/x-python",
285 |    "name": "python",
286 |    "nbconvert_exporter": "python",
287 |    "pygments_lexer": "ipython3",
288 |    "version": "3.5.2"
289 |   },
290 |   "latex_envs": {
291 |    "bibliofile": "biblio.bib",
292 |    "cite_by": "apalike",
293 |    "current_citInitial": 1,
294 |    "eqLabelWithNumbers": true,
295 |    "eqNumInitial": 0
296 |   }
297 |  },
298 |  "nbformat": 4,
299 |  "nbformat_minor": 0
300 | }
301 | 


--------------------------------------------------------------------------------
/parsers/text_utils.py:
--------------------------------------------------------------------------------
  1 | import regex
  2 | import copy
  3 | 
  4 | from pymystem3 import Mystem
  5 | from pymongo import MongoClient
  6 | from collections import Counter
  7 | from sklearn.pipeline import Pipeline
  8 | from bson.objectid import ObjectId
  9 | 
 10 | 
 11 | default_modalities = [
 12 |     "text",     # Preprocessed tokens of a document's contents.
 13 |     "flat_tag", # Flat tags associated manually with a document.
 14 | ]
 15 | 
 16 | 
 17 | # ----------------------------
 18 | # Generic interfaces
 19 | # ----------------------------
 20 | 
 21 | class BaseTransformable():
 22 |     """
 23 |     Root interface class, which describes abstract transformation of data.
 24 |     """
 25 | 
 26 |     def fit(self, *args):
 27 |         pass
 28 | 
 29 |     def transform(self, *args):
 30 |         pass
 31 | 
 32 |     def fit_transform(self, *args):
 33 |         return self.fit(*args).transform(*args)
 34 | 
 35 | class BaseSource(BaseTransformable):
 36 |     """
 37 |     Root interface class, which describes the starting point of processing.
 38 |     Purpose: accumulate some input, do a preparatory job, and pass
 39 |              an accumulated state further over pipeline.
 40 |     Input:   arbitrary data.
 41 |     Output:  pointer to self (default case), but may be re-defined.
 42 |     """
 43 | 
 44 |     def transform(self, *args):
 45 |         return self
 46 | 
 47 | class BaseProcessor(BaseTransformable):
 48 |     """
 49 |     Root interface class, which describes an intermediate step of processing.
 50 |     Purpose: take some input from previous steps, modify it and pass it over
 51 |              next processors on the pipeline.
 52 |     Input:   arbitrary data.
 53 |     Output:  arbitrary (modified) data.
 54 |     """
 55 | 
 56 |     def fit(self, *args):
 57 |         return self
 58 | 
 59 | class BaseSink(BaseTransformable):
 60 |     """
 61 |     Root interface class, which describes the terminate point of processing.
 62 |     Purpose: perform some action over processed data and serve as a
 63 |              terminal element on the pipeline.
 64 |     Input:   arbitrary data.
 65 |     Output:  None.
 66 |     """
 67 | 
 68 |     def fit(self, *args):
 69 |         return self
 70 | 
 71 |     def transform(self, *args):
 72 |         return None
 73 | 
 74 | class TextProcessor(BaseProcessor):
 75 |     """Interface class, which describes raw text processing."""
 76 |     pass
 77 | 
 78 | class DocumentProcessor(BaseProcessor):
 79 |     """Interface class, which describes document processing."""
 80 |     pass
 81 | 
 82 | class CollectionProcessor(BaseProcessor):
 83 |     """Interface class, which describes collection processing."""
 84 |     pass
 85 | 
 86 | 
 87 | # ----------------------------
 88 | # Specific classes
 89 | # ----------------------------
 90 | 
 91 | # TODO: Document all processors
 92 | 
 93 | class Splitter(BaseProcessor):
 94 |     def __init__(self, token_pattern):
 95 |         self.token_regexp = regex.compile(token_pattern)
 96 | 
 97 |     def transform(self, text, *args):
 98 |         return self.token_regexp.findall(text)
 99 | 
100 | class DictionaryFilterer(BaseProcessor):
101 |     def __init__(self, stop_words=None):
102 |         if stop_words is None:
103 |             self.stop_words = {}
104 |         else:
105 |             self.stop_words = set(stop_words)
106 | 
107 |     def transform(self, tokens, *args):
108 |         return list(filter(lambda t: t not in self.stop_words, tokens))
109 | 
110 | class FrequencyFilterer(BaseProcessor):
111 |     def __init__(self, min_df=None, max_df=None):
112 |         min_df = 0  if min_df is None else min_df
113 |         max_df = 1. if max_df is None else max_df
114 |         if not isinstance(min_df, int) and not isinstance(min_df, float):
115 |             raise ValueError("min_df is neither int nor float")
116 |         if not isinstance(max_df, int) and not isinstance(max_df, float):
117 |             raise ValueError("max_df is neither int nor float")
118 |         self.min_df = min_df
119 |         self.max_df = max_df
120 | 
121 |     def fit(self, tokens, *args):
122 |         freq = Counter(tokens)
123 |         min_df = self.min_df if isinstance(self.min_df, int) else self.min_df * len(tokens)
124 |         max_df = self.max_df if isinstance(self.max_df, int) else self.max_df * len(tokens)
125 |         self.stop_words = set(map(lambda p: p[0], filter(lambda p: p[1] < min_df or p[1] > max_df, freq.items())))
126 |         return self
127 | 
128 |     def transform(self, tokens, *args):
129 |         return list(filter(lambda t: t not in self.stop_words, tokens))
130 | 
131 | class LengthFilterer(BaseProcessor):
132 |     def __init__(self, min_len=0, len_func=None):
133 |         self.min_len = min_len
134 |         self.len_func = len if len_func is None else len_func
135 | 
136 |     def transform(self, tokens, *args):
137 |         return list(filter(lambda t: self.len_func(t) >= self.min_len, tokens))
138 | 
139 | class Lemmatizer(BaseProcessor):
140 |     def __init__(self):
141 |         self.m = Mystem()
142 | 
143 |     def transform(self, tokens, *args):
144 |         lemm_str = " ".join(tokens)
145 |         return list(filter(lambda s: s.strip(), self.m.lemmatize(lemm_str)))
146 | 
147 | class DefaultTextProcessor(TextProcessor):
148 |     def __init__(self, token_pattern="(?u)\\b\\w+\\b", stop_words=None):
149 |         splitter = Splitter(token_pattern)
150 |         filterer = DictionaryFilterer(stop_words=stop_words)
151 | 
152 |         self.text_pipeline = Pipeline([
153 |             ("split-text",    splitter),
154 |             ("filter-tokens", filterer),
155 |         ])
156 | 
157 |     def transform(self, raw_text, *args):
158 |         return self.text_pipeline.fit_transform(raw_text.lower())
159 | 
160 | class DefaultDocumentProcessor(DocumentProcessor):
161 |     def __init__(self, min_df=None, max_df=None, stop_lemmas=None):
162 |         lemmatizer    = Lemmatizer()
163 |         dict_filterer = DictionaryFilterer(stop_words=stop_lemmas)
164 |         freq_filterer = FrequencyFilterer(min_df=min_df, max_df=max_df)
165 | 
166 |         self.doc_pipeline = Pipeline([
167 |             ("lemmatize-tokens",     lemmatizer),
168 |             ("filter-by-dictionary", dict_filterer),
169 |             ("filter-by-frequency",  freq_filterer),
170 |         ])
171 | 
172 |     def transform(self, tokens, *args):
173 |         modalities = dict.fromkeys(default_modalities, [])
174 |         modalities["text"] = self.doc_pipeline.fit_transform(tokens)
175 |         return modalities
176 | 
177 | class DefaultCollectionProcessor(CollectionProcessor):
178 |     def __init__(self, min_len=0, min_df=None, max_df=None, len_func=None):
179 |         len_func = (lambda doc: len(doc["modalities"]["text"])) if len_func is None else len_func
180 | 
181 |         len_filterer = LengthFilterer(min_len=min_len, len_func=len_func)
182 | 
183 |         self.col_pipeline = Pipeline([
184 |             ("filter-by-length", len_filterer),
185 |         ])
186 | 
187 |         self.freq_filterer = FrequencyFilterer(min_df=min_df, max_df=max_df)
188 | 
189 |     def fit(self, docs):
190 |         # TODO: make modality an external parameter
191 |         tokens = sum([doc["modalities"]["text"] for doc in docs], [])
192 |         self.freq_filterer.fit(tokens)
193 |         return self
194 | 
195 |     def transform(self, docs, *args):
196 |         docs = self.col_pipeline.fit_transform(docs)
197 |         docs_modified = []
198 |         for doc in docs:
199 |             # TODO: make modality an external parameter
200 |             doc["modalities"]["text"] = self.freq_filterer.transform(doc["modalities"]["text"])
201 |             docs_modified.append(doc)
202 |         return docs_modified
203 | 
204 | class UciBowSink(CollectionProcessor):
205 |     def __init__(self, vocab_file, docword_file):
206 |         self.vocab_file = vocab_file
207 |         self.docword_file = docword_file
208 | 
209 |     def fit(self, docs):
210 |         Ws = set()
211 |         for doc in docs:
212 |             for k, vs in doc["modalities"].items():
213 |                 Ws |= set(map(lambda v: (regex.sub("\s", "_", v), k), vs))
214 |         self.Ws = dict(zip(Ws, range(len(Ws))))
215 |         return self
216 | 
217 |     def transform(self, docs, *args):
218 |         w, d = len(self.Ws), len(docs)
219 |         nnzs = []
220 |         for docID, doc in enumerate(docs):
221 |             bow = []
222 |             for k, vs in doc["modalities"].items():
223 |                 bow += map(lambda v: self.Ws.get((regex.sub("\s", "_", v), k), -1), vs)
224 |             nnzs += map(lambda p: (docID + 1, p[0] + 1, p[1]), Counter(bow).items())
225 |         docword_header = "%d\n%d\n%d\n" % (d, w, len(nnzs))
226 |         words_list = sorted(self.Ws.items(), key=lambda p: p[1])
227 |         self.vocab_file.write("\n".join(map(lambda k: "%s %s" % k[0], words_list)))
228 |         self.docword_file.write(docword_header + "\n".join(map(lambda v: "%d %d %d" % v, nnzs)))
229 |         self.vocab_file.close()
230 |         self.docword_file.close()
231 | 
232 | class VowpalWabbitSink(BaseSink):
233 |     def __init__(self, vw_file, id_func):
234 |         self.vw_file = vw_file
235 |         self.id_func = id_func
236 | 
237 |     def transform(self, docs, *args):
238 |         for doc in docs:
239 |             modalities_str = " ".join(map(lambda p: "|%s %s" % (p[0],
240 |                              " ".join(map(lambda t: "_".join(t.split()), p[1]))), doc["modalities"].items()))
241 |             self.vw_file.write("%s %s\n" % (self.id_func(doc), modalities_str))
242 |         self.vw_file.close()
243 | 
244 | class MongoDbSink(BaseSink):
245 |     def __init__(self, collection_name, id_func=None):
246 |         client = MongoClient()
247 |         self.collection = client["datasets"][collection_name]
248 |         self.id_func = id_func
249 | 
250 |     def transform(self, docs, *args):
251 |         reqs = copy.deepcopy(docs)
252 |         if self.id_func:
253 |             for req in reqs:
254 |                 req["_id"] = self.id_func(req)
255 |         result = self.collection.insert_many(reqs)
256 |         return result.inserted_ids


--------------------------------------------------------------------------------
/experiments/Parsing habr dataset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "deletable": true,
  7 |     "editable": true
  8 |    },
  9 |    "source": [
 10 |     "# Хабрахабр"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "metadata": {
 17 |     "collapsed": true,
 18 |     "deletable": true,
 19 |     "editable": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "%matplotlib inline\n",
 24 |     "import regex\n",
 25 |     "import numpy as np\n",
 26 |     "import pandas as pd\n",
 27 |     "import matplotlib.pyplot as plt"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 2,
 33 |    "metadata": {
 34 |     "collapsed": false,
 35 |     "deletable": true,
 36 |     "editable": true
 37 |    },
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "from collections import Counter, defaultdict\n",
 41 |     "from pymongo import MongoClient\n",
 42 |     "from sklearn.pipeline import Pipeline\n",
 43 |     "from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor\n",
 44 |     "from ipywidgets import FloatProgress\n",
 45 |     "from IPython.display import display"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "metadata": {
 51 |     "deletable": true,
 52 |     "editable": true
 53 |    },
 54 |    "source": [
 55 |     "Перегоняем данные из базы `test.habrahabr` в базу `datasets.habrahabr` с изменением формата и сохраняем на диске в формате Vowpal Wabbit."
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 9,
 61 |    "metadata": {
 62 |     "collapsed": true,
 63 |     "deletable": true,
 64 |     "editable": true
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "client = MongoClient()\n",
 69 |     "in_collection = client[\"test\"][\"habrahabr\"]\n",
 70 |     "out_collection = client[\"datasets\"][\"habrahabr\"]"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 10,
 76 |    "metadata": {
 77 |     "collapsed": false,
 78 |     "deletable": true,
 79 |     "editable": true
 80 |    },
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "stop_words = open(\"../datasets/habrahabr/stopwords.txt\").read().split()\n",
 84 |     "rare_words = open(\"../datasets/habrahabr/rarewords.txt\").read().split()\n",
 85 |     "stop_lemmas = set(stop_words).union(set(rare_words))\n",
 86 |     "doc_pipeline = Pipeline([\n",
 87 |     "    (\"text-processor\",     DefaultTextProcessor(token_pattern=\"(?u)\\\\b\\\\p{L}+\\\\b\")),\n",
 88 |     "    (\"document-processor\", DefaultDocumentProcessor(stop_lemmas=stop_lemmas)),\n",
 89 |     "])"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {
 95 |     "deletable": true,
 96 |     "editable": true
 97 |    },
 98 |    "source": [
 99 |     "---"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 11,
105 |    "metadata": {
106 |     "collapsed": false,
107 |     "deletable": true,
108 |     "editable": true
109 |    },
110 |    "outputs": [
111 |     {
112 |      "name": "stdout",
113 |      "output_type": "stream",
114 |      "text": [
115 |       "CPU times: user 1.26 s, sys: 10 ms, total: 1.27 s\n",
116 |       "Wall time: 1.27 s\n"
117 |      ]
118 |     }
119 |    ],
120 |    "source": [
121 |     "%%time\n",
122 |     "\n",
123 |     "# TODO: вынести разнесение токенов по двум модальностям (MOD и MOD_habr) в отдельный модуль\n",
124 |     "\n",
125 |     "pn_vocab = {\"text\": set(), \"flat_tag\": set()}\n",
126 |     "\n",
127 |     "for doc in open(\"../datasets/postnauka/postnauka.txt\"):\n",
128 |     "    tokens = doc.split()\n",
129 |     "    for token in tokens[1:]:\n",
130 |     "        if token.startswith(\"|\"):\n",
131 |     "            cur_mod = token[1:]\n",
132 |     "        else:\n",
133 |     "            if cur_mod == \"text\" or cur_mod == \"flat_tag\":\n",
134 |     "                pn_vocab[cur_mod].add(token)"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 12,
140 |    "metadata": {
141 |     "collapsed": false,
142 |     "deletable": true,
143 |     "editable": true
144 |    },
145 |    "outputs": [
146 |     {
147 |      "data": {
148 |       "text/plain": [
149 |        "44995"
150 |       ]
151 |      },
152 |      "execution_count": 12,
153 |      "metadata": {},
154 |      "output_type": "execute_result"
155 |     }
156 |    ],
157 |    "source": [
158 |     "len(pn_vocab[\"text\"]) + len(pn_vocab[\"flat_tag\"])"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 13,
164 |    "metadata": {
165 |     "collapsed": false,
166 |     "deletable": true,
167 |     "editable": true
168 |    },
169 |    "outputs": [],
170 |    "source": [
171 |     "def preprocess_tag(tag):\n",
172 |     "    return \"_\".join(regex.findall(\"(?u)\\\\b\\\\p{L}+\\\\b\", tag.strip().lower()))"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": 14,
178 |    "metadata": {
179 |     "collapsed": false,
180 |     "deletable": true,
181 |     "editable": true
182 |    },
183 |    "outputs": [
184 |     {
185 |      "name": "stdout",
186 |      "output_type": "stream",
187 |      "text": [
188 |       "CPU times: user 19min 3s, sys: 37.3 s, total: 19min 40s\n",
189 |       "Wall time: 1h 22min 32s\n"
190 |      ]
191 |     }
192 |    ],
193 |    "source": [
194 |     "%%time\n",
195 |     "\n",
196 |     "docs_count = in_collection.count({ \"company_blog\": None })\n",
197 |     "f = FloatProgress(min=0, max=docs_count)\n",
198 |     "display(f)\n",
199 |     "\n",
200 |     "counter = 0\n",
201 |     "\n",
202 |     "with open(\"../datasets/habrahabr/habrahabr.txt\", \"w\") as vw_file:\n",
203 |     "    for doc_id, mongo_doc in enumerate(in_collection.find({ \"company_blog\": None }).sort(\"_id\", 1), 1):\n",
204 |     "        doc = {}\n",
205 |     "        doc[\"_id\"] = \"habr_%d\" % doc_id\n",
206 |     "        doc[\"original_id\"] = mongo_doc[\"_id\"]\n",
207 |     "        doc[\"title\"] = mongo_doc[\"title\"]\n",
208 |     "        doc[\"url\"] = mongo_doc[\"url\"]\n",
209 |     "        doc[\"modalities\"] = {\"text_habr\": [], \"text\": [], \"flat_tag_habr\": [], \"flat_tag\": []}\n",
210 |     "        modalities = doc_pipeline.fit_transform(mongo_doc[\"content_html\"])\n",
211 |     "        for token in modalities[\"text\"]:\n",
212 |     "            if token in pn_vocab[\"text\"]:\n",
213 |     "                doc[\"modalities\"][\"text\"].append(token)\n",
214 |     "            else:\n",
215 |     "                doc[\"modalities\"][\"text\"].append(token)\n",
216 |     "                doc[\"modalities\"][\"text_habr\"].append(token)\n",
217 |     "        for token in map(preprocess_tag, mongo_doc[\"tags\"]):\n",
218 |     "            if token in pn_vocab[\"flat_tag\"]:\n",
219 |     "                doc[\"modalities\"][\"flat_tag\"].append(token)\n",
220 |     "            else:\n",
221 |     "                doc[\"modalities\"][\"flat_tag\"].append(token)\n",
222 |     "                doc[\"modalities\"][\"flat_tag_habr\"].append(token)\n",
223 |     "        doc[\"modalities\"][\"authors\"] = [mongo_doc[\"author_user\"]]\n",
224 |     "        doc[\"modalities\"][\"hubs\"] = mongo_doc[\"hubs\"]\n",
225 |     "        doc[\"markdown\"] = mongo_doc[\"content_html\"]\n",
226 |     "        # TODO: подтягивать имена авторов с Хабра\n",
227 |     "        doc[\"authors_names\"] = doc[\"modalities\"][\"authors\"]\n",
228 |     "        # Фильтрация коротких документов из Хабра\n",
229 |     "        if len(doc[\"modalities\"][\"text\"]) > 100:\n",
230 |     "            # Записать в Vowpal Wabbit\n",
231 |     "            modalities_str = \" \".join(map(lambda p: \"|%s %s\" % (p[0],\n",
232 |     "                             \" \".join(map(lambda t: \"_\".join(t.split()), p[1]))), doc[\"modalities\"].items()))\n",
233 |     "            vw_file.write(\"%s %s\\n\" % (doc[\"_id\"], modalities_str))\n",
234 |     "            # Записать в MongoDB\n",
235 |     "            out_collection.insert_one(doc)\n",
236 |     "        # Увеличить счетчик прогресс-бара\n",
237 |     "        f.value += 1"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "markdown",
242 |    "metadata": {
243 |     "deletable": true,
244 |     "editable": true
245 |    },
246 |    "source": [
247 |     "---"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "markdown",
252 |    "metadata": {
253 |     "deletable": true,
254 |     "editable": true
255 |    },
256 |    "source": [
257 |     "### Фильтрация слов с низким DF"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": 28,
263 |    "metadata": {
264 |     "collapsed": false,
265 |     "deletable": true,
266 |     "editable": true
267 |    },
268 |    "outputs": [
269 |     {
270 |      "name": "stdout",
271 |      "output_type": "stream",
272 |      "text": [
273 |       "CPU times: user 18min 34s, sys: 29.3 s, total: 19min 3s\n",
274 |       "Wall time: 1h 7min 55s\n"
275 |      ]
276 |     }
277 |    ],
278 |    "source": [
279 |     "%%time\n",
280 |     "\n",
281 |     "docs_count = in_collection.count({ \"company_blog\": None })\n",
282 |     "f = FloatProgress(min=0, max=docs_count)\n",
283 |     "display(f)\n",
284 |     "\n",
285 |     "word_counter = defaultdict(set)\n",
286 |     "\n",
287 |     "for doc_id, mongo_doc in enumerate(in_collection.find({ \"company_blog\": None }), 1):\n",
288 |     "    modalities = doc_pipeline.fit_transform(mongo_doc[\"content_html\"])\n",
289 |     "    for word in modalities[\"text\"]:\n",
290 |     "        word_counter[word].add(doc_id)\n",
291 |     "    # Увеличить счетчик прогресс-бара\n",
292 |     "    f.value += 1"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": 29,
298 |    "metadata": {
299 |     "collapsed": false,
300 |     "deletable": true,
301 |     "editable": true
302 |    },
303 |    "outputs": [],
304 |    "source": [
305 |     "words = list(word_counter.items())"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 30,
311 |    "metadata": {
312 |     "collapsed": false,
313 |     "deletable": true,
314 |     "editable": true
315 |    },
316 |    "outputs": [
317 |     {
318 |      "data": {
319 |       "text/plain": [
320 |        "602833"
321 |       ]
322 |      },
323 |      "execution_count": 30,
324 |      "metadata": {},
325 |      "output_type": "execute_result"
326 |     }
327 |    ],
328 |    "source": [
329 |     "len(word_counter)"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 68,
335 |    "metadata": {
336 |     "collapsed": false,
337 |     "deletable": true,
338 |     "editable": true
339 |    },
340 |    "outputs": [],
341 |    "source": [
342 |     "rare_words = set(map(lambda p: p[0], filter(lambda p: len(p[1]) <= 1, words)))"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "code",
347 |    "execution_count": 69,
348 |    "metadata": {
349 |     "collapsed": false,
350 |     "deletable": true,
351 |     "editable": true
352 |    },
353 |    "outputs": [
354 |     {
355 |      "name": "stdout",
356 |      "output_type": "stream",
357 |      "text": [
358 |       "384972\n",
359 |       "0.6386047213739129\n"
360 |      ]
361 |     }
362 |    ],
363 |    "source": [
364 |     "print(len(rare_words))\n",
365 |     "print(len(rare_words) / len(words))"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": 70,
371 |    "metadata": {
372 |     "collapsed": false,
373 |     "deletable": true,
374 |     "editable": true
375 |    },
376 |    "outputs": [
377 |     {
378 |      "data": {
379 |       "text/plain": [
380 |        "4415124"
381 |       ]
382 |      },
383 |      "execution_count": 70,
384 |      "metadata": {},
385 |      "output_type": "execute_result"
386 |     }
387 |    ],
388 |    "source": [
389 |     "open(\"../datasets/habrahabr/rarewords.txt\", \"w\").write(\"\\n\".join(rare_words))"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "markdown",
394 |    "metadata": {
395 |     "deletable": true,
396 |     "editable": true
397 |    },
398 |    "source": [
399 |     "---"
400 |    ]
401 |   }
402 |  ],
403 |  "metadata": {
404 |   "kernelspec": {
405 |    "display_name": "Python 3",
406 |    "language": "python",
407 |    "name": "python3"
408 |   },
409 |   "language_info": {
410 |    "codemirror_mode": {
411 |     "name": "ipython",
412 |     "version": 3
413 |    },
414 |    "file_extension": ".py",
415 |    "mimetype": "text/x-python",
416 |    "name": "python",
417 |    "nbconvert_exporter": "python",
418 |    "pygments_lexer": "ipython3",
419 |    "version": "3.5.3"
420 |   },
421 |   "latex_envs": {
422 |    "bibliofile": "biblio.bib",
423 |    "cite_by": "apalike",
424 |    "current_citInitial": 1,
425 |    "eqLabelWithNumbers": true,
426 |    "eqNumInitial": 0
427 |   }
428 |  },
429 |  "nbformat": 4,
430 |  "nbformat_minor": 0
431 | }
432 | 


--------------------------------------------------------------------------------
/experiments/Parsing elementy website.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 279,
  6 |    "metadata": {
  7 |     "collapsed": false,
  8 |     "deletable": true,
  9 |     "editable": true
 10 |    },
 11 |    "outputs": [],
 12 |    "source": [
 13 |     "import regex\n",
 14 |     "import collections\n",
 15 |     "from lxml import html\n",
 16 |     "from pymongo import MongoClient\n",
 17 |     "from urllib.request import urlopen\n",
 18 |     "from urllib.error import HTTPError\n",
 19 |     "from sklearn.pipeline import Pipeline\n",
 20 |     "from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {
 26 |     "deletable": true,
 27 |     "editable": true
 28 |    },
 29 |    "source": [
 30 |     "### Загрузка страниц с веб-сайта elementy.ru"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 220,
 36 |    "metadata": {
 37 |     "collapsed": true,
 38 |     "deletable": true,
 39 |     "editable": true
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "maybe = lambda f, x: f(x) if x else None"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 25,
 49 |    "metadata": {
 50 |     "collapsed": true,
 51 |     "deletable": true,
 52 |     "editable": true
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "def process_html(text):\n",
 57 |     "    return text.replace(\"\\xa0\", \" \")"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 61,
 63 |    "metadata": {
 64 |     "collapsed": true,
 65 |     "deletable": true,
 66 |     "editable": true
 67 |    },
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "def process_tag(text):\n",
 71 |     "    return regex.sub(\"\\s\", \"_\", process_html(text).strip()).lower()"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 261,
 77 |    "metadata": {
 78 |     "collapsed": false,
 79 |     "deletable": true,
 80 |     "editable": true
 81 |    },
 82 |    "outputs": [
 83 |     {
 84 |      "name": "stdout",
 85 |      "output_type": "stream",
 86 |      "text": [
 87 |       "Parsed 98/100 pages\n",
 88 |       "Parsed 198/200 pages\n",
 89 |       "Parsed 296/300 pages\n",
 90 |       "Parsed 392/400 pages\n",
 91 |       "Parsed 491/500 pages\n",
 92 |       "Parsed 589/600 pages\n",
 93 |       "Parsed 686/700 pages\n",
 94 |       "Parsed 784/800 pages\n",
 95 |       "Parsed 881/900 pages\n",
 96 |       "Parsed 981/1000 pages\n",
 97 |       "Parsed 1079/1100 pages\n",
 98 |       "Parsed 1176/1200 pages\n",
 99 |       "Parsed 1273/1300 pages\n",
100 |       "Parsed 1370/1400 pages\n",
101 |       "Parsed 1466/1500 pages\n",
102 |       "Parsed 1566/1600 pages\n",
103 |       "Parsed 1658/1700 pages\n",
104 |       "Parsed 1752/1800 pages\n",
105 |       "Parsed 1848/1900 pages\n",
106 |       "Parsed 1946/2000 pages\n",
107 |       "Parsed 2038/2100 pages\n",
108 |       "Parsed 2129/2200 pages\n",
109 |       "Parsed 2223/2300 pages\n",
110 |       "CPU times: user 45.4 s, sys: 3.45 s, total: 48.9 s\n",
111 |       "Wall time: 28min 17s\n"
112 |      ]
113 |     }
114 |    ],
115 |    "source": [
116 |     "%%time\n",
117 |     "\n",
118 |     "# Парсинг всех страниц\n",
119 |     "pages_ids = list(range(431231, 433629))\n",
120 |     "pages = []\n",
121 |     "\n",
122 |     "for i, page_id in enumerate(pages_ids, 1):\n",
123 |     "    try:\n",
124 |     "        page_url = \"http://elementy.ru/nauchno-populyarnaya_biblioteka/%d/\" % page_id\n",
125 |     "        page = html.parse(urlopen(page_url))\n",
126 |     "\n",
127 |     "        title = process_html(page.findtext(\"//h1\"))\n",
128 |     "        tags = list(map(lambda p: process_tag(p.text),\n",
129 |     "                        page.findall(\"//div[@class='mb itemhead newslist']/div/a\")[1:-1]))\n",
130 |     "        article = page.find(\"//div[@class='itemblock']/div[@class='memo']\")\n",
131 |     "\n",
132 |     "        summary = maybe(process_html, article.findtext(\"./p[@class='Intro']\"))\n",
133 |     "        text = []\n",
134 |     "        content_flag = False\n",
135 |     "        for elem in article.iterfind(\"p\"):\n",
136 |     "            if len(elem.classes) > 0:\n",
137 |     "                continue\n",
138 |     "            # TODO: filter wrong paragraphs\n",
139 |     "            # TODO: can also be non-paragraphs (h3, ol, etc)\n",
140 |     "            text.append(process_html(elem.text_content()))\n",
141 |     "        text = \"\\n\\n\".join(text)\n",
142 |     "        \n",
143 |     "        pages.append((page_id, title, tags, summary, text))\n",
144 |     "    except Exception:\n",
145 |     "        pass\n",
146 |     "    \n",
147 |     "    if i % 100 == 0:\n",
148 |     "        print(\"Parsed %d/%d pages\" % (len(pages), i))"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 267,
154 |    "metadata": {
155 |     "collapsed": false,
156 |     "deletable": true,
157 |     "editable": true
158 |    },
159 |    "outputs": [
160 |     {
161 |      "data": {
162 |       "text/plain": [
163 |        "2300"
164 |       ]
165 |      },
166 |      "execution_count": 267,
167 |      "metadata": {},
168 |      "output_type": "execute_result"
169 |     }
170 |    ],
171 |    "source": [
172 |     "len(pages)"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "markdown",
177 |    "metadata": {
178 |     "deletable": true,
179 |     "editable": true
180 |    },
181 |    "source": [
182 |     "### Парсинг"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 292,
188 |    "metadata": {
189 |     "collapsed": false,
190 |     "deletable": true,
191 |     "editable": true
192 |    },
193 |    "outputs": [],
194 |    "source": [
195 |     "stop_words = open(\"../datasets/elementy/stopwords.txt\").read().split()\n",
196 |     "rare_words = open(\"../datasets/elementy/rarewords.txt\").read().split()\n",
197 |     "stop_lemmas = set(stop_words).union(set(rare_words))\n",
198 |     "doc_pipeline = Pipeline([\n",
199 |     "    (\"text-processor\",     DefaultTextProcessor(token_pattern=\"(?u)\\\\b\\\\p{L}+\\\\b\")),\n",
200 |     "    (\"document-processor\", DefaultDocumentProcessor(stop_lemmas=stop_lemmas)),\n",
201 |     "])"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": 293,
207 |    "metadata": {
208 |     "collapsed": false
209 |    },
210 |    "outputs": [
211 |     {
212 |      "name": "stdout",
213 |      "output_type": "stream",
214 |      "text": [
215 |       "CPU times: user 1.48 s, sys: 19 ms, total: 1.5 s\n",
216 |       "Wall time: 1.52 s\n"
217 |      ]
218 |     }
219 |    ],
220 |    "source": [
221 |     "%%time\n",
222 |     "\n",
223 |     "# TODO: вынести разнесение токенов по двум модальностям (MOD и MOD_habr) в отдельный модуль\n",
224 |     "\n",
225 |     "pn_vocab = {\"text\": set(), \"flat_tag\": set()}\n",
226 |     "\n",
227 |     "for doc in open(\"../datasets/postnauka/postnauka.txt\"):\n",
228 |     "    tokens = doc.split()\n",
229 |     "    for token in tokens[1:]:\n",
230 |     "        if token.startswith(\"|\"):\n",
231 |     "            cur_mod = token[1:]\n",
232 |     "        else:\n",
233 |     "            if cur_mod == \"text\" or cur_mod == \"flat_tag\":\n",
234 |     "                pn_vocab[cur_mod].add(token)"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 294,
240 |    "metadata": {
241 |     "collapsed": false
242 |    },
243 |    "outputs": [
244 |     {
245 |      "data": {
246 |       "text/plain": [
247 |        "44995"
248 |       ]
249 |      },
250 |      "execution_count": 294,
251 |      "metadata": {},
252 |      "output_type": "execute_result"
253 |     }
254 |    ],
255 |    "source": [
256 |     "len(pn_vocab[\"text\"]) + len(pn_vocab[\"flat_tag\"])"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": 295,
262 |    "metadata": {
263 |     "collapsed": true
264 |    },
265 |    "outputs": [],
266 |    "source": [
267 |     "client = MongoClient()\n",
268 |     "out_collection = client[\"datasets\"][\"elementy\"]"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 296,
274 |    "metadata": {
275 |     "collapsed": false
276 |    },
277 |    "outputs": [
278 |     {
279 |      "name": "stdout",
280 |      "output_type": "stream",
281 |      "text": [
282 |       "Written 500 pages\n",
283 |       "Written 1000 pages\n",
284 |       "Written 1500 pages\n",
285 |       "Written 2000 pages\n",
286 |       "CPU times: user 38.1 s, sys: 612 ms, total: 38.7 s\n",
287 |       "Wall time: 2min 54s\n"
288 |      ]
289 |     }
290 |    ],
291 |    "source": [
292 |     "%%time\n",
293 |     "\n",
294 |     "with open(\"../datasets/elementy/elementy.txt\", \"w\") as vw_file:\n",
295 |     "    for i, page in enumerate(pages, 1):\n",
296 |     "        page_id, title, tags, summary, text = page\n",
297 |     "        doc = {}\n",
298 |     "        doc[\"_id\"] = \"elem_%d\" % page_id\n",
299 |     "        doc[\"title\"] = title\n",
300 |     "        doc[\"url\"] = \"http://elementy.ru/nauchno-populyarnaya_biblioteka/%d/\" % page_id\n",
301 |     "        doc[\"modalities\"] = {\"text_elem\": [], \"text\": [], \"flat_tag_elem\": [], \"flat_tag\": []}\n",
302 |     "        modalities = doc_pipeline.fit_transform(text)\n",
303 |     "        for token in modalities[\"text\"]:\n",
304 |     "            if token in pn_vocab[\"text\"]:\n",
305 |     "                doc[\"modalities\"][\"text\"].append(token)\n",
306 |     "            else:\n",
307 |     "                doc[\"modalities\"][\"text\"].append(token)\n",
308 |     "                doc[\"modalities\"][\"text_elem\"].append(token)\n",
309 |     "        for token in tags:\n",
310 |     "            if token in pn_vocab[\"flat_tag\"]:\n",
311 |     "                doc[\"modalities\"][\"flat_tag\"].append(token)\n",
312 |     "            else:\n",
313 |     "                doc[\"modalities\"][\"flat_tag\"].append(token)\n",
314 |     "                doc[\"modalities\"][\"flat_tag_elem\"].append(token)\n",
315 |     "        doc[\"summary\"] = summary\n",
316 |     "        doc[\"markdown\"] = text\n",
317 |     "        # Фильтрация коротких документов из Элементов\n",
318 |     "        if len(doc[\"modalities\"][\"text\"]) > 100:\n",
319 |     "            # Записать в Vowpal Wabbit\n",
320 |     "            modalities_str = \" \".join(map(lambda p: \"|%s %s\" % (p[0],\n",
321 |     "                             \" \".join(map(lambda t: \"_\".join(t.split()), p[1]))), doc[\"modalities\"].items()))\n",
322 |     "            vw_file.write(\"%s %s\\n\" % (doc[\"_id\"], modalities_str))\n",
323 |     "            # Записать в MongoDB\n",
324 |     "            out_collection.insert_one(doc)\n",
325 |     "        if i % 500 == 0:\n",
326 |     "            print(\"Written %d pages\" % i)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "markdown",
331 |    "metadata": {
332 |     "deletable": true,
333 |     "editable": true
334 |    },
335 |    "source": [
336 |     "### Фильтрация слов с низким DF"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": 285,
342 |    "metadata": {
343 |     "collapsed": false,
344 |     "deletable": true,
345 |     "editable": true
346 |    },
347 |    "outputs": [
348 |     {
349 |      "name": "stdout",
350 |      "output_type": "stream",
351 |      "text": [
352 |       "Processed 500 pages\n",
353 |       "Processed 1000 pages\n",
354 |       "Processed 1500 pages\n",
355 |       "Processed 2000 pages\n",
356 |       "CPU times: user 32.1 s, sys: 214 ms, total: 32.4 s\n",
357 |       "Wall time: 2min 50s\n"
358 |      ]
359 |     }
360 |    ],
361 |    "source": [
362 |     "%%time\n",
363 |     "\n",
364 |     "word_counter = collections.defaultdict(set)\n",
365 |     "\n",
366 |     "for i, page in enumerate(pages, 1):\n",
367 |     "    page_id, _, _, _, text = page\n",
368 |     "    modalities = doc_pipeline.fit_transform(text)\n",
369 |     "    for word in modalities[\"text\"]:\n",
370 |     "        word_counter[word].add(page_id)\n",
371 |     "    if i % 500 == 0:\n",
372 |     "        print(\"Processed %d pages\" % i)"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": 286,
378 |    "metadata": {
379 |     "collapsed": true
380 |    },
381 |    "outputs": [],
382 |    "source": [
383 |     "words = list(word_counter.items())"
384 |    ]
385 |   },
386 |   {
387 |    "cell_type": "code",
388 |    "execution_count": 287,
389 |    "metadata": {
390 |     "collapsed": false
391 |    },
392 |    "outputs": [
393 |     {
394 |      "data": {
395 |       "text/plain": [
396 |        "79946"
397 |       ]
398 |      },
399 |      "execution_count": 287,
400 |      "metadata": {},
401 |      "output_type": "execute_result"
402 |     }
403 |    ],
404 |    "source": [
405 |     "len(word_counter)"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": 288,
411 |    "metadata": {
412 |     "collapsed": true
413 |    },
414 |    "outputs": [],
415 |    "source": [
416 |     "rare_words = set(map(lambda p: p[0], filter(lambda p: len(p[1]) <= 1, words)))"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": 289,
422 |    "metadata": {
423 |     "collapsed": false
424 |    },
425 |    "outputs": [
426 |     {
427 |      "name": "stdout",
428 |      "output_type": "stream",
429 |      "text": [
430 |       "39494\n",
431 |       "0.49400845570760266\n"
432 |      ]
433 |     }
434 |    ],
435 |    "source": [
436 |     "print(len(rare_words))\n",
437 |     "print(len(rare_words) / len(words))"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": 291,
443 |    "metadata": {
444 |     "collapsed": false
445 |    },
446 |    "outputs": [
447 |     {
448 |      "data": {
449 |       "text/plain": [
450 |        "388252"
451 |       ]
452 |      },
453 |      "execution_count": 291,
454 |      "metadata": {},
455 |      "output_type": "execute_result"
456 |     }
457 |    ],
458 |    "source": [
459 |     "open(\"../datasets/elementy/rarewords.txt\", \"w\").write(\"\\n\".join(rare_words))"
460 |    ]
461 |   },
462 |   {
463 |    "cell_type": "markdown",
464 |    "metadata": {
465 |     "deletable": true,
466 |     "editable": true
467 |    },
468 |    "source": [
469 |     "---"
470 |    ]
471 |   }
472 |  ],
473 |  "metadata": {
474 |   "kernelspec": {
475 |    "display_name": "Python 3",
476 |    "language": "python",
477 |    "name": "python3"
478 |   },
479 |   "language_info": {
480 |    "codemirror_mode": {
481 |     "name": "ipython",
482 |     "version": 3
483 |    },
484 |    "file_extension": ".py",
485 |    "mimetype": "text/x-python",
486 |    "name": "python",
487 |    "nbconvert_exporter": "python",
488 |    "pygments_lexer": "ipython3",
489 |    "version": "3.5.3"
490 |   },
491 |   "latex_envs": {
492 |    "bibliofile": "biblio.bib",
493 |    "cite_by": "apalike",
494 |    "current_citInitial": 1,
495 |    "eqLabelWithNumbers": true,
496 |    "eqNumInitial": 0
497 |   }
498 |  },
499 |  "nbformat": 4,
500 |  "nbformat_minor": 2
501 | }
502 | 


--------------------------------------------------------------------------------
/experiments/Spectrum experiments.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "%reload_ext autoreload\n",
 12 |     "%autoreload 2"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 11,
 18 |    "metadata": {
 19 |     "collapsed": false
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from parsers import arbitrary, text_utils\n",
 24 |     "import artm\n",
 25 |     "import hierarchy_utils\n",
 26 |     "import pickle\n",
 27 |     "from spectrum import arrange_topics\n",
 28 |     "#import arranging.api as api\n",
 29 |     "from crossmin import CrossMinimizer\n",
 30 |     "import numpy as np"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 2,
 36 |    "metadata": {
 37 |     "collapsed": true
 38 |    },
 39 |    "outputs": [
 40 |     {
 41 |      "name": "stderr",
 42 |      "output_type": "stream",
 43 |      "text": [
 44 |       "/Users/aksholokhov/.anaconda3/lib/python3.6/site-packages/bigartm-0.8.3-py3.6.egg/artm/master_component.py:604: DeprecationWarning: invalid escape sequence \\*\n",
 45 |       "/Users/aksholokhov/.anaconda3/lib/python3.6/site-packages/bigartm-0.8.3-py3.6.egg/artm/master_component.py:714: DeprecationWarning: invalid escape sequence \\d\n",
 46 |       "/Users/aksholokhov/.anaconda3/lib/python3.6/site-packages/bigartm-0.8.3-py3.6.egg/artm/master_component.py:783: DeprecationWarning: 'async' and 'await' will become reserved keywords in Python 3.7\n"
 47 |      ]
 48 |     },
 49 |     {
 50 |      "ename": "NameError",
 51 |      "evalue": "name 'pickle' is not defined",
 52 |      "output_type": "error",
 53 |      "traceback": [
 54 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 55 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
 56 |       "\u001b[0;32m<ipython-input-2-62000a1b1939>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     20\u001b[0m \u001b[0martm_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mMODEL_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0martm_extra_info\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mMODEL_PATH\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/extra_info.dump\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
 57 |       "\u001b[0;31mNameError\u001b[0m: name 'pickle' is not defined"
 58 |      ]
 59 |     }
 60 |    ],
 61 |    "source": [
 62 |     "\n",
 63 |     "\n",
 64 |     "T = lambda lid, tid: \"level_%d_%s\" % (lid, tid)\n",
 65 |     "\n",
 66 |     "def from_artm_tid(artm_tid):\n",
 67 |     "    # This is due to hARTM bug\n",
 68 |     "    if artm_tid.startswith(\"level_0_\"):\n",
 69 |     "        return (0, artm_tid[8:])\n",
 70 |     "    else:\n",
 71 |     "        lid, tid = artm_tid[5:].split(\"_\", 1)\n",
 72 |     "        lid = int(lid)\n",
 73 |     "        return (lid, tid)\n",
 74 |     "\n",
 75 |     "MODEL_PATH = \"hartm/\"\n",
 76 |     "\n",
 77 |     "artm_model = hierarchy_utils.hARTM(theta_columns_naming=\"title\",\n",
 78 |     "                                   cache_theta=True)\n",
 79 |     "artm_model.load(MODEL_PATH)\n"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 8,
 85 |    "metadata": {
 86 |     "collapsed": true
 87 |    },
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "artm_extra_info = pickle.load(open(MODEL_PATH + \"/extra_info.dump\", \"rb\"))"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 9,
 96 |    "metadata": {
 97 |     "collapsed": true
 98 |    },
 99 |    "outputs": [],
100 |    "source": [
101 |     "# Extract Phi, Psi and Theta matrices\n",
102 |     "phis = []\n",
103 |     "psis = []\n",
104 |     "#theta = artm_extra_info[\"theta\"]\n",
105 |     "# theta = pickle.load(open(THETA_MODEL_PATH, \"rb\"))[\"theta\"]\n",
106 |     "for level_idx, artm_level in enumerate(artm_model._levels):\n",
107 |     "    phis.append(artm_level.get_phi(class_ids=\"flat_tag\"))\n",
108 |     "    if level_idx > 0:\n",
109 |     "        psis.append(artm_level.get_psi())\n",
110 |     "        \n",
111 |     "phi0_topic_titles = list(filter(lambda x: x.startswith(\"topic\"), phis[0].columns))\n",
112 |     "phi1_topic_titles = list(filter(lambda x: x.startswith(\"topic\"), phis[1].columns))"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 12,
118 |    "metadata": {
119 |     "collapsed": false
120 |    },
121 |    "outputs": [],
122 |    "source": [
123 |     "new_phi1_topic_order = np.array(phi1_topic_titles)[arrange_topics(phis[1][phi1_topic_titles].values)]\n",
124 |     "phis[1] = phis[1][new_phi1_topic_order]\n",
125 |     "psis[0] = psis[0].loc[new_phi1_topic_order]"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 13,
131 |    "metadata": {
132 |     "collapsed": true
133 |    },
134 |    "outputs": [],
135 |    "source": [
136 |     "children_threshold = 0.05\n",
137 |     "\n",
138 |     "D = np.zeros((len(phi0_topic_titles), len(phi1_topic_titles)))\n",
139 |     "\n",
140 |     "for parent_id, parent in enumerate(phi0_topic_titles):\n",
141 |     "    for child_id, maybe_child in enumerate(phi1_topic_titles):\n",
142 |     "        if psis[0].loc[maybe_child, parent] > children_threshold:\n",
143 |     "            D[parent_id, child_id] = 1"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 14,
149 |    "metadata": {
150 |     "collapsed": false
151 |    },
152 |    "outputs": [
153 |     {
154 |      "name": "stderr",
155 |      "output_type": "stream",
156 |      "text": [
157 |       "/Users/aksholokhov/.anaconda3/lib/python3.6/site-packages/pulp/solvers.py:71: DeprecationWarning: The SafeConfigParser class has been renamed to ConfigParser in Python 3.2. This alias will be removed in future versions. Use ConfigParser directly instead.\n",
158 |       "  'os':operating_system, 'arch':arch})\n"
159 |      ]
160 |     }
161 |    ],
162 |    "source": [
163 |     "cm = CrossMinimizer(D)\n",
164 |     "idx = cm.solve(mode=\"auto\", model=None)\n",
165 |     "new_phi0_topic_order = np.array(phi1_topic_titles)[idx]"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 15,
171 |    "metadata": {
172 |     "collapsed": false
173 |    },
174 |    "outputs": [
175 |     {
176 |      "data": {
177 |       "text/plain": [
178 |        "array(['topic_3', 'topic_1', 'topic_7', 'topic_9', 'topic_10', 'topic_12',\n",
179 |        "       'topic_2', 'topic_4', 'topic_13', 'topic_11', 'topic_17',\n",
180 |        "       'topic_16', 'topic_6', 'topic_14', 'topic_18', 'topic_5', 'topic_0',\n",
181 |        "       'topic_15', 'topic_8'], \n",
182 |        "      dtype='<U8')"
183 |       ]
184 |      },
185 |      "execution_count": 15,
186 |      "metadata": {},
187 |      "output_type": "execute_result"
188 |     }
189 |    ],
190 |    "source": [
191 |     "new_phi0_topic_order"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 16,
197 |    "metadata": {
198 |     "collapsed": false
199 |    },
200 |    "outputs": [
201 |     {
202 |      "data": {
203 |       "text/plain": [
204 |        "array(['topic_58', 'topic_6', 'topic_19', 'topic_36', 'topic_3',\n",
205 |        "       'topic_59', 'topic_30', 'topic_38', 'topic_5', 'topic_49',\n",
206 |        "       'topic_0', 'topic_21', 'topic_50', 'topic_55', 'topic_16',\n",
207 |        "       'topic_2', 'topic_46', 'topic_20', 'topic_56', 'topic_52',\n",
208 |        "       'topic_24', 'topic_53', 'topic_4', 'topic_22', 'topic_34',\n",
209 |        "       'topic_26', 'topic_33', 'topic_42', 'topic_51', 'topic_45',\n",
210 |        "       'topic_18', 'topic_28', 'topic_37', 'topic_40', 'topic_12',\n",
211 |        "       'topic_31', 'topic_7', 'topic_10', 'topic_14', 'topic_1',\n",
212 |        "       'topic_48', 'topic_17', 'topic_23', 'topic_39', 'topic_43',\n",
213 |        "       'topic_8', 'topic_9', 'topic_41', 'topic_44', 'topic_13',\n",
214 |        "       'topic_29', 'topic_15', 'topic_54', 'topic_35', 'topic_32',\n",
215 |        "       'topic_11', 'topic_27', 'topic_57', 'topic_25', 'topic_47'], \n",
216 |        "      dtype='<U8')"
217 |       ]
218 |      },
219 |      "execution_count": 16,
220 |      "metadata": {},
221 |      "output_type": "execute_result"
222 |     }
223 |    ],
224 |    "source": [
225 |     "new_phi1_topic_order"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 17,
231 |    "metadata": {
232 |     "collapsed": true
233 |    },
234 |    "outputs": [],
235 |    "source": [
236 |     "def get_topic_name(lid, tid):\n",
237 |     "    return \", \".join(phis[lid][tid].sort_values()[-3:][::-1].index)"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 18,
243 |    "metadata": {
244 |     "collapsed": false
245 |    },
246 |    "outputs": [
247 |     {
248 |      "name": "stdout",
249 |      "output_type": "stream",
250 |      "text": [
251 |       "topic_3 \tкультура, литература, культурология\n",
252 |       "topic_1 \tмедицина, эволюция, антропология\n",
253 |       "topic_7 \tсоциология, социология_повседневности, дюркгейм_эмиль\n",
254 |       "topic_9 \tпсихология, люди_науки, история_науки\n",
255 |       "topic_10 \tпалеонтология, биохимия, стволовые_клетки\n",
256 |       "topic_12 \tнаука, управление_проектами, работа\n",
257 |       "topic_2 \tинформационная_безопасность, копирайт, будущее\n",
258 |       "topic_4 \tистория, история_россии, ссср\n",
259 |       "topic_13 \tхимия, нанотехнологии, материаловедение\n",
260 |       "topic_11 \tфизика, физика_элементарных_частиц, квантовая_физика\n",
261 |       "topic_17 \tэкология, зоология, этология\n",
262 |       "topic_16 \tсредневековье, мифология, биоинформатика\n",
263 |       "topic_6 \tлингвистика, язык, право\n",
264 |       "topic_14 \tобразование, университет, школа\n",
265 |       "topic_18 \tмозг, нейробиология, искусственный_интеллект\n",
266 |       "topic_5 \tастрономия, астрофизика, вселенная\n",
267 |       "topic_0 \tфилософия, россия, география\n",
268 |       "topic_15 \tматематика, информационные_технологии, технологии\n",
269 |       "topic_8 \tобщество, экономика, россия\n"
270 |      ]
271 |     }
272 |    ],
273 |    "source": [
274 |     "print(\"\\n\".join(map(lambda t: t + \" \\t\" + get_topic_name(0, t), new_phi0_topic_order)))"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": 19,
280 |    "metadata": {
281 |     "collapsed": false
282 |    },
283 |    "outputs": [
284 |     {
285 |      "name": "stdout",
286 |      "output_type": "stream",
287 |      "text": [
288 |       "topic_0 \tфилософия, россия, география\n",
289 |       "topic_1 \tмедицина, эволюция, антропология\n",
290 |       "topic_2 \tинформационная_безопасность, копирайт, будущее\n",
291 |       "topic_3 \tкультура, литература, культурология\n",
292 |       "topic_4 \tистория, история_россии, ссср\n",
293 |       "topic_5 \tастрономия, астрофизика, вселенная\n",
294 |       "topic_6 \tлингвистика, язык, право\n",
295 |       "topic_7 \tсоциология, социология_повседневности, дюркгейм_эмиль\n",
296 |       "topic_8 \tобщество, экономика, россия\n",
297 |       "topic_9 \tпсихология, люди_науки, история_науки\n",
298 |       "topic_10 \tпалеонтология, биохимия, стволовые_клетки\n",
299 |       "topic_11 \tфизика, физика_элементарных_частиц, квантовая_физика\n",
300 |       "topic_12 \tнаука, управление_проектами, работа\n",
301 |       "topic_13 \tхимия, нанотехнологии, материаловедение\n",
302 |       "topic_14 \tобразование, университет, школа\n",
303 |       "topic_15 \tматематика, информационные_технологии, технологии\n",
304 |       "topic_16 \tсредневековье, мифология, биоинформатика\n",
305 |       "topic_17 \tэкология, зоология, этология\n",
306 |       "topic_18 \tмозг, нейробиология, искусственный_интеллект\n"
307 |      ]
308 |     }
309 |    ],
310 |    "source": [
311 |     "print(\"\\n\".join(map(lambda t: t + \" \\t\" + get_topic_name(0, t), phi0_topic_titles)))"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": 20,
317 |    "metadata": {
318 |     "collapsed": true
319 |    },
320 |    "outputs": [],
321 |    "source": [
322 |     "phi0_flat_spectrum = np.array(phi0_topic_titles)[arrange_topics(phis[0][phi0_topic_titles].values)]"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": 21,
328 |    "metadata": {
329 |     "collapsed": false
330 |    },
331 |    "outputs": [
332 |     {
333 |      "name": "stdout",
334 |      "output_type": "stream",
335 |      "text": [
336 |       "topic_13 \tхимия, нанотехнологии, материаловедение\n",
337 |       "topic_5 \tастрономия, астрофизика, вселенная\n",
338 |       "topic_11 \tфизика, физика_элементарных_частиц, квантовая_физика\n",
339 |       "topic_15 \tматематика, информационные_технологии, технологии\n",
340 |       "topic_10 \tпалеонтология, биохимия, стволовые_клетки\n",
341 |       "topic_2 \tинформационная_безопасность, копирайт, будущее\n",
342 |       "topic_16 \tсредневековье, мифология, биоинформатика\n",
343 |       "topic_4 \tистория, история_россии, ссср\n",
344 |       "topic_8 \tобщество, экономика, россия\n",
345 |       "topic_0 \tфилософия, россия, география\n",
346 |       "topic_3 \tкультура, литература, культурология\n",
347 |       "topic_6 \tлингвистика, язык, право\n",
348 |       "topic_7 \tсоциология, социология_повседневности, дюркгейм_эмиль\n",
349 |       "topic_12 \tнаука, управление_проектами, работа\n",
350 |       "topic_14 \tобразование, университет, школа\n",
351 |       "topic_9 \tпсихология, люди_науки, история_науки\n",
352 |       "topic_18 \tмозг, нейробиология, искусственный_интеллект\n",
353 |       "topic_17 \tэкология, зоология, этология\n",
354 |       "topic_1 \tмедицина, эволюция, антропология\n"
355 |      ]
356 |     }
357 |    ],
358 |    "source": [
359 |     "print(\"\\n\".join(map(lambda t: t + \" \\t\" + get_topic_name(0, t), phi0_flat_spectrum)))"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": 22,
365 |    "metadata": {
366 |     "collapsed": true
367 |    },
368 |    "outputs": [],
369 |    "source": [
370 |     "# Двухуровневый спектр\n",
371 |     "pickle.dump((new_phi0_topic_order, new_phi1_topic_order, D), open(\"spectrum.dump\", \"wb\"))"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": 23,
377 |    "metadata": {
378 |     "collapsed": true
379 |    },
380 |    "outputs": [],
381 |    "source": [
382 |     "# Одноуровневый спектр\n",
383 |     "level_0_topics_ids = list(map(lambda tid: \"level_0_%s\" % tid, phi0_flat_spectrum))\n",
384 |     "pickle.dump(level_0_topics_ids, open(\"flat_spectrum.dump\", \"wb\"))"
385 |    ]
386 |   }
387 |  ],
388 |  "metadata": {
389 |   "kernelspec": {
390 |    "display_name": "Python 3",
391 |    "language": "python",
392 |    "name": "python3"
393 |   },
394 |   "language_info": {
395 |    "codemirror_mode": {
396 |     "name": "ipython",
397 |     "version": 3
398 |    },
399 |    "file_extension": ".py",
400 |    "mimetype": "text/x-python",
401 |    "name": "python",
402 |    "nbconvert_exporter": "python",
403 |    "pygments_lexer": "ipython3",
404 |    "version": "3.6.0"
405 |   }
406 |  },
407 |  "nbformat": 4,
408 |  "nbformat_minor": 2
409 | }
410 | 


--------------------------------------------------------------------------------
/server/artm_lib.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import pickle
  4 | import regex
  5 | import os
  6 | import operator
  7 | 
  8 | import hierarchy_utils
  9 | from pymongo import MongoClient
 10 | from scipy.linalg import norm
 11 | from sklearn.metrics.pairwise import pairwise_distances
 12 | 
 13 | import artm
 14 | 
 15 | 
 16 | # List of all doc_id prefixes
 17 | # TODO: move to config file
 18 | prefix_to_col_map = {"pn": "postnauka", "habr": "habrahabr",
 19 |                      "elem": "elementy"}
 20 | 
 21 | 
 22 | def hellinger_dist(p, q):
 23 |     return norm(np.sqrt(p) - np.sqrt(q))
 24 | 
 25 | 
 26 | class ArtmModel:
 27 |     def __init__(self, model_path, topic_naming_n_words=3, psi_edge_threshold=0.05):
 28 |         # Load ARTM model and model extra info
 29 |         self._extra_info = pickle.load(open(os.path.join(model_path, "extra_info.dump"), "rb"))
 30 |         self._model = hierarchy_utils.hARTM(theta_columns_naming="title",
 31 |                                             cache_theta=True,
 32 |                                             class_ids=self._extra_info["class_ids"])
 33 |         self._model.load(model_path)
 34 | 
 35 |         # Extract Φ, Ψ, Ɵ matrices from model
 36 |         self._phis = []
 37 |         self._psis = []
 38 |         self._theta = self._extra_info["theta"]
 39 |         for level_idx, artm_level in enumerate(self._model):
 40 |             self._phis.append(artm_level.get_phi(class_ids=["flat_tag"]))
 41 |             if level_idx > 0:
 42 |                 self._psis.append(artm_level.get_psi())
 43 | 
 44 |         # Construct topic name mappings
 45 |         self._from_artm_tid_map = {}
 46 |         self._to_artm_tid_map = {}
 47 |         self._from_lid_tid_map = {}
 48 |         self._to_lid_tid_map = {}
 49 |         theta_new_index = []
 50 |         for artm_tid in self._theta.index:
 51 |             if artm_tid.startswith("level_0_"):
 52 |                 lid, tid = 0, artm_tid[8:]
 53 |             else:
 54 |                 lid, tid = artm_tid[5:].split("_", 1)
 55 |                 lid = int(lid)
 56 |             topic_id = "level_%d_%s" % (lid, tid) # internal project-consistent topic name
 57 |             if tid.startswith("topic_"):
 58 |                 self._from_artm_tid_map[artm_tid] = topic_id
 59 |                 self._to_artm_tid_map[topic_id] = artm_tid
 60 |                 self._from_lid_tid_map[lid, tid] = topic_id
 61 |                 self._to_lid_tid_map[topic_id] = (lid, tid)
 62 |             theta_new_index.append(topic_id)
 63 |         self._theta.index = theta_new_index
 64 | 
 65 |         # Construct spectrums map
 66 |         spectrum_map = {}
 67 |         for spectrum in self._extra_info["spectrums"]:
 68 |             for i, topic_id in enumerate(spectrum):
 69 |                 spectrum_map[topic_id] = i
 70 | 
 71 |         # Construct topics infos
 72 |         # TODO: make topic maning an external procedure
 73 |         self._topics = {}
 74 |         for lid, phi in enumerate(self._phis):
 75 |             names = phi.index[phi.values.argsort(axis=0)[::-1].T]
 76 |             for tid, top_words in zip(phi.columns, names):
 77 |                 # subject topic names are "topic_X", where X = 0, 1, ...
 78 |                 # background topic names are "background_X", where X = 0, 1, ...
 79 |                 if regex.match("^topic_\d+$", tid):
 80 |                     topic_id = self._from_lid_tid_map[lid, tid]
 81 |                     self._topics[topic_id] = {
 82 |                         "level_id":    lid,
 83 |                         "top_words":   list(top_words),
 84 |                         "_unnamed":    True,
 85 |                         "parents":     [],
 86 |                         "children":    [],
 87 |                         "weight":      0,
 88 |                         "spectrum_id": spectrum_map.get(topic_id)
 89 |                     }
 90 | 
 91 |         # Define parent-child relationship for topics
 92 |         for lid, psi in enumerate(self._psis):
 93 |             psi = (psi >= psi_edge_threshold)
 94 |             for tid1 in psi.columns:
 95 |                 if regex.match("^topic_\d+$", tid1):
 96 |                     for tid2 in psi.index:
 97 |                         if regex.match("^topic_\d+$", tid2) and psi.loc[tid2, tid1]:
 98 |                             topic_id_parent = self._from_lid_tid_map[lid, tid1]
 99 |                             topic_id_child = self._from_lid_tid_map[lid + 1, tid2]
100 |                             self._topics[topic_id_parent]["children"].append(topic_id_child)
101 |                             self._topics[topic_id_child]["parents"].append(topic_id_parent)
102 | 
103 |         # Assign top words to child topics
104 |         # TODO: make topic maning an external procedure
105 |         for topic_id, topic in self._topics.items():
106 |             parents_ids_set = set(topic["parents"])
107 |             sibling_topics_ids = [tid for tid, t in self._topics.items()
108 |                                   if parents_ids_set & set(t["parents"]) and "_unnamed" not in t]
109 |             used_top_words = sum(map(lambda tid: self._topics[tid]["top_words"][:topic_naming_n_words],
110 |                                      topic["parents"] + sibling_topics_ids), [])
111 |             topic["top_words"] = list(filter(lambda tw: tw not in used_top_words,
112 |                                              topic["top_words"]))[:topic_naming_n_words]
113 |             del topic["_unnamed"]
114 | 
115 |         # Define parent-child relationship for topics and documents
116 |         last_lid = self.num_levels - 1
117 |         doc_topics = self.get_topics_ids_by_level(last_lid)
118 |         self._doc_theta = self._theta.loc[doc_topics]
119 |         self._doc_thresholds = self._doc_theta.max(axis=0) / np.sqrt(2)
120 | 
121 |         # Define topic weight as:
122 |         # For last-level topic, weight = number of documents that belong to it
123 |         # For a higher-level topic, weight = sum(weight) of topic's child topics
124 |         docs_count = self._doc_theta.apply(lambda s: sum(s >= self._doc_thresholds), axis=1)
125 |         lids_tids = list(self._from_lid_tid_map.keys())
126 |         lids_tids = sorted(lids_tids, reverse=True)
127 |         for lid, tid in lids_tids:
128 |             topic_id = self._from_lid_tid_map[lid, tid]
129 |             if lid == last_lid:
130 |                 w = int(docs_count[topic_id])
131 |             else:
132 |                 w = 0
133 |                 for child_topic_id in self._topics[topic_id]["children"]:
134 |                     w += self._topics[child_topic_id]["weight"]
135 |             self._topics[topic_id]["weight"] = w
136 | 
137 |     def get_topics_ids_by_level(self, level_id):
138 |         if level_id < 0 or level_id >= self.num_levels:
139 |             raise ValueError("Unknown level_id: %d" % level_id)
140 | 
141 |         topics_ids = []
142 |         for (lid, tid), topic_id in self._from_lid_tid_map.items():
143 |             if lid == level_id:
144 |                 topics_ids.append(topic_id)
145 |         return topics_ids
146 | 
147 |     def get_docs_ids_by_topic(self, topic_id):
148 |         if topic_id not in self._doc_theta.index:
149 |             raise ValueError("Unknown document topic id: '%s'" % topic_id)
150 | 
151 |         ptd = self._doc_theta.loc[topic_id]
152 |         sorted_ptd = ptd[ptd >= self._doc_thresholds].sort_values(ascending=False)
153 |         return sorted_ptd
154 | 
155 |     def get_topics_by_docs_ids(self, docs_ids):
156 |         theta = self._theta
157 |         doc_theta = self._doc_theta
158 |         thresholds = self._doc_thresholds
159 |         topics = self._topics
160 |         tid_lid = self._to_lid_tid_map
161 | 
162 |         lowest_level_counter = pd.Series(np.zeros(len(doc_theta.index)),
163 |                                          index=doc_theta.index)
164 | 
165 |         # docs = dict(zip(theta.index, [set()]*len(theta.index)))
166 |         docs = {key: set() for key in theta.index}
167 | 
168 |         for doc in docs_ids:
169 |             if doc["doc_id"] not in thresholds.index:
170 |                 continue
171 |             topics_for_doc = {}
172 |             comparison = (doc_theta[doc["doc_id"]] > thresholds[doc["doc_id"]])
173 |             lowest_level_counter += np.int32(comparison)
174 |             for topic, doc_in_topic in zip(doc_theta.index, comparison):
175 |                 if doc_in_topic:
176 |                     docs[topic].add(doc["doc_id"])
177 | 
178 |         levels_count = tid_lid[lowest_level_counter.index[0]][0] + 1
179 | 
180 |         answer = pd.Series(np.zeros(len(theta.index)), index=theta.index)
181 |         answer[lowest_level_counter.index] = lowest_level_counter
182 | 
183 |         is_level_topic = lambda lid: lambda x: x.startswith("level_%d_t" % lid)
184 |         for lid in range(0, levels_count-1)[::-1]:
185 |             curr_level_topics = list(filter(is_level_topic(lid), answer.index))
186 |             for topic in curr_level_topics:
187 |                 for child in topics[topic]["children"]:
188 |                     answer[topic] += answer[child]
189 |                     docs[topic] |= docs[child]
190 | 
191 |         for lid in range(0, levels_count)[::-1]:
192 |             curr_level_topics = list(filter(is_level_topic(lid), answer.index))
193 |             total_docs_in_this_level = sum(answer[curr_level_topics])
194 |             if total_docs_in_this_level != 0:
195 |                 answer[curr_level_topics] /= total_docs_in_this_level
196 | 
197 |         return dict(zip(docs.keys(), [list(v) for k, v in docs.items()])), dict(answer)
198 | 
199 | 
200 |     def transform_one(self, vw_path, batch_path):
201 |         transform_batch = artm.BatchVectorizer(data_format="vowpal_wabbit",
202 |                                                data_path=vw_path,
203 |                                                batch_size=1,
204 |                                                target_folder=batch_path)
205 |         transform_theta = self._model.transform(transform_batch)
206 |         response = {}
207 |         for artm_tid, pdt in transform_theta["upload"].items():
208 |             if artm_tid in self._from_artm_tid_map:
209 |                 topic_id = self._from_artm_tid_map[artm_tid]
210 |                 response[topic_id] = float(pdt)
211 |         return response
212 | 
213 | 
214 |     @property
215 |     def theta(self):
216 |         return self._theta
217 | 
218 |     @property
219 |     def topics(self):
220 |         return self._topics
221 | 
222 |     @property
223 |     def num_levels(self):
224 |         return self._model.num_levels
225 | 
226 |     @property
227 |     def topics_ids(self):
228 |         return self._theta.index
229 | 
230 |     def get_phi(self, level_id):
231 |         return self._phis[level_id]
232 | 
233 |     def get_psi(self, level_id):
234 |         return self._psis[level_id]
235 | 
236 |     def to_topic_id(self, lid, tid):
237 |         return self._from_lid_tid_map[lid, tid]
238 | 
239 |     def from_topic_id(self, topic_id):
240 |         return self._to_lid_tid_map[topic_id]
241 | 
242 | 
243 | class ArtmDataSource:
244 |     def __init__(self):
245 |         self._db = MongoClient()
246 | 
247 |     def get_documents_by_ids(self, docs_ids, with_texts=True, with_modalities=False):
248 |         fields = {"title": 1, "authors_names" : 1}
249 |         if with_texts:
250 |             fields["markdown"] = 1
251 |         if with_modalities:
252 |             fields["modalities"] = 1
253 |         queries = {}
254 |         for doc_id in docs_ids:
255 |             prefix = doc_id.split("_", 1)[0]
256 |             col_name = prefix_to_col_map[prefix]
257 |             if col_name not in queries:
258 |                 queries[col_name] = []
259 |             queries[col_name].append(doc_id)
260 |         result = []
261 |         for col_name, col_docs_ids in queries.items():
262 |             dataset = self._db["datasets"][col_name]
263 |             result += dataset.find({"_id": {"$in": col_docs_ids}}, fields)
264 |         result_map = dict(map(lambda v: (v["_id"], v), result))
265 |         response = []
266 |         for doc_id in docs_ids:
267 |             if doc_id not in result_map:
268 |                 continue
269 |             doc = result_map[doc_id]
270 |             res = {
271 |                 "doc_id":        doc["_id"],
272 |                 "title":         doc["title"],
273 |                 "authors_names": doc.get("authors_names", [])
274 |             }
275 |             if with_texts:
276 |                 res["markdown"] = doc["markdown"]
277 |             if with_modalities:
278 |                 res["modalities"] = doc["modalities"]
279 |             response.append(res)
280 |         return response
281 | 
282 |     def search_query_in_models_docs(self, query, limit=10):
283 |         col_results = self._db.model.all_docs.find(
284 |                             {"$text": {"$search": query}},
285 |                             {"score": {"$meta": "textScore"}}).sort(
286 |                             [("score", {"$meta": "textScore"})]).limit(limit)
287 |         results = []
288 |         for row in col_results:
289 |             results.append({
290 |                     "doc_id": row["_id"],
291 |                     "score":  row["score"],
292 |                     })
293 | 
294 |         return sorted(results, key=lambda x: x["score"])
295 | 
296 | class ArtmBridge:
297 |     def __init__(self, model_path):
298 |         self._data_source = ArtmDataSource()
299 |         self._model = ArtmModel(model_path)
300 | 
301 |         # Select topics which will be used for recommendation
302 |         self._rec_lid = 0
303 |         rec_topics = self._model.get_topics_ids_by_level(self._rec_lid)
304 |         self._rec_tids = list(map(lambda t: self._model.from_topic_id(t)[1], rec_topics))
305 |         self._rec_theta = self._model.theta.T[rec_topics].sort_index()
306 | 
307 |     def get_documents_by_topic(self, topic_id, offset=0, limit=None, with_weights=True):
308 |         sorted_ptd = self._model.get_docs_ids_by_topic(topic_id)
309 |         if limit is None:
310 |             limit = len(sorted_ptd)
311 | 
312 |         sorted_ptd = sorted_ptd[offset:offset + limit]
313 |         docs_ids = sorted_ptd.index
314 |         docs = self._data_source.get_documents_by_ids(docs_ids, with_texts=False)
315 |         weights = {k: float(v) for k, v in sorted_ptd.items()}
316 |         if with_weights:
317 |             return docs, weights
318 |         else:
319 |             return docs
320 | 
321 |     def recommend_tags_by_doc(self, doc, rec_tags_count=5):
322 |         own_tags = set(doc["modalities"]["flat_tag"])
323 |         ptd = self._rec_theta.loc[doc["doc_id"]]
324 |         weighted_tags = self._model.get_phi(self._rec_lid)[self._rec_tids].mul(ptd.values)
325 |         rec_tags = {}
326 |         for _, pwt in weighted_tags.iteritems():
327 |             top_tags = pwt.nlargest(len(own_tags) + rec_tags_count)
328 |             for tag, w in top_tags.iteritems():
329 |                 tag = regex.sub("_", " ", tag)
330 |                 if tag not in own_tags:
331 |                     rec_tags[tag] = max(rec_tags.get(tag, 0), w)
332 |         rec_tags = list(map(lambda p: (p[1], p[0]), rec_tags.items()))
333 |         rec_tags.sort(reverse=True)
334 |         rec_tags = list(map(lambda x: x[1], rec_tags[:rec_tags_count]))
335 |         return rec_tags
336 | 
337 |     def recommend_docs_by_doc(self, doc_id, rec_docs_count=5, metric=hellinger_dist):
338 |         doc = self._rec_theta.loc[doc_id]
339 |         dist = pairwise_distances([doc], self._rec_theta, hellinger_dist)[0]
340 |         dist_series = pd.Series(data=dist, index=self._rec_theta.index)
341 |         sim_docs_ids = dist_series.nsmallest(rec_docs_count + 1).index
342 |         return sim_docs_ids[1:] # Not counting the `doc` itself.
343 | 
344 |     def search_documents(self, query, limit=10):
345 |         search_results = self._data_source.search_query_in_models_docs(query, limit)
346 |         return self._model.get_topics_by_docs_ids(search_results)
347 | 
348 |     @property
349 |     def data_source(self):
350 |         return self._data_source
351 | 
352 |     @property
353 |     def model(self):
354 |         return self._model


--------------------------------------------------------------------------------
/experiments/Topical Similarity Measurements for ARTM RecSys.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Анализ разных метрик тематической близости документов"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "collapsed": false
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import pickle\n",
 19 |     "import numpy as np\n",
 20 |     "import pandas as pd"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "from scipy.linalg import norm\n",
 32 |     "from scipy.stats import entropy\n",
 33 |     "from pymongo import MongoClient\n",
 34 |     "from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 3,
 40 |    "metadata": {
 41 |     "collapsed": true
 42 |    },
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "client = MongoClient()\n",
 46 |     "collection = client[\"datasets\"][\"postnauka\"]"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 4,
 52 |    "metadata": {
 53 |     "collapsed": false
 54 |    },
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "def t(doc_id):\n",
 58 |     "    return collection.find_one(\"pn_%d\" % doc_id)[\"title\"]"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 5,
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "artm_model = pickle.load(open(\"../server/hartm.mdl\", \"rb\"))"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": 6,
 75 |    "metadata": {
 76 |     "collapsed": false
 77 |    },
 78 |    "outputs": [
 79 |     {
 80 |      "data": {
 81 |       "text/plain": [
 82 |        "(110, 3446)"
 83 |       ]
 84 |      },
 85 |      "execution_count": 6,
 86 |      "metadata": {},
 87 |      "output_type": "execute_result"
 88 |     }
 89 |    ],
 90 |    "source": [
 91 |     "artm_model[\"theta\"].shape"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 7,
 97 |    "metadata": {
 98 |     "collapsed": true
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "theta_lvl0 = artm_model[\"theta\"][:10].T"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 8,
108 |    "metadata": {
109 |     "collapsed": true
110 |    },
111 |    "outputs": [],
112 |    "source": [
113 |     "theta_lvl1 = artm_model[\"theta\"][10:-70].T"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 9,
119 |    "metadata": {
120 |     "collapsed": false
121 |    },
122 |    "outputs": [],
123 |    "source": [
124 |     "theta_lvl2 = artm_model[\"theta\"][-70:].T"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "## Близость тематических профилей (косинусная мера)"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 10,
137 |    "metadata": {
138 |     "collapsed": true
139 |    },
140 |    "outputs": [],
141 |    "source": [
142 |     "doc_id = 3123"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": 11,
148 |    "metadata": {
149 |     "collapsed": true
150 |    },
151 |    "outputs": [],
152 |    "source": [
153 |     "# Первый уровень"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 12,
159 |    "metadata": {
160 |     "collapsed": false,
161 |     "scrolled": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "name": "stdout",
166 |      "output_type": "stream",
167 |      "text": [
168 |       "> Математические методы прогнозирования объемов продаж (doc_id=3123)\n",
169 |       "\n",
170 |       "Top-5 similar (p(t|d) cosine similarity) documents:\n",
171 |       "1. Главы | Закономерности простых чисел. Гипотеза Римана (doc_id=2109, p=0.9985)\n",
172 |       "2. По шагам | Теория принятия решений (doc_id=3423, p=0.9976)\n",
173 |       "3. 5 книг о поведенческой экономике (doc_id=3344, p=0.9965)\n",
174 |       "4. Психология создания трудностей и проблем (doc_id=2988, p=0.9949)\n",
175 |       "5. Что такое «робот»? (doc_id=2296, p=0.9943)\n",
176 |       "6. Курс «Теория принятия решений: математические модели выбора» (doc_id=3181, p=0.9939)\n",
177 |       "7. Задачи и проблемы в мышлении (doc_id=1665, p=0.9936)\n",
178 |       "8. Эмоциональные вычисления (doc_id=2295, p=0.9933)\n",
179 |       "9. Марвин Мински и эмоциональные машины (doc_id=3069, p=0.9928)\n",
180 |       "10. Дэвид Вернон: «То, что мы называем искусственным интеллектом, им не является» (doc_id=2256, p=0.9927)\n"
181 |      ]
182 |     }
183 |    ],
184 |    "source": [
185 |     "theta_lvl = theta_lvl0.sort_index()\n",
186 |     "print(\"> %s (doc_id=%d)\\n\" % (t(doc_id), doc_id))\n",
187 |     "sim_matrix = cosine_similarity([theta_lvl.loc[doc_id]], theta_lvl)\n",
188 |     "print(\"Top-5 similar (p(t|d) cosine similarity) documents:\")\n",
189 |     "for rid, (prob, sim_doc_id) in enumerate(zip(np.sort(sim_matrix)[0, -11:-1][::-1], np.argsort(sim_matrix)[0, -11:-1][::-1])):\n",
190 |     "    print(\"%d. %s (doc_id=%d, p=%.4f)\" % (rid + 1, t(sim_doc_id + 1), sim_doc_id + 1, prob))"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "## Близость тематических профилей (KL-дивергенция)"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": 13,
203 |    "metadata": {
204 |     "collapsed": true
205 |    },
206 |    "outputs": [],
207 |    "source": [
208 |     "doc_id = 3123"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": 14,
214 |    "metadata": {
215 |     "collapsed": true
216 |    },
217 |    "outputs": [],
218 |    "source": [
219 |     "def sym_kl(p, q):\n",
220 |     "    return entropy(p, q) + entropy(q, p)"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 15,
226 |    "metadata": {
227 |     "collapsed": true
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "# Первый уровень"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 16,
237 |    "metadata": {
238 |     "collapsed": false
239 |    },
240 |    "outputs": [
241 |     {
242 |      "name": "stdout",
243 |      "output_type": "stream",
244 |      "text": [
245 |       "> Математические методы прогнозирования объемов продаж (doc_id=3123)\n",
246 |       "\n",
247 |       "Top-10 similar (p(t|d) symmetric KL-divergence) documents:\n",
248 |       "1. Главы | Закономерности простых чисел. Гипотеза Римана (doc_id=2109, p=0.0967)\n",
249 |       "2. Главы | Методы измерения данных (doc_id=1051, p=0.1277)\n",
250 |       "3. Марвин Мински и эмоциональные машины (doc_id=3069, p=0.1452)\n",
251 |       "4. Построение сложных вероятностных моделей (doc_id=2892, p=0.1529)\n",
252 |       "5. Эмоциональные вычисления (doc_id=2295, p=0.1542)\n",
253 |       "6. Психология создания трудностей и проблем (doc_id=2988, p=0.1583)\n",
254 |       "7. FAQ: Компьютерные доказательства (doc_id=1121, p=0.1693)\n",
255 |       "8. 5 книг о поведенческой экономике (doc_id=3344, p=0.2286)\n",
256 |       "9. По шагам | Теория принятия решений (doc_id=3423, p=0.2335)\n",
257 |       "10. Исследования мышления в когнитивной психологии (doc_id=2947, p=0.2378)\n",
258 |       "CPU times: user 64 ms, sys: 28 ms, total: 92 ms\n",
259 |       "Wall time: 290 ms\n"
260 |      ]
261 |     }
262 |    ],
263 |    "source": [
264 |     "%%time\n",
265 |     "\n",
266 |     "theta_lvl = theta_lvl0.sort_index()\n",
267 |     "print(\"> %s (doc_id=%d)\\n\" % (t(doc_id), doc_id))\n",
268 |     "sim_matrix = pairwise_distances([theta_lvl.loc[doc_id]], theta_lvl, sym_kl, n_jobs=-1)\n",
269 |     "print(\"Top-10 similar (p(t|d) symmetric KL-divergence) documents:\")\n",
270 |     "for rid, (prob, sim_doc_id) in enumerate(zip(np.sort(sim_matrix)[0, 1:11], np.argsort(sim_matrix)[0, 1:11])):\n",
271 |     "    print(\"%d. %s (doc_id=%d, p=%.4f)\" % (rid + 1, t(sim_doc_id + 1), sim_doc_id + 1, prob))"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "markdown",
276 |    "metadata": {},
277 |    "source": [
278 |     "## Близость тематических профилей (расстояние Хеллингера)"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 17,
284 |    "metadata": {
285 |     "collapsed": true
286 |    },
287 |    "outputs": [],
288 |    "source": [
289 |     "doc_id = 3123"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 18,
295 |    "metadata": {
296 |     "collapsed": true
297 |    },
298 |    "outputs": [],
299 |    "source": [
300 |     "def hellinger_dist(p, q):\n",
301 |     "    return norm(np.sqrt(p) - np.sqrt(q))"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": 19,
307 |    "metadata": {
308 |     "collapsed": true
309 |    },
310 |    "outputs": [],
311 |    "source": [
312 |     "# Первый уровень"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": 20,
318 |    "metadata": {
319 |     "collapsed": false
320 |    },
321 |    "outputs": [
322 |     {
323 |      "name": "stdout",
324 |      "output_type": "stream",
325 |      "text": [
326 |       "> Математические методы прогнозирования объемов продаж (doc_id=3123)\n",
327 |       "\n",
328 |       "Top-10 similar (p(t|d) hellinger distance) documents:\n",
329 |       "1. Главы | Закономерности простых чисел. Гипотеза Римана (doc_id=2109, p=0.1408)\n",
330 |       "2. Главы | Методы измерения данных (doc_id=1051, p=0.1643)\n",
331 |       "3. Психология создания трудностей и проблем (doc_id=2988, p=0.1798)\n",
332 |       "4. Построение сложных вероятностных моделей (doc_id=2892, p=0.1827)\n",
333 |       "5. Марвин Мински и эмоциональные машины (doc_id=3069, p=0.1861)\n",
334 |       "6. Эмоциональные вычисления (doc_id=2295, p=0.1918)\n",
335 |       "7. FAQ: Компьютерные доказательства (doc_id=1121, p=0.1927)\n",
336 |       "8. По шагам | Теория принятия решений (doc_id=3423, p=0.2098)\n",
337 |       "9. 5 книг о поведенческой экономике (doc_id=3344, p=0.2142)\n",
338 |       "10. «Разработка операционной системы рыночного уровня должна укладываться в 1 млрд долларов» (doc_id=1734, p=0.2190)\n"
339 |      ]
340 |     }
341 |    ],
342 |    "source": [
343 |     "theta_lvl = theta_lvl0.sort_index()\n",
344 |     "print(\"> %s (doc_id=%d)\\n\" % (t(doc_id), doc_id))\n",
345 |     "sim_matrix = pairwise_distances([theta_lvl.loc[doc_id]], theta_lvl, hellinger_dist)\n",
346 |     "print(\"Top-10 similar (p(t|d) hellinger distance) documents:\")\n",
347 |     "for rid, (prob, sim_doc_id) in enumerate(zip(np.sort(sim_matrix)[0, 1:11], np.argsort(sim_matrix)[0, 1:11])):\n",
348 |     "    print(\"%d. %s (doc_id=%d, p=%.4f)\" % (rid + 1, t(sim_doc_id + 1), sim_doc_id + 1, prob))"
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "markdown",
353 |    "metadata": {},
354 |    "source": [
355 |     "## Анализ тематических профилей"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": 21,
361 |    "metadata": {
362 |     "collapsed": false
363 |    },
364 |    "outputs": [],
365 |    "source": [
366 |     "theta_lvl = theta_lvl1.sort_index()"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 22,
372 |    "metadata": {
373 |     "collapsed": false
374 |    },
375 |    "outputs": [
376 |     {
377 |      "data": {
378 |       "text/plain": [
379 |        "level1_topic_0     0.006610\n",
380 |        "level1_topic_1     0.000028\n",
381 |        "level1_topic_2     0.000386\n",
382 |        "level1_topic_3     0.000023\n",
383 |        "level1_topic_4     0.009620\n",
384 |        "level1_topic_5     0.001744\n",
385 |        "level1_topic_6     0.026533\n",
386 |        "level1_topic_7     0.000150\n",
387 |        "level1_topic_8     0.048912\n",
388 |        "level1_topic_9     0.000553\n",
389 |        "level1_topic_10    0.000025\n",
390 |        "level1_topic_11    0.000033\n",
391 |        "level1_topic_12    0.001444\n",
392 |        "level1_topic_13    0.000027\n",
393 |        "level1_topic_14    0.000031\n",
394 |        "level1_topic_15    0.033075\n",
395 |        "level1_topic_16    0.280329\n",
396 |        "level1_topic_17    0.051039\n",
397 |        "level1_topic_18    0.302440\n",
398 |        "level1_topic_19    0.153228\n",
399 |        "level1_topic_20    0.000022\n",
400 |        "level1_topic_21    0.000023\n",
401 |        "level1_topic_22    0.027129\n",
402 |        "level1_topic_23    0.027075\n",
403 |        "level1_topic_24    0.006719\n",
404 |        "level1_topic_25    0.000036\n",
405 |        "level1_topic_26    0.009875\n",
406 |        "level1_topic_27    0.000219\n",
407 |        "level1_topic_28    0.004038\n",
408 |        "level1_topic_29    0.008635\n",
409 |        "Name: 3123, dtype: float32"
410 |       ]
411 |      },
412 |      "execution_count": 22,
413 |      "metadata": {},
414 |      "output_type": "execute_result"
415 |     }
416 |    ],
417 |    "source": [
418 |     "theta_lvl.loc[3123]"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": 23,
424 |    "metadata": {
425 |     "collapsed": false
426 |    },
427 |    "outputs": [
428 |     {
429 |      "data": {
430 |       "text/plain": [
431 |        "level1_topic_0     0.009424\n",
432 |        "level1_topic_1     0.000032\n",
433 |        "level1_topic_2     0.001288\n",
434 |        "level1_topic_3     0.000033\n",
435 |        "level1_topic_4     0.017803\n",
436 |        "level1_topic_5     0.002538\n",
437 |        "level1_topic_6     0.008913\n",
438 |        "level1_topic_7     0.000083\n",
439 |        "level1_topic_8     0.053283\n",
440 |        "level1_topic_9     0.001030\n",
441 |        "level1_topic_10    0.003718\n",
442 |        "level1_topic_11    0.000185\n",
443 |        "level1_topic_12    0.000032\n",
444 |        "level1_topic_13    0.000021\n",
445 |        "level1_topic_14    0.001256\n",
446 |        "level1_topic_15    0.027246\n",
447 |        "level1_topic_16    0.258919\n",
448 |        "level1_topic_17    0.028847\n",
449 |        "level1_topic_18    0.353488\n",
450 |        "level1_topic_19    0.153007\n",
451 |        "level1_topic_20    0.000026\n",
452 |        "level1_topic_21    0.000025\n",
453 |        "level1_topic_22    0.005661\n",
454 |        "level1_topic_23    0.032550\n",
455 |        "level1_topic_24    0.013211\n",
456 |        "level1_topic_25    0.000029\n",
457 |        "level1_topic_26    0.012926\n",
458 |        "level1_topic_27    0.002171\n",
459 |        "level1_topic_28    0.006999\n",
460 |        "level1_topic_29    0.005257\n",
461 |        "Name: 2257, dtype: float32"
462 |       ]
463 |      },
464 |      "execution_count": 23,
465 |      "metadata": {},
466 |      "output_type": "execute_result"
467 |     }
468 |    ],
469 |    "source": [
470 |     "theta_lvl.loc[2257]"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "code",
475 |    "execution_count": 24,
476 |    "metadata": {
477 |     "collapsed": false
478 |    },
479 |    "outputs": [
480 |     {
481 |      "data": {
482 |       "text/plain": [
483 |        "level1_topic_0     0.010725\n",
484 |        "level1_topic_1     0.000212\n",
485 |        "level1_topic_2     0.000017\n",
486 |        "level1_topic_3     0.000021\n",
487 |        "level1_topic_4     0.019576\n",
488 |        "level1_topic_5     0.007783\n",
489 |        "level1_topic_6     0.001676\n",
490 |        "level1_topic_7     0.002375\n",
491 |        "level1_topic_8     0.013219\n",
492 |        "level1_topic_9     0.000790\n",
493 |        "level1_topic_10    0.000021\n",
494 |        "level1_topic_11    0.000479\n",
495 |        "level1_topic_12    0.001415\n",
496 |        "level1_topic_13    0.000022\n",
497 |        "level1_topic_14    0.000086\n",
498 |        "level1_topic_15    0.124689\n",
499 |        "level1_topic_16    0.253682\n",
500 |        "level1_topic_17    0.022450\n",
501 |        "level1_topic_18    0.295885\n",
502 |        "level1_topic_19    0.126896\n",
503 |        "level1_topic_20    0.000022\n",
504 |        "level1_topic_21    0.000712\n",
505 |        "level1_topic_22    0.000017\n",
506 |        "level1_topic_23    0.039969\n",
507 |        "level1_topic_24    0.023700\n",
508 |        "level1_topic_25    0.001273\n",
509 |        "level1_topic_26    0.008254\n",
510 |        "level1_topic_27    0.005643\n",
511 |        "level1_topic_28    0.017148\n",
512 |        "level1_topic_29    0.021242\n",
513 |        "Name: 1734, dtype: float32"
514 |       ]
515 |      },
516 |      "execution_count": 24,
517 |      "metadata": {},
518 |      "output_type": "execute_result"
519 |     }
520 |    ],
521 |    "source": [
522 |     "theta_lvl.loc[1734]"
523 |    ]
524 |   },
525 |   {
526 |    "cell_type": "markdown",
527 |    "metadata": {},
528 |    "source": [
529 |     "---"
530 |    ]
531 |   }
532 |  ],
533 |  "metadata": {
534 |   "kernelspec": {
535 |    "display_name": "Python 3",
536 |    "language": "python",
537 |    "name": "python3"
538 |   },
539 |   "language_info": {
540 |    "codemirror_mode": {
541 |     "name": "ipython",
542 |     "version": 3
543 |    },
544 |    "file_extension": ".py",
545 |    "mimetype": "text/x-python",
546 |    "name": "python",
547 |    "nbconvert_exporter": "python",
548 |    "pygments_lexer": "ipython3",
549 |    "version": "3.5.2"
550 |   },
551 |   "latex_envs": {
552 |    "bibliofile": "biblio.bib",
553 |    "cite_by": "apalike",
554 |    "current_citInitial": 1,
555 |    "eqLabelWithNumbers": true,
556 |    "eqNumInitial": 0
557 |   }
558 |  },
559 |  "nbformat": 4,
560 |  "nbformat_minor": 0
561 | }
562 | 


--------------------------------------------------------------------------------
/experiments/Parsing ruwiki dataset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 6,
  6 |    "metadata": {
  7 |     "collapsed": true,
  8 |     "deletable": true,
  9 |     "editable": true
 10 |    },
 11 |    "outputs": [],
 12 |    "source": [
 13 |     "import sys\n",
 14 |     "import csv\n",
 15 |     "import unicodedata\n",
 16 |     "import numpy as np\n",
 17 |     "import pandas as pd"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 7,
 23 |    "metadata": {
 24 |     "collapsed": true,
 25 |     "deletable": true,
 26 |     "editable": true
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "from pymystem3 import Mystem\n",
 31 |     "from collections import Counter\n",
 32 |     "from multiprocessing import Pool\n",
 33 |     "from IPython.display import display\n",
 34 |     "from ipywidgets import FloatProgress\n",
 35 |     "from sklearn.pipeline import Pipeline\n",
 36 |     "from parsers.text_utils import DefaultTextProcessor, Lemmatizer"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 9,
 42 |    "metadata": {
 43 |     "collapsed": false,
 44 |     "deletable": true,
 45 |     "editable": true
 46 |    },
 47 |    "outputs": [
 48 |     {
 49 |      "data": {
 50 |       "text/plain": [
 51 |        "9223372036854775807"
 52 |       ]
 53 |      },
 54 |      "execution_count": 9,
 55 |      "metadata": {},
 56 |      "output_type": "execute_result"
 57 |     }
 58 |    ],
 59 |    "source": [
 60 |     "csv.field_size_limit(sys.maxsize)"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {
 66 |     "deletable": true,
 67 |     "editable": true
 68 |    },
 69 |    "source": [
 70 |     "Разобьём процесс на две части — токенизацию документов (без фильтрации) и, собственно, лемматизацию."
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 29,
 76 |    "metadata": {
 77 |     "collapsed": false,
 78 |     "deletable": true,
 79 |     "editable": true
 80 |    },
 81 |    "outputs": [
 82 |     {
 83 |      "name": "stdout",
 84 |      "output_type": "stream",
 85 |      "text": [
 86 |       "CPU times: user 40min 14s, sys: 29.9 s, total: 40min 44s\n",
 87 |       "Wall time: 40min 44s\n"
 88 |      ]
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "%%time\n",
 93 |     "\n",
 94 |     "# Проделаем токенизацию с сохранением промежуточного\n",
 95 |     "# состояния в ruwiki.tonekized.csv.tmp\n",
 96 |     "\n",
 97 |     "tokenizer = DefaultTextProcessor()\n",
 98 |     "\n",
 99 |     "# 1361758 — предподсчитанное кол-во документов\n",
100 |     "f = FloatProgress(min=0, max=1361758)\n",
101 |     "display(f)\n",
102 |     "\n",
103 |     "unused_char = '\\U00037b84'\n",
104 |     "def strip_accents(s):\n",
105 |     "    s = s.replace(\"й\", unused_char)\n",
106 |     "    return \"\".join((c for c in unicodedata.normalize(\"NFD\", s) if unicodedata.category(c) != \"Mn\")).replace(unused_char, \"й\")\n",
107 |     "\n",
108 |     "def remove_underscores(s):\n",
109 |     "    return s.replace(\"_\", \"\")\n",
110 |     "\n",
111 |     "with open(\"../datasets/ruwiki/ruwiki.plain.csv\", \"r\") as infile:\n",
112 |     "    with open(\"ruwiki.tonekized.csv.tmp\", \"w\") as outfile:\n",
113 |     "        reader = csv.reader(infile)\n",
114 |     "        writer = csv.writer(outfile)\n",
115 |     "        count = 0\n",
116 |     "        cached_rows = []\n",
117 |     "        for title, text in reader:\n",
118 |     "            text = strip_accents(text)\n",
119 |     "            text = remove_underscores(text)\n",
120 |     "            tokens = tokenizer.fit_transform(text)\n",
121 |     "            cached_rows.append((title, \" \".join(tokens)))\n",
122 |     "            count += 1\n",
123 |     "            if count % 1000 == 0:\n",
124 |     "                writer.writerows(cached_rows)\n",
125 |     "                outfile.flush()\n",
126 |     "                f.value += len(cached_rows)\n",
127 |     "                cached_rows = []\n",
128 |     "        # Запишем оставшиеся строчки\n",
129 |     "        writer.writerows(cached_rows)\n",
130 |     "        f.value += len(cached_rows)"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 30,
136 |    "metadata": {
137 |     "collapsed": false,
138 |     "deletable": true,
139 |     "editable": true
140 |    },
141 |    "outputs": [
142 |     {
143 |      "name": "stdout",
144 |      "output_type": "stream",
145 |      "text": [
146 |       "CPU times: user 5min 17s, sys: 2min 17s, total: 7min 35s\n",
147 |       "Wall time: 3h 13min 15s\n"
148 |      ]
149 |     }
150 |    ],
151 |    "source": [
152 |     "%%time\n",
153 |     "\n",
154 |     "# Теперь сделаем лемматизацию всех документов при помощи pymystem3\n",
155 |     "# Распараллеливая процесс на N_PROCS процессоров\n",
156 |     "\n",
157 |     "N_PROCS = 4\n",
158 |     "\n",
159 |     "# 1361758 — предподсчитанное кол-во документов\n",
160 |     "f = FloatProgress(min=0, max=1361758)\n",
161 |     "display(f)\n",
162 |     "\n",
163 |     "m = Mystem()\n",
164 |     "\n",
165 |     "def lemmatize(text):\n",
166 |     "    return \"\".join(m.lemmatize(text)).strip()\n",
167 |     "\n",
168 |     "with open(\"ruwiki.tonekized.csv.tmp\", \"r\") as infile:\n",
169 |     "    with open(\"../datasets/ruwiki/ruwiki.lemmatized.csv\", \"w\") as outfile:\n",
170 |     "        reader = csv.reader(infile)\n",
171 |     "        writer = csv.writer(outfile)\n",
172 |     "        count = 0\n",
173 |     "        cached_titles = []\n",
174 |     "        cached_texts = []\n",
175 |     "        for title, text in reader:\n",
176 |     "            cached_titles.append(title)\n",
177 |     "            cached_texts.append(text)\n",
178 |     "            count += 1\n",
179 |     "            if count % 1000 == 0:\n",
180 |     "                with Pool(N_PROCS) as p:\n",
181 |     "                    lemmatized_texts = p.map(lemmatize, cached_texts)\n",
182 |     "                writer.writerows(zip(cached_titles, lemmatized_texts))\n",
183 |     "                outfile.flush()\n",
184 |     "                f.value += len(cached_titles)\n",
185 |     "                cached_texts = []\n",
186 |     "                cached_titles = []\n",
187 |     "        # Запишем оставшиеся строчки\n",
188 |     "        with Pool(N_PROCS) as p:\n",
189 |     "            lemmatized_texts = p.map(lemmatize, cached_texts)\n",
190 |     "        writer.writerows(zip(cached_titles, lemmatized_texts))\n",
191 |     "        f.value += len(cached_titles)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "markdown",
196 |    "metadata": {
197 |     "deletable": true,
198 |     "editable": true
199 |    },
200 |    "source": [
201 |     "Посчитаем размер словаря, из которого состоит неотфильтрованная лемматизированная выборка."
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": 31,
207 |    "metadata": {
208 |     "collapsed": false,
209 |     "deletable": true,
210 |     "editable": true
211 |    },
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "CPU times: user 31min 53s, sys: 2min 5s, total: 33min 59s\n",
218 |       "Wall time: 31min 53s\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "%%time\n",
224 |     "\n",
225 |     "dictionary = Counter()\n",
226 |     "\n",
227 |     "# 1361758 — предподсчитанное кол-во документов\n",
228 |     "f = FloatProgress(min=0, max=1361758)\n",
229 |     "display(f)\n",
230 |     "\n",
231 |     "with open(\"../datasets/ruwiki/ruwiki.lemmatized.csv\", \"r\") as infile:\n",
232 |     "    reader = csv.reader(infile)\n",
233 |     "    for title, text in reader:\n",
234 |     "        tokens = text.split()\n",
235 |     "        dictionary.update(tokens)\n",
236 |     "        f.value += 1"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": 32,
242 |    "metadata": {
243 |     "collapsed": false,
244 |     "deletable": true,
245 |     "editable": true
246 |    },
247 |    "outputs": [],
248 |    "source": [
249 |     "dict_series = pd.DataFrame.from_dict(dictionary, orient=\"index\")[0]"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": 33,
255 |    "metadata": {
256 |     "collapsed": false,
257 |     "deletable": true,
258 |     "editable": true
259 |    },
260 |    "outputs": [
261 |     {
262 |      "data": {
263 |       "text/plain": [
264 |        "3924272"
265 |       ]
266 |      },
267 |      "execution_count": 33,
268 |      "metadata": {},
269 |      "output_type": "execute_result"
270 |     }
271 |    ],
272 |    "source": [
273 |     "len(dict_series)"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": 34,
279 |    "metadata": {
280 |     "collapsed": false,
281 |     "deletable": true,
282 |     "editable": true
283 |    },
284 |    "outputs": [
285 |     {
286 |      "data": {
287 |       "text/plain": [
288 |        "533185"
289 |       ]
290 |      },
291 |      "execution_count": 34,
292 |      "metadata": {},
293 |      "output_type": "execute_result"
294 |     }
295 |    ],
296 |    "source": [
297 |     "(dict_series > 10).value_counts()[True]"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "markdown",
302 |    "metadata": {
303 |     "deletable": true,
304 |     "editable": true
305 |    },
306 |    "source": [
307 |     "Будем считать, что лемма должна встретиться более 10 раз в коллекции, чтобы мы положили её в словарь. Это сократит размер словаря в 8 раз от первоначального объёма."
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "markdown",
312 |    "metadata": {
313 |     "deletable": true,
314 |     "editable": true
315 |    },
316 |    "source": [
317 |     "Посмотрим на топ-50 слов:"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": 35,
323 |    "metadata": {
324 |     "collapsed": false,
325 |     "deletable": true,
326 |     "editable": true
327 |    },
328 |    "outputs": [
329 |     {
330 |      "data": {
331 |       "text/plain": [
332 |        "Index(['в', 'и', 'год', 'на', 'с', 'быть', 'по', 'из', 'он', 'который', 'а',\n",
333 |        "       'к', 'не', 'что', 'от', 'для', 'за', '1', 'как', 'этот', 'свой', '2',\n",
334 |        "       'также', 'до', 'первый', 'время', 'о', 'его', 'после', 'они', '3',\n",
335 |        "       'район', 'один', 'то', 'становиться', 'при', 'г', 'город', '5',\n",
336 |        "       'примечание', 'ссылка', 'человек', 'м', 'тот', 'область', 'во', 'это',\n",
337 |        "       'она', 'весь', 'но'],\n",
338 |        "      dtype='object')"
339 |       ]
340 |      },
341 |      "execution_count": 35,
342 |      "metadata": {},
343 |      "output_type": "execute_result"
344 |     }
345 |    ],
346 |    "source": [
347 |     "top50_words = dict_series.sort_values(ascending=False)[:50].index\n",
348 |     "top50_words"
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "markdown",
353 |    "metadata": {
354 |     "deletable": true,
355 |     "editable": true
356 |    },
357 |    "source": [
358 |     "Слов, которые могли бы иметь выраженную тематику, здесь почти нет, зато довольно много мусорных и общих слов. Будем выбрасывать слова, входящие в этот список, из документов."
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "markdown",
363 |    "metadata": {
364 |     "deletable": true,
365 |     "editable": true
366 |    },
367 |    "source": [
368 |     "Также будем фильтровать слова по стоп-словарю."
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": 36,
374 |    "metadata": {
375 |     "collapsed": true,
376 |     "deletable": true,
377 |     "editable": true
378 |    },
379 |    "outputs": [],
380 |    "source": [
381 |     "stop_words = set(map(str.strip, open(\"../datasets/ruwiki/stopwords.txt\").readlines()))"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": 37,
387 |    "metadata": {
388 |     "collapsed": false,
389 |     "deletable": true,
390 |     "editable": true
391 |    },
392 |    "outputs": [
393 |     {
394 |      "name": "stdout",
395 |      "output_type": "stream",
396 |      "text": [
397 |       "CPU times: user 43min 12s, sys: 2min 23s, total: 45min 35s\n",
398 |       "Wall time: 43min 6s\n"
399 |      ]
400 |     }
401 |    ],
402 |    "source": [
403 |     "%%time\n",
404 |     "\n",
405 |     "common_words = dict_series[dict_series > 10].index\n",
406 |     "\n",
407 |     "# 1361758 — предподсчитанное кол-во документов\n",
408 |     "f = FloatProgress(min=0, max=1361758)\n",
409 |     "display(f)\n",
410 |     "\n",
411 |     "def accept_word(w):\n",
412 |     "    return w not in stop_words and w not in top50_words and w in common_words\n",
413 |     "\n",
414 |     "with open(\"../datasets/ruwiki/ruwiki.lemmatized.csv\", \"r\") as infile:\n",
415 |     "    with open(\"../datasets/ruwiki/ruwiki.filtered.csv\", \"w\") as outfile:\n",
416 |     "        reader = csv.reader(infile)\n",
417 |     "        writer = csv.writer(outfile)\n",
418 |     "        for title, text in reader:\n",
419 |     "            tokens = text.split()\n",
420 |     "            writer.writerow((title, \" \".join(filter(accept_word, tokens))))\n",
421 |     "            f.value += 1"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "markdown",
426 |    "metadata": {
427 |     "deletable": true,
428 |     "editable": true
429 |    },
430 |    "source": [
431 |     "Наконец, превратим коллекцию с отфильтрованным словарём в файл UCI Bag-of-words."
432 |    ]
433 |   },
434 |   {
435 |    "cell_type": "markdown",
436 |    "metadata": {
437 |     "deletable": true,
438 |     "editable": true
439 |    },
440 |    "source": [
441 |     "Для начала построим словарь по отфильтрованной коллекции."
442 |    ]
443 |   },
444 |   {
445 |    "cell_type": "code",
446 |    "execution_count": 11,
447 |    "metadata": {
448 |     "collapsed": false,
449 |     "deletable": true,
450 |     "editable": true
451 |    },
452 |    "outputs": [
453 |     {
454 |      "name": "stdout",
455 |      "output_type": "stream",
456 |      "text": [
457 |       "CPU times: user 28min 54s, sys: 2min 7s, total: 31min 2s\n",
458 |       "Wall time: 28min 59s\n"
459 |      ]
460 |     }
461 |    ],
462 |    "source": [
463 |     "%%time\n",
464 |     "\n",
465 |     "dictionary = set()\n",
466 |     "bow_length = 0\n",
467 |     "\n",
468 |     "# 1361758 — предподсчитанное кол-во документов\n",
469 |     "f = FloatProgress(min=0, max=1361758)\n",
470 |     "display(f)\n",
471 |     "\n",
472 |     "with open(\"../datasets/ruwiki/ruwiki.filtered.csv\", \"r\") as infile:\n",
473 |     "    reader = csv.reader(infile)\n",
474 |     "    for title, text in reader:\n",
475 |     "        tokens = set(text.split())\n",
476 |     "        dictionary.update(tokens)\n",
477 |     "        bow_length += len(tokens)\n",
478 |     "        f.value += 1"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": 12,
484 |    "metadata": {
485 |     "collapsed": false,
486 |     "deletable": true,
487 |     "editable": true
488 |    },
489 |    "outputs": [
490 |     {
491 |      "data": {
492 |       "text/plain": [
493 |        "532345"
494 |       ]
495 |      },
496 |      "execution_count": 12,
497 |      "metadata": {},
498 |      "output_type": "execute_result"
499 |     }
500 |    ],
501 |    "source": [
502 |     "len(dictionary)"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": 13,
508 |    "metadata": {
509 |     "collapsed": false,
510 |     "deletable": true,
511 |     "editable": true
512 |    },
513 |    "outputs": [
514 |     {
515 |      "data": {
516 |       "text/plain": [
517 |        "185333372"
518 |       ]
519 |      },
520 |      "execution_count": 13,
521 |      "metadata": {},
522 |      "output_type": "execute_result"
523 |     }
524 |    ],
525 |    "source": [
526 |     "bow_length"
527 |    ]
528 |   },
529 |   {
530 |    "cell_type": "markdown",
531 |    "metadata": {
532 |     "deletable": true,
533 |     "editable": true
534 |    },
535 |    "source": [
536 |     "Запишем словарь в файл и переконвертируем документы."
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "code",
541 |    "execution_count": 14,
542 |    "metadata": {
543 |     "collapsed": true,
544 |     "deletable": true,
545 |     "editable": true
546 |    },
547 |    "outputs": [],
548 |    "source": [
549 |     "dict_mapping = dict(zip(dictionary, range(len(dictionary))))\n",
550 |     "dict_ordering = sorted(zip(range(len(dictionary)), dictionary))"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "code",
555 |    "execution_count": 15,
556 |    "metadata": {
557 |     "collapsed": false,
558 |     "deletable": true,
559 |     "editable": true
560 |    },
561 |    "outputs": [
562 |     {
563 |      "name": "stdout",
564 |      "output_type": "stream",
565 |      "text": [
566 |       "CPU times: user 379 ms, sys: 10 ms, total: 389 ms\n",
567 |       "Wall time: 388 ms\n"
568 |      ]
569 |     }
570 |    ],
571 |    "source": [
572 |     "%%time\n",
573 |     "\n",
574 |     "with open(\"../datasets/ruwiki/vocab.ruwiki.csv\", \"w\") as dictfile:\n",
575 |     "    for _, word in dict_ordering:\n",
576 |     "        dictfile.write(\"%s text\\n\" % word)"
577 |    ]
578 |   },
579 |   {
580 |    "cell_type": "code",
581 |    "execution_count": 16,
582 |    "metadata": {
583 |     "collapsed": false,
584 |     "deletable": true,
585 |     "editable": true
586 |    },
587 |    "outputs": [
588 |     {
589 |      "name": "stdout",
590 |      "output_type": "stream",
591 |      "text": [
592 |       "CPU times: user 41min 14s, sys: 2min 41s, total: 43min 56s\n",
593 |       "Wall time: 41min 27s\n"
594 |      ]
595 |     }
596 |    ],
597 |    "source": [
598 |     "%%time\n",
599 |     "\n",
600 |     "# 1361758 — предподсчитанное кол-во документов\n",
601 |     "doc_count = 1361758\n",
602 |     "f = FloatProgress(min=0, max=doc_count)\n",
603 |     "display(f)\n",
604 |     "\n",
605 |     "with open(\"../datasets/ruwiki/docword.ruwiki.txt\", \"w\") as docwordfile:\n",
606 |     "    docwordfile.write(\"%d\\n%d\\n%d\\n\" % (len(dictionary), doc_count, bow_length))\n",
607 |     "    with open(\"../datasets/ruwiki/ruwiki.filtered.txt\", \"r\") as infile:\n",
608 |     "        reader = csv.reader(infile)\n",
609 |     "        for docID, (title, text) in enumerate(reader):\n",
610 |     "            for word, count in Counter(text.split()).items():\n",
611 |     "                docwordfile.write(\"%d %d %d\\n\" % (docID + 1, dict_mapping[word] + 1, count))\n",
612 |     "            f.value += 1"
613 |    ]
614 |   },
615 |   {
616 |    "cell_type": "markdown",
617 |    "metadata": {
618 |     "deletable": true,
619 |     "editable": true
620 |    },
621 |    "source": [
622 |     "---"
623 |    ]
624 |   }
625 |  ],
626 |  "metadata": {
627 |   "kernelspec": {
628 |    "display_name": "Python 3",
629 |    "language": "python",
630 |    "name": "python3"
631 |   },
632 |   "language_info": {
633 |    "codemirror_mode": {
634 |     "name": "ipython",
635 |     "version": 3
636 |    },
637 |    "file_extension": ".py",
638 |    "mimetype": "text/x-python",
639 |    "name": "python",
640 |    "nbconvert_exporter": "python",
641 |    "pygments_lexer": "ipython3",
642 |    "version": "3.5.3"
643 |   },
644 |   "latex_envs": {
645 |    "bibliofile": "biblio.bib",
646 |    "cite_by": "apalike",
647 |    "current_citInitial": 1,
648 |    "eqLabelWithNumbers": true,
649 |    "eqNumInitial": 0
650 |   }
651 |  },
652 |  "nbformat": 4,
653 |  "nbformat_minor": 0
654 | }
655 | 


--------------------------------------------------------------------------------
/server/static/js/hammer.min.js:
--------------------------------------------------------------------------------
1 | /*! Hammer.JS - v2.0.8 - 2016-04-23
2 |  * http://hammerjs.github.io/
3 |  *
4 |  * Copyright (c) 2016 Jorik Tangelder;
5 |  * Licensed under the MIT license */
6 | !function(a,b,c,d){"use strict";function e(a,b,c){return setTimeout(j(a,c),b)}function f(a,b,c){return Array.isArray(a)?(g(a,c[b],c),!0):!1}function g(a,b,c){var e;if(a)if(a.forEach)a.forEach(b,c);else if(a.length!==d)for(e=0;e<a.length;)b.call(c,a[e],e,a),e++;else for(e in a)a.hasOwnProperty(e)&&b.call(c,a[e],e,a)}function h(b,c,d){var e="DEPRECATED METHOD: "+c+"\n"+d+" AT \n";return function(){var c=new Error("get-stack-trace"),d=c&&c.stack?c.stack.replace(/^[^\(]+?[\n$]/gm,"").replace(/^\s+at\s+/gm,"").replace(/^Object.<anonymous>\s*\(/gm,"{anonymous}()@"):"Unknown Stack Trace",f=a.console&&(a.console.warn||a.console.log);return f&&f.call(a.console,e,d),b.apply(this,arguments)}}function i(a,b,c){var d,e=b.prototype;d=a.prototype=Object.create(e),d.constructor=a,d._super=e,c&&la(d,c)}function j(a,b){return function(){return a.apply(b,arguments)}}function k(a,b){return typeof a==oa?a.apply(b?b[0]||d:d,b):a}function l(a,b){return a===d?b:a}function m(a,b,c){g(q(b),function(b){a.addEventListener(b,c,!1)})}function n(a,b,c){g(q(b),function(b){a.removeEventListener(b,c,!1)})}function o(a,b){for(;a;){if(a==b)return!0;a=a.parentNode}return!1}function p(a,b){return a.indexOf(b)>-1}function q(a){return a.trim().split(/\s+/g)}function r(a,b,c){if(a.indexOf&&!c)return a.indexOf(b);for(var d=0;d<a.length;){if(c&&a[d][c]==b||!c&&a[d]===b)return d;d++}return-1}function s(a){return Array.prototype.slice.call(a,0)}function t(a,b,c){for(var d=[],e=[],f=0;f<a.length;){var g=b?a[f][b]:a[f];r(e,g)<0&&d.push(a[f]),e[f]=g,f++}return c&&(d=b?d.sort(function(a,c){return a[b]>c[b]}):d.sort()),d}function u(a,b){for(var c,e,f=b[0].toUpperCase()+b.slice(1),g=0;g<ma.length;){if(c=ma[g],e=c?c+f:b,e in a)return e;g++}return d}function v(){return ua++}function w(b){var c=b.ownerDocument||b;return c.defaultView||c.parentWindow||a}function x(a,b){var c=this;this.manager=a,this.callback=b,this.element=a.element,this.target=a.options.inputTarget,this.domHandler=function(b){k(a.options.enable,[a])&&c.handler(b)},this.init()}function y(a){var b,c=a.options.inputClass;return new(b=c?c:xa?M:ya?P:wa?R:L)(a,z)}function z(a,b,c){var d=c.pointers.length,e=c.changedPointers.length,f=b&Ea&&d-e===0,g=b&(Ga|Ha)&&d-e===0;c.isFirst=!!f,c.isFinal=!!g,f&&(a.session={}),c.eventType=b,A(a,c),a.emit("hammer.input",c),a.recognize(c),a.session.prevInput=c}function A(a,b){var c=a.session,d=b.pointers,e=d.length;c.firstInput||(c.firstInput=D(b)),e>1&&!c.firstMultiple?c.firstMultiple=D(b):1===e&&(c.firstMultiple=!1);var f=c.firstInput,g=c.firstMultiple,h=g?g.center:f.center,i=b.center=E(d);b.timeStamp=ra(),b.deltaTime=b.timeStamp-f.timeStamp,b.angle=I(h,i),b.distance=H(h,i),B(c,b),b.offsetDirection=G(b.deltaX,b.deltaY);var j=F(b.deltaTime,b.deltaX,b.deltaY);b.overallVelocityX=j.x,b.overallVelocityY=j.y,b.overallVelocity=qa(j.x)>qa(j.y)?j.x:j.y,b.scale=g?K(g.pointers,d):1,b.rotation=g?J(g.pointers,d):0,b.maxPointers=c.prevInput?b.pointers.length>c.prevInput.maxPointers?b.pointers.length:c.prevInput.maxPointers:b.pointers.length,C(c,b);var k=a.element;o(b.srcEvent.target,k)&&(k=b.srcEvent.target),b.target=k}function B(a,b){var c=b.center,d=a.offsetDelta||{},e=a.prevDelta||{},f=a.prevInput||{};b.eventType!==Ea&&f.eventType!==Ga||(e=a.prevDelta={x:f.deltaX||0,y:f.deltaY||0},d=a.offsetDelta={x:c.x,y:c.y}),b.deltaX=e.x+(c.x-d.x),b.deltaY=e.y+(c.y-d.y)}function C(a,b){var c,e,f,g,h=a.lastInterval||b,i=b.timeStamp-h.timeStamp;if(b.eventType!=Ha&&(i>Da||h.velocity===d)){var j=b.deltaX-h.deltaX,k=b.deltaY-h.deltaY,l=F(i,j,k);e=l.x,f=l.y,c=qa(l.x)>qa(l.y)?l.x:l.y,g=G(j,k),a.lastInterval=b}else c=h.velocity,e=h.velocityX,f=h.velocityY,g=h.direction;b.velocity=c,b.velocityX=e,b.velocityY=f,b.direction=g}function D(a){for(var b=[],c=0;c<a.pointers.length;)b[c]={clientX:pa(a.pointers[c].clientX),clientY:pa(a.pointers[c].clientY)},c++;return{timeStamp:ra(),pointers:b,center:E(b),deltaX:a.deltaX,deltaY:a.deltaY}}function E(a){var b=a.length;if(1===b)return{x:pa(a[0].clientX),y:pa(a[0].clientY)};for(var c=0,d=0,e=0;b>e;)c+=a[e].clientX,d+=a[e].clientY,e++;return{x:pa(c/b),y:pa(d/b)}}function F(a,b,c){return{x:b/a||0,y:c/a||0}}function G(a,b){return a===b?Ia:qa(a)>=qa(b)?0>a?Ja:Ka:0>b?La:Ma}function H(a,b,c){c||(c=Qa);var d=b[c[0]]-a[c[0]],e=b[c[1]]-a[c[1]];return Math.sqrt(d*d+e*e)}function I(a,b,c){c||(c=Qa);var d=b[c[0]]-a[c[0]],e=b[c[1]]-a[c[1]];return 180*Math.atan2(e,d)/Math.PI}function J(a,b){return I(b[1],b[0],Ra)+I(a[1],a[0],Ra)}function K(a,b){return H(b[0],b[1],Ra)/H(a[0],a[1],Ra)}function L(){this.evEl=Ta,this.evWin=Ua,this.pressed=!1,x.apply(this,arguments)}function M(){this.evEl=Xa,this.evWin=Ya,x.apply(this,arguments),this.store=this.manager.session.pointerEvents=[]}function N(){this.evTarget=$a,this.evWin=_a,this.started=!1,x.apply(this,arguments)}function O(a,b){var c=s(a.touches),d=s(a.changedTouches);return b&(Ga|Ha)&&(c=t(c.concat(d),"identifier",!0)),[c,d]}function P(){this.evTarget=bb,this.targetIds={},x.apply(this,arguments)}function Q(a,b){var c=s(a.touches),d=this.targetIds;if(b&(Ea|Fa)&&1===c.length)return d[c[0].identifier]=!0,[c,c];var e,f,g=s(a.changedTouches),h=[],i=this.target;if(f=c.filter(function(a){return o(a.target,i)}),b===Ea)for(e=0;e<f.length;)d[f[e].identifier]=!0,e++;for(e=0;e<g.length;)d[g[e].identifier]&&h.push(g[e]),b&(Ga|Ha)&&delete d[g[e].identifier],e++;return h.length?[t(f.concat(h),"identifier",!0),h]:void 0}function R(){x.apply(this,arguments);var a=j(this.handler,this);this.touch=new P(this.manager,a),this.mouse=new L(this.manager,a),this.primaryTouch=null,this.lastTouches=[]}function S(a,b){a&Ea?(this.primaryTouch=b.changedPointers[0].identifier,T.call(this,b)):a&(Ga|Ha)&&T.call(this,b)}function T(a){var b=a.changedPointers[0];if(b.identifier===this.primaryTouch){var c={x:b.clientX,y:b.clientY};this.lastTouches.push(c);var d=this.lastTouches,e=function(){var a=d.indexOf(c);a>-1&&d.splice(a,1)};setTimeout(e,cb)}}function U(a){for(var b=a.srcEvent.clientX,c=a.srcEvent.clientY,d=0;d<this.lastTouches.length;d++){var e=this.lastTouches[d],f=Math.abs(b-e.x),g=Math.abs(c-e.y);if(db>=f&&db>=g)return!0}return!1}function V(a,b){this.manager=a,this.set(b)}function W(a){if(p(a,jb))return jb;var b=p(a,kb),c=p(a,lb);return b&&c?jb:b||c?b?kb:lb:p(a,ib)?ib:hb}function X(){if(!fb)return!1;var b={},c=a.CSS&&a.CSS.supports;return["auto","manipulation","pan-y","pan-x","pan-x pan-y","none"].forEach(function(d){b[d]=c?a.CSS.supports("touch-action",d):!0}),b}function Y(a){this.options=la({},this.defaults,a||{}),this.id=v(),this.manager=null,this.options.enable=l(this.options.enable,!0),this.state=nb,this.simultaneous={},this.requireFail=[]}function Z(a){return a&sb?"cancel":a&qb?"end":a&pb?"move":a&ob?"start":""}function $(a){return a==Ma?"down":a==La?"up":a==Ja?"left":a==Ka?"right":""}function _(a,b){var c=b.manager;return c?c.get(a):a}function aa(){Y.apply(this,arguments)}function ba(){aa.apply(this,arguments),this.pX=null,this.pY=null}function ca(){aa.apply(this,arguments)}function da(){Y.apply(this,arguments),this._timer=null,this._input=null}function ea(){aa.apply(this,arguments)}function fa(){aa.apply(this,arguments)}function ga(){Y.apply(this,arguments),this.pTime=!1,this.pCenter=!1,this._timer=null,this._input=null,this.count=0}function ha(a,b){return b=b||{},b.recognizers=l(b.recognizers,ha.defaults.preset),new ia(a,b)}function ia(a,b){this.options=la({},ha.defaults,b||{}),this.options.inputTarget=this.options.inputTarget||a,this.handlers={},this.session={},this.recognizers=[],this.oldCssProps={},this.element=a,this.input=y(this),this.touchAction=new V(this,this.options.touchAction),ja(this,!0),g(this.options.recognizers,function(a){var b=this.add(new a[0](a[1]));a[2]&&b.recognizeWith(a[2]),a[3]&&b.requireFailure(a[3])},this)}function ja(a,b){var c=a.element;if(c.style){var d;g(a.options.cssProps,function(e,f){d=u(c.style,f),b?(a.oldCssProps[d]=c.style[d],c.style[d]=e):c.style[d]=a.oldCssProps[d]||""}),b||(a.oldCssProps={})}}function ka(a,c){var d=b.createEvent("Event");d.initEvent(a,!0,!0),d.gesture=c,c.target.dispatchEvent(d)}var la,ma=["","webkit","Moz","MS","ms","o"],na=b.createElement("div"),oa="function",pa=Math.round,qa=Math.abs,ra=Date.now;la="function"!=typeof Object.assign?function(a){if(a===d||null===a)throw new TypeError("Cannot convert undefined or null to object");for(var b=Object(a),c=1;c<arguments.length;c++){var e=arguments[c];if(e!==d&&null!==e)for(var f in e)e.hasOwnProperty(f)&&(b[f]=e[f])}return b}:Object.assign;var sa=h(function(a,b,c){for(var e=Object.keys(b),f=0;f<e.length;)(!c||c&&a[e[f]]===d)&&(a[e[f]]=b[e[f]]),f++;return a},"extend","Use `assign`."),ta=h(function(a,b){return sa(a,b,!0)},"merge","Use `assign`."),ua=1,va=/mobile|tablet|ip(ad|hone|od)|android/i,wa="ontouchstart"in a,xa=u(a,"PointerEvent")!==d,ya=wa&&va.test(navigator.userAgent),za="touch",Aa="pen",Ba="mouse",Ca="kinect",Da=25,Ea=1,Fa=2,Ga=4,Ha=8,Ia=1,Ja=2,Ka=4,La=8,Ma=16,Na=Ja|Ka,Oa=La|Ma,Pa=Na|Oa,Qa=["x","y"],Ra=["clientX","clientY"];x.prototype={handler:function(){},init:function(){this.evEl&&m(this.element,this.evEl,this.domHandler),this.evTarget&&m(this.target,this.evTarget,this.domHandler),this.evWin&&m(w(this.element),this.evWin,this.domHandler)},destroy:function(){this.evEl&&n(this.element,this.evEl,this.domHandler),this.evTarget&&n(this.target,this.evTarget,this.domHandler),this.evWin&&n(w(this.element),this.evWin,this.domHandler)}};var Sa={mousedown:Ea,mousemove:Fa,mouseup:Ga},Ta="mousedown",Ua="mousemove mouseup";i(L,x,{handler:function(a){var b=Sa[a.type];b&Ea&&0===a.button&&(this.pressed=!0),b&Fa&&1!==a.which&&(b=Ga),this.pressed&&(b&Ga&&(this.pressed=!1),this.callback(this.manager,b,{pointers:[a],changedPointers:[a],pointerType:Ba,srcEvent:a}))}});var Va={pointerdown:Ea,pointermove:Fa,pointerup:Ga,pointercancel:Ha,pointerout:Ha},Wa={2:za,3:Aa,4:Ba,5:Ca},Xa="pointerdown",Ya="pointermove pointerup pointercancel";a.MSPointerEvent&&!a.PointerEvent&&(Xa="MSPointerDown",Ya="MSPointerMove MSPointerUp MSPointerCancel"),i(M,x,{handler:function(a){var b=this.store,c=!1,d=a.type.toLowerCase().replace("ms",""),e=Va[d],f=Wa[a.pointerType]||a.pointerType,g=f==za,h=r(b,a.pointerId,"pointerId");e&Ea&&(0===a.button||g)?0>h&&(b.push(a),h=b.length-1):e&(Ga|Ha)&&(c=!0),0>h||(b[h]=a,this.callback(this.manager,e,{pointers:b,changedPointers:[a],pointerType:f,srcEvent:a}),c&&b.splice(h,1))}});var Za={touchstart:Ea,touchmove:Fa,touchend:Ga,touchcancel:Ha},$a="touchstart",_a="touchstart touchmove touchend touchcancel";i(N,x,{handler:function(a){var b=Za[a.type];if(b===Ea&&(this.started=!0),this.started){var c=O.call(this,a,b);b&(Ga|Ha)&&c[0].length-c[1].length===0&&(this.started=!1),this.callback(this.manager,b,{pointers:c[0],changedPointers:c[1],pointerType:za,srcEvent:a})}}});var ab={touchstart:Ea,touchmove:Fa,touchend:Ga,touchcancel:Ha},bb="touchstart touchmove touchend touchcancel";i(P,x,{handler:function(a){var b=ab[a.type],c=Q.call(this,a,b);c&&this.callback(this.manager,b,{pointers:c[0],changedPointers:c[1],pointerType:za,srcEvent:a})}});var cb=2500,db=25;i(R,x,{handler:function(a,b,c){var d=c.pointerType==za,e=c.pointerType==Ba;if(!(e&&c.sourceCapabilities&&c.sourceCapabilities.firesTouchEvents)){if(d)S.call(this,b,c);else if(e&&U.call(this,c))return;this.callback(a,b,c)}},destroy:function(){this.touch.destroy(),this.mouse.destroy()}});var eb=u(na.style,"touchAction"),fb=eb!==d,gb="compute",hb="auto",ib="manipulation",jb="none",kb="pan-x",lb="pan-y",mb=X();V.prototype={set:function(a){a==gb&&(a=this.compute()),fb&&this.manager.element.style&&mb[a]&&(this.manager.element.style[eb]=a),this.actions=a.toLowerCase().trim()},update:function(){this.set(this.manager.options.touchAction)},compute:function(){var a=[];return g(this.manager.recognizers,function(b){k(b.options.enable,[b])&&(a=a.concat(b.getTouchAction()))}),W(a.join(" "))},preventDefaults:function(a){var b=a.srcEvent,c=a.offsetDirection;if(this.manager.session.prevented)return void b.preventDefault();var d=this.actions,e=p(d,jb)&&!mb[jb],f=p(d,lb)&&!mb[lb],g=p(d,kb)&&!mb[kb];if(e){var h=1===a.pointers.length,i=a.distance<2,j=a.deltaTime<250;if(h&&i&&j)return}return g&&f?void 0:e||f&&c&Na||g&&c&Oa?this.preventSrc(b):void 0},preventSrc:function(a){this.manager.session.prevented=!0,a.preventDefault()}};var nb=1,ob=2,pb=4,qb=8,rb=qb,sb=16,tb=32;Y.prototype={defaults:{},set:function(a){return la(this.options,a),this.manager&&this.manager.touchAction.update(),this},recognizeWith:function(a){if(f(a,"recognizeWith",this))return this;var b=this.simultaneous;return a=_(a,this),b[a.id]||(b[a.id]=a,a.recognizeWith(this)),this},dropRecognizeWith:function(a){return f(a,"dropRecognizeWith",this)?this:(a=_(a,this),delete this.simultaneous[a.id],this)},requireFailure:function(a){if(f(a,"requireFailure",this))return this;var b=this.requireFail;return a=_(a,this),-1===r(b,a)&&(b.push(a),a.requireFailure(this)),this},dropRequireFailure:function(a){if(f(a,"dropRequireFailure",this))return this;a=_(a,this);var b=r(this.requireFail,a);return b>-1&&this.requireFail.splice(b,1),this},hasRequireFailures:function(){return this.requireFail.length>0},canRecognizeWith:function(a){return!!this.simultaneous[a.id]},emit:function(a){function b(b){c.manager.emit(b,a)}var c=this,d=this.state;qb>d&&b(c.options.event+Z(d)),b(c.options.event),a.additionalEvent&&b(a.additionalEvent),d>=qb&&b(c.options.event+Z(d))},tryEmit:function(a){return this.canEmit()?this.emit(a):void(this.state=tb)},canEmit:function(){for(var a=0;a<this.requireFail.length;){if(!(this.requireFail[a].state&(tb|nb)))return!1;a++}return!0},recognize:function(a){var b=la({},a);return k(this.options.enable,[this,b])?(this.state&(rb|sb|tb)&&(this.state=nb),this.state=this.process(b),void(this.state&(ob|pb|qb|sb)&&this.tryEmit(b))):(this.reset(),void(this.state=tb))},process:function(a){},getTouchAction:function(){},reset:function(){}},i(aa,Y,{defaults:{pointers:1},attrTest:function(a){var b=this.options.pointers;return 0===b||a.pointers.length===b},process:function(a){var b=this.state,c=a.eventType,d=b&(ob|pb),e=this.attrTest(a);return d&&(c&Ha||!e)?b|sb:d||e?c&Ga?b|qb:b&ob?b|pb:ob:tb}}),i(ba,aa,{defaults:{event:"pan",threshold:10,pointers:1,direction:Pa},getTouchAction:function(){var a=this.options.direction,b=[];return a&Na&&b.push(lb),a&Oa&&b.push(kb),b},directionTest:function(a){var b=this.options,c=!0,d=a.distance,e=a.direction,f=a.deltaX,g=a.deltaY;return e&b.direction||(b.direction&Na?(e=0===f?Ia:0>f?Ja:Ka,c=f!=this.pX,d=Math.abs(a.deltaX)):(e=0===g?Ia:0>g?La:Ma,c=g!=this.pY,d=Math.abs(a.deltaY))),a.direction=e,c&&d>b.threshold&&e&b.direction},attrTest:function(a){return aa.prototype.attrTest.call(this,a)&&(this.state&ob||!(this.state&ob)&&this.directionTest(a))},emit:function(a){this.pX=a.deltaX,this.pY=a.deltaY;var b=$(a.direction);b&&(a.additionalEvent=this.options.event+b),this._super.emit.call(this,a)}}),i(ca,aa,{defaults:{event:"pinch",threshold:0,pointers:2},getTouchAction:function(){return[jb]},attrTest:function(a){return this._super.attrTest.call(this,a)&&(Math.abs(a.scale-1)>this.options.threshold||this.state&ob)},emit:function(a){if(1!==a.scale){var b=a.scale<1?"in":"out";a.additionalEvent=this.options.event+b}this._super.emit.call(this,a)}}),i(da,Y,{defaults:{event:"press",pointers:1,time:251,threshold:9},getTouchAction:function(){return[hb]},process:function(a){var b=this.options,c=a.pointers.length===b.pointers,d=a.distance<b.threshold,f=a.deltaTime>b.time;if(this._input=a,!d||!c||a.eventType&(Ga|Ha)&&!f)this.reset();else if(a.eventType&Ea)this.reset(),this._timer=e(function(){this.state=rb,this.tryEmit()},b.time,this);else if(a.eventType&Ga)return rb;return tb},reset:function(){clearTimeout(this._timer)},emit:function(a){this.state===rb&&(a&&a.eventType&Ga?this.manager.emit(this.options.event+"up",a):(this._input.timeStamp=ra(),this.manager.emit(this.options.event,this._input)))}}),i(ea,aa,{defaults:{event:"rotate",threshold:0,pointers:2},getTouchAction:function(){return[jb]},attrTest:function(a){return this._super.attrTest.call(this,a)&&(Math.abs(a.rotation)>this.options.threshold||this.state&ob)}}),i(fa,aa,{defaults:{event:"swipe",threshold:10,velocity:.3,direction:Na|Oa,pointers:1},getTouchAction:function(){return ba.prototype.getTouchAction.call(this)},attrTest:function(a){var b,c=this.options.direction;return c&(Na|Oa)?b=a.overallVelocity:c&Na?b=a.overallVelocityX:c&Oa&&(b=a.overallVelocityY),this._super.attrTest.call(this,a)&&c&a.offsetDirection&&a.distance>this.options.threshold&&a.maxPointers==this.options.pointers&&qa(b)>this.options.velocity&&a.eventType&Ga},emit:function(a){var b=$(a.offsetDirection);b&&this.manager.emit(this.options.event+b,a),this.manager.emit(this.options.event,a)}}),i(ga,Y,{defaults:{event:"tap",pointers:1,taps:1,interval:300,time:250,threshold:9,posThreshold:10},getTouchAction:function(){return[ib]},process:function(a){var b=this.options,c=a.pointers.length===b.pointers,d=a.distance<b.threshold,f=a.deltaTime<b.time;if(this.reset(),a.eventType&Ea&&0===this.count)return this.failTimeout();if(d&&f&&c){if(a.eventType!=Ga)return this.failTimeout();var g=this.pTime?a.timeStamp-this.pTime<b.interval:!0,h=!this.pCenter||H(this.pCenter,a.center)<b.posThreshold;this.pTime=a.timeStamp,this.pCenter=a.center,h&&g?this.count+=1:this.count=1,this._input=a;var i=this.count%b.taps;if(0===i)return this.hasRequireFailures()?(this._timer=e(function(){this.state=rb,this.tryEmit()},b.interval,this),ob):rb}return tb},failTimeout:function(){return this._timer=e(function(){this.state=tb},this.options.interval,this),tb},reset:function(){clearTimeout(this._timer)},emit:function(){this.state==rb&&(this._input.tapCount=this.count,this.manager.emit(this.options.event,this._input))}}),ha.VERSION="2.0.8",ha.defaults={domEvents:!1,touchAction:gb,enable:!0,inputTarget:null,inputClass:null,preset:[[ea,{enable:!1}],[ca,{enable:!1},["rotate"]],[fa,{direction:Na}],[ba,{direction:Na},["swipe"]],[ga],[ga,{event:"doubletap",taps:2},["tap"]],[da]],cssProps:{userSelect:"none",touchSelect:"none",touchCallout:"none",contentZooming:"none",userDrag:"none",tapHighlightColor:"rgba(0,0,0,0)"}};var ub=1,vb=2;ia.prototype={set:function(a){return la(this.options,a),a.touchAction&&this.touchAction.update(),a.inputTarget&&(this.input.destroy(),this.input.target=a.inputTarget,this.input.init()),this},stop:function(a){this.session.stopped=a?vb:ub},recognize:function(a){var b=this.session;if(!b.stopped){this.touchAction.preventDefaults(a);var c,d=this.recognizers,e=b.curRecognizer;(!e||e&&e.state&rb)&&(e=b.curRecognizer=null);for(var f=0;f<d.length;)c=d[f],b.stopped===vb||e&&c!=e&&!c.canRecognizeWith(e)?c.reset():c.recognize(a),!e&&c.state&(ob|pb|qb)&&(e=b.curRecognizer=c),f++}},get:function(a){if(a instanceof Y)return a;for(var b=this.recognizers,c=0;c<b.length;c++)if(b[c].options.event==a)return b[c];return null},add:function(a){if(f(a,"add",this))return this;var b=this.get(a.options.event);return b&&this.remove(b),this.recognizers.push(a),a.manager=this,this.touchAction.update(),a},remove:function(a){if(f(a,"remove",this))return this;if(a=this.get(a)){var b=this.recognizers,c=r(b,a);-1!==c&&(b.splice(c,1),this.touchAction.update())}return this},on:function(a,b){if(a!==d&&b!==d){var c=this.handlers;return g(q(a),function(a){c[a]=c[a]||[],c[a].push(b)}),this}},off:function(a,b){if(a!==d){var c=this.handlers;return g(q(a),function(a){b?c[a]&&c[a].splice(r(c[a],b),1):delete c[a]}),this}},emit:function(a,b){this.options.domEvents&&ka(a,b);var c=this.handlers[a]&&this.handlers[a].slice();if(c&&c.length){b.type=a,b.preventDefault=function(){b.srcEvent.preventDefault()};for(var d=0;d<c.length;)c[d](b),d++}},destroy:function(){this.element&&ja(this,!1),this.handlers={},this.session={},this.input.destroy(),this.element=null}},la(ha,{INPUT_START:Ea,INPUT_MOVE:Fa,INPUT_END:Ga,INPUT_CANCEL:Ha,STATE_POSSIBLE:nb,STATE_BEGAN:ob,STATE_CHANGED:pb,STATE_ENDED:qb,STATE_RECOGNIZED:rb,STATE_CANCELLED:sb,STATE_FAILED:tb,DIRECTION_NONE:Ia,DIRECTION_LEFT:Ja,DIRECTION_RIGHT:Ka,DIRECTION_UP:La,DIRECTION_DOWN:Ma,DIRECTION_HORIZONTAL:Na,DIRECTION_VERTICAL:Oa,DIRECTION_ALL:Pa,Manager:ia,Input:x,TouchAction:V,TouchInput:P,MouseInput:L,PointerEventInput:M,TouchMouseInput:R,SingleTouchInput:N,Recognizer:Y,AttrRecognizer:aa,Tap:ga,Pan:ba,Swipe:fa,Pinch:ca,Rotate:ea,Press:da,on:m,off:n,each:g,merge:ta,extend:sa,assign:la,inherit:i,bindFn:j,prefixed:u});var wb="undefined"!=typeof a?a:"undefined"!=typeof self?self:{};wb.Hammer=ha,"function"==typeof define&&define.amd?define(function(){return ha}):"undefined"!=typeof module&&module.exports?module.exports=ha:a[c]=ha}(window,document,"Hammer");
7 | //# sourceMappingURL=hammer.min.js.map


--------------------------------------------------------------------------------