├── experiments ├── server ├── parsers ├── .gitignore ├── Calculate pn and habr cooccurrences.ipynb ├── Count number of tokens in VW file.ipynb ├── Exporting Postnauka from MySQL dump.ipynb ├── Import of hierarchical spectrums.ipynb ├── Most popular-sciency Habr authors.ipynb ├── UCI Merger.ipynb ├── Sending requests to server from ARTM_model.ipynb ├── Comparison of different lemmatizers.ipynb ├── Parsing habr dataset.ipynb ├── Parsing elementy website.ipynb ├── Spectrum experiments.ipynb ├── Topical Similarity Measurements for ARTM RecSys.ipynb └── Parsing ruwiki dataset.ipynb ├── server ├── parsers ├── experiments ├── hartm ├── static │ ├── .gitignore │ ├── js │ │ ├── d3.min.js │ │ ├── jquery.min.js │ │ ├── bootstrap.min.js │ │ ├── fileinput.min.js │ │ ├── jquery.history.min.js │ │ └── hammer.min.js │ ├── css │ │ ├── fileinput.min.css │ │ └── index.css │ ├── img │ │ ├── pn.png │ │ ├── elem.png │ │ ├── habr.png │ │ ├── loading.gif │ │ └── loading-sm.gif │ ├── fonts │ │ ├── glyphicons-halflings-regular.woff │ │ └── glyphicons-halflings-regular.woff2 │ ├── bower.json │ └── index.html ├── hierarchy_utils.py ├── .gitignore ├── package.json ├── artm_proxy.py ├── server.js ├── artm_bridge.py └── artm_lib.py ├── parsers ├── hierarchy_utils.py ├── arbitrary.py ├── habrahabr.ipynb ├── postnauka.ipynb └── text_utils.py ├── .gitmodules ├── .gitignore ├── bigartm.nix ├── bigartm_py.nix ├── default.nix ├── README.md └── Dockerfile /experiments/server: -------------------------------------------------------------------------------- 1 | ../server -------------------------------------------------------------------------------- /server/parsers: -------------------------------------------------------------------------------- 1 | ../parsers/ -------------------------------------------------------------------------------- /experiments/parsers: -------------------------------------------------------------------------------- 1 | ../parsers -------------------------------------------------------------------------------- /server/experiments: -------------------------------------------------------------------------------- 1 | ../experiments/ -------------------------------------------------------------------------------- /server/hartm: -------------------------------------------------------------------------------- 1 | ../experiments/hartm -------------------------------------------------------------------------------- /server/static/.gitignore: -------------------------------------------------------------------------------- 1 | bower_components/ 2 | -------------------------------------------------------------------------------- /server/hierarchy_utils.py: -------------------------------------------------------------------------------- 1 | ../experiments/hierarchy_utils.py -------------------------------------------------------------------------------- /server/static/js/d3.min.js: -------------------------------------------------------------------------------- 1 | ../bower_components/d3/d3.min.js -------------------------------------------------------------------------------- /parsers/hierarchy_utils.py: -------------------------------------------------------------------------------- 1 | ../experiments/hierarchy_utils.py -------------------------------------------------------------------------------- /server/static/js/jquery.min.js: -------------------------------------------------------------------------------- 1 | ../bower_components/jquery/dist/jquery.min.js 
-------------------------------------------------------------------------------- /server/static/js/bootstrap.min.js: -------------------------------------------------------------------------------- 1 | ../bower_components/bootstrap/dist/js/bootstrap.min.js -------------------------------------------------------------------------------- /server/static/js/fileinput.min.js: -------------------------------------------------------------------------------- 1 | ../bower_components/bootstrap-fileinput/js/fileinput.min.js -------------------------------------------------------------------------------- /server/static/css/fileinput.min.css: -------------------------------------------------------------------------------- 1 | ../bower_components/bootstrap-fileinput/css/fileinput.min.css -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "bigartm"] 2 | path = bigartm 3 | url = https://github.com/bigartm/bigartm 4 | -------------------------------------------------------------------------------- /server/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | uploads/ 3 | npm-debug.log 4 | hartm.mdl 5 | bigartm.* 6 | *.batch 7 | -------------------------------------------------------------------------------- /server/static/img/pn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/pn.png -------------------------------------------------------------------------------- /server/static/img/elem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/elem.png -------------------------------------------------------------------------------- /server/static/img/habr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/habr.png -------------------------------------------------------------------------------- /server/static/js/jquery.history.min.js: -------------------------------------------------------------------------------- 1 | ../bower_components/history.js/scripts/bundled/html4+html5/jquery.history.js -------------------------------------------------------------------------------- /server/static/img/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/loading.gif -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | .ipynb_checkpoints/ 3 | __pycache__/ 4 | venv/ 5 | result 6 | *.DS_Store 7 | nohup.out 8 | *.dump 9 | -------------------------------------------------------------------------------- /server/static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- 1 | ../bower_components/bootstrap/dist/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /server/static/img/loading-sm.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TatianaShavrina/Rysearch/master/server/static/img/loading-sm.gif -------------------------------------------------------------------------------- /server/static/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- 1 | ../bower_components/bootstrap/dist/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /experiments/.gitignore: -------------------------------------------------------------------------------- 1 | pn_batches/ 2 | hartm/ 3 | transform_batches/ 4 | test_batches/ 5 | bigartm.* 6 | *.txt 7 | *.batch 8 | *.tar 9 | *.tar.gz 10 | *.tar.bz2 11 | *.csv 12 | -------------------------------------------------------------------------------- /server/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Rysearch", 3 | "version": "0.0.1", 4 | "private": "true", 5 | "dependencies": { 6 | "express": ">=4.14.1", 7 | "zmq": ">=2.15.3", 8 | "uuid": ">=3.0.1", 9 | "multer": ">=1.3.0", 10 | "body-parser": ">=1.17.2" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /bigartm.nix: -------------------------------------------------------------------------------- 1 | with import {}; { 2 | bigartm = stdenv.mkDerivation rec { 3 | name = "bigartm"; 4 | 5 | buildInputs = [ cmake boost python python3 python35Packages.setuptools ]; 6 | cmakeFlags = "-DBUILD_TESTS=OFF -DBUILD_BIGARTM_CLI=OFF"; 7 | makeFlags = "-j4"; 8 | 9 | LDFLAGS="-L${boost.dev}/lib"; 10 | 11 | src = ./bigartm; 12 | }; 13 | } 14 | -------------------------------------------------------------------------------- /bigartm_py.nix: -------------------------------------------------------------------------------- 1 | { python27Packages, python35Packages, protobuf }: 2 | 3 | python35Packages.buildPythonPackage rec { 4 | name = "bigartm"; 5 | 6 | buildInputs = [ python27Packages.protobuf3_0 ]; 7 | 8 | propagatedBuildInputs = with python35Packages; [ 9 | numpy 10 | pandas 11 | tqdm 12 | ] ++ [ protobuf ]; 13 | 14 | src = ./bigartm; 15 | 16 | preConfigure = '' 17 | export PYTHONPATH="${python27Packages.protobuf3_0}/lib/python2.7/site-packages:$PYTHONPATH"; 18 | cd python 19 | ''; 20 | } 21 | -------------------------------------------------------------------------------- /server/static/bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "rysearch", 3 | "homepage": "https://github.com/AVBelyy/Rysearch", 4 | "authors": [ 5 | "Anton Belyy " 6 | ], 7 | "description": "An exploratoRY SEARCH engine", 8 | "dependencies": { 9 | "d3": ">=4.10.0", 10 | "bootstrap": ">=3.3.7", 11 | "bootstrap-fileinput": ">=4.4.2", 12 | "history.js": ">=1.8.0" 13 | }, 14 | "main": "", 15 | "license": "MIT", 16 | "private": true, 17 | "ignore": [ 18 | "**/.*", 19 | "node_modules", 20 | "bower_components", 21 | "test", 22 | "tests" 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | with import {}; 2 | 3 | let bigartm_python = callPackage ./bigartm_py.nix { 4 | python27Packages = python27Packages; 5 | python35Packages = python35Packages; 6 | protobuf = protobuf3_0; 7 | }; 8 | in { 9 | rysearch = stdenv.mkDerivation rec { 10 | name = "rysearch"; 11 | 12 | buildInputs = [ 13 | nodejs 14 | zeromq 15 | 
python35Packages.pymongo 16 | python35Packages.pyzmq 17 | python35Packages.numpy 18 | python35Packages.scipy 19 | python35Packages.pandas 20 | python35Packages.scikitlearn 21 | python35Packages.regex 22 | python35Packages.virtualenv 23 | ]; 24 | 25 | shellHook = '' 26 | if [ ! -d venv ]; then 27 | virtualenv --python=python3.5 venv 28 | venv/bin/pip install pymystem3 29 | venv/bin/pip install tqdm 30 | venv/bin/pip install protobuf==3.0.0 31 | fi 32 | export PATH="$(pwd)/venv/bin:$PATH" 33 | export ARTM_SHARED_LIBRARY="$(pwd)/result/lib/libartm.so"; 34 | export PYTHONPATH="$PYTHONPATH:$(toPythonPath ${bigartm_python})"; 35 | ''; 36 | }; 37 | } 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rysearch 2 | Rysearch is an explorato**ry search** engine and recommender system. Built on [BigARTM](http://bigartm.org), an open-source library for topic modeling, it takes the latent topical structure of texts into account to achieve good results in both knowledge exploration and visualization [1]. 3 | 4 | ## Quick start 5 | Use our pre-configured Docker image for quick installation. Run: 6 | ```bash 7 | docker run -t -p 3000:3000 tohnann/rysearch 8 | ``` 9 | And then open [http://localhost:3000](http://localhost:3000). 10 | 11 | ## Manual installation 12 | Everything has been tested on Linux (NixOS) and Windows. If something doesn't work as described here, please open an issue. 13 | 14 | ### Running a Rysearch server 15 | ```bash 16 | cd server/ 17 | 18 | # Install Node.js libraries 19 | npm install 20 | ``` 21 | 22 | The Rysearch server consists of two workers: the ARTM_bridge and the Node.js server. Run them as two separate processes, like this: 23 | ```bash 24 | # Run ARTM_bridge 25 | python3 artm_bridge.py 26 | ``` 27 | 28 | ```bash 29 | # Run Node.js server 30 | npm start 31 | ``` 32 | 33 | [1] K. V. Vorontsov et al. Non-Bayesian Additive Regularization for Multimodal Topic Modeling of Large Collections. In *Proceedings of the 2015 Workshop on Topic Models: Post-Processing and Applications (TM '15)*, 2015.
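Once the server is up, either through the Docker image from the quick start or the manual setup above, the Node.js worker answers plain HTTP GET requests such as `/get-topics` (served from `server/server.js`, which proxies data it fetches from ARTM_bridge over ZeroMQ). The snippet below is only a hedged smoke-test sketch and is not part of the repository; the port and endpoint name come from `server.js`, while the script itself and everything else in it are assumptions:

```python
# check_server.py -- hypothetical smoke test for a running Rysearch instance.
# Assumes the Node.js server described in the README is listening on localhost:3000.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:3000/get-topics") as resp:
    payload = json.loads(resp.read().decode("utf-8"))

# server.js replies either with the cached topic hierarchy or with
# {"error": "topics data not ready yet"} if ARTM_bridge has not responded yet.
if isinstance(payload, dict) and "error" in payload:
    print("Server is up, but topics are not ready:", payload["error"])
else:
    print("Got a topics payload of type", type(payload).__name__)
```

If the error branch triggers, it most likely just means that `artm_bridge.py` has not finished loading the model yet.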
34 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu 2 | 3 | RUN apt-get -y update && \ 4 | apt-get -y dist-upgrade 5 | 6 | RUN apt-get -y install \ 7 | python3 \ 8 | python3-pymongo \ 9 | python3-zmq \ 10 | python3-numpy \ 11 | python3-scipy \ 12 | python3-sklearn \ 13 | python3-pip && \ 14 | pip3 install pandas 15 | 16 | RUN apt-get -y install \ 17 | nodejs \ 18 | npm 19 | 20 | RUN apt-get install -y wget libtool pkg-config build-essential autoconf automake uuid-dev && \ 21 | cd ~ && \ 22 | wget http://download.zeromq.org/zeromq-4.0.5.tar.gz && \ 23 | tar xvzf zeromq-4.0.5.tar.gz && \ 24 | cd zeromq-4.0.5 && \ 25 | ./configure && \ 26 | make install && \ 27 | ldconfig 28 | 29 | RUN apt-get install -y git 30 | RUN ln -s /usr/bin/nodejs /usr/bin/node 31 | 32 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0C49F3730359A14518585931BC711F9BA15703C6 && \ 33 | echo "deb [ arch=amd64,arm64 ] http://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/3.4 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-3.4.list && \ 34 | apt-get update && \ 35 | apt-get -y install mongodb-org 36 | 37 | RUN apt-get -y install tmux 38 | 39 | RUN cd ~ && \ 40 | wget -qO- https://www.dropbox.com/s/uy4nfqr1m4spvvu/datasets.tar.gz | tar xzv && \ 41 | wget https://www.dropbox.com/s/h75rz3hfvpzanji/hartm.mdl 42 | 43 | EXPOSE 3000 44 | 45 | RUN locale-gen en_US.UTF-8 46 | ENV LANG en_US.UTF-8 47 | ENV LANGUAGE en_US:en 48 | ENV LC_ALL en_US.UTF-8 49 | 50 | CMD cd ~ && \ 51 | tmux new-session -s "rysearch" -d && \ 52 | tmux new-window -t "rysearch:1" "mongod -f /etc/mongod.conf" && \ 53 | if [ ! -d rysearch ]; then git clone -b master https://github.com/AVBelyy/Rysearch.git rysearch; fi && \ 54 | if [ -d shared ]; then rm ~/hartm.mdl; ln -s shared/hartm.mdl ~/hartm.mdl; fi && \ 55 | if [ -d shared ]; then rm -rf ~/datasets; ln -s shared/datasets ~/datasets; fi && \ 56 | cd rysearch/server && \ 57 | ln -s ~/hartm.mdl hartm.mdl && \ 58 | mongorestore -d datasets ~/datasets && \ 59 | tmux new-window -t "rysearch:2" "python3 artm_bridge.py" && \ 60 | tmux split-window -t "rysearch:2" -v "npm install >/dev/null 2>&1 && npm start" && \ 61 | tmux select-window -t "rysearch:2" && \ 62 | tmux attach-session -t "rysearch" 63 | -------------------------------------------------------------------------------- /parsers/arbitrary.py: -------------------------------------------------------------------------------- 1 | # Парсер произвольного документа 2 | 3 | import regex 4 | import unicodedata 5 | 6 | from sklearn.pipeline import Pipeline 7 | from pathlib import Path 8 | 9 | from parsers.text_utils import BaseSource, BaseProcessor, BaseSink 10 | from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor, DefaultCollectionProcessor 11 | from parsers.text_utils import VowpalWabbitSink, MongoDbSink 12 | 13 | class ArbitraryFileSource(BaseSource): 14 | def fit(self, iter_source, *args): 15 | self.iter_source = iter_source 16 | return self 17 | 18 | class ArbitraryFileProcessor(BaseProcessor): 19 | def __init__(self, stop_words): 20 | self.doc_pipeline = Pipeline([ 21 | ("text-processor", DefaultTextProcessor(token_pattern="(?u)\\b\\p{L}+\\b")), 22 | ("document-processor", DefaultDocumentProcessor(stop_lemmas=stop_words)), 23 | ]) 24 | 25 | @staticmethod 26 | def strip_accents(s): 27 | unused_char = '\U00037b84' 28 | s = s.replace("й", unused_char) 29 | return 
"".join((c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")).replace(unused_char, "й") 30 | 31 | def transform(self, src, *args): 32 | # Parse text file 33 | text = src.iter_source.read() 34 | # Get rid of accent marks 35 | text = ArbitraryFileProcessor.strip_accents(text) 36 | # Run inner pipeline to form modalities 37 | modalities = self.doc_pipeline.fit_transform(text) 38 | # Finally, make a document and return it 39 | doc = {} 40 | doc["modalities"] = modalities 41 | doc["markdown"] = text 42 | return doc 43 | 44 | def get_pipeline(): 45 | root_path = Path("../datasets/arbitrary") 46 | stop_words = (root_path / "stopwords.txt").open().read().split() 47 | return Pipeline([ 48 | ("file-source", ArbitraryFileSource()), 49 | ("file-processor", ArbitraryFileProcessor(stop_words)), 50 | ]) 51 | 52 | if __name__ == "__main__": 53 | import argparse 54 | pipeline = get_pipeline() 55 | argparser = argparse.ArgumentParser() 56 | argparser.add_argument("source_file") 57 | # argparser.add_argument("target_file") 58 | args = argparser.parse_args() 59 | with open(args.source_file) as src: 60 | doc = pipeline.fit_transform(src) 61 | print(doc) -------------------------------------------------------------------------------- /server/artm_proxy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load-balancing proxy for ARTM backend. 3 | """ 4 | 5 | import multiprocessing 6 | import subprocess 7 | import zmq 8 | 9 | ZMQ_FRONTEND_PORT = 2411 10 | ZMQ_BACKEND_PORT = 2511 11 | 12 | EMPTY = b"" 13 | 14 | try: 15 | context = zmq.Context.instance() 16 | frontend = context.socket(zmq.ROUTER) 17 | frontend.bind("tcp://*:%d" % ZMQ_FRONTEND_PORT) 18 | backend = context.socket(zmq.ROUTER) 19 | backend.bind("tcp://*:%d" % ZMQ_BACKEND_PORT) 20 | 21 | # Initialize main loop state 22 | available_workers = [] 23 | poller = zmq.Poller() 24 | # Only poll for requests from backend until workers are available 25 | poller.register(backend, zmq.POLLIN) 26 | 27 | print("ARTM_proxy: start serving ZeroMQ queries on ports", 28 | ZMQ_FRONTEND_PORT, "and", ZMQ_BACKEND_PORT) 29 | 30 | # Main loop 31 | # TODO: remove stale workers by time-out 32 | while True: 33 | sockets = dict(poller.poll()) 34 | prev_len = len(available_workers) 35 | 36 | if backend in sockets: 37 | response = backend.recv_multipart() 38 | worker, client = response[:2] 39 | if client == b"UP": 40 | available_workers.append(worker) 41 | elif client == b"DOWN": 42 | if worker in available_workers: 43 | available_workers.remove(worker) 44 | elif len(response) > 2: 45 | # If worker replied, send rest back to client 46 | reply = response[2] 47 | frontend.send_multipart([client, reply]) 48 | available_workers.append(worker) 49 | 50 | if frontend in sockets: 51 | # Get next client request, route to last-used worker 52 | # TODO: learn different routing tactics 53 | client, request = frontend.recv_multipart() 54 | worker = available_workers.pop(0) 55 | backend.send_multipart([worker, client, request]) 56 | 57 | if len(available_workers) > 0 and prev_len == 0: 58 | # Poll for clients now that a worker is available 59 | poller.register(frontend, zmq.POLLIN) 60 | if len(available_workers) == 0 and frontend in poller: 61 | # Don't poll clients if no workers are available 62 | poller.unregister(frontend) 63 | except: 64 | import traceback 65 | traceback.print_exc() 66 | print("Shutting down ARTM_proxy...") 67 | finally: 68 | # Clean up 69 | backend.close() 70 | frontend.close() 71 | context.term() 72 | 
-------------------------------------------------------------------------------- /experiments/Calculate pn and habr cooccurrences.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import itertools\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "import matplotlib.pyplot as plt" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# Загрузим словарь со словами объединенной коллекции\n", 27 | "cooc_modality = \"text\"\n", 28 | "\n", 29 | "vocab_list = list(map(lambda r: r[0], filter(lambda r: len(r) > 1 and r[1] == cooc_modality,\n", 30 | " map(lambda r: r.strip().split(), open(\"merged_vocab.txt\", \"r\")))))\n", 31 | "vocab_map = dict(zip(vocab_list, range(len(vocab_list))))" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "%%time\n", 43 | "\n", 44 | "word_count = {}\n", 45 | "pair_count = {}\n", 46 | "\n", 47 | "for i, text in enumerate(open(\"batch_vw.txt\"), 1):\n", 48 | " text = set(text.strip().split()[2:])\n", 49 | " token_ids = set(filter(None, map(vocab_map.get, text)))\n", 50 | " for u in token_ids:\n", 51 | " word_count.setdefault(u, 0)\n", 52 | " word_count[u] += 1\n", 53 | " for p in itertools.combinations(token_ids, 2):\n", 54 | " pair_count.setdefault(p, 0)\n", 55 | " pair_count[p] += 1\n", 56 | " if i % 100 == 0:\n", 57 | " print(\"Processed %i documents\" % i)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "---" 65 | ] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 3", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.5.3" 85 | }, 86 | "latex_envs": { 87 | "bibliofile": "biblio.bib", 88 | "cite_by": "apalike", 89 | "current_citInitial": 1, 90 | "eqLabelWithNumbers": true, 91 | "eqNumInitial": 0 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /server/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Exploratory search 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 50 | 51 | 52 |
59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /experiments/Count number of tokens in VW file.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "CPU times: user 33.2 s, sys: 271 ms, total: 33.5 s\n", 26 | "Wall time: 33.4 s\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "%%time\n", 32 | "\n", 33 | "stats = []\n", 34 | "\n", 35 | "for i, doc in enumerate(open(\"../datasets/habrahabr/habrahabr.txt\")):\n", 36 | " tokens = doc.split()\n", 37 | " doc_id = tokens[0]\n", 38 | " modalities = {}\n", 39 | " cur_mod = \"\"\n", 40 | " for token in tokens[1:]:\n", 41 | " if token.startswith(\"|\"):\n", 42 | " cur_mod = token[1:]\n", 43 | " modalities[cur_mod] = []\n", 44 | " else:\n", 45 | " modalities[cur_mod].append(token)\n", 46 | " stats.append((doc_id, len(modalities[\"text\"]), len(set(modalities[\"text\"])), \\\n", 47 | " len(modalities[\"text_habr\"]), len(set(modalities[\"text_habr\"]))))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "stats_df = pd.DataFrame(stats, columns=[\"doc_id\", \"n_all_common\", \"n_uniq_common\", \"n_all_spec\", \"n_uniq_spec\"])\n", 59 | "stats_df = stats_df.set_index(\"doc_id\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "stats_df.to_csv(\"modalities_stats.csv\")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "---" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.5.2" 98 | }, 99 | "latex_envs": { 100 | "bibliofile": "biblio.bib", 101 | "cite_by": "apalike", 102 | "current_citInitial": 1, 103 | "eqLabelWithNumbers": true, 104 | "eqNumInitial": 0 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 0 109 | } 110 | -------------------------------------------------------------------------------- /server/static/css/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | padding-top: 70px; 3 | } 4 | 5 | #knowledge_map_container { 6 | position: absolute; 7 | top: 60px; 8 | bottom: 0px; 9 | left: 0px; 10 | right: 0px; 11 | } 12 | 13 | #doc_sunburst_container { 14 | position: absolute; 15 | top: 60px; 16 | bottom: 0px; 17 | left: 0px; 18 | right: 0px; 19 | } 20 | 21 | #document_container { 22 | padding-left: 10px; 23 | } 24 | 25 | #transform_container { 26 | display: none; 27 | } 28 | 29 | .polygons { 30 | fill: #00ff00; 31 | stroke: #000; 32 | } 33 | 34 | .hidden { 35 | display: none; 36 | } 37 | 38 | svg { 39 | padding-left: 60px; 40 | display: 
none; 41 | } 42 | 43 | .chosen_topic { 44 | font-family: 'Anton', sans-serif; 45 | font-size: 16px; 46 | font-style: normal; 47 | font-variant: normal; 48 | font-weight: normal; 49 | line-height: 15.4px; 50 | } 51 | 52 | .polygons #selected-polygon { 53 | fill: #f00; 54 | } 55 | 56 | .titles { 57 | font-family: 'Anton', sans-serif; 58 | font-size: 14px; 59 | font-style: normal; 60 | font-variant: normal; 61 | font-weight: normal; 62 | line-height: 15.4px; 63 | } 64 | 65 | .chosen_topics_label { 66 | font-size: 250%; 67 | } 68 | 69 | .document_title { 70 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 71 | font-size: 24px; 72 | font-style: normal; 73 | font-variant: normal; 74 | font-weight: bold; 75 | line-height: 26.4px; 76 | } 77 | 78 | .document_authors { 79 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 80 | font-size: 18px; 81 | font-style: normal; 82 | font-variant: normal; 83 | font-weight: bold; 84 | line-height: 22px; 85 | } 86 | 87 | .document_text { 88 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 89 | font-size: 14px; 90 | font-style: normal; 91 | font-variant: normal; 92 | font-weight: 400; 93 | line-height: 20px; 94 | } 95 | 96 | .recommendation_title { 97 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 98 | font-size: 16px; 99 | font-style: normal; 100 | font-variant: normal; 101 | font-weight: bold; 102 | line-height: 26.4px; 103 | } 104 | 105 | .recommendation_text { 106 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 107 | font-size: 12px; 108 | font-style: normal; 109 | font-variant: normal; 110 | font-weight: 400; 111 | line-height: 20px; 112 | } 113 | 114 | h1 { 115 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 116 | font-size: 24px; 117 | font-style: normal; 118 | font-variant: normal; 119 | font-weight: 500; 120 | line-height: 26.4px; 121 | } 122 | 123 | p { 124 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 125 | font-size: 14px; 126 | font-style: normal; 127 | font-variant: normal; 128 | font-weight: 400; 129 | line-height: 20px; 130 | } 131 | 132 | /* Upload button spinning animation */ 133 | .glyphicon-refresh-animate { 134 | -animation: spin .7s infinite linear; 135 | -webkit-animation: spin2 .7s infinite linear; 136 | } 137 | 138 | @-webkit-keyframes spin2 { 139 | from { -webkit-transform: rotate(0deg);} 140 | to { -webkit-transform: rotate(360deg);} 141 | } 142 | 143 | @keyframes spin { 144 | from { transform: scale(1) rotate(0deg);} 145 | to { transform: scale(1) rotate(360deg);} 146 | } 147 | 148 | #search_text { 149 | width: 400px; 150 | } 151 | 152 | .collection_image { 153 | vertical-align: text-top; 154 | margin-left: 0.25em; 155 | } 156 | -------------------------------------------------------------------------------- /experiments/Exporting Postnauka from MySQL dump.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "import collections\n", 13 | "import pymysql\n", 14 | "import pymysql.cursors" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 13, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "CPU times: user 96 ms, sys: 0 ns, total: 96 ms\n", 29 | "Wall time: 1.3 s\n" 30 | ] 31 | } 32 | ], 33 | 
"source": [ 34 | "%%time\n", 35 | "\n", 36 | "# Дополним посты на ПостНауке именами авторов.\n", 37 | "\n", 38 | "conn = pymysql.connect(host=\"localhost\",\n", 39 | " user=\"root\",\n", 40 | " password=\"\",\n", 41 | " db=\"postnauka\",\n", 42 | " charset=\"utf8\",\n", 43 | " cursorclass=pymysql.cursors.DictCursor)\n", 44 | "\n", 45 | "authors_names = collections.defaultdict(list)\n", 46 | "\n", 47 | "try:\n", 48 | " with conn.cursor() as cur:\n", 49 | " q = \"\"\"\n", 50 | " -- Получить имена авторов\n", 51 | " select tr.object_id as post_id, t.term_id as author_id, tt.description as author_name\n", 52 | " from pn_term_taxonomy tt\n", 53 | " join pn_terms t on (t.term_id = tt.term_id)\n", 54 | " join pn_term_relationships tr on (tr.term_taxonomy_id = tt.term_taxonomy_id)\n", 55 | " join pn_posts p on (p.id = tr.object_id)\n", 56 | " where p.post_type = 'post' and p.post_status = 'publish' and tt.taxonomy = 'author'\n", 57 | " order by tr.object_id, t.term_id\n", 58 | " \"\"\"\n", 59 | " cur.execute(q)\n", 60 | " for row in cur:\n", 61 | " doc_id = row[\"post_id\"]\n", 62 | " author_id = row[\"author_id\"]\n", 63 | " author_str = row[\"author_name\"].split()[:-3]\n", 64 | " author_name = \" \".join(author_str[:len(author_str) // 2])\n", 65 | " authors_names[doc_id].append((author_id, author_name))\n", 66 | "finally:\n", 67 | " conn.close()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 21, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "CPU times: user 265 ms, sys: 128 ms, total: 393 ms\n", 82 | "Wall time: 836 ms\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "%%time\n", 88 | "\n", 89 | "# Дополним существующий датасет именами авторов из MySQL дампа.\n", 90 | "\n", 91 | "for doc_id, authors in authors_names.items():\n", 92 | " with open(\"../datasets/postnauka/raw_data/meta/%s_meta.txt\" % doc_id, \"a\") as meta_file:\n", 93 | " for author_id, author_name in authors:\n", 94 | " meta_file.write(\"author_name\\t%d\\t%s\\n\" % (author_id, author_name))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "---" 102 | ] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.5.2" 122 | }, 123 | "latex_envs": { 124 | "bibliofile": "biblio.bib", 125 | "cite_by": "apalike", 126 | "current_citInitial": 1, 127 | "eqLabelWithNumbers": true, 128 | "eqNumInitial": 0 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 0 133 | } 134 | -------------------------------------------------------------------------------- /server/server.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | 3 | const express = require("express"); 4 | const multer = require("multer"); 5 | const zmq = require("zmq"); 6 | const uuidV4 = require("uuid/v4"); 7 | const bodyParser = require("body-parser"); 8 | 9 | // Initialize common data structures 10 | 11 | // TODO: think about using TTL (time-to-live) later 12 | var routingQueue = {}; 13 | 14 | // Initialize zmq 15 | var sock = zmq.socket("dealer"); 16 | 
17 | function sendToSock (res, msg) { 18 | if (typeof msg !== "object" || msg === null) { 19 | return false; 20 | } 21 | var uuid = uuidV4(); 22 | routingQueue[uuid] = res; 23 | msg["id"] = uuid; 24 | return sock.send(JSON.stringify(msg)) === 0; 25 | } 26 | 27 | sock.connect("tcp://localhost:2411"); 28 | 29 | var artmTopics = null; 30 | sock.on("message", function (reply) { 31 | reply = JSON.parse(reply); 32 | if (reply.act == "get_topics") { 33 | artmTopics = reply.data; 34 | } else if (reply.act == "recommend_docs" || reply.act == "get_documents" || 35 | reply.act == "get_document" || reply.act == "transform_doc" || 36 | reply.act == "get_next_assessment" || reply.act == "assess_document" || 37 | reply.act == "perform_search" 38 | ) { 39 | var res = routingQueue[reply.id]; 40 | delete routingQueue[reply.id]; 41 | res.send(reply.data); 42 | } 43 | }); 44 | 45 | sock.send(JSON.stringify({"act": "get_topics"})); 46 | 47 | // Initialize express 48 | const app = express(); 49 | app.use(express.static("static")); 50 | app.use(bodyParser.urlencoded({ extended: true })); 51 | 52 | // TODO: temporary upload path! change later in production 53 | var UPLOAD_PATH = path.join(__dirname, "uploads/") 54 | var upload = multer({dest: UPLOAD_PATH}) 55 | 56 | app.get("/get-topics", function (req, res) { 57 | if (artmTopics) { 58 | res.send(artmTopics); 59 | } else { 60 | res.send({"error": "topics data not ready yet"}); 61 | } 62 | }); 63 | 64 | app.get("/get-documents", function (req, res) { 65 | var topicId = req.query.topic_id; 66 | var offset = parseInt(req.query.offset); 67 | var limit = parseInt(req.query.limit); 68 | sendToSock(res, { "act": "get_documents", "topic_id": topicId, 69 | "offset": offset, "limit": limit }); 70 | }); 71 | 72 | app.get("/perform-search", function (req, res) { 73 | var query = req.query.query; 74 | var limit = parseInt(req.query.limit); 75 | sendToSock(res, { "act": "perform_search", "query": query, "limit": limit }); 76 | }); 77 | 78 | app.get("/get-document", function (req, res) { 79 | var docId = req.query.doc_id; 80 | var recommTags = req.query.recommend_tags; 81 | sendToSock(res, { "act": "get_document", "doc_id": docId, "recommend_tags": !!recommTags }); 82 | }); 83 | 84 | app.get("/recommend-docs", function (req, res) { 85 | var docId = req.query.doc_id; 86 | sendToSock(res, { "act": "recommend_docs", "doc_id": docId }); 87 | }); 88 | 89 | app.post("/transform-doc", upload.single("doc"), function (req, res, next) { 90 | var fileObj = req.file; 91 | 92 | if (fileObj.mimetype != "text/plain") { 93 | res.send({"error": "unknown filetype '" + fileObj.mimetype + "'"}); 94 | return; 95 | } 96 | 97 | // Make request to ARTM_bridge 98 | sendToSock(res, { "act": "transform_doc", "doc_path": fileObj.path, 99 | "filename": fileObj.originalname }); 100 | }); 101 | 102 | app.get("/get-next-assessment", function (req, res) { 103 | var assessorId = parseInt(req.query.assessor_id); 104 | var assessorsCnt = parseInt(req.query.assessors_cnt); 105 | var collectionName = req.query.collection_name; 106 | sendToSock(res, {"act": "get_next_assessment", 107 | "collection_name": collectionName, 108 | "assessor_id": assessorId, 109 | "assessors_cnt": assessorsCnt}); 110 | }); 111 | 112 | app.post("/assess-document", function (req, res) { 113 | var docId = req.body.doc_id; 114 | var isRelevant = req.body.is_relevant === "true"; 115 | sendToSock(res, {"act": "assess_document", 116 | "doc_id": docId, 117 | "is_relevant": isRelevant}); 118 | }); 119 | 120 | var server = app.listen(3000, function 
() { 121 | var host = server.address().address; 122 | var port = server.address().port; 123 | 124 | console.log("Example app listening at http://%s:%s", host, port); 125 | }); 126 | -------------------------------------------------------------------------------- /experiments/Import of hierarchical spectrums.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 14, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 15, 18 | "metadata": { 19 | "collapsed": false, 20 | "deletable": true, 21 | "editable": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import pickle\n", 26 | "import hierarchy_utils" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 44, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "MODEL_PATH = \"hartm\"\n", 38 | "artm_extra_info = pickle.load(open(MODEL_PATH + \"/extra_info.dump\", \"rb\"))" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 38, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "artm_extra_info[\"spectrums\"] = [dump]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 43, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "pickle.dump(artm_extra_info, open(MODEL_PATH + \"/extra_info.dump\", \"wb\"))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 12, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stderr", 72 | "output_type": "stream", 73 | "text": [ 74 | "/nix/store/r7qpc32yr09l9a0d5y3b8i84kw5phx4p-python3-3.5.3/lib/python3.5/json/encoder.py:198: DeprecationWarning: Interpreting naive datetime as local 2017-07-21 13:38:03.234389. 
Please add timezone info to timestamps.\n", 75 | " chunks = self.iterencode(o, _one_shot=True)\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "artm_model = hierarchy_utils.hARTM(theta_columns_naming=\"title\",\n", 81 | " cache_theta=True,\n", 82 | " class_ids=artm_extra_info[\"class_ids\"])\n", 83 | "artm_model.load(MODEL_PATH)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 16, 89 | "metadata": { 90 | "collapsed": false, 91 | "deletable": true, 92 | "editable": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "dump = pickle.load(open(\"flat_spectrum.dump\", \"rb\"))" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 18, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "phi0 = artm_model._levels[0].get_phi()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 29, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "topics_names = {}\n", 119 | "for t in phi0.columns:\n", 120 | " topics_names[\"level_0_\" + t] = list(phi0[t].sort_values(ascending=False)[:3].index)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 33, 126 | "metadata": { 127 | "collapsed": false, 128 | "deletable": true, 129 | "editable": true 130 | }, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "level_0_topic_13 \t химия, нанотехнологии, материаловедение\n", 137 | "level_0_topic_5 \t астрономия, астрофизика, вселенная\n", 138 | "level_0_topic_11 \t физика, физика_элементарных_частиц, квантовая_физика\n", 139 | "level_0_topic_15 \t математика, информационные_технологии, технологии\n", 140 | "level_0_topic_10 \t палеонтология, биохимия, стволовые_клетки\n", 141 | "level_0_topic_2 \t информационная_безопасность, копирайт, будущее\n", 142 | "level_0_topic_16 \t средневековье, мифология, биоинформатика\n", 143 | "level_0_topic_4 \t история, история_россии, ссср\n", 144 | "level_0_topic_8 \t общество, экономика, россия\n", 145 | "level_0_topic_0 \t философия, россия, география\n", 146 | "level_0_topic_3 \t культура, литература, культурология\n", 147 | "level_0_topic_6 \t лингвистика, язык, право\n", 148 | "level_0_topic_7 \t социология, социология_повседневности, дюркгейм_эмиль\n", 149 | "level_0_topic_12 \t наука, управление_проектами, работа\n", 150 | "level_0_topic_14 \t образование, университет, школа\n", 151 | "level_0_topic_9 \t психология, люди_науки, история_науки\n", 152 | "level_0_topic_18 \t мозг, нейробиология, искусственный_интеллект\n", 153 | "level_0_topic_17 \t экология, зоология, этология\n", 154 | "level_0_topic_1 \t медицина, эволюция, антропология\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "for topic_id in dump:\n", 160 | " print(topic_id, \"\\t\", \", \".join(topics_names[topic_id]))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "deletable": true, 167 | "editable": true 168 | }, 169 | "source": [ 170 | "---" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 3", 177 | "language": "python", 178 | "name": "python3" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 3 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython3", 190 | "version": "3.5.3" 191 | }, 192 | "latex_envs": { 193 | "bibliofile": 
"biblio.bib", 194 | "cite_by": "apalike", 195 | "current_citInitial": 1, 196 | "eqLabelWithNumbers": true, 197 | "eqNumInitial": 0 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 2 202 | } 203 | -------------------------------------------------------------------------------- /experiments/Most popular-sciency Habr authors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 24, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pymongo\n", 12 | "import collections\n", 13 | "import numpy as np\n", 14 | "import pandas as pd" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "db = pymongo.MongoClient()" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 10, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "def get_author(doc_id):\n", 37 | " try:\n", 38 | " return list(db[\"datasets\"][\"habrahabr\"].find({\"_id\": doc_id}, {\"authors_names\": 1}))[0][\"authors_names\"][0]\n", 39 | " except Exception:\n", 40 | " return None" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 15, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "clf_output = pd.read_csv(\"classifier_output.csv\")\n", 52 | "clf_output.columns = [\"id\", \"proba\"]\n", 53 | "clf_output = clf_output.set_index(\"id\")[\"proba\"]" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 21, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "%%time\n", 65 | "\n", 66 | "authors_relevance = collections.defaultdict(list)\n", 67 | "\n", 68 | "for doc_id, p in clf_output.items():\n", 69 | " author_name = get_author(doc_id)\n", 70 | " authors_relevance[author_name].append(p)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 112, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "P = len(clf_output[clf_output > 0.5]) / len(clf_output)\n", 82 | "N = len(clf_output[clf_output <= 0.5]) / len(clf_output)\n", 83 | "\n", 84 | "def relevance(n, p=0):\n", 85 | " return np.sqrt(p / P) - np.sqrt(n / N)\n", 86 | "\n", 87 | "def cont_relevance(ps):\n", 88 | " return np.log10(len(ps)) * np.median(2 * ps - 1)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 113, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "authors_series = pd.Series({k: relevance(*np.bincount(np.array(v) > 0.5)) if v else 0\n", 100 | " for k, v in authors_relevance.items()})" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 97, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "authors_series = pd.Series({k: cont_relevance(np.array(v)) if v else 0\n", 112 | " for k, v in authors_relevance.items()})" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 126, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | " 22.938461 -- https://habrahabr.ru/users/SLY_G\n", 127 | " 18.314086 -- https://habrahabr.ru/users/lozga\n", 128 | " 14.878677 -- https://habrahabr.ru/users/Zelenyikot\n", 129 | " 
12.548149 -- https://habrahabr.ru/users/krasandm\n", 130 | " 12.498694 -- https://habrahabr.ru/users/Synth\n", 131 | " 11.519339 -- https://habrahabr.ru/users/protogui\n", 132 | " 11.377467 -- https://habrahabr.ru/users/AlexeyR\n", 133 | " 11.285316 -- https://habrahabr.ru/users/LukinB\n", 134 | " 10.340973 -- https://habrahabr.ru/users/PatientZero\n", 135 | " 9.634685 -- https://habrahabr.ru/users/Boomburum\n", 136 | " ...\n", 137 | " -7.757439 -- https://habrahabr.ru/users/BBSoD\n", 138 | " -7.815363 -- https://habrahabr.ru/users/Tylerskald\n", 139 | " -8.655005 -- https://habrahabr.ru/users/azproduction\n", 140 | " -8.725304 -- https://habrahabr.ru/users/XaocCPS\n", 141 | " -9.768331 -- https://habrahabr.ru/users/azazelis\n", 142 | "-11.758759 -- https://habrahabr.ru/users/aleksandrit\n", 143 | "-11.796221 -- https://habrahabr.ru/users/RoboForm\n", 144 | "-13.036321 -- https://habrahabr.ru/users/marks\n", 145 | "-15.156126 -- https://habrahabr.ru/users/alizar\n", 146 | "-16.058914 -- https://habrahabr.ru/users/jeston\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "with open(\"popular-sciency-rating.txt\", \"w\") as f:\n", 152 | " for i, (author_name, r) in enumerate(authors_series.sort_values(ascending=False).items()):\n", 153 | " s = \"% 10.6f -- https://habrahabr.ru/users/%s\\n\" % (r, author_name)\n", 154 | " if i < 10 or i >= len(authors_series) - 10:\n", 155 | " print(s, end=\"\")\n", 156 | " elif i == 10:\n", 157 | " print(\" ...\")\n", 158 | " f.write(s)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "---" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.5.3" 186 | }, 187 | "latex_envs": { 188 | "bibliofile": "biblio.bib", 189 | "cite_by": "apalike", 190 | "current_citInitial": 1, 191 | "eqLabelWithNumbers": true, 192 | "eqNumInitial": 0 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 2 197 | } 198 | -------------------------------------------------------------------------------- /parsers/habrahabr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Парсер Постнауки" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%reload_ext autoreload\n", 19 | "%autoreload 2" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import bson" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from text_utils import BaseSource, BaseProcessor, BaseSink\n", 42 | "from text_utils import DefaultTextProcessor, DefaultDocumentProcessor, DefaultCollectionProcessor\n", 43 | "from text_utils import UciBowSink, MongoDbSink" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": 
[], 53 | "source": [ 54 | "from sklearn.pipeline import Pipeline\n", 55 | "from ipywidgets import FloatProgress\n", 56 | "from IPython.display import display\n", 57 | "from pathlib import Path" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Определим пайплайн всей коллекции Хабрахабра из BSON-дампа (`HabrahabrCollectionSource`, `HabrahabrCollectionProcessor`)." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "class HabrahabrCollectionSource(BaseSource):\n", 76 | " def fit(self, root_path, *args):\n", 77 | " stop_words = (root_path / \"stopwords.txt\").open().read().split()\n", 78 | " self.root_path = root_path\n", 79 | " self.bson_file = (root_path / \"habrahabr.bson\").open(\"rb\")\n", 80 | " self.doc_pipeline = Pipeline([\n", 81 | " (\"text-processor\", DefaultTextProcessor()),\n", 82 | " (\"document-processor\", DefaultDocumentProcessor(stop_lemmas=stop_words)),\n", 83 | " ])\n", 84 | " # Save source state\n", 85 | " self.vocab_file = (root_path / \"vocab.pn.txt\").open(\"w\")\n", 86 | " self.docword_file = (root_path / \"docword.pn.txt\").open(\"w\")\n", 87 | " return self" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 10, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "class HabrahabrCollectionProcessor(BaseProcessor):\n", 99 | " def transform(self, src, *args):\n", 100 | " docs = []\n", 101 | " for doc_id, bson_doc in enumerate(bson.decode_file_iter(src.bson_file)):\n", 102 | " if bson_doc[\"company_blog\"] is None:\n", 103 | " doc = {}\n", 104 | " doc[\"title\"] = bson_doc[\"title\"]\n", 105 | " doc[\"url\"] = bson_doc[\"url\"]\n", 106 | " doc[\"modalities\"] = src.doc_pipeline.fit_transform(bson_doc[\"content_html\"])\n", 107 | " doc[\"modalities\"][\"flat_tag\"] = bson_doc[\"tags\"]\n", 108 | " doc[\"modalities\"][\"authors\"] = [bson_doc[\"author_user\"]]\n", 109 | " doc[\"modalities\"][\"hubs\"] = bson_doc[\"hubs\"]\n", 110 | " doc[\"markdown\"] = bson_doc[\"content_html\"]\n", 111 | " doc[\"doc_id\"] = doc_id + 1\n", 112 | " docs.append(doc)\n", 113 | " docs = DefaultCollectionProcessor(min_len=1, min_df=2).fit_transform(docs)\n", 114 | " # Save Markdown texts in MongoDB\n", 115 | " MongoDbSink(\"habrahabr\", id_func=lambda doc: \"habr_%d\" % doc[\"doc_id\"]).fit_transform(docs)\n", 116 | " # Save collection UCI BOW format\n", 117 | " UciBowSink(src.vocab_file, src.docword_file).fit_transform(docs)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Построим парсер Хабрахабра из пайплайна, определенного выше." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 11, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "habrahabr_parser = Pipeline([\n", 136 | " (\"take-root-path\", HabrahabrCollectionSource()),\n", 137 | " (\"process-the-collection\", HabrahabrCollectionProcessor()),\n", 138 | "])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "Запустим парсер." 
146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 12, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "root_path = Path(\"../datasets/habrahabr\")" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "%%time\n", 168 | "\n", 169 | "habrahabr_parser.fit_transform(root_path)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "---" 177 | ] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.5.2" 197 | }, 198 | "latex_envs": { 199 | "bibliofile": "biblio.bib", 200 | "cite_by": "apalike", 201 | "current_citInitial": 1, 202 | "eqLabelWithNumbers": true, 203 | "eqNumInitial": 0 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 0 208 | } 209 | -------------------------------------------------------------------------------- /experiments/UCI Merger.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 55, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import csv\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "from pathlib import Path" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "В этом блокноте будут мержиться несколько UCI-датасетов в один объединённый." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 56, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "uci_collections = [(Path(\"../datasets/postnauka\"), \"pn\"),\n", 33 | " (Path(\"../datasets/ruwiki\"), \"ruwiki\")]\n", 34 | "\n", 35 | "g_path, g_collection_name = Path(\"../datasets\"), \"merged\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Объединим словари в один и сохраним его." 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 63, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "CPU times: user 2.81 s, sys: 47 ms, total: 2.86 s\n", 57 | "Wall time: 2.86 s\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "%%time\n", 63 | "\n", 64 | "global_dictionary = {}\n", 65 | "local_mappings = []\n", 66 | "\n", 67 | "for path, collection_name in uci_collections:\n", 68 | " local_mapping = []\n", 69 | " with (path / (\"vocab.%s.txt\" % collection_name)).open() as infile:\n", 70 | " for ix, line in enumerate(infile):\n", 71 | " token, modality = line.split()\n", 72 | " value = global_dictionary.get((token, modality))\n", 73 | " if value is None:\n", 74 | " value = global_dictionary[(token, modality)] = len(global_dictionary)\n", 75 | " local_mapping.append(value)\n", 76 | " local_mappings.append(local_mapping)\n", 77 | "\n", 78 | "global_mapping = sorted(map(lambda p: (p[1], p[0]), global_dictionary.items()))\n", 79 | "\n", 80 | "# Добавим метки коллекций в словарь\n", 81 | "collection_ids = {}\n", 82 | "for _, collection_name in uci_collections:\n", 83 | " collection_ids[collection_name] = len(global_mapping)\n", 84 | " global_mapping.append((len(global_mapping), (collection_name, \"collection_id\")))\n", 85 | "\n", 86 | "with (g_path / (\"vocab.%s.txt\" % g_collection_name)).open(\"w\") as outfile:\n", 87 | " for _, value in global_mapping:\n", 88 | " outfile.write(\"%s %s\\n\" % value)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "Теперь перестроим документы по объединённому словарю." 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Сначала посчитаем суммарную длину и кол-во документов будущего `docword` файла." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 66, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "g_doc_count, g_bow_length = 0, 0\n", 114 | "\n", 115 | "for path, collection_name in uci_collections:\n", 116 | " with (path / (\"docword.%s.txt\" % collection_name)).open() as infile:\n", 117 | " dict_length = infile.readline()\n", 118 | " doc_count = int(infile.readline())\n", 119 | " bow_length = int(infile.readline())\n", 120 | " g_doc_count += doc_count\n", 121 | " g_bow_length += bow_length" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Теперь построим `docword` файл объединённой коллекции." 
129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 70, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "CPU times: user 13min 2s, sys: 5.42 s, total: 13min 7s\n", 143 | "Wall time: 13min 10s\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "%%time\n", 149 | "\n", 150 | "with (g_path / (\"docword.%s.txt\" % g_collection_name)).open(\"w\") as outfile:\n", 151 | " g_doc_id = 0\n", 152 | " outfile.write(\"%d\\n%d\\n%d\\n\" % (len(global_dictionary), g_doc_count, g_bow_length))\n", 153 | " for mapping, (path, collection_name) in zip(local_mappings, uci_collections):\n", 154 | " with (path / (\"docword.%s.txt\" % collection_name)).open() as infile:\n", 155 | " dict_length, doc_count, bow_length = int(infile.readline()), int(infile.readline()), int(infile.readline())\n", 156 | " seen_docs = set()\n", 157 | " for line in infile:\n", 158 | " doc_id, word_id, word_count = map(int, line.split())\n", 159 | " if doc_id not in seen_docs:\n", 160 | " # Добавим метки коллекций в документы\n", 161 | " collection_id = collection_ids[collection_name]\n", 162 | " outfile.write(\"%d %d %d\\n\" % (g_doc_id + doc_id, collection_id + 1, 1))\n", 163 | " seen_docs.add(doc_id)\n", 164 | " g_word_id = mapping[word_id - 1] + 1\n", 165 | " outfile.write(\"%d %d %d\\n\" % (g_doc_id + doc_id, g_word_id, word_count))\n", 166 | " g_doc_id += doc_count" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "---" 174 | ] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "Python 3", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.5.2" 194 | }, 195 | "latex_envs": { 196 | "bibliofile": "biblio.bib", 197 | "cite_by": "apalike", 198 | "current_citInitial": 1, 199 | "eqLabelWithNumbers": true, 200 | "eqNumInitial": 0 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 0 205 | } 206 | -------------------------------------------------------------------------------- /server/artm_bridge.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import random 4 | import pickle 5 | import json 6 | import zmq 7 | import regex 8 | import tempfile 9 | import traceback 10 | import glob 11 | import os 12 | 13 | from parsers import arbitrary, text_utils 14 | from datetime import datetime 15 | 16 | import artm 17 | import artm_lib 18 | 19 | 20 | MODEL_PATH = "hartm" 21 | 22 | ZMQ_BACKEND_PORT = 2511 23 | 24 | EMPTY, UP, DOWN = b"", b"UP", b"DOWN" 25 | 26 | 27 | class BridgeParamError(ValueError): 28 | def __init__(self, message): 29 | self.message = message 30 | 31 | 32 | def rm_flat_dir(dir_path): 33 | for file_path in glob.glob(os.path.join(dir_path, "*")): 34 | os.remove(file_path) 35 | os.rmdir(dir_path) 36 | 37 | def process_msg(message): 38 | if message["act"] == "get_topics": 39 | response = artm_bridge.model.topics 40 | elif message["act"] == "get_documents": 41 | topic_id = message["topic_id"] 42 | offset = message["offset"] 43 | limit = message["limit"] 44 | if type(topic_id) is not str: 45 | raise BridgeParamError("incorrect param type: 
`topic_id`") 46 | if type(offset) is not int or type(limit) is not int: 47 | raise BridgeParamError("`limit` and `offset` fields must be integer") 48 | docs, weights = artm_bridge.get_documents_by_topic(topic_id, offset=offset, limit=limit) 49 | response = {"docs": docs, "weights": weights} 50 | elif message["act"] == "get_document": 51 | doc_id = message["doc_id"] 52 | if type(doc_id) is not str: 53 | raise BridgeParamError("incorrect param type: `doc_id`") 54 | docs = artm_bridge.data_source.get_documents_by_ids([doc_id], with_modalities=True) 55 | if len(docs) == 0: 56 | raise BridgeParamError("document with `doc_id` = '%s' is not found" % doc_id) 57 | doc = docs[0] 58 | if message["recommend_tags"]: 59 | doc["recommended_tags"] = artm_bridge.recommend_tags_by_doc(doc) 60 | response = doc 61 | elif message["act"] == "perform_search": 62 | query = message["query"] 63 | limit = message["limit"] 64 | if type(query) is not str: 65 | raise BridgeParamError("incorrect param type: `query`") 66 | if type(limit) is not int: 67 | raise BridgeParamError("incorrect param type: `limit`") 68 | response = dict(zip(["docs", "theta"], artm_bridge.search_documents(query, limit=limit))) 69 | elif message["act"] == "recommend_docs": 70 | doc_id = message["doc_id"] 71 | if type(doc_id) is not str: 72 | raise BridgeParamError("incorrect param type: `doc_id`") 73 | sim_docs_ids = artm_bridge.recommend_docs_by_doc(doc_id) 74 | response = artm_bridge.data_source.get_documents_by_ids(sim_docs_ids, with_texts=False) 75 | elif message["act"] == "transform_doc": 76 | doc_path = message["doc_path"] 77 | filename = message["filename"] 78 | try: 79 | # Initialize file resources 80 | doc_file = open(doc_path) 81 | vw_fd,vw_path = tempfile.mkstemp(prefix="upload", text=True) 82 | vw_file = os.fdopen(vw_fd, "w") 83 | batch_path = tempfile.mkdtemp(prefix="batch") 84 | # Parse uploaded file 85 | doc = pipeline.fit_transform(doc_file) 86 | # Save to Vowpal Wabbit file 87 | text_utils.VowpalWabbitSink(vw_file, lambda x: "upload") \ 88 | .fit_transform([doc]) 89 | # Transform uploaded document and return its Theta matrix 90 | response = {} 91 | response["filename"] = filename 92 | response["theta"] = artm_bridge.model.transform_one(vw_path, batch_path) 93 | except: 94 | raise 95 | finally: 96 | # Delete uploaded file 97 | doc_file.close() 98 | os.remove(doc_path) 99 | # Delete temporary files/dirs 100 | os.remove(vw_path) 101 | rm_flat_dir(batch_path) 102 | elif False and message["act"] == "get_next_assessment": 103 | ass_id = message["assessor_id"] 104 | ass_cnt = message["assessors_cnt"] 105 | col_name = message["collection_name"] 106 | 107 | if ass_id >= ass_cnt: 108 | response = "Incorrent `assessor_id`" 109 | else: 110 | docs_count = db["datasets"][col_name].count() 111 | min_id = int(ass_id * docs_count / ass_cnt) 112 | max_id = int((ass_id + 1) * docs_count / ass_cnt) 113 | # May take a long time for large datasets 114 | docs_ids = db["datasets"][col_name].find({}, {"_id": 1}) 115 | docs_ids = list(map(lambda x: x["_id"], 116 | docs_ids.sort([("_id", 1)]))) 117 | ass_docs_ids = docs_ids[min_id:max_id] 118 | # Get unused documents' ids 119 | used_docs_ids = db["assessment"][col_name].find({}, {"_id": 1}) 120 | used_docs_ids = list(map(lambda x: x["_id"], used_docs_ids)) 121 | unused_docs_ids = list(set(ass_docs_ids) - set(used_docs_ids)) 122 | # Form response 123 | random.shuffle(unused_docs_ids) 124 | # Use batches of 100 docs per request 125 | response = unused_docs_ids[:100] 126 | elif False and message["act"] == 
"assess_document": 127 | doc_id = message["doc_id"] 128 | is_relevant = message["is_relevant"] 129 | col_names = [v for k, v in prefix_to_col_map.items() 130 | if doc_id.startswith(k + "_")] 131 | if len(col_names) != 1: 132 | response = False 133 | else: 134 | col_name = col_names[0] 135 | dataset = db["assessment"][col_name] 136 | doc = { 137 | "is_relevant": is_relevant, 138 | "assess_time": datetime.now() 139 | } 140 | dataset.replace_one({"_id": doc_id}, doc, upsert=True) 141 | response = True 142 | else: 143 | raise BridgeParamError("unknown query") 144 | 145 | return response 146 | 147 | try: 148 | # Initialize arbitrary pipeline 149 | pipeline = arbitrary.get_pipeline() 150 | 151 | # Initialize BigARTM logging 152 | artm_log_path = tempfile.mkdtemp(prefix="artmlog") 153 | lc = artm.messages.ConfigureLoggingArgs() 154 | lc.log_dir = artm_log_path 155 | lc.minloglevel = 2 156 | artm.wrapper.LibArtm(logging_config=lc) 157 | 158 | # Initialize ZeroMQ 159 | context = zmq.Context() 160 | socket = context.socket(zmq.DEALER) 161 | # TODO: maybe set socket identity for persistence? 162 | socket.connect("tcp://localhost:%d" % ZMQ_BACKEND_PORT) 163 | 164 | # Initialize ARTM bridge 165 | artm_bridge = artm_lib.ArtmBridge(MODEL_PATH) 166 | 167 | # Notify ARTM_proxy that we're up 168 | socket.send(UP) 169 | 170 | print("ARTM_bridge: start serving ZeroMQ queries on port", 171 | ZMQ_BACKEND_PORT) 172 | 173 | while True: 174 | # Wait for next request from client 175 | client, request = socket.recv_multipart() 176 | message = json.loads(request.decode("utf-8")) 177 | 178 | # Debug logging 179 | # print("> " + json.dumps(message)) 180 | 181 | # Process message 182 | response = {} 183 | try: 184 | response["ok"] = process_msg(message) 185 | except BridgeParamError as e: 186 | response["error"] = {"message": e.message} 187 | except BaseException as e: 188 | response["error"] = {"message": "server error"} 189 | traceback.print_exc() 190 | 191 | socket.send_multipart([ 192 | client, 193 | json.dumps({ 194 | "act": message["act"], 195 | "id": message.get("id"), 196 | "data": response 197 | }).encode("utf-8") 198 | ]) 199 | except: 200 | traceback.print_exc() 201 | print("Shutting down ARTM_bridge...") 202 | finally: 203 | # Unregister 204 | socket.send(DOWN) 205 | # Clean up 206 | rm_flat_dir(artm_log_path) 207 | socket.close() 208 | context.term() 209 | -------------------------------------------------------------------------------- /experiments/Sending requests to server from ARTM_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import json\n", 12 | "import urllib.request" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "def get_document(doc_id):\n", 24 | " request_url = \"http://localhost:3000/get-document?doc_id=%s\" % doc_id\n", 25 | " response = urllib.request.urlopen(request_url).read().decode(\"utf-8\")\n", 26 | " return json.loads(response) if response else None\n", 27 | "\n", 28 | "def get_recommendations(doc_id):\n", 29 | " # TODO: исправить с приходом гетерогенности\n", 30 | " doc_id = doc_id.split(\"_\")[1]\n", 31 | " request_url = \"http://localhost:3000/get-recommendations?doc_id=%s\" % doc_id\n", 32 | " response = 
urllib.request.urlopen(request_url).read().decode(\"utf-8\")\n", 33 | " return json.loads(response) if response else None " 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", 48 | "Wall time: 7.63 µs\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "%time\n", 54 | "doc = get_document(\"pn_1490\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "Оценка талантливости\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "print(doc[\"title\"])" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 5, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Какие источники информации использует человек при оценке способностей других людей? Почему мы так часто неверно оцениваем возможности других людей? Профессор социальной этики Гарвардского университета Махзарин Банаджи рассказывает о том, почему подбрасывание монеты может быть хорошим предсказанием успешности брака, в рамках проекта Serious Science, созданного командой ПостНауки.\n", 88 | "Тренированный человек, безусловно, может отличить хорошо исполненную музыку от сыгранной любителем. Такие люди обычно могут рассказать, в чем разница. Если дать им послушать записи студента старшей школы и, например, Сеговии, то они скажут, что Сеговия играет лучше. Но нас больше интересует ситуация, когда оба претендента кажутся одинаково хорошими. В таком случае может ли наш разум иметь предубеждения относительно них, из-за которых нам будет казаться, что они играют по-разному? Если правы музыканты и они всегда объективно оценивают, обращают внимание только на звук, то разницы быть не должно.\n", 89 | "\n", 90 | "\n", 91 | "Мы считаем, что это стало проблемой: мы теряем талантливых людей, потому что во многих областях уделяем чересчур много внимания тем, кто считается одаренным с детства. Мы не пытаемся давать указания другим, а только отмечаем интересную особенность, что даже среди экспертов существуют интересные различия между словами и поступками. Из этого вытекает следующий вопрос: можно ли это обобщить на другие области, помимо музыки? Оказывается, что да.\n", 92 | "Надеемся, что наука о принятии решений при найме на работу разовьется настолько, что мы сможем точно сказать, по каким критериям нужно ориентироваться во время интервью в случае каждой конкретной профессии. Я думаю, что вскоре интервью будут казаться старомодными. Я думаю, что принятие решения о взятии человека на работу в качестве коллеги или подчиненного очень похоже на принятие решения о том, кто будет вашем партнером, супругом. 
Я думаю, что в обоих этих случаях мы часто бываем одинаково неправы.\n", 93 | "\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "print(doc[\"markdown\"])" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 6, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "Токены: ['источник', 'информация', 'использовать', 'оценка', 'способность', 'часто', 'неверно', 'оценивать', 'возможность', 'профессор', 'социальный', 'этика', 'гарвардский', 'университет', 'махзарин', 'банаджи', 'рассказывать', 'подбрасывание', 'монета', 'хороший', 'предсказание', 'успешность', 'брак', 'рамка', 'проект', 'science', 'создавать', 'команда', 'постнаука', 'безусловно', 'отличать', 'исполнять', 'музыка', 'сыграть', 'любитель', 'обычно', 'рассказывать', 'разница', 'давать', 'послушать', 'запись', 'студент', 'старший', 'школа', 'сеговия', 'сеговия', 'играть', 'интересовать', 'ситуация', 'претендент', 'казаться', 'одинаково', 'хороший', 'разум', 'предубеждение', 'относительно', 'казаться', 'играть', 'правый', 'музыкант', 'объективно', 'оценивать', 'обращать', 'внимание', 'звук', 'разница', 'должно', 'считать', 'терять', 'талантливый', 'многий', 'область', 'уделять', 'чересчур', 'внимание', 'считаться', 'одаренный', 'детство', 'пытаться', 'давать', 'указание', 'отмечать', 'интересный', 'особенность', 'эксперт', 'интересный', 'различие', 'поступок', 'вытекать', 'следующий', 'обобщать', 'область', 'помимо', 'музыка', 'надеяться', 'принятие', 'решение', 'наем', 'развиваться', 'настолько', 'точно', 'критерий', 'ориентироваться', 'интервью', 'конкретный', 'профессия', 'думать', 'вскоре', 'интервью', 'казаться', 'старомодный', 'думать', 'принятие', 'решение', 'взятие', 'качество', 'коллега', 'подчиненный', 'похоже', 'принятие', 'решение', 'партнер', 'супруг', 'думать', 'часто', 'бывать', 'одинаково', 'неправый']\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "print(\"Токены: %s\" % doc[\"modalities\"][\"text\"])" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 7, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", 132 | "Wall time: 4.53 µs\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "%time\n", 138 | "recommendations = get_recommendations(\"pn_1490\")" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Рекомендации\n", 153 | "1. Посттравматический стресс в отношениях «мать — дочь»\n", 154 | "2. Психология телесности\n", 155 | "3. Становление киберпсихологии\n", 156 | "4. Явление и понятие инсайта\n", 157 | "5. Чем объясняются оптические иллюзии?\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "print(\"Рекомендации\")\n", 163 | "for i, doc in enumerate(recommendations):\n", 164 | " print(\"%d. 
%s\" % (i + 1, doc[\"title\"]))" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "---" 172 | ] 173 | } 174 | ], 175 | "metadata": { 176 | "kernelspec": { 177 | "display_name": "Python 3", 178 | "language": "python", 179 | "name": "python3" 180 | }, 181 | "language_info": { 182 | "codemirror_mode": { 183 | "name": "ipython", 184 | "version": 3 185 | }, 186 | "file_extension": ".py", 187 | "mimetype": "text/x-python", 188 | "name": "python", 189 | "nbconvert_exporter": "python", 190 | "pygments_lexer": "ipython3", 191 | "version": "3.5.2" 192 | }, 193 | "latex_envs": { 194 | "bibliofile": "biblio.bib", 195 | "cite_by": "apalike", 196 | "current_citInitial": 1, 197 | "eqLabelWithNumbers": true, 198 | "eqNumInitial": 0 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 0 203 | } 204 | -------------------------------------------------------------------------------- /experiments/Comparison of different lemmatizers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import re\n", 14 | "from itertools import chain\n", 15 | "from collections import Counter\n", 16 | "from contextlib import closing" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "collapsed": false, 24 | "deletable": true, 25 | "editable": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import mystem\n", 30 | "import pymorphy2\n", 31 | "import multiprocessing" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": { 38 | "collapsed": true, 39 | "deletable": true, 40 | "editable": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "N_PROCS = 4" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "deletable": true, 51 | "editable": true 52 | }, 53 | "source": [ 54 | "Сравнение будем проводить на большом текстовом документе (400 Кб, статья «Россия»), взятом из Википедии, и откопированным 10 раз. 
" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": { 61 | "collapsed": true, 62 | "deletable": true, 63 | "editable": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "text = open(\"../datasets/arbitrary/examples/big_russia.txt\").read()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 5, 73 | "metadata": { 74 | "collapsed": true, 75 | "deletable": true, 76 | "editable": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "split_regexp = re.compile(\"(?u)\\\\b\\\\w+\\\\b\")\n", 81 | "tokens = split_regexp.findall(text)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": { 88 | "collapsed": false, 89 | "deletable": true, 90 | "editable": true 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "322660" 97 | ] 98 | }, 99 | "execution_count": 6, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "len(tokens)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": { 112 | "collapsed": false, 113 | "deletable": true, 114 | "editable": true 115 | }, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "[('в', 12340),\n", 121 | " ('и', 10210),\n", 122 | " ('на', 4300),\n", 123 | " ('России', 3800),\n", 124 | " ('В', 3090),\n", 125 | " ('с', 2270),\n", 126 | " ('по', 1990),\n", 127 | " ('году', 1650),\n", 128 | " ('года', 1590),\n", 129 | " ('из', 1410)]" 130 | ] 131 | }, 132 | "execution_count": 7, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "Counter(tokens).most_common(10)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "deletable": true, 145 | "editable": true 146 | }, 147 | "source": [ 148 | "## python-mystem" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 8, 154 | "metadata": { 155 | "collapsed": true, 156 | "deletable": true, 157 | "editable": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "from mystem import analyze" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 9, 167 | "metadata": { 168 | "collapsed": true 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "def mystem_nf(tok):\n", 173 | " with analyze(tok) as result:\n", 174 | " return str(result[0])" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 10, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "CPU times: user 13.4 s, sys: 15 ms, total: 13.4 s\n", 189 | "Wall time: 13.4 s\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "%%time\n", 195 | "\n", 196 | "lemmas = list(map(mystem_nf, tokens))" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 11, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "322660" 210 | ] 211 | }, 212 | "execution_count": 11, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "len(lemmas)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 12, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "[('в', 15430),\n", 232 | " ('и', 10460),\n", 233 | " ('год', 5080),\n", 234 | " ('россия', 4960),\n", 235 | " ('на', 
4590),\n", 236 | " ('с', 2740),\n", 237 | " ('быть', 2570),\n", 238 | " ('российский', 2440),\n", 239 | " ('по', 2420),\n", 240 | " ('к', 1570)]" 241 | ] 242 | }, 243 | "execution_count": 12, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "Counter(lemmas).most_common(10)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": { 255 | "deletable": true, 256 | "editable": true 257 | }, 258 | "source": [ 259 | "## pymorphy2[fast]" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 13, 265 | "metadata": { 266 | "collapsed": true, 267 | "deletable": true, 268 | "editable": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "morph = pymorphy2.MorphAnalyzer()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 14, 278 | "metadata": { 279 | "collapsed": true, 280 | "deletable": true, 281 | "editable": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "def morph_nf(tok):\n", 286 | " return morph.parse(tok)[0].normal_form" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 15, 292 | "metadata": { 293 | "collapsed": false, 294 | "deletable": true, 295 | "editable": true 296 | }, 297 | "outputs": [ 298 | { 299 | "name": "stdout", 300 | "output_type": "stream", 301 | "text": [ 302 | "CPU times: user 11.2 s, sys: 12 ms, total: 11.2 s\n", 303 | "Wall time: 11.2 s\n" 304 | ] 305 | } 306 | ], 307 | "source": [ 308 | "%%time\n", 309 | "\n", 310 | "lemmas = list(map(morph_nf, tokens))" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 16, 316 | "metadata": { 317 | "collapsed": false, 318 | "deletable": true, 319 | "editable": true 320 | }, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "322660" 326 | ] 327 | }, 328 | "execution_count": 16, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "len(lemmas)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 17, 340 | "metadata": { 341 | "collapsed": false, 342 | "deletable": true, 343 | "editable": true 344 | }, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "[('в', 15730),\n", 350 | " ('и', 10460),\n", 351 | " ('год', 6080),\n", 352 | " ('россия', 4960),\n", 353 | " ('на', 4590),\n", 354 | " ('с', 2920),\n", 355 | " ('быть', 2570),\n", 356 | " ('российский', 2440),\n", 357 | " ('по', 2420),\n", 358 | " ('к', 1580)]" 359 | ] 360 | }, 361 | "execution_count": 17, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "Counter(lemmas).most_common(10)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "deletable": true, 374 | "editable": true 375 | }, 376 | "source": [ 377 | "---" 378 | ] 379 | } 380 | ], 381 | "metadata": { 382 | "kernelspec": { 383 | "display_name": "Python 3", 384 | "language": "python", 385 | "name": "python3" 386 | }, 387 | "language_info": { 388 | "codemirror_mode": { 389 | "name": "ipython", 390 | "version": 3 391 | }, 392 | "file_extension": ".py", 393 | "mimetype": "text/x-python", 394 | "name": "python", 395 | "nbconvert_exporter": "python", 396 | "pygments_lexer": "ipython3", 397 | "version": "3.5.3" 398 | }, 399 | "latex_envs": { 400 | "bibliofile": "biblio.bib", 401 | "cite_by": "apalike", 402 | "current_citInitial": 1, 403 | "eqLabelWithNumbers": true, 404 | "eqNumInitial": 0 405 | } 406 | }, 407 | "nbformat": 4, 408 | "nbformat_minor": 0 409 | 
} 410 | -------------------------------------------------------------------------------- /parsers/postnauka.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Парсер Постнауки" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%reload_ext autoreload\n", 19 | "%autoreload 2" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import regex" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from text_utils import BaseSource, BaseProcessor, BaseSink\n", 42 | "from text_utils import DefaultTextProcessor, DefaultDocumentProcessor, DefaultCollectionProcessor\n", 43 | "from text_utils import VowpalWabbitSink, MongoDbSink" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "from sklearn.pipeline import Pipeline\n", 55 | "from ipywidgets import FloatProgress\n", 56 | "from IPython.display import display\n", 57 | "from pathlib import Path" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Определим сначала пайплайн для одного документа (`PostnaukaFileSource`, `PostnaukaFileProcessor`)." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "pn_tags_trim = regex.compile(\"\\[(post|pcourse) [^\\]]+\\]\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "class PostnaukaFileSource(BaseSource):\n", 87 | " def fit(self, params, *args):\n", 88 | " (text_path, meta_path) = params\n", 89 | " self.text_path = text_path\n", 90 | " self.meta_path = meta_path\n", 91 | " return self" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "class PostnaukaFileProcessor(BaseProcessor):\n", 103 | " def __init__(self, stop_words):\n", 104 | " self.doc_pipeline = Pipeline([\n", 105 | " (\"text-processor\", DefaultTextProcessor(token_pattern=\"(?u)\\\\b\\\\p{L}+\\\\b\")),\n", 106 | " (\"document-processor\", DefaultDocumentProcessor(stop_lemmas=stop_words)),\n", 107 | " ])\n", 108 | "\n", 109 | " def transform(self, src, *args):\n", 110 | " # Parse text file\n", 111 | " with src.text_path.open() as fi:\n", 112 | " title = fi.readline().strip()\n", 113 | " fi.readline()\n", 114 | " description = fi.readline().strip()\n", 115 | " fi.readline()\n", 116 | " text = fi.read()\n", 117 | " text = pn_tags_trim.sub(\"\", text)\n", 118 | " # Parse meta file\n", 119 | " flat_tags = []\n", 120 | " authors = []\n", 121 | " authors_names = []\n", 122 | " with src.meta_path.open() as fi:\n", 123 | " for ln in fi:\n", 124 | " toks = regex.split(\"\\s+\", ln, 2)\n", 125 | " if toks[0] == \"post_tag\":\n", 126 | " flat_tags.append(toks[-1].strip().lower())\n", 127 | " elif toks[0] == \"author\":\n", 128 | " authors.append(toks[-1].strip().lower())\n", 129 | 
" elif toks[0] == \"author_name\":\n", 130 | " authors_names.append(toks[-1].strip())\n", 131 | " # Run inner pipeline to form modalities\n", 132 | " modalities = self.doc_pipeline.fit_transform(text)\n", 133 | " # Finally, make a document and return it\n", 134 | " doc = {}\n", 135 | " doc[\"title\"] = title\n", 136 | " doc[\"description\"] = description\n", 137 | " doc[\"authors_names\"] = authors_names\n", 138 | " doc[\"modalities\"] = modalities\n", 139 | " doc[\"modalities\"][\"flat_tag\"] = flat_tags\n", 140 | " doc[\"modalities\"][\"authors\"] = authors\n", 141 | " doc[\"markdown\"] = text\n", 142 | " return doc" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Теперь определим пайплайн всей коллекции файлов на диске (`PostnaukaCollectionSource`, `PostnaukaCollectionProcessor`)." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 8, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "class PostnaukaCollectionSource(BaseSource):\n", 161 | " def fit(self, root_path, *args):\n", 162 | " stop_words = (root_path / \"stopwords.txt\").open().read().split()\n", 163 | " self.root_path = root_path\n", 164 | " # We will spawn this pipeline in parallel for each document\n", 165 | " self.file_parser = Pipeline([\n", 166 | " (\"take-file-name\", PostnaukaFileSource()),\n", 167 | " (\"convert-to-document\", PostnaukaFileProcessor(stop_words)),\n", 168 | " ])\n", 169 | " # Save source state\n", 170 | " self.vw_file = (root_path / \"postnauka.txt\").open(\"w\")\n", 171 | " self.files_paths = sorted(root_path.glob(\"raw_data/*.txt\"))\n", 172 | " self.metas_paths = sorted(root_path.glob(\"raw_data/meta/*_meta.txt\"))\n", 173 | " return self" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 9, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "class PostnaukaCollectionProcessor(BaseProcessor):\n", 185 | " def transform(self, src, *args):\n", 186 | " docs = []\n", 187 | " f = FloatProgress(min=0, max=len(src.files_paths))\n", 188 | " display(f)\n", 189 | " for doc_id, (file_path, meta_path) in enumerate(zip(src.files_paths, src.metas_paths)):\n", 190 | " # TODO: run these in parallel threads\n", 191 | " doc = src.file_parser.fit_transform((file_path, meta_path))\n", 192 | " doc[\"doc_id\"] = doc_id + 1\n", 193 | " docs.append(doc)\n", 194 | " f.value += 1\n", 195 | " docs = DefaultCollectionProcessor(min_len=100, min_df=2).fit_transform(docs)\n", 196 | " id_func = lambda doc: \"pn_%d\" % doc[\"doc_id\"]\n", 197 | " # Save Markdown texts in MongoDB\n", 198 | " MongoDbSink(\"postnauka\", id_func=id_func).fit_transform(docs)\n", 199 | " # Save collection in Vowpal Wabbit format\n", 200 | " VowpalWabbitSink(src.vw_file, id_func).fit_transform(docs)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Построим парсер Постнауки из пайплайна, определенного выше." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 10, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "postnauka_parser = Pipeline([\n", 219 | " (\"take-root-path\", PostnaukaCollectionSource()),\n", 220 | " (\"process-the-collection\", PostnaukaCollectionProcessor()),\n", 221 | "])" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Запустим парсер." 
229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 11, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "root_path = Path(\"../datasets/postnauka\")" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 12, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "CPU times: user 2min 42s, sys: 7.02 s, total: 2min 49s\n", 254 | "Wall time: 5min 51s\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "%%time\n", 260 | "\n", 261 | "postnauka_parser.fit_transform(root_path)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "---" 269 | ] 270 | } 271 | ], 272 | "metadata": { 273 | "kernelspec": { 274 | "display_name": "Python 3", 275 | "language": "python", 276 | "name": "python3" 277 | }, 278 | "language_info": { 279 | "codemirror_mode": { 280 | "name": "ipython", 281 | "version": 3 282 | }, 283 | "file_extension": ".py", 284 | "mimetype": "text/x-python", 285 | "name": "python", 286 | "nbconvert_exporter": "python", 287 | "pygments_lexer": "ipython3", 288 | "version": "3.5.2" 289 | }, 290 | "latex_envs": { 291 | "bibliofile": "biblio.bib", 292 | "cite_by": "apalike", 293 | "current_citInitial": 1, 294 | "eqLabelWithNumbers": true, 295 | "eqNumInitial": 0 296 | } 297 | }, 298 | "nbformat": 4, 299 | "nbformat_minor": 0 300 | } 301 | -------------------------------------------------------------------------------- /parsers/text_utils.py: -------------------------------------------------------------------------------- 1 | import regex 2 | import copy 3 | 4 | from pymystem3 import Mystem 5 | from pymongo import MongoClient 6 | from collections import Counter 7 | from sklearn.pipeline import Pipeline 8 | from bson.objectid import ObjectId 9 | 10 | 11 | default_modalities = [ 12 | "text", # Preprocessed tokens of a document's contents. 13 | "flat_tag", # Flat tags associated manually with a document. 14 | ] 15 | 16 | 17 | # ---------------------------- 18 | # Generic interfaces 19 | # ---------------------------- 20 | 21 | class BaseTransformable(): 22 | """ 23 | Root interface class, which describes abstract transformation of data. 24 | """ 25 | 26 | def fit(self, *args): 27 | pass 28 | 29 | def transform(self, *args): 30 | pass 31 | 32 | def fit_transform(self, *args): 33 | return self.fit(*args).transform(*args) 34 | 35 | class BaseSource(BaseTransformable): 36 | """ 37 | Root interface class, which describes the starting point of processing. 38 | Purpose: accumulate some input, do a preparatory job, and pass 39 | an accumulated state further over pipeline. 40 | Input: arbitrary data. 41 | Output: pointer to self (default case), but may be re-defined. 42 | """ 43 | 44 | def transform(self, *args): 45 | return self 46 | 47 | class BaseProcessor(BaseTransformable): 48 | """ 49 | Root interface class, which describes an intermediate step of processing. 50 | Purpose: take some input from previous steps, modify it and pass it over 51 | next processors on the pipeline. 52 | Input: arbitrary data. 53 | Output: arbitrary (modified) data. 54 | """ 55 | 56 | def fit(self, *args): 57 | return self 58 | 59 | class BaseSink(BaseTransformable): 60 | """ 61 | Root interface class, which describes the terminate point of processing. 62 | Purpose: perform some action over processed data and serve as a 63 | terminal element on the pipeline. 64 | Input: arbitrary data. 
65 | Output: None. 66 | """ 67 | 68 | def fit(self, *args): 69 | return self 70 | 71 | def transform(self, *args): 72 | return None 73 | 74 | class TextProcessor(BaseProcessor): 75 | """Interface class, which describes raw text processing.""" 76 | pass 77 | 78 | class DocumentProcessor(BaseProcessor): 79 | """Interface class, which describes document processing.""" 80 | pass 81 | 82 | class CollectionProcessor(BaseProcessor): 83 | """Interface class, which describes collection processing.""" 84 | pass 85 | 86 | 87 | # ---------------------------- 88 | # Specific classes 89 | # ---------------------------- 90 | 91 | # TODO: Document all processors 92 | 93 | class Splitter(BaseProcessor): 94 | def __init__(self, token_pattern): 95 | self.token_regexp = regex.compile(token_pattern) 96 | 97 | def transform(self, text, *args): 98 | return self.token_regexp.findall(text) 99 | 100 | class DictionaryFilterer(BaseProcessor): 101 | def __init__(self, stop_words=None): 102 | if stop_words is None: 103 | self.stop_words = {} 104 | else: 105 | self.stop_words = set(stop_words) 106 | 107 | def transform(self, tokens, *args): 108 | return list(filter(lambda t: t not in self.stop_words, tokens)) 109 | 110 | class FrequencyFilterer(BaseProcessor): 111 | def __init__(self, min_df=None, max_df=None): 112 | min_df = 0 if min_df is None else min_df 113 | max_df = 1. if max_df is None else max_df 114 | if not isinstance(min_df, int) and not isinstance(min_df, float): 115 | raise ValueError("min_df is neither int nor float") 116 | if not isinstance(max_df, int) and not isinstance(max_df, float): 117 | raise ValueError("max_df is neither int nor float") 118 | self.min_df = min_df 119 | self.max_df = max_df 120 | 121 | def fit(self, tokens, *args): 122 | freq = Counter(tokens) 123 | min_df = self.min_df if isinstance(self.min_df, int) else self.min_df * len(tokens) 124 | max_df = self.max_df if isinstance(self.max_df, int) else self.max_df * len(tokens) 125 | self.stop_words = set(map(lambda p: p[0], filter(lambda p: p[1] < min_df or p[1] > max_df, freq.items()))) 126 | return self 127 | 128 | def transform(self, tokens, *args): 129 | return list(filter(lambda t: t not in self.stop_words, tokens)) 130 | 131 | class LengthFilterer(BaseProcessor): 132 | def __init__(self, min_len=0, len_func=None): 133 | self.min_len = min_len 134 | self.len_func = len if len_func is None else len_func 135 | 136 | def transform(self, tokens, *args): 137 | return list(filter(lambda t: self.len_func(t) >= self.min_len, tokens)) 138 | 139 | class Lemmatizer(BaseProcessor): 140 | def __init__(self): 141 | self.m = Mystem() 142 | 143 | def transform(self, tokens, *args): 144 | lemm_str = " ".join(tokens) 145 | return list(filter(lambda s: s.strip(), self.m.lemmatize(lemm_str))) 146 | 147 | class DefaultTextProcessor(TextProcessor): 148 | def __init__(self, token_pattern="(?u)\\b\\w+\\b", stop_words=None): 149 | splitter = Splitter(token_pattern) 150 | filterer = DictionaryFilterer(stop_words=stop_words) 151 | 152 | self.text_pipeline = Pipeline([ 153 | ("split-text", splitter), 154 | ("filter-tokens", filterer), 155 | ]) 156 | 157 | def transform(self, raw_text, *args): 158 | return self.text_pipeline.fit_transform(raw_text.lower()) 159 | 160 | class DefaultDocumentProcessor(DocumentProcessor): 161 | def __init__(self, min_df=None, max_df=None, stop_lemmas=None): 162 | lemmatizer = Lemmatizer() 163 | dict_filterer = DictionaryFilterer(stop_words=stop_lemmas) 164 | freq_filterer = FrequencyFilterer(min_df=min_df, max_df=max_df) 165 | 166 
| self.doc_pipeline = Pipeline([ 167 | ("lemmatize-tokens", lemmatizer), 168 | ("filter-by-dictionary", dict_filterer), 169 | ("filter-by-frequency", freq_filterer), 170 | ]) 171 | 172 | def transform(self, tokens, *args): 173 | modalities = dict.fromkeys(default_modalities, []) 174 | modalities["text"] = self.doc_pipeline.fit_transform(tokens) 175 | return modalities 176 | 177 | class DefaultCollectionProcessor(CollectionProcessor): 178 | def __init__(self, min_len=0, min_df=None, max_df=None, len_func=None): 179 | len_func = (lambda doc: len(doc["modalities"]["text"])) if len_func is None else len_func 180 | 181 | len_filterer = LengthFilterer(min_len=min_len, len_func=len_func) 182 | 183 | self.col_pipeline = Pipeline([ 184 | ("filter-by-length", len_filterer), 185 | ]) 186 | 187 | self.freq_filterer = FrequencyFilterer(min_df=min_df, max_df=max_df) 188 | 189 | def fit(self, docs): 190 | # TODO: make modality an external parameter 191 | tokens = sum([doc["modalities"]["text"] for doc in docs], []) 192 | self.freq_filterer.fit(tokens) 193 | return self 194 | 195 | def transform(self, docs, *args): 196 | docs = self.col_pipeline.fit_transform(docs) 197 | docs_modified = [] 198 | for doc in docs: 199 | # TODO: make modality an external parameter 200 | doc["modalities"]["text"] = self.freq_filterer.transform(doc["modalities"]["text"]) 201 | docs_modified.append(doc) 202 | return docs_modified 203 | 204 | class UciBowSink(CollectionProcessor): 205 | def __init__(self, vocab_file, docword_file): 206 | self.vocab_file = vocab_file 207 | self.docword_file = docword_file 208 | 209 | def fit(self, docs): 210 | Ws = set() 211 | for doc in docs: 212 | for k, vs in doc["modalities"].items(): 213 | Ws |= set(map(lambda v: (regex.sub("\s", "_", v), k), vs)) 214 | self.Ws = dict(zip(Ws, range(len(Ws)))) 215 | return self 216 | 217 | def transform(self, docs, *args): 218 | w, d = len(self.Ws), len(docs) 219 | nnzs = [] 220 | for docID, doc in enumerate(docs): 221 | bow = [] 222 | for k, vs in doc["modalities"].items(): 223 | bow += map(lambda v: self.Ws.get((regex.sub("\s", "_", v), k), -1), vs) 224 | nnzs += map(lambda p: (docID + 1, p[0] + 1, p[1]), Counter(bow).items()) 225 | docword_header = "%d\n%d\n%d\n" % (d, w, len(nnzs)) 226 | words_list = sorted(self.Ws.items(), key=lambda p: p[1]) 227 | self.vocab_file.write("\n".join(map(lambda k: "%s %s" % k[0], words_list))) 228 | self.docword_file.write(docword_header + "\n".join(map(lambda v: "%d %d %d" % v, nnzs))) 229 | self.vocab_file.close() 230 | self.docword_file.close() 231 | 232 | class VowpalWabbitSink(BaseSink): 233 | def __init__(self, vw_file, id_func): 234 | self.vw_file = vw_file 235 | self.id_func = id_func 236 | 237 | def transform(self, docs, *args): 238 | for doc in docs: 239 | modalities_str = " ".join(map(lambda p: "|%s %s" % (p[0], 240 | " ".join(map(lambda t: "_".join(t.split()), p[1]))), doc["modalities"].items())) 241 | self.vw_file.write("%s %s\n" % (self.id_func(doc), modalities_str)) 242 | self.vw_file.close() 243 | 244 | class MongoDbSink(BaseSink): 245 | def __init__(self, collection_name, id_func=None): 246 | client = MongoClient() 247 | self.collection = client["datasets"][collection_name] 248 | self.id_func = id_func 249 | 250 | def transform(self, docs, *args): 251 | reqs = copy.deepcopy(docs) 252 | if self.id_func: 253 | for req in reqs: 254 | req["_id"] = self.id_func(req) 255 | result = self.collection.insert_many(reqs) 256 | return result.inserted_ids 
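# ----------------------------
# Usage sketch (not part of the original module): how the parsers wire the classes above
# together. The sample string and stop-lemma set are made up for illustration; real
# collections load their stop words from datasets/<collection>/stopwords.txt.
# ----------------------------

if __name__ == "__main__":
    sample_pipeline = Pipeline([
        ("text-processor", DefaultTextProcessor(token_pattern="(?u)\\b\\p{L}+\\b")),
        ("document-processor", DefaultDocumentProcessor(stop_lemmas={"и", "в", "на"})),
    ])
    modalities = sample_pipeline.fit_transform("В этом году наука в России развивается")
    # Roughly {"text": ["этот", "год", "наука", "россия", "развиваться"], "flat_tag": []}
    print(modalities)

    # VowpalWabbitSink would then serialise such a document as a single line, e.g.
    #   pn_1 |text этот год наука россия развиваться |flat_tag
    # (multi-word modality values get their spaces replaced by underscores).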
-------------------------------------------------------------------------------- /experiments/Parsing habr dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Хабрахабр" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true, 18 | "deletable": true, 19 | "editable": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "%matplotlib inline\n", 24 | "import regex\n", 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import matplotlib.pyplot as plt" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": false, 35 | "deletable": true, 36 | "editable": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "from collections import Counter, defaultdict\n", 41 | "from pymongo import MongoClient\n", 42 | "from sklearn.pipeline import Pipeline\n", 43 | "from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor\n", 44 | "from ipywidgets import FloatProgress\n", 45 | "from IPython.display import display" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "deletable": true, 52 | "editable": true 53 | }, 54 | "source": [ 55 | "Перегоняем данные из базы `test.habrahabr` в базу `datasets.habrahabr` с изменением формата и сохраняем на диске в формате Vowpal Wabbit." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": { 62 | "collapsed": true, 63 | "deletable": true, 64 | "editable": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "client = MongoClient()\n", 69 | "in_collection = client[\"test\"][\"habrahabr\"]\n", 70 | "out_collection = client[\"datasets\"][\"habrahabr\"]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 10, 76 | "metadata": { 77 | "collapsed": false, 78 | "deletable": true, 79 | "editable": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "stop_words = open(\"../datasets/habrahabr/stopwords.txt\").read().split()\n", 84 | "rare_words = open(\"../datasets/habrahabr/rarewords.txt\").read().split()\n", 85 | "stop_lemmas = set(stop_words).union(set(rare_words))\n", 86 | "doc_pipeline = Pipeline([\n", 87 | " (\"text-processor\", DefaultTextProcessor(token_pattern=\"(?u)\\\\b\\\\p{L}+\\\\b\")),\n", 88 | " (\"document-processor\", DefaultDocumentProcessor(stop_lemmas=stop_lemmas)),\n", 89 | "])" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "deletable": true, 96 | "editable": true 97 | }, 98 | "source": [ 99 | "---" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 11, 105 | "metadata": { 106 | "collapsed": false, 107 | "deletable": true, 108 | "editable": true 109 | }, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "CPU times: user 1.26 s, sys: 10 ms, total: 1.27 s\n", 116 | "Wall time: 1.27 s\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "%%time\n", 122 | "\n", 123 | "# TODO: вынести разнесение токенов по двум модальностям (MOD и MOD_habr) в отдельный модуль\n", 124 | "\n", 125 | "pn_vocab = {\"text\": set(), \"flat_tag\": set()}\n", 126 | "\n", 127 | "for doc in open(\"../datasets/postnauka/postnauka.txt\"):\n", 128 | " tokens = doc.split()\n", 129 | " for token in tokens[1:]:\n", 130 | " if token.startswith(\"|\"):\n", 131 | " 
cur_mod = token[1:]\n", 132 | " else:\n", 133 | " if cur_mod == \"text\" or cur_mod == \"flat_tag\":\n", 134 | " pn_vocab[cur_mod].add(token)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 12, 140 | "metadata": { 141 | "collapsed": false, 142 | "deletable": true, 143 | "editable": true 144 | }, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "44995" 150 | ] 151 | }, 152 | "execution_count": 12, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "len(pn_vocab[\"text\"]) + len(pn_vocab[\"flat_tag\"])" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 13, 164 | "metadata": { 165 | "collapsed": false, 166 | "deletable": true, 167 | "editable": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "def preprocess_tag(tag):\n", 172 | " return \"_\".join(regex.findall(\"(?u)\\\\b\\\\p{L}+\\\\b\", tag.strip().lower()))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 14, 178 | "metadata": { 179 | "collapsed": false, 180 | "deletable": true, 181 | "editable": true 182 | }, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "CPU times: user 19min 3s, sys: 37.3 s, total: 19min 40s\n", 189 | "Wall time: 1h 22min 32s\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "%%time\n", 195 | "\n", 196 | "docs_count = in_collection.count({ \"company_blog\": None })\n", 197 | "f = FloatProgress(min=0, max=docs_count)\n", 198 | "display(f)\n", 199 | "\n", 200 | "counter = 0\n", 201 | "\n", 202 | "with open(\"../datasets/habrahabr/habrahabr.txt\", \"w\") as vw_file:\n", 203 | " for doc_id, mongo_doc in enumerate(in_collection.find({ \"company_blog\": None }).sort(\"_id\", 1), 1):\n", 204 | " doc = {}\n", 205 | " doc[\"_id\"] = \"habr_%d\" % doc_id\n", 206 | " doc[\"original_id\"] = mongo_doc[\"_id\"]\n", 207 | " doc[\"title\"] = mongo_doc[\"title\"]\n", 208 | " doc[\"url\"] = mongo_doc[\"url\"]\n", 209 | " doc[\"modalities\"] = {\"text_habr\": [], \"text\": [], \"flat_tag_habr\": [], \"flat_tag\": []}\n", 210 | " modalities = doc_pipeline.fit_transform(mongo_doc[\"content_html\"])\n", 211 | " for token in modalities[\"text\"]:\n", 212 | " if token in pn_vocab[\"text\"]:\n", 213 | " doc[\"modalities\"][\"text\"].append(token)\n", 214 | " else:\n", 215 | " doc[\"modalities\"][\"text\"].append(token)\n", 216 | " doc[\"modalities\"][\"text_habr\"].append(token)\n", 217 | " for token in map(preprocess_tag, mongo_doc[\"tags\"]):\n", 218 | " if token in pn_vocab[\"flat_tag\"]:\n", 219 | " doc[\"modalities\"][\"flat_tag\"].append(token)\n", 220 | " else:\n", 221 | " doc[\"modalities\"][\"flat_tag\"].append(token)\n", 222 | " doc[\"modalities\"][\"flat_tag_habr\"].append(token)\n", 223 | " doc[\"modalities\"][\"authors\"] = [mongo_doc[\"author_user\"]]\n", 224 | " doc[\"modalities\"][\"hubs\"] = mongo_doc[\"hubs\"]\n", 225 | " doc[\"markdown\"] = mongo_doc[\"content_html\"]\n", 226 | " # TODO: подтягивать имена авторов с Хабра\n", 227 | " doc[\"authors_names\"] = doc[\"modalities\"][\"authors\"]\n", 228 | " # Фильтрация коротких документов из Хабра\n", 229 | " if len(doc[\"modalities\"][\"text\"]) > 100:\n", 230 | " # Записать в Vowpal Wabbit\n", 231 | " modalities_str = \" \".join(map(lambda p: \"|%s %s\" % (p[0],\n", 232 | " \" \".join(map(lambda t: \"_\".join(t.split()), p[1]))), doc[\"modalities\"].items()))\n", 233 | " vw_file.write(\"%s %s\\n\" % (doc[\"_id\"], modalities_str))\n", 234 | " # Записать 
в MongoDB\n", 235 | " out_collection.insert_one(doc)\n", 236 | " # Увеличить счетчик прогресс-бара\n", 237 | " f.value += 1" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "deletable": true, 244 | "editable": true 245 | }, 246 | "source": [ 247 | "---" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "deletable": true, 254 | "editable": true 255 | }, 256 | "source": [ 257 | "### Фильтрация слов с низким DF" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 28, 263 | "metadata": { 264 | "collapsed": false, 265 | "deletable": true, 266 | "editable": true 267 | }, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "CPU times: user 18min 34s, sys: 29.3 s, total: 19min 3s\n", 274 | "Wall time: 1h 7min 55s\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "%%time\n", 280 | "\n", 281 | "docs_count = in_collection.count({ \"company_blog\": None })\n", 282 | "f = FloatProgress(min=0, max=docs_count)\n", 283 | "display(f)\n", 284 | "\n", 285 | "word_counter = defaultdict(set)\n", 286 | "\n", 287 | "for doc_id, mongo_doc in enumerate(in_collection.find({ \"company_blog\": None }), 1):\n", 288 | " modalities = doc_pipeline.fit_transform(mongo_doc[\"content_html\"])\n", 289 | " for word in modalities[\"text\"]:\n", 290 | " word_counter[word].add(doc_id)\n", 291 | " # Увеличить счетчик прогресс-бара\n", 292 | " f.value += 1" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 29, 298 | "metadata": { 299 | "collapsed": false, 300 | "deletable": true, 301 | "editable": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "words = list(word_counter.items())" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 30, 311 | "metadata": { 312 | "collapsed": false, 313 | "deletable": true, 314 | "editable": true 315 | }, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "602833" 321 | ] 322 | }, 323 | "execution_count": 30, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "len(word_counter)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 68, 335 | "metadata": { 336 | "collapsed": false, 337 | "deletable": true, 338 | "editable": true 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "rare_words = set(map(lambda p: p[0], filter(lambda p: len(p[1]) <= 1, words)))" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 69, 348 | "metadata": { 349 | "collapsed": false, 350 | "deletable": true, 351 | "editable": true 352 | }, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "384972\n", 359 | "0.6386047213739129\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "print(len(rare_words))\n", 365 | "print(len(rare_words) / len(words))" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 70, 371 | "metadata": { 372 | "collapsed": false, 373 | "deletable": true, 374 | "editable": true 375 | }, 376 | "outputs": [ 377 | { 378 | "data": { 379 | "text/plain": [ 380 | "4415124" 381 | ] 382 | }, 383 | "execution_count": 70, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "open(\"../datasets/habrahabr/rarewords.txt\", \"w\").write(\"\\n\".join(rare_words))" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": { 395 | "deletable": true, 396 | 
"editable": true 397 | }, 398 | "source": [ 399 | "---" 400 | ] 401 | } 402 | ], 403 | "metadata": { 404 | "kernelspec": { 405 | "display_name": "Python 3", 406 | "language": "python", 407 | "name": "python3" 408 | }, 409 | "language_info": { 410 | "codemirror_mode": { 411 | "name": "ipython", 412 | "version": 3 413 | }, 414 | "file_extension": ".py", 415 | "mimetype": "text/x-python", 416 | "name": "python", 417 | "nbconvert_exporter": "python", 418 | "pygments_lexer": "ipython3", 419 | "version": "3.5.3" 420 | }, 421 | "latex_envs": { 422 | "bibliofile": "biblio.bib", 423 | "cite_by": "apalike", 424 | "current_citInitial": 1, 425 | "eqLabelWithNumbers": true, 426 | "eqNumInitial": 0 427 | } 428 | }, 429 | "nbformat": 4, 430 | "nbformat_minor": 0 431 | } 432 | -------------------------------------------------------------------------------- /experiments/Parsing elementy website.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 279, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import regex\n", 14 | "import collections\n", 15 | "from lxml import html\n", 16 | "from pymongo import MongoClient\n", 17 | "from urllib.request import urlopen\n", 18 | "from urllib.error import HTTPError\n", 19 | "from sklearn.pipeline import Pipeline\n", 20 | "from parsers.text_utils import DefaultTextProcessor, DefaultDocumentProcessor" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "deletable": true, 27 | "editable": true 28 | }, 29 | "source": [ 30 | "### Загрузка страниц с веб-сайта elementy.ru" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 220, 36 | "metadata": { 37 | "collapsed": true, 38 | "deletable": true, 39 | "editable": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "maybe = lambda f, x: f(x) if x else None" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 25, 49 | "metadata": { 50 | "collapsed": true, 51 | "deletable": true, 52 | "editable": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "def process_html(text):\n", 57 | " return text.replace(\"\\xa0\", \" \")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 61, 63 | "metadata": { 64 | "collapsed": true, 65 | "deletable": true, 66 | "editable": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "def process_tag(text):\n", 71 | " return regex.sub(\"\\s\", \"_\", process_html(text).strip()).lower()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 261, 77 | "metadata": { 78 | "collapsed": false, 79 | "deletable": true, 80 | "editable": true 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Parsed 98/100 pages\n", 88 | "Parsed 198/200 pages\n", 89 | "Parsed 296/300 pages\n", 90 | "Parsed 392/400 pages\n", 91 | "Parsed 491/500 pages\n", 92 | "Parsed 589/600 pages\n", 93 | "Parsed 686/700 pages\n", 94 | "Parsed 784/800 pages\n", 95 | "Parsed 881/900 pages\n", 96 | "Parsed 981/1000 pages\n", 97 | "Parsed 1079/1100 pages\n", 98 | "Parsed 1176/1200 pages\n", 99 | "Parsed 1273/1300 pages\n", 100 | "Parsed 1370/1400 pages\n", 101 | "Parsed 1466/1500 pages\n", 102 | "Parsed 1566/1600 pages\n", 103 | "Parsed 1658/1700 pages\n", 104 | "Parsed 1752/1800 pages\n", 105 | "Parsed 1848/1900 pages\n", 106 | "Parsed 1946/2000 pages\n", 107 | "Parsed 2038/2100 pages\n", 108 
| "Parsed 2129/2200 pages\n", 109 | "Parsed 2223/2300 pages\n", 110 | "CPU times: user 45.4 s, sys: 3.45 s, total: 48.9 s\n", 111 | "Wall time: 28min 17s\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "%%time\n", 117 | "\n", 118 | "# Парсинг всех страниц\n", 119 | "pages_ids = list(range(431231, 433629))\n", 120 | "pages = []\n", 121 | "\n", 122 | "for i, page_id in enumerate(pages_ids, 1):\n", 123 | " try:\n", 124 | " page_url = \"http://elementy.ru/nauchno-populyarnaya_biblioteka/%d/\" % page_id\n", 125 | " page = html.parse(urlopen(page_url))\n", 126 | "\n", 127 | " title = process_html(page.findtext(\"//h1\"))\n", 128 | " tags = list(map(lambda p: process_tag(p.text),\n", 129 | " page.findall(\"//div[@class='mb itemhead newslist']/div/a\")[1:-1]))\n", 130 | " article = page.find(\"//div[@class='itemblock']/div[@class='memo']\")\n", 131 | "\n", 132 | " summary = maybe(process_html, article.findtext(\"./p[@class='Intro']\"))\n", 133 | " text = []\n", 134 | " content_flag = False\n", 135 | " for elem in article.iterfind(\"p\"):\n", 136 | " if len(elem.classes) > 0:\n", 137 | " continue\n", 138 | " # TODO: filter wrong paragraphs\n", 139 | " # TODO: can also be non-paragraphs (h3, ol, etc)\n", 140 | " text.append(process_html(elem.text_content()))\n", 141 | " text = \"\\n\\n\".join(text)\n", 142 | " \n", 143 | " pages.append((page_id, title, tags, summary, text))\n", 144 | " except Exception:\n", 145 | " pass\n", 146 | " \n", 147 | " if i % 100 == 0:\n", 148 | " print(\"Parsed %d/%d pages\" % (len(pages), i))" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 267, 154 | "metadata": { 155 | "collapsed": false, 156 | "deletable": true, 157 | "editable": true 158 | }, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "2300" 164 | ] 165 | }, 166 | "execution_count": 267, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "len(pages)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "deletable": true, 179 | "editable": true 180 | }, 181 | "source": [ 182 | "### Парсинг" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 292, 188 | "metadata": { 189 | "collapsed": false, 190 | "deletable": true, 191 | "editable": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "stop_words = open(\"../datasets/elementy/stopwords.txt\").read().split()\n", 196 | "rare_words = open(\"../datasets/elementy/rarewords.txt\").read().split()\n", 197 | "stop_lemmas = set(stop_words).union(set(rare_words))\n", 198 | "doc_pipeline = Pipeline([\n", 199 | " (\"text-processor\", DefaultTextProcessor(token_pattern=\"(?u)\\\\b\\\\p{L}+\\\\b\")),\n", 200 | " (\"document-processor\", DefaultDocumentProcessor(stop_lemmas=stop_lemmas)),\n", 201 | "])" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 293, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "CPU times: user 1.48 s, sys: 19 ms, total: 1.5 s\n", 216 | "Wall time: 1.52 s\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "%%time\n", 222 | "\n", 223 | "# TODO: вынести разнесение токенов по двум модальностям (MOD и MOD_habr) в отдельный модуль\n", 224 | "\n", 225 | "pn_vocab = {\"text\": set(), \"flat_tag\": set()}\n", 226 | "\n", 227 | "for doc in open(\"../datasets/postnauka/postnauka.txt\"):\n", 228 | " tokens = doc.split()\n", 229 | " for token in tokens[1:]:\n", 230 
| " if token.startswith(\"|\"):\n", 231 | " cur_mod = token[1:]\n", 232 | " else:\n", 233 | " if cur_mod == \"text\" or cur_mod == \"flat_tag\":\n", 234 | " pn_vocab[cur_mod].add(token)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 294, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "44995" 248 | ] 249 | }, 250 | "execution_count": 294, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "len(pn_vocab[\"text\"]) + len(pn_vocab[\"flat_tag\"])" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 295, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "client = MongoClient()\n", 268 | "out_collection = client[\"datasets\"][\"elementy\"]" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 296, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "Written 500 pages\n", 283 | "Written 1000 pages\n", 284 | "Written 1500 pages\n", 285 | "Written 2000 pages\n", 286 | "CPU times: user 38.1 s, sys: 612 ms, total: 38.7 s\n", 287 | "Wall time: 2min 54s\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "%%time\n", 293 | "\n", 294 | "with open(\"../datasets/elementy/elementy.txt\", \"w\") as vw_file:\n", 295 | " for i, page in enumerate(pages, 1):\n", 296 | " page_id, title, tags, summary, text = page\n", 297 | " doc = {}\n", 298 | " doc[\"_id\"] = \"elem_%d\" % page_id\n", 299 | " doc[\"title\"] = title\n", 300 | " doc[\"url\"] = \"http://elementy.ru/nauchno-populyarnaya_biblioteka/%d/\" % page_id\n", 301 | " doc[\"modalities\"] = {\"text_elem\": [], \"text\": [], \"flat_tag_elem\": [], \"flat_tag\": []}\n", 302 | " modalities = doc_pipeline.fit_transform(text)\n", 303 | " for token in modalities[\"text\"]:\n", 304 | " if token in pn_vocab[\"text\"]:\n", 305 | " doc[\"modalities\"][\"text\"].append(token)\n", 306 | " else:\n", 307 | " doc[\"modalities\"][\"text\"].append(token)\n", 308 | " doc[\"modalities\"][\"text_elem\"].append(token)\n", 309 | " for token in tags:\n", 310 | " if token in pn_vocab[\"flat_tag\"]:\n", 311 | " doc[\"modalities\"][\"flat_tag\"].append(token)\n", 312 | " else:\n", 313 | " doc[\"modalities\"][\"flat_tag\"].append(token)\n", 314 | " doc[\"modalities\"][\"flat_tag_elem\"].append(token)\n", 315 | " doc[\"summary\"] = summary\n", 316 | " doc[\"markdown\"] = text\n", 317 | " # Фильтрация коротких документов из Элементов\n", 318 | " if len(doc[\"modalities\"][\"text\"]) > 100:\n", 319 | " # Записать в Vowpal Wabbit\n", 320 | " modalities_str = \" \".join(map(lambda p: \"|%s %s\" % (p[0],\n", 321 | " \" \".join(map(lambda t: \"_\".join(t.split()), p[1]))), doc[\"modalities\"].items()))\n", 322 | " vw_file.write(\"%s %s\\n\" % (doc[\"_id\"], modalities_str))\n", 323 | " # Записать в MongoDB\n", 324 | " out_collection.insert_one(doc)\n", 325 | " if i % 500 == 0:\n", 326 | " print(\"Written %d pages\" % i)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "deletable": true, 333 | "editable": true 334 | }, 335 | "source": [ 336 | "### Фильтрация слов с низким DF" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 285, 342 | "metadata": { 343 | "collapsed": false, 344 | "deletable": true, 345 | "editable": true 346 | }, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 
350 | "output_type": "stream", 351 | "text": [ 352 | "Processed 500 pages\n", 353 | "Processed 1000 pages\n", 354 | "Processed 1500 pages\n", 355 | "Processed 2000 pages\n", 356 | "CPU times: user 32.1 s, sys: 214 ms, total: 32.4 s\n", 357 | "Wall time: 2min 50s\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "%%time\n", 363 | "\n", 364 | "word_counter = collections.defaultdict(set)\n", 365 | "\n", 366 | "for i, page in enumerate(pages, 1):\n", 367 | " page_id, _, _, _, text = page\n", 368 | " modalities = doc_pipeline.fit_transform(text)\n", 369 | " for word in modalities[\"text\"]:\n", 370 | " word_counter[word].add(page_id)\n", 371 | " if i % 500 == 0:\n", 372 | " print(\"Processed %d pages\" % i)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 286, 378 | "metadata": { 379 | "collapsed": true 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "words = list(word_counter.items())" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 287, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "79946" 397 | ] 398 | }, 399 | "execution_count": 287, 400 | "metadata": {}, 401 | "output_type": "execute_result" 402 | } 403 | ], 404 | "source": [ 405 | "len(word_counter)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 288, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "rare_words = set(map(lambda p: p[0], filter(lambda p: len(p[1]) <= 1, words)))" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 289, 422 | "metadata": { 423 | "collapsed": false 424 | }, 425 | "outputs": [ 426 | { 427 | "name": "stdout", 428 | "output_type": "stream", 429 | "text": [ 430 | "39494\n", 431 | "0.49400845570760266\n" 432 | ] 433 | } 434 | ], 435 | "source": [ 436 | "print(len(rare_words))\n", 437 | "print(len(rare_words) / len(words))" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 291, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [ 447 | { 448 | "data": { 449 | "text/plain": [ 450 | "388252" 451 | ] 452 | }, 453 | "execution_count": 291, 454 | "metadata": {}, 455 | "output_type": "execute_result" 456 | } 457 | ], 458 | "source": [ 459 | "open(\"../datasets/elementy/rarewords.txt\", \"w\").write(\"\\n\".join(rare_words))" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": { 465 | "deletable": true, 466 | "editable": true 467 | }, 468 | "source": [ 469 | "---" 470 | ] 471 | } 472 | ], 473 | "metadata": { 474 | "kernelspec": { 475 | "display_name": "Python 3", 476 | "language": "python", 477 | "name": "python3" 478 | }, 479 | "language_info": { 480 | "codemirror_mode": { 481 | "name": "ipython", 482 | "version": 3 483 | }, 484 | "file_extension": ".py", 485 | "mimetype": "text/x-python", 486 | "name": "python", 487 | "nbconvert_exporter": "python", 488 | "pygments_lexer": "ipython3", 489 | "version": "3.5.3" 490 | }, 491 | "latex_envs": { 492 | "bibliofile": "biblio.bib", 493 | "cite_by": "apalike", 494 | "current_citInitial": 1, 495 | "eqLabelWithNumbers": true, 496 | "eqNumInitial": 0 497 | } 498 | }, 499 | "nbformat": 4, 500 | "nbformat_minor": 2 501 | } 502 | -------------------------------------------------------------------------------- /experiments/Spectrum experiments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%reload_ext autoreload\n", 12 | "%autoreload 2" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 11, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "from parsers import arbitrary, text_utils\n", 24 | "import artm\n", 25 | "import hierarchy_utils\n", 26 | "import pickle\n", 27 | "from spectrum import arrange_topics\n", 28 | "#import arranging.api as api\n", 29 | "from crossmin import CrossMinimizer\n", 30 | "import numpy as np" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stderr", 42 | "output_type": "stream", 43 | "text": [ 44 | "/Users/aksholokhov/.anaconda3/lib/python3.6/site-packages/bigartm-0.8.3-py3.6.egg/artm/master_component.py:604: DeprecationWarning: invalid escape sequence \\*\n", 45 | "/Users/aksholokhov/.anaconda3/lib/python3.6/site-packages/bigartm-0.8.3-py3.6.egg/artm/master_component.py:714: DeprecationWarning: invalid escape sequence \\d\n", 46 | "/Users/aksholokhov/.anaconda3/lib/python3.6/site-packages/bigartm-0.8.3-py3.6.egg/artm/master_component.py:783: DeprecationWarning: 'async' and 'await' will become reserved keywords in Python 3.7\n" 47 | ] 48 | }, 49 | { 50 | "ename": "NameError", 51 | "evalue": "name 'pickle' is not defined", 52 | "output_type": "error", 53 | "traceback": [ 54 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 55 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 56 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0martm_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mMODEL_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0martm_extra_info\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mMODEL_PATH\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/extra_info.dump\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 57 | "\u001b[0;31mNameError\u001b[0m: name 'pickle' is not defined" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "\n", 63 | "\n", 64 | "T = lambda lid, tid: \"level_%d_%s\" % (lid, tid)\n", 65 | "\n", 66 | "def from_artm_tid(artm_tid):\n", 67 | " # This is due to hARTM bug\n", 68 | " if artm_tid.startswith(\"level_0_\"):\n", 69 | " return (0, artm_tid[8:])\n", 70 | " else:\n", 71 | " lid, tid = artm_tid[5:].split(\"_\", 1)\n", 72 | " lid = int(lid)\n", 73 | " return (lid, tid)\n", 74 | "\n", 75 | "MODEL_PATH = \"hartm/\"\n", 76 | "\n", 77 | "artm_model = hierarchy_utils.hARTM(theta_columns_naming=\"title\",\n", 78 | " cache_theta=True)\n", 79 | "artm_model.load(MODEL_PATH)\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 8, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "artm_extra_info = pickle.load(open(MODEL_PATH + \"/extra_info.dump\", \"rb\"))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 9, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | 
"outputs": [], 100 | "source": [ 101 | "# Extract Phi, Psi and Theta matrices\n", 102 | "phis = []\n", 103 | "psis = []\n", 104 | "#theta = artm_extra_info[\"theta\"]\n", 105 | "# theta = pickle.load(open(THETA_MODEL_PATH, \"rb\"))[\"theta\"]\n", 106 | "for level_idx, artm_level in enumerate(artm_model._levels):\n", 107 | " phis.append(artm_level.get_phi(class_ids=\"flat_tag\"))\n", 108 | " if level_idx > 0:\n", 109 | " psis.append(artm_level.get_psi())\n", 110 | " \n", 111 | "phi0_topic_titles = list(filter(lambda x: x.startswith(\"topic\"), phis[0].columns))\n", 112 | "phi1_topic_titles = list(filter(lambda x: x.startswith(\"topic\"), phis[1].columns))" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 12, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "new_phi1_topic_order = np.array(phi1_topic_titles)[arrange_topics(phis[1][phi1_topic_titles].values)]\n", 124 | "phis[1] = phis[1][new_phi1_topic_order]\n", 125 | "psis[0] = psis[0].loc[new_phi1_topic_order]" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 13, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "children_threshold = 0.05\n", 137 | "\n", 138 | "D = np.zeros((len(phi0_topic_titles), len(phi1_topic_titles)))\n", 139 | "\n", 140 | "for parent_id, parent in enumerate(phi0_topic_titles):\n", 141 | " for child_id, maybe_child in enumerate(phi1_topic_titles):\n", 142 | " if psis[0].loc[maybe_child, parent] > children_threshold:\n", 143 | " D[parent_id, child_id] = 1" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 14, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stderr", 155 | "output_type": "stream", 156 | "text": [ 157 | "/Users/aksholokhov/.anaconda3/lib/python3.6/site-packages/pulp/solvers.py:71: DeprecationWarning: The SafeConfigParser class has been renamed to ConfigParser in Python 3.2. This alias will be removed in future versions. 
Use ConfigParser directly instead.\n", 158 | " 'os':operating_system, 'arch':arch})\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "cm = CrossMinimizer(D)\n", 164 | "idx = cm.solve(mode=\"auto\", model=None)\n", 165 | "new_phi0_topic_order = np.array(phi1_topic_titles)[idx]" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 15, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "array(['topic_3', 'topic_1', 'topic_7', 'topic_9', 'topic_10', 'topic_12',\n", 179 | " 'topic_2', 'topic_4', 'topic_13', 'topic_11', 'topic_17',\n", 180 | " 'topic_16', 'topic_6', 'topic_14', 'topic_18', 'topic_5', 'topic_0',\n", 181 | " 'topic_15', 'topic_8'], \n", 182 | " dtype=' 0: 42 | self._psis.append(artm_level.get_psi()) 43 | 44 | # Construct topic name mappings 45 | self._from_artm_tid_map = {} 46 | self._to_artm_tid_map = {} 47 | self._from_lid_tid_map = {} 48 | self._to_lid_tid_map = {} 49 | theta_new_index = [] 50 | for artm_tid in self._theta.index: 51 | if artm_tid.startswith("level_0_"): 52 | lid, tid = 0, artm_tid[8:] 53 | else: 54 | lid, tid = artm_tid[5:].split("_", 1) 55 | lid = int(lid) 56 | topic_id = "level_%d_%s" % (lid, tid) # internal project-consistent topic name 57 | if tid.startswith("topic_"): 58 | self._from_artm_tid_map[artm_tid] = topic_id 59 | self._to_artm_tid_map[topic_id] = artm_tid 60 | self._from_lid_tid_map[lid, tid] = topic_id 61 | self._to_lid_tid_map[topic_id] = (lid, tid) 62 | theta_new_index.append(topic_id) 63 | self._theta.index = theta_new_index 64 | 65 | # Construct spectrums map 66 | spectrum_map = {} 67 | for spectrum in self._extra_info["spectrums"]: 68 | for i, topic_id in enumerate(spectrum): 69 | spectrum_map[topic_id] = i 70 | 71 | # Construct topics infos 72 | # TODO: make topic maning an external procedure 73 | self._topics = {} 74 | for lid, phi in enumerate(self._phis): 75 | names = phi.index[phi.values.argsort(axis=0)[::-1].T] 76 | for tid, top_words in zip(phi.columns, names): 77 | # subject topic names are "topic_X", where X = 0, 1, ... 78 | # background topic names are "background_X", where X = 0, 1, ... 
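                # Added illustration (not part of the original module): the regex check
                # just below keeps only subject topics and drops background topics,
                # relying on the naming convention stated in the two comments above.
                # A minimal sketch of the same filter on hypothetical topic ids:
                #
                #     import regex
                #     tids = ["topic_0", "topic_12", "background_0", "background_3"]
                #     [t for t in tids if regex.match("^topic_\d+$", t)]
                #     # -> ['topic_0', 'topic_12']
                #
                # Background topics typically absorb common, non-thematic vocabulary, so
                # only subject topics receive entries in self._topics below.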
79 | if regex.match("^topic_\d+$", tid): 80 | topic_id = self._from_lid_tid_map[lid, tid] 81 | self._topics[topic_id] = { 82 | "level_id": lid, 83 | "top_words": list(top_words), 84 | "_unnamed": True, 85 | "parents": [], 86 | "children": [], 87 | "weight": 0, 88 | "spectrum_id": spectrum_map.get(topic_id) 89 | } 90 | 91 | # Define parent-child relationship for topics 92 | for lid, psi in enumerate(self._psis): 93 | psi = (psi >= psi_edge_threshold) 94 | for tid1 in psi.columns: 95 | if regex.match("^topic_\d+$", tid1): 96 | for tid2 in psi.index: 97 | if regex.match("^topic_\d+$", tid2) and psi.loc[tid2, tid1]: 98 | topic_id_parent = self._from_lid_tid_map[lid, tid1] 99 | topic_id_child = self._from_lid_tid_map[lid + 1, tid2] 100 | self._topics[topic_id_parent]["children"].append(topic_id_child) 101 | self._topics[topic_id_child]["parents"].append(topic_id_parent) 102 | 103 | # Assign top words to child topics 104 | # TODO: make topic maning an external procedure 105 | for topic_id, topic in self._topics.items(): 106 | parents_ids_set = set(topic["parents"]) 107 | sibling_topics_ids = [tid for tid, t in self._topics.items() 108 | if parents_ids_set & set(t["parents"]) and "_unnamed" not in t] 109 | used_top_words = sum(map(lambda tid: self._topics[tid]["top_words"][:topic_naming_n_words], 110 | topic["parents"] + sibling_topics_ids), []) 111 | topic["top_words"] = list(filter(lambda tw: tw not in used_top_words, 112 | topic["top_words"]))[:topic_naming_n_words] 113 | del topic["_unnamed"] 114 | 115 | # Define parent-child relationship for topics and documents 116 | last_lid = self.num_levels - 1 117 | doc_topics = self.get_topics_ids_by_level(last_lid) 118 | self._doc_theta = self._theta.loc[doc_topics] 119 | self._doc_thresholds = self._doc_theta.max(axis=0) / np.sqrt(2) 120 | 121 | # Define topic weight as: 122 | # For last-level topic, weight = number of documents that belong to it 123 | # For a higher-level topic, weight = sum(weight) of topic's child topics 124 | docs_count = self._doc_theta.apply(lambda s: sum(s >= self._doc_thresholds), axis=1) 125 | lids_tids = list(self._from_lid_tid_map.keys()) 126 | lids_tids = sorted(lids_tids, reverse=True) 127 | for lid, tid in lids_tids: 128 | topic_id = self._from_lid_tid_map[lid, tid] 129 | if lid == last_lid: 130 | w = int(docs_count[topic_id]) 131 | else: 132 | w = 0 133 | for child_topic_id in self._topics[topic_id]["children"]: 134 | w += self._topics[child_topic_id]["weight"] 135 | self._topics[topic_id]["weight"] = w 136 | 137 | def get_topics_ids_by_level(self, level_id): 138 | if level_id < 0 or level_id >= self.num_levels: 139 | raise ValueError("Unknown level_id: %d" % level_id) 140 | 141 | topics_ids = [] 142 | for (lid, tid), topic_id in self._from_lid_tid_map.items(): 143 | if lid == level_id: 144 | topics_ids.append(topic_id) 145 | return topics_ids 146 | 147 | def get_docs_ids_by_topic(self, topic_id): 148 | if topic_id not in self._doc_theta.index: 149 | raise ValueError("Unknown document topic id: '%s'" % topic_id) 150 | 151 | ptd = self._doc_theta.loc[topic_id] 152 | sorted_ptd = ptd[ptd >= self._doc_thresholds].sort_values(ascending=False) 153 | return sorted_ptd 154 | 155 | def get_topics_by_docs_ids(self, docs_ids): 156 | theta = self._theta 157 | doc_theta = self._doc_theta 158 | thresholds = self._doc_thresholds 159 | topics = self._topics 160 | tid_lid = self._to_lid_tid_map 161 | 162 | lowest_level_counter = pd.Series(np.zeros(len(doc_theta.index)), 163 | index=doc_theta.index) 164 | 165 | # docs = 
dict(zip(theta.index, [set()]*len(theta.index))) 166 | docs = {key: set() for key in theta.index} 167 | 168 | for doc in docs_ids: 169 | if doc["doc_id"] not in thresholds.index: 170 | continue 171 | topics_for_doc = {} 172 | comparison = (doc_theta[doc["doc_id"]] > thresholds[doc["doc_id"]]) 173 | lowest_level_counter += np.int32(comparison) 174 | for topic, doc_in_topic in zip(doc_theta.index, comparison): 175 | if doc_in_topic: 176 | docs[topic].add(doc["doc_id"]) 177 | 178 | levels_count = tid_lid[lowest_level_counter.index[0]][0] + 1 179 | 180 | answer = pd.Series(np.zeros(len(theta.index)), index=theta.index) 181 | answer[lowest_level_counter.index] = lowest_level_counter 182 | 183 | is_level_topic = lambda lid: lambda x: x.startswith("level_%d_t" % lid) 184 | for lid in range(0, levels_count-1)[::-1]: 185 | curr_level_topics = list(filter(is_level_topic(lid), answer.index)) 186 | for topic in curr_level_topics: 187 | for child in topics[topic]["children"]: 188 | answer[topic] += answer[child] 189 | docs[topic] |= docs[child] 190 | 191 | for lid in range(0, levels_count)[::-1]: 192 | curr_level_topics = list(filter(is_level_topic(lid), answer.index)) 193 | total_docs_in_this_level = sum(answer[curr_level_topics]) 194 | if total_docs_in_this_level != 0: 195 | answer[curr_level_topics] /= total_docs_in_this_level 196 | 197 | return dict(zip(docs.keys(), [list(v) for k, v in docs.items()])), dict(answer) 198 | 199 | 200 | def transform_one(self, vw_path, batch_path): 201 | transform_batch = artm.BatchVectorizer(data_format="vowpal_wabbit", 202 | data_path=vw_path, 203 | batch_size=1, 204 | target_folder=batch_path) 205 | transform_theta = self._model.transform(transform_batch) 206 | response = {} 207 | for artm_tid, pdt in transform_theta["upload"].items(): 208 | if artm_tid in self._from_artm_tid_map: 209 | topic_id = self._from_artm_tid_map[artm_tid] 210 | response[topic_id] = float(pdt) 211 | return response 212 | 213 | 214 | @property 215 | def theta(self): 216 | return self._theta 217 | 218 | @property 219 | def topics(self): 220 | return self._topics 221 | 222 | @property 223 | def num_levels(self): 224 | return self._model.num_levels 225 | 226 | @property 227 | def topics_ids(self): 228 | return self._theta.index 229 | 230 | def get_phi(self, level_id): 231 | return self._phis[level_id] 232 | 233 | def get_psi(self, level_id): 234 | return self._psis[level_id] 235 | 236 | def to_topic_id(self, lid, tid): 237 | return self._from_lid_tid_map[lid, tid] 238 | 239 | def from_topic_id(self, topic_id): 240 | return self._to_lid_tid_map[topic_id] 241 | 242 | 243 | class ArtmDataSource: 244 | def __init__(self): 245 | self._db = MongoClient() 246 | 247 | def get_documents_by_ids(self, docs_ids, with_texts=True, with_modalities=False): 248 | fields = {"title": 1, "authors_names" : 1} 249 | if with_texts: 250 | fields["markdown"] = 1 251 | if with_modalities: 252 | fields["modalities"] = 1 253 | queries = {} 254 | for doc_id in docs_ids: 255 | prefix = doc_id.split("_", 1)[0] 256 | col_name = prefix_to_col_map[prefix] 257 | if col_name not in queries: 258 | queries[col_name] = [] 259 | queries[col_name].append(doc_id) 260 | result = [] 261 | for col_name, col_docs_ids in queries.items(): 262 | dataset = self._db["datasets"][col_name] 263 | result += dataset.find({"_id": {"$in": col_docs_ids}}, fields) 264 | result_map = dict(map(lambda v: (v["_id"], v), result)) 265 | response = [] 266 | for doc_id in docs_ids: 267 | if doc_id not in result_map: 268 | continue 269 | doc = result_map[doc_id] 
270 | res = { 271 | "doc_id": doc["_id"], 272 | "title": doc["title"], 273 | "authors_names": doc.get("authors_names", []) 274 | } 275 | if with_texts: 276 | res["markdown"] = doc["markdown"] 277 | if with_modalities: 278 | res["modalities"] = doc["modalities"] 279 | response.append(res) 280 | return response 281 | 282 | def search_query_in_models_docs(self, query, limit=10): 283 | col_results = self._db.model.all_docs.find( 284 | {"$text": {"$search": query}}, 285 | {"score": {"$meta": "textScore"}}).sort( 286 | [("score", {"$meta": "textScore"})]).limit(limit) 287 | results = [] 288 | for row in col_results: 289 | results.append({ 290 | "doc_id": row["_id"], 291 | "score": row["score"], 292 | }) 293 | 294 | return sorted(results, key=lambda x: x["score"]) 295 | 296 | class ArtmBridge: 297 | def __init__(self, model_path): 298 | self._data_source = ArtmDataSource() 299 | self._model = ArtmModel(model_path) 300 | 301 | # Select topics which will be used for recommendation 302 | self._rec_lid = 0 303 | rec_topics = self._model.get_topics_ids_by_level(self._rec_lid) 304 | self._rec_tids = list(map(lambda t: self._model.from_topic_id(t)[1], rec_topics)) 305 | self._rec_theta = self._model.theta.T[rec_topics].sort_index() 306 | 307 | def get_documents_by_topic(self, topic_id, offset=0, limit=None, with_weights=True): 308 | sorted_ptd = self._model.get_docs_ids_by_topic(topic_id) 309 | if limit is None: 310 | limit = len(sorted_ptd) 311 | 312 | sorted_ptd = sorted_ptd[offset:offset + limit] 313 | docs_ids = sorted_ptd.index 314 | docs = self._data_source.get_documents_by_ids(docs_ids, with_texts=False) 315 | weights = {k: float(v) for k, v in sorted_ptd.items()} 316 | if with_weights: 317 | return docs, weights 318 | else: 319 | return docs 320 | 321 | def recommend_tags_by_doc(self, doc, rec_tags_count=5): 322 | own_tags = set(doc["modalities"]["flat_tag"]) 323 | ptd = self._rec_theta.loc[doc["doc_id"]] 324 | weighted_tags = self._model.get_phi(self._rec_lid)[self._rec_tids].mul(ptd.values) 325 | rec_tags = {} 326 | for _, pwt in weighted_tags.iteritems(): 327 | top_tags = pwt.nlargest(len(own_tags) + rec_tags_count) 328 | for tag, w in top_tags.iteritems(): 329 | tag = regex.sub("_", " ", tag) 330 | if tag not in own_tags: 331 | rec_tags[tag] = max(rec_tags.get(tag, 0), w) 332 | rec_tags = list(map(lambda p: (p[1], p[0]), rec_tags.items())) 333 | rec_tags.sort(reverse=True) 334 | rec_tags = list(map(lambda x: x[1], rec_tags[:rec_tags_count])) 335 | return rec_tags 336 | 337 | def recommend_docs_by_doc(self, doc_id, rec_docs_count=5, metric=hellinger_dist): 338 | doc = self._rec_theta.loc[doc_id] 339 | dist = pairwise_distances([doc], self._rec_theta, hellinger_dist)[0] 340 | dist_series = pd.Series(data=dist, index=self._rec_theta.index) 341 | sim_docs_ids = dist_series.nsmallest(rec_docs_count + 1).index 342 | return sim_docs_ids[1:] # Not counting the `doc` itself. 
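    # Added note (illustrative, not part of the original module):
    # recommend_docs_by_doc above ranks candidate documents by the Hellinger distance
    # between their level-0 topic profiles p(t|d). In the topical-similarity notebook
    # under experiments/ the metric is the unnormalized form
    #
    #     hellinger_dist(p, q) = || sqrt(p) - sqrt(q) ||_2
    #
    # (the canonical definition carries an extra 1/sqrt(2) factor, which does not
    # change the ranking), and the code above assumes the same definition. A
    # self-contained sketch of the ranking step on toy data standing in for
    # self._rec_theta:
    #
    #     import numpy as np
    #     import pandas as pd
    #     from scipy.linalg import norm
    #     from sklearn.metrics.pairwise import pairwise_distances
    #
    #     hellinger = lambda p, q: norm(np.sqrt(p) - np.sqrt(q))
    #     theta = pd.DataFrame([[0.7, 0.2, 0.1],
    #                           [0.6, 0.3, 0.1],
    #                           [0.1, 0.1, 0.8]],
    #                          index=["doc_a", "doc_b", "doc_c"])
    #     dist = pairwise_distances([theta.loc["doc_a"]], theta, hellinger)[0]
    #     pd.Series(dist, index=theta.index).nsmallest(2).index[1:]
    #     # -> Index(['doc_b'], dtype='object')  (nearest neighbour, the query itself excluded)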
343 | 344 | def search_documents(self, query, limit=10): 345 | search_results = self._data_source.search_query_in_models_docs(query, limit) 346 | return self._model.get_topics_by_docs_ids(search_results) 347 | 348 | @property 349 | def data_source(self): 350 | return self._data_source 351 | 352 | @property 353 | def model(self): 354 | return self._model -------------------------------------------------------------------------------- /experiments/Topical Similarity Measurements for ARTM RecSys.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Анализ разных метрик тематической близости документов" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pickle\n", 19 | "import numpy as np\n", 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from scipy.linalg import norm\n", 32 | "from scipy.stats import entropy\n", 33 | "from pymongo import MongoClient\n", 34 | "from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "client = MongoClient()\n", 46 | "collection = client[\"datasets\"][\"postnauka\"]" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "def t(doc_id):\n", 58 | " return collection.find_one(\"pn_%d\" % doc_id)[\"title\"]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "artm_model = pickle.load(open(\"../server/hartm.mdl\", \"rb\"))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "(110, 3446)" 83 | ] 84 | }, 85 | "execution_count": 6, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "artm_model[\"theta\"].shape" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "theta_lvl0 = artm_model[\"theta\"][:10].T" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 8, 108 | "metadata": { 109 | "collapsed": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "theta_lvl1 = artm_model[\"theta\"][10:-70].T" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 9, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "theta_lvl2 = artm_model[\"theta\"][-70:].T" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Близость тематических профилей (косинусная мера)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 10, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "doc_id = 3123" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 
11, 148 | "metadata": { 149 | "collapsed": true 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "# Первый уровень" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 12, 159 | "metadata": { 160 | "collapsed": false, 161 | "scrolled": false 162 | }, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "> Математические методы прогнозирования объемов продаж (doc_id=3123)\n", 169 | "\n", 170 | "Top-5 similar (p(t|d) cosine similarity) documents:\n", 171 | "1. Главы | Закономерности простых чисел. Гипотеза Римана (doc_id=2109, p=0.9985)\n", 172 | "2. По шагам | Теория принятия решений (doc_id=3423, p=0.9976)\n", 173 | "3. 5 книг о поведенческой экономике (doc_id=3344, p=0.9965)\n", 174 | "4. Психология создания трудностей и проблем (doc_id=2988, p=0.9949)\n", 175 | "5. Что такое «робот»? (doc_id=2296, p=0.9943)\n", 176 | "6. Курс «Теория принятия решений: математические модели выбора» (doc_id=3181, p=0.9939)\n", 177 | "7. Задачи и проблемы в мышлении (doc_id=1665, p=0.9936)\n", 178 | "8. Эмоциональные вычисления (doc_id=2295, p=0.9933)\n", 179 | "9. Марвин Мински и эмоциональные машины (doc_id=3069, p=0.9928)\n", 180 | "10. Дэвид Вернон: «То, что мы называем искусственным интеллектом, им не является» (doc_id=2256, p=0.9927)\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "theta_lvl = theta_lvl0.sort_index()\n", 186 | "print(\"> %s (doc_id=%d)\\n\" % (t(doc_id), doc_id))\n", 187 | "sim_matrix = cosine_similarity([theta_lvl.loc[doc_id]], theta_lvl)\n", 188 | "print(\"Top-5 similar (p(t|d) cosine similarity) documents:\")\n", 189 | "for rid, (prob, sim_doc_id) in enumerate(zip(np.sort(sim_matrix)[0, -11:-1][::-1], np.argsort(sim_matrix)[0, -11:-1][::-1])):\n", 190 | " print(\"%d. %s (doc_id=%d, p=%.4f)\" % (rid + 1, t(sim_doc_id + 1), sim_doc_id + 1, prob))" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "## Близость тематических профилей (KL-дивергенция)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 13, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "doc_id = 3123" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 14, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "def sym_kl(p, q):\n", 220 | " return entropy(p, q) + entropy(q, p)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 15, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "# Первый уровень" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 16, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "> Математические методы прогнозирования объемов продаж (doc_id=3123)\n", 246 | "\n", 247 | "Top-10 similar (p(t|d) symmetric KL-divergence) documents:\n", 248 | "1. Главы | Закономерности простых чисел. Гипотеза Римана (doc_id=2109, p=0.0967)\n", 249 | "2. Главы | Методы измерения данных (doc_id=1051, p=0.1277)\n", 250 | "3. Марвин Мински и эмоциональные машины (doc_id=3069, p=0.1452)\n", 251 | "4. Построение сложных вероятностных моделей (doc_id=2892, p=0.1529)\n", 252 | "5. Эмоциональные вычисления (doc_id=2295, p=0.1542)\n", 253 | "6. Психология создания трудностей и проблем (doc_id=2988, p=0.1583)\n", 254 | "7. 
FAQ: Компьютерные доказательства (doc_id=1121, p=0.1693)\n", 255 | "8. 5 книг о поведенческой экономике (doc_id=3344, p=0.2286)\n", 256 | "9. По шагам | Теория принятия решений (doc_id=3423, p=0.2335)\n", 257 | "10. Исследования мышления в когнитивной психологии (doc_id=2947, p=0.2378)\n", 258 | "CPU times: user 64 ms, sys: 28 ms, total: 92 ms\n", 259 | "Wall time: 290 ms\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "%%time\n", 265 | "\n", 266 | "theta_lvl = theta_lvl0.sort_index()\n", 267 | "print(\"> %s (doc_id=%d)\\n\" % (t(doc_id), doc_id))\n", 268 | "sim_matrix = pairwise_distances([theta_lvl.loc[doc_id]], theta_lvl, sym_kl, n_jobs=-1)\n", 269 | "print(\"Top-10 similar (p(t|d) symmetric KL-divergence) documents:\")\n", 270 | "for rid, (prob, sim_doc_id) in enumerate(zip(np.sort(sim_matrix)[0, 1:11], np.argsort(sim_matrix)[0, 1:11])):\n", 271 | " print(\"%d. %s (doc_id=%d, p=%.4f)\" % (rid + 1, t(sim_doc_id + 1), sim_doc_id + 1, prob))" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "## Близость тематических профилей (расстояние Хеллингера)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 17, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "doc_id = 3123" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 18, 295 | "metadata": { 296 | "collapsed": true 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "def hellinger_dist(p, q):\n", 301 | " return norm(np.sqrt(p) - np.sqrt(q))" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 19, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "# Первый уровень" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 20, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "> Математические методы прогнозирования объемов продаж (doc_id=3123)\n", 327 | "\n", 328 | "Top-10 similar (p(t|d) hellinger distance) documents:\n", 329 | "1. Главы | Закономерности простых чисел. Гипотеза Римана (doc_id=2109, p=0.1408)\n", 330 | "2. Главы | Методы измерения данных (doc_id=1051, p=0.1643)\n", 331 | "3. Психология создания трудностей и проблем (doc_id=2988, p=0.1798)\n", 332 | "4. Построение сложных вероятностных моделей (doc_id=2892, p=0.1827)\n", 333 | "5. Марвин Мински и эмоциональные машины (doc_id=3069, p=0.1861)\n", 334 | "6. Эмоциональные вычисления (doc_id=2295, p=0.1918)\n", 335 | "7. FAQ: Компьютерные доказательства (doc_id=1121, p=0.1927)\n", 336 | "8. По шагам | Теория принятия решений (doc_id=3423, p=0.2098)\n", 337 | "9. 5 книг о поведенческой экономике (doc_id=3344, p=0.2142)\n", 338 | "10. «Разработка операционной системы рыночного уровня должна укладываться в 1 млрд долларов» (doc_id=1734, p=0.2190)\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "theta_lvl = theta_lvl0.sort_index()\n", 344 | "print(\"> %s (doc_id=%d)\\n\" % (t(doc_id), doc_id))\n", 345 | "sim_matrix = pairwise_distances([theta_lvl.loc[doc_id]], theta_lvl, hellinger_dist)\n", 346 | "print(\"Top-10 similar (p(t|d) hellinger distance) documents:\")\n", 347 | "for rid, (prob, sim_doc_id) in enumerate(zip(np.sort(sim_matrix)[0, 1:11], np.argsort(sim_matrix)[0, 1:11])):\n", 348 | " print(\"%d. 
%s (doc_id=%d, p=%.4f)\" % (rid + 1, t(sim_doc_id + 1), sim_doc_id + 1, prob))" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "## Анализ тематических профилей" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 21, 361 | "metadata": { 362 | "collapsed": false 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "theta_lvl = theta_lvl1.sort_index()" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 22, 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "level1_topic_0 0.006610\n", 380 | "level1_topic_1 0.000028\n", 381 | "level1_topic_2 0.000386\n", 382 | "level1_topic_3 0.000023\n", 383 | "level1_topic_4 0.009620\n", 384 | "level1_topic_5 0.001744\n", 385 | "level1_topic_6 0.026533\n", 386 | "level1_topic_7 0.000150\n", 387 | "level1_topic_8 0.048912\n", 388 | "level1_topic_9 0.000553\n", 389 | "level1_topic_10 0.000025\n", 390 | "level1_topic_11 0.000033\n", 391 | "level1_topic_12 0.001444\n", 392 | "level1_topic_13 0.000027\n", 393 | "level1_topic_14 0.000031\n", 394 | "level1_topic_15 0.033075\n", 395 | "level1_topic_16 0.280329\n", 396 | "level1_topic_17 0.051039\n", 397 | "level1_topic_18 0.302440\n", 398 | "level1_topic_19 0.153228\n", 399 | "level1_topic_20 0.000022\n", 400 | "level1_topic_21 0.000023\n", 401 | "level1_topic_22 0.027129\n", 402 | "level1_topic_23 0.027075\n", 403 | "level1_topic_24 0.006719\n", 404 | "level1_topic_25 0.000036\n", 405 | "level1_topic_26 0.009875\n", 406 | "level1_topic_27 0.000219\n", 407 | "level1_topic_28 0.004038\n", 408 | "level1_topic_29 0.008635\n", 409 | "Name: 3123, dtype: float32" 410 | ] 411 | }, 412 | "execution_count": 22, 413 | "metadata": {}, 414 | "output_type": "execute_result" 415 | } 416 | ], 417 | "source": [ 418 | "theta_lvl.loc[3123]" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 23, 424 | "metadata": { 425 | "collapsed": false 426 | }, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "level1_topic_0 0.009424\n", 432 | "level1_topic_1 0.000032\n", 433 | "level1_topic_2 0.001288\n", 434 | "level1_topic_3 0.000033\n", 435 | "level1_topic_4 0.017803\n", 436 | "level1_topic_5 0.002538\n", 437 | "level1_topic_6 0.008913\n", 438 | "level1_topic_7 0.000083\n", 439 | "level1_topic_8 0.053283\n", 440 | "level1_topic_9 0.001030\n", 441 | "level1_topic_10 0.003718\n", 442 | "level1_topic_11 0.000185\n", 443 | "level1_topic_12 0.000032\n", 444 | "level1_topic_13 0.000021\n", 445 | "level1_topic_14 0.001256\n", 446 | "level1_topic_15 0.027246\n", 447 | "level1_topic_16 0.258919\n", 448 | "level1_topic_17 0.028847\n", 449 | "level1_topic_18 0.353488\n", 450 | "level1_topic_19 0.153007\n", 451 | "level1_topic_20 0.000026\n", 452 | "level1_topic_21 0.000025\n", 453 | "level1_topic_22 0.005661\n", 454 | "level1_topic_23 0.032550\n", 455 | "level1_topic_24 0.013211\n", 456 | "level1_topic_25 0.000029\n", 457 | "level1_topic_26 0.012926\n", 458 | "level1_topic_27 0.002171\n", 459 | "level1_topic_28 0.006999\n", 460 | "level1_topic_29 0.005257\n", 461 | "Name: 2257, dtype: float32" 462 | ] 463 | }, 464 | "execution_count": 23, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "theta_lvl.loc[2257]" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 24, 476 | "metadata": { 477 | "collapsed": false 478 | }, 479 | "outputs": [ 480 | { 481 | 
"data": { 482 | "text/plain": [ 483 | "level1_topic_0 0.010725\n", 484 | "level1_topic_1 0.000212\n", 485 | "level1_topic_2 0.000017\n", 486 | "level1_topic_3 0.000021\n", 487 | "level1_topic_4 0.019576\n", 488 | "level1_topic_5 0.007783\n", 489 | "level1_topic_6 0.001676\n", 490 | "level1_topic_7 0.002375\n", 491 | "level1_topic_8 0.013219\n", 492 | "level1_topic_9 0.000790\n", 493 | "level1_topic_10 0.000021\n", 494 | "level1_topic_11 0.000479\n", 495 | "level1_topic_12 0.001415\n", 496 | "level1_topic_13 0.000022\n", 497 | "level1_topic_14 0.000086\n", 498 | "level1_topic_15 0.124689\n", 499 | "level1_topic_16 0.253682\n", 500 | "level1_topic_17 0.022450\n", 501 | "level1_topic_18 0.295885\n", 502 | "level1_topic_19 0.126896\n", 503 | "level1_topic_20 0.000022\n", 504 | "level1_topic_21 0.000712\n", 505 | "level1_topic_22 0.000017\n", 506 | "level1_topic_23 0.039969\n", 507 | "level1_topic_24 0.023700\n", 508 | "level1_topic_25 0.001273\n", 509 | "level1_topic_26 0.008254\n", 510 | "level1_topic_27 0.005643\n", 511 | "level1_topic_28 0.017148\n", 512 | "level1_topic_29 0.021242\n", 513 | "Name: 1734, dtype: float32" 514 | ] 515 | }, 516 | "execution_count": 24, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "theta_lvl.loc[1734]" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": {}, 528 | "source": [ 529 | "---" 530 | ] 531 | } 532 | ], 533 | "metadata": { 534 | "kernelspec": { 535 | "display_name": "Python 3", 536 | "language": "python", 537 | "name": "python3" 538 | }, 539 | "language_info": { 540 | "codemirror_mode": { 541 | "name": "ipython", 542 | "version": 3 543 | }, 544 | "file_extension": ".py", 545 | "mimetype": "text/x-python", 546 | "name": "python", 547 | "nbconvert_exporter": "python", 548 | "pygments_lexer": "ipython3", 549 | "version": "3.5.2" 550 | }, 551 | "latex_envs": { 552 | "bibliofile": "biblio.bib", 553 | "cite_by": "apalike", 554 | "current_citInitial": 1, 555 | "eqLabelWithNumbers": true, 556 | "eqNumInitial": 0 557 | } 558 | }, 559 | "nbformat": 4, 560 | "nbformat_minor": 0 561 | } 562 | -------------------------------------------------------------------------------- /experiments/Parsing ruwiki dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": { 7 | "collapsed": true, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import sys\n", 14 | "import csv\n", 15 | "import unicodedata\n", 16 | "import numpy as np\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 7, 23 | "metadata": { 24 | "collapsed": true, 25 | "deletable": true, 26 | "editable": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "from pymystem3 import Mystem\n", 31 | "from collections import Counter\n", 32 | "from multiprocessing import Pool\n", 33 | "from IPython.display import display\n", 34 | "from ipywidgets import FloatProgress\n", 35 | "from sklearn.pipeline import Pipeline\n", 36 | "from parsers.text_utils import DefaultTextProcessor, Lemmatizer" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 9, 42 | "metadata": { 43 | "collapsed": false, 44 | "deletable": true, 45 | "editable": true 46 | }, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "9223372036854775807" 52 | ] 53 | }, 54 | "execution_count": 9, 55 | "metadata": {}, 56 | 
"output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "csv.field_size_limit(sys.maxsize)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "deletable": true, 67 | "editable": true 68 | }, 69 | "source": [ 70 | "Разобьём процесс на две части — токенизацию документов (без фильтрации) и, собственно, лемматизацию." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 29, 76 | "metadata": { 77 | "collapsed": false, 78 | "deletable": true, 79 | "editable": true 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "CPU times: user 40min 14s, sys: 29.9 s, total: 40min 44s\n", 87 | "Wall time: 40min 44s\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "%%time\n", 93 | "\n", 94 | "# Проделаем токенизацию с сохранением промежуточного\n", 95 | "# состояния в ruwiki.tonekized.csv.tmp\n", 96 | "\n", 97 | "tokenizer = DefaultTextProcessor()\n", 98 | "\n", 99 | "# 1361758 — предподсчитанное кол-во документов\n", 100 | "f = FloatProgress(min=0, max=1361758)\n", 101 | "display(f)\n", 102 | "\n", 103 | "unused_char = '\\U00037b84'\n", 104 | "def strip_accents(s):\n", 105 | " s = s.replace(\"й\", unused_char)\n", 106 | " return \"\".join((c for c in unicodedata.normalize(\"NFD\", s) if unicodedata.category(c) != \"Mn\")).replace(unused_char, \"й\")\n", 107 | "\n", 108 | "def remove_underscores(s):\n", 109 | " return s.replace(\"_\", \"\")\n", 110 | "\n", 111 | "with open(\"../datasets/ruwiki/ruwiki.plain.csv\", \"r\") as infile:\n", 112 | " with open(\"ruwiki.tonekized.csv.tmp\", \"w\") as outfile:\n", 113 | " reader = csv.reader(infile)\n", 114 | " writer = csv.writer(outfile)\n", 115 | " count = 0\n", 116 | " cached_rows = []\n", 117 | " for title, text in reader:\n", 118 | " text = strip_accents(text)\n", 119 | " text = remove_underscores(text)\n", 120 | " tokens = tokenizer.fit_transform(text)\n", 121 | " cached_rows.append((title, \" \".join(tokens)))\n", 122 | " count += 1\n", 123 | " if count % 1000 == 0:\n", 124 | " writer.writerows(cached_rows)\n", 125 | " outfile.flush()\n", 126 | " f.value += len(cached_rows)\n", 127 | " cached_rows = []\n", 128 | " # Запишем оставшиеся строчки\n", 129 | " writer.writerows(cached_rows)\n", 130 | " f.value += len(cached_rows)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 30, 136 | "metadata": { 137 | "collapsed": false, 138 | "deletable": true, 139 | "editable": true 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "CPU times: user 5min 17s, sys: 2min 17s, total: 7min 35s\n", 147 | "Wall time: 3h 13min 15s\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "%%time\n", 153 | "\n", 154 | "# Теперь сделаем лемматизацию всех документов при помощи pymystem3\n", 155 | "# Распараллеливая процесс на N_PROCS процессоров\n", 156 | "\n", 157 | "N_PROCS = 4\n", 158 | "\n", 159 | "# 1361758 — предподсчитанное кол-во документов\n", 160 | "f = FloatProgress(min=0, max=1361758)\n", 161 | "display(f)\n", 162 | "\n", 163 | "m = Mystem()\n", 164 | "\n", 165 | "def lemmatize(text):\n", 166 | " return \"\".join(m.lemmatize(text)).strip()\n", 167 | "\n", 168 | "with open(\"ruwiki.tonekized.csv.tmp\", \"r\") as infile:\n", 169 | " with open(\"../datasets/ruwiki/ruwiki.lemmatized.csv\", \"w\") as outfile:\n", 170 | " reader = csv.reader(infile)\n", 171 | " writer = csv.writer(outfile)\n", 172 | " count = 0\n", 173 | " cached_titles = []\n", 174 | " cached_texts = []\n", 175 | " for 
title, text in reader:\n", 176 | " cached_titles.append(title)\n", 177 | " cached_texts.append(text)\n", 178 | " count += 1\n", 179 | " if count % 1000 == 0:\n", 180 | " with Pool(N_PROCS) as p:\n", 181 | " lemmatized_texts = p.map(lemmatize, cached_texts)\n", 182 | " writer.writerows(zip(cached_titles, lemmatized_texts))\n", 183 | " outfile.flush()\n", 184 | " f.value += len(cached_titles)\n", 185 | " cached_texts = []\n", 186 | " cached_titles = []\n", 187 | " # Запишем оставшиеся строчки\n", 188 | " with Pool(N_PROCS) as p:\n", 189 | " lemmatized_texts = p.map(lemmatize, cached_texts)\n", 190 | " writer.writerows(zip(cached_titles, lemmatized_texts))\n", 191 | " f.value += len(cached_titles)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "deletable": true, 198 | "editable": true 199 | }, 200 | "source": [ 201 | "Посчитаем размер словаря, из которого состоит неотфильтрованная лемматизированная выборка." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 31, 207 | "metadata": { 208 | "collapsed": false, 209 | "deletable": true, 210 | "editable": true 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "CPU times: user 31min 53s, sys: 2min 5s, total: 33min 59s\n", 218 | "Wall time: 31min 53s\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "%%time\n", 224 | "\n", 225 | "dictionary = Counter()\n", 226 | "\n", 227 | "# 1361758 — предподсчитанное кол-во документов\n", 228 | "f = FloatProgress(min=0, max=1361758)\n", 229 | "display(f)\n", 230 | "\n", 231 | "with open(\"../datasets/ruwiki/ruwiki.lemmatized.csv\", \"r\") as infile:\n", 232 | " reader = csv.reader(infile)\n", 233 | " for title, text in reader:\n", 234 | " tokens = text.split()\n", 235 | " dictionary.update(tokens)\n", 236 | " f.value += 1" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 32, 242 | "metadata": { 243 | "collapsed": false, 244 | "deletable": true, 245 | "editable": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "dict_series = pd.DataFrame.from_dict(dictionary, orient=\"index\")[0]" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 33, 255 | "metadata": { 256 | "collapsed": false, 257 | "deletable": true, 258 | "editable": true 259 | }, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "3924272" 265 | ] 266 | }, 267 | "execution_count": 33, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "len(dict_series)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 34, 279 | "metadata": { 280 | "collapsed": false, 281 | "deletable": true, 282 | "editable": true 283 | }, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "533185" 289 | ] 290 | }, 291 | "execution_count": 34, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "(dict_series > 10).value_counts()[True]" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": { 303 | "deletable": true, 304 | "editable": true 305 | }, 306 | "source": [ 307 | "Будем считать, что лемма должна встретиться более 10 раз в коллекции, чтобы мы положили её в словарь. Это сократит размер словаря в 8 раз от первоначального объёма." 
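A minimal sketch of the frequency threshold described in the cell above: a lemma is kept
only if it occurs more than 10 times in the whole collection (a total-occurrence count,
not a document-frequency count). The actual cells below express the same filter through a
pandas Series; here `dictionary` is the collections.Counter built over the lemmatized
documents earlier in this notebook:

    # keep lemmas that occur more than 10 times across the whole collection
    common_words = {w for w, c in dictionary.items() if c > 10}
    # 533,185 of 3,924,272 lemmas survive, i.e. the vocabulary shrinks roughly 7-8x,
    # as noted above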
308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "deletable": true, 314 | "editable": true 315 | }, 316 | "source": [ 317 | "Посмотрим на топ-50 слов:" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 35, 323 | "metadata": { 324 | "collapsed": false, 325 | "deletable": true, 326 | "editable": true 327 | }, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "Index(['в', 'и', 'год', 'на', 'с', 'быть', 'по', 'из', 'он', 'который', 'а',\n", 333 | " 'к', 'не', 'что', 'от', 'для', 'за', '1', 'как', 'этот', 'свой', '2',\n", 334 | " 'также', 'до', 'первый', 'время', 'о', 'его', 'после', 'они', '3',\n", 335 | " 'район', 'один', 'то', 'становиться', 'при', 'г', 'город', '5',\n", 336 | " 'примечание', 'ссылка', 'человек', 'м', 'тот', 'область', 'во', 'это',\n", 337 | " 'она', 'весь', 'но'],\n", 338 | " dtype='object')" 339 | ] 340 | }, 341 | "execution_count": 35, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "top50_words = dict_series.sort_values(ascending=False)[:50].index\n", 348 | "top50_words" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": { 354 | "deletable": true, 355 | "editable": true 356 | }, 357 | "source": [ 358 | "Слов, которые могли бы иметь выраженную тематику, здесь почти нет, зато довольно много мусорных и общих слов. Будем выбрасывать слова, входящие в этот список, из документов." 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": { 364 | "deletable": true, 365 | "editable": true 366 | }, 367 | "source": [ 368 | "Также будем фильтровать слова по стоп-словарю." 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 36, 374 | "metadata": { 375 | "collapsed": true, 376 | "deletable": true, 377 | "editable": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "stop_words = set(map(str.strip, open(\"../datasets/ruwiki/stopwords.txt\").readlines()))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 37, 387 | "metadata": { 388 | "collapsed": false, 389 | "deletable": true, 390 | "editable": true 391 | }, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | "CPU times: user 43min 12s, sys: 2min 23s, total: 45min 35s\n", 398 | "Wall time: 43min 6s\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "%%time\n", 404 | "\n", 405 | "common_words = dict_series[dict_series > 10].index\n", 406 | "\n", 407 | "# 1361758 — предподсчитанное кол-во документов\n", 408 | "f = FloatProgress(min=0, max=1361758)\n", 409 | "display(f)\n", 410 | "\n", 411 | "def accept_word(w):\n", 412 | " return w not in stop_words and w not in top50_words and w in common_words\n", 413 | "\n", 414 | "with open(\"../datasets/ruwiki/ruwiki.lemmatized.csv\", \"r\") as infile:\n", 415 | " with open(\"../datasets/ruwiki/ruwiki.filtered.csv\", \"w\") as outfile:\n", 416 | " reader = csv.reader(infile)\n", 417 | " writer = csv.writer(outfile)\n", 418 | " for title, text in reader:\n", 419 | " tokens = text.split()\n", 420 | " writer.writerow((title, \" \".join(filter(accept_word, tokens))))\n", 421 | " f.value += 1" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": { 427 | "deletable": true, 428 | "editable": true 429 | }, 430 | "source": [ 431 | "Наконец, превратим коллекцию с отфильтрованным словарём в файл UCI Bag-of-words." 
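For reference, the UCI Bag-of-Words layout produced by the cells below consists of two
plain-text files (a sketch of the expected contents; the example tokens are hypothetical,
and both docID and wordID are written 1-based):

    # vocab.ruwiki.csv: one token per line, followed by its BigARTM modality name
    #     аббатство text
    #     аббревиатура text
    #     ...
    #
    # docword.ruwiki.txt: three header lines with the collection sizes, then one
    # "docID wordID count" triple per nonzero (document, word) pair
    #     <header line 1>
    #     <header line 2>
    #     <header line 3>
    #     1 17 3
    #     1 42 1
    #     ...

Note that the canonical UCI header order is D, W, NNZ (number of documents, vocabulary
size, number of nonzero pairs), while the conversion cell below writes the vocabulary
size before the document count, so it is worth double-checking which order the
downstream parser expects.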
432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": { 437 | "deletable": true, 438 | "editable": true 439 | }, 440 | "source": [ 441 | "Для начала построим словарь по отфильтрованной коллекции." 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 11, 447 | "metadata": { 448 | "collapsed": false, 449 | "deletable": true, 450 | "editable": true 451 | }, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "CPU times: user 28min 54s, sys: 2min 7s, total: 31min 2s\n", 458 | "Wall time: 28min 59s\n" 459 | ] 460 | } 461 | ], 462 | "source": [ 463 | "%%time\n", 464 | "\n", 465 | "dictionary = set()\n", 466 | "bow_length = 0\n", 467 | "\n", 468 | "# 1361758 — предподсчитанное кол-во документов\n", 469 | "f = FloatProgress(min=0, max=1361758)\n", 470 | "display(f)\n", 471 | "\n", 472 | "with open(\"../datasets/ruwiki/ruwiki.filtered.csv\", \"r\") as infile:\n", 473 | " reader = csv.reader(infile)\n", 474 | " for title, text in reader:\n", 475 | " tokens = set(text.split())\n", 476 | " dictionary.update(tokens)\n", 477 | " bow_length += len(tokens)\n", 478 | " f.value += 1" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 12, 484 | "metadata": { 485 | "collapsed": false, 486 | "deletable": true, 487 | "editable": true 488 | }, 489 | "outputs": [ 490 | { 491 | "data": { 492 | "text/plain": [ 493 | "532345" 494 | ] 495 | }, 496 | "execution_count": 12, 497 | "metadata": {}, 498 | "output_type": "execute_result" 499 | } 500 | ], 501 | "source": [ 502 | "len(dictionary)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 13, 508 | "metadata": { 509 | "collapsed": false, 510 | "deletable": true, 511 | "editable": true 512 | }, 513 | "outputs": [ 514 | { 515 | "data": { 516 | "text/plain": [ 517 | "185333372" 518 | ] 519 | }, 520 | "execution_count": 13, 521 | "metadata": {}, 522 | "output_type": "execute_result" 523 | } 524 | ], 525 | "source": [ 526 | "bow_length" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": { 532 | "deletable": true, 533 | "editable": true 534 | }, 535 | "source": [ 536 | "Запишем словарь в файл и переконвертируем документы." 
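One plausible way to consume the resulting docword/vocab pair is BigARTM's UCI reader
(a sketch, assuming the artm.BatchVectorizer API already used elsewhere in this
repository; the paths and target folder are illustrative):

    import artm

    # Parse the UCI pair once into BigARTM batches and gather a dictionary for training.
    # The bow_uci reader conventionally looks for docword.<name>.txt and vocab.<name>.txt
    # inside data_path, so the vocab.ruwiki.csv file written below may need to be renamed
    # accordingly.
    batch_vectorizer = artm.BatchVectorizer(data_format="bow_uci",
                                            collection_name="ruwiki",
                                            data_path="../datasets/ruwiki/",
                                            target_folder="../datasets/ruwiki/batches")
    ruwiki_dictionary = batch_vectorizer.dictionary

    # Caveat: the conversion cell below reads ruwiki.filtered.txt while the filtered
    # collection was saved above as ruwiki.filtered.csv; this appears to be the same
    # file, but the extension is worth checking before a long run.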
537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 14, 542 | "metadata": { 543 | "collapsed": true, 544 | "deletable": true, 545 | "editable": true 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "dict_mapping = dict(zip(dictionary, range(len(dictionary))))\n", 550 | "dict_ordering = sorted(zip(range(len(dictionary)), dictionary))" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 15, 556 | "metadata": { 557 | "collapsed": false, 558 | "deletable": true, 559 | "editable": true 560 | }, 561 | "outputs": [ 562 | { 563 | "name": "stdout", 564 | "output_type": "stream", 565 | "text": [ 566 | "CPU times: user 379 ms, sys: 10 ms, total: 389 ms\n", 567 | "Wall time: 388 ms\n" 568 | ] 569 | } 570 | ], 571 | "source": [ 572 | "%%time\n", 573 | "\n", 574 | "with open(\"../datasets/ruwiki/vocab.ruwiki.csv\", \"w\") as dictfile:\n", 575 | " for _, word in dict_ordering:\n", 576 | " dictfile.write(\"%s text\\n\" % word)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 16, 582 | "metadata": { 583 | "collapsed": false, 584 | "deletable": true, 585 | "editable": true 586 | }, 587 | "outputs": [ 588 | { 589 | "name": "stdout", 590 | "output_type": "stream", 591 | "text": [ 592 | "CPU times: user 41min 14s, sys: 2min 41s, total: 43min 56s\n", 593 | "Wall time: 41min 27s\n" 594 | ] 595 | } 596 | ], 597 | "source": [ 598 | "%%time\n", 599 | "\n", 600 | "# 1361758 is the precomputed number of documents\n", 601 | "doc_count = 1361758\n", 602 | "f = FloatProgress(min=0, max=doc_count)\n", 603 | "display(f)\n", 604 | "\n", 605 | "with open(\"../datasets/ruwiki/docword.ruwiki.txt\", \"w\") as docwordfile:\n", 606 | " docwordfile.write(\"%d\\n%d\\n%d\\n\" % (doc_count, len(dictionary), bow_length))  # UCI header order: D, W, NNZ\n", 607 | " with open(\"../datasets/ruwiki/ruwiki.filtered.csv\", \"r\") as infile:\n", 608 | " reader = csv.reader(infile)\n", 609 | " for docID, (title, text) in enumerate(reader):\n", 610 | " for word, count in Counter(text.split()).items():\n", 611 | " docwordfile.write(\"%d %d %d\\n\" % (docID + 1, dict_mapping[word] + 1, count))\n", 612 | " f.value += 1" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": { 618 | "deletable": true, 619 | "editable": true 620 | }, 621 | "source": [ 622 | "---" 623 | ] 624 | } 625 | ], 626 | "metadata": { 627 | "kernelspec": { 628 | "display_name": "Python 3", 629 | "language": "python", 630 | "name": "python3" 631 | }, 632 | "language_info": { 633 | "codemirror_mode": { 634 | "name": "ipython", 635 | "version": 3 636 | }, 637 | "file_extension": ".py", 638 | "mimetype": "text/x-python", 639 | "name": "python", 640 | "nbconvert_exporter": "python", 641 | "pygments_lexer": "ipython3", 642 | "version": "3.5.3" 643 | }, 644 | "latex_envs": { 645 | "bibliofile": "biblio.bib", 646 | "cite_by": "apalike", 647 | "current_citInitial": 1, 648 | "eqLabelWithNumbers": true, 649 | "eqNumInitial": 0 650 | } 651 | }, 652 | "nbformat": 4, 653 | "nbformat_minor": 0 654 | } 655 | -------------------------------------------------------------------------------- /server/static/js/hammer.min.js: -------------------------------------------------------------------------------- 1 | /*!
Hammer.JS - v2.0.8 - 2016-04-23 2 | * http://hammerjs.github.io/ 3 | * 4 | * Copyright (c) 2016 Jorik Tangelder; 5 | * Licensed under the MIT license */ 6 | !function(a,b,c,d){"use strict";function e(a,b,c){return setTimeout(j(a,c),b)}function f(a,b,c){return Array.isArray(a)?(g(a,c[b],c),!0):!1}function g(a,b,c){var e;if(a)if(a.forEach)a.forEach(b,c);else if(a.length!==d)for(e=0;e\s*\(/gm,"{anonymous}()@"):"Unknown Stack Trace",f=a.console&&(a.console.warn||a.console.log);return f&&f.call(a.console,e,d),b.apply(this,arguments)}}function i(a,b,c){var d,e=b.prototype;d=a.prototype=Object.create(e),d.constructor=a,d._super=e,c&&la(d,c)}function j(a,b){return function(){return a.apply(b,arguments)}}function k(a,b){return typeof a==oa?a.apply(b?b[0]||d:d,b):a}function l(a,b){return a===d?b:a}function m(a,b,c){g(q(b),function(b){a.addEventListener(b,c,!1)})}function n(a,b,c){g(q(b),function(b){a.removeEventListener(b,c,!1)})}function o(a,b){for(;a;){if(a==b)return!0;a=a.parentNode}return!1}function p(a,b){return a.indexOf(b)>-1}function q(a){return a.trim().split(/\s+/g)}function r(a,b,c){if(a.indexOf&&!c)return a.indexOf(b);for(var d=0;dc[b]}):d.sort()),d}function u(a,b){for(var c,e,f=b[0].toUpperCase()+b.slice(1),g=0;g1&&!c.firstMultiple?c.firstMultiple=D(b):1===e&&(c.firstMultiple=!1);var f=c.firstInput,g=c.firstMultiple,h=g?g.center:f.center,i=b.center=E(d);b.timeStamp=ra(),b.deltaTime=b.timeStamp-f.timeStamp,b.angle=I(h,i),b.distance=H(h,i),B(c,b),b.offsetDirection=G(b.deltaX,b.deltaY);var j=F(b.deltaTime,b.deltaX,b.deltaY);b.overallVelocityX=j.x,b.overallVelocityY=j.y,b.overallVelocity=qa(j.x)>qa(j.y)?j.x:j.y,b.scale=g?K(g.pointers,d):1,b.rotation=g?J(g.pointers,d):0,b.maxPointers=c.prevInput?b.pointers.length>c.prevInput.maxPointers?b.pointers.length:c.prevInput.maxPointers:b.pointers.length,C(c,b);var k=a.element;o(b.srcEvent.target,k)&&(k=b.srcEvent.target),b.target=k}function B(a,b){var c=b.center,d=a.offsetDelta||{},e=a.prevDelta||{},f=a.prevInput||{};b.eventType!==Ea&&f.eventType!==Ga||(e=a.prevDelta={x:f.deltaX||0,y:f.deltaY||0},d=a.offsetDelta={x:c.x,y:c.y}),b.deltaX=e.x+(c.x-d.x),b.deltaY=e.y+(c.y-d.y)}function C(a,b){var c,e,f,g,h=a.lastInterval||b,i=b.timeStamp-h.timeStamp;if(b.eventType!=Ha&&(i>Da||h.velocity===d)){var j=b.deltaX-h.deltaX,k=b.deltaY-h.deltaY,l=F(i,j,k);e=l.x,f=l.y,c=qa(l.x)>qa(l.y)?l.x:l.y,g=G(j,k),a.lastInterval=b}else c=h.velocity,e=h.velocityX,f=h.velocityY,g=h.direction;b.velocity=c,b.velocityX=e,b.velocityY=f,b.direction=g}function D(a){for(var b=[],c=0;ce;)c+=a[e].clientX,d+=a[e].clientY,e++;return{x:pa(c/b),y:pa(d/b)}}function F(a,b,c){return{x:b/a||0,y:c/a||0}}function G(a,b){return a===b?Ia:qa(a)>=qa(b)?0>a?Ja:Ka:0>b?La:Ma}function H(a,b,c){c||(c=Qa);var d=b[c[0]]-a[c[0]],e=b[c[1]]-a[c[1]];return Math.sqrt(d*d+e*e)}function I(a,b,c){c||(c=Qa);var d=b[c[0]]-a[c[0]],e=b[c[1]]-a[c[1]];return 180*Math.atan2(e,d)/Math.PI}function J(a,b){return I(b[1],b[0],Ra)+I(a[1],a[0],Ra)}function K(a,b){return H(b[0],b[1],Ra)/H(a[0],a[1],Ra)}function L(){this.evEl=Ta,this.evWin=Ua,this.pressed=!1,x.apply(this,arguments)}function M(){this.evEl=Xa,this.evWin=Ya,x.apply(this,arguments),this.store=this.manager.session.pointerEvents=[]}function N(){this.evTarget=$a,this.evWin=_a,this.started=!1,x.apply(this,arguments)}function O(a,b){var c=s(a.touches),d=s(a.changedTouches);return b&(Ga|Ha)&&(c=t(c.concat(d),"identifier",!0)),[c,d]}function P(){this.evTarget=bb,this.targetIds={},x.apply(this,arguments)}function Q(a,b){var 
c=s(a.touches),d=this.targetIds;if(b&(Ea|Fa)&&1===c.length)return d[c[0].identifier]=!0,[c,c];var e,f,g=s(a.changedTouches),h=[],i=this.target;if(f=c.filter(function(a){return o(a.target,i)}),b===Ea)for(e=0;e-1&&d.splice(a,1)};setTimeout(e,cb)}}function U(a){for(var b=a.srcEvent.clientX,c=a.srcEvent.clientY,d=0;d=f&&db>=g)return!0}return!1}function V(a,b){this.manager=a,this.set(b)}function W(a){if(p(a,jb))return jb;var b=p(a,kb),c=p(a,lb);return b&&c?jb:b||c?b?kb:lb:p(a,ib)?ib:hb}function X(){if(!fb)return!1;var b={},c=a.CSS&&a.CSS.supports;return["auto","manipulation","pan-y","pan-x","pan-x pan-y","none"].forEach(function(d){b[d]=c?a.CSS.supports("touch-action",d):!0}),b}function Y(a){this.options=la({},this.defaults,a||{}),this.id=v(),this.manager=null,this.options.enable=l(this.options.enable,!0),this.state=nb,this.simultaneous={},this.requireFail=[]}function Z(a){return a&sb?"cancel":a&qb?"end":a&pb?"move":a&ob?"start":""}function $(a){return a==Ma?"down":a==La?"up":a==Ja?"left":a==Ka?"right":""}function _(a,b){var c=b.manager;return c?c.get(a):a}function aa(){Y.apply(this,arguments)}function ba(){aa.apply(this,arguments),this.pX=null,this.pY=null}function ca(){aa.apply(this,arguments)}function da(){Y.apply(this,arguments),this._timer=null,this._input=null}function ea(){aa.apply(this,arguments)}function fa(){aa.apply(this,arguments)}function ga(){Y.apply(this,arguments),this.pTime=!1,this.pCenter=!1,this._timer=null,this._input=null,this.count=0}function ha(a,b){return b=b||{},b.recognizers=l(b.recognizers,ha.defaults.preset),new ia(a,b)}function ia(a,b){this.options=la({},ha.defaults,b||{}),this.options.inputTarget=this.options.inputTarget||a,this.handlers={},this.session={},this.recognizers=[],this.oldCssProps={},this.element=a,this.input=y(this),this.touchAction=new V(this,this.options.touchAction),ja(this,!0),g(this.options.recognizers,function(a){var b=this.add(new a[0](a[1]));a[2]&&b.recognizeWith(a[2]),a[3]&&b.requireFailure(a[3])},this)}function ja(a,b){var c=a.element;if(c.style){var d;g(a.options.cssProps,function(e,f){d=u(c.style,f),b?(a.oldCssProps[d]=c.style[d],c.style[d]=e):c.style[d]=a.oldCssProps[d]||""}),b||(a.oldCssProps={})}}function ka(a,c){var d=b.createEvent("Event");d.initEvent(a,!0,!0),d.gesture=c,c.target.dispatchEvent(d)}var la,ma=["","webkit","Moz","MS","ms","o"],na=b.createElement("div"),oa="function",pa=Math.round,qa=Math.abs,ra=Date.now;la="function"!=typeof Object.assign?function(a){if(a===d||null===a)throw new TypeError("Cannot convert undefined or null to object");for(var b=Object(a),c=1;ch&&(b.push(a),h=b.length-1):e&(Ga|Ha)&&(c=!0),0>h||(b[h]=a,this.callback(this.manager,e,{pointers:b,changedPointers:[a],pointerType:f,srcEvent:a}),c&&b.splice(h,1))}});var Za={touchstart:Ea,touchmove:Fa,touchend:Ga,touchcancel:Ha},$a="touchstart",_a="touchstart touchmove touchend touchcancel";i(N,x,{handler:function(a){var b=Za[a.type];if(b===Ea&&(this.started=!0),this.started){var c=O.call(this,a,b);b&(Ga|Ha)&&c[0].length-c[1].length===0&&(this.started=!1),this.callback(this.manager,b,{pointers:c[0],changedPointers:c[1],pointerType:za,srcEvent:a})}}});var ab={touchstart:Ea,touchmove:Fa,touchend:Ga,touchcancel:Ha},bb="touchstart touchmove touchend touchcancel";i(P,x,{handler:function(a){var b=ab[a.type],c=Q.call(this,a,b);c&&this.callback(this.manager,b,{pointers:c[0],changedPointers:c[1],pointerType:za,srcEvent:a})}});var cb=2500,db=25;i(R,x,{handler:function(a,b,c){var 
d=c.pointerType==za,e=c.pointerType==Ba;if(!(e&&c.sourceCapabilities&&c.sourceCapabilities.firesTouchEvents)){if(d)S.call(this,b,c);else if(e&&U.call(this,c))return;this.callback(a,b,c)}},destroy:function(){this.touch.destroy(),this.mouse.destroy()}});var eb=u(na.style,"touchAction"),fb=eb!==d,gb="compute",hb="auto",ib="manipulation",jb="none",kb="pan-x",lb="pan-y",mb=X();V.prototype={set:function(a){a==gb&&(a=this.compute()),fb&&this.manager.element.style&&mb[a]&&(this.manager.element.style[eb]=a),this.actions=a.toLowerCase().trim()},update:function(){this.set(this.manager.options.touchAction)},compute:function(){var a=[];return g(this.manager.recognizers,function(b){k(b.options.enable,[b])&&(a=a.concat(b.getTouchAction()))}),W(a.join(" "))},preventDefaults:function(a){var b=a.srcEvent,c=a.offsetDirection;if(this.manager.session.prevented)return void b.preventDefault();var d=this.actions,e=p(d,jb)&&!mb[jb],f=p(d,lb)&&!mb[lb],g=p(d,kb)&&!mb[kb];if(e){var h=1===a.pointers.length,i=a.distance<2,j=a.deltaTime<250;if(h&&i&&j)return}return g&&f?void 0:e||f&&c&Na||g&&c&Oa?this.preventSrc(b):void 0},preventSrc:function(a){this.manager.session.prevented=!0,a.preventDefault()}};var nb=1,ob=2,pb=4,qb=8,rb=qb,sb=16,tb=32;Y.prototype={defaults:{},set:function(a){return la(this.options,a),this.manager&&this.manager.touchAction.update(),this},recognizeWith:function(a){if(f(a,"recognizeWith",this))return this;var b=this.simultaneous;return a=_(a,this),b[a.id]||(b[a.id]=a,a.recognizeWith(this)),this},dropRecognizeWith:function(a){return f(a,"dropRecognizeWith",this)?this:(a=_(a,this),delete this.simultaneous[a.id],this)},requireFailure:function(a){if(f(a,"requireFailure",this))return this;var b=this.requireFail;return a=_(a,this),-1===r(b,a)&&(b.push(a),a.requireFailure(this)),this},dropRequireFailure:function(a){if(f(a,"dropRequireFailure",this))return this;a=_(a,this);var b=r(this.requireFail,a);return b>-1&&this.requireFail.splice(b,1),this},hasRequireFailures:function(){return this.requireFail.length>0},canRecognizeWith:function(a){return!!this.simultaneous[a.id]},emit:function(a){function b(b){c.manager.emit(b,a)}var c=this,d=this.state;qb>d&&b(c.options.event+Z(d)),b(c.options.event),a.additionalEvent&&b(a.additionalEvent),d>=qb&&b(c.options.event+Z(d))},tryEmit:function(a){return this.canEmit()?this.emit(a):void(this.state=tb)},canEmit:function(){for(var a=0;af?Ja:Ka,c=f!=this.pX,d=Math.abs(a.deltaX)):(e=0===g?Ia:0>g?La:Ma,c=g!=this.pY,d=Math.abs(a.deltaY))),a.direction=e,c&&d>b.threshold&&e&b.direction},attrTest:function(a){return aa.prototype.attrTest.call(this,a)&&(this.state&ob||!(this.state&ob)&&this.directionTest(a))},emit:function(a){this.pX=a.deltaX,this.pY=a.deltaY;var b=$(a.direction);b&&(a.additionalEvent=this.options.event+b),this._super.emit.call(this,a)}}),i(ca,aa,{defaults:{event:"pinch",threshold:0,pointers:2},getTouchAction:function(){return[jb]},attrTest:function(a){return this._super.attrTest.call(this,a)&&(Math.abs(a.scale-1)>this.options.threshold||this.state&ob)},emit:function(a){if(1!==a.scale){var b=a.scale<1?"in":"out";a.additionalEvent=this.options.event+b}this._super.emit.call(this,a)}}),i(da,Y,{defaults:{event:"press",pointers:1,time:251,threshold:9},getTouchAction:function(){return[hb]},process:function(a){var b=this.options,c=a.pointers.length===b.pointers,d=a.distanceb.time;if(this._input=a,!d||!c||a.eventType&(Ga|Ha)&&!f)this.reset();else if(a.eventType&Ea)this.reset(),this._timer=e(function(){this.state=rb,this.tryEmit()},b.time,this);else if(a.eventType&Ga)return 
rb;return tb},reset:function(){clearTimeout(this._timer)},emit:function(a){this.state===rb&&(a&&a.eventType&Ga?this.manager.emit(this.options.event+"up",a):(this._input.timeStamp=ra(),this.manager.emit(this.options.event,this._input)))}}),i(ea,aa,{defaults:{event:"rotate",threshold:0,pointers:2},getTouchAction:function(){return[jb]},attrTest:function(a){return this._super.attrTest.call(this,a)&&(Math.abs(a.rotation)>this.options.threshold||this.state&ob)}}),i(fa,aa,{defaults:{event:"swipe",threshold:10,velocity:.3,direction:Na|Oa,pointers:1},getTouchAction:function(){return ba.prototype.getTouchAction.call(this)},attrTest:function(a){var b,c=this.options.direction;return c&(Na|Oa)?b=a.overallVelocity:c&Na?b=a.overallVelocityX:c&Oa&&(b=a.overallVelocityY),this._super.attrTest.call(this,a)&&c&a.offsetDirection&&a.distance>this.options.threshold&&a.maxPointers==this.options.pointers&&qa(b)>this.options.velocity&&a.eventType&Ga},emit:function(a){var b=$(a.offsetDirection);b&&this.manager.emit(this.options.event+b,a),this.manager.emit(this.options.event,a)}}),i(ga,Y,{defaults:{event:"tap",pointers:1,taps:1,interval:300,time:250,threshold:9,posThreshold:10},getTouchAction:function(){return[ib]},process:function(a){var b=this.options,c=a.pointers.length===b.pointers,d=a.distance
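The two files written at the end of the ruwiki notebook (docword.ruwiki.txt and vocab.ruwiki.csv) follow the UCI Bag-of-words layout: a three-line header (D documents, W vocabulary entries, NNZ non-zero counts) followed by "docID wordID count" triples. Below is a minimal sketch, not part of the repository, of how such a pair could be sanity-checked and loaded into BigARTM for the ARTM experiments elsewhere in this repo. It assumes the artm Python package is installed, that artm.BatchVectorizer accepts the bow_uci format as in current BigARTM releases, and that the vocabulary file is renamed to vocab.ruwiki.txt, since the UCI convention expects vocab.<collection>.txt; the batches folder name is illustrative.

import artm

# Sanity-check the UCI header: D (documents), W (vocabulary size), NNZ (non-zero counts).
# These should match doc_count, len(dictionary) and bow_length from the notebook.
with open("../datasets/ruwiki/docword.ruwiki.txt") as f:
    D, W, NNZ = (int(f.readline()) for _ in range(3))
print("documents:", D, "vocabulary:", W, "non-zero counts:", NNZ)

# Convert the UCI pair (docword.ruwiki.txt + vocab.ruwiki.txt) into BigARTM batches.
# target_folder is an arbitrary output directory, chosen here for illustration.
batch_vectorizer = artm.BatchVectorizer(data_path="../datasets/ruwiki",
                                        data_format="bow_uci",
                                        collection_name="ruwiki",
                                        target_folder="../datasets/ruwiki/ruwiki_batches")

Reading the header back this way also makes it easy to confirm that the D, W, NNZ ordering written by the notebook matches what a UCI-compliant reader expects.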