├── .DS_Store
├── .gitignore
├── README.md
├── hackathon_1_may_2017
│   ├── .DS_Store
│   ├── ChunkByTime.ipynb
│   ├── ElasticsearchSynonyms.ipynb
│   ├── Get_links_from_meetings.ipynb
│   ├── PredictChannel.ipynb
│   ├── ThreadIndexingWithES.ipynb
│   ├── chunk.py
│   ├── docker-compose.yml
│   ├── event-parser.ipynb
│   ├── fact_extraction.ipynb
│   ├── fact_extraction_with_mystem.ipynb
│   ├── for_tomita
│   │   ├── config.proto
│   │   ├── courses.cxx
│   │   ├── education.cxx
│   │   ├── fact_types.proto
│   │   ├── interest.cxx
│   │   ├── job.cxx
│   │   ├── mydic.gzt
│   │   └── name.cxx
│   ├── help_data
│   │   ├── java_policy
│   │   └── synonyms.txt
│   ├── key_words.py
│   ├── predict_channel.py
│   ├── questions.csv
│   ├── requirements.txt
│   ├── slack_data_loader.py
│   ├── test_simple_question_extraction.ipynb
│   ├── tokenizer.py
│   └── vw.sh
├── hackathon_2_march_2018
│   ├── .DS_Store
│   ├── README.md
│   ├── data_fetch
│   │   ├── README.md
│   │   ├── msg_parser.py
│   │   ├── reaction_parser.py
│   │   ├── run.py
│   │   └── users_parser.py
│   ├── mention_count.ipynb
│   ├── topic_modelling
│   │   ├── 01. clean_text_parsing.ipynb
│   │   ├── 02. vocabulary.ipynb
│   │   └── 03. modeling.ipynb
│   └── username_mining
│       └── db.ipynb
├── hackathon_3_december_2018
│   ├── dv_qa
│   │   ├── 2018-ods-answers.png
│   │   ├── 2018-ods-answers_tab.png
│   │   ├── 2018-ods-questions.png
│   │   ├── 2018-ods-questions_tab.png
│   │   ├── 2018-ods-top-users_tab.png
│   │   ├── README.md
│   │   ├── ods-answers.png
│   │   ├── ods-answers_tab.png
│   │   ├── ods-check-export.ipynb
│   │   ├── ods-get-data.ipynb
│   │   ├── ods-qa.ipynb
│   │   ├── ods-questions.png
│   │   ├── ods-questions_tab.png
│   │   └── ods-top-users_tab.png
│   └── folium_map
│       ├── artgor_plot_folium.ipynb
│       ├── big_map_latest_upd.html
│       ├── parse_geoservice_data.ipynb
│       ├── user_geodata.py
│       ├── user_geodata_settings.json
│       └── utils.py
└── karma_onreact_counting.ipynb

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | ODS_dump_Mar_10_2017/
3 | opendatascience Slack export May 20 2017/
4 | *.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introspect_hackathon
2 | - [Code and data](https://github.com/open-data-science/Introspect_hackathon/tree/master/hackathon_1_may_2017) from ODS Introspect Hackathon #1, held at the café "Райский Пирожок", May 19–21, 2017.
3 | - [Code and data](https://github.com/open-data-science/Introspect_hackathon/tree/master/hackathon_2_march_2018) from ODS Introspect Hackathon #2, held at Mail.ru, March 16–18, 2018.
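Most of the hackathon #1 notebooks read the Slack export through `slack_data_loader.SlackLoader`. Below is a minimal sketch of that entry point, assuming you have a local unpacked copy of the export directory (the `opendatascience Slack export May 20 2017/` name comes from `.gitignore`); the constructor arguments and message fields are the ones used in `hackathon_1_may_2017/Get_links_from_meetings.ipynb`.

```python
import datetime

from slack_data_loader import SlackLoader  # hackathon_1_may_2017/slack_data_loader.py

# Assumption: path to your local, unpacked copy of the Slack export.
path_to_dump = 'opendatascience Slack export May 20 2017/'

# Load only the _meetings channel, keeping messages from 2017 onwards,
# as done in Get_links_from_meetings.ipynb.
exporter = SlackLoader(path_to_dump,
                       only_channels=('_meetings',),
                       start_date=datetime.datetime(2017, 1, 1))

# exporter.messages is a list of dicts with 'text', 'ts' and 'user' keys;
# exporter.channels and exporter.users hold channel and user metadata.
print(len(exporter.messages))
print(exporter.messages[0]['text'])
```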
4 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_1_may_2017/.DS_Store -------------------------------------------------------------------------------- /hackathon_1_may_2017/ElasticsearchSynonyms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import asyncio\n", 12 | "from aioes import Elasticsearch" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 53, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "es = Elasticsearch(['localhost:9200'])" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 110, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "def gen_synonyms():\n", 35 | " \"\"\"\n", 36 | " Generate some synonyms in a file. All words separated by comma are treated as equal\n", 37 | " \"\"\"\n", 38 | " with open(\"synonyms.txt\", \"w\") as syns:\n", 39 | " syns.write(\"xboost, эксгебуст, эксбуст, иксгебуст, xgboost\\n\")\n", 40 | " syns.write(\"пыха, пыху, пых, php\\n\")\n", 41 | " syns.write(\"lol, лол\\n\")\n", 42 | " syns.write(\"питон, python\\n\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 105, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "index_body = {\n", 52 | " \"settings\": {\n", 53 | " \"analysis\": {\n", 54 | " \"filter\": {\n", 55 | " \"russian_stop\": {\n", 56 | " \"type\": \"stop\",\n", 57 | " \"stopwords\": \"_russian_\" \n", 58 | " },\n", 59 | " \"russian_stemmer\": {\n", 60 | " \"type\": \"stemmer\",\n", 61 | " \"language\": \"russian\"\n", 62 | " },\n", 63 | " \"synonyms_expand\": {\n", 64 | " \"type\": \"synonym\", \n", 65 | " # path to synonym file.\n", 66 | " # for ES to be able to read it, security policy should be set as described here:\n", 67 | " # https://stackoverflow.com/questions/35401917/reading-a-file-in-an-elasticsearch-plugin\n", 68 | " \"synonyms_path\": \"/Users/enchantner/Experiments/synonyms.txt\"\n", 69 | " }\n", 70 | " },\n", 71 | " \"analyzer\": {\n", 72 | " \"russian_syn\": {\n", 73 | " \"tokenizer\": \"standard\",\n", 74 | " \"filter\": [\n", 75 | " \"lowercase\",\n", 76 | " \"russian_stop\",\n", 77 | " \"russian_stemmer\",\n", 78 | " \"synonyms_expand\"\n", 79 | " ]\n", 80 | " }\n", 81 | " }\n", 82 | " }\n", 83 | " },\n", 84 | " \"mappings\":{ \n", 85 | " \"question\":{ \n", 86 | " \"properties\":{ \n", 87 | " \"text\":{\"type\":\"string\", \"analyzer\":\"russian_syn\"}\n", 88 | " }\n", 89 | " }\n", 90 | " }\n", 91 | "}\n", 92 | "\n", 93 | "async def create_index():\n", 94 | " ret = await es.indices.create(\n", 95 | " index=\"questions-index\",\n", 96 | " body=index_body\n", 97 | " )\n", 98 | " print(ret)\n", 99 | " \n", 100 | " \n", 101 | "async def delete_index():\n", 102 | " ret = await es.delete(\n", 103 | " index=\"questions-index\"\n", 104 | " )\n", 105 | " print(ret)\n", 106 | " \n", 107 | "async def openclose():\n", 108 | " \"\"\"\n", 109 | " Closing and opening index again reloads synomyms file\n", 110 | " \"\"\"\n", 111 | " await es.indices.close(index=\"questions-index\")\n", 112 | " await 
es.indices.open(index=\"questions-index\")\n", 113 | " \n", 114 | "async def populate_index():\n", 115 | " await es.index(\n", 116 | " index=\"questions-index\",\n", 117 | " doc_type=\"question\",\n", 118 | " body={\n", 119 | " \"text\": \"А что мне рассказать про иксгебуст?\"\n", 120 | " }\n", 121 | " )\n", 122 | " await es.index(\n", 123 | " index=\"questions-index\",\n", 124 | " doc_type=\"question\",\n", 125 | " body={\n", 126 | " \"text\": \"Я ненавижу PHP, что мне делать?\"\n", 127 | " }\n", 128 | " )\n", 129 | " await es.index(\n", 130 | " index=\"questions-index\",\n", 131 | " doc_type=\"question\",\n", 132 | " body={\n", 133 | " \"text\": \"Я люблю питон, что мне делать?\"\n", 134 | " }\n", 135 | " )" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 103, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "{'acknowledged': True}\n", 148 | "{'acknowledged': True, 'shards_acknowledged': True}\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "loop = asyncio.get_event_loop()\n", 154 | "loop.run_until_complete(delete_index())\n", 155 | "loop.run_until_complete(create_index())\n", 156 | "loop.run_until_complete(populate_index())" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 111, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# reload synonims without recreating the whole database\n", 166 | "gen_synonyms()\n", 167 | "loop.run_until_complete(openclose())" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": true 175 | }, 176 | "outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.6.0" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 2 201 | } 202 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/Get_links_from_meetings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from slack_data_loader import SlackLoader\n", 12 | "import datetime\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import re" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 3, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "path_to_dump = '/Users/alex/Documents/ODS/opendatascience Slack export May 20 2017/'" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 4, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "100%|██████████| 728/728 [00:00<00:00, 1909.51it/s]\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "exporter = SlackLoader(path_to_dump, only_channels=('_meetings',),\n", 44 | " start_date=datetime.datetime(2017, 1, 1))" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": { 51 | "collapsed": 
true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "channel_attrs = ['id', 'name', 'created', 'creator', 'is_archived', 'is_general', 'pins', 'topic']\n", 56 | "\n", 57 | "def channels_to_df(channels):\n", 58 | " full_list = []\n", 59 | " for ch_id, ch_dict in channels.items():\n", 60 | " new_channel_dict = {}\n", 61 | " for k in channel_attrs:\n", 62 | " new_channel_dict[k] = ch_dict.get(k, None)\n", 63 | " new_channel_dict['num_members'] = len(ch_dict['members'])\n", 64 | " new_channel_dict['purpose'] = ch_dict['purpose']['value']\n", 65 | " full_list.append(new_channel_dict)\n", 66 | " return pd.DataFrame(full_list).set_index('id')\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 6, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "df = channels_to_df(exporter.channels)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 17, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "'26 декабря в ИППИ был семинар про NIPS-2016 и прогресс в машобуче. Занять место в первых рядах не успел, поэтому пришлось записывать видео с последних и дрожащими руками. Звук более-менее слышно, а слайды, наверное, сами участники смогут предоставить '" 89 | ] 90 | }, 91 | "execution_count": 17, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "exporter.messages[0]['text']" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 31, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "links_regex = re.compile(r'<(http[^>|]+)>')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 23, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "from operator import itemgetter, add\n", 120 | "from functools import reduce" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 32, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "links_total = reduce(add, map(links_regex.findall, map(itemgetter('text'),exporter.messages)))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 33, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "['https://www.youtube.com/watch?v=vFY2rez41_g',\n", 150 | " 'https://www.youtube.com/watch?v=Jh3D8Gi4N0I',\n", 151 | " 'http://www.nebotut.ru/',\n", 152 | " 'http://www.belylist.ru/',\n", 153 | " 'https://daily.afisha.ru/eating/3902-samyy-pyanyy-okrug-v-mire-5-marshrutov-dlya-barhoppinga-v-prazdniki/',\n", 154 | " 'http://anticafe-hp.ru/',\n", 155 | " 'http://tceh.com/event/kiwicom-brno-moscow-python-meetup/',\n", 156 | " 'https://corp.mail.ru/ru/press/events/298/',\n", 157 | " 'https://www.youtube.com/watch?v=1sQijC_I0gg',\n", 158 | " 'https://corp.mail.ru/ru/press/events/300/',\n", 159 | " 'https://opendatascience.slack.com/archives/_meetings/p1484385943000910',\n", 160 | " 'http://www.oreilly.com/pub/e/3855',\n", 161 | " 'https://www.facebook.com/events/1292384680807838/',\n", 162 | " 'https://vc.ru/p/statsbot-deal',\n", 163 | " 'http://venturebeat.com/2016/11/14/visa-lawyer-bot-pledges-to-help-immigrants-make-america-great-again/',\n", 164 | " 'http://dpllab.com/',\n", 165 | " 
'https://events.yandex.ru/events/ds/04/#place',\n", 166 | " 'https://events.yandex.ru/events/mltr/21-jan-2017/',\n", 167 | " 'https://opendatascience.slack.com/archives/_meetings/p1472564367000042',\n", 168 | " 'https://events.yandex.ru/events/b-konf/16-feb-2017/',\n", 169 | " 'https://vk.com/wall-117459195_196',\n", 170 | " 'https://boosters.pro/sberbank',\n", 171 | " 'https://pp.vk.me/c637626/v637626651/2b2c6/IYnue7kT5oM.jpg',\n", 172 | " 'https://events.yandex.ru/surveys/4316/',\n", 173 | " 'https://www.meetup.com/PyData-Moscow/',\n", 174 | " 'https://geektimes.ru/company/mailru/blog/285026/',\n", 175 | " 'https://robotics.timepad.ru/event/429057/',\n", 176 | " 'https://openvisconf.com/',\n", 177 | " 'https://events.yandex.ru/events/meetings/09-february-2017/',\n", 178 | " 'https://ai-community.timepad.ru/event/432990/',\n", 179 | " 'https://flyelephant.net/events/webinar-introduction-to-singularity',\n", 180 | " 'http://singularity.lbl.gov/',\n", 181 | " 'https://events.yandex.ru/events/meetings/09-february-2017/register/',\n", 182 | " 'https://www.meetup.com/PyData-Moscow/events/237579800/',\n", 183 | " 'https://corp.mail.ru/ru/press/events/315/',\n", 184 | " 'http://tceh.com/medhack/',\n", 185 | " 'http://lurkmore.to/%D0%91%D0%BE%D0%B1%D1%83%D0%BA',\n", 186 | " 'http://medit-2017.ru/',\n", 187 | " 'https://vk.com/wall-118482811_48',\n", 188 | " 'http://Instagram.com/playittodeath',\n", 189 | " 'http://www.info-space.ru/',\n", 190 | " 'https://events.yandex.ru/surveys/4453/',\n", 191 | " 'https://goo.gl/forms/4d7p46wsbLtni2Ot1',\n", 192 | " 'http://www.mathnet.ru/php/seminars.phtml?option_lang=rus&presentid=16449',\n", 193 | " 'https://habrahabr.ru/company/superjob/blog/321950/',\n", 194 | " 'https://www.meetup.com/Moscow-Data-Science/',\n", 195 | " 'https://opendatascience.slack.com/archives/_random_flood/p1487253931038865',\n", 196 | " 'https://events.yandex.ru/events/mltr/25-feb-2017/',\n", 197 | " 'https://newprolab.timepad.ru/event/447417/',\n", 198 | " 'http://wwsss17.com/',\n", 199 | " 'https://vk.com/wwsss17',\n", 200 | " 'https://cs.hse.ru/announcements/202188811.html',\n", 201 | " 'https://events.yandex.ru/events/ds/18-mar-2017/',\n", 202 | " 'http://pydata.org/amsterdam2017/',\n", 203 | " 'http://hackathon.spb.ru/',\n", 204 | " 'https://corp.mail.ru/ru/press/events/323/',\n", 205 | " 'https://goo.gl/forms/kNIPf1df1KQnEZz72',\n", 206 | " 'https://events.kaspersky.com/hackathon/',\n", 207 | " 'http://hackathon.ai/',\n", 208 | " 'https://events.yandex.ru/surveys/4527/',\n", 209 | " 'https://opendatascience.slack.com/archives/_meetings/p1488168000003088',\n", 210 | " 'https://robotics.timepad.ru/event/399682/',\n", 211 | " 'http://sk.ru/foundation/events/april2017/robotics/p/classes.aspx',\n", 212 | " 'https://www.instagram.com/p/BRKxeHGgMmH/',\n", 213 | " 'https://opendatascience.slack.com/archives/_meetings/p1488530583003484',\n", 214 | " 'https://events.yandex.ru/events/mltr/11-mar-2017/',\n", 215 | " 'https://habrahabr.ru/company/mailru/blog/322432/',\n", 216 | " 'http://hackathon.mts.ru',\n", 217 | " 'http://hackathon.mts.ru/images/picTeam/t3.png',\n", 218 | " 'http://rb.ru/rosbank/',\n", 219 | " 'https://docs.google.com/forms/d/e/1FAIpQLSfDH9IWJHAUEkeb5rXGpwvea24Nd4VV2LZBQ42xhgrgws_YpQ/viewform?c=0&w=1',\n", 220 | " 'https://vc.ru/p/9578',\n", 221 | " 'https://www.youtube.com/watch?v=E62S4QNltLc',\n", 222 | " 'https://www.youtube.com/watch?v=fhZXqTGsunw',\n", 223 | " 'https://goo.gl/forms/83hiODGnTzjwxJkY2',\n", 224 | " 
'https://events.yandex.ru/events/mltr/11-mar-2017/',\n", 225 | " 'https://goo.gl/forms/8AAipXJCQvqc6WHC2',\n", 226 | " 'https://flyelephant.net/events/webinar-julia',\n", 227 | " 'https://habrahabr.ru/company/flyelephant/blog/323840/',\n", 228 | " 'http://imgur.com/ogIQN0i',\n", 229 | " 'https://www.youtube.com/channel/UC91wUxUQ_uWznIo04dpXo3A',\n", 230 | " 'http://i.imgur.com/QQSFllR.jpg',\n", 231 | " 'https://vk.com/wall-142135418_5',\n", 232 | " 'http://www.datascience.in.ua/',\n", 233 | " 'https://sys.mail.ru/blog/entry/shemotehnika-08-pro-kartograf/',\n", 234 | " 'https://www.youtube.com/watch?v=DlK_37MVOvY',\n", 235 | " 'http://msk.opendataday.ru/',\n", 236 | " 'https://events.yandex.ru/events/mltr/25-mar-2017/',\n", 237 | " 'https://events.yandex.ru/events/meetings/3-april-2017/',\n", 238 | " 'https://www.youtube.com/playlist?list=PLkvzAel8ISD39_e1_jIhhWnSltFNOdTwZ',\n", 239 | " 'https://flyelephant.net/events/webinar-julia-live',\n", 240 | " 'https://chatbotconf.ru/ru',\n", 241 | " 'https://twitter.com/rsalakhu/status/846045079487832066?s=09',\n", 242 | " 'https://events.yandex.ru/events/ds/15-apr-2017/',\n", 243 | " 'https://goo.gl/forms/d4hPTIHClEkzchgu1',\n", 244 | " 'https://www.youtube.com/channel/UCBLlcLoM4czHN21yeaWxGZA',\n", 245 | " 'http://ospcon.osp.ru/bigdata',\n", 246 | " 'https://www.facebook.com/events/828239720648373',\n", 247 | " 'http://cs403922.userapi.com/v403922807/3ea2/z7rffcCbvm8.jpg',\n", 248 | " 'https://knowledgepit.fedcsis.org/contest/view.php?id=120',\n", 249 | " 'https://events.yandex.ru/surveys/4685/',\n", 250 | " 'https://yandex.ru/maps/-/CZcsM8YJ',\n", 251 | " 'https://events.kaspersky.com/hackathon/',\n", 252 | " 'https://events.webinar.ru/1904081/345927',\n", 253 | " 'https://events.yandex.ru/events/mltr/08-apr-2017/',\n", 254 | " 'https://rambler-co-e-org.timepad.ru/event/470664/',\n", 255 | " 'https://www.meetup.com/Apache-Spark-in-Moscow/events/past/?scroll=true',\n", 256 | " 'http://ai-community.com/events/ai-community/4-global-ai-meetup-computer-vision-technologies-06/04',\n", 257 | " 'https://opendatascience.slack.com/archives/C04422A5C/p1490684902916746',\n", 258 | " 'https://events.yandex.ru/events/meetings/13-apr-2017/',\n", 259 | " 'https://academy.yandex.ru/events/sci-sem/cv/',\n", 260 | " 'https://clickhouse.yandex/presentations/meetup4/clickhouse_for_analysts.pdf',\n", 261 | " 'http://matlab.ru/seminars/conf2017',\n", 262 | " 'http://meetu.ps/e/CGvgm/sYfx1/d',\n", 263 | " 'https://ladiescode.timepad.ru/event/471400/',\n", 264 | " 'http://data-science.com.ua/conferences/data-science-ua-conference-2017/',\n", 265 | " 'https://www.facebook.com/events/168767966965411/',\n", 266 | " 'http://datafest.in.ua',\n", 267 | " 'https://events.yandex.ru/events/science-seminars/24-april-2017/',\n", 268 | " 'http://grammars.grlmc.com/DeepLearn2017/',\n", 269 | " 'https://sites.google.com/site/emotiw2016/',\n", 270 | " 'https://youtu.be/oPDbUIWND_k',\n", 271 | " 'http://lifecode.site/?utm_source=newsletter&utm_medium=genehack&utm_campaign=13apr',\n", 272 | " 'http://datafest.by/',\n", 273 | " 'https://it.mail.ru/video/',\n", 274 | " 'http://spacehack.xyz/',\n", 275 | " 'https://www.youtube.com/watch?v=F1QvOJcxAzw',\n", 276 | " 'https://moscowdjango.timepad.ru/event/468277/',\n", 277 | " 'https://events.yandex.ru/events/mltr/22-apr-2017/',\n", 278 | " 'https://domclick.timepad.ru/event/476750/',\n", 279 | " 'https://vk.com/wall-44016343_14912?w=wall-138127986_121',\n", 280 | " 'https://goo.gl/forms/g8tB0MRUMXWVgpCh2',\n", 281 | " 
'https://goo.gl/forms/mL4eHnLEIbVfUFQb2',\n", 282 | " 'http://xn--90aihcg1anaka9m.xn--p1ai',\n", 283 | " 'http://xn--80abdlnldpssn.xn--p1ai',\n", 284 | " 'https://docs.google.com/document/d/1jwDGxd50NbzAuCcz60ct6Kr1rMQkB-8Q3EIAf_RX-mU/edit?usp=drivesdk',\n", 285 | " 'https://habrahabr.ru/company/microsoft/blog/326812/',\n", 286 | " 'http://spacehack.xyz',\n", 287 | " 'https://www.meetup.com/GDG-Moscow/events/239324673/',\n", 288 | " 'http://datascience.in.ua',\n", 289 | " 'https://www.facebook.com/photo.php?fbid=10212499098569921&set=a.10200648969644104.1073741825.1156281111&type=3&theater',\n", 290 | " 'https://events.webinar.ru/2442095/395929',\n", 291 | " 'http://dataring.ru/competitions/avito-recommendations/',\n", 292 | " 'https://events.yandex.ru/events/mltr/22-apr-2017/',\n", 293 | " 'https://www.meetup.com/PyData-Moscow/events/239404221/',\n", 294 | " 'https://goo.gl/forms/mL4eHnLEIbVfUFQb2',\n", 295 | " 'https://www.meetup.com/PyData-Moscow/events/239404221/',\n", 296 | " 'https://corp.mail.ru/ru/press/events/341/',\n", 297 | " 'https://www.youtube.com/channel/UCBLlcLoM4czHN21yeaWxGZA',\n", 298 | " 'http://spacehack.xyz/',\n", 299 | " 'http://dmlabs.org/spacehack.jpg',\n", 300 | " 'https://bigquery.cloud.google.com/dataset/fh-bigquery:reddit_comments',\n", 301 | " 'https://events.yandex.ru/events/ds/20-may-2017/',\n", 302 | " 'https://aisummit2017.ru/',\n", 303 | " 'https://www.s7.ru/home/offers/hackathon/index.dot',\n", 304 | " 'http://spa2017.org',\n", 305 | " 'http://www.pdmi.ras.ru/EIMI/2017/CNSA/index.html',\n", 306 | " 'http://www.pdmi.ras.ru/EIMI/2017/PTRP/index.html',\n", 307 | " 'http://dmery.ing.puc.cl/index.php/material/gdxray/',\n", 308 | " 'https://www.dropbox.com/sh/aym7wgup7m2c5hh/AACFjjmmozhWKFmRyzM0S0KYa?dl=0',\n", 309 | " 'https://fom-events.timepad.ru/event/485547/',\n", 310 | " 'https://events.yandex.ru/events/ds/27-apr-2017/',\n", 311 | " 'http://ritfest.ru/',\n", 312 | " 'https://pages.awscloud.com/awsomedaymoscow2017registration.html',\n", 313 | " 'http://machinescansee.com/',\n", 314 | " 'https://www.youtube.com/watch?v=WhM3Vvz37a0',\n", 315 | " 'http://www.mobiledimension.ru/confmay.php',\n", 316 | " 'http://events.softline.ru/event/10902/',\n", 317 | " 'https://www.youtube.com/watch?v=ZBwxRww_EYo',\n", 318 | " 'https://goo.gl/forms/qJ8JQsOfqzpxkN5m2',\n", 319 | " 'https://www.youtube.com/watch?v=rE3Y9eCfN8E',\n", 320 | " 'http://geodata.pro',\n", 321 | " 'https://robotics.timepad.ru/event/492985/',\n", 322 | " 'https://goo.gl/forms/qJ8JQsOfqzpxkN5m2',\n", 323 | " 'https://cs.hse.ru/datacase/pravoru',\n", 324 | " 'https://youtu.be/IFG9IBI2NoM',\n", 325 | " 'https://www.facebook.com/events/455274888147623/',\n", 326 | " 'http://talks.sourced.tech/machine-learning-2017/',\n", 327 | " 'https://domclick.timepad.ru/event/476750/',\n", 328 | " 'http://www.moscowpython.ru/meetup/45/',\n", 329 | " 'http://machinescansee.com/',\n", 330 | " 'http://it-events.com',\n", 331 | " 'https://corp.mail.ru/ru/press/events/347/',\n", 332 | " 'https://youtu.be/eixlC8K8GIg']" 333 | ] 334 | }, 335 | "execution_count": 33, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "links_total" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | 
"language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.6.1" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 2 375 | } 376 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/ThreadIndexingWithES.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import asyncio\n", 12 | "from aioes import Elasticsearch\n", 13 | "from elasticsearch import helpers" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from chunk import TimeDistance\n", 25 | "from chunk import Chunker\n", 26 | "from slack_data_loader import SlackLoader" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import json" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "PATH_TO_DATA = './data'" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 15, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "es = Elasticsearch(['localhost:9200'])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "def gen_synonyms():\n", 71 | " \"\"\"\n", 72 | " Generate some synonyms in a file. 
All words separated by comma are treated as equal\n", 73 | " \"\"\"\n", 74 | " with open(\"help_data/synonyms.txt\", \"w\") as syns:\n", 75 | " syns.write(\"xboost, эксгебуст, эксбуст, иксгебуст, xgboost\\n\")\n", 76 | " syns.write(\"пыха, пыху, пых, php\\n\")\n", 77 | " syns.write(\"lol, лол\\n\")\n", 78 | " syns.write(\"питон, python\\n\")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 20, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "index_name = \"ods-slack-index\"\n", 90 | "mapping_name = \"thread\"\n", 91 | "message_mapping = \"message\"\n", 92 | "index_body = {\n", 93 | " \"settings\": {\n", 94 | " \"analysis\": {\n", 95 | " \"filter\": {\n", 96 | " \"russian_stop\": {\n", 97 | " \"type\": \"stop\",\n", 98 | " \"stopwords\": \"_russian_\" \n", 99 | " },\n", 100 | " \"russian_stemmer\": {\n", 101 | " \"type\": \"stemmer\",\n", 102 | " \"language\": \"russian\"\n", 103 | " },\n", 104 | " \"synonyms_expand\": {\n", 105 | " \"type\": \"synonym\", \n", 106 | " # path to synonym file.\n", 107 | " # for ES to be able to read it, security policy should be set as described here:\n", 108 | " # https://stackoverflow.com/questions/35401917/reading-a-file-in-an-elasticsearch-plugin\n", 109 | " \"synonyms_path\": \"/usr/share/config_data/synonyms.txt\"\n", 110 | " }\n", 111 | " },\n", 112 | " \"analyzer\": {\n", 113 | " \"russian_syn\": {\n", 114 | " \"tokenizer\": \"standard\",\n", 115 | " \"filter\": [\n", 116 | " \"lowercase\",\n", 117 | " \"russian_stop\",\n", 118 | " \"russian_stemmer\",\n", 119 | " \"synonyms_expand\"\n", 120 | " ]\n", 121 | " }\n", 122 | " }\n", 123 | " }\n", 124 | " },\n", 125 | " \"mappings\":{ \n", 126 | " mapping_name:{\n", 127 | " \"properties\":{\n", 128 | " \"channel\": {\"type\": \"keyword\"},\n", 129 | " \"title\": {\"type\":\"string\", \"analyzer\":\"russian_syn\"},\n", 130 | " \"ts\": {\"type\": \"date\"},\n", 131 | " \"messages\" : {\n", 132 | " \"properties\":{\n", 133 | " \"text\": {\"type\":\"string\", \"analyzer\":\"russian_syn\"},\n", 134 | " \"user_id\": {\"type\": \"keyword\"},\n", 135 | " \"user_real_name\": {\"type\":\"string\"},\n", 136 | " \"ts\": {\"type\": \"date\"}\n", 137 | " }\n", 138 | " }\n", 139 | " }\n", 140 | " },\n", 141 | " message_mapping:{\n", 142 | " \"properties\":{\n", 143 | " \"text\": {\"type\":\"string\", \"analyzer\":\"russian_syn\"},\n", 144 | " \"user_id\": {\"type\": \"keyword\"},\n", 145 | " \"user_real_name\": {\"type\":\"string\"},\n", 146 | " \"ts\": {\"type\": \"date\"}\n", 147 | " }\n", 148 | " }\n", 149 | " }\n", 150 | "}\n", 151 | "\n", 152 | "async def create_index():\n", 153 | " return await es.indices.create(\n", 154 | " index=index_name,\n", 155 | " body=index_body\n", 156 | " )\n", 157 | " \n", 158 | "async def check_index_exists():\n", 159 | " return await es.indices.exists(index=index_name)\n", 160 | "\n", 161 | "\n", 162 | "async def delete_index():\n", 163 | " return await es.delete(index=index_name)\n", 164 | "\n", 165 | "async def openclose():\n", 166 | " \"\"\"\n", 167 | " Closing and opening index again reloads synomyms file\n", 168 | " \"\"\"\n", 169 | " await es.indices.close(index=index_name)\n", 170 | " await es.indices.open(index=index_name)\n", 171 | " \n", 172 | "async def populate_index(channel, messages):\n", 173 | " await es.index(\n", 174 | " index=index_name,\n", 175 | " doc_type=mapping_name,\n", 176 | " body={\n", 177 | " \"channel\": channel,\n", 178 | " \"title\": messages[0]['text'],\n", 179 | " \"ts\": 
messages[0]['ts'] * 1000,\n", 180 | " \"messages\": messages\n", 181 | " }\n", 182 | " )\n", 183 | " \"\"\"\n", 184 | " for message in messages: # make bulk upload here\n", 185 | " await es.index(\n", 186 | " index=index_name,\n", 187 | " doc_type=message_mapping,\n", 188 | " body=message\n", 189 | " )\n", 190 | " \"\"\"\n", 191 | "\n", 192 | "async def query_index(query):\n", 193 | " return await es.search(\n", 194 | " index=index_name,\n", 195 | " doc_type=mapping_name,\n", 196 | " body={\n", 197 | " \"query\":{\n", 198 | " \"multi_match\" : {\n", 199 | " \"fields\" : [ \"title^3\", \"messages.text\" ],\n", 200 | " \"query\": query\n", 201 | " }\n", 202 | " }\n", 203 | " }\n", 204 | " )\n", 205 | "\n", 206 | "async def search_user(username):\n", 207 | " return await es.search(\n", 208 | " index=index_name,\n", 209 | " doc_type=mapping_name,\n", 210 | " body={\n", 211 | " \"query\":{\n", 212 | " \"multi_match\" : {\n", 213 | " \"fields\" : [ \"messages.user_real_name\" ],\n", 214 | " \"query\": username\n", 215 | " }\n", 216 | " }\n", 217 | " }\n", 218 | " )" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 7, 224 | "metadata": { 225 | "collapsed": true 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "loop = asyncio.get_event_loop()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 16, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "{'acknowledged': True}\n", 244 | "{'acknowledged': True, 'shards_acknowledged': True}\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "if loop.run_until_complete(check_index_exists()):\n", 250 | " print(loop.run_until_complete(delete_index()))\n", 251 | " \n", 252 | "print(loop.run_until_complete(create_index()))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "# reload synonims without recreating the whole database\n", 264 | "gen_synonyms()\n", 265 | "loop.run_until_complete(openclose())" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 9, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "async def index_channel(channel = \"nlp\"):\n", 277 | " data = SlackLoader(PATH_TO_DATA, only_channels=[channel])\n", 278 | " chunker = Chunker()\n", 279 | " groups = chunker.get_groups(data)\n", 280 | " \n", 281 | " print(\"Indexing: \" + channel)\n", 282 | "\n", 283 | " workers = []\n", 284 | " for group in groups:\n", 285 | " users = data.users\n", 286 | " for msg in group:\n", 287 | " if msg['user'] in users:\n", 288 | " msg['user_real_name'] = users[msg['user']]['name']\n", 289 | " if 'dt' in msg:\n", 290 | " del msg['dt']\n", 291 | " msg['timestamp'] = str(msg['ts'])\n", 292 | " msg['ts'] = int(msg['ts'])\n", 293 | " if \"attachments\" in msg:\n", 294 | " for attach in msg[\"attachments\"]:\n", 295 | " if 'ts' in attach:\n", 296 | " attach['ts'] = float(attach['ts'])\n", 297 | " workers.append(\n", 298 | " asyncio.ensure_future(populate_index(channel, group))\n", 299 | " )\n", 300 | " return await asyncio.gather(*workers)\n", 301 | "\n", 302 | "async def index_channels(channels):\n", 303 | " await asyncio.gather(\n", 304 | " *[asyncio.ensure_future(index_channel(channel)) for channel in channels]\n", 305 | " )\n", 306 | " " 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | 
"execution_count": 17, 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "Indexing: nlp\n", 321 | "Indexing: deep_learning\n", 322 | "Indexing: datasets\n", 323 | "Indexing: sequences_series\n", 324 | "Indexing: bayesian\n", 325 | "Indexing: _meetings\n", 326 | "Indexing: edu_academy\n", 327 | "Indexing: edu_books\n", 328 | "Indexing: visualization\n", 329 | "Indexing: hardware\n", 330 | "Indexing: reinforcement_learnin\n", 331 | "Indexing: theory_and_practice\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "useful_channels = [\"nlp\", \"deep_learning\", \"datasets\",\n", 337 | " \"sequences_series\", \"bayesian\", \"_meetings\", \"edu_academy\",\n", 338 | " \"edu_books\", \"visualization\", \"hardware\",\n", 339 | " \"reinforcement_learnin\", \"theory_and_practice\"]\n", 340 | "\n", 341 | "loop.run_until_complete(index_channels(useful_channels))" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "loop.run_until_complete(query_index(\"как использовать xgboost в python\"))" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 57, 358 | "metadata": { 359 | "collapsed": false 360 | }, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "\n", 367 | "------------(nlp)--------------\n", 368 | "elwis: \n", 369 | "elwis: если надо просто отрезать окончания, то вот это подойдет\n", 370 | "stajilov: проверял недавно, не умеет нормально, лемматизацию может делать типо но нужен русский wordnet нормальный\n", 371 | "\n", 372 | "------------(nlp)--------------\n", 373 | "elwis: Коллега сделала интересную сравнительную таблицу чатботов, может кому-то пригодится: \n", 374 | "alexantonov: elwis: Отличная статья. А совершенно случайно нет на русском языке?\n", 375 | "elwis: <@U32506X36> К сожалению нет\n", 376 | "\n", 377 | "------------(nlp)--------------\n", 378 | "elwis: Если tf-idf вектора нормализованные, можно вместо косинусной близости считать скалярное произведение\n", 379 | "\n", 380 | "------------(nlp)--------------\n", 381 | "rvnikita: Ребята, привет. Что почиттать чтобы быстро разобраться в основнах NLP? хорошая книжка или есть что-то другое более признаное?\n", 382 | "octocat: rvnikita: хорошая, но заточена под NLTK.\n", 383 | "elwis: это главная книга по nltk, есть в электронном виде: \n", 384 | "aledovsky: Одной из лучших книг по nlp на мой взгляд является Martin Jurafsky - Speеch and Language Processing. Это большая книжка, но из неё можно независимо читать главы. Я бы предложил почитать несколько вводных плюс главы про прикладные задачи. Есть второе издание, которое нетрудно нагуглить в pdf и драфт третьего, который на сайте автора . Третье издание похоже в базовых главах и сильно отличается в описании прикладных задач.\n", 385 | "buzzword_miner: Обработка неструктурированны текстов \n", 386 | "Поиск. Организация и манипулирование\n", 387 | "buzzword_miner: \n", 388 | "\n", 389 | "------------(nlp)--------------\n", 390 | "elwis: была похожая проблема с bigartm, решил установив --threads 1. А что такое -j не подскажете? это то же самое?\n", 391 | "khansuleyman: -j - то же, что и --jobs. Одновременное выполнение указанного количества команд\n", 392 | "ryazanoff: Кто тыкал уже? Там проблемы с 3 питоном\n", 393 | "angriff07: так там и не заявляется работа с питоном 3... 
в readme написано, что python2.7\n", 394 | "\n", 395 | "------------(nlp)--------------\n", 396 | "elwis: это главная книга по nltk, есть в электронном виде: \n", 397 | "dimakarp1996: =\n", 398 | "i: \n", 399 | "i: m.yurushkin: тоже вае на текстах хочу. а у тебя какой датасет?\n", 400 | "\n", 401 | "------------(nlp)--------------\n", 402 | "0x1337: Корректно ли считать схожесть текстов косинусным расстоянием, если вектора – это не OHE представление, а tf-idf веса?\n", 403 | "ololo: да, так обычно и делают, если я правильно вопрос понял\n", 404 | "alex.ozerin: Да, косинус между суммами ohe будет глупым булевым поиском. Tfidf -- разумный вариант взвешивания\n", 405 | "mrukhlov: а что за ohe?\n", 406 | "alex.ozerin: One hot encoding\n", 407 | "mrukhlov: спасибо\n", 408 | "elwis: Если tf-idf вектора нормализованные, можно вместо косинусной близости считать скалярное произведение\n", 409 | "amir: а зачем вообще использовать tf-idf, если есть w2v и даже более совершенные модели эмбеддингов?\n", 410 | "ololo: потому что tf-idf в некоторых случаях лучше работает, например для IR\n", 411 | "0x1337: у меня линейный свм порвал как тузик грелку Xgboost на ворд2век + bigARTM. \n", 412 | "amir: Не знал, что такое тоже может быть. А какие тексты используются?\n", 413 | "0x1337: новостные статьи на русском. \n", 414 | "evgeny: <@U4E1EF5CZ> а какие более совершенные модели эмбеддингов ты имеешь в виду?\n", 415 | "elwis: <@U3PETUSSE> а как ты соединил ворд2век и BigARTM если не секрет?\n", 416 | "0x1337: <@U443HBJ8L> Для документов считаешь распределение топиков, вот и новые фичи. \n", 417 | "elwis: ясно, а я сначала подумал что ты вектора слов как-то в BigARTM сумел запихнуть как токены)\n", 418 | "amir: <@U0D8KLBFV> google swivel, fasttext\n", 419 | "\n", 420 | "------------(nlp)--------------\n", 421 | "wingrime: Господа, nltk умеет стиминг русский?\n", 422 | "novitoll: Судя по тому, что я скачал все данные от nltk `import nltk;nltk.download('all')`, то тут только для en-US. Но можно проверить в директорий, куда все данные скачались. На Linux - это по дефолту хранится в `/home/user/nltk_data/stemmers/`. Тут есть только `porter_test` для инглиша. \n", 423 | "Думаю, для русского языка можно использовать `pymorphy2`\n", 424 | "dselivanov: когда я послдений раз смотрел стемминг делался через такую жопу, что после этого я вообще nltk перестал воспринимать за библиотеку. Он транслитерировал русский в английский, потом делал стемминг, затем конвертировал обратно\n", 425 | "dselivanov: такой вот пиздец\n", 426 | "gleberof: С русским языком у pymorphy2 тоже не все идеально. \"Открытие банка\" -> \"открыть\", \"банка\". Понятно конечно почему. 
Но пока pymorphy2 лучшее что есть для стемминга\n", 427 | "wingrime: Спсб\n", 428 | "wingrime: А что умеет сейчас нормализацию?\n", 429 | "alexeyev: <@U50GC05J7> лемматизацию, в смысле?\n", 430 | "wingrime: В смысле аналогично заменам We'll -> we will\n", 431 | "wingrime: Раскрытие сокращений\n", 432 | "wingrime: Синонимов\n", 433 | "elwis: \n", 434 | "elwis: если надо просто отрезать окончания, то вот это подойдет\n", 435 | "stajilov: проверял недавно, не умеет нормально, лемматизацию может делать типо но нужен русский wordnet нормальный\n", 436 | "windj007: если лицензия позволяет, то можно ещё\n", 437 | "\n", 438 | "0x1337: +1, mystem пушка.\n" 439 | ] 440 | } 441 | ], 442 | "source": [ 443 | "res = loop.run_until_complete(search_user(\"generall\"))['hits']['hits']\n", 444 | "for hit in res:\n", 445 | " print(\"\\n------------({})--------------\".format(hit['_source']['channel']))\n", 446 | " for msg in hit['_source']['messages']:\n", 447 | " print(\"{}: {}\".format(msg['user_real_name'], msg['text']))" 448 | ] 449 | } 450 | ], 451 | "metadata": { 452 | "kernelspec": { 453 | "display_name": "Python [default]", 454 | "language": "python", 455 | "name": "python3" 456 | }, 457 | "language_info": { 458 | "codemirror_mode": { 459 | "name": "ipython", 460 | "version": 3 461 | }, 462 | "file_extension": ".py", 463 | "mimetype": "text/x-python", 464 | "name": "python", 465 | "nbconvert_exporter": "python", 466 | "pygments_lexer": "ipython3", 467 | "version": "3.5.2" 468 | } 469 | }, 470 | "nbformat": 4, 471 | "nbformat_minor": 2 472 | } 473 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/chunk.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | import os 6 | import datetime 7 | import time 8 | 9 | import itertools 10 | 11 | from slack_data_loader import SlackLoader 12 | 13 | SECS_IN_DAY = 60 * 60 * 24 14 | 15 | # this class wrap time by daily activity distribution 16 | class TimeDistance: 17 | 18 | def calc_dist(self, times): 19 | day_times = times % SECS_IN_DAY 20 | hist, ranges = np.histogram(day_times, range=(0, SECS_IN_DAY), bins=self.bins) 21 | total_count = hist.sum() 22 | normalized_hist = hist / self.total_count 23 | ranges = ranges.astype(int)[:-1] 24 | dist = dict(zip(ranges, normalized_hist)) 25 | mean = normalized_hist.mean() 26 | return (mean, dist) 27 | 28 | 29 | def get_time_range(self, ts): 30 | dt = datetime.datetime.fromtimestamp(ts) 31 | return str(dt.year) + str(int(dt.month / 6)) 32 | 33 | def init_distribution(self, times): 34 | # split by years somehow 35 | self.bins = 100 36 | self.time_step = int(SECS_IN_DAY / self.bins) 37 | self.total_count = times.size 38 | 39 | datetimes = map(self.get_time_range, times) 40 | zp = zip(datetimes, times) 41 | grps = itertools.groupby(zp, key=lambda x: x[0]) 42 | 43 | time_groups = list([ (k, np.array([y for x,y in g]) ) for k,g in grps]) 44 | 45 | meanes = [] 46 | dists = [] 47 | for key, group_times in time_groups: 48 | mean, dist = self.calc_dist(group_times) 49 | meanes.append( (key, mean) ) 50 | dists.append( (key, dist) ) 51 | 52 | self.dist = dict(dists) 53 | self.mean = dict(meanes) 54 | 55 | return self 56 | 57 | def get_range_start(self, ts): 58 | return int(ts % SECS_IN_DAY / self.time_step) * self.time_step 59 | 60 | def get_dist(self, ts): 61 | curr_range = self.get_range_start(ts) 62 | return self.dist[self.get_time_range(ts)][curr_range] 63 | 
64 | def get_mean(self, ts): 65 | return self.mean[self.get_time_range(ts)] 66 | 67 | def distance(self, ts1, ts2): 68 | max_ts = max(ts1, ts2) 69 | min_ts = min(ts1, ts2) 70 | curr = min_ts 71 | dist = 0.0 72 | diff = max_ts - min_ts 73 | if diff > SECS_IN_DAY: 74 | secs = int(diff / SECS_IN_DAY) * SECS_IN_DAY 75 | dist += secs * self.get_mean(curr) 76 | curr += secs 77 | while curr < max_ts: 78 | time_to_next_range = self.time_step - curr % self.time_step 79 | time_to_end = max_ts - curr 80 | min_time = min(time_to_end, time_to_next_range) 81 | curr += min_time 82 | dist += self.get_dist(curr) * min_time 83 | return dist 84 | 85 | class Chunker: 86 | def split_by_threshold(self, difs, threshold): 87 | res = [] 88 | start = 0 89 | curr = difs 90 | while len(curr) > 0: 91 | group_len = len(list(itertools.takewhile(lambda x: x < threshold, curr))) 92 | res.append(range(start, start + group_len + 1)) 93 | curr = curr[group_len + 1:] 94 | start = start + group_len + 1 95 | return res 96 | 97 | def cluster_time_series(self, timeObj, times, threshold = 100.0): 98 | time_difs = np.zeros(times.size - 1) 99 | for i in range(0, times.size - 2): 100 | time_difs[i] = timeObj.distance(times[i], times[i + 1]) 101 | chunks = self.split_by_threshold(time_difs, threshold) 102 | return chunks 103 | 104 | def merge_with_threads(self, chunks, threads): 105 | for thread in threads: 106 | chunks = list(filter(lambda x: not (x[0] <= thread[0] <= x[-1] or x[0] <= thread[-1] <= x[-1]), chunks)) 107 | chunks += threads 108 | return sorted(chunks, key=lambda x: x[0]) 109 | 110 | def get_groups(self, data, threshold = 30): 111 | times = np.array(list(map(lambda x: x['ts'], data.messages))) 112 | timeObj = TimeDistance().init_distribution(times) 113 | chunks = self.cluster_time_series(timeObj, times, threshold=threshold) 114 | chunk_lengthes = np.array(list(map(len, chunks))) 115 | threads = data.find_threads() 116 | chunks = self.merge_with_threads(chunks, threads) 117 | for chunk in chunks: 118 | yield [ data.messages[i] for i in chunk ] 119 | 120 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | elasticsearch1: 4 | image: elasticsearch 5 | container_name: elasticsearch1 6 | environment: 7 | - cluster.name=docker-cluster 8 | - bootstrap.memory_lock=true 9 | - "ES_JAVA_OPTS=-Xms1g -Xmx1g -Djava.security.policy=file:///usr/share/config_data/java_policy" 10 | ulimits: 11 | memlock: 12 | soft: -1 13 | hard: -1 14 | nofile: 15 | soft: 262144 16 | hard: 262144 17 | mem_limit: 1g 18 | cap_add: 19 | - IPC_LOCK 20 | volumes: 21 | - ./search_data:/usr/share/elasticsearch/data 22 | - ./help_data:/usr/share/config_data 23 | ports: 24 | - 9200:9200 25 | - 9300:9300 26 | network_mode: "host" 27 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/event-parser.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 102, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from urllib.parse import urlparse\n", 12 | "import requests\n", 13 | "import scrapy" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 92, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "def 
postprocess_event_json(event_json):\n", 25 | " for k, v in event_json.items():\n", 26 | " if isinstance(v, list):\n", 27 | " v = '\\n'.join(v)\n", 28 | " v = v.replace('\\xa0', ' ').replace('\\u200b', '')\n", 29 | " event_json[k] = v.strip()\n", 30 | " return event_json\n", 31 | "\n", 32 | "def get_event_json(url):\n", 33 | " url_parts = urlparse(url)\n", 34 | " host = url_parts.netloc\n", 35 | " page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'})\n", 36 | " dom = scrapy.Selector(text=page.content)\n", 37 | " if host == 'www.facebook.com':\n", 38 | " title = dom.css('#seo_h1_tag ::text').extract_first()\n", 39 | " datetime = dom.css('#event_summary').extract()\n", 40 | " json = {\n", 41 | " 'title': title,\n", 42 | " 'datatime': 'unheard',\n", 43 | " 'location': 'na kudykinoy gore',\n", 44 | " 'source': 'facebook'\n", 45 | " }\n", 46 | " if host == 'events.yandex.ru':\n", 47 | " json = {\n", 48 | " 'title': dom.css('h2.title ::text').extract_first(),\n", 49 | " 'datetime': dom.css('.event-header__when ::text').extract_first(),\n", 50 | " 'location': (dom.css('.event-header__place ::text').extract_first() or 'Unknown City') + ' Яндекс',\n", 51 | " 'source': url,\n", 52 | " 'decription': dom.css('.b-static-text ::text').extract()\n", 53 | " }\n", 54 | " if host == 'www.meetup.com':\n", 55 | " json = {\n", 56 | " 'title': dom.css('.pageHead-headline ::text').extract_first(),\n", 57 | " 'datetime': ' '.join([s.strip() for s in dom.css('.eventTimeDisplay time ::text').extract() if s != ' ']),\n", 58 | " 'location': ' '.join(dom.css('.venueDisplay ::text').extract()),\n", 59 | " 'source': url,\n", 60 | " 'decription': dom.css('.event-description ::text').extract()\n", 61 | " }\n", 62 | " if host.endswith('timepad.ru'):\n", 63 | " json = {\n", 64 | " 'title': dom.css('.ep-3-hero__subtitle ::text').extract_first().strip(),\n", 65 | " 'datetime': dom.css('.ep3-pagesummary__time-begin span ::text').extract_first(),\n", 66 | " 'location': dom.css('.ep3-pagesummary__place-city ::text').extract_first().strip() + ', ' + dom.css('.ep3-pagesummary__place-adress span ::text').extract_first().strip(),\n", 67 | " 'source': url,\n", 68 | " 'description': dom.css('.ep3-content .clearfix p ::text').extract()\n", 69 | " }\n", 70 | " \n", 71 | " json = postprocess_event_json(json)\n", 72 | " return json" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 103, 78 | "metadata": { 79 | "scrolled": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "url: https://events.yandex.ru/events/yac/29-may-2018/\n", 87 | "datetime: OK\n", 88 | "description: SKIP\n", 89 | "location: OK\n", 90 | "source: OK\n", 91 | "title: OK\n", 92 | "\n", 93 | "url: https://www.meetup.com/PyData-Moscow/events/240661336/\n", 94 | "title: OK\n", 95 | "datetime: OK\n", 96 | "location: OK\n", 97 | "source: OK\n", 98 | "description: SKIP\n", 99 | "\n", 100 | "url: https://sdsj.timepad.ru/event/603431/\n", 101 | "title: OK\n", 102 | "datetime: OK\n", 103 | "location: OK\n", 104 | "source: OK\n", 105 | "description: SKIP\n", 106 | "\n", 107 | "url: https://www.facebook.com/events/1727074767621344/\n", 108 | "\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "def get_events_markup():\n", 114 | " events_markup = {\n", 115 | " \"https://events.yandex.ru/events/yac/29-may-2018/\": {\n", 116 | " 'datetime': '29 мая, 08:30',\n", 117 | " 'description': '',\n", 118 | " 'location': 'Unknown City 
Яндекс',\n", 119 | " 'source': 'https://events.yandex.ru/events/yac/29-may-2018/',\n", 120 | " 'title': 'Yet another Conference 2018'\n", 121 | " },\n", 122 | " \"https://www.meetup.com/PyData-Moscow/events/240661336/\": {\n", 123 | " 'title': 'Третий PyData Meetup',\n", 124 | " 'datetime': 'Friday, June 23, 2017 6:30 PM to 9:30 PM',\n", 125 | " 'location': 'Yandex ул. Льва Толстого, 16 · Moscow',\n", 126 | " 'source': 'https://www.meetup.com/PyData-Moscow/events/240661336/',\n", 127 | " 'description': '',\n", 128 | " },\n", 129 | " \"https://sdsj.timepad.ru/event/603431/\": {\n", 130 | " \"title\": \"Sberbank Data Science Day 2017\",\n", 131 | " \"datetime\": \"11 ноября 2017 c 9:30 до 22:00\",\n", 132 | " \"location\": \"Москва, ш. Энтузиастов, 5\",\n", 133 | " \"source\": \"https://sdsj.timepad.ru/event/603431/\",\n", 134 | " \"description\": ''\n", 135 | " },\n", 136 | " \"https://www.facebook.com/events/1727074767621344/\": {\n", 137 | "\n", 138 | " }\n", 139 | " }\n", 140 | " return events_markup\n", 141 | "\n", 142 | "def test_get_event_json():\n", 143 | " events_markup = get_events_markup()\n", 144 | " \n", 145 | " for url, markup_dict in events_markup.items():\n", 146 | " print('url: %s' % url)\n", 147 | " event_dict = get_event_json(url)\n", 148 | " for k, markup_v in markup_dict.items():\n", 149 | " print('%s: ' % k, end='')\n", 150 | " event_v = event_dict.get(k, 'NONE')\n", 151 | " if event_v == markup_v:\n", 152 | " print('OK')\n", 153 | " elif k in ('description'):\n", 154 | " print('SKIP')\n", 155 | " else:\n", 156 | " print('ERROR:\\n%s\\n----- should be -----\\n%s' % (event_v, markup_v))\n", 157 | " print()\n", 158 | "\n", 159 | "test_get_event_json()" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.6.4" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/fact_extraction_with_mystem.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import json\n", 12 | "import glob\n", 13 | "import os\n", 14 | "import datetime\n", 15 | "import re\n", 16 | "import pandas as pd\n", 17 | "from slack_data_loader import SlackLoader\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "data_folder = '/Users/alex/Documents/ODS/opendatascience Slack export May 20 2017/'\n", 29 | "ods = SlackLoader(data_folder,only_channels=('welcome',))" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "type\n", 41 | "message 3992\n", 42 | "Name: dt, dtype: int64" 43 | ] 44 | }, 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "df_msg = pd.DataFrame.from_records(ods.messages)\n", 52 | 
"df_msg.fillna(0).groupby(['type'])['dt'].count()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 6, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "как пример хорошего :smiley:\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "print(df_msg.text[4])" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Вытащим сообщения-представления" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "df_msg.head()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "k = 0\n", 97 | "hi_messages = []\n", 98 | "hi_tokens=('все', 'привет','добр','шалом','салют','здрав','хай','я','ребят','коллег')\n", 99 | "for _, i in df_msg.iterrows():\n", 100 | " if i.text.lower().startswith(hi_tokens) and len(i.text) > 100:\n", 101 | " k+=1\n", 102 | " hi_messages.append(i.text)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 1, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "for i in hi_messages[-10:]:\n", 112 | " print(i)\n", 113 | " print('='*80)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 11, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stderr", 123 | "output_type": "stream", 124 | "text": [ 125 | "Installing mystem to /Users/alex/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.0-macosx10.8.tar.gz\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "from pymystem3 import Mystem\n", 131 | "mystem = Mystem()\n", 132 | "# Installing mystem to /home/dmchk/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz\n", 133 | "# Экземпляр класса Mystem предоставляет метод lemmatize, вызывающий mystem с соответствующими параметрами." 
134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 12, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "как насчет небольшой стемминг\n", 146 | "\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "text = 'Как насчёт небольшого стемминга'\n", 152 | "lemmas = mystem.lemmatize(text)\n", 153 | "print(''.join(lemmas))" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 16, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "from random import choice" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 32, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "import matplotlib.pyplot as plt" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 68, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "from IPython.display import HTML" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 37, 192 | "metadata": { 193 | "collapsed": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "import numpy as np" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 75, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "def get_n_colors_on_colormap(n):\n", 209 | " cmap = plt.cm.get_cmap('jet')\n", 210 | " # cmap = matplotlib.cm.get_cmap('Spectral')\n", 211 | " rg = np.linspace(0.3,1.0,n)\n", 212 | " cols = np.asarray(np.floor(255*cmap(rg)),dtype=int)\n", 213 | " get_color_hash = lambda x: '#%02x%02x%02x' % tuple(x.tolist())\n", 214 | " return list(map(get_color_hash, cols[:,:3]))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 87, 220 | "metadata": { 221 | "collapsed": true 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "tags = ('S','SPRO','V','ADVPRO','A','PR','ADV','APRO','CONJ', 'NUM', 'ANUM', 'PART')\n", 226 | "tags_to_color_mapping = dict(zip(tags, get_n_colors_on_colormap(len(tags))))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 92, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "def plot_seq_with_tags(seq):\n", 247 | " tmplt = '

<div>{}</div>

'\n", 248 | " span_tmlpt = \"\"\"{content}\"\"\"\n", 249 | " fulltext = ''\n", 250 | " for _word, _pos in seq:\n", 251 | " if _pos is not None:\n", 252 | " fulltext += span_tmlpt.format(color=tags_to_color_mapping[_pos], content=_word)\n", 253 | " else:\n", 254 | " fulltext += _word\n", 255 | " return HTML(tmplt.format(fulltext))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 79, 261 | "metadata": { 262 | "collapsed": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "def get_tagging_for_text(rando):\n", 267 | " analysis = mystem.analyze(rando)\n", 268 | " seq = []\n", 269 | " for i in analysis:\n", 270 | " if 'analysis' in i and i['analysis']:\n", 271 | " info = i['analysis'][0]['gr']\n", 272 | " pos = info.split(',')[0]\n", 273 | " if '=' in pos:\n", 274 | " pos = pos.split('=')[0]\n", 275 | " seq.append((i['text'],pos))\n", 276 | " else:\n", 277 | " seq.append((i['text'],None))\n", 278 | " return seq" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 96, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "

S SPRO V ADVPRO A PR ADV APRO CONJ NUM ANUM PART (stripped HTML output: each POS tag rendered as a colored span)

" 290 | ], 291 | "text/plain": [ 292 | "" 293 | ] 294 | }, 295 | "execution_count": 96, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "plot_seq_with_tags(list(zip(tags,tags)))" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 94, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/html": [ 312 | "

Всем привет,\n", 313 | "Меня зовут Александр (Саша, Алекс - как хотите :slightly_smiling_face:, я из Санкт-Петербурга. Я закончил факультет Политологии МГИМО; понял, что не хватает мне в жизни математики и количественных методов, и поступил на магистерскую программу по Международным Отношениям в Университете Калифорнии (University of California, San Diego) с фокусом на экономическое развитие (economic development), статистический анализ эконометрику. Там работал со Stata, в основном в сфере регрессионного анализа (Time-Series ARIMA models, Impact Evaluation, Randomized Controlled Trials, Regression Discontinuity Design, Propensity-Score Matching etc.). Там же прошел один курс по Big Data Analytics, научился юзать R и работать с текстом. \n", 314 | "Сейчас работаю в должности Data Analyst в стартапе в Нью-Йорке: в основном работаю с текстом из социальных медиа, использую дорогой и полюбившейся мне Python. Иногда есть проектики по Social Network Analysis и Supervised Learning Algorithms. На стороне стараюсь наверстать пробелы в знаниях в Data Science c помощью онлайн курсов (Coursera, Udacity, Udemy, edX, DataCamp) или конференций (ездил на Open Data Science Conference в Бостоне в прошлом месяце). Планирую активнее участвовать в соревнованиях Kaggle.\n", 315 | "Определенно не хватает знаний в сфере Матанализа, Линейной алгебры и computer science, но здесь прибегаю к помощи всезнающей <@U13C9QU9Z>.\n", 316 | "Всегда рад новым знакомствам in the Data Science world. Думаю, смогу здесь (slack) многому научиться, но всегда готов помочь советом (чем смогу - помогу!)\n", 317 | "В общем, пишите - не стесняйтесь :wink:\n", 318 | "

" 319 | ], 320 | "text/plain": [ 321 | "" 322 | ] 323 | }, 324 | "execution_count": 94, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "rando = choice(hi_messages)\n", 331 | "seq = get_tagging_for_text(rando)\n", 332 | "plot_seq_with_tags(seq)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 97, 338 | "metadata": { 339 | "collapsed": true 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "from operator import itemgetter" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 100, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "Counter({'SPRO': 8,\n", 355 | " None: 290,\n", 356 | " 'S': 47,\n", 357 | " 'V': 31,\n", 358 | " 'ADVPRO': 7,\n", 359 | " 'PR': 29,\n", 360 | " 'CONJ': 10,\n", 361 | " 'PART': 4,\n", 362 | " 'A': 15,\n", 363 | " 'ANUM': 1,\n", 364 | " 'ADV': 5})" 365 | ] 366 | }, 367 | "execution_count": 100, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "Counter(map(itemgetter(1), seq))" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "collapsed": true 399 | }, 400 | "outputs": [], 401 | "source": [] 402 | } 403 | ], 404 | "metadata": { 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "language": "python", 408 | "name": "python3" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 3 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython3", 420 | "version": "3.6.1" 421 | } 422 | }, 423 | "nbformat": 4, 424 | "nbformat_minor": 2 425 | } 426 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/config.proto: -------------------------------------------------------------------------------- 1 | encoding "utf8"; 2 | 3 | TTextMinerConfig { 4 | Dictionary = "mydic.gzt"; 5 | 6 | PrettyOutput = "PrettyOutput.html"; 7 | 8 | Input = {Dir = "messages"} 9 | 10 | //Output = {File = "output" 11 | // Format = text} 12 | 13 | Output = { 14 | File = "facts.xml"; 15 | Format = xml; 16 | //append = 1; 17 | } 18 | 19 | Articles = [ 20 | { Name = "имя" }, 21 | { Name = "курсы" }, 22 | { Name = "образование" }, 23 | { Name = "работа" }, 24 | { Name = "интерес" } 25 | ] 26 | 27 | Facts = [ 28 | { Name = "Name" }, 29 | { Name = "Course" }, 30 | { Name = "Education" }, 31 | { Name = "Job" }, 32 | { Name = "Interest" } 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/courses.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | CourseW -> "курс" | "специализация"; 4 | CourseShort -> "к" | "спец"; 5 | 6 | CourseDescr -> CourseW | CourseShort; 7 | 8 | CourseNameNoun -> (Adj) (Word) Word (Word) (Word); 9 | 10 | Course -> CourseDescr CourseNameNoun interp (Course.CourseName); 11 | 12 | 
-------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/education.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | EduW -> "учиться" | "закончить"; 4 | 5 | EduNameNoun -> (Adj) (Word) Word (Word) (Word) ; 6 | 7 | Edu -> EduW EduNameNoun interp (Education.Name); 8 | 9 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/fact_types.proto: -------------------------------------------------------------------------------- 1 | import "base.proto"; 2 | import "facttypes_base.proto"; 3 | 4 | message Name: NFactType.TFact 5 | { 6 | required string Name = 1; 7 | } 8 | 9 | message Course: NFactType.TFact 10 | { 11 | required string CourseName = 1; 12 | } 13 | 14 | message Education: NFactType.TFact 15 | { 16 | required string Name = 1; 17 | } 18 | 19 | message Job: NFactType.TFact 20 | { 21 | required string Name = 1; 22 | } 23 | 24 | message Interest: NFactType.TFact 25 | { 26 | required string Name = 1; 27 | } -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/interest.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | InterestW -> "интерес" | "интересоваться"; 4 | 5 | InterestTitle -> (Word) Word ; 6 | 7 | Interest -> InterestW InterestTitle interp (Interest.Name); 8 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/job.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | JobW -> "работать" | "заниматься" | "я"; 4 | 5 | JobTitle -> (Word) (Word) (Word) Noun (Word) (Word); 6 | 7 | Job -> JobW JobTitle interp (Job.Name); 8 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/mydic.gzt: -------------------------------------------------------------------------------- 1 | encoding "utf8"; 2 | 3 | import "base.proto"; 4 | import "articles_base.proto"; 5 | import "fact_types.proto"; 6 | 7 | TAuxDicArticle "имя" 8 | { 9 | key = { "tomita:name.cxx" type=CUSTOM } 10 | } 11 | 12 | 13 | TAuxDicArticle "курсы" 14 | { 15 | key = { "tomita:courses.cxx" type=CUSTOM } 16 | } 17 | 18 | TAuxDicArticle "образование" 19 | { 20 | key = { "tomita:education.cxx" type=CUSTOM } 21 | } 22 | 23 | TAuxDicArticle "работа" 24 | { 25 | key = { "tomita:job.cxx" type=CUSTOM } 26 | } 27 | 28 | TAuxDicArticle "интерес" 29 | { 30 | key = { "tomita:interest.cxx" type=CUSTOM } 31 | } -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/name.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | NameNoun -> (Word) Word (Word); 4 | 5 | S -> NameNoun interp (Name.Name); 6 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/help_data/java_policy: -------------------------------------------------------------------------------- 1 | grant { 2 | permission java.io.FilePermission "/usr/share/config_data/synonyms.txt", "read,write"; 3 | }; -------------------------------------------------------------------------------- /hackathon_1_may_2017/help_data/synonyms.txt: -------------------------------------------------------------------------------- 1 | xboost, эксгебуст, эксбуст, 
иксгебуст, xgboost 2 | пыха, пыху, пых, php 3 | lol, лол 4 | питон, python 5 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/key_words.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import os 4 | import datetime 5 | import re 6 | import pymorphy2 7 | import pandas as pd 8 | from nltk import word_tokenize 9 | 10 | morph = pymorphy2.MorphAnalyzer() 11 | 12 | 13 | #from slack_export import SlackExport, normalize_links 14 | from Introspect_hackathon.slack_data_loader import SlackLoader 15 | 16 | 17 | def start(): 18 | # data_folder = ‘/Users/alex/Documents/ODS/oct_4_2016_dump’ 19 | data_folder = "ODS_dump_Mar_10_2017" 20 | 21 | #ods = SlackExport(data_folder) 22 | ods = SlackLoader(data_folder, exclude_channels=["_random_flood", "career"]) 23 | 24 | df_msg = pd.DataFrame.from_records(ods.messages) 25 | 26 | return df_msg 27 | 28 | 29 | def cleanUsernames(str): 30 | re.sub(r"<@", "") 31 | 32 | 33 | def lemm(st): 34 | if st == '': 35 | return '' 36 | else: 37 | return morph.parse(st)[0].normal_form 38 | 39 | 40 | 41 | from stop_words import get_stop_words 42 | from string import punctuation 43 | 44 | punct = set(punctuation) 45 | punct.add(' > ') 46 | punct.add(' < ') 47 | 48 | stop_words = set(get_stop_words('ru')) 49 | 50 | print(':' in punct) 51 | 52 | def pars(text): 53 | target = [] 54 | print(len(text)) 55 | count = 0 56 | for supidx, txt in enumerate(text): 57 | if supidx == 100: 58 | break 59 | # print(txt) 60 | count += 1 61 | for line in txt.split('\n'): 62 | # if len(grade.findall(line)) != len([l for l in line]): 63 | snt = re.split("\.+ |, | ! | \? | \( |\) | - ", line) 64 | bigram = [] 65 | words = [] 66 | trigram = [] 67 | for s in snt: 68 | spl = s.split(' ') 69 | if len(s) > 1: 70 | for i in range(0, (len(spl) - 1)): 71 | if ((spl[i] not in stop_words) and (spl[i + 1] not in stop_words) 72 | and spl[i].isdigit() == False and spl[i + 1].isdigit() == False 73 | and (spl[i + 1] not in punct) and (spl[i] not in punct)): 74 | bigram.append(str(lemm(spl[i])) + ' ' + str(lemm(spl[i+1]))) 75 | 76 | for i in range(0, (len(spl) - 2)): 77 | if ((spl[i] not in stop_words) and (spl[i + 1] not in stop_words) 78 | and (spl[i + 2] not in stop_words) 79 | and spl[i].isdigit() == False and spl[i + 1].isdigit() == False 80 | and spl[i + 2].isdigit() == False 81 | and (spl[i + 1] not in punct) and (spl[i] not in punct) 82 | and (spl[i + 2] not in punct)): 83 | trigram.append(str(lemm(spl[i])) + ' ' + str(lemm(spl[i + 1])) + ' '+str(lemm(spl[i+2]))) 84 | 85 | for i in range(0, (len(spl))): 86 | if (spl[i] not in stop_words) and spl[i].isdigit() == False: 87 | words.append(str(lemm(spl[i]))) 88 | trg = bigram + words + trigram 89 | target.append(trg) 90 | #print(count, trg) 91 | return target 92 | 93 | def clean(matrix): 94 | return pars(matrix['text']) 95 | 96 | 97 | def loadCommonLang(datapath="corpus_freq_dict.csv"): 98 | fCorpus = open(datapath, encoding="UTF-8") 99 | lines = fCorpus.readlines() 100 | 101 | vocabulary = {} 102 | for i, line in enumerate(lines): 103 | if line != "\n": 104 | sample = re.sub("\n", "", line) 105 | sample = sample.split(",") 106 | vocabulary[sample[0]] = int(sample[1]) 107 | 108 | return vocabulary 109 | 110 | 111 | def countAllWordsVocab(vocabulary): 112 | cnt = 0 113 | for word in vocabulary: 114 | cnt += vocabulary[word] 115 | 116 | return cnt 117 | 118 | 119 | def strange(m, m1): # для 2 массивов слов 120 | f = len(m) 121 | f1 = 
countAllWordsVocab(m1) 122 | 123 | mass = [] 124 | for word in set(m): 125 | if word in m1: 126 | res = round((m.count(word) / f) / (m1[word] / f1), 4) 127 | mass.append((word, res)) 128 | else: 129 | mass.append((word, 75.)) 130 | return mass 131 | 132 | 133 | def oneList(text): 134 | res = [] 135 | for i, sentence in enumerate(text): 136 | res += sentence 137 | 138 | return res 139 | 140 | 141 | import pymysql.cursors, re 142 | 143 | 144 | def getMySQLData(limit=1000000, sql="SELECT ttext FROM %s LIMIT %s"): 145 | connection = pymysql.connect(host='localhost', user='root', password='root', db='sys', charset='utf8mb4', 146 | cursorclass=pymysql.cursors.DictCursor) 147 | 148 | try: 149 | with connection.cursor() as cursor: 150 | #sql = "SELECT ttext FROM %s LIMIT %s" 151 | cursor.execute(sql, limit) 152 | 153 | result = cursor.fetchall() 154 | data = [] 155 | for i, item in enumerate(result): 156 | try: 157 | if item['ttext'] != None: 158 | data.append(re.sub("\n", " ", item['ttext'])) 159 | except Exception: 160 | print("%d %s" % (i, item)) 161 | 162 | return data 163 | finally: 164 | connection.close() 165 | 166 | 167 | def loadTwitterDict(): 168 | from collections import Counter 169 | # limit = 1000000 170 | limit = 111000 171 | negdata = getMySQLData(limit, "SELECT ttext FROM `sortneg` LIMIT %s") 172 | posdata = getMySQLData(limit, "SELECT ttext FROM `sortpos` LIMIT %s") 173 | limit = 1000000 174 | neutraldata = getMySQLData(limit, "SELECT ttext FROM `sentiment` LIMIT %s") 175 | 176 | data = negdata + posdata + neutraldata 177 | 178 | 179 | #sentences = pars(data) 180 | sentences = [] 181 | for i, text in enumerate(data): 182 | sentences += re.split("\.+ |, | ! | \? | \( |\) | - ", text) 183 | 184 | words = [] 185 | for i, sentence in enumerate(sentences): 186 | words += word_tokenize(sentence) 187 | 188 | #words = [] 189 | #for i, sentence in enumerate(sentences): 190 | # words += sentence 191 | 192 | from nltk import collections 193 | counts = dict(Counter(words)) 194 | return counts 195 | 196 | 197 | def wordsChoose(dic, barrier=5.): 198 | res = [] 199 | for i, word in enumerate(dic): 200 | if word[1] > barrier: 201 | res.append(word) 202 | return res 203 | 204 | 205 | def writeData(data, datapath='strange.csv'): 206 | import csv 207 | with open(datapath, "w+", encoding="UTF-8") as f: 208 | a = csv.writer(f) 209 | for i, word in enumerate(data): 210 | a.writerow(word) 211 | 212 | 213 | if __name__ == "__main__": 214 | counts = loadTwitterDict() 215 | #vocab = loadCommonLang() 216 | df_msg = start() 217 | lemmatized = clean(df_msg) 218 | onelst = oneList(lemmatized) 219 | strangeness = strange(onelst, counts) 220 | ranged_strange_words = wordsChoose(strangeness) 221 | writeData(ranged_strange_words, datapath='strange.csv') 222 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/predict_channel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | import codecs 4 | from collections import defaultdict 5 | 6 | import pymorphy2 7 | from sklearn.metrics import confusion_matrix 8 | from sklearn.model_selection import train_test_split 9 | import stop_words 10 | 11 | from tokenizer import tokenize, preprocessing 12 | from slack_data_loader import SlackLoader 13 | 14 | 15 | morph = pymorphy2.MorphAnalyzer() 16 | 17 | QUESTION_WORD_LEMMAS = ("как", "как-то", "какой", "какой-то", "зачем", "почему", "когда", "кто", "где", 18 | "когда", 
"куда", "куда-то", "чот") 19 | QUESTION_WORDS = ("подскажите", "посоветуйте", "дайте", "киньте", "кинте") 20 | STOP_WORDS = stop_words.get_stop_words("russian") 21 | PUNCTUATION = ['.', ',', ';', ':', '!', '?', '-', '<', '>', '(', ')', '<-', '`', '::', '//', '/', '>:', 22 | '{', '}', '--', '(<', '\\', '}]', ']', '[', '))', '>>', '..', '...', '==', '```', '#', 23 | '~', '"', '%)', ';<', '|', '!!', 'slightly_smiling_face', 'simple_smile', 'http', 24 | 'https', ':/', 'smile', 'www.', 'com', 'ru', 'org', 'ru.', "'"] 25 | 26 | 27 | def is_question(tokens): 28 | is_q = False 29 | for i, t in enumerate(tokens): 30 | if t in QUESTION_WORDS: 31 | is_q = True 32 | t = morph.parse(t)[0].normal_form 33 | tokens[i] = t 34 | if t in QUESTION_WORD_LEMMAS: 35 | is_q = True 36 | return is_q 37 | 38 | 39 | def prepare_data(): 40 | print('Loading data...') 41 | loader = SlackLoader('opendatascience Slack export May 20 2017', is_sorted=False, 42 | only_channels=['nlp', 'deep_learning', 'datasets', 'sequences_series', 'bayesian', '_meetings', 43 | 'edu_academy', 'edu_books', 'visualization', 44 | 'hardware', 'reinforcement_learnin', 'theory_and_practice']) 45 | 46 | print('Converting data...') 47 | channel_messages = [] 48 | previous_channel = '' 49 | label_id = 0 50 | with codecs.open('vw_data_train.vw', 'w', encoding='utf8') as vw_train: 51 | with codecs.open('vw_data_test.vw', 'w', encoding='utf8') as vw_test: 52 | for m in loader.messages: 53 | tokens = [t for t in tokenize(preprocessing(m['text'])) 54 | if t not in PUNCTUATION and not t.startswith('@')] 55 | # take only questions 56 | if is_question(tokens): 57 | if previous_channel != m['channel']: 58 | previous_channel = m['channel'] 59 | if channel_messages: 60 | label_id += 1 61 | train, test = train_test_split(channel_messages, test_size=0.15) 62 | for t in train: 63 | text = t[1].replace(':', ';').replace('|', '/') 64 | vw_train.write('%s | %s\n' % (label_id, text)) 65 | for t in test: 66 | text = t[1].replace(':', ';').replace('|', '/') 67 | vw_test.write('%s | %s\n' % (label_id, text)) 68 | channel_messages = [] 69 | tokens = [t for t in tokens if t not in STOP_WORDS] 70 | if len(tokens) > 3: 71 | channel_messages.append((m['channel'], ' '.join(tokens))) 72 | 73 | # a last channel data 74 | label_id += 1 75 | train, test = train_test_split(channel_messages, test_size=0.15) 76 | for t in train: 77 | text = t[1].replace(':', ';').replace('|', '/') 78 | vw_train.write('%s | %s\n' % (label_id, text)) 79 | for t in test: 80 | text = t[1].replace(':', ';').replace('|', '/') 81 | vw_test.write('%s | %s\n' % (label_id, text)) 82 | 83 | 84 | '''def convert_to_vw_format(): 85 | file_name = 'fasttext_data.txt' 86 | file_name_vw = file_name.split('.')[0] + '.vw' 87 | previous_label = '' 88 | label_id = 0 89 | with codecs.open(file_name, 'r', encoding='utf-8') as f: 90 | with codecs.open(file_name_vw, 'w', encoding='utf-8') as f_out: 91 | for sentence in f: 92 | try: 93 | label, text = sentence.strip().split(' ', 1) 94 | if previous_label != label: 95 | label_id += 1 96 | previous_label = label 97 | text = text.replace(':', ';').replace('|', '/') 98 | f_out.write('%s | %s\n' % (label_id, text)) 99 | except Exception as e: 100 | print(e)''' 101 | 102 | 103 | def analyze(): 104 | total = 0 105 | correct = 0 106 | labels_total = defaultdict(int) 107 | labels_correct = defaultdict(int) 108 | y_true = [] 109 | y_pred = [] 110 | with codecs.open('vw_data_test.vw', encoding='utf-8') as f: 111 | with codecs.open('vw_data_test.vw.pred', encoding='utf-8') as f_pred: 112 
| for l in f: 113 | try: 114 | label_pred = f_pred.readline().strip() 115 | label, text = l.split(' | ') 116 | y_pred.append(int(label_pred)) 117 | y_true.append(int(label)) 118 | if (int(label) in [2, 8] and int(label_pred) in [2, 8]) and label != label_pred: 119 | if len(text.strip().split(' ')) <= 3: 120 | print('%s - %s' % (label, text.strip())) 121 | if label == label_pred: 122 | correct += 1 123 | labels_correct[label] += 1 124 | total += 1 125 | labels_total[label] += 1 126 | except: 127 | pass 128 | 129 | print('Accuracy total %s' % (correct / float(total))) 130 | for l, v in labels_correct.iteritems(): 131 | print('Accuracy for label %s: %s' % (l, v / float(labels_total[l]))) 132 | 133 | print(confusion_matrix(y_true, y_pred)) 134 | 135 | if __name__ == '__main__': 136 | # prepare_data() 137 | 138 | # here we train a model and predict on test data: bash vw.sh 139 | from subprocess import call 140 | call(["bash", "vw.sh", "vw_data_train.vw", "vw_data_test.vw", "0.05", "2"]) 141 | 142 | analyze() 143 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | numpy 3 | pymorphy2 4 | scikit-learn 5 | vowpalwabbit 6 | stop_words -------------------------------------------------------------------------------- /hackathon_1_may_2017/slack_data_loader.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | from collections import defaultdict 6 | 7 | import datetime 8 | import glob 9 | import json 10 | import os 11 | import re 12 | 13 | 14 | def _read_json_dict(filename, key='id'): 15 | with open(filename) as fin: 16 | records = json.load(fin) 17 | json_dict = { 18 | record[key]: record 19 | for record in records 20 | } 21 | return json_dict 22 | 23 | 24 | class SlackLoader(object): 25 | def __init__(self, export_path, exclude_channels=(), only_channels=(), start_date=None, end_date=None, 26 | is_sorted=True): 27 | self.exclude_channels = exclude_channels 28 | self.only_channels = only_channels 29 | if start_date: 30 | self.start_date = (start_date - datetime.datetime(1970, 1, 1)).total_seconds() 31 | else: 32 | self.start_date = None 33 | if end_date: 34 | self.end_date = (end_date - datetime.datetime(1970, 1, 1)).total_seconds() 35 | else: 36 | self.end_date = None 37 | self.channels = _read_json_dict(os.path.join(export_path, 'channels.json')) 38 | self.users = _read_json_dict(os.path.join(export_path, 'users.json')) 39 | self.messages = self.load_export(export_path, is_sorted) 40 | 41 | def load_export(self, export_path, is_sorted): 42 | messages = [] 43 | for channel_id, channel in self.channels.items(): 44 | if channel['is_archived']: 45 | continue 46 | if channel['name'] in self.exclude_channels: 47 | continue 48 | if self.only_channels and channel['name'] not in self.only_channels: 49 | continue 50 | messages_glob = os.path.join(export_path, channel['name'], '*.json') 51 | for messages_filename in glob.glob(messages_glob): 52 | with open(messages_filename) as f_messages: 53 | for record in json.load(f_messages): 54 | if 'subtype' in record: 55 | continue 56 | if 'ts' in record: 57 | if self.start_date and float(record['ts']) < self.start_date: 58 | continue 59 | if self.end_date and float(record['ts']) > self.end_date: 60 | continue 61 | record['ts'] = 
float(record['ts']) 62 | record['dt'] = datetime.datetime.fromtimestamp(record['ts']) 63 | record['channel'] = channel_id 64 | messages.append(record) 65 | if is_sorted: 66 | messages = sorted(messages, key=lambda x: x['ts']) 67 | 68 | return messages 69 | 70 | def find_threads(self): 71 | dd = defaultdict(list) 72 | for i in range(0, len(self.messages)): 73 | msg = self.messages[i] 74 | if "thread_ts" in msg: 75 | dd[msg["thread_ts"]].append(i) 76 | return list(dd.values()) 77 | 78 | re_slack_link = re.compile(r'(?P<(?P[^\|]*)(\|(?P[^>]*))?>)') 79 | 80 | 81 | def _extract_slack_link_id(m): 82 | return m.group('id') 83 | 84 | 85 | def normalize_links(text): 86 | return re_slack_link.sub(_extract_slack_link_id, text) 87 | 88 | 89 | if __name__ == '__main__': 90 | loader = SlackLoader('ODS_dump_Mar_10_2017', exclude_channels=['_random_flood', 'career']) 91 | print(len(loader.messages)) 92 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/test_simple_question_extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from slack_data_loader import SlackLoader\n", 12 | "import datetime\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import re\n", 16 | "from operator import itemgetter" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "path_to_dump = './opendatascience Slack export May 20 2017/'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "exporter = SlackLoader(path_to_dump, only_channels=('deep_learning',),\n", 37 | " start_date=datetime.datetime(2017, 1, 1))" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Loaded 7540 messages\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "print(\"Loaded {} messages\".format(len(exporter.messages)))" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 6, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "channel_attrs = ['id', 'name', 'created', 'creator', 'is_archived', 'is_general', 'pins', 'topic']\n", 66 | "\n", 67 | "def channels_to_df(channels):\n", 68 | " full_list = []\n", 69 | " for ch_id, ch_dict in channels.items():\n", 70 | " new_channel_dict = {}\n", 71 | " for k in channel_attrs:\n", 72 | " new_channel_dict[k] = ch_dict.get(k, None)\n", 73 | " new_channel_dict['num_members'] = len(ch_dict['members'])\n", 74 | " new_channel_dict['purpose'] = ch_dict['purpose']['value']\n", 75 | " full_list.append(new_channel_dict)\n", 76 | " return pd.DataFrame(full_list).set_index('id')\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 7, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/html": [ 87 | "<div>\n", 88 | "<style>\n", 89 | " .dataframe thead tr:only-child th {\n", 90 | " text-align: right;\n", 91 | " }\n", 92 | "\n", 93 | " .dataframe thead th {\n", 94 | " text-align: left;\n", 95 | " }\n", 96 | "\n", 97 | " .dataframe tbody tr th {\n", 98 | " vertical-align: top;\n", 99 | " }\n", 100 | "</style>\n", 101 | 
"<table border=\"1\" class=\"dataframe\">\n", 102 | " <thead>\n", 103 | " <tr style=\"text-align: right;\">\n", 104 | " <th></th>\n", 105 | " <th>created</th>\n", 106 | " <th>creator</th>\n", 107 | " <th>is_archived</th>\n", 108 | " <th>is_general</th>\n", 109 | " <th>name</th>\n", 110 | " <th>num_members</th>\n", 111 | " <th>pins</th>\n", 112 | " <th>purpose</th>\n", 113 | " <th>topic</th>\n", 114 | " </tr>\n", 115 | " <tr>\n", 116 | " <th>id</th>\n", 117 | " <th></th>\n", 118 | " <th></th>\n", 119 | " <th></th>\n", 120 | " <th></th>\n", 121 | " <th></th>\n", 122 | " <th></th>\n", 123 | " <th></th>\n", 124 | " <th></th>\n", 125 | " <th></th>\n", 126 | " </tr>\n", 127 | " </thead>\n", 128 | " <tbody>\n", 129 | " <tr>\n", 130 | " <th>C2A4GEL6M</th>\n", 131 | " <td>1473445368</td>\n", 132 | " <td>U04ELQZAU</td>\n", 133 | " <td>True</td>\n", 134 | " <td>False</td>\n", 135 | " <td>alexyashadasha</td>\n", 136 | " <td>0</td>\n", 137 | " <td>None</td>\n", 138 | " <td></td>\n", 139 | " <td>{'value': '', 'creator': '', 'last_set': '0'}</td>\n", 140 | " </tr>\n", 141 | " <tr>\n", 142 | " <th>C1P8YT7C7</th>\n", 143 | " <td>1467817046</td>\n", 144 | " <td>U04URBM8V</td>\n", 145 | " <td>False</td>\n", 146 | " <td>False</td>\n", 147 | " <td>bayesian</td>\n", 148 | " <td>307</td>\n", 149 | " <td>[{'id': '1467888432.000030', 'type': 'C', 'use...</td>\n", 150 | " <td>Church of Bayes: Discussing Bayesian statistic...</td>\n", 151 | " <td>{'value': ':bayes:', 'creator': 'U04ELQZAU', '...</td>\n", 152 | " </tr>\n", 153 | " <tr>\n", 154 | " <th>C0804BS5Q</th>\n", 155 | " <td>1437511383</td>\n", 156 | " <td>U049NHC4X</td>\n", 157 | " <td>False</td>\n", 158 | " <td>False</td>\n", 159 | " <td>big_data</td>\n", 160 | " <td>1301</td>\n", 161 | " <td>[{'id': '1485303977.000947', 'type': 'C', 'use...</td>\n", 162 | " <td>Hadoop, Spark и прочее\\r\\n\\r\\nПолезные материа...</td>\n", 163 | " <td>{'value': 'Big Pain in the ...', 'creator': 'U...</td>\n", 164 | " </tr>\n", 165 | " <tr>\n", 166 | " <th>C0MQQT6E6</th>\n", 167 | " <td>1455738772</td>\n", 168 | " <td>U070Y25AS</td>\n", 169 | " <td>False</td>\n", 170 | " <td>False</td>\n", 171 | " <td>bioinformatics</td>\n", 172 | " <td>125</td>\n", 173 | " <td>None</td>\n", 174 | " <td></td>\n", 175 | " <td>{'value': ':bioscience:', 'creator': 'U04ELQZA...</td>\n", 176 | " </tr>\n", 177 | " <tr>\n", 178 | " <th>C115898GZ</th>\n", 179 | " <td>1460749144</td>\n", 180 | " <td>U04422XJL</td>\n", 181 | " <td>True</td>\n", 182 | " <td>False</td>\n", 183 | " <td>blackoxchallenge</td>\n", 184 | " <td>0</td>\n", 185 | " <td>None</td>\n", 186 | " <td></td>\n", 187 | " <td>{'value': '', 'creator': '', 'last_set': '0'}</td>\n", 188 | " </tr>\n", 189 | " </tbody>\n", 190 | "</table>\n", 191 | "</div>" 192 | ], 193 | "text/plain": [ 194 | " created creator is_archived is_general name \\\n", 195 | "id \n", 196 | "C2A4GEL6M 1473445368 U04ELQZAU True False alexyashadasha \n", 197 | "C1P8YT7C7 1467817046 U04URBM8V False False bayesian \n", 198 | "C0804BS5Q 1437511383 U049NHC4X False False big_data \n", 199 | "C0MQQT6E6 1455738772 U070Y25AS False False bioinformatics \n", 200 | "C115898GZ 1460749144 U04422XJL True False blackoxchallenge \n", 201 | "\n", 202 | " num_members pins \\\n", 203 | "id \n", 204 | "C2A4GEL6M 0 None \n", 205 | "C1P8YT7C7 307 [{'id': '1467888432.000030', 'type': 'C', 'use... \n", 206 | "C0804BS5Q 1301 [{'id': '1485303977.000947', 'type': 'C', 'use... 
\n", 207 | "C0MQQT6E6 125 None \n", 208 | "C115898GZ 0 None \n", 209 | "\n", 210 | " purpose \\\n", 211 | "id \n", 212 | "C2A4GEL6M \n", 213 | "C1P8YT7C7 Church of Bayes: Discussing Bayesian statistic... \n", 214 | "C0804BS5Q Hadoop, Spark и прочее\\r\\n\\r\\nПолезные материа... \n", 215 | "C0MQQT6E6 \n", 216 | "C115898GZ \n", 217 | "\n", 218 | " topic \n", 219 | "id \n", 220 | "C2A4GEL6M {'value': '', 'creator': '', 'last_set': '0'} \n", 221 | "C1P8YT7C7 {'value': ':bayes:', 'creator': 'U04ELQZAU', '... \n", 222 | "C0804BS5Q {'value': 'Big Pain in the ...', 'creator': 'U... \n", 223 | "C0MQQT6E6 {'value': ':bioscience:', 'creator': 'U04ELQZA... \n", 224 | "C115898GZ {'value': '', 'creator': '', 'last_set': '0'} " 225 | ] 226 | }, 227 | "execution_count": 7, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "df = channels_to_df(exporter.channels)\n", 234 | "df.head()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 41, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "qwords = (\"как\", \"какой\", \"зачем\", \"почему\", \"когда\", \"кто\", \"где\", \"когда\", \"куда\", \"чот\")\n", 244 | "splitter = re.compile(r\"(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s\")\n", 245 | "\n", 246 | "import pymorphy2\n", 247 | "morph = pymorphy2.MorphAnalyzer()\n", 248 | "\n", 249 | "def lemm(st):\n", 250 | " if st == '':\n", 251 | " return ''\n", 252 | " else:\n", 253 | " return morph.parse(st)[0].normal_form\n", 254 | "\n", 255 | "def is_question(d):\n", 256 | " x = d.lower()\n", 257 | " snt = x.split()\n", 258 | " num_words = len(snt)\n", 259 | " snt = [lemm(w) for w in snt]\n", 260 | " #print((num_words > 4) and any(w in qwords for w in snt), [w in qwords for w in snt])\n", 261 | " return (num_words > 4) and any(w in qwords for w in snt)\n", 262 | "\n", 263 | "def contains_sentance_with_questions(d):\n", 264 | " x = d['text'].lower()\n", 265 | " sents = splitter.split(x)\n", 266 | " #print(any(map(is_question, sents)))\n", 267 | " return any(map(is_question, sents))\n", 268 | "\n", 269 | "questions = list(filter(contains_sentance_with_questions, exporter.messages))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "questions = list(filter(contains_sentance_with_questions, exporter.messages))" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 42, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "found 1255 questions\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "print(\"found {} questions\".format(len(questions)))" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "for _text in map(itemgetter('text'), questions):\n", 305 | " print(_text)\n", 306 | " print('-'*40)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 3", 322 | "language": "python", 323 | "name": "python3" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 3 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | 
"nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython3", 335 | "version": "3.6.4" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 2 340 | } 341 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | GROUPING_SPACE_REGEX = re.compile(r'([^@\w_\-])', re.UNICODE | re.MULTILINE) 5 | 6 | ALPHABET = re.compile(u'[A-Za-zА-ЯЁа-яё]') 7 | 8 | # special tokens to be found before system processing 9 | web_address_re = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') 10 | email_re = re.compile(u'[а-яёА-ЯЁA-Za-z0-9.+_-]+@[^@]+\.[a-zA-Zа-яёА-ЯЁ]+') 11 | number_re = re.compile(u'(^|[^\w\-])[+\-]*[0-9]+[0-9 =―—–_:.,/x×\-*]*[\-ыхейюомуя]*([^\w\-]|$)', re.UNICODE) 12 | 13 | 14 | def simple_word_tokenize(text, _split=GROUPING_SPACE_REGEX.split): 15 | """ 16 | Split text into tokens. Don't split by a hyphen and an underscore. 17 | Preserve punctuation, but not whitespaces. 18 | """ 19 | return [t for t in _split(text) if t] 20 | 21 | 22 | def replace_number(match_obj): 23 | return u'%sNUM%s' % (match_obj.group(1), match_obj.group(2)) 24 | 25 | 26 | def tokenize(text): 27 | inp_tokens = simple_word_tokenize(text) 28 | tokens_len = len(inp_tokens) 29 | output_tokens = [] 30 | # combine some tokens together: contractions, smileys, emoticons, etc. 31 | for index, token in enumerate(inp_tokens): 32 | # contractions with length < 5 33 | if token in u'.' and 0 < index < tokens_len - 1 and inp_tokens[index + 1] not in u'.?-–—)\'"”»' and \ 34 | output_tokens and len(output_tokens[-1]) < 5: 35 | output_tokens[-1] += token 36 | # english contractions 37 | elif token in [u's', u've', u'm', u'll', u're', u'd', u't'] and index > 0 and inp_tokens[index - 1] in u'\'`': 38 | output_tokens[-1] += token 39 | # cut a hyphen off from the beginning of a word 40 | elif token[0] == u'-' and len(token) > 1 and ALPHABET.match(token[1]): 41 | output_tokens.append(u'-') 42 | output_tokens.append(token[1:]) 43 | # !? or ?! 44 | elif token in u'?!' and index > 0 and inp_tokens[index - 1] in u'?!': 45 | if len(output_tokens[-1]) < 2: 46 | output_tokens[-1] += token 47 | # repetition of dots, question marks, slashes, etc 48 | elif token in u'.,?!^*/=:;«»"“”-–—@+()_❤☀' and index > 0 and inp_tokens[index - 1] == token: 49 | if len(output_tokens[-1]) < 2: 50 | output_tokens[-1] += token 51 | # smileys, emoticons 52 | elif token in u'-–—/_{}()[]<>`*:^=DP' and index > 0 and inp_tokens[index - 1] and \ 53 | inp_tokens[index - 1] in u'/`^:{}()[]<>*%=;-–—_': 54 | output_tokens[-1] += token 55 | else: 56 | if not token.isspace(): 57 | output_tokens.append(token) 58 | return output_tokens 59 | 60 | 61 | def preprocessing(sent): 62 | # replace URL address on URL token, e-mail on EMAIL and numbers on NUM before tokenizing 63 | # sent = web_address_re.sub('URL', ) # number_re.sub(replace_number, sent) 64 | sent = email_re.sub('EMAIL', sent).replace('\n', '').strip().lower() 65 | sent = sent.replace(u'ё', u'е').replace('"', '"').replace('<', '<').replace('>', '>'). 
\ 66 | replace('&', '&').replace(''', '`').replace('', '').replace('<br>', '') 67 | return sent 68 | 69 | if __name__ == '__main__': 70 | # test 71 | import codecs 72 | from time import time 73 | 74 | total = 0 75 | error = 0 76 | with codecs.open('tokens.txt', 'w', encoding='utf-8') as f_out: 77 | with codecs.open('sentences.txt', 'r', encoding='utf-8') as f: 78 | start_time = time() 79 | for sentence in f: 80 | sentence = sentence.strip() 81 | if sentence: 82 | sentence = preprocessing(sentence) 83 | tokens = tokenize(sentence) 84 | f_out.write(u' '.join(tokens)+'\n') 85 | print 'Execution time: %s' % (time() - start_time) 86 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/vw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILENAME_TRAIN=$1 4 | FILENAME_TEST=$2 5 | LR=$3 6 | NGRAMS=$4 7 | # LINES=`wc -l ${FILENAME}` 8 | # POS=`expr "${LINES}" : '.* '` 9 | # COUNT_OF_LINES=${LINES:0:${POS}} 10 | # COUNT_OF_LINES=$((COUNT_OF_LINES * 85 / 100)) 11 | # echo $COUNT_OF_LINES 12 | 13 | # POS=`expr "${FILENAME}" : '.*\.'` 14 | # NAME=${FILENAME:0:${POS} - 1} 15 | # echo $NAME 16 | 17 | #gshuf ${FILENAME} >> split -l $COUNT_OF_LINES 18 | #mv xaa ${NAME}_train.vw 19 | #mv xab ${NAME}_test.vw 20 | 21 | gshuf ${FILENAME_TRAIN} -o ${FILENAME_TRAIN} 22 | 23 | cd ../vowpal_wabbit/vowpalwabbit/ 24 | ./vw -c -k -b 25 --oaa 12 -l ${LR} --ngram ${NGRAMS} -d ../../Introspect_hackathon/${FILENAME_TRAIN} -f vw_ods_channels.bin --passes 20 --holdout_off 25 | ./vw -t -i vw_ods_channels.bin -d ../../Introspect_hackathon/${FILENAME_TEST} 26 | 27 | # predict 28 | ./vw -t -i vw_ods_channels.bin -d ../../Introspect_hackathon/${FILENAME_TEST} -p ../../Introspect_hackathon/${FILENAME_TEST}.pred --quiet 29 | 30 | mv vw_ods_channels.bin ../../Introspect_hackathon/models/ -------------------------------------------------------------------------------- /hackathon_2_march_2018/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_2_march_2018/.DS_Store -------------------------------------------------------------------------------- /hackathon_2_march_2018/README.md: -------------------------------------------------------------------------------- 1 | # Код, данные и заметки с хакатона в Mail.ru 2 | Загружайте сюда и пишите простое пояснение 3 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/README.md: -------------------------------------------------------------------------------- 1 | # Код для экспорта данных из JSON-дампа Slack в БД 2 | 3 | Получает данные про 4 | - пользователей (таблица imported_user_data) 5 | - каналы (таблица imported_channel) 6 | - сообщения (таблица imported_messages) 7 | - реакции на сообщения по юзерам (таблица imported_reactions) 8 | - количество реакций на сообщения по типам (таблица imported_reactions_count) 9 | 10 | # Prerequisites 11 | 12 | ``` 13 | python3 -m pip install -r requirements.txt 14 | ``` 15 | 16 | # Usage 17 | 18 | ``` 19 | python3 run.py 20 | ``` 21 | 22 | Предполагается, что в ../data лежат разархивированный ODS-дамп. 23 | 24 | База (по умолчанию sqlite) будет лежать в ../ods-slack.db. 25 | 26 | # Notes 27 | 28 | - В imported_reactions не все юзеры, т.к. 
в дампе указаны не все юзеры, поставившие смайл.<br/> 29 | Дамп отображает on-hover поведение Slack: показывает ~50 именованных юзеров, а дальше пишет and 42 others.<br/> 30 | Вот эти 42 юзера не попали в дамп. 31 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/msg_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, logging 4 | import json 5 | from pprint import pprint 6 | from collections import Counter 7 | from sqlalchemy.ext.declarative import declarative_base 8 | from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, Date, MetaData, ForeignKey 9 | from sqlalchemy.engine.url import URL 10 | from sqlalchemy.orm import sessionmaker 11 | from sqlalchemy.pool import NullPool 12 | 13 | Base = declarative_base() 14 | 15 | class Channel(Base): 16 | __tablename__ = 'imported_channel' 17 | 18 | name = Column('name', String, primary_key=True) 19 | 20 | def __init__(self, name): 21 | self.name = name 22 | 23 | def __repr__(self): 24 | return "<Channel({0})>".format(self.name) 25 | 26 | 27 | class Message(Base): 28 | __tablename__ = 'imported_messages' 29 | 30 | channel = Column('channel', String, primary_key=True) 31 | ts = Column('ts', String, primary_key=True) 32 | 33 | type = Column('type', String) 34 | text = Column('text', String) 35 | user = Column('user', String) 36 | thread_ts = Column('thread_ts', String) 37 | parent_user_id = Column('parent_user_id', String) 38 | subtype = Column('subtype', String) 39 | # reactions = Column('reactions', String) table 40 | # edited = Column('edited', String) later 41 | # attachments = Column('attachments', String) table 42 | reply_count = Column('reply_count', Integer) 43 | # replies = Column('replies', String) table 44 | unread_count = Column('unread_count', Integer) 45 | bot_id = Column('bot_id', String) 46 | username = Column('username', String) 47 | # file = Column('file', String) 48 | 49 | def __init__(self, channel, data): 50 | self.channel = channel 51 | self.ts = data.get('ts', '') 52 | 53 | self.type = data.get('type', '') 54 | self.text = str(data.get('text', '')) 55 | self.user = data.get('user', '') 56 | self.thread_ts = data.get('thread_ts', '') 57 | self.parent_user_id = data.get('parent_user_id', '') 58 | self.subtype = data.get('subtype', '') 59 | # self.reactions = data.get('reactions', '') 60 | # self.edited = data.get('edited', '') 61 | # self.attachments = data.get('attachments', '') 62 | self.reply_count = data.get('reply_count', '') 63 | self.replies = data.get('replies', '') 64 | self.unread_count = data.get('unread_count', '') 65 | self.bot_id = data.get('bot_id', '') 66 | self.username = data.get('username', '') 67 | 68 | def __repr__(self): 69 | return "<Message({0}, {1}, {2})>".format(self.channel, self.date, self.index) 70 | 71 | 72 | def parse_messages(session, data_path): 73 | # c = Counter() 74 | dirs = [e.name for e in os.scandir(data_path) if e.is_dir()] 75 | # msg_keys = set() 76 | for dir in dirs: 77 | pprint(data_path + os.sep + dir) 78 | # if dir != 'welcome': 79 | # continue 80 | for d, dirs, files in os.walk(data_path + os.sep + dir): 81 | channel = d.split('/')[-1] 82 | session.add(Channel(channel)) 83 | for f in files: 84 | path = os.path.join(d, f) 85 | # print(path) 86 | data = json.load(open(path)) 87 | for msg in data: 88 | session.add(Message(channel, msg)) 89 | # c.update(msg.keys()) 90 | # for key in msg.keys(): 91 | # 
msg_keys.add(key) 92 | # session.add() 93 | session.commit() 94 | # pprint(msg_keys) 95 | # pprint(c.most_common(60)) 96 | # print(len(msg_keys), '\n') 97 | 98 | 99 | # if __name__ == '__main__': 100 | # Base.metadata.create_all(engine) 101 | # parse_messages('../data') 102 | # # print(c.most_common(100)) 103 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/reaction_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, logging 4 | import json 5 | from pprint import pprint 6 | from collections import Counter 7 | from sqlalchemy.ext.declarative import declarative_base 8 | from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, Date, MetaData, ForeignKey 9 | from sqlalchemy.engine.url import URL 10 | from sqlalchemy.orm import sessionmaker 11 | from sqlalchemy.pool import NullPool 12 | 13 | Base = declarative_base() 14 | 15 | class Reaction(Base): 16 | __tablename__ = 'imported_reactions' 17 | 18 | channel = Column('channel', String, primary_key=True) 19 | message_ts = Column('message_ts', String, primary_key=True) 20 | user_id = Column('user_id', String, primary_key=True) 21 | name = Column('name', String, primary_key=True) 22 | 23 | def __init__(self, channel, message_ts, user_id, name): 24 | self.channel = channel 25 | self.message_ts = message_ts 26 | self.user_id = user_id 27 | self.name = name 28 | 29 | def __repr__(self): 30 | return "<Reaction({0}, {1}, {2})>".format(self.name, self.message_ts, self.user_id) 31 | 32 | 33 | class ReactionCount(Base): 34 | __tablename__ = 'imported_reactions_count' 35 | 36 | channel = Column('channel', String, primary_key=True) 37 | message_ts = Column('message_ts', String, primary_key=True) 38 | name = Column('name', String, primary_key=True) 39 | count = Column('count', Integer) 40 | 41 | def __init__(self, channel, message_ts, name, count): 42 | self.channel = channel 43 | self.message_ts = message_ts 44 | self.name = name 45 | self.count = count 46 | 47 | def __repr__(self): 48 | return "<Reaction({0}, {1}, {2})>".format(self.name, self.message_ts, self.count) 49 | 50 | 51 | def parse_reactions(session, data_path): 52 | # c = Counter() 53 | dirs = [e.name for e in os.scandir(data_path) if e.is_dir()] 54 | # msg_keys = set() 55 | for dir in dirs: 56 | pprint(data_path + os.sep + dir) 57 | # if dir != 'welcome': 58 | # continue 59 | for d, dirs, files in os.walk(data_path + os.sep + dir): 60 | channel = d.split('/')[-1] 61 | # session.add(Channel(channel)) 62 | for f in files: 63 | path = os.path.join(d, f) 64 | # print(path) 65 | data = json.load(open(path)) 66 | for msg in data: 67 | if 'reactions' in msg.keys(): 68 | for reaction in msg['reactions']: 69 | for user in reaction['users']: 70 | session.add(Reaction(channel, msg['ts'], user, reaction['name'])) 71 | # c.update(msg.keys()) 72 | # for key in msg.keys(): 73 | # msg_keys.add(key) 74 | # session.add() 75 | session.commit() 76 | # pprint(msg_keys) 77 | # pprint(c.most_common(60)) 78 | # print(len(msg_keys), '\n') 79 | 80 | 81 | def parse_reactions_count(session, data_path): 82 | # c = Counter() 83 | dirs = [e.name for e in os.scandir(data_path) if e.is_dir()] 84 | # msg_keys = set() 85 | for dir in dirs: 86 | pprint(data_path + os.sep + dir) 87 | # if dir != 'welcome': 88 | # continue 89 | for d, dirs, files in os.walk(data_path + os.sep + dir): 90 | channel = d.split('/')[-1] 91 | # 
session.add(Channel(channel)) 92 | for f in files: 93 | path = os.path.join(d, f) 94 | # print(path) 95 | data = json.load(open(path)) 96 | for msg in data: 97 | if 'reactions' in msg.keys(): 98 | for reaction in msg['reactions']: 99 | session.add(ReactionCount(channel, msg['ts'], reaction['name'], reaction['count'])) 100 | # c.update(msg.keys()) 101 | # for key in msg.keys(): 102 | # msg_keys.add(key) 103 | # session.add() 104 | session.commit() 105 | # pprint(msg_keys) 106 | # pprint(c.most_common(60)) 107 | # print(len(msg_keys), '\n') 108 | 109 | # if __name__ == '__main__': 110 | # Base.metadata.create_all(engine) 111 | # parse_reactions_count('../data') 112 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, logging 4 | import json 5 | from pprint import pprint 6 | from sqlalchemy.ext.declarative import declarative_base 7 | from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, MetaData, ForeignKey 8 | from sqlalchemy.engine.url import URL 9 | from sqlalchemy.orm import sessionmaker 10 | from sqlalchemy.pool import NullPool 11 | 12 | logger = logging.getLogger(__name__) 13 | local_name = 'sqlite:///../ods-slack.db' 14 | remote_name = 'postgres://usgbqmayetwlrv:a8b6a60b922bd6d08c3e94fa41eac937f71ed3bc4afade4995a3bdf5d54e36ca@ec2-54-247-81-88.eu-west-1.compute.amazonaws.com:5432/d7942vtj104cpv' 15 | engine = create_engine(local_name, echo=True) 16 | Base = declarative_base() 17 | Session = sessionmaker(bind=engine) 18 | dump_ODS_path = '../data' 19 | 20 | import users_parser 21 | import msg_parser 22 | import reaction_parser 23 | 24 | if __name__ == '__main__': 25 | Base.metadata.create_all(engine) 26 | users_parser.Base.metadata.create_all(engine) 27 | msg_parser.Base.metadata.create_all(engine) 28 | reaction_parser.Base.metadata.create_all(engine) 29 | Base.metadata.create_all(engine) 30 | 31 | session = Session() 32 | 33 | # users_parser.UserData.__table__.drop(engine) 34 | # msg_parser.Channel.__table__.drop(engine) 35 | # msg_parser.Message.__table__.drop(engine) 36 | # reaction_parser.Reaction.__table__.drop(engine) 37 | # reaction_parser.ReactionCount.__table__.drop(engine) 38 | 39 | users_parser.parse_users(session, dump_ODS_path) 40 | msg_parser.parse_messages(session, dump_ODS_path) 41 | reaction_parser.parse_reactions(session, dump_ODS_path) 42 | reaction_parser.parse_reactions_count(session, dump_ODS_path) 43 | 44 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/users_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, logging 4 | import json 5 | from pprint import pprint 6 | from sqlalchemy.ext.declarative import declarative_base 7 | from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, MetaData, ForeignKey 8 | from sqlalchemy.engine.url import URL 9 | from sqlalchemy.orm import sessionmaker 10 | from sqlalchemy.pool import NullPool 11 | 12 | Base = declarative_base() 13 | 14 | class UserData(Base): 15 | __tablename__ = 'imported_user_data' 16 | 17 | id = Column('id', String, primary_key=True) 18 | name = Column('name', String) 19 | deleted = Column('deleted', Boolean) 20 | 21 | tz = Column('tz', String) 22 | tz_label = Column('tz_label', String) 23 | 
tz_offset = Column('tz_offset', Integer) 24 | is_admin = Column('is_admin', Boolean) 25 | is_owner = Column('is_owner', Boolean) 26 | is_primary_owner = Column('is_primary_owner', Boolean) 27 | is_restricted = Column('is_restricted', Boolean) 28 | is_ultra_restricted = Column('is_ultra_restricted', Boolean) 29 | is_bot = Column('is_bot', Boolean) 30 | updated = Column('updated', Integer) 31 | is_app_user = Column('is_app_user', Boolean) 32 | 33 | title = Column('title', String) 34 | phone = Column('phone', String) 35 | skype = Column('skype', String) 36 | real_name = Column('real_name', String) 37 | real_name_normalized = Column('real_name_normalized', String) 38 | display_name = Column('display_name', String) 39 | display_name_normalized = Column('display_name_normalized', String) 40 | fields = Column('fields', String) 41 | status_text = Column('status_text', String) 42 | status_emoji = Column('status_emoji', String) 43 | avatar_hash = Column('avatar_hash', String) 44 | first_name = Column('first_name', String) 45 | last_name = Column('last_name', String) 46 | image_24 = Column('image_24', String) 47 | image_32 = Column('image_32', String) 48 | image_48 = Column('image_48', String) 49 | image_72 = Column('image_72', String) 50 | image_192 = Column('image_192', String) 51 | image_512 = Column('image_512', String) 52 | team = Column('team', String) 53 | 54 | def __init__(self, data): 55 | self.id = data.get('id') 56 | self.name = data.get('name') 57 | self.deleted = data.get('deleted', False) 58 | 59 | self.tz = data.get('tz', '') 60 | self.tz_label = data.get('tz_label', '') 61 | self.tz_offset = data.get('tz_offset', 0) 62 | self.is_admin = data.get('is_admin', False) 63 | self.is_owner = data.get('is_owner', False) 64 | self.is_primary_owner = data.get('is_primary_owner', False) 65 | self.is_restricted = data.get('is_restricted', False) 66 | self.is_ultra_restricted = data.get('is_ultra_restricted', False) 67 | self.is_bot = data.get('is_bot', False) 68 | self.updated = data.get('updated', 0) 69 | self.is_app_user = data.get('is_app_user', False) 70 | 71 | self.title = data.get('profile', {}).get('title', '') 72 | self.phone = data.get('profile', {}).get('phone', '') 73 | self.skype = data.get('profile', {}).get('skype', '') 74 | self.real_name = data.get('profile', {}).get('real_name', '') 75 | self.real_name_normalized = data.get('profile', {}).get('real_name_normalized', '') 76 | self.display_name = data.get('profile', {}).get('display_name', '') 77 | self.display_name_normalized = data.get('profile', {}).get('display_name_normalized', '') 78 | # self.fields = data.get('fields', '') 79 | self.status_text = data.get('profile', {}).get('status_text', '') 80 | self.status_emoji = data.get('profile', {}).get('status_emoji', '') 81 | self.avatar_hash = data.get('profile', {}).get('avatar_hash', '') 82 | self.first_name = data.get('profile', {}).get('first_name', '') 83 | self.last_name = data.get('profile', {}).get('last_name', '') 84 | self.image_24 = data.get('profile', {}).get('image_24', '') 85 | self.image_32 = data.get('profile', {}).get('image_32', '') 86 | self.image_48 = data.get('profile', {}).get('image_48', '') 87 | self.image_72 = data.get('profile', {}).get('image_72', '') 88 | self.image_192 = data.get('profile', {}).get('image_192', '') 89 | self.image_512 = data.get('profile', {}).get('image_512', '') 90 | self.team = data.get('profile', {}).get('team', '') 91 | 92 | def __repr__(self): 93 | return "<UserData({0}, {1})>".format(self.id, self.real_name) 94 | 95 | 96 | 97 | 
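# Not part of the original users_parser.py: a quick sanity check that could be
# run after run.py has populated the database. The sqlite path mirrors run.py's
# local_name ('sqlite:///../ods-slack.db'); adjust it if another DB is used.
import sqlite3

def count_imported_users(db_path='../ods-slack.db'):
    with sqlite3.connect(db_path) as conn:
        # imported_user_data is the table created from the UserData model above
        (n,) = conn.execute('SELECT COUNT(*) FROM imported_user_data').fetchone()
    return n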
def parse_users(session, data_path): 98 | json_path = data_path + '/users.json' 99 | data = json.load(open(json_path)) 100 | for user in data: 101 | session.add(UserData(user)) 102 | session.commit() 103 | 104 | 105 | def parse_messages_get_fields(data_path): 106 | json_path = data_path + '/users.json' 107 | fields_ods = {'skype' : 'Xf0DANL9SL', 'github' : 'Xf3WC3HJMR' } 108 | data = json.load(open(json_path)) 109 | with open('fields.csv', 'w') as f: 110 | for user in data: 111 | skype = '' 112 | github = '' 113 | if user['profile'].get('fields', ''): 114 | if fields_ods['skype'] in user['profile']['fields']: 115 | skype = user['profile']['fields'][fields_ods['skype']] 116 | if skype['alt'] != '': 117 | skype = skype['alt'] 118 | else: 119 | skype = skype['value'] 120 | if fields_ods['github'] in user['profile']['fields']: 121 | github = user['profile']['fields'][fields_ods['github']] 122 | if github['alt'] != '': 123 | github = github['alt'] 124 | else: 125 | github = github['value'] 126 | f.write('"' + '","'.join([user['id'], user['name'], user['profile']['title'], 127 | user['profile'].get('real_name_normalized', ''), 128 | user['profile'].get('first_name', ''), user['profile'].get('last_name', ''), 129 | skype, github]) + '"\n') 130 | 131 | 132 | if __name__ == '__main__': 133 | parse_messages_get_fields('../data/users.json') -------------------------------------------------------------------------------- /hackathon_2_march_2018/topic_modelling/01. clean_text_parsing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "\n", 14 | "import artm\n", 15 | "import json\n", 16 | "import re\n", 17 | "\n", 18 | "import os\n", 19 | "\n", 20 | "import nltk\n", 21 | "from nltk.stem import SnowballStemmer\n", 22 | "from nltk.corpus import brown\n", 23 | "\n", 24 | "from tqdm import tqdm, tqdm_notebook, tqdm_pandas\n", 25 | "\n", 26 | "\n", 27 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stderr", 37 | "output_type": "stream", 38 | "text": [ 39 | "<input>:8: DeprecationWarning: invalid escape sequence \\w\n", 40 | "<input>:8: DeprecationWarning: invalid escape sequence \\w\n", 41 | "<ipython-input-3-e9dd4ed0e002>:8: DeprecationWarning: invalid escape sequence \\w\n", 42 | " stem = re.sub('[!@#$:]', '', ' '.join(re.findall('\\w{4,}', str(stem).lower())))\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "stemmer = SnowballStemmer('russian')\n", 48 | "\n", 49 | "def clean_text(document):\n", 50 | " #stem = BeautifulSoup(document, 'xml').get_text()\n", 51 | " document = str(document)\n", 52 | " stem=[stemmer.stem(w) for w in document.split()]\n", 53 | " stem= ' '.join(stem)\n", 54 | " stem = re.sub('[!@#$:]', '', ' '.join(re.findall('\\w{4,}', str(stem).lower())))\n", 55 | " return(stem)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "df = pd.read_csv('../../data/ods_dump/messages.csv')" 76 | ] 77 | }, 78 | 
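The clean_text cell earlier in this notebook raises DeprecationWarnings because '\w{4,}' is written as a plain (non-raw) string. A behaviour-preserving rewrite (not part of the original notebook) using raw regex strings:

import re
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('russian')

def clean_text(document):
    # stem every whitespace-separated token with the Russian Snowball stemmer
    stems = ' '.join(stemmer.stem(w) for w in str(document).split())
    # keep lowercased "words" of 4+ characters and strip leftover punctuation
    return re.sub(r'[!@#$:]', '', ' '.join(re.findall(r'\w{4,}', stems.lower())))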
{ 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "(751861, 2)" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "df.shape" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "df['for_del1'] = df['text'].apply(lambda x:1 if 'channel' in str(x) else 0)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "metadata": { 122 | "collapsed": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "df['for_del2'] = df['text'].apply(lambda x:1 if 'upload' in str(x) else 0)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 7, 132 | "metadata": { 133 | "collapsed": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "df['for_del3'] = df['text'].apply(lambda x:1 if 'joined' in str(x) else 0)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "df['for_del4'] = df['text'].apply(lambda x:1 if 'added' in str(x) else 0)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "df['for_del'] = df['for_del1'] + df['for_del2'] + df['for_del3'] + df['for_del4'] " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 10, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "df_clean = df[df['for_del'] == 0][['user', 'text']]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 11, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/html": [ 181 | "<div>\n", 182 | "<style>\n", 183 | " .dataframe thead tr:only-child th {\n", 184 | " text-align: right;\n", 185 | " }\n", 186 | "\n", 187 | " .dataframe thead th {\n", 188 | " text-align: left;\n", 189 | " }\n", 190 | "\n", 191 | " .dataframe tbody tr th {\n", 192 | " vertical-align: top;\n", 193 | " }\n", 194 | "</style>\n", 195 | "<table border=\"1\" class=\"dataframe\">\n", 196 | " <thead>\n", 197 | " <tr style=\"text-align: right;\">\n", 198 | " <th></th>\n", 199 | " <th>user</th>\n", 200 | " <th>text</th>\n", 201 | " </tr>\n", 202 | " </thead>\n", 203 | " <tbody>\n", 204 | " <tr>\n", 205 | " <th>0</th>\n", 206 | " <td>U1UMQM200</td>\n", 207 | " <td><@U1Z2QA4EM> как избавиться от рекурсии?</td>\n", 208 | " </tr>\n", 209 | " <tr>\n", 210 | " <th>1</th>\n", 211 | " <td>U1Z2QA4EM</td>\n", 212 | " <td><@U1UMQM200>: избавиться от искушения - это ка...</td>\n", 213 | " </tr>\n", 214 | " <tr>\n", 215 | " <th>2</th>\n", 216 | " <td>U09JEC7V0</td>\n", 217 | " <td><@U1Z2QA4EM> в психотерапию умеешь?</td>\n", 218 | " </tr>\n", 219 | " <tr>\n", 220 | " <th>3</th>\n", 221 | " <td>U1Z2QA4EM</td>\n", 222 | " <td><@U09JEC7V0>: ох уж этот реверс в аметисты сос...</td>\n", 223 | " </tr>\n", 224 | " <tr>\n", 225 | " <th>4</th>\n", 226 | " <td>U065VP6F7</td>\n", 227 | " <td><@U1Z2QA4EM> может ты у мамки психолог?</td>\n", 228 | " </tr>\n", 229 | " </tbody>\n", 230 | "</table>\n", 231 | 
"</div>" 232 | ], 233 | "text/plain": [ 234 | " user text\n", 235 | "0 U1UMQM200 <@U1Z2QA4EM> как избавиться от рекурсии?\n", 236 | "1 U1Z2QA4EM <@U1UMQM200>: избавиться от искушения - это ка...\n", 237 | "2 U09JEC7V0 <@U1Z2QA4EM> в психотерапию умеешь?\n", 238 | "3 U1Z2QA4EM <@U09JEC7V0>: ох уж этот реверс в аметисты сос...\n", 239 | "4 U065VP6F7 <@U1Z2QA4EM> может ты у мамки психолог?" 240 | ] 241 | }, 242 | "execution_count": 11, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "df_clean.head()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 16, 254 | "metadata": { 255 | "collapsed": true 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "#tqdm.pandas()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 1, 265 | "metadata": { 266 | "collapsed": true 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "#df_clean['stem_text'] = df_clean['text'].progress_apply(lambda x: clean_text(x))" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 2, 276 | "metadata": { 277 | "collapsed": true, 278 | "scrolled": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "#df_clean.to_csv('../../data/ods_dump/clean_message.csv')" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 3, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/html": [ 293 | "<div>\n", 294 | "<style>\n", 295 | " .dataframe thead tr:only-child th {\n", 296 | " text-align: right;\n", 297 | " }\n", 298 | "\n", 299 | " .dataframe thead th {\n", 300 | " text-align: left;\n", 301 | " }\n", 302 | "\n", 303 | " .dataframe tbody tr th {\n", 304 | " vertical-align: top;\n", 305 | " }\n", 306 | "</style>\n", 307 | "<table border=\"1\" class=\"dataframe\">\n", 308 | " <thead>\n", 309 | " <tr style=\"text-align: right;\">\n", 310 | " <th></th>\n", 311 | " <th>Unnamed: 0</th>\n", 312 | " <th>user</th>\n", 313 | " <th>text</th>\n", 314 | " <th>stem_text</th>\n", 315 | " </tr>\n", 316 | " </thead>\n", 317 | " <tbody>\n", 318 | " <tr>\n", 319 | " <th>0</th>\n", 320 | " <td>0</td>\n", 321 | " <td>U1UMQM200</td>\n", 322 | " <td><@U1Z2QA4EM> как избавиться от рекурсии?</td>\n", 323 | " <td>u1z2qa4em избав рекурсии</td>\n", 324 | " </tr>\n", 325 | " <tr>\n", 326 | " <th>1</th>\n", 327 | " <td>1</td>\n", 328 | " <td>U1Z2QA4EM</td>\n", 329 | " <td><@U1UMQM200>: избавиться от искушения - это ка...</td>\n", 330 | " <td>u1umqm200 избав искушен контрольн выстрел голов</td>\n", 331 | " </tr>\n", 332 | " <tr>\n", 333 | " <th>2</th>\n", 334 | " <td>2</td>\n", 335 | " <td>U09JEC7V0</td>\n", 336 | " <td><@U1Z2QA4EM> в психотерапию умеешь?</td>\n", 337 | " <td>u1z2qa4em психотерап умеешь</td>\n", 338 | " </tr>\n", 339 | " </tbody>\n", 340 | "</table>\n", 341 | "</div>" 342 | ], 343 | "text/plain": [ 344 | " Unnamed: 0 user text \\\n", 345 | "0 0 U1UMQM200 <@U1Z2QA4EM> как избавиться от рекурсии? \n", 346 | "1 1 U1Z2QA4EM <@U1UMQM200>: избавиться от искушения - это ка... \n", 347 | "2 2 U09JEC7V0 <@U1Z2QA4EM> в психотерапию умеешь? 
\n", 348 | "\n", 349 | " stem_text \n", 350 | "0 u1z2qa4em избав рекурсии \n", 351 | "1 u1umqm200 избав искушен контрольн выстрел голов \n", 352 | "2 u1z2qa4em психотерап умеешь " 353 | ] 354 | }, 355 | "execution_count": 3, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "df_clean = pd.read_csv('../../data/ods_dump/clean_message.csv')\n", 362 | "df_clean.head(3)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "collapsed": true 370 | }, 371 | "outputs": [], 372 | "source": [] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": { 378 | "collapsed": true 379 | }, 380 | "outputs": [], 381 | "source": [] 382 | } 383 | ], 384 | "metadata": { 385 | "kernelspec": { 386 | "display_name": "Python 3", 387 | "language": "python", 388 | "name": "python3" 389 | }, 390 | "language_info": { 391 | "codemirror_mode": { 392 | "name": "ipython", 393 | "version": 3 394 | }, 395 | "file_extension": ".py", 396 | "mimetype": "text/x-python", 397 | "name": "python", 398 | "nbconvert_exporter": "python", 399 | "pygments_lexer": "ipython3", 400 | "version": "3.6.3" 401 | } 402 | }, 403 | "nbformat": 4, 404 | "nbformat_minor": 2 405 | } 406 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/topic_modelling/02. vocabulary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "\n", 14 | "import gc\n", 15 | "\n", 16 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 17 | "import artm" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 34, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/html": [ 28 | "<div>\n", 29 | "<style>\n", 30 | " .dataframe thead tr:only-child th {\n", 31 | " text-align: right;\n", 32 | " }\n", 33 | "\n", 34 | " .dataframe thead th {\n", 35 | " text-align: left;\n", 36 | " }\n", 37 | "\n", 38 | " .dataframe tbody tr th {\n", 39 | " vertical-align: top;\n", 40 | " }\n", 41 | "</style>\n", 42 | "<table border=\"1\" class=\"dataframe\">\n", 43 | " <thead>\n", 44 | " <tr style=\"text-align: right;\">\n", 45 | " <th></th>\n", 46 | " <th>user</th>\n", 47 | " <th>text</th>\n", 48 | " <th>stem_text</th>\n", 49 | " </tr>\n", 50 | " </thead>\n", 51 | " <tbody>\n", 52 | " <tr>\n", 53 | " <th>0</th>\n", 54 | " <td>U1UMQM200</td>\n", 55 | " <td><@U1Z2QA4EM> как избавиться от рекурсии?</td>\n", 56 | " <td>u1z2qa4em избав рекурсии</td>\n", 57 | " </tr>\n", 58 | " <tr>\n", 59 | " <th>1</th>\n", 60 | " <td>U1Z2QA4EM</td>\n", 61 | " <td><@U1UMQM200>: избавиться от искушения - это ка...</td>\n", 62 | " <td>u1umqm200 избав искушен контрольн выстрел голов</td>\n", 63 | " </tr>\n", 64 | " <tr>\n", 65 | " <th>2</th>\n", 66 | " <td>U09JEC7V0</td>\n", 67 | " <td><@U1Z2QA4EM> в психотерапию умеешь?</td>\n", 68 | " <td>u1z2qa4em психотерап умеешь</td>\n", 69 | " </tr>\n", 70 | " </tbody>\n", 71 | "</table>\n", 72 | "</div>" 73 | ], 74 | "text/plain": [ 75 | " user text \\\n", 76 | "0 U1UMQM200 <@U1Z2QA4EM> как избавиться от рекурсии? \n", 77 | "1 U1Z2QA4EM <@U1UMQM200>: избавиться от искушения - это ка... \n", 78 | "2 U09JEC7V0 <@U1Z2QA4EM> в психотерапию умеешь? 
\n", 79 | "\n", 80 | " stem_text \n", 81 | "0 u1z2qa4em избав рекурсии \n", 82 | "1 u1umqm200 избав искушен контрольн выстрел голов \n", 83 | "2 u1z2qa4em психотерап умеешь " 84 | ] 85 | }, 86 | "execution_count": 34, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "df_clean = pd.read_csv('../../data/ods_dump/clean_message.csv', usecols=['user', 'text', 'stem_text'])\n", 93 | "df_clean.head(3)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 35, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "cv = TfidfVectorizer(max_features=10000, max_df=0.9, min_df=0.00001, ngram_range=(2,3), stop_words='english')" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 36, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "message = df_clean['stem_text'].fillna(' ') \n", 112 | "del(df_clean)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 37, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "cv.fit(message)\n", 122 | "n_wd = cv.transform(message)\n", 123 | "del(message)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 38, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "vocabulary = cv.get_feature_names()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 39, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "79" 144 | ] 145 | }, 146 | "execution_count": 39, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "gc.collect()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 40, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "n_wd = n_wd.todense()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 41, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "10000" 173 | ] 174 | }, 175 | "execution_count": 41, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "len(vocabulary)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 42, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "bv = artm.BatchVectorizer(data_format='bow_n_wd', batch_size=1000, \n", 212 | " n_wd=n_wd.T,\n", 213 | " vocabulary=vocabulary)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "collapsed": true 221 | }, 222 | "outputs": [], 223 | "source": [] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "collapsed": true 230 | }, 231 | "outputs": [], 232 | "source": [] 233 | } 234 | ], 235 | "metadata": { 236 | "kernelspec": { 237 | "display_name": "Python 3", 238 | "language": "python", 239 | "name": "python3" 240 | }, 241 | "language_info": { 242 | "codemirror_mode": { 243 | "name": "ipython", 244 | "version": 3 245 | }, 246 | "file_extension": ".py", 247 | "mimetype": "text/x-python", 
248 | "name": "python", 249 | "nbconvert_exporter": "python", 250 | "pygments_lexer": "ipython3", 251 | "version": "3.6.3" 252 | } 253 | }, 254 | "nbformat": 4, 255 | "nbformat_minor": 2 256 | } 257 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-answers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-answers.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-answers_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-answers_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-questions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-questions.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-questions_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-questions_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-top-users_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-top-users_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/README.md: -------------------------------------------------------------------------------- 1 | # ODS вопросы и ответы 2 | 3 | **Задача** - посчитать количество сообщений с вопросами по каналам и пользователей которые чаще всего отвечают на вопросы 4 | 5 | **ods-get-data.ipynb** - сбор данных из файлов в каталоге и подкаталогах экспорта slack. 6 | 7 | **ods-qa.ipynb** - получение информации по не пустым корневым сообщениям (thread_ts == ts или пустое поле text), выборка из них сообщений с вопросами по наличию в сообщении знака '?' или слов маркеров, группировка сообщений с вопросами по каналам, группировка по количеству ответов пользователей на вопросы исключая пользователей которые задали вопрос. 8 | 9 | **Разобраться в следующий раз:** много сообщений с пустыми thread_ts и ts (есть боты и пользователи) и текстом (с учетом того что при удалении в текст записывается "This message was deleted".) 
10 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-answers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-answers.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-answers_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-answers_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-check-export.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import glob\n", 10 | "import datetime\n", 11 | "import pandas as pd\n", 12 | "import os\n", 13 | "import re\n", 14 | "\n", 15 | "# https://stackoverflow.com/questions/36587211/easiest-way-to-read-csv-files-with-multiprocessing-in-pandas\n", 16 | "# http://python-3.ru/page/multiprocessing\n", 17 | "from multiprocessing import Pool" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# Создаю/обнуляю список файлов \n", 27 | "files_full_path_list = list()\n", 28 | "\n", 29 | "# Путь к корневому каталогу файлов\n", 30 | "files_path = '/opt/app/data/shared/latest_dump/*/*.json'\n", 31 | "# files_path = '/opt/app/data/shared/latest_dump/*/2018*.json'" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# Получаю перечень полных пути файлов в подкаталогах\n", 41 | "for file_name in glob.glob(files_path, recursive=True):\n", 42 | " # Добавляю полный путь в список\n", 43 | " files_full_path_list.append(file_name)\n", 44 | " \n", 45 | "files_full_path_list.sort()\n", 46 | "\n", 47 | "print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Количество файлов:', len(files_full_path_list))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "files_full_path_list[:10]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# os.path.getsize('/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# !ls -all /opt/app/data/shared/latest_dump/___top_links/2018-08-06.json" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# %%time\n", 84 | "\n", 85 | "# # CPU times: user 6min 38s, sys: 36.4 s, total: 7min 15s\n", 86 | "\n", 87 | "# files_df = pd.DataFrame()\n", 88 | "\n", 89 | "# for file in files_full_path_list:\n", 90 | "\n", 91 | "# file_size = os.path.getsize(file)\n", 92 | " \n", 93 | "# # re.findall('dump/(.+?)/\\d', '/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')[0]\n", 94 | "# file_cat = 
re.findall('dump/(.+?)/\\d', file)[0]\n", 95 | " \n", 96 | "# # re.findall('(\\d\\d\\d\\d-\\d\\d-\\d\\d)', '/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')[0]\n", 97 | "# file_date = re.findall('(\\d\\d\\d\\d-\\d\\d-\\d\\d)', file)[0]\n", 98 | " \n", 99 | "# list = [[file, file_size, file_cat, file_date]]\n", 100 | " \n", 101 | "# files_df = files_df.append(pd.DataFrame(list, columns=['file', 'file_size', 'file_cat', 'file_date']),ignore_index=True)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "files_df = pd.DataFrame()\n", 118 | "\n", 119 | "def get_file_info(file):\n", 120 | " file_size = os.path.getsize(file)\n", 121 | "\n", 122 | " # re.findall('dump/(.+?)/\\d', '/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')[0]\n", 123 | " file_cat = re.findall('dump/(.+?)/\\d', file)[0]\n", 124 | "\n", 125 | " # re.findall('(\\d\\d\\d\\d-\\d\\d-\\d\\d)', '/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')[0]\n", 126 | " file_date = re.findall('(\\d\\d\\d\\d-\\d\\d-\\d\\d)', file)[0]\n", 127 | "\n", 128 | " list = [[file, file_size, file_cat, file_date]]\n", 129 | " \n", 130 | " return pd.DataFrame(list, columns=['file', 'file_size', 'file_cat', 'file_date'])\n", 131 | "\n", 132 | "pool = Pool(processes=10)\n", 133 | "df_list = pool.map(get_file_info, files_full_path_list)\n", 134 | "\n", 135 | "files_df = pd.concat(df_list, ignore_index=True)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "files_df.tail()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "files_df.info()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "files_df['file_date'] = pd.to_datetime(files_df['file_date'], format='%Y-%m-%d')\n", 163 | "files_df['file_size'] = files_df['file_size'].round(0).astype(int)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "files_df[['file_cat']]\\\n", 173 | " .groupby(['file_cat'])['file_cat'] \\\n", 174 | " .count() \\\n", 175 | " .reset_index(name='count') \\\n", 176 | " .sort_values(['count'], ascending=False) \\\n", 177 | " .head(10)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "files_df[['file_cat', 'file_size']]\\\n", 187 | " .groupby(['file_cat'])['file_size'] \\\n", 188 | " .sum() \\\n", 189 | " .reset_index(name='sum_kilobytes') \\\n", 190 | " .sort_values(['sum_kilobytes'], ascending=False) \\\n", 191 | " .head(10)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 
218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "import os\n", 306 | "import pandas as pd \n", 307 | "from multiprocessing import Pool\n", 308 | "\n", 309 | "# wrap your csv importer in a function that can be mapped\n", 310 | "def read_csv(filename):\n", 311 | " 'converts a filename to a pandas dataframe'\n", 312 | " return pd.read_csv(filename)\n", 313 | "\n", 314 | "\n", 315 | "def main():\n", 316 | " # set up your pool\n", 317 | " pool = Pool(processes=8) # or whatever your hardware can support\n", 318 | "\n", 319 | " # get a list of file names\n", 320 | " files = os.listdir('.')\n", 321 | " file_list = [filename for filename in files if filename.split('.')[1]=='csv']\n", 322 | "\n", 323 | " # have your pool map the file names to dataframes\n", 324 | " df_list = pool.map(read_csv, file_list)\n", 325 | "\n", 326 | " # reduce the list of dataframes to a single dataframe\n", 327 | " combined_df = pd.concat(df_list, ignore_index=True)\n", 328 | "\n", 329 | "if __name__ == '__main__':\n", 330 | " main()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "from multiprocessing import Pool\n", 340 | "\n", 341 | "def doubler(number):\n", 342 | " return number * 2\n", 343 | " \n", 344 | "numbers = [5, 10, 20]\n", 345 | "pool = Pool(processes=3)\n", 346 | "print(pool.map(doubler, numbers))" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 
null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "# Сколько файлов по каталогам и какого размера каталоги?\n" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# Сколько служебных сообщений?\n", 498 | "user leave channel \n", 499 | "user enter channel" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "# Создаю пустой dataframe для данных их файлов\n", 530 | "json_df = pd.DataFrame()" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "%%time\n", 540 | "# Наполняю данными о сообщениях за 2018 год dataframe (1min 31s) без multiprocessing\n", 541 | "# Переделать на multiprocessing\n", 542 | "\n", 543 | "for file in 
files_full_path_list:\n", 544 | " # Читаю файлы в dataframe\n", 545 | " data_parsed = json.loads(open(file).read())\n", 546 | " df = json_normalize(data_parsed)\n", 547 | " # Добавляю имя файла в dataframe для дальнейшего получения даты и названия канал\n", 548 | " df.insert(loc=0, column='FILE', value=file)\n", 549 | "# print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Добавляю содержимое файла в dataframe', file, round(os.path.getsize(file_name)/1000/1000,2), 'мегабайт')\n", 550 | " json_df = json_df.append(df, ignore_index=True, sort=False)" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "# # Сохраняю dataframe в csv\n", 560 | "\n", 561 | "# csv_file_name = '2018_ods_raw_new.csv'\n", 562 | "# csv_file_dir = '/opt/app/data/'\n", 563 | "# csv_file_path = csv_file_dir + csv_file_name\n", 564 | "\n", 565 | "# # Проверка существует ли файл. Если существует удаляю\n", 566 | "# if os.path.exists(csv_file_path):\n", 567 | "# os.remove(csv_file_name)\n", 568 | " \n", 569 | "# print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Добавляю содержимое dataframe в csv', csv_file_path) \n", 570 | "# json_df.to_csv(csv_file_name, sep='|', index=False, encoding='utf-8')\n", 571 | "\n", 572 | "# print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), \\\n", 573 | "# 'Размер csv файла:', \\\n", 574 | "# round(os.path.getsize(csv_file_path)/1000/1000,3), \\\n", 575 | "# 'Мегабайт')" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [] 661 | } 662 | ], 663 | "metadata": { 664 | "kernelspec": { 665 | "display_name": "Python 3", 666 | "language": "python", 667 | "name": "python3" 668 | }, 669 | "language_info": { 670 | "codemirror_mode": { 671 | "name": "ipython", 672 | "version": 3 673 | }, 674 | "file_extension": ".py", 675 | "mimetype": "text/x-python", 676 | "name": "python", 677 | "nbconvert_exporter": 
"python", 678 | "pygments_lexer": "ipython3", 679 | "version": "3.7.0" 680 | } 681 | }, 682 | "nbformat": 4, 683 | "nbformat_minor": 2 684 | } 685 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-get-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import glob\n", 10 | "import datetime\n", 11 | "import pandas as pd\n", 12 | "import json\n", 13 | "from pandas.io.json import json_normalize\n", 14 | "\n", 15 | "# https://stackoverflow.com/questions/36587211/easiest-way-to-read-csv-files-with-multiprocessing-in-pandas\n", 16 | "# http://python-3.ru/page/multiprocessing\n", 17 | "from multiprocessing import Pool\n", 18 | "\n", 19 | "import os" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# Создаю/обнуляю список файлов \n", 29 | "files_full_path_list = list()\n", 30 | "\n", 31 | "# Путь к корневому каталогу файлов\n", 32 | "files_path = '/opt/app/data/shared/latest_dump/*/*.json'\n", 33 | "# files_path = '/opt/app/data/shared/latest_dump/*/2018*.json'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Получаю перечень полных пути файлов в подкаталогах\n", 43 | "for file_name in glob.glob(files_path, recursive=True):\n", 44 | " # Добавляю полный путь в список\n", 45 | " files_full_path_list.append(file_name)\n", 46 | " \n", 47 | "files_full_path_list.sort()\n", 48 | "\n", 49 | "print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Количество файлов:', len(files_full_path_list))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "files_full_path_list[:10]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "json_df = pd.DataFrame()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# Создаю результирующий dataframe из json файлов с помощью multiprocessing\n", 77 | "\n", 78 | "def get_file_data(file):\n", 79 | " data_parsed = json.loads(open(file).read())\n", 80 | " df = json_normalize(data_parsed)\n", 81 | " df.insert(loc=0, column='file', value=file)\n", 82 | " return df\n", 83 | "\n", 84 | "pool = Pool(processes=10)\n", 85 | "df_list = pool.map(get_file_data, files_full_path_list)\n", 86 | "\n", 87 | "json_df = pd.concat(df_list, ignore_index=True, sort=True)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# Сохраняю dataframe в csv\n", 97 | "\n", 98 | "csv_file_name = 'ods_data.csv'\n", 99 | "csv_file_dir = './'\n", 100 | "csv_file_path = csv_file_dir + csv_file_name\n", 101 | "\n", 102 | "# Проверка существует ли файл. 
Если существует удаляю\n", 103 | "if os.path.exists(csv_file_path):\n", 104 | " os.remove(csv_file_name)\n", 105 | "\n", 106 | "json_df.to_csv(csv_file_name, sep='|', index=False, encoding='utf-8')\n", 107 | "\n", 108 | "print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), \\\n", 109 | " 'Размер csv файла:', \\\n", 110 | " round(os.path.getsize(csv_file_path)/(1000*1000.0),2), \\\n", 111 | " 'Мегабайт')" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.7.0" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-qa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Количество вопросов по каналам\n", 8 | "### Количество ответов пользователя по каналам\n", 9 | "### Стата по лайкам за год:\n", 10 | "- каких лайков сколько \n", 11 | "- самые залайканные посты в открытых каналах (включая максимум :parrot: , :pepe_sad: , :catshake: , :ods: , ...) " 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Импорт библиотек" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import glob\n", 28 | "import pandas as pd\n", 29 | "import json\n", 30 | "from pandas.io.json import json_normalize\n", 31 | "\n", 32 | "from datetime import datetime\n", 33 | "\n", 34 | "import os\n", 35 | "\n", 36 | "import re\n", 37 | "\n", 38 | "import pymorphy2\n", 39 | "morph = pymorphy2.MorphAnalyzer()\n", 40 | "\n", 41 | "%matplotlib inline\n", 42 | "import matplotlib.pyplot as plt" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "print ('Размер файла', round(os.path.getsize('ods_data.csv')/1000/1000.0,2), 'мегабайт')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df = pd.read_csv('ods_data.csv', sep='|', encoding='utf-8', dtype=str)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# tp = pd.read_csv('2018_ods_raw.csv', sep='|', encoding='utf-8', dtype=str, iterator=True, chunksize=1000)\n", 70 | "# print (tp)\n", 71 | "# #<pandas.io.parsers.TextFileReader object at 0x00000000150E0048>\n", 72 | "# df = pd.concat(tp, ignore_index=True)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# df.info() " 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "len(df)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | 
"execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Все столбцы таблицы\n", 100 | "# list(json_df)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# df.tail()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Добавляю категорию для сообщений\n", 119 | "df['cat'] = df['file']\n", 120 | "df['cat'] = df['cat'].str.replace('/opt/app/data/shared/latest_dump/', '')\n", 121 | "df['cat'] = df['cat'].str.replace('\\/.*','').str.strip()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "df['thread_ts'] = pd.to_datetime(df['thread_ts'], unit='s')\n", 131 | "df['ts'] = pd.to_datetime(df['ts'], unit='s')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "# Информация по сообщениям\n", 139 | "\n", 140 | "##### Количество сообщений всего/2018: 1 089 398 / 374 038\n", 141 | "##### Количество родительских сообщений всего/2018: 51724 / 26345\n", 142 | "##### Количество не пустых родительских сообщений всего/2018: 51447 / 26142\n", 143 | "##### Родительское сообщений набравшее больше всего ответов в 2018 и вообще (1183 шт.) https://opendatascience.slack.com/archives/C0SGCGB52/p1537287302000100" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Количество ответов на родительские сообщения \n", 153 | "df.groupby(['thread_ts'])['thread_ts'].agg('count').sort_values(ascending=False).head()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# https://opendatascience.slack.com/archives/C91N8TL83/p1542103865495600\n", 163 | "# df.loc[(df['thread_ts'] == '2018-11-13 10:11:05.495599985') & (df['thread_ts'] == df['ts'])]\n", 164 | "# https://opendatascience.slack.com/archives/C91N8TL83/p1542103865495600\n", 165 | "# df.loc[(df['thread_ts'] == '2018-11-14 09:55:01.799499989') & (df['thread_ts'] == df['ts'])]\n", 166 | "df['text'].loc[(df['thread_ts'] == '2018-09-18 16:15:02.000099897') & (df['thread_ts'] == df['ts'])]" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "len(df[df.thread_ts.isnull()])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# Родительские сообщения\n", 185 | "# Возможно неправильно!?!?!!?\n", 186 | "# df_parent = df[df.thread_ts == df.ts] # 2801\n", 187 | "# df_parent = df[df.thread_ts.isnull()] # 26652 из них _random_b 3491 не нашел как связать с ответами\n", 188 | "# df_parent = df[(df.thread_ts.isnull()) | (df.thread_ts == df.ts)]\n", 189 | "\n", 190 | "df_parent = df.loc[(df['thread_ts'] == df['ts'])]\n", 191 | "\n", 192 | "print ('Количество сообщений:', len(df))\n", 193 | "print ('Количество родительских сообщений:', len(df_parent))\n", 194 | "print ('Среднее количество ответов на родительское сообщение:', round(len(df)/len(df_parent),2))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "# Самые 
популярные родительские сообщения по количеству ответов\n", 204 | "# reply_count больше 200 почему то не бывает хотя есть сообщения с большим количеством коментариев\n", 205 | "\n", 206 | "# df_parent[['thread_ts', 'cat', 'text', 'reply_count']].sort_values(['reply_count'], ascending=False).head()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# # df_a.groupby(['cat', 'user']).size().head(10)\n", 216 | "# df_x = df.loc[df['thread_ts'].isin(df_parent['thread_ts'])]\n", 217 | "# # df_x[['thread_ts', 'ts', 'cat', 'text', 'reply_count']]\n", 218 | "# df_x.groupby(['thread_ts'])['thread_ts'].agg('count').sort_values(ascending=False).head()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# Удаляю сообщения с пустым text\n", 228 | "# Почему такое бывает не разобрался\n", 229 | "df_parent = df_parent.dropna(subset=['text'])\n", 230 | "print ('Количество не пустых родительских сообщений:', len(df_parent))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# Родительских сообщений по каналам\n", 240 | "df_parent[['cat','thread_ts']]\\\n", 241 | " .groupby(['cat'])['thread_ts'] \\\n", 242 | " .count() \\\n", 243 | " .reset_index(name='count') \\\n", 244 | " .sort_values(['count'], ascending=False) \\\n", 245 | " .head(10)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "# Информация по заданным вопросам\n", 253 | "##### Количество родительских сообщений с вопросами 2018: 17851 из 26142\n", 254 | "##### Количество родительских сообщений с вопросами всего: 33011 из 51447" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# Добавляю столбец текстом сообщений в номальной форме для того что бы потом искать сообщения с вопросами\n", 264 | "df_parent['morph_text'] = df_parent['text']" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "def to_norm_form(data, column):\n", 274 | " full_words_list = []\n", 275 | " words_row_list = data[column].tolist()\n", 276 | " # Каждую строчку в переданном столбце\n", 277 | " for i in range(len(words_row_list)):\n", 278 | " # Получаю список слов\n", 279 | " words_list = re.sub(\"[^\\w]\", \" \", words_row_list[i]).split()\n", 280 | " # Каждое слово из строки\n", 281 | " norm_words_list = []\n", 282 | " for word in words_list:\n", 283 | " norm_word = morph.parse(word)[0].normal_form\n", 284 | " norm_words_list.append(norm_word)\n", 285 | " \n", 286 | " full_words_string = ' '.join(norm_words_list)\n", 287 | " full_words_list.append(full_words_string)\n", 288 | " \n", 289 | " return full_words_list" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "%%time\n", 299 | "# Запонляю столбец morph_text текстом сообщений в номальной форме для того что бы потом искать сообщения с вопросами\n", 300 | "df_parent['morph_text'] = to_norm_form(df_parent, 'morph_text')" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "df_parent[['text', 
'morph_text']].head()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "# Добавляю признак что в тексте был знак вопроса\n", 319 | "df_parent['found_question_mark'] = df_parent['text'].str.contains('\\?')" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "df_parent[['text', 'morph_text', 'found_question_mark']].tail()" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "# morph.parse('зачем')[0].normal_form" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# Количество родительских сообщений с вопросами\n", 347 | "# QUESTION_WORD_LEMMAS = (\"как\", \"как-то\", \"какой\", \"какой-то\", \"зачем\", \"почему\", \"когда\", \"кто\", \"где\", \"когда\", \"куда\", \"куда-то\", \"чот\")\n", 348 | "#QUESTION_WORDS = ('вопрос', 'обьяснит', 'подсказать', 'посоветовать', 'как') # Количество родительских сообщений с вопросами: 1828\n", 349 | "QUESTION_WORDS = ('вопрос', 'обьяснит', 'подсказать', 'посоветовать', 'как', 'почему', 'зачем')\n", 350 | "\n", 351 | "df_q = df_parent.loc[(df_parent['morph_text'].str.contains('|'.join(QUESTION_WORDS))) | (df_parent.found_question_mark == True)].reset_index()" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "print ('Количество родительских сообщений:', len(df_parent))\n", 361 | "print ('Количество родительских сообщений с вопросами:', len(df_q))" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "# Вопросов в родительских сообщениях по каналам\n", 371 | "df_q[['cat','thread_ts']]\\\n", 372 | " .groupby(['cat'])['thread_ts'] \\\n", 373 | " .count() \\\n", 374 | " .reset_index(name='count') \\\n", 375 | " .sort_values(['count'], ascending=False) \\\n", 376 | " .head(10)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "# Вопросов в родительских сообщениях по каналам\n", 386 | "\n", 387 | "# df_q = df_q[df_q.cat != '_random_b'] # 5287\n", 388 | "# df_q = df_q[df_q.cat != 'stack_overflow'] # 456\n", 389 | "\n", 390 | "# df_q.groupby(['cat'])['thread_ts'].agg('count').sort_values(ascending=False).head()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "plt.rcParams[\"figure.figsize\"] = (16, 9)\n", 400 | "\n", 401 | "df_user_questions = df_q.groupby(['cat'])['thread_ts'].agg('count').sort_values(ascending=False)\n", 402 | "df_user_questions.head(10).plot.bar()\n", 403 | "\n", 404 | "plt.title('Количество вопросов по каналам за 2018 год (шт.)', loc='center')\n", 405 | "\n", 406 | "plt.xlabel('Канал')\n", 407 | "plt.ylabel('Количество')\n", 408 | "\n", 409 | "plt.savefig('ods-questions.png', bbox_inches = 'tight')\n", 410 | "\n", 411 | "# plt.savefig('2018-ods-questions.svg', format='svg')\n", 412 | "# plt.savefig('2018-ods-questions.png', bbox_inches = 'tight', dpi=600)\n", 413 | "# I used 1200 dpi because a lot of scientific journals require images in 1200 / 600 / 300 dpi 
depending on what the image is of" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "# Информация по ответам на вопросы" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "# # reply_count показывает странные цифры\n", 430 | "\n", 431 | "# # Ответов на сообщения вопросы по полю reply_count\n", 432 | "# # df_q[['cat', 'thread_ts', 'text', 'morph_text', 'reply_count']].sort_values('reply_count', ascending=False).head()\n", 433 | "# df_q[['cat', 'thread_ts', 'reply_count']].sort_values('reply_count', ascending=False).head()" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "# df_q.loc[(df_q['thread_ts'] == '2018-11-01 18:36:36.419199944')]" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "# df.loc[df['thread_ts'].isin(df_q['thread_ts'])].tail()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "# Ответов по вхождению сообщений пользователей в сообщения вопросы\n", 461 | "\n", 462 | "# наверно будет хорошей идеей отфильтровать из ответов на вопросы пользователей которые задали родительский вопрос\n", 463 | "\n", 464 | "# df_a = df.loc[df['thread_ts'].isin(df_q['thread_ts'])]\n", 465 | "# df_a[['cat','thread_ts', 'user']]\\\n", 466 | "# .groupby(['cat', 'thread_ts', 'user'])['thread_ts'] \\\n", 467 | "# .count() \\\n", 468 | "# .reset_index(name='count') \\\n", 469 | "# .sort_values(['count'], ascending=False) \\\n", 470 | "# .head()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "# len(df_a)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# df_q.loc[(df_q['thread_ts'] == '2018-11-16 08:59:45.085799932')]" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# наверно будет хорошей идеей отфильтровать из ответов на вопросы пользователей которые задали родительский вопрос\n", 498 | "# df_a = df.loc[df['thread_ts'].isin(df_q['thread_ts']) & ~df['user'].isin(df_q['user'])]\n", 499 | "\n", 500 | "# Ответов по вхождению сообщений пользователей в сообщения вопросы\n", 501 | "df_a = df.loc[df['thread_ts'].isin(df_q['thread_ts'])]\n", 502 | "\n", 503 | "df_a[['cat']]\\\n", 504 | " .groupby(['cat'])['cat'] \\\n", 505 | " .count() \\\n", 506 | " .reset_index(name='count') \\\n", 507 | " .sort_values(['count'], ascending=False) \\\n", 508 | " .head(10)" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "len(df_a)" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "# df_q.tail()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "plt.rcParams[\"figure.figsize\"] = (16, 9)\n", 536 | "\n", 537 | "df_user_answers = 
df_a.groupby(['cat'])['thread_ts'].agg('count').sort_values(ascending=False)\n", 538 | "df_user_answers.head(10).plot.bar()\n", 539 | "\n", 540 | "plt.title('Количество ответов по каналам за 2018 год (шт.)', loc='center')\n", 541 | "\n", 542 | "plt.xlabel('Канал')\n", 543 | "plt.ylabel('Количество')\n", 544 | "\n", 545 | "plt.savefig('ods-answers.png', bbox_inches = 'tight')\n", 546 | "\n", 547 | "# plt.savefig('2018-ods-questions.svg', format='svg')\n", 548 | "# plt.savefig('2018-ods-answers.png', bbox_inches = 'tight', dpi=600)\n", 549 | "# I used 1200 dpi because a lot of scientific journals require images in 1200 / 600 / 300 dpi depending on what the image is of" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "users_list = json.loads(open('/opt/app/data/shared/latest_dump/users.json').read())\n", 559 | "users_df = json_normalize(users_list)\n", 560 | "\n", 561 | "# users_df.info()\n", 562 | "# users_df[['id', 'name']].head()" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "df_a = pd.merge(df_a, users_df, how='left', left_on=['user'], right_on = ['id'])" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "df_a[['user', 'name']].head()" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "# Топ пользователей по ответам на вопросы\n", 590 | "df_a.groupby(['user', 'name'])['user'].agg('count').reset_index(name='count').sort_values(['count'], ascending=False).head(10)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [] 634 | } 635 | ], 636 | "metadata": { 637 | "kernelspec": { 638 | "display_name": "Python 3", 639 | "language": "python", 640 | "name": "python3" 641 | }, 642 | "language_info": { 643 | "codemirror_mode": { 644 | "name": "ipython", 645 | "version": 3 646 | }, 647 | "file_extension": ".py", 648 | "mimetype": "text/x-python", 649 | "name": "python", 650 | "nbconvert_exporter": "python", 651 | "pygments_lexer": "ipython3", 652 | "version": "3.7.0" 653 | } 654 | }, 655 | "nbformat": 4, 656 | "nbformat_minor": 2 657 | } 658 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-questions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-questions.png 
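ods-qa.ipynb above leaves "exclude the users who asked the question" as a commented-out filter (`~df['user'].isin(df_q['user'])`), which would also drop replies from anyone who has ever asked a question in any thread. Below is a hypothetical per-thread refinement, not code from the repository; it assumes the notebook's `df` (all messages) and `df_q` (question root messages) frames with `thread_ts` and `user` columns.

```python
import pandas as pd

def top_answerers(df: pd.DataFrame, df_q: pd.DataFrame, n: int = 10) -> pd.Series:
    """Count answers per user in question threads, excluding each thread's own asker (sketch).

    df   - all messages, with 'thread_ts' and 'user' columns
    df_q - question root messages, as built in ods-qa.ipynb
    """
    # Who asked each question thread.
    askers = df_q[['thread_ts', 'user']].rename(columns={'user': 'asker'})
    # Every message belonging to a question thread, with that thread's asker attached.
    replies = df[df['thread_ts'].isin(df_q['thread_ts'])].merge(askers, on='thread_ts', how='left')
    # Keep only messages written by someone other than the asker
    # (this also drops the question message itself, since its author is the asker).
    answers = replies[replies['user'] != replies['asker']]
    return answers.groupby('user').size().sort_values(ascending=False).head(n)
```

Called as `top_answerers(df, df_q)`, this could stand in for the plain `df_a.groupby(['user', 'name'])` count at the end of the notebook: a user's replies in other people's threads are kept, while their follow-ups in their own thread are discarded, which matches the intent stated in the README.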
-------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-questions_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-questions_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-top-users_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-top-users_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/folium_map/parse_geoservice_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "\n", 12 | "from geopy.geocoders import Nominatim, Yandex\n", 13 | "from geopy.exc import GeocoderServiceError, GeocoderTimedOut" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "def update_locations_geodata(geolocator, unique_locations, locations_geodata, bad_locations):\n", 23 | " for city in unique_locations:\n", 24 | " if (city in locations_geodata) or (city in bad_locations):\n", 25 | " continue\n", 26 | "\n", 27 | " try:\n", 28 | " location = geolocator.geocode(city)\n", 29 | " except GeocoderServiceError as e:\n", 30 | " print('GeocoderServiceError: {}'.format(e))\n", 31 | " break\n", 32 | " except GeocoderTimedOut as e:\n", 33 | " print('GeocoderTimedOut: {}'.format(e))\n", 34 | " break\n", 35 | "\n", 36 | " if location is None:\n", 37 | " bad_locations.append(city)\n", 38 | " else:\n", 39 | " locations_geodata[city] = location" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "<div>\n", 51 | "<style scoped>\n", 52 | " .dataframe tbody tr th:only-of-type {\n", 53 | " vertical-align: middle;\n", 54 | " }\n", 55 | "\n", 56 | " .dataframe tbody tr th {\n", 57 | " vertical-align: top;\n", 58 | " }\n", 59 | "\n", 60 | " .dataframe thead th {\n", 61 | " text-align: right;\n", 62 | " }\n", 63 | "</style>\n", 64 | "<table border=\"1\" class=\"dataframe\">\n", 65 | " <thead>\n", 66 | " <tr style=\"text-align: right;\">\n", 67 | " <th></th>\n", 68 | " <th>id</th>\n", 69 | " <th>city</th>\n", 70 | " </tr>\n", 71 | " </thead>\n", 72 | " <tbody>\n", 73 | " <tr>\n", 74 | " <th>0</th>\n", 75 | " <td>UE7T3UC1M</td>\n", 76 | " <td>Москва</td>\n", 77 | " </tr>\n", 78 | " <tr>\n", 79 | " <th>1</th>\n", 80 | " <td>UE61U6DCL</td>\n", 81 | " <td>Москва</td>\n", 82 | " </tr>\n", 83 | " <tr>\n", 84 | " <th>2</th>\n", 85 | " <td>UEF068197</td>\n", 86 | " <td>Moscow</td>\n", 87 | " </tr>\n", 88 | " <tr>\n", 89 | " <th>3</th>\n", 90 | " <td>UE7JRC006</td>\n", 91 | " <td>Краснодар</td>\n", 92 | " </tr>\n", 93 | " <tr>\n", 94 | " <th>4</th>\n", 95 | " <td>UE7M36F7Y</td>\n", 96 | " <td>Samara</td>\n", 97 | " </tr>\n", 98 | " </tbody>\n", 99 | "</table>\n", 100 | "</div>" 101 | ], 102 | "text/plain": [ 103 | " id city\n", 104 | "0 UE7T3UC1M 
Москва\n", 105 | "1 UE61U6DCL Москва\n", 106 | "2 UEF068197 Moscow\n", 107 | "3 UE7JRC006 Краснодар\n", 108 | "4 UE7M36F7Y Samara" 109 | ] 110 | }, 111 | "execution_count": 3, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "user_locations = pd.read_csv('./../user_id_to_from.csv')\n", 118 | "user_locations.rename({'from': 'city'}, axis=1, inplace=True)\n", 119 | "user_locations.head()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "(2101,)" 131 | ] 132 | }, 133 | "execution_count": 4, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "user_locations['city'].unique().shape" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "locations_geodata = dict()\n", 149 | "bad_locations = []" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 44, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "locations_geodata: 2041, bad_locations: 60\n", 162 | "complete flag: True\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "# execute until get all data (remember to geolocator limits!)\n", 168 | "\n", 169 | "#geolocator = Nominatim(user_agent='aborisihin')\n", 170 | "geolocator = Yandex()\n", 171 | "\n", 172 | "update_locations_geodata(geolocator, user_locations['city'].unique(), locations_geodata, bad_locations)\n", 173 | "\n", 174 | "print('locations_geodata: {}, bad_locations: {}'.format(len(locations_geodata), len(bad_locations)))\n", 175 | "\n", 176 | "complete_flag = (len(locations_geodata) + len(bad_locations) == len(user_locations['city'].unique()))\n", 177 | "print('complete flag: {}'.format(complete_flag))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 32, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "def geodata_value_osm(location_string, key):\n", 187 | " if location_string in locations_geodata:\n", 188 | " return locations_geodata[location_string].raw[key]\n", 189 | " else:\n", 190 | " return None\n", 191 | " \n", 192 | "def geodata_value_yandex(location_string, key):\n", 193 | " if location_string in locations_geodata:\n", 194 | " if (key == 'text') or (key == 'kind'):\n", 195 | " return locations_geodata[location_string].raw['metaDataProperty']['GeocoderMetaData'][key]\n", 196 | " elif key == 'lat':\n", 197 | " return locations_geodata[location_string].raw['Point']['pos'].split(' ')[0]\n", 198 | " elif key == 'lon':\n", 199 | " return locations_geodata[location_string].raw['Point']['pos'].split(' ')[1]\n", 200 | " \n", 201 | " return None" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 33, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# OpenStreetMaps\n", 211 | "# user_locations['geolocation_name'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'display_name'))\n", 212 | "# user_locations['geolocation_type'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'type'))\n", 213 | "# user_locations['geolocation_lat'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'lat'))\n", 214 | "# user_locations['geolocation_lon'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'lon'))\n", 215 | "\n", 216 | "# Yandex\n", 217 | 
"user_locations['geolocation_name'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'text'))\n", 218 | "user_locations['geolocation_type'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'kind'))\n", 219 | "user_locations['geolocation_lat'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'lat'))\n", 220 | "user_locations['geolocation_lon'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'lon'))" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 39, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/html": [ 231 | "<div>\n", 232 | "<style scoped>\n", 233 | " .dataframe tbody tr th:only-of-type {\n", 234 | " vertical-align: middle;\n", 235 | " }\n", 236 | "\n", 237 | " .dataframe tbody tr th {\n", 238 | " vertical-align: top;\n", 239 | " }\n", 240 | "\n", 241 | " .dataframe thead th {\n", 242 | " text-align: right;\n", 243 | " }\n", 244 | "</style>\n", 245 | "<table border=\"1\" class=\"dataframe\">\n", 246 | " <thead>\n", 247 | " <tr style=\"text-align: right;\">\n", 248 | " <th></th>\n", 249 | " <th>id</th>\n", 250 | " <th>city</th>\n", 251 | " <th>geolocation_name</th>\n", 252 | " <th>geolocation_type</th>\n", 253 | " <th>geolocation_lat</th>\n", 254 | " <th>geolocation_lon</th>\n", 255 | " </tr>\n", 256 | " </thead>\n", 257 | " <tbody>\n", 258 | " <tr>\n", 259 | " <th>0</th>\n", 260 | " <td>UE7T3UC1M</td>\n", 261 | " <td>Москва</td>\n", 262 | " <td>Россия, Москва</td>\n", 263 | " <td>province</td>\n", 264 | " <td>37.622504</td>\n", 265 | " <td>55.753215</td>\n", 266 | " </tr>\n", 267 | " <tr>\n", 268 | " <th>1</th>\n", 269 | " <td>UE61U6DCL</td>\n", 270 | " <td>Москва</td>\n", 271 | " <td>Россия, Москва</td>\n", 272 | " <td>province</td>\n", 273 | " <td>37.622504</td>\n", 274 | " <td>55.753215</td>\n", 275 | " </tr>\n", 276 | " <tr>\n", 277 | " <th>2</th>\n", 278 | " <td>UEF068197</td>\n", 279 | " <td>Moscow</td>\n", 280 | " <td>Россия, Москва</td>\n", 281 | " <td>locality</td>\n", 282 | " <td>37.617635</td>\n", 283 | " <td>55.755814</td>\n", 284 | " </tr>\n", 285 | " <tr>\n", 286 | " <th>3</th>\n", 287 | " <td>UE7JRC006</td>\n", 288 | " <td>Краснодар</td>\n", 289 | " <td>Россия, Краснодар</td>\n", 290 | " <td>locality</td>\n", 291 | " <td>38.975313</td>\n", 292 | " <td>45.03547</td>\n", 293 | " </tr>\n", 294 | " <tr>\n", 295 | " <th>4</th>\n", 296 | " <td>UE7M36F7Y</td>\n", 297 | " <td>Samara</td>\n", 298 | " <td>Россия, Самара</td>\n", 299 | " <td>locality</td>\n", 300 | " <td>50.101783</td>\n", 301 | " <td>53.195538</td>\n", 302 | " </tr>\n", 303 | " </tbody>\n", 304 | "</table>\n", 305 | "</div>" 306 | ], 307 | "text/plain": [ 308 | " id city geolocation_name geolocation_type geolocation_lat \\\n", 309 | "0 UE7T3UC1M Москва Россия, Москва province 37.622504 \n", 310 | "1 UE61U6DCL Москва Россия, Москва province 37.622504 \n", 311 | "2 UEF068197 Moscow Россия, Москва locality 37.617635 \n", 312 | "3 UE7JRC006 Краснодар Россия, Краснодар locality 38.975313 \n", 313 | "4 UE7M36F7Y Samara Россия, Самара locality 50.101783 \n", 314 | "\n", 315 | " geolocation_lon \n", 316 | "0 55.753215 \n", 317 | "1 55.753215 \n", 318 | "2 55.755814 \n", 319 | "3 45.03547 \n", 320 | "4 53.195538 " 321 | ] 322 | }, 323 | "execution_count": 39, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "user_locations.head()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 42, 335 | "metadata": {}, 336 | "outputs": 
[], 337 | "source": [ 338 | "#CSV_PATH = './user_locations_osm.csv'\n", 339 | "CSV_PATH = './user_locations_yandex.csv'\n", 340 | "\n", 341 | "user_locations.to_csv(CSV_PATH, index=False)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [] 350 | } 351 | ], 352 | "metadata": { 353 | "kernelspec": { 354 | "display_name": "Python 3", 355 | "language": "python", 356 | "name": "python3" 357 | }, 358 | "language_info": { 359 | "codemirror_mode": { 360 | "name": "ipython", 361 | "version": 3 362 | }, 363 | "file_extension": ".py", 364 | "mimetype": "text/x-python", 365 | "name": "python", 366 | "nbconvert_exporter": "python", 367 | "pygments_lexer": "ipython3", 368 | "version": "3.7.0" 369 | } 370 | }, 371 | "nbformat": 4, 372 | "nbformat_minor": 2 373 | } 374 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/folium_map/user_geodata.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def get_username_from_dump(users_json, user_id): 9 | """ Get username from dump file by user id. 10 | 11 | Args: 12 | users_json: dict object with dump data 13 | user_id: id to find 14 | """ 15 | for user in users_json: 16 | if user.get('id', '') == user_id: 17 | return user.get('name', '') 18 | return '' 19 | 20 | 21 | def filter_coordinate(data_row, coordinate_field): 22 | """ Get correct coordinates from dataframe row. 23 | Filter data by 'geolocation_type' field. 24 | 25 | Args: 26 | data_row: data 27 | coordinate_field: coordinate field name 28 | """ 29 | correct_location_types = [ 30 | 'city', # OSM 31 | 'locality', # yandex 32 | 'province', # yandex 33 | 'area' # yandex 34 | ] 35 | 36 | if data_row['geolocation_type'] in correct_location_types: 37 | return np.float(data_row[coordinate_field]) 38 | else: 39 | return None 40 | 41 | 42 | def prepare_user_data(settings_filepath): 43 | """ Prepare user geodata csv file. 44 | Connect geodata from geolocator service with usernames and filer correct coordinates. 
45 | 46 | Args: 47 | settings_filepath: path to settings file 48 | """ 49 | print('open settings: {}'.format(settings_filepath)) 50 | with open(settings_filepath, 'r') as settings_file: 51 | settings = json.load(settings_file) 52 | 53 | with open(settings['users_dump_file'], 'r') as users_json_file: 54 | users_json = json.load(users_json_file) 55 | 56 | user_locations = pd.read_csv(settings['users_locations_file']) 57 | 58 | user_locations['user'] = user_locations['id'].apply(lambda x: get_username_from_dump(users_json, x)) 59 | user_locations['latitude'] = user_locations.apply(lambda x: filter_coordinate(x, 'geolocation_lat'), axis=1) 60 | user_locations['longitude'] = user_locations.apply(lambda x: filter_coordinate(x, 'geolocation_lon'), axis=1) 61 | 62 | user_locations.to_csv(settings['output_file'], index=False) 63 | 64 | 65 | if __name__ == '__main__': 66 | if len(sys.argv) >= 2: 67 | prepare_user_data(sys.argv[1]) 68 | else: 69 | print('settings file needed') -------------------------------------------------------------------------------- /hackathon_3_december_2018/folium_map/user_geodata_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "users_dump_file": "./../shared/latest_dump/users.json", 3 | "users_locations_file": "user_locations_osm.csv", 4 | "output_file": "filtered_user_locations.csv" 5 | } -------------------------------------------------------------------------------- /hackathon_3_december_2018/folium_map/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import folium 4 | from folium import plugins 5 | from folium import IFrame 6 | 7 | import json 8 | import time 9 | 10 | import matplotlib.pyplot as plt 11 | from plotly import tools 12 | import plotly.graph_objs as go 13 | import plotly 14 | import pandas as pd 15 | from plotly.offline import plot 16 | 17 | from geopy.geocoders import Nominatim 18 | import configparser 19 | 20 | config = configparser.ConfigParser() 21 | config.read('settings.ini') 22 | username = config['PLOTLY']['username'] 23 | api_key = config['PLOTLY']['api_key'] 24 | 25 | plotly.tools.set_credentials_file(username=username, api_key=api_key) 26 | geolocator = Nominatim(user_agent='artgor') 27 | 28 | 29 | def plot_top_channels(plot_type='matplotlib', top_n=20): 30 | """ 31 | Plot top channels by user count. 32 | 33 | Use channels.json to get channels and user count in them. 34 | Can plot either top N channels in matplotlib or in Plotly 35 | 36 | :params: plot_type - matplotlib/plotly 37 | :params: top_n - plot top n channels. If None - plot all 38 | """ 39 | # load file 40 | with open('shared/latest_dump/channels.json', 'r') as f: 41 | channels = json.load(f) 42 | 43 | # number of users in channels 44 | d = {i['name']: len(i['members']) for i in channels} 45 | 46 | # sort data and convert to pandas DF. 
set channel as index for plotting 47 |     sorted_d = sorted(d.items(), key = lambda x: x[1], reverse=True) 48 |     df = pd.DataFrame(sorted_d, columns=['channel', 'user_count']) 49 |     df = df.set_index('channel') 50 | 51 |     if top_n is None: 52 |         top_n = len(df) 53 | 54 |     if plot_type == 'matplotlib': 55 |         df[:top_n].sort_values('user_count').plot(kind='barh', figsize=(12, 8)); 56 |         plt.title(f'Топ-{top_n} каналов по количеству пользователей'); 57 |         plt.show() 58 | 59 |     elif plot_type == 'plotly': 60 |         data = [go.Bar( 61 |             x=df[:top_n].index, 62 |             y=df[:top_n]['user_count'], 63 |             name='user counts' 64 |         )] 65 |         layout = go.Layout() 66 |         fig = go.Figure(data=data, layout=layout) 67 |         plot(fig, filename='top_channels.html') 68 |     else: 69 |         raise ValueError('Possible values: matplotlib or plotly') 70 | 71 | def prepare_data_for_folium(return_df=False, save_df=True, df_name='user_geo'): 72 |     """ 73 |     Prepares data for folium in a naive way. 74 | 75 |     Uses information about user time zone to prepare data for using in folium. 76 | 77 |     :params: return_df - whether to return df 78 |     :params: save_df - whether to save df 79 |     :params: df_name - name of saved df 80 |     """ 81 |     with open('shared/latest_dump/users.json', 'r') as f: 82 |         users = json.load(f) 83 | 84 |     # mapping of users to timezone 85 |     user_tz = {i['name'] : i['tz'] if 'tz' in i.keys() else '' for i in users} 86 | 87 |     # unique tz 88 |     tzs = list(set(list(user_tz.values()))) 89 | 90 |     # cities (skip time zones without a region/city part) 91 |     city_list = sorted([i.split('/')[1] for i in tzs if '/' in i]) 92 | 93 |     # getting data from the API. There is a limit on the number of requests, 94 |     # so sleep is used 95 |     city_geo = {} 96 |     for i, c in enumerate(city_list): 97 |         if i % 30 == 0: 98 |             time.sleep(1) 99 | 100 |         location = geolocator.geocode(c) 101 |         city_geo[c] = (location.latitude, location.longitude) 102 | 103 |     # create DataFrame 104 |     u_c_df = pd.DataFrame.from_dict(user_tz, orient='index') 105 |     u_c_df.reset_index(inplace=True) 106 |     u_c_df.columns = ['user', 'tz'] 107 | 108 |     u_c_df['city'] = u_c_df['tz'].apply(lambda x: x.split('/')[1] if '/' in x else '') 109 | 110 |     # dropping empty rows 111 |     u_c_df = u_c_df.loc[u_c_df['city'] != ''] 112 |     u_c_df['latitude'] = u_c_df['city'].apply(lambda x: city_geo[x][0]) 113 |     u_c_df['longitude'] = u_c_df['city'].apply(lambda x: city_geo[x][1]) 114 |     u_c_df['user_count'] = u_c_df.groupby('city')['user'].transform('count') 115 | 116 |     if save_df: 117 |         u_c_df.to_csv(f'{df_name}.csv', index=False) 118 | 119 |     if return_df: 120 |         return u_c_df 121 | 122 | def make_plotly_map(u_c_df, plot_by='city', add_heatmap=True): 123 |     """ 124 |     Make folium map. 125 | 126 |     Makes folium map with heatmap. 127 |     Can be done by cities or geo data. 128 |     Text of markers is made with html, so it can be easily changed to show any information. 129 | 130 |     :params: u_c_df pandas DataFrame with data. Must have columns: user (display name), city, 131 |         latitude, longitude. 132 |     :params: plot_by - plotting by city is usually adequate, but plotting by geo can work better 133 |         when the data isn't completely clean. 134 |     :params: add_heatmap - whether to add heatmap. 
135 | 136 | """ 137 | m = folium.Map([], zoom_start=15) 138 | if add_heatmap: 139 | geo_matrix = u_c_df[['latitude', 'longitude']].values 140 | m.add_child(plugins.HeatMap(geo_matrix, radius=10, min_opacity=0.6, max_zoom=10, max_val=1, blur=10, gradient={0.4: 'blue', 0.65: 'lime', 1: 'crimson'})); 141 | 142 | marker_cluster = plugins.MarkerCluster().add_to(m) 143 | 144 | if plot_by == 'city': 145 | main_col = 'city' 146 | count_col = 'user_count_city' 147 | 148 | elif plot_by == 'geo': 149 | main_col = 'latitude' 150 | count_col = 'user_count_lat' 151 | else: 152 | raise ValueError('Possible values: city or geo') 153 | 154 | for c in u_c_df[main_col].unique(): 155 | # make list of first 5 people 156 | city_users = list(u_c_df.loc[u_c_df[main_col] == c, 'user'].values)[:5] 157 | #city_users = '\n'.join(city_users) 158 | 159 | # user count 160 | user_count = u_c_df.loc[u_c_df[main_col] == c, count_col].unique()[0] 161 | 162 | # city name 163 | city_name = c if main_col == 'city' else u_c_df.loc[u_c_df[main_col] == c, 'city'].unique()[0] 164 | 165 | # creating folium markers 166 | html=f""" 167 | <h2> Город: {city_name}</h2><br> 168 | Количество пользователей: {user_count}<br> 169 | Здесь живут такие люди:<br> 170 | """ 171 | for u in city_users: 172 | html += u + '<br>' 173 | 174 | iframe = IFrame(html=html, width=500, height=300) 175 | popup = folium.Popup(iframe, max_width=300) 176 | 177 | folium.Marker(location=[u_c_df.loc[u_c_df[main_col] == c, 'latitude'].unique()[0], 178 | u_c_df.loc[u_c_df[main_col] == c, 'longitude'].unique()[0]], 179 | popup=popup 180 | ).add_to(marker_cluster) 181 | 182 | return m --------------------------------------------------------------------------------
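End-to-end usage sketch (illustrative, not a file from the repository): the folium_map pieces above chain together — parse_geoservice_data.ipynb writes user_locations_osm.csv / user_locations_yandex.csv, user_geodata.py joins one of them with users.json and writes the output_file named in user_geodata_settings.json (filtered_user_locations.csv), and utils.make_plotly_map renders the folium map. The snippet below assumes that CSV, adds the user_count_city column that make_plotly_map reads when plot_by='city', and saves to a hypothetical big_map.html.

import pandas as pd

from utils import make_plotly_map

# CSV produced by user_geodata.py (see "output_file" in user_geodata_settings.json)
u_c_df = pd.read_csv('filtered_user_locations.csv')

# keep rows with usable coordinates (filter_coordinate leaves NaN for unknown location types)
u_c_df = u_c_df.dropna(subset=['latitude', 'longitude'])

# make_plotly_map(plot_by='city') takes the per-city count from 'user_count_city'
u_c_df['user_count_city'] = u_c_df.groupby('city')['user'].transform('count')

m = make_plotly_map(u_c_df, plot_by='city', add_heatmap=True)
m.save('big_map.html')  # folium.Map.save writes a standalone HTML page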