├── .DS_Store
├── .gitignore
├── README.md
├── hackathon_1_may_2017
│   ├── .DS_Store
│   ├── ChunkByTime.ipynb
│   ├── ElasticsearchSynonyms.ipynb
│   ├── Get_links_from_meetings.ipynb
│   ├── PredictChannel.ipynb
│   ├── ThreadIndexingWithES.ipynb
│   ├── chunk.py
│   ├── docker-compose.yml
│   ├── event-parser.ipynb
│   ├── fact_extraction.ipynb
│   ├── fact_extraction_with_mystem.ipynb
│   ├── for_tomita
│   │   ├── config.proto
│   │   ├── courses.cxx
│   │   ├── education.cxx
│   │   ├── fact_types.proto
│   │   ├── interest.cxx
│   │   ├── job.cxx
│   │   ├── mydic.gzt
│   │   └── name.cxx
│   ├── help_data
│   │   ├── java_policy
│   │   └── synonyms.txt
│   ├── key_words.py
│   ├── predict_channel.py
│   ├── questions.csv
│   ├── requirements.txt
│   ├── slack_data_loader.py
│   ├── test_simple_question_extraction.ipynb
│   ├── tokenizer.py
│   └── vw.sh
├── hackathon_2_march_2018
│   ├── .DS_Store
│   ├── README.md
│   ├── data_fetch
│   │   ├── README.md
│   │   ├── msg_parser.py
│   │   ├── reaction_parser.py
│   │   ├── run.py
│   │   └── users_parser.py
│   ├── mention_count.ipynb
│   ├── topic_modelling
│   │   ├── 01. clean_text_parsing.ipynb
│   │   ├── 02. vocabulary.ipynb
│   │   └── 03. modeling.ipynb
│   └── username_mining
│       └── db.ipynb
├── hackathon_3_december_2018
│   ├── dv_qa
│   │   ├── 2018-ods-answers.png
│   │   ├── 2018-ods-answers_tab.png
│   │   ├── 2018-ods-questions.png
│   │   ├── 2018-ods-questions_tab.png
│   │   ├── 2018-ods-top-users_tab.png
│   │   ├── README.md
│   │   ├── ods-answers.png
│   │   ├── ods-answers_tab.png
│   │   ├── ods-check-export.ipynb
│   │   ├── ods-get-data.ipynb
│   │   ├── ods-qa.ipynb
│   │   ├── ods-questions.png
│   │   ├── ods-questions_tab.png
│   │   └── ods-top-users_tab.png
│   └── folium_map
│       ├── artgor_plot_folium.ipynb
│       ├── big_map_latest_upd.html
│       ├── parse_geoservice_data.ipynb
│       ├── user_geodata.py
│       ├── user_geodata_settings.json
│       └── utils.py
└── karma_onreact_counting.ipynb

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | ODS_dump_Mar_10_2017/
3 | opendatascience Slack export May 20 2017/
4 | *.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introspect_hackathon
2 | - [Code and data](https://github.com/open-data-science/Introspect_hackathon/tree/master/hackathon_1_may_2017) from ODS Introspect Hackathon #1, held at the café "Райский Пирожок", May 19–21, 2017.
3 | - [Code and data](https://github.com/open-data-science/Introspect_hackathon/tree/master/hackathon_2_march_2018) from ODS Introspect Hackathon #2, held at Mail.ru, March 16–18, 2018.
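Most of the hackathon #1 notebooks read the Slack export through `slack_data_loader.SlackLoader`. Below is a minimal sketch of that entry point, assuming you have a local unpacked copy of the export directory (the `opendatascience Slack export May 20 2017/` name comes from `.gitignore`); the constructor arguments and message fields are the ones used in `hackathon_1_may_2017/Get_links_from_meetings.ipynb`.

```python
import datetime

from slack_data_loader import SlackLoader  # hackathon_1_may_2017/slack_data_loader.py

# Assumption: path to your local, unpacked copy of the Slack export.
path_to_dump = 'opendatascience Slack export May 20 2017/'

# Load only the _meetings channel, keeping messages from 2017 onwards,
# as done in Get_links_from_meetings.ipynb.
exporter = SlackLoader(path_to_dump,
                       only_channels=('_meetings',),
                       start_date=datetime.datetime(2017, 1, 1))

# exporter.messages is a list of dicts with 'text', 'ts' and 'user' keys;
# exporter.channels and exporter.users hold channel and user metadata.
print(len(exporter.messages))
print(exporter.messages[0]['text'])
```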
4 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_1_may_2017/.DS_Store -------------------------------------------------------------------------------- /hackathon_1_may_2017/ElasticsearchSynonyms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import asyncio\n", 12 | "from aioes import Elasticsearch" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 53, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "es = Elasticsearch(['localhost:9200'])" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 110, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "def gen_synonyms():\n", 35 | " \"\"\"\n", 36 | " Generate some synonyms in a file. All words separated by comma are treated as equal\n", 37 | " \"\"\"\n", 38 | " with open(\"synonyms.txt\", \"w\") as syns:\n", 39 | " syns.write(\"xboost, эксгебуст, эксбуст, иксгебуст, xgboost\\n\")\n", 40 | " syns.write(\"пыха, пыху, пых, php\\n\")\n", 41 | " syns.write(\"lol, лол\\n\")\n", 42 | " syns.write(\"питон, python\\n\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 105, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "index_body = {\n", 52 | " \"settings\": {\n", 53 | " \"analysis\": {\n", 54 | " \"filter\": {\n", 55 | " \"russian_stop\": {\n", 56 | " \"type\": \"stop\",\n", 57 | " \"stopwords\": \"_russian_\" \n", 58 | " },\n", 59 | " \"russian_stemmer\": {\n", 60 | " \"type\": \"stemmer\",\n", 61 | " \"language\": \"russian\"\n", 62 | " },\n", 63 | " \"synonyms_expand\": {\n", 64 | " \"type\": \"synonym\", \n", 65 | " # path to synonym file.\n", 66 | " # for ES to be able to read it, security policy should be set as described here:\n", 67 | " # https://stackoverflow.com/questions/35401917/reading-a-file-in-an-elasticsearch-plugin\n", 68 | " \"synonyms_path\": \"/Users/enchantner/Experiments/synonyms.txt\"\n", 69 | " }\n", 70 | " },\n", 71 | " \"analyzer\": {\n", 72 | " \"russian_syn\": {\n", 73 | " \"tokenizer\": \"standard\",\n", 74 | " \"filter\": [\n", 75 | " \"lowercase\",\n", 76 | " \"russian_stop\",\n", 77 | " \"russian_stemmer\",\n", 78 | " \"synonyms_expand\"\n", 79 | " ]\n", 80 | " }\n", 81 | " }\n", 82 | " }\n", 83 | " },\n", 84 | " \"mappings\":{ \n", 85 | " \"question\":{ \n", 86 | " \"properties\":{ \n", 87 | " \"text\":{\"type\":\"string\", \"analyzer\":\"russian_syn\"}\n", 88 | " }\n", 89 | " }\n", 90 | " }\n", 91 | "}\n", 92 | "\n", 93 | "async def create_index():\n", 94 | " ret = await es.indices.create(\n", 95 | " index=\"questions-index\",\n", 96 | " body=index_body\n", 97 | " )\n", 98 | " print(ret)\n", 99 | " \n", 100 | " \n", 101 | "async def delete_index():\n", 102 | " ret = await es.delete(\n", 103 | " index=\"questions-index\"\n", 104 | " )\n", 105 | " print(ret)\n", 106 | " \n", 107 | "async def openclose():\n", 108 | " \"\"\"\n", 109 | " Closing and opening index again reloads synomyms file\n", 110 | " \"\"\"\n", 111 | " await es.indices.close(index=\"questions-index\")\n", 112 | " await 
es.indices.open(index=\"questions-index\")\n", 113 | " \n", 114 | "async def populate_index():\n", 115 | " await es.index(\n", 116 | " index=\"questions-index\",\n", 117 | " doc_type=\"question\",\n", 118 | " body={\n", 119 | " \"text\": \"А что мне рассказать про иксгебуст?\"\n", 120 | " }\n", 121 | " )\n", 122 | " await es.index(\n", 123 | " index=\"questions-index\",\n", 124 | " doc_type=\"question\",\n", 125 | " body={\n", 126 | " \"text\": \"Я ненавижу PHP, что мне делать?\"\n", 127 | " }\n", 128 | " )\n", 129 | " await es.index(\n", 130 | " index=\"questions-index\",\n", 131 | " doc_type=\"question\",\n", 132 | " body={\n", 133 | " \"text\": \"Я люблю питон, что мне делать?\"\n", 134 | " }\n", 135 | " )" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 103, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "{'acknowledged': True}\n", 148 | "{'acknowledged': True, 'shards_acknowledged': True}\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "loop = asyncio.get_event_loop()\n", 154 | "loop.run_until_complete(delete_index())\n", 155 | "loop.run_until_complete(create_index())\n", 156 | "loop.run_until_complete(populate_index())" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 111, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# reload synonims without recreating the whole database\n", 166 | "gen_synonyms()\n", 167 | "loop.run_until_complete(openclose())" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": true 175 | }, 176 | "outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.6.0" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 2 201 | } 202 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/Get_links_from_meetings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from slack_data_loader import SlackLoader\n", 12 | "import datetime\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import re" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 3, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "path_to_dump = '/Users/alex/Documents/ODS/opendatascience Slack export May 20 2017/'" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 4, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "100%|██████████| 728/728 [00:00<00:00, 1909.51it/s]\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "exporter = SlackLoader(path_to_dump, only_channels=('_meetings',),\n", 44 | " start_date=datetime.datetime(2017, 1, 1))" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": { 51 | "collapsed": 
true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "channel_attrs = ['id', 'name', 'created', 'creator', 'is_archived', 'is_general', 'pins', 'topic']\n", 56 | "\n", 57 | "def channels_to_df(channels):\n", 58 | " full_list = []\n", 59 | " for ch_id, ch_dict in channels.items():\n", 60 | " new_channel_dict = {}\n", 61 | " for k in channel_attrs:\n", 62 | " new_channel_dict[k] = ch_dict.get(k, None)\n", 63 | " new_channel_dict['num_members'] = len(ch_dict['members'])\n", 64 | " new_channel_dict['purpose'] = ch_dict['purpose']['value']\n", 65 | " full_list.append(new_channel_dict)\n", 66 | " return pd.DataFrame(full_list).set_index('id')\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 6, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "df = channels_to_df(exporter.channels)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 17, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "'26 декабря в ИППИ был семинар про NIPS-2016 и прогресс в машобуче. Занять место в первых рядах не успел, поэтому пришлось записывать видео с последних и дрожащими руками. Звук более-менее слышно, а слайды, наверное, сами участники смогут предоставить '" 89 | ] 90 | }, 91 | "execution_count": 17, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "exporter.messages[0]['text']" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 31, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "links_regex = re.compile(r'<(http[^>|]+)>')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 23, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "from operator import itemgetter, add\n", 120 | "from functools import reduce" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 32, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "links_total = reduce(add, map(links_regex.findall, map(itemgetter('text'),exporter.messages)))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 33, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "['https://www.youtube.com/watch?v=vFY2rez41_g',\n", 150 | " 'https://www.youtube.com/watch?v=Jh3D8Gi4N0I',\n", 151 | " 'http://www.nebotut.ru/',\n", 152 | " 'http://www.belylist.ru/',\n", 153 | " 'https://daily.afisha.ru/eating/3902-samyy-pyanyy-okrug-v-mire-5-marshrutov-dlya-barhoppinga-v-prazdniki/',\n", 154 | " 'http://anticafe-hp.ru/',\n", 155 | " 'http://tceh.com/event/kiwicom-brno-moscow-python-meetup/',\n", 156 | " 'https://corp.mail.ru/ru/press/events/298/',\n", 157 | " 'https://www.youtube.com/watch?v=1sQijC_I0gg',\n", 158 | " 'https://corp.mail.ru/ru/press/events/300/',\n", 159 | " 'https://opendatascience.slack.com/archives/_meetings/p1484385943000910',\n", 160 | " 'http://www.oreilly.com/pub/e/3855',\n", 161 | " 'https://www.facebook.com/events/1292384680807838/',\n", 162 | " 'https://vc.ru/p/statsbot-deal',\n", 163 | " 'http://venturebeat.com/2016/11/14/visa-lawyer-bot-pledges-to-help-immigrants-make-america-great-again/',\n", 164 | " 'http://dpllab.com/',\n", 165 | " 
'https://events.yandex.ru/events/ds/04/#place',\n", 166 | " 'https://events.yandex.ru/events/mltr/21-jan-2017/',\n", 167 | " 'https://opendatascience.slack.com/archives/_meetings/p1472564367000042',\n", 168 | " 'https://events.yandex.ru/events/b-konf/16-feb-2017/',\n", 169 | " 'https://vk.com/wall-117459195_196',\n", 170 | " 'https://boosters.pro/sberbank',\n", 171 | " 'https://pp.vk.me/c637626/v637626651/2b2c6/IYnue7kT5oM.jpg',\n", 172 | " 'https://events.yandex.ru/surveys/4316/',\n", 173 | " 'https://www.meetup.com/PyData-Moscow/',\n", 174 | " 'https://geektimes.ru/company/mailru/blog/285026/',\n", 175 | " 'https://robotics.timepad.ru/event/429057/',\n", 176 | " 'https://openvisconf.com/',\n", 177 | " 'https://events.yandex.ru/events/meetings/09-february-2017/',\n", 178 | " 'https://ai-community.timepad.ru/event/432990/',\n", 179 | " 'https://flyelephant.net/events/webinar-introduction-to-singularity',\n", 180 | " 'http://singularity.lbl.gov/',\n", 181 | " 'https://events.yandex.ru/events/meetings/09-february-2017/register/',\n", 182 | " 'https://www.meetup.com/PyData-Moscow/events/237579800/',\n", 183 | " 'https://corp.mail.ru/ru/press/events/315/',\n", 184 | " 'http://tceh.com/medhack/',\n", 185 | " 'http://lurkmore.to/%D0%91%D0%BE%D0%B1%D1%83%D0%BA',\n", 186 | " 'http://medit-2017.ru/',\n", 187 | " 'https://vk.com/wall-118482811_48',\n", 188 | " 'http://Instagram.com/playittodeath',\n", 189 | " 'http://www.info-space.ru/',\n", 190 | " 'https://events.yandex.ru/surveys/4453/',\n", 191 | " 'https://goo.gl/forms/4d7p46wsbLtni2Ot1',\n", 192 | " 'http://www.mathnet.ru/php/seminars.phtml?option_lang=rus&presentid=16449',\n", 193 | " 'https://habrahabr.ru/company/superjob/blog/321950/',\n", 194 | " 'https://www.meetup.com/Moscow-Data-Science/',\n", 195 | " 'https://opendatascience.slack.com/archives/_random_flood/p1487253931038865',\n", 196 | " 'https://events.yandex.ru/events/mltr/25-feb-2017/',\n", 197 | " 'https://newprolab.timepad.ru/event/447417/',\n", 198 | " 'http://wwsss17.com/',\n", 199 | " 'https://vk.com/wwsss17',\n", 200 | " 'https://cs.hse.ru/announcements/202188811.html',\n", 201 | " 'https://events.yandex.ru/events/ds/18-mar-2017/',\n", 202 | " 'http://pydata.org/amsterdam2017/',\n", 203 | " 'http://hackathon.spb.ru/',\n", 204 | " 'https://corp.mail.ru/ru/press/events/323/',\n", 205 | " 'https://goo.gl/forms/kNIPf1df1KQnEZz72',\n", 206 | " 'https://events.kaspersky.com/hackathon/',\n", 207 | " 'http://hackathon.ai/',\n", 208 | " 'https://events.yandex.ru/surveys/4527/',\n", 209 | " 'https://opendatascience.slack.com/archives/_meetings/p1488168000003088',\n", 210 | " 'https://robotics.timepad.ru/event/399682/',\n", 211 | " 'http://sk.ru/foundation/events/april2017/robotics/p/classes.aspx',\n", 212 | " 'https://www.instagram.com/p/BRKxeHGgMmH/',\n", 213 | " 'https://opendatascience.slack.com/archives/_meetings/p1488530583003484',\n", 214 | " 'https://events.yandex.ru/events/mltr/11-mar-2017/',\n", 215 | " 'https://habrahabr.ru/company/mailru/blog/322432/',\n", 216 | " 'http://hackathon.mts.ru',\n", 217 | " 'http://hackathon.mts.ru/images/picTeam/t3.png',\n", 218 | " 'http://rb.ru/rosbank/',\n", 219 | " 'https://docs.google.com/forms/d/e/1FAIpQLSfDH9IWJHAUEkeb5rXGpwvea24Nd4VV2LZBQ42xhgrgws_YpQ/viewform?c=0&w=1',\n", 220 | " 'https://vc.ru/p/9578',\n", 221 | " 'https://www.youtube.com/watch?v=E62S4QNltLc',\n", 222 | " 'https://www.youtube.com/watch?v=fhZXqTGsunw',\n", 223 | " 'https://goo.gl/forms/83hiODGnTzjwxJkY2',\n", 224 | " 
'https://events.yandex.ru/events/mltr/11-mar-2017/',\n", 225 | " 'https://goo.gl/forms/8AAipXJCQvqc6WHC2',\n", 226 | " 'https://flyelephant.net/events/webinar-julia',\n", 227 | " 'https://habrahabr.ru/company/flyelephant/blog/323840/',\n", 228 | " 'http://imgur.com/ogIQN0i',\n", 229 | " 'https://www.youtube.com/channel/UC91wUxUQ_uWznIo04dpXo3A',\n", 230 | " 'http://i.imgur.com/QQSFllR.jpg',\n", 231 | " 'https://vk.com/wall-142135418_5',\n", 232 | " 'http://www.datascience.in.ua/',\n", 233 | " 'https://sys.mail.ru/blog/entry/shemotehnika-08-pro-kartograf/',\n", 234 | " 'https://www.youtube.com/watch?v=DlK_37MVOvY',\n", 235 | " 'http://msk.opendataday.ru/',\n", 236 | " 'https://events.yandex.ru/events/mltr/25-mar-2017/',\n", 237 | " 'https://events.yandex.ru/events/meetings/3-april-2017/',\n", 238 | " 'https://www.youtube.com/playlist?list=PLkvzAel8ISD39_e1_jIhhWnSltFNOdTwZ',\n", 239 | " 'https://flyelephant.net/events/webinar-julia-live',\n", 240 | " 'https://chatbotconf.ru/ru',\n", 241 | " 'https://twitter.com/rsalakhu/status/846045079487832066?s=09',\n", 242 | " 'https://events.yandex.ru/events/ds/15-apr-2017/',\n", 243 | " 'https://goo.gl/forms/d4hPTIHClEkzchgu1',\n", 244 | " 'https://www.youtube.com/channel/UCBLlcLoM4czHN21yeaWxGZA',\n", 245 | " 'http://ospcon.osp.ru/bigdata',\n", 246 | " 'https://www.facebook.com/events/828239720648373',\n", 247 | " 'http://cs403922.userapi.com/v403922807/3ea2/z7rffcCbvm8.jpg',\n", 248 | " 'https://knowledgepit.fedcsis.org/contest/view.php?id=120',\n", 249 | " 'https://events.yandex.ru/surveys/4685/',\n", 250 | " 'https://yandex.ru/maps/-/CZcsM8YJ',\n", 251 | " 'https://events.kaspersky.com/hackathon/',\n", 252 | " 'https://events.webinar.ru/1904081/345927',\n", 253 | " 'https://events.yandex.ru/events/mltr/08-apr-2017/',\n", 254 | " 'https://rambler-co-e-org.timepad.ru/event/470664/',\n", 255 | " 'https://www.meetup.com/Apache-Spark-in-Moscow/events/past/?scroll=true',\n", 256 | " 'http://ai-community.com/events/ai-community/4-global-ai-meetup-computer-vision-technologies-06/04',\n", 257 | " 'https://opendatascience.slack.com/archives/C04422A5C/p1490684902916746',\n", 258 | " 'https://events.yandex.ru/events/meetings/13-apr-2017/',\n", 259 | " 'https://academy.yandex.ru/events/sci-sem/cv/',\n", 260 | " 'https://clickhouse.yandex/presentations/meetup4/clickhouse_for_analysts.pdf',\n", 261 | " 'http://matlab.ru/seminars/conf2017',\n", 262 | " 'http://meetu.ps/e/CGvgm/sYfx1/d',\n", 263 | " 'https://ladiescode.timepad.ru/event/471400/',\n", 264 | " 'http://data-science.com.ua/conferences/data-science-ua-conference-2017/',\n", 265 | " 'https://www.facebook.com/events/168767966965411/',\n", 266 | " 'http://datafest.in.ua',\n", 267 | " 'https://events.yandex.ru/events/science-seminars/24-april-2017/',\n", 268 | " 'http://grammars.grlmc.com/DeepLearn2017/',\n", 269 | " 'https://sites.google.com/site/emotiw2016/',\n", 270 | " 'https://youtu.be/oPDbUIWND_k',\n", 271 | " 'http://lifecode.site/?utm_source=newsletter&utm_medium=genehack&utm_campaign=13apr',\n", 272 | " 'http://datafest.by/',\n", 273 | " 'https://it.mail.ru/video/',\n", 274 | " 'http://spacehack.xyz/',\n", 275 | " 'https://www.youtube.com/watch?v=F1QvOJcxAzw',\n", 276 | " 'https://moscowdjango.timepad.ru/event/468277/',\n", 277 | " 'https://events.yandex.ru/events/mltr/22-apr-2017/',\n", 278 | " 'https://domclick.timepad.ru/event/476750/',\n", 279 | " 'https://vk.com/wall-44016343_14912?w=wall-138127986_121',\n", 280 | " 'https://goo.gl/forms/g8tB0MRUMXWVgpCh2',\n", 281 | " 
'https://goo.gl/forms/mL4eHnLEIbVfUFQb2',\n", 282 | " 'http://xn--90aihcg1anaka9m.xn--p1ai',\n", 283 | " 'http://xn--80abdlnldpssn.xn--p1ai',\n", 284 | " 'https://docs.google.com/document/d/1jwDGxd50NbzAuCcz60ct6Kr1rMQkB-8Q3EIAf_RX-mU/edit?usp=drivesdk',\n", 285 | " 'https://habrahabr.ru/company/microsoft/blog/326812/',\n", 286 | " 'http://spacehack.xyz',\n", 287 | " 'https://www.meetup.com/GDG-Moscow/events/239324673/',\n", 288 | " 'http://datascience.in.ua',\n", 289 | " 'https://www.facebook.com/photo.php?fbid=10212499098569921&set=a.10200648969644104.1073741825.1156281111&type=3&theater',\n", 290 | " 'https://events.webinar.ru/2442095/395929',\n", 291 | " 'http://dataring.ru/competitions/avito-recommendations/',\n", 292 | " 'https://events.yandex.ru/events/mltr/22-apr-2017/',\n", 293 | " 'https://www.meetup.com/PyData-Moscow/events/239404221/',\n", 294 | " 'https://goo.gl/forms/mL4eHnLEIbVfUFQb2',\n", 295 | " 'https://www.meetup.com/PyData-Moscow/events/239404221/',\n", 296 | " 'https://corp.mail.ru/ru/press/events/341/',\n", 297 | " 'https://www.youtube.com/channel/UCBLlcLoM4czHN21yeaWxGZA',\n", 298 | " 'http://spacehack.xyz/',\n", 299 | " 'http://dmlabs.org/spacehack.jpg',\n", 300 | " 'https://bigquery.cloud.google.com/dataset/fh-bigquery:reddit_comments',\n", 301 | " 'https://events.yandex.ru/events/ds/20-may-2017/',\n", 302 | " 'https://aisummit2017.ru/',\n", 303 | " 'https://www.s7.ru/home/offers/hackathon/index.dot',\n", 304 | " 'http://spa2017.org',\n", 305 | " 'http://www.pdmi.ras.ru/EIMI/2017/CNSA/index.html',\n", 306 | " 'http://www.pdmi.ras.ru/EIMI/2017/PTRP/index.html',\n", 307 | " 'http://dmery.ing.puc.cl/index.php/material/gdxray/',\n", 308 | " 'https://www.dropbox.com/sh/aym7wgup7m2c5hh/AACFjjmmozhWKFmRyzM0S0KYa?dl=0',\n", 309 | " 'https://fom-events.timepad.ru/event/485547/',\n", 310 | " 'https://events.yandex.ru/events/ds/27-apr-2017/',\n", 311 | " 'http://ritfest.ru/',\n", 312 | " 'https://pages.awscloud.com/awsomedaymoscow2017registration.html',\n", 313 | " 'http://machinescansee.com/',\n", 314 | " 'https://www.youtube.com/watch?v=WhM3Vvz37a0',\n", 315 | " 'http://www.mobiledimension.ru/confmay.php',\n", 316 | " 'http://events.softline.ru/event/10902/',\n", 317 | " 'https://www.youtube.com/watch?v=ZBwxRww_EYo',\n", 318 | " 'https://goo.gl/forms/qJ8JQsOfqzpxkN5m2',\n", 319 | " 'https://www.youtube.com/watch?v=rE3Y9eCfN8E',\n", 320 | " 'http://geodata.pro',\n", 321 | " 'https://robotics.timepad.ru/event/492985/',\n", 322 | " 'https://goo.gl/forms/qJ8JQsOfqzpxkN5m2',\n", 323 | " 'https://cs.hse.ru/datacase/pravoru',\n", 324 | " 'https://youtu.be/IFG9IBI2NoM',\n", 325 | " 'https://www.facebook.com/events/455274888147623/',\n", 326 | " 'http://talks.sourced.tech/machine-learning-2017/',\n", 327 | " 'https://domclick.timepad.ru/event/476750/',\n", 328 | " 'http://www.moscowpython.ru/meetup/45/',\n", 329 | " 'http://machinescansee.com/',\n", 330 | " 'http://it-events.com',\n", 331 | " 'https://corp.mail.ru/ru/press/events/347/',\n", 332 | " 'https://youtu.be/eixlC8K8GIg']" 333 | ] 334 | }, 335 | "execution_count": 33, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "links_total" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | 
"language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.6.1" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 2 375 | } 376 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/ThreadIndexingWithES.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import asyncio\n", 12 | "from aioes import Elasticsearch\n", 13 | "from elasticsearch import helpers" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from chunk import TimeDistance\n", 25 | "from chunk import Chunker\n", 26 | "from slack_data_loader import SlackLoader" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import json" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "PATH_TO_DATA = './data'" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 15, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "es = Elasticsearch(['localhost:9200'])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "def gen_synonyms():\n", 71 | " \"\"\"\n", 72 | " Generate some synonyms in a file. 
All words separated by comma are treated as equal\n", 73 | " \"\"\"\n", 74 | " with open(\"help_data/synonyms.txt\", \"w\") as syns:\n", 75 | " syns.write(\"xboost, эксгебуст, эксбуст, иксгебуст, xgboost\\n\")\n", 76 | " syns.write(\"пыха, пыху, пых, php\\n\")\n", 77 | " syns.write(\"lol, лол\\n\")\n", 78 | " syns.write(\"питон, python\\n\")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 20, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "index_name = \"ods-slack-index\"\n", 90 | "mapping_name = \"thread\"\n", 91 | "message_mapping = \"message\"\n", 92 | "index_body = {\n", 93 | " \"settings\": {\n", 94 | " \"analysis\": {\n", 95 | " \"filter\": {\n", 96 | " \"russian_stop\": {\n", 97 | " \"type\": \"stop\",\n", 98 | " \"stopwords\": \"_russian_\" \n", 99 | " },\n", 100 | " \"russian_stemmer\": {\n", 101 | " \"type\": \"stemmer\",\n", 102 | " \"language\": \"russian\"\n", 103 | " },\n", 104 | " \"synonyms_expand\": {\n", 105 | " \"type\": \"synonym\", \n", 106 | " # path to synonym file.\n", 107 | " # for ES to be able to read it, security policy should be set as described here:\n", 108 | " # https://stackoverflow.com/questions/35401917/reading-a-file-in-an-elasticsearch-plugin\n", 109 | " \"synonyms_path\": \"/usr/share/config_data/synonyms.txt\"\n", 110 | " }\n", 111 | " },\n", 112 | " \"analyzer\": {\n", 113 | " \"russian_syn\": {\n", 114 | " \"tokenizer\": \"standard\",\n", 115 | " \"filter\": [\n", 116 | " \"lowercase\",\n", 117 | " \"russian_stop\",\n", 118 | " \"russian_stemmer\",\n", 119 | " \"synonyms_expand\"\n", 120 | " ]\n", 121 | " }\n", 122 | " }\n", 123 | " }\n", 124 | " },\n", 125 | " \"mappings\":{ \n", 126 | " mapping_name:{\n", 127 | " \"properties\":{\n", 128 | " \"channel\": {\"type\": \"keyword\"},\n", 129 | " \"title\": {\"type\":\"string\", \"analyzer\":\"russian_syn\"},\n", 130 | " \"ts\": {\"type\": \"date\"},\n", 131 | " \"messages\" : {\n", 132 | " \"properties\":{\n", 133 | " \"text\": {\"type\":\"string\", \"analyzer\":\"russian_syn\"},\n", 134 | " \"user_id\": {\"type\": \"keyword\"},\n", 135 | " \"user_real_name\": {\"type\":\"string\"},\n", 136 | " \"ts\": {\"type\": \"date\"}\n", 137 | " }\n", 138 | " }\n", 139 | " }\n", 140 | " },\n", 141 | " message_mapping:{\n", 142 | " \"properties\":{\n", 143 | " \"text\": {\"type\":\"string\", \"analyzer\":\"russian_syn\"},\n", 144 | " \"user_id\": {\"type\": \"keyword\"},\n", 145 | " \"user_real_name\": {\"type\":\"string\"},\n", 146 | " \"ts\": {\"type\": \"date\"}\n", 147 | " }\n", 148 | " }\n", 149 | " }\n", 150 | "}\n", 151 | "\n", 152 | "async def create_index():\n", 153 | " return await es.indices.create(\n", 154 | " index=index_name,\n", 155 | " body=index_body\n", 156 | " )\n", 157 | " \n", 158 | "async def check_index_exists():\n", 159 | " return await es.indices.exists(index=index_name)\n", 160 | "\n", 161 | "\n", 162 | "async def delete_index():\n", 163 | " return await es.delete(index=index_name)\n", 164 | "\n", 165 | "async def openclose():\n", 166 | " \"\"\"\n", 167 | " Closing and opening index again reloads synomyms file\n", 168 | " \"\"\"\n", 169 | " await es.indices.close(index=index_name)\n", 170 | " await es.indices.open(index=index_name)\n", 171 | " \n", 172 | "async def populate_index(channel, messages):\n", 173 | " await es.index(\n", 174 | " index=index_name,\n", 175 | " doc_type=mapping_name,\n", 176 | " body={\n", 177 | " \"channel\": channel,\n", 178 | " \"title\": messages[0]['text'],\n", 179 | " \"ts\": 
messages[0]['ts'] * 1000,\n", 180 | " \"messages\": messages\n", 181 | " }\n", 182 | " )\n", 183 | " \"\"\"\n", 184 | " for message in messages: # make bulk upload here\n", 185 | " await es.index(\n", 186 | " index=index_name,\n", 187 | " doc_type=message_mapping,\n", 188 | " body=message\n", 189 | " )\n", 190 | " \"\"\"\n", 191 | "\n", 192 | "async def query_index(query):\n", 193 | " return await es.search(\n", 194 | " index=index_name,\n", 195 | " doc_type=mapping_name,\n", 196 | " body={\n", 197 | " \"query\":{\n", 198 | " \"multi_match\" : {\n", 199 | " \"fields\" : [ \"title^3\", \"messages.text\" ],\n", 200 | " \"query\": query\n", 201 | " }\n", 202 | " }\n", 203 | " }\n", 204 | " )\n", 205 | "\n", 206 | "async def search_user(username):\n", 207 | " return await es.search(\n", 208 | " index=index_name,\n", 209 | " doc_type=mapping_name,\n", 210 | " body={\n", 211 | " \"query\":{\n", 212 | " \"multi_match\" : {\n", 213 | " \"fields\" : [ \"messages.user_real_name\" ],\n", 214 | " \"query\": username\n", 215 | " }\n", 216 | " }\n", 217 | " }\n", 218 | " )" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 7, 224 | "metadata": { 225 | "collapsed": true 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "loop = asyncio.get_event_loop()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 16, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "{'acknowledged': True}\n", 244 | "{'acknowledged': True, 'shards_acknowledged': True}\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "if loop.run_until_complete(check_index_exists()):\n", 250 | " print(loop.run_until_complete(delete_index()))\n", 251 | " \n", 252 | "print(loop.run_until_complete(create_index()))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "# reload synonims without recreating the whole database\n", 264 | "gen_synonyms()\n", 265 | "loop.run_until_complete(openclose())" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 9, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "async def index_channel(channel = \"nlp\"):\n", 277 | " data = SlackLoader(PATH_TO_DATA, only_channels=[channel])\n", 278 | " chunker = Chunker()\n", 279 | " groups = chunker.get_groups(data)\n", 280 | " \n", 281 | " print(\"Indexing: \" + channel)\n", 282 | "\n", 283 | " workers = []\n", 284 | " for group in groups:\n", 285 | " users = data.users\n", 286 | " for msg in group:\n", 287 | " if msg['user'] in users:\n", 288 | " msg['user_real_name'] = users[msg['user']]['name']\n", 289 | " if 'dt' in msg:\n", 290 | " del msg['dt']\n", 291 | " msg['timestamp'] = str(msg['ts'])\n", 292 | " msg['ts'] = int(msg['ts'])\n", 293 | " if \"attachments\" in msg:\n", 294 | " for attach in msg[\"attachments\"]:\n", 295 | " if 'ts' in attach:\n", 296 | " attach['ts'] = float(attach['ts'])\n", 297 | " workers.append(\n", 298 | " asyncio.ensure_future(populate_index(channel, group))\n", 299 | " )\n", 300 | " return await asyncio.gather(*workers)\n", 301 | "\n", 302 | "async def index_channels(channels):\n", 303 | " await asyncio.gather(\n", 304 | " *[asyncio.ensure_future(index_channel(channel)) for channel in channels]\n", 305 | " )\n", 306 | " " 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | 
"execution_count": 17, 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "Indexing: nlp\n", 321 | "Indexing: deep_learning\n", 322 | "Indexing: datasets\n", 323 | "Indexing: sequences_series\n", 324 | "Indexing: bayesian\n", 325 | "Indexing: _meetings\n", 326 | "Indexing: edu_academy\n", 327 | "Indexing: edu_books\n", 328 | "Indexing: visualization\n", 329 | "Indexing: hardware\n", 330 | "Indexing: reinforcement_learnin\n", 331 | "Indexing: theory_and_practice\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "useful_channels = [\"nlp\", \"deep_learning\", \"datasets\",\n", 337 | " \"sequences_series\", \"bayesian\", \"_meetings\", \"edu_academy\",\n", 338 | " \"edu_books\", \"visualization\", \"hardware\",\n", 339 | " \"reinforcement_learnin\", \"theory_and_practice\"]\n", 340 | "\n", 341 | "loop.run_until_complete(index_channels(useful_channels))" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "loop.run_until_complete(query_index(\"как использовать xgboost в python\"))" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 57, 358 | "metadata": { 359 | "collapsed": false 360 | }, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "\n", 367 | "------------(nlp)--------------\n", 368 | "elwis: \n", 369 | "elwis: если надо просто отрезать окончания, то вот это подойдет\n", 370 | "stajilov: проверял недавно, не умеет нормально, лемматизацию может делать типо но нужен русский wordnet нормальный\n", 371 | "\n", 372 | "------------(nlp)--------------\n", 373 | "elwis: Коллега сделала интересную сравнительную таблицу чатботов, может кому-то пригодится: \n", 374 | "alexantonov: elwis: Отличная статья. А совершенно случайно нет на русском языке?\n", 375 | "elwis: <@U32506X36> К сожалению нет\n", 376 | "\n", 377 | "------------(nlp)--------------\n", 378 | "elwis: Если tf-idf вектора нормализованные, можно вместо косинусной близости считать скалярное произведение\n", 379 | "\n", 380 | "------------(nlp)--------------\n", 381 | "rvnikita: Ребята, привет. Что почиттать чтобы быстро разобраться в основнах NLP? хорошая книжка или есть что-то другое более признаное?\n", 382 | "octocat: rvnikita: хорошая, но заточена под NLTK.\n", 383 | "elwis: это главная книга по nltk, есть в электронном виде: \n", 384 | "aledovsky: Одной из лучших книг по nlp на мой взгляд является Martin Jurafsky - Speеch and Language Processing. Это большая книжка, но из неё можно независимо читать главы. Я бы предложил почитать несколько вводных плюс главы про прикладные задачи. Есть второе издание, которое нетрудно нагуглить в pdf и драфт третьего, который на сайте автора . Третье издание похоже в базовых главах и сильно отличается в описании прикладных задач.\n", 385 | "buzzword_miner: Обработка неструктурированны текстов \n", 386 | "Поиск. Организация и манипулирование\n", 387 | "buzzword_miner: \n", 388 | "\n", 389 | "------------(nlp)--------------\n", 390 | "elwis: была похожая проблема с bigartm, решил установив --threads 1. А что такое -j не подскажете? это то же самое?\n", 391 | "khansuleyman: -j - то же, что и --jobs. Одновременное выполнение указанного количества команд\n", 392 | "ryazanoff: Кто тыкал уже? Там проблемы с 3 питоном\n", 393 | "angriff07: так там и не заявляется работа с питоном 3... 
в readme написано, что python2.7\n", 394 | "\n", 395 | "------------(nlp)--------------\n", 396 | "elwis: это главная книга по nltk, есть в электронном виде: \n", 397 | "dimakarp1996: =\n", 398 | "i: \n", 399 | "i: m.yurushkin: тоже вае на текстах хочу. а у тебя какой датасет?\n", 400 | "\n", 401 | "------------(nlp)--------------\n", 402 | "0x1337: Корректно ли считать схожесть текстов косинусным расстоянием, если вектора – это не OHE представление, а tf-idf веса?\n", 403 | "ololo: да, так обычно и делают, если я правильно вопрос понял\n", 404 | "alex.ozerin: Да, косинус между суммами ohe будет глупым булевым поиском. Tfidf -- разумный вариант взвешивания\n", 405 | "mrukhlov: а что за ohe?\n", 406 | "alex.ozerin: One hot encoding\n", 407 | "mrukhlov: спасибо\n", 408 | "elwis: Если tf-idf вектора нормализованные, можно вместо косинусной близости считать скалярное произведение\n", 409 | "amir: а зачем вообще использовать tf-idf, если есть w2v и даже более совершенные модели эмбеддингов?\n", 410 | "ololo: потому что tf-idf в некоторых случаях лучше работает, например для IR\n", 411 | "0x1337: у меня линейный свм порвал как тузик грелку Xgboost на ворд2век + bigARTM. \n", 412 | "amir: Не знал, что такое тоже может быть. А какие тексты используются?\n", 413 | "0x1337: новостные статьи на русском. \n", 414 | "evgeny: <@U4E1EF5CZ> а какие более совершенные модели эмбеддингов ты имеешь в виду?\n", 415 | "elwis: <@U3PETUSSE> а как ты соединил ворд2век и BigARTM если не секрет?\n", 416 | "0x1337: <@U443HBJ8L> Для документов считаешь распределение топиков, вот и новые фичи. \n", 417 | "elwis: ясно, а я сначала подумал что ты вектора слов как-то в BigARTM сумел запихнуть как токены)\n", 418 | "amir: <@U0D8KLBFV> google swivel, fasttext\n", 419 | "\n", 420 | "------------(nlp)--------------\n", 421 | "wingrime: Господа, nltk умеет стиминг русский?\n", 422 | "novitoll: Судя по тому, что я скачал все данные от nltk `import nltk;nltk.download('all')`, то тут только для en-US. Но можно проверить в директорий, куда все данные скачались. На Linux - это по дефолту хранится в `/home/user/nltk_data/stemmers/`. Тут есть только `porter_test` для инглиша. \n", 423 | "Думаю, для русского языка можно использовать `pymorphy2`\n", 424 | "dselivanov: когда я послдений раз смотрел стемминг делался через такую жопу, что после этого я вообще nltk перестал воспринимать за библиотеку. Он транслитерировал русский в английский, потом делал стемминг, затем конвертировал обратно\n", 425 | "dselivanov: такой вот пиздец\n", 426 | "gleberof: С русским языком у pymorphy2 тоже не все идеально. \"Открытие банка\" -> \"открыть\", \"банка\". Понятно конечно почему. 
Но пока pymorphy2 лучшее что есть для стемминга\n", 427 | "wingrime: Спсб\n", 428 | "wingrime: А что умеет сейчас нормализацию?\n", 429 | "alexeyev: <@U50GC05J7> лемматизацию, в смысле?\n", 430 | "wingrime: В смысле аналогично заменам We'll -> we will\n", 431 | "wingrime: Раскрытие сокращений\n", 432 | "wingrime: Синонимов\n", 433 | "elwis: \n", 434 | "elwis: если надо просто отрезать окончания, то вот это подойдет\n", 435 | "stajilov: проверял недавно, не умеет нормально, лемматизацию может делать типо но нужен русский wordnet нормальный\n", 436 | "windj007: если лицензия позволяет, то можно ещё\n", 437 | "\n", 438 | "0x1337: +1, mystem пушка.\n" 439 | ] 440 | } 441 | ], 442 | "source": [ 443 | "res = loop.run_until_complete(search_user(\"generall\"))['hits']['hits']\n", 444 | "for hit in res:\n", 445 | " print(\"\\n------------({})--------------\".format(hit['_source']['channel']))\n", 446 | " for msg in hit['_source']['messages']:\n", 447 | " print(\"{}: {}\".format(msg['user_real_name'], msg['text']))" 448 | ] 449 | } 450 | ], 451 | "metadata": { 452 | "kernelspec": { 453 | "display_name": "Python [default]", 454 | "language": "python", 455 | "name": "python3" 456 | }, 457 | "language_info": { 458 | "codemirror_mode": { 459 | "name": "ipython", 460 | "version": 3 461 | }, 462 | "file_extension": ".py", 463 | "mimetype": "text/x-python", 464 | "name": "python", 465 | "nbconvert_exporter": "python", 466 | "pygments_lexer": "ipython3", 467 | "version": "3.5.2" 468 | } 469 | }, 470 | "nbformat": 4, 471 | "nbformat_minor": 2 472 | } 473 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/chunk.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | import os 6 | import datetime 7 | import time 8 | 9 | import itertools 10 | 11 | from slack_data_loader import SlackLoader 12 | 13 | SECS_IN_DAY = 60 * 60 * 24 14 | 15 | # this class wrap time by daily activity distribution 16 | class TimeDistance: 17 | 18 | def calc_dist(self, times): 19 | day_times = times % SECS_IN_DAY 20 | hist, ranges = np.histogram(day_times, range=(0, SECS_IN_DAY), bins=self.bins) 21 | total_count = hist.sum() 22 | normalized_hist = hist / self.total_count 23 | ranges = ranges.astype(int)[:-1] 24 | dist = dict(zip(ranges, normalized_hist)) 25 | mean = normalized_hist.mean() 26 | return (mean, dist) 27 | 28 | 29 | def get_time_range(self, ts): 30 | dt = datetime.datetime.fromtimestamp(ts) 31 | return str(dt.year) + str(int(dt.month / 6)) 32 | 33 | def init_distribution(self, times): 34 | # split by years somehow 35 | self.bins = 100 36 | self.time_step = int(SECS_IN_DAY / self.bins) 37 | self.total_count = times.size 38 | 39 | datetimes = map(self.get_time_range, times) 40 | zp = zip(datetimes, times) 41 | grps = itertools.groupby(zp, key=lambda x: x[0]) 42 | 43 | time_groups = list([ (k, np.array([y for x,y in g]) ) for k,g in grps]) 44 | 45 | meanes = [] 46 | dists = [] 47 | for key, group_times in time_groups: 48 | mean, dist = self.calc_dist(group_times) 49 | meanes.append( (key, mean) ) 50 | dists.append( (key, dist) ) 51 | 52 | self.dist = dict(dists) 53 | self.mean = dict(meanes) 54 | 55 | return self 56 | 57 | def get_range_start(self, ts): 58 | return int(ts % SECS_IN_DAY / self.time_step) * self.time_step 59 | 60 | def get_dist(self, ts): 61 | curr_range = self.get_range_start(ts) 62 | return self.dist[self.get_time_range(ts)][curr_range] 63 | 
64 | def get_mean(self, ts): 65 | return self.mean[self.get_time_range(ts)] 66 | 67 | def distance(self, ts1, ts2): 68 | max_ts = max(ts1, ts2) 69 | min_ts = min(ts1, ts2) 70 | curr = min_ts 71 | dist = 0.0 72 | diff = max_ts - min_ts 73 | if diff > SECS_IN_DAY: 74 | secs = int(diff / SECS_IN_DAY) * SECS_IN_DAY 75 | dist += secs * self.get_mean(curr) 76 | curr += secs 77 | while curr < max_ts: 78 | time_to_next_range = self.time_step - curr % self.time_step 79 | time_to_end = max_ts - curr 80 | min_time = min(time_to_end, time_to_next_range) 81 | curr += min_time 82 | dist += self.get_dist(curr) * min_time 83 | return dist 84 | 85 | class Chunker: 86 | def split_by_threshold(self, difs, threshold): 87 | res = [] 88 | start = 0 89 | curr = difs 90 | while len(curr) > 0: 91 | group_len = len(list(itertools.takewhile(lambda x: x < threshold, curr))) 92 | res.append(range(start, start + group_len + 1)) 93 | curr = curr[group_len + 1:] 94 | start = start + group_len + 1 95 | return res 96 | 97 | def cluster_time_series(self, timeObj, times, threshold = 100.0): 98 | time_difs = np.zeros(times.size - 1) 99 | for i in range(0, times.size - 2): 100 | time_difs[i] = timeObj.distance(times[i], times[i + 1]) 101 | chunks = self.split_by_threshold(time_difs, threshold) 102 | return chunks 103 | 104 | def merge_with_threads(self, chunks, threads): 105 | for thread in threads: 106 | chunks = list(filter(lambda x: not (x[0] <= thread[0] <= x[-1] or x[0] <= thread[-1] <= x[-1]), chunks)) 107 | chunks += threads 108 | return sorted(chunks, key=lambda x: x[0]) 109 | 110 | def get_groups(self, data, threshold = 30): 111 | times = np.array(list(map(lambda x: x['ts'], data.messages))) 112 | timeObj = TimeDistance().init_distribution(times) 113 | chunks = self.cluster_time_series(timeObj, times, threshold=threshold) 114 | chunk_lengthes = np.array(list(map(len, chunks))) 115 | threads = data.find_threads() 116 | chunks = self.merge_with_threads(chunks, threads) 117 | for chunk in chunks: 118 | yield [ data.messages[i] for i in chunk ] 119 | 120 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | elasticsearch1: 4 | image: elasticsearch 5 | container_name: elasticsearch1 6 | environment: 7 | - cluster.name=docker-cluster 8 | - bootstrap.memory_lock=true 9 | - "ES_JAVA_OPTS=-Xms1g -Xmx1g -Djava.security.policy=file:///usr/share/config_data/java_policy" 10 | ulimits: 11 | memlock: 12 | soft: -1 13 | hard: -1 14 | nofile: 15 | soft: 262144 16 | hard: 262144 17 | mem_limit: 1g 18 | cap_add: 19 | - IPC_LOCK 20 | volumes: 21 | - ./search_data:/usr/share/elasticsearch/data 22 | - ./help_data:/usr/share/config_data 23 | ports: 24 | - 9200:9200 25 | - 9300:9300 26 | network_mode: "host" 27 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/event-parser.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 102, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from urllib.parse import urlparse\n", 12 | "import requests\n", 13 | "import scrapy" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 92, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "def 
postprocess_event_json(event_json):\n", 25 | " for k, v in event_json.items():\n", 26 | " if isinstance(v, list):\n", 27 | " v = '\\n'.join(v)\n", 28 | " v = v.replace('\\xa0', ' ').replace('\\u200b', '')\n", 29 | " event_json[k] = v.strip()\n", 30 | " return event_json\n", 31 | "\n", 32 | "def get_event_json(url):\n", 33 | " url_parts = urlparse(url)\n", 34 | " host = url_parts.netloc\n", 35 | " page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'})\n", 36 | " dom = scrapy.Selector(text=page.content)\n", 37 | " if host == 'www.facebook.com':\n", 38 | " title = dom.css('#seo_h1_tag ::text').extract_first()\n", 39 | " datetime = dom.css('#event_summary').extract()\n", 40 | " json = {\n", 41 | " 'title': title,\n", 42 | " 'datatime': 'unheard',\n", 43 | " 'location': 'na kudykinoy gore',\n", 44 | " 'source': 'facebook'\n", 45 | " }\n", 46 | " if host == 'events.yandex.ru':\n", 47 | " json = {\n", 48 | " 'title': dom.css('h2.title ::text').extract_first(),\n", 49 | " 'datetime': dom.css('.event-header__when ::text').extract_first(),\n", 50 | " 'location': (dom.css('.event-header__place ::text').extract_first() or 'Unknown City') + ' Яндекс',\n", 51 | " 'source': url,\n", 52 | " 'decription': dom.css('.b-static-text ::text').extract()\n", 53 | " }\n", 54 | " if host == 'www.meetup.com':\n", 55 | " json = {\n", 56 | " 'title': dom.css('.pageHead-headline ::text').extract_first(),\n", 57 | " 'datetime': ' '.join([s.strip() for s in dom.css('.eventTimeDisplay time ::text').extract() if s != ' ']),\n", 58 | " 'location': ' '.join(dom.css('.venueDisplay ::text').extract()),\n", 59 | " 'source': url,\n", 60 | " 'decription': dom.css('.event-description ::text').extract()\n", 61 | " }\n", 62 | " if host.endswith('timepad.ru'):\n", 63 | " json = {\n", 64 | " 'title': dom.css('.ep-3-hero__subtitle ::text').extract_first().strip(),\n", 65 | " 'datetime': dom.css('.ep3-pagesummary__time-begin span ::text').extract_first(),\n", 66 | " 'location': dom.css('.ep3-pagesummary__place-city ::text').extract_first().strip() + ', ' + dom.css('.ep3-pagesummary__place-adress span ::text').extract_first().strip(),\n", 67 | " 'source': url,\n", 68 | " 'description': dom.css('.ep3-content .clearfix p ::text').extract()\n", 69 | " }\n", 70 | " \n", 71 | " json = postprocess_event_json(json)\n", 72 | " return json" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 103, 78 | "metadata": { 79 | "scrolled": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "url: https://events.yandex.ru/events/yac/29-may-2018/\n", 87 | "datetime: OK\n", 88 | "description: SKIP\n", 89 | "location: OK\n", 90 | "source: OK\n", 91 | "title: OK\n", 92 | "\n", 93 | "url: https://www.meetup.com/PyData-Moscow/events/240661336/\n", 94 | "title: OK\n", 95 | "datetime: OK\n", 96 | "location: OK\n", 97 | "source: OK\n", 98 | "description: SKIP\n", 99 | "\n", 100 | "url: https://sdsj.timepad.ru/event/603431/\n", 101 | "title: OK\n", 102 | "datetime: OK\n", 103 | "location: OK\n", 104 | "source: OK\n", 105 | "description: SKIP\n", 106 | "\n", 107 | "url: https://www.facebook.com/events/1727074767621344/\n", 108 | "\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "def get_events_markup():\n", 114 | " events_markup = {\n", 115 | " \"https://events.yandex.ru/events/yac/29-may-2018/\": {\n", 116 | " 'datetime': '29 мая, 08:30',\n", 117 | " 'description': '',\n", 118 | " 'location': 'Unknown City 
Яндекс',\n", 119 | " 'source': 'https://events.yandex.ru/events/yac/29-may-2018/',\n", 120 | " 'title': 'Yet another Conference 2018'\n", 121 | " },\n", 122 | " \"https://www.meetup.com/PyData-Moscow/events/240661336/\": {\n", 123 | " 'title': 'Третий PyData Meetup',\n", 124 | " 'datetime': 'Friday, June 23, 2017 6:30 PM to 9:30 PM',\n", 125 | " 'location': 'Yandex ул. Льва Толстого, 16 · Moscow',\n", 126 | " 'source': 'https://www.meetup.com/PyData-Moscow/events/240661336/',\n", 127 | " 'description': '',\n", 128 | " },\n", 129 | " \"https://sdsj.timepad.ru/event/603431/\": {\n", 130 | " \"title\": \"Sberbank Data Science Day 2017\",\n", 131 | " \"datetime\": \"11 ноября 2017 c 9:30 до 22:00\",\n", 132 | " \"location\": \"Москва, ш. Энтузиастов, 5\",\n", 133 | " \"source\": \"https://sdsj.timepad.ru/event/603431/\",\n", 134 | " \"description\": ''\n", 135 | " },\n", 136 | " \"https://www.facebook.com/events/1727074767621344/\": {\n", 137 | "\n", 138 | " }\n", 139 | " }\n", 140 | " return events_markup\n", 141 | "\n", 142 | "def test_get_event_json():\n", 143 | " events_markup = get_events_markup()\n", 144 | " \n", 145 | " for url, markup_dict in events_markup.items():\n", 146 | " print('url: %s' % url)\n", 147 | " event_dict = get_event_json(url)\n", 148 | " for k, markup_v in markup_dict.items():\n", 149 | " print('%s: ' % k, end='')\n", 150 | " event_v = event_dict.get(k, 'NONE')\n", 151 | " if event_v == markup_v:\n", 152 | " print('OK')\n", 153 | " elif k in ('description'):\n", 154 | " print('SKIP')\n", 155 | " else:\n", 156 | " print('ERROR:\\n%s\\n----- should be -----\\n%s' % (event_v, markup_v))\n", 157 | " print()\n", 158 | "\n", 159 | "test_get_event_json()" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.6.4" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/fact_extraction_with_mystem.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import json\n", 12 | "import glob\n", 13 | "import os\n", 14 | "import datetime\n", 15 | "import re\n", 16 | "import pandas as pd\n", 17 | "from slack_data_loader import SlackLoader\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "data_folder = '/Users/alex/Documents/ODS/opendatascience Slack export May 20 2017/'\n", 29 | "ods = SlackLoader(data_folder,only_channels=('welcome',))" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "type\n", 41 | "message 3992\n", 42 | "Name: dt, dtype: int64" 43 | ] 44 | }, 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "df_msg = pd.DataFrame.from_records(ods.messages)\n", 52 | 
"df_msg.fillna(0).groupby(['type'])['dt'].count()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 6, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "как пример хорошего :smiley:\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "print(df_msg.text[4])" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Вытащим сообщения-представления" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "df_msg.head()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "k = 0\n", 97 | "hi_messages = []\n", 98 | "hi_tokens=('все', 'привет','добр','шалом','салют','здрав','хай','я','ребят','коллег')\n", 99 | "for _, i in df_msg.iterrows():\n", 100 | " if i.text.lower().startswith(hi_tokens) and len(i.text) > 100:\n", 101 | " k+=1\n", 102 | " hi_messages.append(i.text)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 1, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "for i in hi_messages[-10:]:\n", 112 | " print(i)\n", 113 | " print('='*80)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 11, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stderr", 123 | "output_type": "stream", 124 | "text": [ 125 | "Installing mystem to /Users/alex/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.0-macosx10.8.tar.gz\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "from pymystem3 import Mystem\n", 131 | "mystem = Mystem()\n", 132 | "# Installing mystem to /home/dmchk/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz\n", 133 | "# Экземпляр класса Mystem предоставляет метод lemmatize, вызывающий mystem с соответствующими параметрами." 
134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 12, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "как насчет небольшой стемминг\n", 146 | "\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "text = 'Как насчёт небольшого стемминга'\n", 152 | "lemmas = mystem.lemmatize(text)\n", 153 | "print(''.join(lemmas))" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 16, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "from random import choice" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 32, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "import matplotlib.pyplot as plt" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 68, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "from IPython.display import HTML" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 37, 192 | "metadata": { 193 | "collapsed": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "import numpy as np" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 75, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "def get_n_colors_on_colormap(n):\n", 209 | " cmap = plt.cm.get_cmap('jet')\n", 210 | " # cmap = matplotlib.cm.get_cmap('Spectral')\n", 211 | " rg = np.linspace(0.3,1.0,n)\n", 212 | " cols = np.asarray(np.floor(255*cmap(rg)),dtype=int)\n", 213 | " get_color_hash = lambda x: '#%02x%02x%02x' % tuple(x.tolist())\n", 214 | " return list(map(get_color_hash, cols[:,:3]))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 87, 220 | "metadata": { 221 | "collapsed": true 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "tags = ('S','SPRO','V','ADVPRO','A','PR','ADV','APRO','CONJ', 'NUM', 'ANUM', 'PART')\n", 226 | "tags_to_color_mapping = dict(zip(tags, get_n_colors_on_colormap(len(tags))))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 92, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "def plot_seq_with_tags(seq):\n", 247 | " tmplt = '

<div>{}</div>

'\n", 248 | " span_tmlpt = \"\"\"{content}\"\"\"\n", 249 | " fulltext = ''\n", 250 | " for _word, _pos in seq:\n", 251 | " if _pos is not None:\n", 252 | " fulltext += span_tmlpt.format(color=tags_to_color_mapping[_pos], content=_word)\n", 253 | " else:\n", 254 | " fulltext += _word\n", 255 | " return HTML(tmplt.format(fulltext))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 79, 261 | "metadata": { 262 | "collapsed": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "def get_tagging_for_text(rando):\n", 267 | " analysis = mystem.analyze(rando)\n", 268 | " seq = []\n", 269 | " for i in analysis:\n", 270 | " if 'analysis' in i and i['analysis']:\n", 271 | " info = i['analysis'][0]['gr']\n", 272 | " pos = info.split(',')[0]\n", 273 | " if '=' in pos:\n", 274 | " pos = pos.split('=')[0]\n", 275 | " seq.append((i['text'],pos))\n", 276 | " else:\n", 277 | " seq.append((i['text'],None))\n", 278 | " return seq" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 96, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "

S SPRO V ADVPRO A PR ADV APRO CONJ NUM ANUM PART (stripped HTML output: each POS tag rendered as a colored span)

" 290 | ], 291 | "text/plain": [ 292 | "" 293 | ] 294 | }, 295 | "execution_count": 96, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "plot_seq_with_tags(list(zip(tags,tags)))" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 94, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/html": [ 312 | "

Всем привет,\n", 313 | "Меня зовут Александр (Саша, Алекс - как хотите :slightly_smiling_face:, я из Санкт-Петербурга. Я закончил факультет Политологии МГИМО; понял, что не хватает мне в жизни математики и количественных методов, и поступил на магистерскую программу по Международным Отношениям в Университете Калифорнии (University of California, San Diego) с фокусом на экономическое развитие (economic development), статистический анализ эконометрику. Там работал со Stata, в основном в сфере регрессионного анализа (Time-Series ARIMA models, Impact Evaluation, Randomized Controlled Trials, Regression Discontinuity Design, Propensity-Score Matching etc.). Там же прошел один курс по Big Data Analytics, научился юзать R и работать с текстом. \n", 314 | "Сейчас работаю в должности Data Analyst в стартапе в Нью-Йорке: в основном работаю с текстом из социальных медиа, использую дорогой и полюбившейся мне Python. Иногда есть проектики по Social Network Analysis и Supervised Learning Algorithms. На стороне стараюсь наверстать пробелы в знаниях в Data Science c помощью онлайн курсов (Coursera, Udacity, Udemy, edX, DataCamp) или конференций (ездил на Open Data Science Conference в Бостоне в прошлом месяце). Планирую активнее участвовать в соревнованиях Kaggle.\n", 315 | "Определенно не хватает знаний в сфере Матанализа, Линейной алгебры и computer science, но здесь прибегаю к помощи всезнающей <@U13C9QU9Z>.\n", 316 | "Всегда рад новым знакомствам in the Data Science world. Думаю, смогу здесь (slack) многому научиться, но всегда готов помочь советом (чем смогу - помогу!)\n", 317 | "В общем, пишите - не стесняйтесь :wink:\n", 318 | "

" 319 | ], 320 | "text/plain": [ 321 | "" 322 | ] 323 | }, 324 | "execution_count": 94, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "rando = choice(hi_messages)\n", 331 | "seq = get_tagging_for_text(rando)\n", 332 | "plot_seq_with_tags(seq)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 97, 338 | "metadata": { 339 | "collapsed": true 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "from operator import itemgetter" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 100, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "Counter({'SPRO': 8,\n", 355 | " None: 290,\n", 356 | " 'S': 47,\n", 357 | " 'V': 31,\n", 358 | " 'ADVPRO': 7,\n", 359 | " 'PR': 29,\n", 360 | " 'CONJ': 10,\n", 361 | " 'PART': 4,\n", 362 | " 'A': 15,\n", 363 | " 'ANUM': 1,\n", 364 | " 'ADV': 5})" 365 | ] 366 | }, 367 | "execution_count": 100, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "Counter(map(itemgetter(1), seq))" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "collapsed": true 399 | }, 400 | "outputs": [], 401 | "source": [] 402 | } 403 | ], 404 | "metadata": { 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "language": "python", 408 | "name": "python3" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 3 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython3", 420 | "version": "3.6.1" 421 | } 422 | }, 423 | "nbformat": 4, 424 | "nbformat_minor": 2 425 | } 426 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/config.proto: -------------------------------------------------------------------------------- 1 | encoding "utf8"; 2 | 3 | TTextMinerConfig { 4 | Dictionary = "mydic.gzt"; 5 | 6 | PrettyOutput = "PrettyOutput.html"; 7 | 8 | Input = {Dir = "messages"} 9 | 10 | //Output = {File = "output" 11 | // Format = text} 12 | 13 | Output = { 14 | File = "facts.xml"; 15 | Format = xml; 16 | //append = 1; 17 | } 18 | 19 | Articles = [ 20 | { Name = "имя" }, 21 | { Name = "курсы" }, 22 | { Name = "образование" }, 23 | { Name = "работа" }, 24 | { Name = "интерес" } 25 | ] 26 | 27 | Facts = [ 28 | { Name = "Name" }, 29 | { Name = "Course" }, 30 | { Name = "Education" }, 31 | { Name = "Job" }, 32 | { Name = "Interest" } 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/courses.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | CourseW -> "курс" | "специализация"; 4 | CourseShort -> "к" | "спец"; 5 | 6 | CourseDescr -> CourseW | CourseShort; 7 | 8 | CourseNameNoun -> (Adj) (Word) Word (Word) (Word); 9 | 10 | Course -> CourseDescr CourseNameNoun interp (Course.CourseName); 11 | 12 | 
-------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/education.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | EduW -> "учиться" | "закончить"; 4 | 5 | EduNameNoun -> (Adj) (Word) Word (Word) (Word) ; 6 | 7 | Edu -> EduW EduNameNoun interp (Education.Name); 8 | 9 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/fact_types.proto: -------------------------------------------------------------------------------- 1 | import "base.proto"; 2 | import "facttypes_base.proto"; 3 | 4 | message Name: NFactType.TFact 5 | { 6 | required string Name = 1; 7 | } 8 | 9 | message Course: NFactType.TFact 10 | { 11 | required string CourseName = 1; 12 | } 13 | 14 | message Education: NFactType.TFact 15 | { 16 | required string Name = 1; 17 | } 18 | 19 | message Job: NFactType.TFact 20 | { 21 | required string Name = 1; 22 | } 23 | 24 | message Interest: NFactType.TFact 25 | { 26 | required string Name = 1; 27 | } -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/interest.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | InterestW -> "интерес" | "интересоваться"; 4 | 5 | InterestTitle -> (Word) Word ; 6 | 7 | Interest -> InterestW InterestTitle interp (Interest.Name); 8 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/job.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | JobW -> "работать" | "заниматься" | "я"; 4 | 5 | JobTitle -> (Word) (Word) (Word) Noun (Word) (Word); 6 | 7 | Job -> JobW JobTitle interp (Job.Name); 8 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/mydic.gzt: -------------------------------------------------------------------------------- 1 | encoding "utf8"; 2 | 3 | import "base.proto"; 4 | import "articles_base.proto"; 5 | import "fact_types.proto"; 6 | 7 | TAuxDicArticle "имя" 8 | { 9 | key = { "tomita:name.cxx" type=CUSTOM } 10 | } 11 | 12 | 13 | TAuxDicArticle "курсы" 14 | { 15 | key = { "tomita:courses.cxx" type=CUSTOM } 16 | } 17 | 18 | TAuxDicArticle "образование" 19 | { 20 | key = { "tomita:education.cxx" type=CUSTOM } 21 | } 22 | 23 | TAuxDicArticle "работа" 24 | { 25 | key = { "tomita:job.cxx" type=CUSTOM } 26 | } 27 | 28 | TAuxDicArticle "интерес" 29 | { 30 | key = { "tomita:interest.cxx" type=CUSTOM } 31 | } -------------------------------------------------------------------------------- /hackathon_1_may_2017/for_tomita/name.cxx: -------------------------------------------------------------------------------- 1 | #encoding "utf-8" 2 | 3 | NameNoun -> (Word) Word (Word); 4 | 5 | S -> NameNoun interp (Name.Name); 6 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/help_data/java_policy: -------------------------------------------------------------------------------- 1 | grant { 2 | permission java.io.FilePermission "/usr/share/config_data/synonyms.txt", "read,write"; 3 | }; -------------------------------------------------------------------------------- /hackathon_1_may_2017/help_data/synonyms.txt: -------------------------------------------------------------------------------- 1 | xboost, эксгебуст, эксбуст, 
иксгебуст, xgboost 2 | пыха, пыху, пых, php 3 | lol, лол 4 | питон, python 5 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/key_words.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import os 4 | import datetime 5 | import re 6 | import pymorphy2 7 | import pandas as pd 8 | from nltk import word_tokenize 9 | 10 | morph = pymorphy2.MorphAnalyzer() 11 | 12 | 13 | #from slack_export import SlackExport, normalize_links 14 | from Introspect_hackathon.slack_data_loader import SlackLoader 15 | 16 | 17 | def start(): 18 | # data_folder = ‘/Users/alex/Documents/ODS/oct_4_2016_dump’ 19 | data_folder = "ODS_dump_Mar_10_2017" 20 | 21 | #ods = SlackExport(data_folder) 22 | ods = SlackLoader(data_folder, exclude_channels=["_random_flood", "career"]) 23 | 24 | df_msg = pd.DataFrame.from_records(ods.messages) 25 | 26 | return df_msg 27 | 28 | 29 | def cleanUsernames(str): 30 | re.sub(r"<@", "") 31 | 32 | 33 | def lemm(st): 34 | if st == '': 35 | return '' 36 | else: 37 | return morph.parse(st)[0].normal_form 38 | 39 | 40 | 41 | from stop_words import get_stop_words 42 | from string import punctuation 43 | 44 | punct = set(punctuation) 45 | punct.add(' > ') 46 | punct.add(' < ') 47 | 48 | stop_words = set(get_stop_words('ru')) 49 | 50 | print(':' in punct) 51 | 52 | def pars(text): 53 | target = [] 54 | print(len(text)) 55 | count = 0 56 | for supidx, txt in enumerate(text): 57 | if supidx == 100: 58 | break 59 | # print(txt) 60 | count += 1 61 | for line in txt.split('\n'): 62 | # if len(grade.findall(line)) != len([l for l in line]): 63 | snt = re.split("\.+ |, | ! | \? | \( |\) | - ", line) 64 | bigram = [] 65 | words = [] 66 | trigram = [] 67 | for s in snt: 68 | spl = s.split(' ') 69 | if len(s) > 1: 70 | for i in range(0, (len(spl) - 1)): 71 | if ((spl[i] not in stop_words) and (spl[i + 1] not in stop_words) 72 | and spl[i].isdigit() == False and spl[i + 1].isdigit() == False 73 | and (spl[i + 1] not in punct) and (spl[i] not in punct)): 74 | bigram.append(str(lemm(spl[i])) + ' ' + str(lemm(spl[i+1]))) 75 | 76 | for i in range(0, (len(spl) - 2)): 77 | if ((spl[i] not in stop_words) and (spl[i + 1] not in stop_words) 78 | and (spl[i + 2] not in stop_words) 79 | and spl[i].isdigit() == False and spl[i + 1].isdigit() == False 80 | and spl[i + 2].isdigit() == False 81 | and (spl[i + 1] not in punct) and (spl[i] not in punct) 82 | and (spl[i + 2] not in punct)): 83 | trigram.append(str(lemm(spl[i])) + ' ' + str(lemm(spl[i + 1])) + ' '+str(lemm(spl[i+2]))) 84 | 85 | for i in range(0, (len(spl))): 86 | if (spl[i] not in stop_words) and spl[i].isdigit() == False: 87 | words.append(str(lemm(spl[i]))) 88 | trg = bigram + words + trigram 89 | target.append(trg) 90 | #print(count, trg) 91 | return target 92 | 93 | def clean(matrix): 94 | return pars(matrix['text']) 95 | 96 | 97 | def loadCommonLang(datapath="corpus_freq_dict.csv"): 98 | fCorpus = open(datapath, encoding="UTF-8") 99 | lines = fCorpus.readlines() 100 | 101 | vocabulary = {} 102 | for i, line in enumerate(lines): 103 | if line != "\n": 104 | sample = re.sub("\n", "", line) 105 | sample = sample.split(",") 106 | vocabulary[sample[0]] = int(sample[1]) 107 | 108 | return vocabulary 109 | 110 | 111 | def countAllWordsVocab(vocabulary): 112 | cnt = 0 113 | for word in vocabulary: 114 | cnt += vocabulary[word] 115 | 116 | return cnt 117 | 118 | 119 | def strange(m, m1): # для 2 массивов слов 120 | f = len(m) 121 | f1 = 
countAllWordsVocab(m1) 122 | 123 | mass = [] 124 | for word in set(m): 125 | if word in m1: 126 | res = round((m.count(word) / f) / (m1[word] / f1), 4) 127 | mass.append((word, res)) 128 | else: 129 | mass.append((word, 75.)) 130 | return mass 131 | 132 | 133 | def oneList(text): 134 | res = [] 135 | for i, sentence in enumerate(text): 136 | res += sentence 137 | 138 | return res 139 | 140 | 141 | import pymysql.cursors, re 142 | 143 | 144 | def getMySQLData(limit=1000000, sql="SELECT ttext FROM %s LIMIT %s"): 145 | connection = pymysql.connect(host='localhost', user='root', password='root', db='sys', charset='utf8mb4', 146 | cursorclass=pymysql.cursors.DictCursor) 147 | 148 | try: 149 | with connection.cursor() as cursor: 150 | #sql = "SELECT ttext FROM %s LIMIT %s" 151 | cursor.execute(sql, limit) 152 | 153 | result = cursor.fetchall() 154 | data = [] 155 | for i, item in enumerate(result): 156 | try: 157 | if item['ttext'] != None: 158 | data.append(re.sub("\n", " ", item['ttext'])) 159 | except Exception: 160 | print("%d %s" % (i, item)) 161 | 162 | return data 163 | finally: 164 | connection.close() 165 | 166 | 167 | def loadTwitterDict(): 168 | from collections import Counter 169 | # limit = 1000000 170 | limit = 111000 171 | negdata = getMySQLData(limit, "SELECT ttext FROM `sortneg` LIMIT %s") 172 | posdata = getMySQLData(limit, "SELECT ttext FROM `sortpos` LIMIT %s") 173 | limit = 1000000 174 | neutraldata = getMySQLData(limit, "SELECT ttext FROM `sentiment` LIMIT %s") 175 | 176 | data = negdata + posdata + neutraldata 177 | 178 | 179 | #sentences = pars(data) 180 | sentences = [] 181 | for i, text in enumerate(data): 182 | sentences += re.split("\.+ |, | ! | \? | \( |\) | - ", text) 183 | 184 | words = [] 185 | for i, sentence in enumerate(sentences): 186 | words += word_tokenize(sentence) 187 | 188 | #words = [] 189 | #for i, sentence in enumerate(sentences): 190 | # words += sentence 191 | 192 | from nltk import collections 193 | counts = dict(Counter(words)) 194 | return counts 195 | 196 | 197 | def wordsChoose(dic, barrier=5.): 198 | res = [] 199 | for i, word in enumerate(dic): 200 | if word[1] > barrier: 201 | res.append(word) 202 | return res 203 | 204 | 205 | def writeData(data, datapath='strange.csv'): 206 | import csv 207 | with open(datapath, "w+", encoding="UTF-8") as f: 208 | a = csv.writer(f) 209 | for i, word in enumerate(data): 210 | a.writerow(word) 211 | 212 | 213 | if __name__ == "__main__": 214 | counts = loadTwitterDict() 215 | #vocab = loadCommonLang() 216 | df_msg = start() 217 | lemmatized = clean(df_msg) 218 | onelst = oneList(lemmatized) 219 | strangeness = strange(onelst, counts) 220 | ranged_strange_words = wordsChoose(strangeness) 221 | writeData(ranged_strange_words, datapath='strange.csv') 222 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/predict_channel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | import codecs 4 | from collections import defaultdict 5 | 6 | import pymorphy2 7 | from sklearn.metrics import confusion_matrix 8 | from sklearn.model_selection import train_test_split 9 | import stop_words 10 | 11 | from tokenizer import tokenize, preprocessing 12 | from slack_data_loader import SlackLoader 13 | 14 | 15 | morph = pymorphy2.MorphAnalyzer() 16 | 17 | QUESTION_WORD_LEMMAS = ("как", "как-то", "какой", "какой-то", "зачем", "почему", "когда", "кто", "где", 18 | "когда", 
"куда", "куда-то", "чот") 19 | QUESTION_WORDS = ("подскажите", "посоветуйте", "дайте", "киньте", "кинте") 20 | STOP_WORDS = stop_words.get_stop_words("russian") 21 | PUNCTUATION = ['.', ',', ';', ':', '!', '?', '-', '<', '>', '(', ')', '<-', '`', '::', '//', '/', '>:', 22 | '{', '}', '--', '(<', '\\', '}]', ']', '[', '))', '>>', '..', '...', '==', '```', '#', 23 | '~', '"', '%)', ';<', '|', '!!', 'slightly_smiling_face', 'simple_smile', 'http', 24 | 'https', ':/', 'smile', 'www.', 'com', 'ru', 'org', 'ru.', "'"] 25 | 26 | 27 | def is_question(tokens): 28 | is_q = False 29 | for i, t in enumerate(tokens): 30 | if t in QUESTION_WORDS: 31 | is_q = True 32 | t = morph.parse(t)[0].normal_form 33 | tokens[i] = t 34 | if t in QUESTION_WORD_LEMMAS: 35 | is_q = True 36 | return is_q 37 | 38 | 39 | def prepare_data(): 40 | print('Loading data...') 41 | loader = SlackLoader('opendatascience Slack export May 20 2017', is_sorted=False, 42 | only_channels=['nlp', 'deep_learning', 'datasets', 'sequences_series', 'bayesian', '_meetings', 43 | 'edu_academy', 'edu_books', 'visualization', 44 | 'hardware', 'reinforcement_learnin', 'theory_and_practice']) 45 | 46 | print('Converting data...') 47 | channel_messages = [] 48 | previous_channel = '' 49 | label_id = 0 50 | with codecs.open('vw_data_train.vw', 'w', encoding='utf8') as vw_train: 51 | with codecs.open('vw_data_test.vw', 'w', encoding='utf8') as vw_test: 52 | for m in loader.messages: 53 | tokens = [t for t in tokenize(preprocessing(m['text'])) 54 | if t not in PUNCTUATION and not t.startswith('@')] 55 | # take only questions 56 | if is_question(tokens): 57 | if previous_channel != m['channel']: 58 | previous_channel = m['channel'] 59 | if channel_messages: 60 | label_id += 1 61 | train, test = train_test_split(channel_messages, test_size=0.15) 62 | for t in train: 63 | text = t[1].replace(':', ';').replace('|', '/') 64 | vw_train.write('%s | %s\n' % (label_id, text)) 65 | for t in test: 66 | text = t[1].replace(':', ';').replace('|', '/') 67 | vw_test.write('%s | %s\n' % (label_id, text)) 68 | channel_messages = [] 69 | tokens = [t for t in tokens if t not in STOP_WORDS] 70 | if len(tokens) > 3: 71 | channel_messages.append((m['channel'], ' '.join(tokens))) 72 | 73 | # a last channel data 74 | label_id += 1 75 | train, test = train_test_split(channel_messages, test_size=0.15) 76 | for t in train: 77 | text = t[1].replace(':', ';').replace('|', '/') 78 | vw_train.write('%s | %s\n' % (label_id, text)) 79 | for t in test: 80 | text = t[1].replace(':', ';').replace('|', '/') 81 | vw_test.write('%s | %s\n' % (label_id, text)) 82 | 83 | 84 | '''def convert_to_vw_format(): 85 | file_name = 'fasttext_data.txt' 86 | file_name_vw = file_name.split('.')[0] + '.vw' 87 | previous_label = '' 88 | label_id = 0 89 | with codecs.open(file_name, 'r', encoding='utf-8') as f: 90 | with codecs.open(file_name_vw, 'w', encoding='utf-8') as f_out: 91 | for sentence in f: 92 | try: 93 | label, text = sentence.strip().split(' ', 1) 94 | if previous_label != label: 95 | label_id += 1 96 | previous_label = label 97 | text = text.replace(':', ';').replace('|', '/') 98 | f_out.write('%s | %s\n' % (label_id, text)) 99 | except Exception as e: 100 | print(e)''' 101 | 102 | 103 | def analyze(): 104 | total = 0 105 | correct = 0 106 | labels_total = defaultdict(int) 107 | labels_correct = defaultdict(int) 108 | y_true = [] 109 | y_pred = [] 110 | with codecs.open('vw_data_test.vw', encoding='utf-8') as f: 111 | with codecs.open('vw_data_test.vw.pred', encoding='utf-8') as f_pred: 112 
| for l in f: 113 | try: 114 | label_pred = f_pred.readline().strip() 115 | label, text = l.split(' | ') 116 | y_pred.append(int(label_pred)) 117 | y_true.append(int(label)) 118 | if (int(label) in [2, 8] and int(label_pred) in [2, 8]) and label != label_pred: 119 | if len(text.strip().split(' ')) <= 3: 120 | print('%s - %s' % (label, text.strip())) 121 | if label == label_pred: 122 | correct += 1 123 | labels_correct[label] += 1 124 | total += 1 125 | labels_total[label] += 1 126 | except: 127 | pass 128 | 129 | print('Accuracy total %s' % (correct / float(total))) 130 | for l, v in labels_correct.iteritems(): 131 | print('Accuracy for label %s: %s' % (l, v / float(labels_total[l]))) 132 | 133 | print(confusion_matrix(y_true, y_pred)) 134 | 135 | if __name__ == '__main__': 136 | # prepare_data() 137 | 138 | # here we train a model and predict on test data: bash vw.sh 139 | from subprocess import call 140 | call(["bash", "vw.sh", "vw_data_train.vw", "vw_data_test.vw", "0.05", "2"]) 141 | 142 | analyze() 143 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | numpy 3 | pymorphy2 4 | scikit-learn 5 | vowpalwabbit 6 | stop_words -------------------------------------------------------------------------------- /hackathon_1_may_2017/slack_data_loader.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | from collections import defaultdict 6 | 7 | import datetime 8 | import glob 9 | import json 10 | import os 11 | import re 12 | 13 | 14 | def _read_json_dict(filename, key='id'): 15 | with open(filename) as fin: 16 | records = json.load(fin) 17 | json_dict = { 18 | record[key]: record 19 | for record in records 20 | } 21 | return json_dict 22 | 23 | 24 | class SlackLoader(object): 25 | def __init__(self, export_path, exclude_channels=(), only_channels=(), start_date=None, end_date=None, 26 | is_sorted=True): 27 | self.exclude_channels = exclude_channels 28 | self.only_channels = only_channels 29 | if start_date: 30 | self.start_date = (start_date - datetime.datetime(1970, 1, 1)).total_seconds() 31 | else: 32 | self.start_date = None 33 | if end_date: 34 | self.end_date = (end_date - datetime.datetime(1970, 1, 1)).total_seconds() 35 | else: 36 | self.end_date = None 37 | self.channels = _read_json_dict(os.path.join(export_path, 'channels.json')) 38 | self.users = _read_json_dict(os.path.join(export_path, 'users.json')) 39 | self.messages = self.load_export(export_path, is_sorted) 40 | 41 | def load_export(self, export_path, is_sorted): 42 | messages = [] 43 | for channel_id, channel in self.channels.items(): 44 | if channel['is_archived']: 45 | continue 46 | if channel['name'] in self.exclude_channels: 47 | continue 48 | if self.only_channels and channel['name'] not in self.only_channels: 49 | continue 50 | messages_glob = os.path.join(export_path, channel['name'], '*.json') 51 | for messages_filename in glob.glob(messages_glob): 52 | with open(messages_filename) as f_messages: 53 | for record in json.load(f_messages): 54 | if 'subtype' in record: 55 | continue 56 | if 'ts' in record: 57 | if self.start_date and float(record['ts']) < self.start_date: 58 | continue 59 | if self.end_date and float(record['ts']) > self.end_date: 60 | continue 61 | record['ts'] = 
float(record['ts']) 62 | record['dt'] = datetime.datetime.fromtimestamp(record['ts']) 63 | record['channel'] = channel_id 64 | messages.append(record) 65 | if is_sorted: 66 | messages = sorted(messages, key=lambda x: x['ts']) 67 | 68 | return messages 69 | 70 | def find_threads(self): 71 | dd = defaultdict(list) 72 | for i in range(0, len(self.messages)): 73 | msg = self.messages[i] 74 | if "thread_ts" in msg: 75 | dd[msg["thread_ts"]].append(i) 76 | return list(dd.values()) 77 | 78 | re_slack_link = re.compile(r'(?P<(?P[^\|]*)(\|(?P[^>]*))?>)') 79 | 80 | 81 | def _extract_slack_link_id(m): 82 | return m.group('id') 83 | 84 | 85 | def normalize_links(text): 86 | return re_slack_link.sub(_extract_slack_link_id, text) 87 | 88 | 89 | if __name__ == '__main__': 90 | loader = SlackLoader('ODS_dump_Mar_10_2017', exclude_channels=['_random_flood', 'career']) 91 | print(len(loader.messages)) 92 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/test_simple_question_extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from slack_data_loader import SlackLoader\n", 12 | "import datetime\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import re\n", 16 | "from operator import itemgetter" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "path_to_dump = './opendatascience Slack export May 20 2017/'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "exporter = SlackLoader(path_to_dump, only_channels=('deep_learning',),\n", 37 | " start_date=datetime.datetime(2017, 1, 1))" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Loaded 7540 messages\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "print(\"Loaded {} messages\".format(len(exporter.messages)))" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 6, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "channel_attrs = ['id', 'name', 'created', 'creator', 'is_archived', 'is_general', 'pins', 'topic']\n", 66 | "\n", 67 | "def channels_to_df(channels):\n", 68 | " full_list = []\n", 69 | " for ch_id, ch_dict in channels.items():\n", 70 | " new_channel_dict = {}\n", 71 | " for k in channel_attrs:\n", 72 | " new_channel_dict[k] = ch_dict.get(k, None)\n", 73 | " new_channel_dict['num_members'] = len(ch_dict['members'])\n", 74 | " new_channel_dict['purpose'] = ch_dict['purpose']['value']\n", 75 | " full_list.append(new_channel_dict)\n", 76 | " return pd.DataFrame(full_list).set_index('id')\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 7, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/html": [ 87 | "<div>\n", 88 | "<style>\n", 89 | " .dataframe thead tr:only-child th {\n", 90 | " text-align: right;\n", 91 | " }\n", 92 | "\n", 93 | " .dataframe thead th {\n", 94 | " text-align: left;\n", 95 | " }\n", 96 | "\n", 97 | " .dataframe tbody tr th {\n", 98 | " vertical-align: top;\n", 99 | " }\n", 100 | "</style>\n", 101 | 
"<table border=\"1\" class=\"dataframe\">\n", 102 | " <thead>\n", 103 | " <tr style=\"text-align: right;\">\n", 104 | " <th></th>\n", 105 | " <th>created</th>\n", 106 | " <th>creator</th>\n", 107 | " <th>is_archived</th>\n", 108 | " <th>is_general</th>\n", 109 | " <th>name</th>\n", 110 | " <th>num_members</th>\n", 111 | " <th>pins</th>\n", 112 | " <th>purpose</th>\n", 113 | " <th>topic</th>\n", 114 | " </tr>\n", 115 | " <tr>\n", 116 | " <th>id</th>\n", 117 | " <th></th>\n", 118 | " <th></th>\n", 119 | " <th></th>\n", 120 | " <th></th>\n", 121 | " <th></th>\n", 122 | " <th></th>\n", 123 | " <th></th>\n", 124 | " <th></th>\n", 125 | " <th></th>\n", 126 | " </tr>\n", 127 | " </thead>\n", 128 | " <tbody>\n", 129 | " <tr>\n", 130 | " <th>C2A4GEL6M</th>\n", 131 | " <td>1473445368</td>\n", 132 | " <td>U04ELQZAU</td>\n", 133 | " <td>True</td>\n", 134 | " <td>False</td>\n", 135 | " <td>alexyashadasha</td>\n", 136 | " <td>0</td>\n", 137 | " <td>None</td>\n", 138 | " <td></td>\n", 139 | " <td>{'value': '', 'creator': '', 'last_set': '0'}</td>\n", 140 | " </tr>\n", 141 | " <tr>\n", 142 | " <th>C1P8YT7C7</th>\n", 143 | " <td>1467817046</td>\n", 144 | " <td>U04URBM8V</td>\n", 145 | " <td>False</td>\n", 146 | " <td>False</td>\n", 147 | " <td>bayesian</td>\n", 148 | " <td>307</td>\n", 149 | " <td>[{'id': '1467888432.000030', 'type': 'C', 'use...</td>\n", 150 | " <td>Church of Bayes: Discussing Bayesian statistic...</td>\n", 151 | " <td>{'value': ':bayes:', 'creator': 'U04ELQZAU', '...</td>\n", 152 | " </tr>\n", 153 | " <tr>\n", 154 | " <th>C0804BS5Q</th>\n", 155 | " <td>1437511383</td>\n", 156 | " <td>U049NHC4X</td>\n", 157 | " <td>False</td>\n", 158 | " <td>False</td>\n", 159 | " <td>big_data</td>\n", 160 | " <td>1301</td>\n", 161 | " <td>[{'id': '1485303977.000947', 'type': 'C', 'use...</td>\n", 162 | " <td>Hadoop, Spark и прочее\\r\\n\\r\\nПолезные материа...</td>\n", 163 | " <td>{'value': 'Big Pain in the ...', 'creator': 'U...</td>\n", 164 | " </tr>\n", 165 | " <tr>\n", 166 | " <th>C0MQQT6E6</th>\n", 167 | " <td>1455738772</td>\n", 168 | " <td>U070Y25AS</td>\n", 169 | " <td>False</td>\n", 170 | " <td>False</td>\n", 171 | " <td>bioinformatics</td>\n", 172 | " <td>125</td>\n", 173 | " <td>None</td>\n", 174 | " <td></td>\n", 175 | " <td>{'value': ':bioscience:', 'creator': 'U04ELQZA...</td>\n", 176 | " </tr>\n", 177 | " <tr>\n", 178 | " <th>C115898GZ</th>\n", 179 | " <td>1460749144</td>\n", 180 | " <td>U04422XJL</td>\n", 181 | " <td>True</td>\n", 182 | " <td>False</td>\n", 183 | " <td>blackoxchallenge</td>\n", 184 | " <td>0</td>\n", 185 | " <td>None</td>\n", 186 | " <td></td>\n", 187 | " <td>{'value': '', 'creator': '', 'last_set': '0'}</td>\n", 188 | " </tr>\n", 189 | " </tbody>\n", 190 | "</table>\n", 191 | "</div>" 192 | ], 193 | "text/plain": [ 194 | " created creator is_archived is_general name \\\n", 195 | "id \n", 196 | "C2A4GEL6M 1473445368 U04ELQZAU True False alexyashadasha \n", 197 | "C1P8YT7C7 1467817046 U04URBM8V False False bayesian \n", 198 | "C0804BS5Q 1437511383 U049NHC4X False False big_data \n", 199 | "C0MQQT6E6 1455738772 U070Y25AS False False bioinformatics \n", 200 | "C115898GZ 1460749144 U04422XJL True False blackoxchallenge \n", 201 | "\n", 202 | " num_members pins \\\n", 203 | "id \n", 204 | "C2A4GEL6M 0 None \n", 205 | "C1P8YT7C7 307 [{'id': '1467888432.000030', 'type': 'C', 'use... \n", 206 | "C0804BS5Q 1301 [{'id': '1485303977.000947', 'type': 'C', 'use... 
\n", 207 | "C0MQQT6E6 125 None \n", 208 | "C115898GZ 0 None \n", 209 | "\n", 210 | " purpose \\\n", 211 | "id \n", 212 | "C2A4GEL6M \n", 213 | "C1P8YT7C7 Church of Bayes: Discussing Bayesian statistic... \n", 214 | "C0804BS5Q Hadoop, Spark и прочее\\r\\n\\r\\nПолезные материа... \n", 215 | "C0MQQT6E6 \n", 216 | "C115898GZ \n", 217 | "\n", 218 | " topic \n", 219 | "id \n", 220 | "C2A4GEL6M {'value': '', 'creator': '', 'last_set': '0'} \n", 221 | "C1P8YT7C7 {'value': ':bayes:', 'creator': 'U04ELQZAU', '... \n", 222 | "C0804BS5Q {'value': 'Big Pain in the ...', 'creator': 'U... \n", 223 | "C0MQQT6E6 {'value': ':bioscience:', 'creator': 'U04ELQZA... \n", 224 | "C115898GZ {'value': '', 'creator': '', 'last_set': '0'} " 225 | ] 226 | }, 227 | "execution_count": 7, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "df = channels_to_df(exporter.channels)\n", 234 | "df.head()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 41, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "qwords = (\"как\", \"какой\", \"зачем\", \"почему\", \"когда\", \"кто\", \"где\", \"когда\", \"куда\", \"чот\")\n", 244 | "splitter = re.compile(r\"(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s\")\n", 245 | "\n", 246 | "import pymorphy2\n", 247 | "morph = pymorphy2.MorphAnalyzer()\n", 248 | "\n", 249 | "def lemm(st):\n", 250 | " if st == '':\n", 251 | " return ''\n", 252 | " else:\n", 253 | " return morph.parse(st)[0].normal_form\n", 254 | "\n", 255 | "def is_question(d):\n", 256 | " x = d.lower()\n", 257 | " snt = x.split()\n", 258 | " num_words = len(snt)\n", 259 | " snt = [lemm(w) for w in snt]\n", 260 | " #print((num_words > 4) and any(w in qwords for w in snt), [w in qwords for w in snt])\n", 261 | " return (num_words > 4) and any(w in qwords for w in snt)\n", 262 | "\n", 263 | "def contains_sentance_with_questions(d):\n", 264 | " x = d['text'].lower()\n", 265 | " sents = splitter.split(x)\n", 266 | " #print(any(map(is_question, sents)))\n", 267 | " return any(map(is_question, sents))\n", 268 | "\n", 269 | "questions = list(filter(contains_sentance_with_questions, exporter.messages))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "questions = list(filter(contains_sentance_with_questions, exporter.messages))" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 42, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "found 1255 questions\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "print(\"found {} questions\".format(len(questions)))" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "for _text in map(itemgetter('text'), questions):\n", 305 | " print(_text)\n", 306 | " print('-'*40)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 3", 322 | "language": "python", 323 | "name": "python3" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 3 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | 
"nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython3", 335 | "version": "3.6.4" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 2 340 | } 341 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | GROUPING_SPACE_REGEX = re.compile(r'([^@\w_\-])', re.UNICODE | re.MULTILINE) 5 | 6 | ALPHABET = re.compile(u'[A-Za-zА-ЯЁа-яё]') 7 | 8 | # special tokens to be found before system processing 9 | web_address_re = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') 10 | email_re = re.compile(u'[а-яёА-ЯЁA-Za-z0-9.+_-]+@[^@]+\.[a-zA-Zа-яёА-ЯЁ]+') 11 | number_re = re.compile(u'(^|[^\w\-])[+\-]*[0-9]+[0-9 =―—–_:.,/x×\-*]*[\-ыхейюомуя]*([^\w\-]|$)', re.UNICODE) 12 | 13 | 14 | def simple_word_tokenize(text, _split=GROUPING_SPACE_REGEX.split): 15 | """ 16 | Split text into tokens. Don't split by a hyphen and an underscore. 17 | Preserve punctuation, but not whitespaces. 18 | """ 19 | return [t for t in _split(text) if t] 20 | 21 | 22 | def replace_number(match_obj): 23 | return u'%sNUM%s' % (match_obj.group(1), match_obj.group(2)) 24 | 25 | 26 | def tokenize(text): 27 | inp_tokens = simple_word_tokenize(text) 28 | tokens_len = len(inp_tokens) 29 | output_tokens = [] 30 | # combine some tokens together: contractions, smileys, emoticons, etc. 31 | for index, token in enumerate(inp_tokens): 32 | # contractions with length < 5 33 | if token in u'.' and 0 < index < tokens_len - 1 and inp_tokens[index + 1] not in u'.?-–—)\'"”»' and \ 34 | output_tokens and len(output_tokens[-1]) < 5: 35 | output_tokens[-1] += token 36 | # english contractions 37 | elif token in [u's', u've', u'm', u'll', u're', u'd', u't'] and index > 0 and inp_tokens[index - 1] in u'\'`': 38 | output_tokens[-1] += token 39 | # cut a hyphen off from the beginning of a word 40 | elif token[0] == u'-' and len(token) > 1 and ALPHABET.match(token[1]): 41 | output_tokens.append(u'-') 42 | output_tokens.append(token[1:]) 43 | # !? or ?! 44 | elif token in u'?!' and index > 0 and inp_tokens[index - 1] in u'?!': 45 | if len(output_tokens[-1]) < 2: 46 | output_tokens[-1] += token 47 | # repetition of dots, question marks, slashes, etc 48 | elif token in u'.,?!^*/=:;«»"“”-–—@+()_❤☀' and index > 0 and inp_tokens[index - 1] == token: 49 | if len(output_tokens[-1]) < 2: 50 | output_tokens[-1] += token 51 | # smileys, emoticons 52 | elif token in u'-–—/_{}()[]<>`*:^=DP' and index > 0 and inp_tokens[index - 1] and \ 53 | inp_tokens[index - 1] in u'/`^:{}()[]<>*%=;-–—_': 54 | output_tokens[-1] += token 55 | else: 56 | if not token.isspace(): 57 | output_tokens.append(token) 58 | return output_tokens 59 | 60 | 61 | def preprocessing(sent): 62 | # replace URL address on URL token, e-mail on EMAIL and numbers on NUM before tokenizing 63 | # sent = web_address_re.sub('URL', ) # number_re.sub(replace_number, sent) 64 | sent = email_re.sub('EMAIL', sent).replace('\n', '').strip().lower() 65 | sent = sent.replace(u'ё', u'е').replace('"', '"').replace('<', '<').replace('>', '>'). 
\ 66 | replace('&', '&').replace(''', '`').replace('', '').replace('<br>', '') 67 | return sent 68 | 69 | if __name__ == '__main__': 70 | # test 71 | import codecs 72 | from time import time 73 | 74 | total = 0 75 | error = 0 76 | with codecs.open('tokens.txt', 'w', encoding='utf-8') as f_out: 77 | with codecs.open('sentences.txt', 'r', encoding='utf-8') as f: 78 | start_time = time() 79 | for sentence in f: 80 | sentence = sentence.strip() 81 | if sentence: 82 | sentence = preprocessing(sentence) 83 | tokens = tokenize(sentence) 84 | f_out.write(u' '.join(tokens)+'\n') 85 | print 'Execution time: %s' % (time() - start_time) 86 | -------------------------------------------------------------------------------- /hackathon_1_may_2017/vw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILENAME_TRAIN=$1 4 | FILENAME_TEST=$2 5 | LR=$3 6 | NGRAMS=$4 7 | # LINES=`wc -l ${FILENAME}` 8 | # POS=`expr "${LINES}" : '.* '` 9 | # COUNT_OF_LINES=${LINES:0:${POS}} 10 | # COUNT_OF_LINES=$((COUNT_OF_LINES * 85 / 100)) 11 | # echo $COUNT_OF_LINES 12 | 13 | # POS=`expr "${FILENAME}" : '.*\.'` 14 | # NAME=${FILENAME:0:${POS} - 1} 15 | # echo $NAME 16 | 17 | #gshuf ${FILENAME} >> split -l $COUNT_OF_LINES 18 | #mv xaa ${NAME}_train.vw 19 | #mv xab ${NAME}_test.vw 20 | 21 | gshuf ${FILENAME_TRAIN} -o ${FILENAME_TRAIN} 22 | 23 | cd ../vowpal_wabbit/vowpalwabbit/ 24 | ./vw -c -k -b 25 --oaa 12 -l ${LR} --ngram ${NGRAMS} -d ../../Introspect_hackathon/${FILENAME_TRAIN} -f vw_ods_channels.bin --passes 20 --holdout_off 25 | ./vw -t -i vw_ods_channels.bin -d ../../Introspect_hackathon/${FILENAME_TEST} 26 | 27 | # predict 28 | ./vw -t -i vw_ods_channels.bin -d ../../Introspect_hackathon/${FILENAME_TEST} -p ../../Introspect_hackathon/${FILENAME_TEST}.pred --quiet 29 | 30 | mv vw_ods_channels.bin ../../Introspect_hackathon/models/ -------------------------------------------------------------------------------- /hackathon_2_march_2018/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_2_march_2018/.DS_Store -------------------------------------------------------------------------------- /hackathon_2_march_2018/README.md: -------------------------------------------------------------------------------- 1 | # Код, данные и заметки с хакатона в Mail.ru 2 | Загружайте сюда и пишите простое пояснение 3 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/README.md: -------------------------------------------------------------------------------- 1 | # Код для экспорта данных из JSON-дампа Slack в БД 2 | 3 | Получает данные про 4 | - пользователей (таблица imported_user_data) 5 | - каналы (таблица imported_channel) 6 | - сообщения (таблица imported_messages) 7 | - реакции на сообщения по юзерам (таблица imported_reactions) 8 | - количество реакций на сообщения по типам (таблица imported_reactions_count) 9 | 10 | # Prerequisites 11 | 12 | ``` 13 | python3 -m pip install -r requirements.txt 14 | ``` 15 | 16 | # Usage 17 | 18 | ``` 19 | python3 run.py 20 | ``` 21 | 22 | Предполагается, что в ../data лежат разархивированный ODS-дамп. 23 | 24 | База (по умолчанию sqlite) будет лежать в ../ods-slack.db. 25 | 26 | # Notes 27 | 28 | - В imported_reactions не все юзеры, т.к. 
в дампе указаны не все юзеры, поставившие смайл.<br/> 29 | Дамп отображает on-hover поведение Slack: показывает ~50 именованных юзеров, а дальше пишет and 42 others.<br/> 30 | Вот эти 42 юзера не попали в дамп. 31 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/msg_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, logging 4 | import json 5 | from pprint import pprint 6 | from collections import Counter 7 | from sqlalchemy.ext.declarative import declarative_base 8 | from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, Date, MetaData, ForeignKey 9 | from sqlalchemy.engine.url import URL 10 | from sqlalchemy.orm import sessionmaker 11 | from sqlalchemy.pool import NullPool 12 | 13 | Base = declarative_base() 14 | 15 | class Channel(Base): 16 | __tablename__ = 'imported_channel' 17 | 18 | name = Column('name', String, primary_key=True) 19 | 20 | def __init__(self, name): 21 | self.name = name 22 | 23 | def __repr__(self): 24 | return "<Channel({0})>".format(self.name) 25 | 26 | 27 | class Message(Base): 28 | __tablename__ = 'imported_messages' 29 | 30 | channel = Column('channel', String, primary_key=True) 31 | ts = Column('ts', String, primary_key=True) 32 | 33 | type = Column('type', String) 34 | text = Column('text', String) 35 | user = Column('user', String) 36 | thread_ts = Column('thread_ts', String) 37 | parent_user_id = Column('parent_user_id', String) 38 | subtype = Column('subtype', String) 39 | # reactions = Column('reactions', String) table 40 | # edited = Column('edited', String) later 41 | # attachments = Column('attachments', String) table 42 | reply_count = Column('reply_count', Integer) 43 | # replies = Column('replies', String) table 44 | unread_count = Column('unread_count', Integer) 45 | bot_id = Column('bot_id', String) 46 | username = Column('username', String) 47 | # file = Column('file', String) 48 | 49 | def __init__(self, channel, data): 50 | self.channel = channel 51 | self.ts = data.get('ts', '') 52 | 53 | self.type = data.get('type', '') 54 | self.text = str(data.get('text', '')) 55 | self.user = data.get('user', '') 56 | self.thread_ts = data.get('thread_ts', '') 57 | self.parent_user_id = data.get('parent_user_id', '') 58 | self.subtype = data.get('subtype', '') 59 | # self.reactions = data.get('reactions', '') 60 | # self.edited = data.get('edited', '') 61 | # self.attachments = data.get('attachments', '') 62 | self.reply_count = data.get('reply_count', '') 63 | self.replies = data.get('replies', '') 64 | self.unread_count = data.get('unread_count', '') 65 | self.bot_id = data.get('bot_id', '') 66 | self.username = data.get('username', '') 67 | 68 | def __repr__(self): 69 | return "<Message({0}, {1}, {2})>".format(self.channel, self.date, self.index) 70 | 71 | 72 | def parse_messages(session, data_path): 73 | # c = Counter() 74 | dirs = [e.name for e in os.scandir(data_path) if e.is_dir()] 75 | # msg_keys = set() 76 | for dir in dirs: 77 | pprint(data_path + os.sep + dir) 78 | # if dir != 'welcome': 79 | # continue 80 | for d, dirs, files in os.walk(data_path + os.sep + dir): 81 | channel = d.split('/')[-1] 82 | session.add(Channel(channel)) 83 | for f in files: 84 | path = os.path.join(d, f) 85 | # print(path) 86 | data = json.load(open(path)) 87 | for msg in data: 88 | session.add(Message(channel, msg)) 89 | # c.update(msg.keys()) 90 | # for key in msg.keys(): 91 | # 
msg_keys.add(key) 92 | # session.add() 93 | session.commit() 94 | # pprint(msg_keys) 95 | # pprint(c.most_common(60)) 96 | # print(len(msg_keys), '\n') 97 | 98 | 99 | # if __name__ == '__main__': 100 | # Base.metadata.create_all(engine) 101 | # parse_messages('../data') 102 | # # print(c.most_common(100)) 103 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/reaction_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, logging 4 | import json 5 | from pprint import pprint 6 | from collections import Counter 7 | from sqlalchemy.ext.declarative import declarative_base 8 | from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, Date, MetaData, ForeignKey 9 | from sqlalchemy.engine.url import URL 10 | from sqlalchemy.orm import sessionmaker 11 | from sqlalchemy.pool import NullPool 12 | 13 | Base = declarative_base() 14 | 15 | class Reaction(Base): 16 | __tablename__ = 'imported_reactions' 17 | 18 | channel = Column('channel', String, primary_key=True) 19 | message_ts = Column('message_ts', String, primary_key=True) 20 | user_id = Column('user_id', String, primary_key=True) 21 | name = Column('name', String, primary_key=True) 22 | 23 | def __init__(self, channel, message_ts, user_id, name): 24 | self.channel = channel 25 | self.message_ts = message_ts 26 | self.user_id = user_id 27 | self.name = name 28 | 29 | def __repr__(self): 30 | return "<Reaction({0}, {1}, {2})>".format(self.name, self.message_ts, self.user_id) 31 | 32 | 33 | class ReactionCount(Base): 34 | __tablename__ = 'imported_reactions_count' 35 | 36 | channel = Column('channel', String, primary_key=True) 37 | message_ts = Column('message_ts', String, primary_key=True) 38 | name = Column('name', String, primary_key=True) 39 | count = Column('count', Integer) 40 | 41 | def __init__(self, channel, message_ts, name, count): 42 | self.channel = channel 43 | self.message_ts = message_ts 44 | self.name = name 45 | self.count = count 46 | 47 | def __repr__(self): 48 | return "<Reaction({0}, {1}, {2})>".format(self.name, self.message_ts, self.count) 49 | 50 | 51 | def parse_reactions(session, data_path): 52 | # c = Counter() 53 | dirs = [e.name for e in os.scandir(data_path) if e.is_dir()] 54 | # msg_keys = set() 55 | for dir in dirs: 56 | pprint(data_path + os.sep + dir) 57 | # if dir != 'welcome': 58 | # continue 59 | for d, dirs, files in os.walk(data_path + os.sep + dir): 60 | channel = d.split('/')[-1] 61 | # session.add(Channel(channel)) 62 | for f in files: 63 | path = os.path.join(d, f) 64 | # print(path) 65 | data = json.load(open(path)) 66 | for msg in data: 67 | if 'reactions' in msg.keys(): 68 | for reaction in msg['reactions']: 69 | for user in reaction['users']: 70 | session.add(Reaction(channel, msg['ts'], user, reaction['name'])) 71 | # c.update(msg.keys()) 72 | # for key in msg.keys(): 73 | # msg_keys.add(key) 74 | # session.add() 75 | session.commit() 76 | # pprint(msg_keys) 77 | # pprint(c.most_common(60)) 78 | # print(len(msg_keys), '\n') 79 | 80 | 81 | def parse_reactions_count(session, data_path): 82 | # c = Counter() 83 | dirs = [e.name for e in os.scandir(data_path) if e.is_dir()] 84 | # msg_keys = set() 85 | for dir in dirs: 86 | pprint(data_path + os.sep + dir) 87 | # if dir != 'welcome': 88 | # continue 89 | for d, dirs, files in os.walk(data_path + os.sep + dir): 90 | channel = d.split('/')[-1] 91 | # 
session.add(Channel(channel)) 92 | for f in files: 93 | path = os.path.join(d, f) 94 | # print(path) 95 | data = json.load(open(path)) 96 | for msg in data: 97 | if 'reactions' in msg.keys(): 98 | for reaction in msg['reactions']: 99 | session.add(ReactionCount(channel, msg['ts'], reaction['name'], reaction['count'])) 100 | # c.update(msg.keys()) 101 | # for key in msg.keys(): 102 | # msg_keys.add(key) 103 | # session.add() 104 | session.commit() 105 | # pprint(msg_keys) 106 | # pprint(c.most_common(60)) 107 | # print(len(msg_keys), '\n') 108 | 109 | # if __name__ == '__main__': 110 | # Base.metadata.create_all(engine) 111 | # parse_reactions_count('../data') 112 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, logging 4 | import json 5 | from pprint import pprint 6 | from sqlalchemy.ext.declarative import declarative_base 7 | from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, MetaData, ForeignKey 8 | from sqlalchemy.engine.url import URL 9 | from sqlalchemy.orm import sessionmaker 10 | from sqlalchemy.pool import NullPool 11 | 12 | logger = logging.getLogger(__name__) 13 | local_name = 'sqlite:///../ods-slack.db' 14 | remote_name = 'postgres://usgbqmayetwlrv:a8b6a60b922bd6d08c3e94fa41eac937f71ed3bc4afade4995a3bdf5d54e36ca@ec2-54-247-81-88.eu-west-1.compute.amazonaws.com:5432/d7942vtj104cpv' 15 | engine = create_engine(local_name, echo=True) 16 | Base = declarative_base() 17 | Session = sessionmaker(bind=engine) 18 | dump_ODS_path = '../data' 19 | 20 | import users_parser 21 | import msg_parser 22 | import reaction_parser 23 | 24 | if __name__ == '__main__': 25 | Base.metadata.create_all(engine) 26 | users_parser.Base.metadata.create_all(engine) 27 | msg_parser.Base.metadata.create_all(engine) 28 | reaction_parser.Base.metadata.create_all(engine) 29 | Base.metadata.create_all(engine) 30 | 31 | session = Session() 32 | 33 | # users_parser.UserData.__table__.drop(engine) 34 | # msg_parser.Channel.__table__.drop(engine) 35 | # msg_parser.Message.__table__.drop(engine) 36 | # reaction_parser.Reaction.__table__.drop(engine) 37 | # reaction_parser.ReactionCount.__table__.drop(engine) 38 | 39 | users_parser.parse_users(session, dump_ODS_path) 40 | msg_parser.parse_messages(session, dump_ODS_path) 41 | reaction_parser.parse_reactions(session, dump_ODS_path) 42 | reaction_parser.parse_reactions_count(session, dump_ODS_path) 43 | 44 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/data_fetch/users_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, logging 4 | import json 5 | from pprint import pprint 6 | from sqlalchemy.ext.declarative import declarative_base 7 | from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, MetaData, ForeignKey 8 | from sqlalchemy.engine.url import URL 9 | from sqlalchemy.orm import sessionmaker 10 | from sqlalchemy.pool import NullPool 11 | 12 | Base = declarative_base() 13 | 14 | class UserData(Base): 15 | __tablename__ = 'imported_user_data' 16 | 17 | id = Column('id', String, primary_key=True) 18 | name = Column('name', String) 19 | deleted = Column('deleted', Boolean) 20 | 21 | tz = Column('tz', String) 22 | tz_label = Column('tz_label', String) 23 | 
tz_offset = Column('tz_offset', Integer) 24 | is_admin = Column('is_admin', Boolean) 25 | is_owner = Column('is_owner', Boolean) 26 | is_primary_owner = Column('is_primary_owner', Boolean) 27 | is_restricted = Column('is_restricted', Boolean) 28 | is_ultra_restricted = Column('is_ultra_restricted', Boolean) 29 | is_bot = Column('is_bot', Boolean) 30 | updated = Column('updated', Integer) 31 | is_app_user = Column('is_app_user', Boolean) 32 | 33 | title = Column('title', String) 34 | phone = Column('phone', String) 35 | skype = Column('skype', String) 36 | real_name = Column('real_name', String) 37 | real_name_normalized = Column('real_name_normalized', String) 38 | display_name = Column('display_name', String) 39 | display_name_normalized = Column('display_name_normalized', String) 40 | fields = Column('fields', String) 41 | status_text = Column('status_text', String) 42 | status_emoji = Column('status_emoji', String) 43 | avatar_hash = Column('avatar_hash', String) 44 | first_name = Column('first_name', String) 45 | last_name = Column('last_name', String) 46 | image_24 = Column('image_24', String) 47 | image_32 = Column('image_32', String) 48 | image_48 = Column('image_48', String) 49 | image_72 = Column('image_72', String) 50 | image_192 = Column('image_192', String) 51 | image_512 = Column('image_512', String) 52 | team = Column('team', String) 53 | 54 | def __init__(self, data): 55 | self.id = data.get('id') 56 | self.name = data.get('name') 57 | self.deleted = data.get('deleted', False) 58 | 59 | self.tz = data.get('tz', '') 60 | self.tz_label = data.get('tz_label', '') 61 | self.tz_offset = data.get('tz_offset', 0) 62 | self.is_admin = data.get('is_admin', False) 63 | self.is_owner = data.get('is_owner', False) 64 | self.is_primary_owner = data.get('is_primary_owner', False) 65 | self.is_restricted = data.get('is_restricted', False) 66 | self.is_ultra_restricted = data.get('is_ultra_restricted', False) 67 | self.is_bot = data.get('is_bot', False) 68 | self.updated = data.get('updated', 0) 69 | self.is_app_user = data.get('is_app_user', False) 70 | 71 | self.title = data.get('profile', {}).get('title', '') 72 | self.phone = data.get('profile', {}).get('phone', '') 73 | self.skype = data.get('profile', {}).get('skype', '') 74 | self.real_name = data.get('profile', {}).get('real_name', '') 75 | self.real_name_normalized = data.get('profile', {}).get('real_name_normalized', '') 76 | self.display_name = data.get('profile', {}).get('display_name', '') 77 | self.display_name_normalized = data.get('profile', {}).get('display_name_normalized', '') 78 | # self.fields = data.get('fields', '') 79 | self.status_text = data.get('profile', {}).get('status_text', '') 80 | self.status_emoji = data.get('profile', {}).get('status_emoji', '') 81 | self.avatar_hash = data.get('profile', {}).get('avatar_hash', '') 82 | self.first_name = data.get('profile', {}).get('first_name', '') 83 | self.last_name = data.get('profile', {}).get('last_name', '') 84 | self.image_24 = data.get('profile', {}).get('image_24', '') 85 | self.image_32 = data.get('profile', {}).get('image_32', '') 86 | self.image_48 = data.get('profile', {}).get('image_48', '') 87 | self.image_72 = data.get('profile', {}).get('image_72', '') 88 | self.image_192 = data.get('profile', {}).get('image_192', '') 89 | self.image_512 = data.get('profile', {}).get('image_512', '') 90 | self.team = data.get('profile', {}).get('team', '') 91 | 92 | def __repr__(self): 93 | return "<UserData({0}, {1})>".format(self.id, self.real_name) 94 | 95 | 96 | 97 | 
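# Not part of the original users_parser.py: a quick sanity check that could be
# run after run.py has populated the database. The sqlite path mirrors run.py's
# local_name ('sqlite:///../ods-slack.db'); adjust it if another DB is used.
import sqlite3

def count_imported_users(db_path='../ods-slack.db'):
    with sqlite3.connect(db_path) as conn:
        # imported_user_data is the table created from the UserData model above
        (n,) = conn.execute('SELECT COUNT(*) FROM imported_user_data').fetchone()
    return n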
def parse_users(session, data_path): 98 | json_path = data_path + '/users.json' 99 | data = json.load(open(json_path)) 100 | for user in data: 101 | session.add(UserData(user)) 102 | session.commit() 103 | 104 | 105 | def parse_messages_get_fields(data_path): 106 | json_path = data_path + '/users.json' 107 | fields_ods = {'skype' : 'Xf0DANL9SL', 'github' : 'Xf3WC3HJMR' } 108 | data = json.load(open(json_path)) 109 | with open('fields.csv', 'w') as f: 110 | for user in data: 111 | skype = '' 112 | github = '' 113 | if user['profile'].get('fields', ''): 114 | if fields_ods['skype'] in user['profile']['fields']: 115 | skype = user['profile']['fields'][fields_ods['skype']] 116 | if skype['alt'] != '': 117 | skype = skype['alt'] 118 | else: 119 | skype = skype['value'] 120 | if fields_ods['github'] in user['profile']['fields']: 121 | github = user['profile']['fields'][fields_ods['github']] 122 | if github['alt'] != '': 123 | github = github['alt'] 124 | else: 125 | github = github['value'] 126 | f.write('"' + '","'.join([user['id'], user['name'], user['profile']['title'], 127 | user['profile'].get('real_name_normalized', ''), 128 | user['profile'].get('first_name', ''), user['profile'].get('last_name', ''), 129 | skype, github]) + '"\n') 130 | 131 | 132 | if __name__ == '__main__': 133 | parse_messages_get_fields('../data/users.json') -------------------------------------------------------------------------------- /hackathon_2_march_2018/topic_modelling/01. clean_text_parsing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "\n", 14 | "import artm\n", 15 | "import json\n", 16 | "import re\n", 17 | "\n", 18 | "import os\n", 19 | "\n", 20 | "import nltk\n", 21 | "from nltk.stem import SnowballStemmer\n", 22 | "from nltk.corpus import brown\n", 23 | "\n", 24 | "from tqdm import tqdm, tqdm_notebook, tqdm_pandas\n", 25 | "\n", 26 | "\n", 27 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stderr", 37 | "output_type": "stream", 38 | "text": [ 39 | "<input>:8: DeprecationWarning: invalid escape sequence \\w\n", 40 | "<input>:8: DeprecationWarning: invalid escape sequence \\w\n", 41 | "<ipython-input-3-e9dd4ed0e002>:8: DeprecationWarning: invalid escape sequence \\w\n", 42 | " stem = re.sub('[!@#$:]', '', ' '.join(re.findall('\\w{4,}', str(stem).lower())))\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "stemmer = SnowballStemmer('russian')\n", 48 | "\n", 49 | "def clean_text(document):\n", 50 | " #stem = BeautifulSoup(document, 'xml').get_text()\n", 51 | " document = str(document)\n", 52 | " stem=[stemmer.stem(w) for w in document.split()]\n", 53 | " stem= ' '.join(stem)\n", 54 | " stem = re.sub('[!@#$:]', '', ' '.join(re.findall('\\w{4,}', str(stem).lower())))\n", 55 | " return(stem)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "df = pd.read_csv('../../data/ods_dump/messages.csv')" 76 | ] 77 | }, 78 | 
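The clean_text cell earlier in this notebook raises DeprecationWarnings because '\w{4,}' is written as a plain (non-raw) string. A behaviour-preserving rewrite (not part of the original notebook) using raw regex strings:

import re
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('russian')

def clean_text(document):
    # stem every whitespace-separated token with the Russian Snowball stemmer
    stems = ' '.join(stemmer.stem(w) for w in str(document).split())
    # keep lowercased "words" of 4+ characters and strip leftover punctuation
    return re.sub(r'[!@#$:]', '', ' '.join(re.findall(r'\w{4,}', stems.lower())))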
{ 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "(751861, 2)" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "df.shape" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "df['for_del1'] = df['text'].apply(lambda x:1 if 'channel' in str(x) else 0)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "metadata": { 122 | "collapsed": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "df['for_del2'] = df['text'].apply(lambda x:1 if 'upload' in str(x) else 0)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 7, 132 | "metadata": { 133 | "collapsed": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "df['for_del3'] = df['text'].apply(lambda x:1 if 'joined' in str(x) else 0)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "df['for_del4'] = df['text'].apply(lambda x:1 if 'added' in str(x) else 0)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "df['for_del'] = df['for_del1'] + df['for_del2'] + df['for_del3'] + df['for_del4'] " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 10, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "df_clean = df[df['for_del'] == 0][['user', 'text']]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 11, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/html": [ 181 | "<div>\n", 182 | "<style>\n", 183 | " .dataframe thead tr:only-child th {\n", 184 | " text-align: right;\n", 185 | " }\n", 186 | "\n", 187 | " .dataframe thead th {\n", 188 | " text-align: left;\n", 189 | " }\n", 190 | "\n", 191 | " .dataframe tbody tr th {\n", 192 | " vertical-align: top;\n", 193 | " }\n", 194 | "</style>\n", 195 | "<table border=\"1\" class=\"dataframe\">\n", 196 | " <thead>\n", 197 | " <tr style=\"text-align: right;\">\n", 198 | " <th></th>\n", 199 | " <th>user</th>\n", 200 | " <th>text</th>\n", 201 | " </tr>\n", 202 | " </thead>\n", 203 | " <tbody>\n", 204 | " <tr>\n", 205 | " <th>0</th>\n", 206 | " <td>U1UMQM200</td>\n", 207 | " <td><@U1Z2QA4EM> как избавиться от рекурсии?</td>\n", 208 | " </tr>\n", 209 | " <tr>\n", 210 | " <th>1</th>\n", 211 | " <td>U1Z2QA4EM</td>\n", 212 | " <td><@U1UMQM200>: избавиться от искушения - это ка...</td>\n", 213 | " </tr>\n", 214 | " <tr>\n", 215 | " <th>2</th>\n", 216 | " <td>U09JEC7V0</td>\n", 217 | " <td><@U1Z2QA4EM> в психотерапию умеешь?</td>\n", 218 | " </tr>\n", 219 | " <tr>\n", 220 | " <th>3</th>\n", 221 | " <td>U1Z2QA4EM</td>\n", 222 | " <td><@U09JEC7V0>: ох уж этот реверс в аметисты сос...</td>\n", 223 | " </tr>\n", 224 | " <tr>\n", 225 | " <th>4</th>\n", 226 | " <td>U065VP6F7</td>\n", 227 | " <td><@U1Z2QA4EM> может ты у мамки психолог?</td>\n", 228 | " </tr>\n", 229 | " </tbody>\n", 230 | "</table>\n", 231 | 
"</div>" 232 | ], 233 | "text/plain": [ 234 | " user text\n", 235 | "0 U1UMQM200 <@U1Z2QA4EM> как избавиться от рекурсии?\n", 236 | "1 U1Z2QA4EM <@U1UMQM200>: избавиться от искушения - это ка...\n", 237 | "2 U09JEC7V0 <@U1Z2QA4EM> в психотерапию умеешь?\n", 238 | "3 U1Z2QA4EM <@U09JEC7V0>: ох уж этот реверс в аметисты сос...\n", 239 | "4 U065VP6F7 <@U1Z2QA4EM> может ты у мамки психолог?" 240 | ] 241 | }, 242 | "execution_count": 11, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "df_clean.head()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 16, 254 | "metadata": { 255 | "collapsed": true 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "#tqdm.pandas()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 1, 265 | "metadata": { 266 | "collapsed": true 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "#df_clean['stem_text'] = df_clean['text'].progress_apply(lambda x: clean_text(x))" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 2, 276 | "metadata": { 277 | "collapsed": true, 278 | "scrolled": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "#df_clean.to_csv('../../data/ods_dump/clean_message.csv')" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 3, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/html": [ 293 | "<div>\n", 294 | "<style>\n", 295 | " .dataframe thead tr:only-child th {\n", 296 | " text-align: right;\n", 297 | " }\n", 298 | "\n", 299 | " .dataframe thead th {\n", 300 | " text-align: left;\n", 301 | " }\n", 302 | "\n", 303 | " .dataframe tbody tr th {\n", 304 | " vertical-align: top;\n", 305 | " }\n", 306 | "</style>\n", 307 | "<table border=\"1\" class=\"dataframe\">\n", 308 | " <thead>\n", 309 | " <tr style=\"text-align: right;\">\n", 310 | " <th></th>\n", 311 | " <th>Unnamed: 0</th>\n", 312 | " <th>user</th>\n", 313 | " <th>text</th>\n", 314 | " <th>stem_text</th>\n", 315 | " </tr>\n", 316 | " </thead>\n", 317 | " <tbody>\n", 318 | " <tr>\n", 319 | " <th>0</th>\n", 320 | " <td>0</td>\n", 321 | " <td>U1UMQM200</td>\n", 322 | " <td><@U1Z2QA4EM> как избавиться от рекурсии?</td>\n", 323 | " <td>u1z2qa4em избав рекурсии</td>\n", 324 | " </tr>\n", 325 | " <tr>\n", 326 | " <th>1</th>\n", 327 | " <td>1</td>\n", 328 | " <td>U1Z2QA4EM</td>\n", 329 | " <td><@U1UMQM200>: избавиться от искушения - это ка...</td>\n", 330 | " <td>u1umqm200 избав искушен контрольн выстрел голов</td>\n", 331 | " </tr>\n", 332 | " <tr>\n", 333 | " <th>2</th>\n", 334 | " <td>2</td>\n", 335 | " <td>U09JEC7V0</td>\n", 336 | " <td><@U1Z2QA4EM> в психотерапию умеешь?</td>\n", 337 | " <td>u1z2qa4em психотерап умеешь</td>\n", 338 | " </tr>\n", 339 | " </tbody>\n", 340 | "</table>\n", 341 | "</div>" 342 | ], 343 | "text/plain": [ 344 | " Unnamed: 0 user text \\\n", 345 | "0 0 U1UMQM200 <@U1Z2QA4EM> как избавиться от рекурсии? \n", 346 | "1 1 U1Z2QA4EM <@U1UMQM200>: избавиться от искушения - это ка... \n", 347 | "2 2 U09JEC7V0 <@U1Z2QA4EM> в психотерапию умеешь? 
\n", 348 | "\n", 349 | " stem_text \n", 350 | "0 u1z2qa4em избав рекурсии \n", 351 | "1 u1umqm200 избав искушен контрольн выстрел голов \n", 352 | "2 u1z2qa4em психотерап умеешь " 353 | ] 354 | }, 355 | "execution_count": 3, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "df_clean = pd.read_csv('../../data/ods_dump/clean_message.csv')\n", 362 | "df_clean.head(3)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "collapsed": true 370 | }, 371 | "outputs": [], 372 | "source": [] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": { 378 | "collapsed": true 379 | }, 380 | "outputs": [], 381 | "source": [] 382 | } 383 | ], 384 | "metadata": { 385 | "kernelspec": { 386 | "display_name": "Python 3", 387 | "language": "python", 388 | "name": "python3" 389 | }, 390 | "language_info": { 391 | "codemirror_mode": { 392 | "name": "ipython", 393 | "version": 3 394 | }, 395 | "file_extension": ".py", 396 | "mimetype": "text/x-python", 397 | "name": "python", 398 | "nbconvert_exporter": "python", 399 | "pygments_lexer": "ipython3", 400 | "version": "3.6.3" 401 | } 402 | }, 403 | "nbformat": 4, 404 | "nbformat_minor": 2 405 | } 406 | -------------------------------------------------------------------------------- /hackathon_2_march_2018/topic_modelling/02. vocabulary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "\n", 14 | "import gc\n", 15 | "\n", 16 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 17 | "import artm" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 34, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/html": [ 28 | "<div>\n", 29 | "<style>\n", 30 | " .dataframe thead tr:only-child th {\n", 31 | " text-align: right;\n", 32 | " }\n", 33 | "\n", 34 | " .dataframe thead th {\n", 35 | " text-align: left;\n", 36 | " }\n", 37 | "\n", 38 | " .dataframe tbody tr th {\n", 39 | " vertical-align: top;\n", 40 | " }\n", 41 | "</style>\n", 42 | "<table border=\"1\" class=\"dataframe\">\n", 43 | " <thead>\n", 44 | " <tr style=\"text-align: right;\">\n", 45 | " <th></th>\n", 46 | " <th>user</th>\n", 47 | " <th>text</th>\n", 48 | " <th>stem_text</th>\n", 49 | " </tr>\n", 50 | " </thead>\n", 51 | " <tbody>\n", 52 | " <tr>\n", 53 | " <th>0</th>\n", 54 | " <td>U1UMQM200</td>\n", 55 | " <td><@U1Z2QA4EM> как избавиться от рекурсии?</td>\n", 56 | " <td>u1z2qa4em избав рекурсии</td>\n", 57 | " </tr>\n", 58 | " <tr>\n", 59 | " <th>1</th>\n", 60 | " <td>U1Z2QA4EM</td>\n", 61 | " <td><@U1UMQM200>: избавиться от искушения - это ка...</td>\n", 62 | " <td>u1umqm200 избав искушен контрольн выстрел голов</td>\n", 63 | " </tr>\n", 64 | " <tr>\n", 65 | " <th>2</th>\n", 66 | " <td>U09JEC7V0</td>\n", 67 | " <td><@U1Z2QA4EM> в психотерапию умеешь?</td>\n", 68 | " <td>u1z2qa4em психотерап умеешь</td>\n", 69 | " </tr>\n", 70 | " </tbody>\n", 71 | "</table>\n", 72 | "</div>" 73 | ], 74 | "text/plain": [ 75 | " user text \\\n", 76 | "0 U1UMQM200 <@U1Z2QA4EM> как избавиться от рекурсии? \n", 77 | "1 U1Z2QA4EM <@U1UMQM200>: избавиться от искушения - это ка... \n", 78 | "2 U09JEC7V0 <@U1Z2QA4EM> в психотерапию умеешь? 
\n", 79 | "\n", 80 | " stem_text \n", 81 | "0 u1z2qa4em избав рекурсии \n", 82 | "1 u1umqm200 избав искушен контрольн выстрел голов \n", 83 | "2 u1z2qa4em психотерап умеешь " 84 | ] 85 | }, 86 | "execution_count": 34, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "df_clean = pd.read_csv('../../data/ods_dump/clean_message.csv', usecols=['user', 'text', 'stem_text'])\n", 93 | "df_clean.head(3)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 35, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "cv = TfidfVectorizer(max_features=10000, max_df=0.9, min_df=0.00001, ngram_range=(2,3), stop_words='english')" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 36, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "message = df_clean['stem_text'].fillna(' ') \n", 112 | "del(df_clean)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 37, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "cv.fit(message)\n", 122 | "n_wd = cv.transform(message)\n", 123 | "del(message)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 38, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "vocabulary = cv.get_feature_names()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 39, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "79" 144 | ] 145 | }, 146 | "execution_count": 39, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "gc.collect()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 40, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "n_wd = n_wd.todense()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 41, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "10000" 173 | ] 174 | }, 175 | "execution_count": 41, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "len(vocabulary)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 42, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "bv = artm.BatchVectorizer(data_format='bow_n_wd', batch_size=1000, \n", 212 | " n_wd=n_wd.T,\n", 213 | " vocabulary=vocabulary)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "collapsed": true 221 | }, 222 | "outputs": [], 223 | "source": [] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "collapsed": true 230 | }, 231 | "outputs": [], 232 | "source": [] 233 | } 234 | ], 235 | "metadata": { 236 | "kernelspec": { 237 | "display_name": "Python 3", 238 | "language": "python", 239 | "name": "python3" 240 | }, 241 | "language_info": { 242 | "codemirror_mode": { 243 | "name": "ipython", 244 | "version": 3 245 | }, 246 | "file_extension": ".py", 247 | "mimetype": "text/x-python", 
248 | "name": "python", 249 | "nbconvert_exporter": "python", 250 | "pygments_lexer": "ipython3", 251 | "version": "3.6.3" 252 | } 253 | }, 254 | "nbformat": 4, 255 | "nbformat_minor": 2 256 | } 257 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-answers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-answers.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-answers_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-answers_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-questions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-questions.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-questions_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-questions_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/2018-ods-top-users_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/2018-ods-top-users_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/README.md: -------------------------------------------------------------------------------- 1 | # ODS вопросы и ответы 2 | 3 | **Задача** - посчитать количество сообщений с вопросами по каналам и пользователей которые чаще всего отвечают на вопросы 4 | 5 | **ods-get-data.ipynb** - сбор данных из файлов в каталоге и подкаталогах экспорта slack. 6 | 7 | **ods-qa.ipynb** - получение информации по не пустым корневым сообщениям (thread_ts == ts или пустое поле text), выборка из них сообщений с вопросами по наличию в сообщении знака '?' или слов маркеров, группировка сообщений с вопросами по каналам, группировка по количеству ответов пользователей на вопросы исключая пользователей которые задали вопрос. 8 | 9 | **Разобраться в следующий раз:** много сообщений с пустыми thread_ts и ts (есть боты и пользователи) и текстом (с учетом того что при удалении в текст записывается "This message was deleted".) 
10 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-answers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-answers.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-answers_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-answers_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-check-export.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import glob\n", 10 | "import datetime\n", 11 | "import pandas as pd\n", 12 | "import os\n", 13 | "import re\n", 14 | "\n", 15 | "# https://stackoverflow.com/questions/36587211/easiest-way-to-read-csv-files-with-multiprocessing-in-pandas\n", 16 | "# http://python-3.ru/page/multiprocessing\n", 17 | "from multiprocessing import Pool" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# Создаю/обнуляю список файлов \n", 27 | "files_full_path_list = list()\n", 28 | "\n", 29 | "# Путь к корневому каталогу файлов\n", 30 | "files_path = '/opt/app/data/shared/latest_dump/*/*.json'\n", 31 | "# files_path = '/opt/app/data/shared/latest_dump/*/2018*.json'" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# Получаю перечень полных пути файлов в подкаталогах\n", 41 | "for file_name in glob.glob(files_path, recursive=True):\n", 42 | " # Добавляю полный путь в список\n", 43 | " files_full_path_list.append(file_name)\n", 44 | " \n", 45 | "files_full_path_list.sort()\n", 46 | "\n", 47 | "print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Количество файлов:', len(files_full_path_list))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "files_full_path_list[:10]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# os.path.getsize('/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# !ls -all /opt/app/data/shared/latest_dump/___top_links/2018-08-06.json" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# %%time\n", 84 | "\n", 85 | "# # CPU times: user 6min 38s, sys: 36.4 s, total: 7min 15s\n", 86 | "\n", 87 | "# files_df = pd.DataFrame()\n", 88 | "\n", 89 | "# for file in files_full_path_list:\n", 90 | "\n", 91 | "# file_size = os.path.getsize(file)\n", 92 | " \n", 93 | "# # re.findall('dump/(.+?)/\\d', '/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')[0]\n", 94 | "# file_cat = 
re.findall('dump/(.+?)/\\d', file)[0]\n", 95 | " \n", 96 | "# # re.findall('(\\d\\d\\d\\d-\\d\\d-\\d\\d)', '/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')[0]\n", 97 | "# file_date = re.findall('(\\d\\d\\d\\d-\\d\\d-\\d\\d)', file)[0]\n", 98 | " \n", 99 | "# list = [[file, file_size, file_cat, file_date]]\n", 100 | " \n", 101 | "# files_df = files_df.append(pd.DataFrame(list, columns=['file', 'file_size', 'file_cat', 'file_date']),ignore_index=True)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "files_df = pd.DataFrame()\n", 118 | "\n", 119 | "def get_file_info(file):\n", 120 | " file_size = os.path.getsize(file)\n", 121 | "\n", 122 | " # re.findall('dump/(.+?)/\\d', '/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')[0]\n", 123 | " file_cat = re.findall('dump/(.+?)/\\d', file)[0]\n", 124 | "\n", 125 | " # re.findall('(\\d\\d\\d\\d-\\d\\d-\\d\\d)', '/opt/app/data/shared/latest_dump/___top_links/2018-08-06.json')[0]\n", 126 | " file_date = re.findall('(\\d\\d\\d\\d-\\d\\d-\\d\\d)', file)[0]\n", 127 | "\n", 128 | " list = [[file, file_size, file_cat, file_date]]\n", 129 | " \n", 130 | " return pd.DataFrame(list, columns=['file', 'file_size', 'file_cat', 'file_date'])\n", 131 | "\n", 132 | "pool = Pool(processes=10)\n", 133 | "df_list = pool.map(get_file_info, files_full_path_list)\n", 134 | "\n", 135 | "files_df = pd.concat(df_list, ignore_index=True)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "files_df.tail()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "files_df.info()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "files_df['file_date'] = pd.to_datetime(files_df['file_date'], format='%Y-%m-%d')\n", 163 | "files_df['file_size'] = files_df['file_size'].round(0).astype(int)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "files_df[['file_cat']]\\\n", 173 | " .groupby(['file_cat'])['file_cat'] \\\n", 174 | " .count() \\\n", 175 | " .reset_index(name='count') \\\n", 176 | " .sort_values(['count'], ascending=False) \\\n", 177 | " .head(10)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "files_df[['file_cat', 'file_size']]\\\n", 187 | " .groupby(['file_cat'])['file_size'] \\\n", 188 | " .sum() \\\n", 189 | " .reset_index(name='sum_kilobytes') \\\n", 190 | " .sort_values(['sum_kilobytes'], ascending=False) \\\n", 191 | " .head(10)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 
218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "import os\n", 306 | "import pandas as pd \n", 307 | "from multiprocessing import Pool\n", 308 | "\n", 309 | "# wrap your csv importer in a function that can be mapped\n", 310 | "def read_csv(filename):\n", 311 | " 'converts a filename to a pandas dataframe'\n", 312 | " return pd.read_csv(filename)\n", 313 | "\n", 314 | "\n", 315 | "def main():\n", 316 | " # set up your pool\n", 317 | " pool = Pool(processes=8) # or whatever your hardware can support\n", 318 | "\n", 319 | " # get a list of file names\n", 320 | " files = os.listdir('.')\n", 321 | " file_list = [filename for filename in files if filename.split('.')[1]=='csv']\n", 322 | "\n", 323 | " # have your pool map the file names to dataframes\n", 324 | " df_list = pool.map(read_csv, file_list)\n", 325 | "\n", 326 | " # reduce the list of dataframes to a single dataframe\n", 327 | " combined_df = pd.concat(df_list, ignore_index=True)\n", 328 | "\n", 329 | "if __name__ == '__main__':\n", 330 | " main()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "from multiprocessing import Pool\n", 340 | "\n", 341 | "def doubler(number):\n", 342 | " return number * 2\n", 343 | " \n", 344 | "numbers = [5, 10, 20]\n", 345 | "pool = Pool(processes=3)\n", 346 | "print(pool.map(doubler, numbers))" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 
null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "# Сколько файлов по каталогам и какого размера каталоги?\n" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# Сколько служебных сообщений?\n", 498 | "user leave channel \n", 499 | "user enter channel" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "# Создаю пустой dataframe для данных их файлов\n", 530 | "json_df = pd.DataFrame()" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "%%time\n", 540 | "# Наполняю данными о сообщениях за 2018 год dataframe (1min 31s) без multiprocessing\n", 541 | "# Переделать на multiprocessing\n", 542 | "\n", 543 | "for file in 
files_full_path_list:\n", 544 | " # Читаю файлы в dataframe\n", 545 | " data_parsed = json.loads(open(file).read())\n", 546 | " df = json_normalize(data_parsed)\n", 547 | " # Добавляю имя файла в dataframe для дальнейшего получения даты и названия канал\n", 548 | " df.insert(loc=0, column='FILE', value=file)\n", 549 | "# print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Добавляю содержимое файла в dataframe', file, round(os.path.getsize(file_name)/1000/1000,2), 'мегабайт')\n", 550 | " json_df = json_df.append(df, ignore_index=True, sort=False)" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "# # Сохраняю dataframe в csv\n", 560 | "\n", 561 | "# csv_file_name = '2018_ods_raw_new.csv'\n", 562 | "# csv_file_dir = '/opt/app/data/'\n", 563 | "# csv_file_path = csv_file_dir + csv_file_name\n", 564 | "\n", 565 | "# # Проверка существует ли файл. Если существует удаляю\n", 566 | "# if os.path.exists(csv_file_path):\n", 567 | "# os.remove(csv_file_name)\n", 568 | " \n", 569 | "# print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Добавляю содержимое dataframe в csv', csv_file_path) \n", 570 | "# json_df.to_csv(csv_file_name, sep='|', index=False, encoding='utf-8')\n", 571 | "\n", 572 | "# print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), \\\n", 573 | "# 'Размер csv файла:', \\\n", 574 | "# round(os.path.getsize(csv_file_path)/1000/1000,3), \\\n", 575 | "# 'Мегабайт')" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [] 661 | } 662 | ], 663 | "metadata": { 664 | "kernelspec": { 665 | "display_name": "Python 3", 666 | "language": "python", 667 | "name": "python3" 668 | }, 669 | "language_info": { 670 | "codemirror_mode": { 671 | "name": "ipython", 672 | "version": 3 673 | }, 674 | "file_extension": ".py", 675 | "mimetype": "text/x-python", 676 | "name": "python", 677 | "nbconvert_exporter": 
"python", 678 | "pygments_lexer": "ipython3", 679 | "version": "3.7.0" 680 | } 681 | }, 682 | "nbformat": 4, 683 | "nbformat_minor": 2 684 | } 685 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-get-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import glob\n", 10 | "import datetime\n", 11 | "import pandas as pd\n", 12 | "import json\n", 13 | "from pandas.io.json import json_normalize\n", 14 | "\n", 15 | "# https://stackoverflow.com/questions/36587211/easiest-way-to-read-csv-files-with-multiprocessing-in-pandas\n", 16 | "# http://python-3.ru/page/multiprocessing\n", 17 | "from multiprocessing import Pool\n", 18 | "\n", 19 | "import os" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# Создаю/обнуляю список файлов \n", 29 | "files_full_path_list = list()\n", 30 | "\n", 31 | "# Путь к корневому каталогу файлов\n", 32 | "files_path = '/opt/app/data/shared/latest_dump/*/*.json'\n", 33 | "# files_path = '/opt/app/data/shared/latest_dump/*/2018*.json'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Получаю перечень полных пути файлов в подкаталогах\n", 43 | "for file_name in glob.glob(files_path, recursive=True):\n", 44 | " # Добавляю полный путь в список\n", 45 | " files_full_path_list.append(file_name)\n", 46 | " \n", 47 | "files_full_path_list.sort()\n", 48 | "\n", 49 | "print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Количество файлов:', len(files_full_path_list))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "files_full_path_list[:10]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "json_df = pd.DataFrame()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# Создаю результирующий dataframe из json файлов с помощью multiprocessing\n", 77 | "\n", 78 | "def get_file_data(file):\n", 79 | " data_parsed = json.loads(open(file).read())\n", 80 | " df = json_normalize(data_parsed)\n", 81 | " df.insert(loc=0, column='file', value=file)\n", 82 | " return df\n", 83 | "\n", 84 | "pool = Pool(processes=10)\n", 85 | "df_list = pool.map(get_file_data, files_full_path_list)\n", 86 | "\n", 87 | "json_df = pd.concat(df_list, ignore_index=True, sort=True)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# Сохраняю dataframe в csv\n", 97 | "\n", 98 | "csv_file_name = 'ods_data.csv'\n", 99 | "csv_file_dir = './'\n", 100 | "csv_file_path = csv_file_dir + csv_file_name\n", 101 | "\n", 102 | "# Проверка существует ли файл. 
Если существует удаляю\n", 103 | "if os.path.exists(csv_file_path):\n", 104 | " os.remove(csv_file_name)\n", 105 | "\n", 106 | "json_df.to_csv(csv_file_name, sep='|', index=False, encoding='utf-8')\n", 107 | "\n", 108 | "print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), \\\n", 109 | " 'Размер csv файла:', \\\n", 110 | " round(os.path.getsize(csv_file_path)/(1000*1000.0),2), \\\n", 111 | " 'Мегабайт')" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.7.0" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-qa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Количество вопросов по каналам\n", 8 | "### Количество ответов пользователя по каналам\n", 9 | "### Стата по лайкам за год:\n", 10 | "- каких лайков сколько \n", 11 | "- самые залайканные посты в открытых каналах (включая максимум :parrot: , :pepe_sad: , :catshake: , :ods: , ...) " 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Импорт библиотек" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import glob\n", 28 | "import pandas as pd\n", 29 | "import json\n", 30 | "from pandas.io.json import json_normalize\n", 31 | "\n", 32 | "from datetime import datetime\n", 33 | "\n", 34 | "import os\n", 35 | "\n", 36 | "import re\n", 37 | "\n", 38 | "import pymorphy2\n", 39 | "morph = pymorphy2.MorphAnalyzer()\n", 40 | "\n", 41 | "%matplotlib inline\n", 42 | "import matplotlib.pyplot as plt" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "print ('Размер файла', round(os.path.getsize('ods_data.csv')/1000/1000.0,2), 'мегабайт')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df = pd.read_csv('ods_data.csv', sep='|', encoding='utf-8', dtype=str)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# tp = pd.read_csv('2018_ods_raw.csv', sep='|', encoding='utf-8', dtype=str, iterator=True, chunksize=1000)\n", 70 | "# print (tp)\n", 71 | "# #<pandas.io.parsers.TextFileReader object at 0x00000000150E0048>\n", 72 | "# df = pd.concat(tp, ignore_index=True)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# df.info() " 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "len(df)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | 
"execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Все столбцы таблицы\n", 100 | "# list(json_df)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# df.tail()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Добавляю категорию для сообщений\n", 119 | "df['cat'] = df['file']\n", 120 | "df['cat'] = df['cat'].str.replace('/opt/app/data/shared/latest_dump/', '')\n", 121 | "df['cat'] = df['cat'].str.replace('\\/.*','').str.strip()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "df['thread_ts'] = pd.to_datetime(df['thread_ts'], unit='s')\n", 131 | "df['ts'] = pd.to_datetime(df['ts'], unit='s')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "# Информация по сообщениям\n", 139 | "\n", 140 | "##### Количество сообщений всего/2018: 1 089 398 / 374 038\n", 141 | "##### Количество родительских сообщений всего/2018: 51724 / 26345\n", 142 | "##### Количество не пустых родительских сообщений всего/2018: 51447 / 26142\n", 143 | "##### Родительское сообщений набравшее больше всего ответов в 2018 и вообще (1183 шт.) https://opendatascience.slack.com/archives/C0SGCGB52/p1537287302000100" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Количество ответов на родительские сообщения \n", 153 | "df.groupby(['thread_ts'])['thread_ts'].agg('count').sort_values(ascending=False).head()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# https://opendatascience.slack.com/archives/C91N8TL83/p1542103865495600\n", 163 | "# df.loc[(df['thread_ts'] == '2018-11-13 10:11:05.495599985') & (df['thread_ts'] == df['ts'])]\n", 164 | "# https://opendatascience.slack.com/archives/C91N8TL83/p1542103865495600\n", 165 | "# df.loc[(df['thread_ts'] == '2018-11-14 09:55:01.799499989') & (df['thread_ts'] == df['ts'])]\n", 166 | "df['text'].loc[(df['thread_ts'] == '2018-09-18 16:15:02.000099897') & (df['thread_ts'] == df['ts'])]" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "len(df[df.thread_ts.isnull()])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# Родительские сообщения\n", 185 | "# Возможно неправильно!?!?!!?\n", 186 | "# df_parent = df[df.thread_ts == df.ts] # 2801\n", 187 | "# df_parent = df[df.thread_ts.isnull()] # 26652 из них _random_b 3491 не нашел как связать с ответами\n", 188 | "# df_parent = df[(df.thread_ts.isnull()) | (df.thread_ts == df.ts)]\n", 189 | "\n", 190 | "df_parent = df.loc[(df['thread_ts'] == df['ts'])]\n", 191 | "\n", 192 | "print ('Количество сообщений:', len(df))\n", 193 | "print ('Количество родительских сообщений:', len(df_parent))\n", 194 | "print ('Среднее количество ответов на родительское сообщение:', round(len(df)/len(df_parent),2))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "# Самые 
популярные родительские сообщения по количеству ответов\n", 204 | "# reply_count больше 200 почему то не бывает хотя есть сообщения с большим количеством коментариев\n", 205 | "\n", 206 | "# df_parent[['thread_ts', 'cat', 'text', 'reply_count']].sort_values(['reply_count'], ascending=False).head()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# # df_a.groupby(['cat', 'user']).size().head(10)\n", 216 | "# df_x = df.loc[df['thread_ts'].isin(df_parent['thread_ts'])]\n", 217 | "# # df_x[['thread_ts', 'ts', 'cat', 'text', 'reply_count']]\n", 218 | "# df_x.groupby(['thread_ts'])['thread_ts'].agg('count').sort_values(ascending=False).head()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# Удаляю сообщения с пустым text\n", 228 | "# Почему такое бывает не разобрался\n", 229 | "df_parent = df_parent.dropna(subset=['text'])\n", 230 | "print ('Количество не пустых родительских сообщений:', len(df_parent))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# Родительских сообщений по каналам\n", 240 | "df_parent[['cat','thread_ts']]\\\n", 241 | " .groupby(['cat'])['thread_ts'] \\\n", 242 | " .count() \\\n", 243 | " .reset_index(name='count') \\\n", 244 | " .sort_values(['count'], ascending=False) \\\n", 245 | " .head(10)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "# Информация по заданным вопросам\n", 253 | "##### Количество родительских сообщений с вопросами 2018: 17851 из 26142\n", 254 | "##### Количество родительских сообщений с вопросами всего: 33011 из 51447" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# Добавляю столбец текстом сообщений в номальной форме для того что бы потом искать сообщения с вопросами\n", 264 | "df_parent['morph_text'] = df_parent['text']" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "def to_norm_form(data, column):\n", 274 | " full_words_list = []\n", 275 | " words_row_list = data[column].tolist()\n", 276 | " # Каждую строчку в переданном столбце\n", 277 | " for i in range(len(words_row_list)):\n", 278 | " # Получаю список слов\n", 279 | " words_list = re.sub(\"[^\\w]\", \" \", words_row_list[i]).split()\n", 280 | " # Каждое слово из строки\n", 281 | " norm_words_list = []\n", 282 | " for word in words_list:\n", 283 | " norm_word = morph.parse(word)[0].normal_form\n", 284 | " norm_words_list.append(norm_word)\n", 285 | " \n", 286 | " full_words_string = ' '.join(norm_words_list)\n", 287 | " full_words_list.append(full_words_string)\n", 288 | " \n", 289 | " return full_words_list" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "%%time\n", 299 | "# Запонляю столбец morph_text текстом сообщений в номальной форме для того что бы потом искать сообщения с вопросами\n", 300 | "df_parent['morph_text'] = to_norm_form(df_parent, 'morph_text')" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "df_parent[['text', 
'morph_text']].head()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "# Добавляю признак что в тексте был знак вопроса\n", 319 | "df_parent['found_question_mark'] = df_parent['text'].str.contains('\\?')" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "df_parent[['text', 'morph_text', 'found_question_mark']].tail()" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "# morph.parse('зачем')[0].normal_form" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# Количество родительских сообщений с вопросами\n", 347 | "# QUESTION_WORD_LEMMAS = (\"как\", \"как-то\", \"какой\", \"какой-то\", \"зачем\", \"почему\", \"когда\", \"кто\", \"где\", \"когда\", \"куда\", \"куда-то\", \"чот\")\n", 348 | "#QUESTION_WORDS = ('вопрос', 'обьяснит', 'подсказать', 'посоветовать', 'как') # Количество родительских сообщений с вопросами: 1828\n", 349 | "QUESTION_WORDS = ('вопрос', 'обьяснит', 'подсказать', 'посоветовать', 'как', 'почему', 'зачем')\n", 350 | "\n", 351 | "df_q = df_parent.loc[(df_parent['morph_text'].str.contains('|'.join(QUESTION_WORDS))) | (df_parent.found_question_mark == True)].reset_index()" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "print ('Количество родительских сообщений:', len(df_parent))\n", 361 | "print ('Количество родительских сообщений с вопросами:', len(df_q))" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "# Вопросов в родительских сообщениях по каналам\n", 371 | "df_q[['cat','thread_ts']]\\\n", 372 | " .groupby(['cat'])['thread_ts'] \\\n", 373 | " .count() \\\n", 374 | " .reset_index(name='count') \\\n", 375 | " .sort_values(['count'], ascending=False) \\\n", 376 | " .head(10)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "# Вопросов в родительских сообщениях по каналам\n", 386 | "\n", 387 | "# df_q = df_q[df_q.cat != '_random_b'] # 5287\n", 388 | "# df_q = df_q[df_q.cat != 'stack_overflow'] # 456\n", 389 | "\n", 390 | "# df_q.groupby(['cat'])['thread_ts'].agg('count').sort_values(ascending=False).head()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "plt.rcParams[\"figure.figsize\"] = (16, 9)\n", 400 | "\n", 401 | "df_user_questions = df_q.groupby(['cat'])['thread_ts'].agg('count').sort_values(ascending=False)\n", 402 | "df_user_questions.head(10).plot.bar()\n", 403 | "\n", 404 | "plt.title('Количество вопросов по каналам за 2018 год (шт.)', loc='center')\n", 405 | "\n", 406 | "plt.xlabel('Канал')\n", 407 | "plt.ylabel('Количество')\n", 408 | "\n", 409 | "plt.savefig('ods-questions.png', bbox_inches = 'tight')\n", 410 | "\n", 411 | "# plt.savefig('2018-ods-questions.svg', format='svg')\n", 412 | "# plt.savefig('2018-ods-questions.png', bbox_inches = 'tight', dpi=600)\n", 413 | "# I used 1200 dpi because a lot of scientific journals require images in 1200 / 600 / 300 dpi 
depending on what the image is of" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "# Информация по ответам на вопросы" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "# # reply_count показывает странные цифры\n", 430 | "\n", 431 | "# # Ответов на сообщения вопросы по полю reply_count\n", 432 | "# # df_q[['cat', 'thread_ts', 'text', 'morph_text', 'reply_count']].sort_values('reply_count', ascending=False).head()\n", 433 | "# df_q[['cat', 'thread_ts', 'reply_count']].sort_values('reply_count', ascending=False).head()" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "# df_q.loc[(df_q['thread_ts'] == '2018-11-01 18:36:36.419199944')]" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "# df.loc[df['thread_ts'].isin(df_q['thread_ts'])].tail()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "# Ответов по вхождению сообщений пользователей в сообщения вопросы\n", 461 | "\n", 462 | "# наверно будет хорошей идеей отфильтровать из ответов на вопросы пользователей которые задали родительский вопрос\n", 463 | "\n", 464 | "# df_a = df.loc[df['thread_ts'].isin(df_q['thread_ts'])]\n", 465 | "# df_a[['cat','thread_ts', 'user']]\\\n", 466 | "# .groupby(['cat', 'thread_ts', 'user'])['thread_ts'] \\\n", 467 | "# .count() \\\n", 468 | "# .reset_index(name='count') \\\n", 469 | "# .sort_values(['count'], ascending=False) \\\n", 470 | "# .head()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "# len(df_a)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# df_q.loc[(df_q['thread_ts'] == '2018-11-16 08:59:45.085799932')]" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# наверно будет хорошей идеей отфильтровать из ответов на вопросы пользователей которые задали родительский вопрос\n", 498 | "# df_a = df.loc[df['thread_ts'].isin(df_q['thread_ts']) & ~df['user'].isin(df_q['user'])]\n", 499 | "\n", 500 | "# Ответов по вхождению сообщений пользователей в сообщения вопросы\n", 501 | "df_a = df.loc[df['thread_ts'].isin(df_q['thread_ts'])]\n", 502 | "\n", 503 | "df_a[['cat']]\\\n", 504 | " .groupby(['cat'])['cat'] \\\n", 505 | " .count() \\\n", 506 | " .reset_index(name='count') \\\n", 507 | " .sort_values(['count'], ascending=False) \\\n", 508 | " .head(10)" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "len(df_a)" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "# df_q.tail()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "plt.rcParams[\"figure.figsize\"] = (16, 9)\n", 536 | "\n", 537 | "df_user_answers = 
df_a.groupby(['cat'])['thread_ts'].agg('count').sort_values(ascending=False)\n", 538 | "df_user_answers.head(10).plot.bar()\n", 539 | "\n", 540 | "plt.title('Количество ответов по каналам за 2018 год (шт.)', loc='center')\n", 541 | "\n", 542 | "plt.xlabel('Канал')\n", 543 | "plt.ylabel('Количество')\n", 544 | "\n", 545 | "plt.savefig('ods-answers.png', bbox_inches = 'tight')\n", 546 | "\n", 547 | "# plt.savefig('2018-ods-questions.svg', format='svg')\n", 548 | "# plt.savefig('2018-ods-answers.png', bbox_inches = 'tight', dpi=600)\n", 549 | "# I used 1200 dpi because a lot of scientific journals require images in 1200 / 600 / 300 dpi depending on what the image is of" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "users_list = json.loads(open('/opt/app/data/shared/latest_dump/users.json').read())\n", 559 | "users_df = json_normalize(users_list)\n", 560 | "\n", 561 | "# users_df.info()\n", 562 | "# users_df[['id', 'name']].head()" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "df_a = pd.merge(df_a, users_df, how='left', left_on=['user'], right_on = ['id'])" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "df_a[['user', 'name']].head()" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "# Топ пользователей по ответам на вопросы\n", 590 | "df_a.groupby(['user', 'name'])['user'].agg('count').reset_index(name='count').sort_values(['count'], ascending=False).head(10)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [] 634 | } 635 | ], 636 | "metadata": { 637 | "kernelspec": { 638 | "display_name": "Python 3", 639 | "language": "python", 640 | "name": "python3" 641 | }, 642 | "language_info": { 643 | "codemirror_mode": { 644 | "name": "ipython", 645 | "version": 3 646 | }, 647 | "file_extension": ".py", 648 | "mimetype": "text/x-python", 649 | "name": "python", 650 | "nbconvert_exporter": "python", 651 | "pygments_lexer": "ipython3", 652 | "version": "3.7.0" 653 | } 654 | }, 655 | "nbformat": 4, 656 | "nbformat_minor": 2 657 | } 658 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-questions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-questions.png 
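ods-qa.ipynb above leaves "exclude the users who asked the question" as a commented-out filter (`~df['user'].isin(df_q['user'])`), which would also drop replies from anyone who has ever asked a question in any thread. Below is a hypothetical per-thread refinement, not code from the repository; it assumes the notebook's `df` (all messages) and `df_q` (question root messages) frames with `thread_ts` and `user` columns.

```python
import pandas as pd

def top_answerers(df: pd.DataFrame, df_q: pd.DataFrame, n: int = 10) -> pd.Series:
    """Count answers per user in question threads, excluding each thread's own asker (sketch).

    df   - all messages, with 'thread_ts' and 'user' columns
    df_q - question root messages, as built in ods-qa.ipynb
    """
    # Who asked each question thread.
    askers = df_q[['thread_ts', 'user']].rename(columns={'user': 'asker'})
    # Every message belonging to a question thread, with that thread's asker attached.
    replies = df[df['thread_ts'].isin(df_q['thread_ts'])].merge(askers, on='thread_ts', how='left')
    # Keep only messages written by someone other than the asker
    # (this also drops the question message itself, since its author is the asker).
    answers = replies[replies['user'] != replies['asker']]
    return answers.groupby('user').size().sort_values(ascending=False).head(n)
```

Called as `top_answerers(df, df_q)`, this could stand in for the plain `df_a.groupby(['user', 'name'])` count at the end of the notebook: a user's replies in other people's threads are kept, while their follow-ups in their own thread are discarded, which matches the intent stated in the README.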
-------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-questions_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-questions_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/dv_qa/ods-top-users_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-data-science/Introspect_hackathon/8e5c65933660716e44f0ef7852d5a48324710ebf/hackathon_3_december_2018/dv_qa/ods-top-users_tab.png -------------------------------------------------------------------------------- /hackathon_3_december_2018/folium_map/parse_geoservice_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "\n", 12 | "from geopy.geocoders import Nominatim, Yandex\n", 13 | "from geopy.exc import GeocoderServiceError, GeocoderTimedOut" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "def update_locations_geodata(geolocator, unique_locations, locations_geodata, bad_locations):\n", 23 | " for city in unique_locations:\n", 24 | " if (city in locations_geodata) or (city in bad_locations):\n", 25 | " continue\n", 26 | "\n", 27 | " try:\n", 28 | " location = geolocator.geocode(city)\n", 29 | " except GeocoderServiceError as e:\n", 30 | " print('GeocoderServiceError: {}'.format(e))\n", 31 | " break\n", 32 | " except GeocoderTimedOut as e:\n", 33 | " print('GeocoderTimedOut: {}'.format(e))\n", 34 | " break\n", 35 | "\n", 36 | " if location is None:\n", 37 | " bad_locations.append(city)\n", 38 | " else:\n", 39 | " locations_geodata[city] = location" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "<div>\n", 51 | "<style scoped>\n", 52 | " .dataframe tbody tr th:only-of-type {\n", 53 | " vertical-align: middle;\n", 54 | " }\n", 55 | "\n", 56 | " .dataframe tbody tr th {\n", 57 | " vertical-align: top;\n", 58 | " }\n", 59 | "\n", 60 | " .dataframe thead th {\n", 61 | " text-align: right;\n", 62 | " }\n", 63 | "</style>\n", 64 | "<table border=\"1\" class=\"dataframe\">\n", 65 | " <thead>\n", 66 | " <tr style=\"text-align: right;\">\n", 67 | " <th></th>\n", 68 | " <th>id</th>\n", 69 | " <th>city</th>\n", 70 | " </tr>\n", 71 | " </thead>\n", 72 | " <tbody>\n", 73 | " <tr>\n", 74 | " <th>0</th>\n", 75 | " <td>UE7T3UC1M</td>\n", 76 | " <td>Москва</td>\n", 77 | " </tr>\n", 78 | " <tr>\n", 79 | " <th>1</th>\n", 80 | " <td>UE61U6DCL</td>\n", 81 | " <td>Москва</td>\n", 82 | " </tr>\n", 83 | " <tr>\n", 84 | " <th>2</th>\n", 85 | " <td>UEF068197</td>\n", 86 | " <td>Moscow</td>\n", 87 | " </tr>\n", 88 | " <tr>\n", 89 | " <th>3</th>\n", 90 | " <td>UE7JRC006</td>\n", 91 | " <td>Краснодар</td>\n", 92 | " </tr>\n", 93 | " <tr>\n", 94 | " <th>4</th>\n", 95 | " <td>UE7M36F7Y</td>\n", 96 | " <td>Samara</td>\n", 97 | " </tr>\n", 98 | " </tbody>\n", 99 | "</table>\n", 100 | "</div>" 101 | ], 102 | "text/plain": [ 103 | " id city\n", 104 | "0 UE7T3UC1M 
Москва\n", 105 | "1 UE61U6DCL Москва\n", 106 | "2 UEF068197 Moscow\n", 107 | "3 UE7JRC006 Краснодар\n", 108 | "4 UE7M36F7Y Samara" 109 | ] 110 | }, 111 | "execution_count": 3, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "user_locations = pd.read_csv('./../user_id_to_from.csv')\n", 118 | "user_locations.rename({'from': 'city'}, axis=1, inplace=True)\n", 119 | "user_locations.head()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "(2101,)" 131 | ] 132 | }, 133 | "execution_count": 4, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "user_locations['city'].unique().shape" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "locations_geodata = dict()\n", 149 | "bad_locations = []" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 44, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "locations_geodata: 2041, bad_locations: 60\n", 162 | "complete flag: True\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "# execute until get all data (remember to geolocator limits!)\n", 168 | "\n", 169 | "#geolocator = Nominatim(user_agent='aborisihin')\n", 170 | "geolocator = Yandex()\n", 171 | "\n", 172 | "update_locations_geodata(geolocator, user_locations['city'].unique(), locations_geodata, bad_locations)\n", 173 | "\n", 174 | "print('locations_geodata: {}, bad_locations: {}'.format(len(locations_geodata), len(bad_locations)))\n", 175 | "\n", 176 | "complete_flag = (len(locations_geodata) + len(bad_locations) == len(user_locations['city'].unique()))\n", 177 | "print('complete flag: {}'.format(complete_flag))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 32, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "def geodata_value_osm(location_string, key):\n", 187 | " if location_string in locations_geodata:\n", 188 | " return locations_geodata[location_string].raw[key]\n", 189 | " else:\n", 190 | " return None\n", 191 | " \n", 192 | "def geodata_value_yandex(location_string, key):\n", 193 | " if location_string in locations_geodata:\n", 194 | " if (key == 'text') or (key == 'kind'):\n", 195 | " return locations_geodata[location_string].raw['metaDataProperty']['GeocoderMetaData'][key]\n", 196 | " elif key == 'lat':\n", 197 | " return locations_geodata[location_string].raw['Point']['pos'].split(' ')[0]\n", 198 | " elif key == 'lon':\n", 199 | " return locations_geodata[location_string].raw['Point']['pos'].split(' ')[1]\n", 200 | " \n", 201 | " return None" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 33, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# OpenStreetMaps\n", 211 | "# user_locations['geolocation_name'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'display_name'))\n", 212 | "# user_locations['geolocation_type'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'type'))\n", 213 | "# user_locations['geolocation_lat'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'lat'))\n", 214 | "# user_locations['geolocation_lon'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'lon'))\n", 215 | "\n", 216 | "# Yandex\n", 217 | 
"user_locations['geolocation_name'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'text'))\n", 218 | "user_locations['geolocation_type'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'kind'))\n", 219 | "user_locations['geolocation_lat'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'lat'))\n", 220 | "user_locations['geolocation_lon'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'lon'))" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 39, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/html": [ 231 | "<div>\n", 232 | "<style scoped>\n", 233 | " .dataframe tbody tr th:only-of-type {\n", 234 | " vertical-align: middle;\n", 235 | " }\n", 236 | "\n", 237 | " .dataframe tbody tr th {\n", 238 | " vertical-align: top;\n", 239 | " }\n", 240 | "\n", 241 | " .dataframe thead th {\n", 242 | " text-align: right;\n", 243 | " }\n", 244 | "</style>\n", 245 | "<table border=\"1\" class=\"dataframe\">\n", 246 | " <thead>\n", 247 | " <tr style=\"text-align: right;\">\n", 248 | " <th></th>\n", 249 | " <th>id</th>\n", 250 | " <th>city</th>\n", 251 | " <th>geolocation_name</th>\n", 252 | " <th>geolocation_type</th>\n", 253 | " <th>geolocation_lat</th>\n", 254 | " <th>geolocation_lon</th>\n", 255 | " </tr>\n", 256 | " </thead>\n", 257 | " <tbody>\n", 258 | " <tr>\n", 259 | " <th>0</th>\n", 260 | " <td>UE7T3UC1M</td>\n", 261 | " <td>Москва</td>\n", 262 | " <td>Россия, Москва</td>\n", 263 | " <td>province</td>\n", 264 | " <td>37.622504</td>\n", 265 | " <td>55.753215</td>\n", 266 | " </tr>\n", 267 | " <tr>\n", 268 | " <th>1</th>\n", 269 | " <td>UE61U6DCL</td>\n", 270 | " <td>Москва</td>\n", 271 | " <td>Россия, Москва</td>\n", 272 | " <td>province</td>\n", 273 | " <td>37.622504</td>\n", 274 | " <td>55.753215</td>\n", 275 | " </tr>\n", 276 | " <tr>\n", 277 | " <th>2</th>\n", 278 | " <td>UEF068197</td>\n", 279 | " <td>Moscow</td>\n", 280 | " <td>Россия, Москва</td>\n", 281 | " <td>locality</td>\n", 282 | " <td>37.617635</td>\n", 283 | " <td>55.755814</td>\n", 284 | " </tr>\n", 285 | " <tr>\n", 286 | " <th>3</th>\n", 287 | " <td>UE7JRC006</td>\n", 288 | " <td>Краснодар</td>\n", 289 | " <td>Россия, Краснодар</td>\n", 290 | " <td>locality</td>\n", 291 | " <td>38.975313</td>\n", 292 | " <td>45.03547</td>\n", 293 | " </tr>\n", 294 | " <tr>\n", 295 | " <th>4</th>\n", 296 | " <td>UE7M36F7Y</td>\n", 297 | " <td>Samara</td>\n", 298 | " <td>Россия, Самара</td>\n", 299 | " <td>locality</td>\n", 300 | " <td>50.101783</td>\n", 301 | " <td>53.195538</td>\n", 302 | " </tr>\n", 303 | " </tbody>\n", 304 | "</table>\n", 305 | "</div>" 306 | ], 307 | "text/plain": [ 308 | " id city geolocation_name geolocation_type geolocation_lat \\\n", 309 | "0 UE7T3UC1M Москва Россия, Москва province 37.622504 \n", 310 | "1 UE61U6DCL Москва Россия, Москва province 37.622504 \n", 311 | "2 UEF068197 Moscow Россия, Москва locality 37.617635 \n", 312 | "3 UE7JRC006 Краснодар Россия, Краснодар locality 38.975313 \n", 313 | "4 UE7M36F7Y Samara Россия, Самара locality 50.101783 \n", 314 | "\n", 315 | " geolocation_lon \n", 316 | "0 55.753215 \n", 317 | "1 55.753215 \n", 318 | "2 55.755814 \n", 319 | "3 45.03547 \n", 320 | "4 53.195538 " 321 | ] 322 | }, 323 | "execution_count": 39, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "user_locations.head()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 42, 335 | "metadata": {}, 336 | "outputs": 
[], 337 | "source": [ 338 | "#CSV_PATH = './user_locations_osm.csv'\n", 339 | "CSV_PATH = './user_locations_yandex.csv'\n", 340 | "\n", 341 | "user_locations.to_csv(CSV_PATH, index=False)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [] 350 | } 351 | ], 352 | "metadata": { 353 | "kernelspec": { 354 | "display_name": "Python 3", 355 | "language": "python", 356 | "name": "python3" 357 | }, 358 | "language_info": { 359 | "codemirror_mode": { 360 | "name": "ipython", 361 | "version": 3 362 | }, 363 | "file_extension": ".py", 364 | "mimetype": "text/x-python", 365 | "name": "python", 366 | "nbconvert_exporter": "python", 367 | "pygments_lexer": "ipython3", 368 | "version": "3.7.0" 369 | } 370 | }, 371 | "nbformat": 4, 372 | "nbformat_minor": 2 373 | } 374 | -------------------------------------------------------------------------------- /hackathon_3_december_2018/folium_map/user_geodata.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def get_username_from_dump(users_json, user_id): 9 | """ Get username from dump file by user id. 10 | 11 | Args: 12 | users_json: dict object with dump data 13 | user_id: id to find 14 | """ 15 | for user in users_json: 16 | if user.get('id', '') == user_id: 17 | return user.get('name', '') 18 | return '' 19 | 20 | 21 | def filter_coordinate(data_row, coordinate_field): 22 | """ Get correct coordinates from dataframe row. 23 | Filter data by 'geolocation_type' field. 24 | 25 | Args: 26 | data_row: data 27 | coordinate_field: coordinate field name 28 | """ 29 | correct_location_types = [ 30 | 'city', # OSM 31 | 'locality', # yandex 32 | 'province', # yandex 33 | 'area' # yandex 34 | ] 35 | 36 | if data_row['geolocation_type'] in correct_location_types: 37 | return np.float(data_row[coordinate_field]) 38 | else: 39 | return None 40 | 41 | 42 | def prepare_user_data(settings_filepath): 43 | """ Prepare user geodata csv file. 44 | Connect geodata from geolocator service with usernames and filer correct coordinates. 
45 | 46 | Args: 47 | settings_filepath: path to settings file 48 | """ 49 | print('open settings: {}'.format(settings_filepath)) 50 | with open(settings_filepath, 'r') as settings_file: 51 | settings = json.load(settings_file) 52 | 53 | with open(settings['users_dump_file'], 'r') as users_json_file: 54 | users_json = json.load(users_json_file) 55 | 56 | user_locations = pd.read_csv(settings['users_locations_file']) 57 | 58 | user_locations['user'] = user_locations['id'].apply(lambda x: get_username_from_dump(users_json, x)) 59 | user_locations['latitude'] = user_locations.apply(lambda x: filter_coordinate(x, 'geolocation_lat'), axis=1) 60 | user_locations['longitude'] = user_locations.apply(lambda x: filter_coordinate(x, 'geolocation_lon'), axis=1) 61 | 62 | user_locations.to_csv(settings['output_file'], index=False) 63 | 64 | 65 | if __name__ == '__main__': 66 | if len(sys.argv) >= 2: 67 | prepare_user_data(sys.argv[1]) 68 | else: 69 | print('settings file needed') -------------------------------------------------------------------------------- /hackathon_3_december_2018/folium_map/user_geodata_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "users_dump_file": "./../shared/latest_dump/users.json", 3 | "users_locations_file": "user_locations_osm.csv", 4 | "output_file": "filtered_user_locations.csv" 5 | } -------------------------------------------------------------------------------- /hackathon_3_december_2018/folium_map/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import folium 4 | from folium import plugins 5 | from folium import IFrame 6 | 7 | import json 8 | import time 9 | 10 | import matplotlib.pyplot as plt 11 | from plotly import tools 12 | import plotly.graph_objs as go 13 | import plotly 14 | import pandas as pd 15 | from plotly.offline import plot 16 | 17 | from geopy.geocoders import Nominatim 18 | import configparser 19 | 20 | config = configparser.ConfigParser() 21 | config.read('settings.ini') 22 | username = config['PLOTLY']['username'] 23 | api_key = config['PLOTLY']['api_key'] 24 | 25 | plotly.tools.set_credentials_file(username=username, api_key=api_key) 26 | geolocator = Nominatim(user_agent='artgor') 27 | 28 | 29 | def plot_top_channels(plot_type='matplotlib', top_n=20): 30 | """ 31 | Plot top channels by user count. 32 | 33 | Use channels.json to get channels and user count in them. 34 | Can plot either top N channels in matplotlib or in Plotly 35 | 36 | :params: plot_type - matplotlib/plotly 37 | :params: top_n - plot top n channels. If None - plot all 38 | """ 39 | # load file 40 | with open('shared/latest_dump/channels.json', 'r') as f: 41 | channels = json.load(f) 42 | 43 | # number of users in channels 44 | d = {i['name']: len(i['members']) for i in channels} 45 | 46 | # sort data and convert to pandas DF. 
set channel as index for plotting 47 |     sorted_d = sorted(d.items(), key = lambda x: x[1], reverse=True) 48 |     df = pd.DataFrame(sorted_d, columns=['channel', 'user_count']) 49 |     df = df.set_index('channel') 50 | 51 |     if top_n is None: 52 |         top_n = len(df) 53 | 54 |     if plot_type == 'matplotlib': 55 |         df[:top_n].sort_values('user_count').plot(kind='barh', figsize=(12, 8)); 56 |         plt.title(f'Топ-{top_n} каналов по количеству пользователей'); 57 |         plt.show() 58 | 59 |     elif plot_type == 'plotly': 60 |         data = [go.Bar( 61 |             x=df[:top_n].index, 62 |             y=df[:top_n]['user_count'], 63 |             name='user counts' 64 |         )] 65 |         layout = go.Layout() 66 |         fig = go.Figure(data=data, layout=layout) 67 |         plot(fig, filename='top_channels.html') 68 |     else: 69 |         raise ValueError('Possible values: matplotlib or plotly') 70 | 71 | def prepare_data_for_folium(return_df=False, save_df=True, df_name='user_geo'): 72 |     """ 73 |     Prepares data for folium in a naive way. 74 | 75 |     Uses information about user time zone to prepare data for using in folium. 76 | 77 |     :params: return_df - whether to return df 78 |     :params: save_df - whether to save df 79 |     :params: df_name - name of saved df 80 |     """ 81 |     with open('shared/latest_dump/users.json', 'r') as f: 82 |         users = json.load(f) 83 | 84 |     # mapping of users to timezone 85 |     user_tz = {i['name'] : i['tz'] if 'tz' in i.keys() else '' for i in users} 86 | 87 |     # unique tz 88 |     tzs = list(set(list(user_tz.values()))) 89 | 90 |     # cities (skip time zones without a region/city part) 91 |     city_list = sorted([i.split('/')[1] for i in tzs if '/' in i]) 92 | 93 |     # getting data from the API. There is a limit on the number of requests, 94 |     # so sleep is used 95 |     city_geo = {} 96 |     for i, c in enumerate(city_list): 97 |         if i % 30 == 0: 98 |             time.sleep(1) 99 | 100 |         location = geolocator.geocode(c) 101 |         city_geo[c] = (location.latitude, location.longitude) 102 | 103 |     # create DataFrame 104 |     u_c_df = pd.DataFrame.from_dict(user_tz, orient='index') 105 |     u_c_df.reset_index(inplace=True) 106 |     u_c_df.columns = ['user', 'tz'] 107 | 108 |     u_c_df['city'] = u_c_df['tz'].apply(lambda x: x.split('/')[1] if '/' in x else '') 109 | 110 |     # dropping empty rows 111 |     u_c_df = u_c_df.loc[u_c_df['city'] != ''] 112 |     u_c_df['latitude'] = u_c_df['city'].apply(lambda x: city_geo[x][0]) 113 |     u_c_df['longitude'] = u_c_df['city'].apply(lambda x: city_geo[x][1]) 114 |     u_c_df['user_count'] = u_c_df.groupby('city')['user'].transform('count') 115 | 116 |     if save_df: 117 |         u_c_df.to_csv(f'{df_name}.csv', index=False) 118 | 119 |     if return_df: 120 |         return u_c_df 121 | 122 | def make_plotly_map(u_c_df, plot_by='city', add_heatmap=True): 123 |     """ 124 |     Make folium map. 125 | 126 |     Makes folium map with heatmap. 127 |     Can be done by cities or geo data. 128 |     Text of markers is made with html, so it can be easily changed to show any information. 129 | 130 |     :params: u_c_df pandas DataFrame with data. Must have columns: user (display name), city, 131 |         latitude, longitude. 132 |     :params: plot_by - plotting by city is usually adequate, but plotting by geo can work better 133 |         when the data isn't completely clean. 134 |     :params: add_heatmap - whether to add heatmap. 
135 | 136 | """ 137 | m = folium.Map([], zoom_start=15) 138 | if add_heatmap: 139 | geo_matrix = u_c_df[['latitude', 'longitude']].values 140 | m.add_child(plugins.HeatMap(geo_matrix, radius=10, min_opacity=0.6, max_zoom=10, max_val=1, blur=10, gradient={0.4: 'blue', 0.65: 'lime', 1: 'crimson'})); 141 | 142 | marker_cluster = plugins.MarkerCluster().add_to(m) 143 | 144 | if plot_by == 'city': 145 | main_col = 'city' 146 | count_col = 'user_count_city' 147 | 148 | elif plot_by == 'geo': 149 | main_col = 'latitude' 150 | count_col = 'user_count_lat' 151 | else: 152 | raise ValueError('Possible values: city or geo') 153 | 154 | for c in u_c_df[main_col].unique(): 155 | # make list of first 5 people 156 | city_users = list(u_c_df.loc[u_c_df[main_col] == c, 'user'].values)[:5] 157 | #city_users = '\n'.join(city_users) 158 | 159 | # user count 160 | user_count = u_c_df.loc[u_c_df[main_col] == c, count_col].unique()[0] 161 | 162 | # city name 163 | city_name = c if main_col == 'city' else u_c_df.loc[u_c_df[main_col] == c, 'city'].unique()[0] 164 | 165 | # creating folium markers 166 | html=f""" 167 | <h2> Город: {city_name}</h2><br> 168 | Количество пользователей: {user_count}<br> 169 | Здесь живут такие люди:<br> 170 | """ 171 | for u in city_users: 172 | html += u + '<br>' 173 | 174 | iframe = IFrame(html=html, width=500, height=300) 175 | popup = folium.Popup(iframe, max_width=300) 176 | 177 | folium.Marker(location=[u_c_df.loc[u_c_df[main_col] == c, 'latitude'].unique()[0], 178 | u_c_df.loc[u_c_df[main_col] == c, 'longitude'].unique()[0]], 179 | popup=popup 180 | ).add_to(marker_cluster) 181 | 182 | return m --------------------------------------------------------------------------------
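End-to-end usage sketch (illustrative, not a file from the repository): the folium_map pieces above chain together — parse_geoservice_data.ipynb writes user_locations_osm.csv / user_locations_yandex.csv, user_geodata.py joins one of them with users.json and writes the output_file named in user_geodata_settings.json (filtered_user_locations.csv), and utils.make_plotly_map renders the folium map. The snippet below assumes that CSV, adds the user_count_city column that make_plotly_map reads when plot_by='city', and saves to a hypothetical big_map.html.

import pandas as pd

from utils import make_plotly_map

# CSV produced by user_geodata.py (see "output_file" in user_geodata_settings.json)
u_c_df = pd.read_csv('filtered_user_locations.csv')

# keep rows with usable coordinates (filter_coordinate leaves NaN for unknown location types)
u_c_df = u_c_df.dropna(subset=['latitude', 'longitude'])

# make_plotly_map(plot_by='city') takes the per-city count from 'user_count_city'
u_c_df['user_count_city'] = u_c_df.groupby('city')['user'].transform('count')

m = make_plotly_map(u_c_df, plot_by='city', add_heatmap=True)
m.save('big_map.html')  # folium.Map.save writes a standalone HTML page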