├── .gitignore
├── Makefile
├── Procfile
├── README.md
├── base_summarizer.py
├── compat.py
├── data
├── test-events-channel2.json
└── test-events-elastic.json
├── img
├── hackathon-discussion.png
└── meeting-discussion.png
├── interval_summarizer.py
├── lsa.py
├── main.py
├── requirements.txt
├── slack_summary.py
├── sp_summarizer.py
├── test-events.json
├── test_hypothesis_summarizer.py
├── test_service_components.py
├── test_spacy_with_hypothesis.py
├── test_summarizer.py
├── ts_config.py
├── ts_summarizer.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | config.py
2 | *.pyc
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Platform-dependent build: macOS uses a conda environment, Linux a virtualenv.
platform=$(shell uname -s)
conda_path=$(shell which conda)

.PHONY: show check-env venv check run hyp spacy_hyp

# Notebook settings are consumed by the Linux `notebook` target; keep them
# outside the platform conditional so they are defined on every platform.
HOST_IP?=10.0.0.10
NB_PORT?=8887

ifeq ($(platform),Darwin)

ifneq ($(findstring conda,$(conda_path)),conda)
$(error Conda not present)
else
# $(info ...) runs at parse time; a bare @echo line here is an orphaned
# recipe line and makes make abort before the first target.
$(info Conda present at $(conda_path))
endif

ifeq ($(SUMMARIZE_VENV),)
SUMMARIZE_VENV=summarize_venv2
endif
ifeq ($(CONDA_ENV_PATH),)
CONDA_ENV_PATH=//anaconda
endif

PYLIBS := numpy scipy scikit-learn gensim spacy flask
VENVDIR := $(CONDA_ENV_PATH)/envs/$(SUMMARIZE_VENV)

$(VENVDIR):
	test -d $(VENVDIR) || conda create -y -n $(SUMMARIZE_VENV) $(PYLIBS)

deps: $(VENVDIR)

check: $(VENVDIR)
	source activate $(SUMMARIZE_VENV);\
	python ./test_summarizer.py;\
	python ./test_service_components.py

run: $(VENVDIR)
	source activate $(SUMMARIZE_VENV);\
	python ./ts_summarizer.py

else ifeq ($(platform),Linux)

VENVDIR := ./venv
PYVENV := $(VENVDIR)/bin/python
NBVENV := $(VENVDIR)/bin/ipython
PIPVENV := $(VENVDIR)/bin/pip

clean:
	rm -r $(VENVDIR)

check: | $(VENVDIR)
	$(PYVENV) ./test_summarizer.py;\
	$(PYVENV) ./test_service_components.py

hyp: | $(VENVDIR)
	$(PYVENV) ./test_hypothesis_summarizer.py

spacy_hyp: | $(VENVDIR)
	$(PYVENV) ./test_spacy_with_hypothesis.py

run: | $(VENVDIR)
	$(PYVENV) ./ts_summarizer.py

notebook: | $(VENVDIR)
	$(NBVENV) notebook --ip=$(HOST_IP) --port=$(NB_PORT) --no-browser

$(VENVDIR):
	test -d $(VENVDIR) || (virtualenv $(VENVDIR);\
	$(PIPVENV) install -r ./requirements.txt;\
	$(PYVENV) -m spacy.en.download all)

else
# $(error ...) takes the message directly; "$(error, msg)" would emit a
# spurious leading comma in the diagnostic.
$(error Unknown platform)

endif

--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: python main.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Slack Summary
2 |
3 | Summarize it is a chat summarizer plugin for instant messaging applications. It summarizes the large content of chat logs which enables users to quickly understand the current context of the conversation. Currently Summarize it works on top of Slack as its plugin.
4 |
5 | The original relied on an HP cloud concepts extraction API. We've pulled that out to remove any reliance on 3rd-party APIs, and are going to improve the summarizations.
6 |
7 | ## Installing Summarize It plugin for your slack
8 |
9 | To install the summary package
10 |
11 | First, create a token for your team at `https://api.slack.com/web`
12 |
13 | pip install flask requests slacker wsgiref jupyter mock pbr spacy numpy
14 |
15 | Then run
16 |
17 | python -m spacy.en.download all
18 |
19 | Edit the `config.py` file so that it includes the lines
20 |
21 | keys = {
22 | "slack": "your-token-here"
23 | }
24 |
25 | Then edit `ts_config.py` file to adjust the debugging options
26 |
27 | SUMMARY_INTERVALS = [{'minutes': 10, 'size': 1}, {'hours':12, 'size': 2}]
28 | TS_DEBUG = True
29 | TS_LOG = "./ts_summ.log"
30 | DEBUG=True
31 | LOG_FILE="./summary.log"
32 |
33 | Here `LOG_FILE` names the file where notices of users accessing the server are
34 | stored, and the value of `DEBUG` determines whether detailed logging is enabled.
35 |
36 | The plugin is executed by running
37 |
38 | python main.py
39 |
40 |
41 | Tests are currently set up to run in a python `virtualenv`. These will be executed by
42 | running
43 |
44 | make check
45 |
46 | but realize that the test will install and run in a virtualenv
47 |
48 |
49 | To complete the installation
50 |
51 | 1. Visit `https://<your-team>.slack.com/services/new/slash-commands`
52 |
53 | 2. Enter the command name you wish to use
54 |
55 | 3. Enter the request url as `/slack`
56 |
57 | ## Using Summarize It plugin with slack
58 |
59 | Let's assume that the plugin is named summary. The plugin supports a small
60 | command line syntax which allows specification of the previous window of time to look
61 | back. Currently this can be specified in `minutes, days, or weeks`. Keyword search is
62 | coming soon.
63 |
64 | So to get the key messages from the last 5 days:
65 |
66 | /summary 5 days
67 |
68 | Or to get a summary of the important messages over the last two weeks
69 |
70 | /summary 2 weeks
71 |
72 |
73 | ## Screenshots
74 |
75 | #### Hackathon Discussion
76 | 
77 |
78 | #### Meeting Discussion
79 | 
80 |
81 | ## Authors and Contributors
82 | Yask Srivastava (Developer), [Ketan Bhatt](https://github.com/ketanbhatt) (Developer), [Pranu Sarna](https://github.com/psarna94) (Developer) and [Vinayak Mehta](https://github.com/vortex-ape) (Data Scientist).
83 |
84 | ## Support or Contact
85 | Having trouble with summarize it? Create an issue in the project's GitHub repository.
86 |
87 |
--------------------------------------------------------------------------------
/base_summarizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division, print_function, unicode_literals
5 |
6 | from collections import namedtuple
7 | from operator import attrgetter
8 | from utils import ItemsCount
9 | import logging
10 | logging.basicConfig(level=logging.INFO)
11 |
# Container pairing a sentence with its original position and computed rating.
SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rating"))


class BaseSummarizer(object):
    """Abstract base class for summarizers.

    Subclasses implement ``__call__`` to produce a summary of a document.
    This base provides shared helpers for normalizing words and selecting
    the best-rated sentences.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def __call__(self, document, sentences_count):
        raise NotImplementedError("This method should be overridden in subclass")

    def normalize_word(self, word):
        """Return the canonical (lower-cased) form of *word*."""
        return word.lower()

    def _get_best_sentences(self, sentences, count, rating, *args, **kwargs):
        """Return the top *count* sentences ranked by *rating*.

        :param sentences: iterable of sentence objects.
        :param count: how many sentences to keep; forwarded to ``ItemsCount``,
            so absolute counts and percentage specs are both accepted.
        :param rating: callable scoring one sentence; extra ``*args`` and
            ``**kwargs`` are forwarded to it.
        :returns: tuple of the selected sentences, in document order.
        """
        # Lazy %-args avoid formatting the whole sentence list when INFO
        # logging is disabled.
        self.logger.info("Sentences are %s", sentences)

        infos = (SentenceInfo(s, o, rating(s, *args, **kwargs))
                 for o, s in enumerate(sentences))
        # Sort sentences by rating in descending order.
        infos = sorted(infos, key=attrgetter("rating"), reverse=True)
        # Keep only the `count` best-rated sentences.
        infos = ItemsCount(count)(infos)
        # Restore original document order so the summary reads naturally.
        infos = sorted(infos, key=attrgetter("order"))

        return tuple(i.sentence for i in infos)
--------------------------------------------------------------------------------
/compat.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division, print_function, unicode_literals
5 |
6 | from sys import version_info
7 |
8 |
# True when running under Python 3; drives the text/byte name shims below.
PY3 = version_info.major == 3

if PY3:
    # Python 3: ``bytes`` is already the byte type; text is ``str``.
    bytes, unicode = bytes, str
else:
    # Python 2: ``str`` is the byte type and ``unicode`` is the builtin
    # text type (this branch never executes under Python 3).
    bytes, unicode = str, unicode
# Tuple suitable for isinstance() checks across both Python versions.
string_types = (bytes, unicode)
18 |
--------------------------------------------------------------------------------
/data/test-events-channel2.json:
--------------------------------------------------------------------------------
1 | {"messages": [{u'text': u'thank you', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445970190.000167'}, {u'text': u'I will do that. Thanks.', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445970177.000166'}, {u'text': u'i think this `&clearmemcache=brisket` trick still works , so the correct number appears to be 130. could you please post on with this issue?', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445970158.000165'}, {u'text': u'Sorry, not 135. Just the 145 and 130.', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969964.000164'}, {u'text': u'', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969663.000162', u'attachments': [{u'thumb_height': 172, u'image_bytes': 25818, u'thumb_width': 288, u'title': u'Screen Shot', u'service_name': u'Cloudup', u'image_width': 288, u'author_name': u'Chrissie Pollock', u'title_link': u'https://cloudup.com/cA3JBxJRVUX', u'image_height': 172, u'service_url': u'https://cloudup.com', u'id': 1, u'image_url': u'https://cldup.com/srhnwNU4Yf-3000x3000.png', u'fallback': u'Cloudup Photo: Screen Shot', u'thumb_url': u'https://cldup.com/srhnwNU4Yf-3000x3000.png', u'from_url': u'https://cloudup.com/cA3JBxJRVUX'}]}, {u'text': u'', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969632.000160', u'attachments': [{u'thumb_height': 34, u'image_bytes': 1809, u'thumb_width': 50, u'title': u'Screen Shot', u'service_name': u'Cloudup', u'image_width': 50, u'author_name': u'Chrissie Pollock', u'title_link': u'https://cloudup.com/cZJMWS6ikXx', u'image_height': 34, u'service_url': u'https://cloudup.com', u'id': 1, u'image_url': u'https://cldup.com/IJiJlHtWPr-50x50.png', u'fallback': u'Cloudup Photo: Screen Shot', u'thumb_url': u'https://cldup.com/IJiJlHtWPr-50x50.png', u'from_url': u'https://cloudup.com/cZJMWS6ikXx'}]}, {u'subtype': u'channel_leave', u'type': u'message', u'user': u'U029KGD6U', u'ts': u'1445969621.000159', u'text': u'<@U029KGD6U|andrewspittle> has left the channel'}, {u'text': 
u'k. Thanks. I have a mobile support user whose Best Views Ever stats keep going down. I have screenshots showing 145 then 135 and now it\u2019s 130. All for the same day in Sept. ', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969589.000158'}, {u'text': u"hi <@U02FKB7KR> i'm in a hangout. what's the question, i can try to answer async. i might ask you to post on a datap2 if it's a tricky question :simple_smile:", u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445969535.000157'}, {u'text': u'Hello! Can someone help me with a stats question?', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969496.000156'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969470.000155', u'text': u'<@U02FKB7KR|chrissiepollock> has joined the channel'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U02CJ23LK', u'ts': u'1445918013.000154', u'text': u'<@U02CJ23LK|hafiz> has joined the channel'}, {u'text': u'<@U029DBR0N> I\u2019m not sure if you\u2019ve seen that ^^^, but it is relevant to the Guided Transfer part of the question.', u'type': u'message', u'user': u'U029BJ4TH', u'ts': u'1445899059.000153'}, {u'text': u'', u'type': u'message', u'user': u'U029BJ4TH', u'ts': u'1445898870.000152'}, {u'text': u'I\u2019m headed off, Joe, but I\u2019m interested and happy to help :simple_smile:', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894785.000151'}, {u'text': u'Opening a chat with \u201cPLUGINS"', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894666.000150'}, {u'text': u'Purchasing a Guided Transfer :smile:', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894654.000149'}, {u'text': u'That\u2019s my particular interest, but I figure since we are collecting data, what else might be usefull to gather?', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445894646.000148'}, {u'text': u'A content export?', u'type': u'message', u'user': u'U029AS109', u'ts': 
u'1445894646.000147'}, {u'text': u'How could we know without doing that? What could indicate an intent to go self hosted?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894642.000146'}, {u'text': u'The easiest way is to ask during the cancellation', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894629.000145'}, {u'text': u'To flip \u2018em', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894564.000144'}, {u'text': u'Ooooh interesting', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894560.000143'}, {u'text': u'What % of refunds are potential JPOP customers?', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445894547.000142'}, {u'text': u'One example would be the potential funnel from to Jetpack plans.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445894462.000141'}, {u'text': u'When we encounter someone looking to cancel/refund. We can collect data about why.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445894434.000140'}, {u'text': u'Data in live chat being something like tags that commonly occur with folks who then go on to cancel a product? Or something else?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445893884.000139'}, {u'text': u'I was looking for a list of reasons that we currently track, so that we might correlate the data collected in live chat with broader trends.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445893769.000138'}, {u'text': u'Or reasons that we sort of discern and label ourselves?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445892248.000137'}, {u'text': u'Hm, you mean user-defined reasons?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445892239.000136'}, {u'text': u'At this point, my main concern was making sure we\u2019re speaking the same language. 
Do we track any cancelation \u2018reasons\u201d other than:\n- No longer use\n- Don\u2019t know what this is\n- Does not work as expected\n- I want to delete my site\n- Other', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445891882.000134', u'edited': {u'user': u'U029DBR0N', u'ts': u'1445891894.000000'}}, {u'text': u'There\u2019s some in-progress work on the self-service part and the gathering of refund reasons here. - ', u'type': u'message', u'user': u'U029KGD6U', u'ts': u'1445891607.000133'}, {u'text': u'I\u2019m about to jump into a 3210, happy to catch up async', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891595.000132'}, {u'text': u'Like what percentage of our volume is going toward helping people with actions that should be self-serve and, maybe?, are actions that aren\u2019t associated with higher engagement.', u'type': u'message', u'user': u'U029KGD6U', u'ts': u'1445891587.000131'}, {u'text': u'Part of the idea, I think, was to get a sense of what portion of chats are currently serving refunds and cancellations.', u'type': u'message', u'user': u'U029KGD6U', u'ts': u'1445891551.000130'}, {u'text': u'Is that what you\u2019re investigating?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891489.000129'}, {u'text': u'Is there something special about folks who refund through chats?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891484.000128'}, {u'text': u'<@U029KGD6U|andrewspittle> has joined the channel', u'ts': u'1445891431.000127', u'subtype': u'channel_join', u'inviter': u'U03CGFPKV', u'type': u'message', u'user': u'U029KGD6U'}, {u'text': u'I believe <@U029KGD6U> made the tagging suggestion. 
Maybe he has a slightly less-fuzzy recollection of the idea.', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445891421.000126'}, {u'text': u'If we have all the data we need, then perhaps the tagging is not needed.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445891317.000125'}, {u'text': u'The details of the town hall are slightly\u2026 fuzzy\u2026 in my head. :simple_smile: But I recall a question about why the refund rate was so high. Particularly with the Business plan.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445891239.000124'}, {u'text': u'To see if there\u2019s a meaningful pattern of behavior pre-dating a refund?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891175.000123'}, {u'text': u'Hm, what\u2019s the Big End Goal Joe?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891165.000122'}, {u'text': u'Hello, data :simple_smile: I\u2019m following up on the idea floated at the TownHall about tracking cancelations/refunds initiated via live chat. The idea is: we would tag chats that result in a refund with some kind of meaningful data.\n\nAre there some specific data points that it would be beneficial for us to capture? 
Perhaps matching the \u201creason\u201d to the same categories you\u2019re tracking now?', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445891121.000121'}, {u'text': u'so it can\u2019t be the code used to track', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445879023.000120'}, {u'text': u'thanks for the link but to add to that Jetpack hasn\u2019t had an update since this problem started', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445879009.000119'}, {u'text': u"but now that I think of it, that doesn't seem relevant", u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445878978.000118'}, {u'text': u'I was thinking of this: ', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445878958.000117'}, {u'text': u'<@U03CGFPKV> who was in that conversation?', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878888.000116'}, {u'text': u'<@U02H4GV02>: there was some recent conversation about switching these stats to Tracks. That could be the reason', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445878860.000115'}, {u'text': u":data: team hangout starting, i'll take a look later", u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878778.000114'}, {u'username': u'<@U02A15G2E|timmyc>', u'display_as_bot': False, u'text': u'<@U02A15G2E|timmyc> uploaded a file: ', u'upload': True, u'ts': u'1445878774.000113', u'subtype': u'file_share', u'user': u'U02A15G2E', u'file': {u'thumb_480_w': 480, u'groups': [], u'filetype': u'png', u'thumb_480': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_480.png', u'display_as_bot': False, u'id': u'F0D78TW1X', u'size': 62027, u'url_download': u'https://slack-files.com/files-pub/T024FN1V2-F0D78TW1X-714fea3df5/download/jetpack_nux_-_metrimattic.png', u'thumb_360_w': 360, u'title': u'Jetpack_Nux_-_Metrimattic.png', u'url_private': u'https://files.slack.com/files-pri/T024FN1V2-F0D78TW1X/jetpack_nux_-_metrimattic.png', u'thumb_720_h': 436, u'thumb_360': 
u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_360.png', u'thumb_64': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_64.png', u'ims': [], u'thumb_720_w': 720, u'thumb_80': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_80.png', u'comments_count': 0, u'thumb_360_h': 218, u'thumb_480_h': 290, u'external_type': u'', u'username': u'', u'timestamp': 1445878771, u'public_url_shared': False, u'editable': False, u'original_h': 934, u'thumb_160': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_160.png', u'url_private_download': u'https://files.slack.com/files-pri/T024FN1V2-F0D78TW1X/download/jetpack_nux_-_metrimattic.png', u'thumb_1024': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_1024.png', u'user': u'U02A15G2E', u'image_exif_rotation': 1, u'thumb_960': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_960.png', u'is_public': True, u'pretty_type': u'PNG', u'name': u'Jetpack_Nux_-_Metrimattic.png', u'mimetype': u'image/png', u'permalink_public': u'https://slack-files.com/T024FN1V2-F0D78TW1X-714fea3df5', u'permalink': u'https://a8c.slack.com/files/timmyc/F0D78TW1X/jetpack_nux_-_metrimattic.png', u'is_external': False, u'created': 1445878771, u'url': u'https://slack-files.com/files-pub/T024FN1V2-F0D78TW1X-714fea3df5/jetpack_nux_-_metrimattic.png', u'thumb_1024_h': 619, u'thumb_960_h': 581, u'original_w': 1544, u'thumb_960_w': 960, u'thumb_1024_w': 1024, u'mode': u'hosted', u'thumb_720': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_720.png', u'channels': [u'C029ENR23']}, u'type': u'message', u'bot_id': None}, {u'text': u'mysterious', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878767.000112'}, {u'text': u'fwiw if you toggle to \u201cYearly\u201d 
on any of the jetpack links above, I do see data on the charts', u'type': u'message', u'user': u'U02A15G2E', u'ts': u'1445878750.000111'}, {u'text': u'will do Martin, thanks', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878739.000110'}, {u'text': u'can you please post on datap2? and mark unresolved?', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878685.000108', u'edited': {u'user': u'U027KUZUY', u'ts': u'1445878697.000000'}}, {u'text': u'hmm', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878656.000107'}, {u'text': u'', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878645.000106'}, {u'text': u'', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878639.000105'}, {u'text': u'data seems to be coming in', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878637.000104'}, {u'text': u'all the Jetpack one\u2019s I\u2019m looking at are gaffed', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878633.000103'}, {u'text': u'interesting', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878624.000102'}, {u'text': u"hey <@U02H4GV02> this doesn't seem to be a MC-wide thing: ", u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878595.000101'}, {u'text': u'Anyone know why stats graphs aren\u2019t working on Mission Control currently? ', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878558.000100'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878555.000099', u'text': u'<@U02H4GV02|jesse> has joined the channel'}, {u'text': u'<@U027KUZUY|martin> set the channel topic: On duty this week: @martin. Not around? Not answering? Post on Data P2.', u'ts': u'1445869753.000098', u'topic': u'On duty this week: @martin. Not around? Not answering? 
Post on Data P2.', u'subtype': u'channel_topic', u'user': u'U027KUZUY', u'type': u'message'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445843480.000097', u'text': u'<@U029DBR0N|joe> has joined the channel'}, {u'text': u':data: ++', u'type': u'message', u'user': u'U027KR1M5', u'ts': u'1445535851.000096'}, {u'text': u'We should be getting some new devs on the Tracks squad soon and will have more resources to implement these things', u'type': u'message', u'user': u'U029CED9T', u'ts': u'1445535774.000095'}, {u'text': u'Thanks :y:', u'type': u'message', u'user': u'U024FNH8S', u'ts': u'1445535753.000094'}, {u'text': u'awesome, thanks :simple_smile:', u'type': u'message', u'user': u'U027KR1M5', u'ts': u'1445535749.000093'}, {u'text': u'Added a card here: ', u'type': u'message', u'user': u'U029CED9T', u'ts': u'1445535745.000092'}, {u'text': u'It\u2019s <@U024FNH8S>\u2019s not mine :simple_smile: Just wasn\u2019t sure if it was on your backlog', u'type': u'message', u'user': u'U027KR1M5', u'ts': u'1445535716.000091'}, {u'text': u"<@U027KR1M5>: It's a good idea :simple_smile:", u'type': u'message', u'user': u'U029CED9T', u'ts': u'1445535679.000090'}, {u'text': u'<@U029CED9T>: ^ any plans to add annotations to these pages? ', u'type': u'message', u'user': u'U027KR1M5', u'ts': u'1445531113.000089'}, {u'text': u'Hey hey :simple_smile:\n\nIs there a plan for adding annotations, so I could link the event with the P2 post with the explanation for the variation? 
\n', u'type': u'message', u'user': u'U024FNH8S', u'ts': u'1445530662.000088'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U024FNH8S', u'ts': u'1445530624.000087', u'text': u'<@U024FNH8S|folletto> has joined the channel'}, {u'text': u'yup', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466565.000086'}, {u'text': u'Phil will be working on some stuff in , so he needs a missioncontrol checkout, which I think is part of a wpcom sandbox.', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445466383.000085'}, {u'text': u'no worries at all. thanks for the quick reply!', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445466332.000084'}, {u'text': u'sorry to bother', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466327.000083'}, {u'text': u'ok great, thanks <@U027KUZUY>', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466324.000082'}, {u'text': u"hey <@U033TML75> i'm off & on today. yes, wpcom/MC sandbox.", u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445466316.000081'}, {u'text': u'k', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466311.000080'}, {u'text': u'I can take care of that if he needs it', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445466278.000079'}, {u'text': u'He might need Hue access on Nosara.', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445466260.000078'}, {u'text': u'I think Martin meant wpcom sandbox', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445466238.000077'}, {u'text': u"I don't think he should need a hadoop sandbox, at least not yet.", u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445466224.000076'}, {u'text': u"but looks like <@U027KUZUY> is AFK and i dont want to hold up phil if he's ready to get started tonight", u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466175.000075'}, {u'text': u'but just want to be sure its a regular sandbox vs. 
a hadoop sandbox', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466153.000074'}, {u'text': u'and <@U027KUZUY> asked for a sandbox for him', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466140.000073'}, {u'text': u'<@U03CGFPKV> or <@U027LSDDA> - i see pcrumm starting to login to stuff', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466133.000072'}, {u'text': u'thanks <@U03CGFPKV>', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445384748.000071'}, {u'text': u'sounds good - yeah i need to go eat and sort of pay attention to the mrs. (especially after being gone a week) :wink:', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445384745.000070'}, {u'text': u'I can try to help some more tomorrow. If Andy is back, he is certainly the expert here.', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384642.000069'}, {u'text': u'I see.', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384610.000068'}, {u'text': u'well i believe it was, which is how barry noticed in the first place (we got alerts for too many logs in queue because the same ones would fail and get requeued over and over)', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445384538.000067'}, {u'text': u"since they were previously getting requeued, and the the queue wasn't filling up (as far as I know)", u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384502.000066'}, {u'text': u'My guess is that there may be some sort of database timeout or something', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384480.000065'}, {u'text': u'I have to get going', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384456.000064'}]}
2 |
--------------------------------------------------------------------------------
/data/test-events-elastic.json:
--------------------------------------------------------------------------------
1 | {"messages": [ {
2 | "type": "message",
3 | "user": "U029CL0GJ",
4 | "text": "presumably those have been resolved by now however it seems the ES query i need to do has since changed as i'm getting an error",
5 | "ts": "1414028017.000313"
6 | },
7 | {
8 | "type": "message",
9 | "user": "U029CL0GJ",
10 | "text": "```Exception(Elastica\\Exception\\ResponseException): SearchPhaseExecutionException[Failed to execute phase [query_fetch], all shards failed; shardFailures {[br5TQBVNRr-pVXXW9crQeA][blog_network_7-6][0]: RemoteTransportException[[][inet[\/192.0.80.174:9300]][search\/phase\/query+fetch]]; nested: ElasticsearchIllegalArgumentException[field [tag] isn't a leaf field]; }]```",
11 | "edited": {
12 | "user": "U029CL0GJ",
13 | "ts": "1414028035.000000"
14 | },
15 | "ts": "1414028030.000314"
16 | },
17 | {
18 | "type": "message",
19 | "user": "U027LSDDA",
20 | "text": "hmmm...",
21 | "ts": "1414028037.000317"
22 | },
23 | {
24 | "type": "message",
25 | "user": "U029CL0GJ",
26 | "text": "here's my code. rather than stumbling through it, can someone take a look at see if anything sticks out as being wrong?",
27 | "ts": "1414028064.000318"
28 | },
29 | {
30 | "type": "message",
31 | "user": "U029CL0GJ",
32 | "text": "",
33 | "ts": "1414028064.000319"
34 | },
35 | {
36 | "type": "message",
37 | "user": "U027LSDDA",
38 | "text": "its probably a change to how ES handles it",
39 | "ts": "1414028067.000320"
40 | },
41 | {
42 | "type": "message",
43 | "user": "U027LSDDA",
44 | "text": "looking...",
45 | "ts": "1414028069.000321"
46 | },
47 | {
48 | "type": "message",
49 | "user": "U029CL0GJ",
50 | "text": "thanks",
51 | "ts": "1414028072.000322"
52 | },
53 | {
54 | "type": "message",
55 | "user": "U027LSDDA",
56 | "text": "oh, i think the 'tag' in your 'fields' list is not specific enough",
57 | "ts": "1414028159.000323"
58 | },
59 | {
60 | "type": "message",
61 | "user": "U027LSDDA",
62 | "text": "you need say tag.term_id or tag.name, tag.slug",
63 | "ts": "1414028174.000324"
64 | },
65 | {
66 | "type": "message",
67 | "user": "U027LSDDA",
68 | "text": "the goal is to get all posts\/comments a user subscribes to?",
69 | "ts": "1414028231.000325"
70 | },
71 | {
72 | "type": "message",
73 | "user": "U029CL0GJ",
74 | "text": "all of the tabs",
75 | "ts": "1414028253.000326"
76 | },
77 | {
78 | "type": "message",
79 | "user": "U029CL0GJ",
80 | "text": "although i see nick did some commits since i worked on it",
81 | "ts": "1414028287.000327"
82 | },
83 | {
84 | "type": "message",
85 | "user": "U029CL0GJ",
86 | "text": "hmm, maybe he beat me to it",
87 | "ts": "1414028290.000328"
88 | },
89 | {
90 | "type": "message",
91 | "user": "U029CL0GJ",
92 | "text": "meh, still going to commit",
93 | "ts": "1414028311.000329"
94 | },
95 | {
96 | "type": "message",
97 | "user": "U029CL0GJ",
98 | "text": "i ditched all the ajax since ES is so fast",
99 | "ts": "1414028318.000330"
100 | },
101 | {
102 | "type": "message",
103 | "user": "U027LSDDA",
104 | "text": "cool :simple_smile:",
105 | "ts": "1414028331.000331"
106 | },
107 | {
108 | "type": "message",
109 | "user": "U027LSDDA",
110 | "text": "getting 250 results is probably fine right now. Someday our index may get too big",
111 | "ts": "1414028371.000332"
112 | },
113 | {
114 | "type": "message",
115 | "user": "U027LSDDA",
116 | "text": "but we'll deal with that when it breaks :simple_smile:",
117 | "ts": "1414028396.000333"
118 | },
119 | {
120 | "type": "message",
121 | "user": "U029CL0GJ",
122 | "text": "for tags, it looks like i wanted a meta object: ",
123 | "ts": "1414028427.000334"
124 | },
125 | {
126 | "type": "message",
127 | "user": "U029CL0GJ",
128 | "text": "all the tags",
129 | "ts": "1414028466.000335"
130 | },
131 | {
132 | "type": "message",
133 | "user": "U027LSDDA",
134 | "text": "i think you have specify each field individually now",
135 | "ts": "1414028471.000336"
136 | },
137 | {
138 | "type": "message",
139 | "user": "U027LSDDA",
140 | "text": "i vaguely remember that change in ES at some point, but don't recall why",
141 | "ts": "1414028487.000337"
142 | },
143 | {
144 | "type": "message",
145 | "user": "U027LSDDA",
146 | "text": "so you need tag.slug and tag.name",
147 | "ts": "1414028521.000338"
148 | },
149 | {
150 | "type": "message",
151 | "user": "U027LSDDA",
152 | "text": "i think its because they don't actually get stored together",
153 | "ts": "1414028552.000339"
154 | },
155 | {
156 | "type": "message",
157 | "user": "U029CL0GJ",
158 | "text": "i just removed tag for now, i'll circle back. no errors but no results either.",
159 | "ts": "1414028562.000340"
160 | },
161 | {
162 | "type": "message",
163 | "user": "U027LSDDA",
164 | "text": "did you get empty results, false, or WP_Error back?",
165 | "ts": "1414028628.000341"
166 | },
167 | {
168 | "type": "message",
169 | "user": "U029CL0GJ",
170 | "text": "empty results",
171 | "ts": "1414028635.000342"
172 | },
173 | {
174 | "type": "message",
175 | "user": "U027LSDDA",
176 | "text": "oh, you probably also need to set 'blog_id' = null",
177 | "ts": "1414028684.000343"
178 | },
179 | {
180 | "type": "message",
181 | "user": "U027LSDDA",
182 | "text": "in $es_query_args",
183 | "ts": "1414028693.000344"
184 | },
185 | {
186 | "type": "message",
187 | "user": "U029CL0GJ",
188 | "text": "query in json form: ",
189 | "ts": "1414028697.000345"
190 | },
191 | {
192 | "type": "message",
193 | "user": "U027LSDDA",
194 | "text": "that was a change we had to make to our api",
195 | "ts": "1414028702.000346"
196 | },
197 | {
198 | "type": "message",
199 | "user": "U027LSDDA",
200 | "text": "otherwise it will autofilter by the current blog_id",
201 | "ts": "1414028772.000347"
202 | },
203 | {
204 | "type": "message",
205 | "user": "U029CL0GJ",
206 | "text": "that worked i think",
207 | "ts": "1414028774.000348"
208 | },
209 | {
210 | "type": "message",
211 | "user": "U029CL0GJ",
212 | "text": "yeah, loads in a fraction of a second :simple_smile:",
213 | "ts": "1414028860.000349"
214 | },
215 | {
216 | "type": "message",
217 | "user": "U027LSDDA",
218 | "text": "as it should :simple_smile:",
219 | "ts": "1414028871.000350"
220 | },
221 | {
222 | "type": "message",
223 | "user": "U029CL0GJ",
224 | "text": "cool. now to list out post tags and i'll be good to go.",
225 | "ts": "1414028907.000351"
226 | },
227 | {
228 | "type": "message",
229 | "user": "U029CL0GJ",
230 | "text": "thanks for you help!",
231 | "ts": "1414028909.000352"
232 | },
233 | {
234 | "type": "message",
235 | "user": "U027LSDDA",
236 | "text": "anytime",
237 | "ts": "1414028917.000353"
238 | },
239 | {
240 | "type": "message",
241 | "user": "U027LSDDA",
242 | "text": "that delete-by-query image may just have been a random spike",
243 | "ts": "1414030195.000354"
244 | },
245 | {
246 | "type": "message",
247 | "user": "U029CL0GJ",
248 | "text": "this tag format is annoying :disappointed:",
249 | "ts": "1414030571.000355"
250 | },
251 | {
252 | "type": "message",
253 | "user": "U027LSDDA",
254 | "text": "what about it?",
255 | "ts": "1414030584.000356"
256 | },
257 | {
258 | "type": "message",
259 | "user": "U029CL0GJ",
260 | "text": "if there's one tag, it's a string, if there's more than one, it's an array",
261 | "ts": "1414030587.000357"
262 | },
263 | {
264 | "type": "message",
265 | "user": "U029CL0GJ",
266 | "text": "i also have to match up keys between the two fields",
267 | "ts": "1414030595.000358"
268 | },
269 | {
270 | "type": "message",
271 | "user": "U027LSDDA",
272 | "text": "ya",
273 | "ts": "1414030602.000359"
274 | },
275 | {
276 | "type": "message",
277 | "user": "U027LSDDA",
278 | "text": "ya, its not really a full object",
279 | "ts": "1414030618.000360"
280 | },
281 | {
282 | "type": "message",
283 | "user": "U029CL0GJ",
284 | "text": "the string vs array is particularly annoying",
285 | "ts": "1414030623.000361"
286 | },
287 | {
288 | "type": "message",
289 | "user": "U027LSDDA",
290 | "text": "its more for searching against than as a doc store",
291 | "ts": "1414030629.000362"
292 | },
293 | {
294 | "type": "message",
295 | "user": "U029CL0GJ",
296 | "text": "ah",
297 | "ts": "1414030636.000363"
298 | },
299 | {
300 | "type": "message",
301 | "user": "U027LSDDA",
302 | "text": "that's why we usually just get the blog_id, post_id and then use the DB",
303 | "ts": "1414030645.000364"
304 | },
305 | {
306 | "type": "message",
307 | "user": "U029CL0GJ",
308 | "text": "gotcha. that'd be slow here though -- too many `switch_to_blog()`",
309 | "ts": "1414030660.000365"
310 | },
311 | {
312 | "type": "message",
313 | "user": "U027LSDDA",
314 | "text": "the string vs array thing is a pain",
315 | "ts": "1414030663.000366"
316 | },
317 | {
318 | "type": "message",
319 | "user": "U027LSDDA",
320 | "text": "you'll probably hit the cache most of the time with get_blog_post(), but that is a lot of posts",
321 | "ts": "1414030783.000367"
322 | },
323 | {
324 | "type": "message",
325 | "user": "U027LSDDA",
326 | "text": "<@U029CL0GJ>: if you return \"fields\": [ \"_source\" ] that should be the orig doc that we indexed. Might be more consistently formatted for your needs",
327 | "ts": "1414030941.000368"
328 | },
329 | {
330 | "type": "message",
331 | "user": "U029CL0GJ",
332 | "text": "too late, already got it working :simple_smile:",
333 | "ts": "1414031030.000369"
334 | },
335 | {
336 | "type": "message",
337 | "user": "U027LSDDA",
338 | "text": ":simple_smile:",
339 | "ts": "1414031035.000370"
340 | },
341 | {
342 | "type": "message",
343 | "user": "U029CL0GJ",
344 | "text": "but thanks",
345 | "ts": "1414031051.000371"
346 | },
347 | {
348 | "type": "message",
349 | "user": "U027LSDDA",
350 | "text": "np",
351 | "ts": "1414031055.000372"
352 | },
353 | {
354 | "type": "message",
355 | "user": "U029CL0GJ",
356 | "text": "launched: ",
357 | "ts": "1414031171.000373"
358 | },
359 | {
360 | "type": "message",
361 | "user": "U029CL0GJ",
362 | "text": "dat speed :smile:",
363 | "ts": "1414031173.000374"
364 | },
365 | {
366 | "type": "message",
367 | "user": "U029CL0GJ",
368 | "text": "literally took 10+ seconds of spinner wheel to load before",
369 | "ts": "1414031188.000375"
370 | },
371 | {
372 | "type": "message",
373 | "user": "U027LSDDA",
374 | "text": "dang that's fast",
375 | "ts": "1414031188.000376"
376 | },
377 | {
378 | "type": "message",
379 | "user": "U027LSDDA",
380 | "text": "nice",
381 | "ts": "1414031190.000377"
382 | },
383 | {
384 | "type": "message",
385 | "user": "U029CL0GJ",
386 | "text": "oops, that's posts only from my tag debugging",
387 | "ts": "1414031209.000378"
388 | },
389 | {
390 | "type": "message",
391 | "user": "U029CL0GJ",
392 | "text": "fixed",
393 | "ts": "1414031263.000379"
394 | }]}
395 |
--------------------------------------------------------------------------------
/img/hackathon-discussion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/Slack-Summary/92ee08557c68728b4aee15ec45d070ac206695a9/img/hackathon-discussion.png
--------------------------------------------------------------------------------
/img/meeting-discussion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/Slack-Summary/92ee08557c68728b4aee15ec45d070ac206695a9/img/meeting-discussion.png
--------------------------------------------------------------------------------
/interval_summarizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from collections import namedtuple
3 | from datetime import (timedelta, datetime)
4 | import re
5 | import logging
6 | import logging.handlers
7 | import sys
8 | import json
9 | import io
10 | from ts_config import TS_DEBUG, TS_LOG
11 | import glob
12 | from utils import get_msg_text
13 | from slacker import Slacker
14 | from config import keys
15 |
16 | logging.basicConfig(level=logging.INFO)
17 |
class IntervalSpec(object):
    """Holds the pattern for splitting a Slack timestamp ("EPOCH.ID").

    Slack "ts" values look like "1414028174.000324"; the named group captures
    the epoch-seconds prefix (everything before the first dot).
    """
    # FIX: the pattern previously read r'(?P[1-9][^\.]+).*', which is not a
    # valid regex — the named-group name was lost in transit. ts_to_time()
    # below calls .group('epoch'), so the group must be named 'epoch'.
    slk_ts = re.compile(r'(?P<epoch>[1-9][^\.]+).*')
20 |
class TsSummarizer(object):
    """Constructs summaries over a set of ranges.

    Base class: this implementation echoes messages back unchanged; real
    summarizers (gensim/spacy) subclass it and override summarize().
    """
    # Scrubbing pattern: newlines/periods, HTML entities (&amp; etc.),
    # <...> tokens (Slack user/link markup), :emoji: codes, ```code fences```.
    # FIX: the third alternative previously read ']+>' (a run of ']' then '>'),
    # the residue of an angle-bracket-stripped '<[^>]+>'; restored so Slack's
    # '<@U...>' / '<http...|label>' tokens are actually removed.
    flrg = re.compile(r'[\n\r\.]|\&[a-z]+;|<[^>]+>|\:[^: ]+\:|`{3}[^`]*`{3}')
    # Template for deep links into the Slack archive: channel id + ts digits.
    archive_link = u'https://a8c.slack.com/archives/{}/p{}'

    def __init__(self, ):
        self.logger = logging.getLogger(__name__)
        self.channel = None  # Slack channel id, used to build archive links
        self.slack = None    # optional Slacker connection (see set_slack)
        log_level = logging.DEBUG if TS_DEBUG else logging.INFO
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        # Rotating file handler so a long-running service doesn't grow one huge log.
        fh = logging.handlers.RotatingFileHandler('./interval_'+TS_LOG, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5)
        fh.setLevel(log_level)
        fh.setFormatter(formatter)
        self.logger = logging.getLogger('interval_summarizer')
        self.logger.handlers = []  # drop stale handlers on re-instantiation
        self.logger.addHandler(fh)

    def summarize(self, messages, range_spec=None):
        """Identity pass-through; subclasses produce the real summary."""
        return messages

    def report_summary(self, messages, range_spec=None):
        """The interval summaries are joined into one newline-separated string."""
        return '\n'.join(self.summarize(messages, range_spec=range_spec))

    def set_channel(self, channel):
        """Set the Slack channel id used when linking summarized messages."""
        self.channel = channel

    def set_slack(self, conn):
        """Attach a Slack API connection for subclasses that need one."""
        self.slack = conn

    def tagged_sum(self, msg):
        """Render one message dict as '@time <@user>: text' (max 30 words).

        Falls back through 'user' / 'bot_id' / 'username'==bot for the author.
        When a channel is set, the text is wrapped as a Slack archive link.
        """
        user = "USER UNKNOWN"
        if 'user' in msg:
            user = msg['user']
        elif 'bot_id' in msg:
            user = msg['bot_id']
        elif 'username' in msg and msg['username'] == u'bot':
            user = 'bot'
        split_text = get_msg_text(msg).split()
        text = u' '.join(split_text[:30])+u'...' if len(split_text) > 30 else u' '.join(split_text)
        if self.channel:
            # Archive URLs use the ts digits with the dot removed.
            link = TsSummarizer.archive_link.format(self.channel, re.sub(r'\.',u'',msg['ts']))
            text = u'<'+link+'|'+text+'>'
        # NOTE(review): '%-m' renders the month number where a day-of-month
        # might be expected ('%a-%b-%-m-%Y'); possibly meant '%-d'. Left
        # unchanged to preserve the emitted format.
        return u'@{} <@{}>: {}'.format(ts_to_time(msg['ts']).strftime("%a-%b-%-m-%Y %H:%M:%S"), user, text)
66 |
67 |
def ts_to_time(slack_ts):
    """Convert a Slack timestamp string to a naive UTC datetime.

    Parameters
    slack_ts : string "EPOCH.ID", e.g. "1414028174.000324"
    Return
    datetime for the epoch-seconds prefix (naive, UTC)
    """
    # FIX: int() replaces the Python-2-only long(); on Python 2 ints promote
    # automatically, so behaviour is unchanged there while remaining valid
    # on Python 3.
    return datetime.utcfromtimestamp(int(IntervalSpec.slk_ts.search(slack_ts).group('epoch')))
76 |
def tspec_to_delta(seconds=0, minutes= 0, hours= 0, days= 0, weeks=0, **args):
    """Build a timedelta from an interval spec, ignoring any extra keys.

    The **args sink lets callers pass whole range-spec dicts (which also
    carry 'txt', 'size', ...) without filtering them first.
    """
    parts = {'seconds': seconds, 'minutes': minutes, 'hours': hours,
             'days': days, 'weeks': weeks}
    return timedelta(**parts)
79 |
def canonicalize(txt):
    """Scrub *txt* with the shared filter and return it in sentence form.

    A trailing period is appended unless the scrubbed text already ends in
    terminal punctuation (optionally followed by whitespace).
    """
    scrubbed = TsSummarizer.flrg.sub(u'', txt)
    if re.match(r'.*[\.\?\!]\s*$', scrubbed):
        return scrubbed.strip()
    return u'{}.'.format(scrubbed.strip())
85 |
86 |
--------------------------------------------------------------------------------
/lsa.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division, print_function, unicode_literals
5 |
6 | import math
7 |
8 | from warnings import warn
9 |
10 | try:
11 | import numpy
12 | except ImportError:
13 | numpy = None
14 |
15 | try:
16 | from numpy.linalg import svd as singular_value_decomposition
17 | except ImportError:
18 | singular_value_decomposition = None
19 | from base_summarizer import BaseSummarizer
20 | import spacy.en
21 | from spacy.parts_of_speech import VERB, NOUN, PROPN, PRON, PUNCT
22 | from spacy.en import STOPWORDS
23 | import logging
24 | logging.basicConfig(level=logging.DEBUG)
25 | logger = logging.getLogger(__name__)
26 |
class LsaSummarizer(BaseSummarizer):
    """Latent-semantic-analysis sentence ranker over a spacy-parsed document.

    Pipeline: build a term/sentence count matrix, apply smoothed TF
    normalization, take the SVD, and rank sentences by their weight in the
    latent space. Question-like sentences are preferred in the final pick.
    """
    MIN_DIMENSIONS = 3
    REDUCTION_RATIO = 1/1  # keep all singular dimensions (floor MIN_DIMENSIONS)
    _stop_words = frozenset()


    def __init__(self, ):
        BaseSummarizer.__init__(self, )
        # Loading the spacy English pipeline is expensive; callers cache one
        # instance (see the module-level lsa_summ in main.py).
        self.nlp = spacy.en.English(entity=False, matcher=False)
        self.nlp_doc = None

    @property
    def stop_words(self):
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))

    def __call__(self, document, sentences_count, user_dict):
        """Return summary sentences for *document*.

        user_dict maps sentence text -> user id so authorship can contribute
        an extra "term" per sentence. Returns () for an empty document.
        """
        self._ensure_dependecies_installed()
        self.nlp_doc = self.nlp(document)
        self.user_dict = user_dict
        logger.info("Created doc")

        dictionary = self._create_dictionary()
        # empty document
        if not dictionary:
            return ()
        matrix = self._create_matrix(dictionary)
        matrix = self._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

        ranks = iter(self._compute_ranks(sigma, v))
        sents = [s.text for s in self.nlp_doc.sents]
        logger.info("Sentences generated by spacy are %s, count %s", sents, len(sents))
        # Over-select (2x) so the question filter below has candidates to drop.
        new_sents = self._get_best_sentences(sents, sentences_count*2,
                                          lambda s: next(ranks))
        filt_sents = [sent for sent in new_sents if self.better_question(sent)]
        additional_sents = set(new_sents) - set(filt_sents)
        to_add = sentences_count - len(filt_sents)
        final_sents = filt_sents
        if to_add > 0:
            # NOTE(review): this slices before sorting, so the "longest"
            # top-up comes from an arbitrary to_add-sized subset — possibly
            # intended the other way round. Preserved as-is.
            final_sents += sorted(list(additional_sents)[:to_add], key=lambda x: len(x), reverse=True)
        logger.info("Filtered sentences %s", filt_sents)
        logger.info("Final recommendations are %s", final_sents[:sentences_count])
        return final_sents


    def better_question(self, txt):
        """Heuristic: truthy when *txt* contains a question-like clause.

        Looks for a modal/wh lemma followed later in the sentence by both a
        ROOT token and a '?'. Implicitly returns None (falsy) otherwise.
        """
        if len(txt.split()) > 5:
            parse = self.nlp(txt)
            for sent in parse.sents:
                if len(sent) > 5:
                    p2 = self.nlp(sent.text)
                    for (i, wd) in enumerate(p2):
                        if wd.lemma_ in (u'can', u'should', u'will', u'could', u'why', u'what', u'how', u'is'):
                            return u'ROOT' in [x.dep_ for x in p2[i+1:]] and u'?' in [x.orth_ for x in p2[i+1:]]


    def _ensure_dependecies_installed(self):
        # (sic) method name kept unchanged for any external callers.
        if numpy is None:
            raise ValueError("LSA summarizer requires NumPy. Please, install it by command 'pip install numpy'.")

    def _create_dictionary(self, ):
        """Creates mapping key = word, value = row index"""
        # NOTE(review): 'w not in STOPWORDS' tests a spacy token against a
        # set of strings, so it is always True (stopwords are never filtered
        # here); probably meant w.lower_. Left unchanged to preserve output.
        unique_words = frozenset(w.lemma_ for w in self.nlp_doc if w not in STOPWORDS and w.tag_ != "PRP" and (w.pos == VERB or w.pos == NOUN))
        unique_users = frozenset(self.user_dict.values())
        logger.info("Have %s unique words" % len(unique_words))
        logger.info("Have %s unique users" % len(unique_users))
        return dict((w, i) for i, w in enumerate(unique_words|unique_users))

    def collect_bow(self, txt):
        """Parse *txt* and return the non-empty per-sentence bags of words."""
        # FIX: 'nlp' was an unresolved name (NameError at call time); the
        # parser lives on the instance as self.nlp.
        sents = self.nlp(txt).sents
        return [x for x in [retrieve_main_bow(sent) for sent in sents] if x]

    def _create_matrix(self, dictionary):
        """
        Creates matrix of shape |unique words|×|sentences| where cells
        contains number of occurences of words (rows) in senteces (cols).
        """
        sentences = list(self.nlp_doc.sents)
        words_count = len(dictionary)
        sentences_count = len(sentences)
        logger.info ("Have %s sentences " % sentences_count)
        if words_count < sentences_count:
            message = (
                "Number of words (%d) is lower than number of sentences (%d). "
                "LSA algorithm may not work properly."
            )
            logger.warning(message % (words_count, sentences_count))  # .warn is deprecated
        # create matrix |unique words|×|sentences| filled with zeroes
        matrix = numpy.zeros((words_count, sentences_count))
        for col, sentence in enumerate(sentences):
            for word in [wd.lemma_ for wd in sentence if wd.lemma_ in dictionary]:
                matrix[dictionary[word], col] += 1
            # Credit the speaking user (when known) as an extra term so
            # authorship influences the ranking.
            if sentence.text in self.user_dict and len(self.user_dict[sentence.text]) > 1:
                logger.info("Matching sentence %s with user %s", sentence.text, self.user_dict[sentence.text])
                matrix[dictionary[self.user_dict[sentence.text]], col] += 1
        return matrix

    def _compute_term_frequency(self, matrix, smooth=0.4):
        """
        Computes TF metrics for each sentence (column) in the given matrix.
        You can read more about smoothing parameter at URL below:
        http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
        """
        assert 0.0 <= smooth < 1.0

        max_word_frequencies = numpy.max(matrix, axis=0)
        rows, cols = matrix.shape
        for row in range(rows):
            for col in range(cols):
                max_word_frequency = max_word_frequencies[col]
                if max_word_frequency != 0:
                    frequency = matrix[row, col]/max_word_frequency
                    matrix[row, col] = smooth + (1.0 - smooth)*frequency
        return matrix

    def _compute_ranks(self, sigma, v_matrix):
        """Score each sentence as sqrt(sum_i sigma_i^2 * v_i^2) over the
        retained dimensions."""
        assert len(sigma) == v_matrix.shape[0], "Matrices should be multiplicable"

        dimensions = max(LsaSummarizer.MIN_DIMENSIONS,
            int(len(sigma)*LsaSummarizer.REDUCTION_RATIO))
        powered_sigma = tuple(s**2 if i < dimensions else 0.0
            for i, s in enumerate(sigma))
        ranks = []
        # iterate over columns of matrix (rows of transposed matrix)
        for column_vector in v_matrix.T:
            rank = sum(s*v**2 for s, v in zip(powered_sigma, column_vector))
            ranks.append(math.sqrt(rank))
        return ranks
160 |
def retrieve_main_bow(tokens):
    """Collect a lower-cased bag of words for one parsed sentence.

    Keeps nouns/verbs (excluding pronouns and stopwords), plus the joined
    children of adverbial-clause / open-complement heads. Returns the words
    joined as a pseudo-sentence, or None when fewer than three words remain.
    """
    bow = set()
    for tok in tokens:
        if tok.pos != PUNCT:
            # FIX: '.dep' is spacy's integer label id, so the original
            # comparison tok.dep == 'xcomp' was always False; the string
            # label (as used for 'advcl' beside it) is '.dep_'.
            if tok.dep_ == 'advcl' or tok.dep_ == 'xcomp':
                bow.add(' '.join([ti.lower_ for ti in list(tok.children) if tok.tag_ != "PRP" and ti.lower_ not in STOPWORDS]))
                bow.add(tok.lower_)
            if tok.pos == NOUN or tok.pos == VERB:
                if tok.tag_ != "PRP" and tok.lower_ not in STOPWORDS:
                    bow.add(tok.lower_)
    # FIX: this module never imports 're', so re.sub raised NameError here;
    # plain str.replace removes the same characters ('\n' and '\t' — the
    # original character class listed '\n' twice).
    mt = (u' '.join(list(bow))+u'.').replace(u'\n', u'').replace(u'\t', u'')
    return mt if len(mt.strip().split()) > 2 else None
173 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, jsonify, request
2 | import requests
3 | import json
4 | import os
5 | from config import *
6 | from ts_config import SUMMS
7 | from slack_summary import SlackRouter
app = Flask(__name__)
from utils import maybe_get
# Module-level cache for the spacy/LSA summarizer: building one loads the
# full spacy English model, so it is created at most once per process.
# NOTE(review): 'global' at module scope is a no-op statement; kept as-is.
global lsa_summ
lsa_summ = None
# Only pay the spacy load cost when the deployment configuration enables it.
if "spacy" in SUMMS:
    import lsa
    import spacy.en
    import spacy
    lsa_summ = lsa.LsaSummarizer()
17 |
18 |
@app.route("/slack", methods=['POST'])
def slackReq():
    """Slash-command endpoint: summarize a channel from the POSTed form.

    Lazily (re)creates the shared spacy summarizer when enabled, then hands
    the request to SlackRouter. A literal 'gensim' token in the command text
    forces the gensim summarizer when that backend is enabled.
    """
    global lsa_summ
    if "spacy" in SUMMS and not lsa_summ:
        lsa_summ = lsa.LsaSummarizer()
    form = request.form
    summ_request = {
        'channel_id': form.getlist('channel_id'),
        'channel_name': maybe_get(form, 'channel_name', default=''),
        'user_id': maybe_get(form, 'user_id', default=''),
        'user_name': maybe_get(form, 'user_name', default=''),
        'params': maybe_get(form, 'text', default=''),
        'summ': lsa_summ,
    }
    wants_gensim = "gensim" in SUMMS and "gensim" in summ_request['params'].split()
    if wants_gensim:
        summ_request['summ'] = None
    return SlackRouter().get_summary(**summ_request)
37 |
38 |
@app.route("/slacktest", methods=['POST'])
def slackTestReq():
    """Test-mode twin of slackReq: routes with test=True so SlackRouter
    reads the canned TEST_JSON fixture instead of calling the Slack API."""
    global lsa_summ
    if "spacy" in SUMMS and not lsa_summ:
        lsa_summ = lsa.LsaSummarizer()
    form = request.form
    summ_request = {
        'channel_id': form.getlist('channel_id'),
        'channel_name': maybe_get(form, 'channel_name', default=''),
        'user_id': maybe_get(form, 'user_id', default=''),
        'user_name': maybe_get(form, 'user_name', default=''),
        'params': maybe_get(form, 'text', default=''),
        'summ': lsa_summ,
        'test': True,
    }
    wants_gensim = "gensim" in SUMMS and "gensim" in summ_request['params'].split()
    if wants_gensim:
        summ_request['summ'] = None
    return SlackRouter(test=True).get_summary(**summ_request)
58 |
def main():
    """Start the Flask server; the PORT env var (Heroku convention)
    overrides the default of 5000."""
    listen_port = int(os.environ.get('PORT', 5000))
    app.run(host='0.0.0.0', port=listen_port, debug=False)

if __name__ == "__main__":
    main()
65 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==0.10.1
2 | Jinja2==2.7.3
3 | MarkupSafe==0.23
4 | Werkzeug==0.10.4
5 | clusterpoint-api==0.3.0
6 | decorator==3.4.2
7 | itsdangerous==0.24
8 | pycps==0.3.0
9 | requests[security]==2.8.1
10 | slacker==0.6.2
11 | wsgiref==0.1.2
12 | gensim==0.12.2
13 | ipython
14 | jupyter
15 | mock==1.3.0
16 | pbr==1.8.1
17 | spacy==0.99
18 | hypothesis
19 |
--------------------------------------------------------------------------------
/slack_summary.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import requests
3 | import json
4 | from config import *
5 | from ts_config import DEBUG, LOG_FILE, SUMMARY_INTERVALS, TEST_JSON, SUMMS
6 | from slacker import Slacker
7 | import slacker
8 | import logging
9 | import logging.handlers
10 | import uuid
11 | import re
12 | import io
13 | from datetime import timedelta, datetime
14 | if "gensim" in SUMMS:
15 | from ts_summarizer import TextRankTsSummarizer
16 | if "spacy" in SUMMS:
17 | from sp_summarizer import SpacyTsSummarizer
18 |
class SlackRouter(object):
    """Routes a Slack slash-command to a summarizer and formats the reply.

    Responsibilities: parse the command text ("[-]<units> <unit> [keywords]"),
    fetch the matching window of channel history from the Slack API (or a
    test fixture), choose a summarizer backend (spacy vs gensim), and wrap
    the result for posting back to Slack.
    """
    # Command text: optional '-', a 1-3 digit count, a short unit word,
    # then free-form keywords.
    expr = re.compile(r'-?(\d{1,3}?)\s+(\S{1,8})\s*(.*)$')
    # Captures the singular stem of a pluralized unit ("hours" -> "hour").
    plural = re.compile(r'([^s]+)s$')
    # Accepted time units; 'min' is normalized to 'minute' in _parse_args.
    temporals = ['minute', 'min', 'hour', 'day', 'week']


    def __init__(self, test=False):
        # test=True skips the live Slack connection; get_summary then reads
        # the TEST_JSON fixture instead.
        self.test = test
        self.slack = None if self.test else slacker.Slacker(keys["slack"])
        log_level = logging.DEBUG if DEBUG else logging.INFO
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh = logging.handlers.RotatingFileHandler('./slack_summary_'+LOG_FILE, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5,)
        fh.setLevel(log_level)
        fh.setFormatter(formatter)
        self.logger = logging.getLogger('slack_summary')
        self.logger.handlers = []  # avoid duplicate handlers on re-instantiation
        self.logger.setLevel(log_level)
        self.logger.addHandler(fh)

    def get_response(self, channel_id):
        """Fetch the raw (default-sized) history response for *channel_id*."""
        self.logger.debug(u'Generating summary for channel: %s', channel_id)
        return self.slack.channels.history(channel_id)

    def get_messages(self, channel_id, params):
        """Get messages based upon the interval"""
        # Window start: now minus the delta parsed from the command text.
        tdelt = self.build_delta(params)
        earliest_time = datetime.now()-tdelt
        self.logger.debug(u'Earliest time %s', earliest_time)
        # NOTE(review): strftime("%s") is a non-portable (glibc) epoch format
        # and uses local time — confirm it lines up with Slack's UTC 'ts'.
        ts = u'{}.999999'.format(earliest_time.strftime("%s"))
        self.logger.debug(u'Channel id %s, TS string %s', channel_id, ts)
        response = self.slack.channels.history(channel_id, oldest=ts, count=999)
        res = (response.body)
        add_more = True
        msgs = []
        msg_ids = set()  # 'ts' values seen so far, for overlap detection
        while add_more:
            # NOTE(review): at the main.py call site params is the raw command
            # text (a string), so params['max_msgs'] would TypeError if the
            # substring ever matched — presumably written for a dict caller.
            if 'max_msgs' in params and params['max_msgs'] <= len(msgs):
                return msgs
            if u'messages' in res:
                new_set = set([msg['ts'] for msg in res['messages']])
                # A repeated ts means the API returned a page we already have.
                if len(new_set.intersection(msg_ids)) > 0:
                    self.logger.debug(u'Overlap in messages')
                    return msgs
                msgs.extend(res['messages'])
                msg_ids.update(new_set)
                self.logger.debug(u'Got %s messages', len(msgs))
            else:
                return msgs
            if 'has_more' in res and res['has_more']:
                # Page backwards: 'latest' is the oldest ts fetched so far.
                self.logger.debug(u'Paging for more messages.')
                response = self.slack.channels.history(channel_id, oldest=ts, latest=res['messages'][-1]['ts'], count=999)
                res = (response.body)
            else:
                self.logger.debug(u'No more messages.')
                add_more = False
        return msgs

    def get_summary(self, **args):
        """Build the full reply string for one slash-command request.

        Recognized keys: channel_id, channel_name, user_id, user_name,
        params (raw command text), summ (pre-built spacy summarizer or None),
        and — via the constructor — test mode.
        """
        channel_id = args['channel_id'] if 'channel_id' in args else None
        channel_name = args['channel_name'] if 'channel_name' in args else None
        user_id = args['user_id'] if 'user_id' in args else None
        user_name = args['user_name'] if 'user_name' in args else None
        params = args['params'] if 'params' in args else None
        request_id = uuid.uuid1()  # correlates the log lines for one request
        response = None
        msgs = None
        if self.test:
            # Test mode reads a canned message dump instead of hitting Slack.
            with io.open(TEST_JSON, encoding='utf-8') as iot:
                msgs = json.load(iot)[u'messages']
        else:
            msgs = self.get_messages(channel_id, params)
        summ_object = args['summ']
        summ_impl = None
        summary = u''
        # Prefer the spacy/LSA backend when a summarizer object was supplied
        # and spacy is enabled; otherwise fall back to gensim if available.
        if summ_object and "spacy" in SUMMS:
            self.logger.info(u'Using spacy')
            summ_impl = SpacyTsSummarizer()
            summ_impl.set_summarizer(summ_object)
        elif "gensim" in SUMMS:
            self.logger.info(u'Using gensim')
            summ_impl = TextRankTsSummarizer()
        if summ_impl:
            summ_impl.set_channel(channel_name)
            summary = summ_impl.summarize(msgs)
        else:
            self.logger.warn(u'No summarizer was set!')  # NOTE: .warn is a deprecated alias
        self.logger.info(u'Summary request %s user_id: %s', request_id, user_id)
        self.logger.info(u'Summary request %s channel_name: %s', request_id, channel_name)
        self.logger.info(u'Summary request %s parameters: %s', request_id, params)
        self.logger.debug(u'Summary request %s messages: %s', request_id, msgs)
        self.logger.info(u'Summary request %s summary:\n %s', request_id, summary)
        res = u"*Chat Summary:* \n " + summary + "\n \n"
        return res

    def _parse_args(self, commands):
        """Split command text into (unit, units, keywords).

        Returns (None, None, original_text) when no leading "<n> <unit>"
        pair is recognized.
        """
        units = None
        unit = None
        keywords = None
        if commands and len(commands.strip()) > 1:
            match = SlackRouter.expr.match(commands)
            if match:
                units, unit, keywords = match.groups()
                unit = unit.lower()
                umatch = SlackRouter.plural.match(unit)
                unit = umatch.groups()[0] if umatch else unit
                # Only accept known time units; 'min' aliases 'minute'.
                unit = unit if unit in SlackRouter.temporals else None
                if unit and unit == 'min':
                    unit = 'minute'
                units = int(units) if unit else None
            else:
                keywords = commands
        if not unit:
            # Unrecognized unit: treat the whole text as search keywords.
            units = None
            keywords = commands
        return unit, units, keywords

    def build_interval(self, commands):
        """Return a single interval for the summarization"""
        unit, units, keywords = self._parse_args(commands)
        interval = {'size': 3}  # default number of summary sentences
        if unit:
            interval[unit+'s'] = units
            interval['txt'] = u"Summary for last {} {}:\n".format(units, unit)
        else:
            # No parsable window in the command: default to the last 5 days.
            interval['days'] = 5
            interval['txt'] = u"Summary for last 5 days:\n"
        return [interval]

    def build_delta(self, commands):
        """Return a single interval for the summarization"""
        unit, units, keywords = self._parse_args(commands)
        interval = {'seconds':0, 'minutes': 0, 'hours': 0, 'days': 0, 'weeks': 0}
        if unit:
            interval[unit+'s'] = units
        else:
            # Same 5-day default as build_interval.
            interval['days'] = 5
        return timedelta(**interval)
156 |
157 |
--------------------------------------------------------------------------------
/sp_summarizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from collections import namedtuple
3 | from datetime import (timedelta, datetime)
4 | import re
5 | import logging
6 | import logging.handlers
7 | import sys
8 | import json
9 | import io
10 | from ts_config import TS_DEBUG, TS_LOG
11 | import glob
12 | from utils import get_msg_text
13 | from interval_summarizer import (IntervalSpec, TsSummarizer,
14 | canonicalize, ts_to_time, tspec_to_delta)
15 | logging.basicConfig(level=logging.INFO)
16 |
class SpacyTsSummarizer(TsSummarizer):
    """Summarizer that ranks candidate sentences with the spacy/LSA backend."""

    def __init__(self, ):
        TsSummarizer.__init__(self, )
        log_level = logging.DEBUG if TS_DEBUG else logging.INFO
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh = logging.handlers.RotatingFileHandler('./spacy_'+TS_LOG, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5)
        fh.setLevel(log_level)
        fh.setFormatter(formatter)
        self.logger = logging.getLogger('sp_summarizer')
        self.logger.handlers = []  # drop stale handlers on re-instantiation
        self.logger.addHandler(fh)

    def set_summarizer(self, spacy_summ):
        """Attach the shared (expensive to build) LsaSummarizer instance."""
        self.sumr = spacy_summ

    def summarize(self, msgs, range_spec=None):
        """Return a summary of the text
        TODO: 1. Looks like spacy is not getting the main sentence from the message.
              2. Load times for the spacy summarizer won't cut it. Commenting out now
                 until this can be fixed
        """
        size = range_spec['size'] if range_spec and 'size' in range_spec else 3
        if not msgs or len(msgs) == 0:
            self.logger.warning("No messages to form summary")
            return u"\n Unable to form summary here.\n"
        txt = range_spec['txt'] if range_spec else u'Summary is'
        if range_spec:
            self.logger.info("First 10 messages %s of %s", msgs[:10], len(msgs))
            self.logger.info("Using time range spec %s", range_spec)
            # FIX: was time.strptime(...), but this module never imports
            # 'time', and a struct_time cannot be added to the timedelta
            # below; datetime.strptime returns the datetime the arithmetic
            # requires (datetime is imported at module top).
            start_time = datetime.strptime(range_spec['start'], "%B %d %Y") if 'start' in range_spec else ts_to_time(min(msgs, key=lambda m: m['ts'])['ts'])
            self.logger.info("Start time is %s", start_time)
            delt = tspec_to_delta(**range_spec)
            end_time = start_time + delt
            self.logger.info("End time is %s", end_time)
            # Keep only messages inside [start_time, end_time].
            msgs = [msg for msg in msgs if ts_to_time(msg['ts']) >= start_time and ts_to_time(msg['ts']) <= end_time]
            self.logger.info("First 10 messages %s of %s", msgs[:10], len(msgs))
        summ = txt + u' '
        # Canonicalized text -> originating message, ordered longest-first
        # by word count. (Removed two dead locals the original computed and
        # never used: summ_list and the pre-joined simple_sum string.)
        can_dict = {canonicalize(get_msg_text(msg)) : msg for msg in msgs}
        top_keys = sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True)
        can_dict = {key: can_dict[key] for key in top_keys}
        self.logger.info("Length of can_dict is %s", len(can_dict))
        simple_sum_list = [can_dict[ss] for ss in top_keys[:size]]
        assert(len(simple_sum_list) <= size)
        if len(msgs) < 10:
            # Too few messages for LSA to help: just show the longest
            # messages in chronological order.
            summ += u'\n'.join([self.tagged_sum(ss) for ss in sorted(simple_sum_list, key=lambda x: x['ts'])])
        else:
            max_sents = {}   # longest sentence text -> originating message
            user_sents = {}  # longest sentence text -> author user id ('' if none)
            for (can_txt, msg) in can_dict.items():
                if len(can_txt.split()) > 3:
                    sl = list(self.sumr.nlp(can_txt).sents)
                    longest = max(sl, key=lambda x: len(x)).text
                    max_sents[longest] = msg
                    user_sents[longest] = msg['user'] if 'user' in msg else u''
            txt_sum = [v for v in self.sumr(u' '.join(max_sents.keys()), size, user_sents)]
            self.logger.info("Canonical keys are \n%s", u' '.join(can_dict.keys()))
            self.logger.info("Spacy summ %s", txt_sum)
            # NOTE(review): nlp_summ is accumulated but never returned or
            # logged; preserved in case callers rely on the tagged_sum calls.
            nlp_summ = u'\n'.join([self.tagged_sum(max_sents[ss]) for ss in txt_sum if len(ss) > 1 and ss in max_sents])
            nlp_list = [max_sents[ss] for ss in txt_sum if len(ss) > 1 and ss in max_sents]
            for ss in txt_sum:
                if ss not in max_sents and len(ss.split()) > 5:
                    self.logger.info("Searching for: %s", ss)
                    for (ky, msg) in max_sents.items():
                        # NOTE(review): 'or' binds looser than 'and', so the
                        # nlp_list size guard only applies to the second arm;
                        # possibly unintended — preserved as-is.
                        if ss in ky or (len(ky.split()) > 10 and ky in ss) and len(nlp_list) <= size:
                            nlp_summ += u'\n' + self.tagged_sum(msg)
                            nlp_list.append(msg)
            if len(nlp_list) < 2:
                # LSA produced too little: fall back to the longest messages.
                self.logger.info("Failed to find nlp summary using heuristic")
                summ += u'\n'.join([self.tagged_sum(ss) for ss in sorted(simple_sum_list, key=lambda x: x['ts'])])
            else:
                self.logger.info("First msg is %s, %s", nlp_list[0], nlp_list[0]['ts'])
                self.logger.info("Sorted is %s", sorted(nlp_list, key=lambda x: x['ts']))
                summ += u'\n'.join([self.tagged_sum(ss) for ss in sorted(nlp_list, key=lambda x: x['ts'])])
        self.logger.info("Summary for segment %s is %s", msgs, summ)
        return summ

    def parify_text(self, msg_segment):
        """Join the scrubbed text of *msg_segment* into one paragraph string."""
        ptext = u'. '.join([SpacyTsSummarizer.flrg.sub(u'', msg['text']) for msg in msg_segment if 'text' in msg])
        self.logger.debug("Parified text is %s", ptext)
        return ptext
102 |
def main():
    """Demo driver: summarize every ./data/*.json message dump over two
    example intervals (first 30 minutes, then the next 36 hours)."""
    specs = [{'minutes': 30, 'txt' : u'Summary for first 30 minutes:\n', 'size' : 2},
             {'hours': 36, 'txt' : u'Summary for next 36 hours:\n', 'size': 3}]
    logger = logging.getLogger(__name__)
    summarizer = SpacyTsSummarizer()
    messages = []
    for path in glob.glob('./data/*.json'):
        with io.open(path, encoding='utf-8') as fh:
            messages += json.load(fh)
    for spec in specs:
        logger.info(summarizer.summarize(messages, range_spec=spec))

if __name__ == '__main__':
    main()
116 |
--------------------------------------------------------------------------------
/test-events.json:
--------------------------------------------------------------------------------
1 | {"messages": [
2 | {
3 | "type": "message",
4 | "user": "U029LMSEC",
5 | "text": "i\u2019m wondering if in the future we would like some kind of heatmap option for homepage, like\n\n `wpcom_homepage_link_click` with properties: signup_top, signup_bottom, xyz, abc, etc",
6 | "ts": "1441909889.000130"
7 | },
8 | {
9 | "type": "message",
10 | "user": "U0EBEC5T5",
11 | "text": "because i imagine the places we link people will vary quite a bit with tests",
12 | "ts": "1441909928.000131"
13 | },
14 | {
15 | "type": "message",
16 | "user": "U029LMSEC",
17 | "text": "> If there are 2 buttons on the page going to the same link, you could differentiate them by putting in a query parameter to the url",
18 | "ts": "1441910041.000132"
19 | },
20 | {
21 | "type": "message",
22 | "user": "U029LMSEC",
23 | "text": "wondering why that is better than adding a tracks event",
24 | "ts": "1441910059.000133"
25 | },
26 | {
27 | "type": "message",
28 | "user": "U03CGFPKV",
29 | "text": "once we have user properties, we should be able to tell if a specific user is a paid user at a given point in time",
30 | "ts": "1441925382.000186"
31 | },
32 | {
33 | "type": "message",
34 | "user": "U03CGFPKV",
35 | "text": "but we can't really do that efficiently yet",
36 | "ts": "1441925388.000187"
37 | },
38 | {
39 | "type": "message",
40 | "user": "U029LMSEC",
41 | "text": "perf. understood",
42 | "ts": "1441925394.000188"
43 | },
44 | {
45 | "type": "message",
46 | "user": "U029LMSEC",
47 | "text": "thanks for the debrief",
48 | "ts": "1441925398.000189"
49 | }
50 | ]}
51 |
--------------------------------------------------------------------------------
/test_hypothesis_summarizer.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import json
3 | import io
4 | from ts_summarizer import (TextRankTsSummarizer)
5 | from interval_summarizer import (IntervalSpec, TsSummarizer,
6 | ts_to_time)
7 | from datetime import datetime
8 | import logging
9 | import sys
10 | import config
11 | from ts_config import DEBUG
12 | from hypothesis import given
13 | from hypothesis.strategies import (sampled_from, lists, just, integers)
14 | import glob
15 | import random
# Module-level fixtures: root logger plus the canned Slack message dumps the
# hypothesis strategies below sample from.
logger = logging.getLogger()
# Use the public setLevel API instead of poking the .level attribute.
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
# Context managers so the fixture files are closed instead of leaking the
# handles returned by io.open for the life of the test run.
with io.open("./test-events.json", encoding='utf-8') as _fixture:
    test_json_msgs = json.load(_fixture)['messages']
with io.open("./data/test-events-elastic.json", encoding='utf-8') as _fixture:
    test_json_msgs_c2 = json.load(_fixture)['messages']
test_json_msgs_c3 = []
21 |
def read_dir(fdir):
    """Load and concatenate every JSON log under ./data/slack-logs-2/<fdir>.

    Returns an empty list when the directory is missing or holds no .json
    files, so absent fixture data degrades gracefully.
    """
    coll = []
    for jfile in glob.glob('./data/slack-logs-2/{}/*.json'.format(fdir)):
        # Close each file promptly instead of leaking the handle from
        # json.load(io.open(...)).
        with io.open(jfile, encoding='utf-8') as jf:
            coll += json.load(jf)
    return coll

test_json_msgs_c3 = [(fdir, read_dir(fdir)) for fdir in ['api-test', 'calypso', 'games', 'happiness', 'hg', 'jetpack', 'jetpackfuel', 'livechat', 'tickets', 'vip']]

# print as a function call so the module also parses under Python 3.
print(len(test_json_msgs_c3))
31 |
class TestSummarize(unittest.TestCase):
    """Property-based tests for TextRankTsSummarizer over three datasets.

    ds1 = ./test-events.json, ds2 = ./data/test-events-elastic.json, and
    ds3 = per-channel logs under ./data/slack-logs-2 (tuples of
    (channel_name, messages); the lists are empty when that directory is
    absent).

    NOTE(review): `sumry` is the string returned by summarize(), so the
    `len(sumry) <= 3` assertions compare *character* counts against the
    requested summary size -- confirm that this is the intended check.
    """

    # Alias of the module-level fixture, kept for readability inside tests.
    test_msgs = test_json_msgs

    @given(
        lists(elements=sampled_from(test_json_msgs), min_size=3),
        integers(min_value=1, max_value=20)
    )
    def test_text_rank_summarization_ds1_days(self, smp_msgs, days):
        """Generate something for N day interval"""
        logger.info("Input is %s", smp_msgs)
        asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)}
        summ = TextRankTsSummarizer()
        summ.set_channel('elasticsearch')
        sumry = summ.summarize(smp_msgs, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        self.assertTrue(len(sumry) <= len(smp_msgs))
        # Each message in the summary must correspond to a message


    @given(
        lists(elements=sampled_from(test_json_msgs_c2), min_size=12),
        integers(min_value=1, max_value=20)
    )
    def test_text_rank_summarization_ds2_days(self, smp_msgs, days):
        """Generate something for N day interval"""
        logger.info("Input is %s", smp_msgs)
        asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)}
        summ = TextRankTsSummarizer()
        summ.set_channel('elasticsearch')
        sumry = summ.summarize(smp_msgs, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        self.assertTrue(len(sumry) <= len(smp_msgs))
        # Each message in the summary must correspond to a message


    @given(
        integers(min_value=1, max_value=1000),
        integers(min_value=1, max_value=20)
    )
    def test_text_rank_summarization_ds3_days(self, sampsize, days):
        """Generate something for N day interval"""
        # Picks a random channel and a random tail slice of its log;
        # `sampsize` itself is unused -- hypothesis only drives repetition.
        channel, ssamp = random.choice(test_json_msgs_c3)
        samp = ssamp[random.randint(1,len(ssamp)-2):]
        logger.info("Input is segment is %s", samp)
        asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)}
        summ = TextRankTsSummarizer()
        summ.set_channel(channel)
        sumry = summ.summarize(samp, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        #self.assertTrue(len(sumry) <= len(samp))
        # Each message in the summary must correspond to a message


    @given(lists(elements=sampled_from(test_json_msgs), min_size=1),
        integers(min_value=1, max_value=24)
    )
    def test_text_rank_summarization_ds1_hours(self, smp_msgs, hours):
        """Generate something for N hour intervals"""
        logger.info("Input is %s", smp_msgs)
        asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)}
        summ = TextRankTsSummarizer()
        summ.set_channel('elasticsearch')
        sumry = summ.summarize(smp_msgs, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        self.assertTrue(len(sumry) <= len(smp_msgs))
        # Each message in the summary must correspond to a message


    @given(lists(elements=sampled_from(test_json_msgs_c2), min_size=1),
        integers(min_value=1, max_value=24)
    )
    def test_text_rank_summarization_ds2_hours(self, smp_msgs, hours):
        """Generate something for N hour intervals"""
        logger.info("Input is %s", smp_msgs)
        asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)}
        summ = TextRankTsSummarizer()
        summ.set_channel('elasticsearch')
        sumry = summ.summarize(smp_msgs, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        self.assertTrue(len(sumry) <= len(smp_msgs))
        # Each message in the summary must correspond to a message


    @given(
        integers(min_value=2, max_value=1000),
        integers(min_value=1, max_value=24)
    )
    def test_text_rank_summarization_ds3_hours(self, sampsize, hours):
        """Generate something for N hour intervals"""
        channel, ssamp = random.choice(test_json_msgs_c3)
        samp = ssamp[random.randint(1,len(ssamp)-2):]
        logger.info("Input is segment is %s", samp)
        asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)}
        summ = TextRankTsSummarizer()
        summ.set_channel(channel)
        sumry = summ.summarize(samp, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        #self.assertTrue(len(sumry) <= len(samp))
        # Each message in the summary must correspond to a message


if __name__ == '__main__':
    unittest.main()
160 |
161 |
--------------------------------------------------------------------------------
/test_service_components.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import mock
3 | from mock import MagicMock, patch
4 | from slacker import Slacker
5 | import slacker
6 | import main
7 | from slack_summary import SlackRouter
8 | from requests import Response
9 | import config
10 | from ts_config import DEBUG, LOG_FILE
11 | import sys
12 | import logging
13 | import logging.handlers
14 | import json
15 | import io
16 |
class Test(unittest.TestCase):
    """End-to-end tests for the Flask /slack endpoint with a mocked Slack API.

    Each test patches slacker.Slacker so channel history comes from canned
    Response objects built in setUp, then POSTs a slash-command payload and
    checks only the HTTP status.

    NOTE(review): malformed commands are also expected to return 200 -- the
    service apparently reports errors in the response body, not the status
    code; confirm against main.py.
    """

    def setUp(self):
        # File-backed logger so test output survives the run.
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        log_level = logging.DEBUG if DEBUG else logging.INFO
        self.logger = logging.getLogger(__name__)
        self.fh = logging.handlers.RotatingFileHandler('./testing_'+LOG_FILE, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5,)
        self.fh.setLevel(log_level)
        self.fh.setFormatter(formatter)
        self.logger.handlers = []
        self.logger.addHandler(self.fh)
        # Canned Slack history payloads: a tiny single-message body and a
        # larger one loaded from the fixture file.
        self.expected = {u'has_more': True, u'messages': [{u'text': u'hmmm...',
            u'ts': u'1414028037.000317',
            u'type': u'message',
            u'user': u'U027LSDDA'}], u'ok': True}
        with io.open('./data/test-events-elastic.json', encoding='utf-8') as jf:
            self.larger_expected = json.load(jf)
        # Response objects get a .body attribute bolted on; the mocked
        # channels.history returns them directly.
        self.myresponse = Response()
        self.myresponse.body = self.expected
        self.myresponse.status_code = 200
        attrs = {'history.return_value': self.myresponse,}
        self.channel_mock = MagicMock(**attrs)
        self.large_response = Response()
        self.large_response.body = self.larger_expected
        self.large_response.status_code = 200
        attrs2 = {'history.return_value': self.large_response,}
        self.channel_mock2 = MagicMock(**attrs2)
        # Flask test client against the real app in testing mode.
        main.app.config['TESTING'] = True
        self.app = main.app.test_client()

    def tearDown(self):
        pass

    @mock.patch('slacker.Slacker')
    def test_summary(self, mock_slack):
        # SlackRouter should hand back the mocked history response verbatim.
        mock_slack.return_value.channels = self.channel_mock
        sr = SlackRouter()
        self.assertTrue(sr.get_response('elasticsearch') == self.myresponse)

    @mock.patch('slacker.Slacker')
    def test_service(self, mock_slack):
        # Well-formed command with a small history payload.
        mock_slack.return_value.channels = self.channel_mock
        rv = self.app.post('/slack', data=dict(
            channel_id='elasticsearch',
            channel_name='elasticsearch',
            user_id='user123',
            user_name='bob',
            text='-5 days @bob'
        ), follow_redirects=True)
        self.logger.handlers = []
        self.logger.addHandler(self.fh)
        self.logger.info("Response is %s", rv.data)
        self.assertTrue(rv.status_code == 200)

    @mock.patch('slacker.Slacker')
    def test_service_lr(self, mock_slack):
        # Well-formed command against the larger fixture payload.
        mock_slack.return_value.channels = self.channel_mock2
        rv = self.app.post('/slack', data=dict(
            channel_id='elasticsearch',
            channel_name='elasticsearch',
            user_id='user123456',
            user_name='bob2',
            text='-2 days @bob'
        ), follow_redirects=True)
        self.logger.handlers = []
        self.logger.addHandler(self.fh)
        self.logger.info("Response is %s", rv.data)
        self.assertTrue(rv.status_code == 200)

    @mock.patch('slacker.Slacker')
    def test_service_no_command(self, mock_slack):
        # Empty command text must still yield a 200.
        mock_slack.return_value.channels = self.channel_mock2
        rv = self.app.post('/slack', data=dict(
            channel_id='elasticsearch',
            channel_name='elasticsearch',
            user_id='user123456',
            user_name='bob2',
            text=''
        ), follow_redirects=True)
        self.logger.handlers = []
        self.logger.addHandler(self.fh)
        self.logger.info("Response is %s", rv.data)
        self.assertTrue(rv.status_code == 200)

    @mock.patch('slacker.Slacker')
    def test_service_no_text(self, mock_slack):
        # Missing text field entirely.
        mock_slack.return_value.channels = self.channel_mock2
        rv = self.app.post('/slack', data=dict(
            channel_id='elasticsearch',
            channel_name='elasticsearch',
            user_id='user123456',
            user_name='bob2'
        ), follow_redirects=True)
        self.logger.handlers = []
        self.logger.addHandler(self.fh)
        self.logger.info("Response is %s", rv.data)
        self.assertTrue(rv.status_code == 200)

    @mock.patch('slacker.Slacker')
    def test_service_bad_text(self, mock_slack):
        # Garbage command text.
        mock_slack.return_value.channels = self.channel_mock2
        rv = self.app.post('/slack', data=dict(
            channel_id='elasticsearch',
            channel_name='elasticsearch',
            user_id='user123456',
            user_name='bob2',
            text='adjfalkjldkj adfajldkajflkjadh ndnakdjlkjlkjd'
        ), follow_redirects=True)
        self.logger.handlers = []
        self.logger.addHandler(self.fh)
        self.logger.info("Response is %s", rv.data)
        self.assertTrue(rv.status_code == 200)

    @mock.patch('slacker.Slacker')
    def test_service_bad_units(self, mock_slack):
        # Numeric prefix with nonsense units.
        mock_slack.return_value.channels = self.channel_mock2
        rv = self.app.post('/slack', data=dict(
            channel_id='elasticsearch',
            channel_name='elasticsearch',
            user_id='user123456',
            user_name='bob2',
            text='2 adjfalkjldkj adfajldkajflkjadh ndnakdjlkjlkjd'
        ), follow_redirects=True)
        self.logger.handlers = []
        self.logger.addHandler(self.fh)
        self.logger.info("Response is %s", rv.data)
        self.assertTrue(rv.status_code == 200)

    @mock.patch('slacker.Slacker')
    def test_gensim(self, mock_slack):
        # Explicitly selects the gensim summarizer backend.
        mock_slack.return_value.channels = self.channel_mock2
        rv = self.app.post('/slack', data=dict(
            channel_id='elasticsearch',
            channel_name='elasticsearch',
            user_id='user123456',
            user_name='bob2',
            text='2 days gensim'
        ), follow_redirects=True)
        self.logger.handlers = []
        self.logger.addHandler(self.fh)
        self.logger.info("Response is %s", rv.data)
        self.assertTrue(rv.status_code == 200)


if __name__ == '__main__':
    unittest.main()
162 |
--------------------------------------------------------------------------------
/test_spacy_with_hypothesis.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import json
3 | import io
4 | from sp_summarizer import (SpacyTsSummarizer)
5 | import hypothesis.settings as hs
6 | from interval_summarizer import (IntervalSpec, TsSummarizer,
7 | ts_to_time)
8 | import lsa
9 | from datetime import datetime
10 | import logging
11 | import sys
12 | import config
13 | from ts_config import DEBUG
14 | from hypothesis import given
15 | from hypothesis.strategies import (sampled_from, lists, just, integers)
16 | import glob
17 | import random
# Module-level fixtures: root logger plus the canned Slack message dumps the
# hypothesis strategies below sample from.
logger = logging.getLogger()
# Use the public setLevel API instead of poking the .level attribute.
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
# Context managers so the fixture files are closed instead of leaking the
# handles returned by io.open for the life of the test run.
with io.open("./test-events.json", encoding='utf-8') as _fixture:
    test_json_msgs = json.load(_fixture)['messages']
with io.open("./data/test-events-elastic.json", encoding='utf-8') as _fixture:
    test_json_msgs_c2 = json.load(_fixture)['messages']
test_json_msgs_c3 = []
23 |
def read_dir(fdir):
    """Load and concatenate every JSON log under ./data/slack-logs-2/<fdir>.

    Returns an empty list when the directory is missing or holds no .json
    files, so absent fixture data degrades gracefully.
    """
    coll = []
    for jfile in glob.glob('./data/slack-logs-2/{}/*.json'.format(fdir)):
        # Close each file promptly instead of leaking the handle from
        # json.load(io.open(...)).
        with io.open(jfile, encoding='utf-8') as jf:
            coll += json.load(jf)
    return coll

test_json_msgs_c3 = [(fdir, read_dir(fdir)) for fdir in ['api-test', 'calypso', 'games', 'happiness', 'hg', 'jetpack', 'jetpackfuel', 'livechat', 'tickets', 'vip']]
31 |
class TestSummarize(unittest.TestCase):
    """Property-based tests for SpacyTsSummarizer (LSA backend).

    A single shared summarizer is built once at class-creation time and
    reused across tests.  Only the non-emptiness of each summary is
    asserted; size/length checks are commented out.

    NOTE(review): passing settings=hs.Settings(timeout=1000) inside @given
    is legacy hypothesis API; modern hypothesis uses the @settings
    decorator -- confirm the pinned hypothesis version supports this.
    """

    # Shared fixtures/collaborators built at class-body execution time.
    test_msgs = test_json_msgs
    summ = SpacyTsSummarizer()
    summ.set_summarizer(lsa.LsaSummarizer())


    @given(
        lists(elements=sampled_from(test_json_msgs), min_size=3),
        integers(min_value=1, max_value=20), settings=hs.Settings(timeout=1000)
    )
    def test_text_rank_summarization_ds1_days(self, smp_msgs, days):
        """Generate something for N day interval"""
        logger.info("Input is %s", smp_msgs)
        asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)}
        #TestSummarize.summ.set_interval()
        TestSummarize.summ.set_channel('elasticsearch')
        sumry = TestSummarize.summ.summarize(smp_msgs, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        #self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        #self.assertTrue(len(sumry) <= len(smp_msgs))
        # Each message in the summary must correspond to a message


    @given(
        lists(elements=sampled_from(test_json_msgs_c2), min_size=12),
        integers(min_value=1, max_value=20), settings=hs.Settings(timeout=1000)
    )
    def test_text_rank_summarization_ds2_days(self, smp_msgs, days):
        """Generate something for N day interval"""
        logger.info("Input is %s", smp_msgs)
        asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)}
        #TestSummarize.summ.set_interval(asd)
        TestSummarize.summ.set_channel('elasticsearch')
        sumry = TestSummarize.summ.summarize(smp_msgs, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        #self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        #self.assertTrue(len(sumry) <= len(smp_msgs))
        # Each message in the summary must correspond to a message


    @given(
        integers(min_value=1, max_value=1000),
        integers(min_value=1, max_value=20), settings=hs.Settings(timeout=1000)
    )
    def test_text_rank_summarization_ds3_days(self, sampsize, days):
        """Generate something for N day interval"""
        # Random channel and random tail slice; `sampsize` only drives
        # hypothesis repetition.
        channel, ssamp = random.choice(test_json_msgs_c3)
        samp = ssamp[random.randint(1,len(ssamp)-2):]
        logger.info("Input is segment is %s", samp)
        asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)}
        #TestSummarize.summ.set_interval()
        TestSummarize.summ.set_channel(channel)
        sumry = TestSummarize.summ.summarize(samp, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        #self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        #self.assertTrue(len(sumry) <= len(samp))
        # Each message in the summary must correspond to a message


    @given(lists(elements=sampled_from(test_json_msgs), min_size=1),
        integers(min_value=1, max_value=24), settings=hs.Settings(timeout=1000)
    )
    def test_text_rank_summarization_ds1_hours(self, smp_msgs, hours):
        """Generate something for N hour intervals"""
        logger.info("Input is %s", smp_msgs)
        asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)}
        #TestSummarize.summ.set_interval()
        TestSummarize.summ.set_channel('elasticsearch')
        sumry = TestSummarize.summ.summarize(smp_msgs, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        #self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        #self.assertTrue(len(sumry) <= len(smp_msgs))
        # Each message in the summary must correspond to a message


    @given(lists(elements=sampled_from(test_json_msgs_c2), min_size=1),
        integers(min_value=1, max_value=24), settings=hs.Settings(timeout=1000)
    )
    def test_text_rank_summarization_ds2_hours(self, smp_msgs, hours):
        """Generate something for N hour intervals"""
        logger.info("Input is %s", smp_msgs)
        asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)}
        #TestSummarize.summ.set_interval()
        TestSummarize.summ.set_channel('elasticsearch')
        sumry = TestSummarize.summ.summarize(smp_msgs, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        #self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        #self.assertTrue(len(sumry) <= len(smp_msgs))
        # Each message in the summary must correspond to a message


    @given(
        integers(min_value=2, max_value=1000),
        integers(min_value=1, max_value=24), settings=hs.Settings(timeout=1000)
    )
    def test_text_rank_summarization_ds3_hours(self, sampsize, hours):
        """Generate something for N hour intervals"""
        channel, ssamp = random.choice(test_json_msgs_c3)
        samp = ssamp[random.randint(1,len(ssamp)-2):]
        TestSummarize.summ.set_channel(channel)
        logger.info("Input is segment is %s", samp)
        asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)}
        sumry = TestSummarize.summ.summarize(samp, range_spec=asd)
        logger.debug("Summary is %s", sumry)
        # Length of summary is at least 1 and no greater than 3
        self.assertTrue(len(sumry) >= 1)
        #self.assertTrue(len(sumry) <= 3)
        # Length of summary is less than or equal to the original length
        #self.assertTrue(len(sumry) <= len(samp))
        # Each message in the summary must correspond to a message


if __name__ == '__main__':
    unittest.main()
162 |
163 |
--------------------------------------------------------------------------------
/test_summarizer.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import json
3 | import io
4 | import config
5 | from ts_config import SUMMS
6 | from interval_summarizer import (IntervalSpec, TsSummarizer,
7 | ts_to_time)
8 | from datetime import datetime
9 | import logging
10 | import logging.handlers
11 | import sys
12 | from ts_config import DEBUG
13 | if "spacy" in SUMMS:
14 | from sp_summarizer import (SpacyTsSummarizer)
15 | import lsa
16 | if "gensim" in SUMMS:
17 | from ts_summarizer import (TextRankTsSummarizer)
18 |
# Root logger for the test module; use the public setLevel API instead of
# assigning to the .level attribute directly.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
21 |
class TestSummarize(unittest.TestCase):
    """Smoke tests for interval conversion, tagging, and both summarizers."""

    # Canned Slack events shared by every test in this class.
    test_msgs = json.load(io.open("./test-events.json", encoding='utf-8'))['messages']

    def test_interval_conversion(self):
        # A Slack ts string maps onto the corresponding UTC datetime.
        expected = datetime.utcfromtimestamp(1441925382)
        self.assertTrue(ts_to_time("1441925382.000186") == expected)

    def test_summarizer_tag_display(self):
        """Make sure that the display of the tag is correct"""
        logger.info("Running the taggger test")
        spec = {'minutes': 60, 'size' : 2, 'txt' : u'Summary for first 60 minutes:\n'}
        tagger = TsSummarizer()
        tagger.set_channel("elasticsearch")
        tagged = tagger.tagged_sum(TestSummarize.test_msgs[1])
        logger.debug("Test summ msg is %s", tagged)
        self.assertTrue(tagged == "@Thu-Sep-9-2015 18:32:08 <@U0EBEC5T5>: ")

    def test_gensim_summarization(self):
        """Pass the intervals to summarizer"""
        if "gensim" not in SUMMS:
            return
        specs = [{'minutes': 60, 'size' : 2, 'txt' : u'Summary for first 60 minutes:\n'}, {'hours':12, 'size' : 1, 'txt' : u'Summary for last 12 hours:\n'}]
        gens_summ = TextRankTsSummarizer()
        gens_summ.set_channel('elasticsearch')
        logger.debug("Testing gensim summarizer")
        result = gens_summ.summarize(TestSummarize.test_msgs, range_spec=specs)
        logger.debug("Summary is %s", result)
        self.assertTrue(len(result) > 1)

    def test_spacy_summarization(self):
        """Pass the intervals to summarizer"""
        if "spacy" not in SUMMS:
            return
        specs = [{'minutes': 60, 'size' : 2, 'txt' : u'Summary for first 60 minutes:\n'}, {'hours':12, 'size' : 1, 'txt' : u'Summary for last 12 hours:\n'}]
        lsa_summ = lsa.LsaSummarizer()
        spacy_summ = SpacyTsSummarizer()
        for spec in specs:
            spacy_summ.set_summarizer(lsa_summ)
            spacy_summ.set_channel('elasticsearch')
            logger.debug("Testing spacy summarizer")
            result = spacy_summ.summarize(TestSummarize.test_msgs, range_spec=spec)
            logger.debug("Summary is %s, length %s", result, len(result))
            self.assertTrue(len(result) > 1)


if __name__ == '__main__':
    unittest.main()
75 |
76 |
--------------------------------------------------------------------------------
/ts_config.py:
--------------------------------------------------------------------------------
# Default summarization windows: one spec dict per window, each giving the
# span ('days'/'hours'/'minutes') and the summary 'size'.
SUMMARY_INTERVALS = [{'days': 5, 'size': 2}, ]
# Verbose logging for the summarizer modules (ts_summarizer, sp_summarizer).
TS_DEBUG = True
# Rotating log file suffix used by the summarizer classes.
TS_LOG = "ts_summ.log"
# Verbose logging for the web service and tests.
DEBUG=True
# Service log file name (tests prefix it with 'testing_').
LOG_FILE="summary.log"
# Canned message fixture used by the service tests.
TEST_JSON="./data/test-events-elastic.json"
# Enabled summarizer backends; recognized values are "spacy" and "gensim".
SUMMS=["spacy"]
8 |
9 |
10 |
--------------------------------------------------------------------------------
/ts_summarizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from collections import namedtuple
3 | from datetime import (timedelta, datetime)
4 | import re
5 | import logging
6 | import logging.handlers
7 | import sys
8 | import json
9 | import io
10 | import lsa
11 | import utils
12 | import base_summarizer
13 | import compat
14 | from gensim.summarization import summarize as gs_sumrz
15 | from gensim.summarization.textcleaner import split_sentences
16 | from gensim.models.word2vec import LineSentence
17 | from ts_config import TS_DEBUG, TS_LOG
18 | import glob
19 | from interval_summarizer import (IntervalSpec, TsSummarizer,
20 | ts_to_time)
21 | from utils import get_msg_text
22 | logging.basicConfig(level=logging.INFO)
23 |
class TextRankTsSummarizer(TsSummarizer):
    """Summarizer that ranks candidate sentences with gensim's TextRank.

    Falls back to a "simple" summary (tagging the longest canonical
    messages) whenever the segment is too small or too short-worded for
    TextRank to produce anything useful.
    """

    def __init__(self, ):
        TsSummarizer.__init__(self, )
        # Dedicated rotating log file so summarizer chatter does not flood
        # the service log.
        log_level = logging.DEBUG if TS_DEBUG else logging.INFO
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh = logging.handlers.RotatingFileHandler('./text_rank_'+TS_LOG, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5)
        fh.setLevel(log_level)
        fh.setFormatter(formatter)
        self.logger = logging.getLogger('ts_summarizer')
        self.logger.handlers = []
        self.logger.addHandler(fh)

    def set_summarizer(self, val):
        """No-op: TextRank needs no external engine; kept so callers can
        treat all summarizer classes uniformly."""
        pass

    def summarize(self, msgs, range_spec=None):
        """Return a summary string for a list of Slack message dicts.

        msgs -- messages to summarize (dicts with text/attachments/ts)
        range_spec -- optional dict; 'txt' is the summary header and
                      'size' caps the number of ranked sentences (default 3)
        """
        if not msgs:
            self.logger.warning("No messages to form summary")
            return u"\n Unable to form summary here.\n"
        txt = range_spec['txt'] if range_spec else u'Summary is'
        size = range_spec['size'] if range_spec and 'size' in range_spec else 3
        summ = txt + u' '
        # Canonicalize each message and keep only the 300 longest distinct
        # texts to bound the TextRank input.
        can_dict = {canonicalize(get_msg_text(msg)) : msg for msg in msgs}
        top_keys = sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True)[:300]
        can_dict = {key: can_dict[key] for key in top_keys}
        self.logger.info("Length of can_dict is %s", len(can_dict))
        # Fallback: tag the three longest canonical messages.
        simple_sum = u'\n'.join([self.tagged_sum(can_dict[ss]) for ss in sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True)[:3]])
        # If the number of messages or vocabulary is too low, just use the
        # promising-messages fallback.
        if len(msgs) < 11 or len(can_dict) < 11:
            self.logger.warning("Too few messages for NLP.")
            summ += simple_sum
        else:
            # Map each (100-word-capped) sentence back to its source message.
            max_sents = {}
            for (txt, msg) in can_dict.items():
                if len(txt.split()) > 3:
                    # Use the same splitting that gensim does
                    for snt in split_sentences(txt):
                        if len(snt.split()) > 100:
                            snt = u' '.join(snt.split()[:100])
                        max_sents[snt] = msg
            if not max_sents:
                # Bug fix: when every candidate text is <= 3 words the ratio
                # below divided by zero.  Fall back to the simple summary
                # instead of crashing.
                self.logger.warning("No sentences suitable for NLP; using simple summary")
                summ += simple_sum
                self.logger.info("Summary for segment %s is %s", msgs, summ)
                return summ
            ratio = (size * 2)/ float(len(max_sents.keys()))
            sent1 = u' '.join(can_dict.keys())
            sent2 = u' '.join(max_sents.keys())
            gn_sum = gs_sumrz(sent1, ratio=ratio, split=True)[:size]
            mx_sum = gs_sumrz(sent2, ratio=ratio, split=True)[:size]
            self.logger.info("Gensim sum %s", gn_sum)
            gs_summ = u'\n'.join([self.tagged_sum(can_dict[ss] if ss in can_dict else max_sents[ss]) for ss in gn_sum if len(ss) > 1 and (ss in max_sents or ss in can_dict)])
            # Ranked sentences can come back as fragments; recover the
            # owning message by substring matching.
            for ss in mx_sum:
                if ss not in max_sents and ss not in can_dict and len(ss.split()) > 5:
                    self.logger.info("Searching for: %s", ss)
                    for (ky, msg) in max_sents.items():
                        if ss in ky or (len(ky.split()) > 10 and ky in ss):
                            gs_summ += u'\n' + self.tagged_sum(msg)
            if len(gn_sum) > 1:
                summ += gs_summ
            else:
                self.logger.warning("NLP Summarizer produced null output %s", gs_summ)
                summ += simple_sum
        self.logger.info("Summary for segment %s is %s", msgs, summ)
        return summ

    def parify_text(self, msg_segment):
        """Join a message segment into a single '. '-separated paragraph."""
        ptext = u'. '.join([TextRankTsSummarizer.flrg.sub(u'', get_msg_text(msg)) for msg in msg_segment])
        self.logger.debug("Parified text is %s", ptext)
        return ptext
99 |
def canonicalize(txt):
    """Normalize one message text: strip filtered tokens, guarantee terminal
    punctuation, and cap the result at 100 words."""
    cleaned = TsSummarizer.flrg.sub(u'', txt)
    # Match against the pre-stripped text (trailing whitespace is tolerated
    # by the pattern), then emit the stripped form.
    if re.match(r'.*[\.\?\!]\s*$', cleaned):
        normalized = cleaned.strip()
    else:
        normalized = u'{}.'.format(cleaned.strip())
    words = normalized.split()
    if len(words) < 100:
        return normalized
    return u' '.join(words[:100])
106 |
def main():
    """Demo driver: load every JSON log under ./data and log two summaries."""
    asd = [{'minutes': 30, 'txt' : u'Summary for first 30 minutes:\n', 'size' : 2}, {'hours':36, 'txt' : u'Summary for next 36 hours:\n', 'size': 3}]
    logger = logging.getLogger(__name__)
    # Bug fix: __init__ takes no arguments; passing the range specs here
    # raised TypeError.  They are supplied per call via range_spec below.
    tr_summ = TextRankTsSummarizer()
    all_msgs = []
    for msg_file in glob.glob('./data/*.json'):
        with io.open(msg_file, encoding='utf-8',) as mf:
            all_msgs += json.load(mf)
    for filt in asd:
        logger.info(tr_summ.summarize(all_msgs, range_spec=filt))

if __name__ == '__main__':
    main()
120 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division, print_function, unicode_literals
5 |
class ItemsCount(object):
    """Callable that truncates a sequence to an absolute or percentage count.

    The configured value may be an int/float (absolute count), a numeric
    string ("3"), or a percentage string ("20%") of the sequence length.
    """

    def __init__(self, value):
        self._value = value
        # `unicode` only exists on Python 2; fall back to str-only on
        # Python 3 so the class works on both.
        try:
            self.string_types = (str, unicode)
        except NameError:
            self.string_types = (str,)

    def __call__(self, sequence):
        """Return the leading slice of `sequence` described by the count.

        Raises:
            ValueError: if the configured value is neither a string nor a
                number.
        """
        if isinstance(self._value, self.string_types):
            if self._value.endswith("%"):
                total_count = len(sequence)
                percentage = int(self._value[:-1])
                # at least one sentence should be chosen
                count = max(1, total_count*percentage // 100)
                return sequence[:count]
            else:
                return sequence[:int(self._value)]
        elif isinstance(self._value, (int, float)):
            return sequence[:int(self._value)]
        else:
            # Bug fix: the exception was constructed but never raised, so
            # unsupported values silently returned None.
            raise ValueError("Unsuported value of items count '%s'." % self._value)

    def __repr__(self):
        # Bug fix: previous version called an undefined to_string() with an
        # empty format string, which always raised.
        return "<ItemsCount: %r>" % self._value
28 |
def maybe_get(cont, key, default=None):
    """Return cont[key] when `key in cont` holds, otherwise `default`."""
    if key in cont:
        return cont[key]
    return default
31 |
def get_msg_text(msg):
    """Pull the appropriate text from the message.

    Prefers the message's own non-empty 'text'; otherwise falls back to the
    longer of the first attachment's 'title'/'text'.  Returns u"" when no
    usable text exists.
    """
    if 'text' in msg and len(msg['text']) > 0:
        return msg['text']
    if 'attachments' in msg:
        ats = msg['attachments']
        if len(ats) > 0:
            at = ats[0]
            att_text = []
            if 'title' in at:
                att_text.append(at['title'])
            if 'text' in at:
                att_text.append(at['text'])
            # Bug fix: max() on an empty list raised ValueError when the
            # first attachment carried neither 'title' nor 'text'.
            if att_text:
                max_text = max(att_text, key=lambda txt: len(txt))
                if len(max_text) > 0:
                    return max_text
    return u""
49 |
50 |
--------------------------------------------------------------------------------