├── .gitignore
├── Makefile
├── Procfile
├── README.md
├── base_summarizer.py
├── compat.py
├── data
│   ├── test-events-channel2.json
│   └── test-events-elastic.json
├── img
│   ├── hackathon-discussion.png
│   └── meeting-discussion.png
├── interval_summarizer.py
├── lsa.py
├── main.py
├── requirements.txt
├── slack_summary.py
├── sp_summarizer.py
├── test-events.json
├── test_hypothesis_summarizer.py
├── test_service_components.py
├── test_spacy_with_hypothesis.py
├── test_summarizer.py
├── ts_config.py
├── ts_summarizer.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | config.py
2 | *.pyc
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | platform=$(shell uname -s)
2 | conda_path=$(shell which conda)
3 |
4 | .PHONY: show check-env venv check run hyp spacy_hyp deps clean notebook
5 |
6 |
7 | ifeq ($(platform),Darwin)
8 |
9 | ifneq ($(findstring conda,$(conda_path)),conda)
10 | $(error Conda not present)
11 | else
12 | $(info Conda present at $(conda_path))
13 | endif
14 |
15 | ifeq ($(SUMMARIZE_VENV),)
16 | SUMMARIZE_VENV=summarize_venv2
17 | endif
18 | ifeq ($(CONDA_ENV_PATH),)
19 | CONDA_ENV_PATH=//anaconda
20 | endif
21 |
22 | HOST_IP?=10.0.0.10
23 | NB_PORT?=8887
24 |
25 | PYLIBS := numpy scipy scikit-learn gensim spacy flask
26 | VENVDIR := $(CONDA_ENV_PATH)/envs/$(SUMMARIZE_VENV)
27 |
28 | $(VENVDIR):
29 | 	test -d $(VENVDIR) || conda create -y -n $(SUMMARIZE_VENV) $(PYLIBS)
30 |
31 | deps: $(VENVDIR)
32 |
33 | check: $(VENVDIR)
34 | 	source activate $(SUMMARIZE_VENV);\
35 | 	python ./test_summarizer.py;\
36 | 	python ./test_service_components.py
37 |
38 | run: $(VENVDIR)
39 | 	source activate $(SUMMARIZE_VENV);\
40 | 	python ./ts_summarizer.py
41 |
42 | else ifeq ($(platform),Linux)
43 |
44 | VENVDIR := ./venv
45 | PYVENV := $(VENVDIR)/bin/python
46 | NBVENV := $(VENVDIR)/bin/ipython
47 | PIPVENV := $(VENVDIR)/bin/pip
48 |
49 | clean:
50 | 	rm -r $(VENVDIR)
51 |
52 | check: | $(VENVDIR)
53 | 	$(PYVENV) ./test_summarizer.py;\
54 | 	$(PYVENV) ./test_service_components.py
55 |
56 | hyp: | $(VENVDIR)
57 | 	$(PYVENV) ./test_hypothesis_summarizer.py
58 |
59 | spacy_hyp: | $(VENVDIR)
60 | 	$(PYVENV) ./test_spacy_with_hypothesis.py
61 |
62 | run: | $(VENVDIR)
63 | 	$(PYVENV) ./ts_summarizer.py
64 |
65 | notebook: | $(VENVDIR)
66 | 	$(NBVENV) notebook --ip=$(HOST_IP) --port=$(NB_PORT) --no-browser
67 |
68 | $(VENVDIR):
69 | 	test -d $(VENVDIR) || (virtualenv $(VENVDIR);\
70 | 	$(PIPVENV) install -r ./requirements.txt;\
71 | 	$(PYVENV) -m spacy.en.download all)
72 |
73 | else
74 | $(error Unknown platform)
75 |
76 | endif
77 |
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: python main.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Slack Summary
2 |
3 | Summarize It is a chat-summarizer plugin for instant messaging applications. It condenses long chat logs so that users can quickly understand the current context of a conversation. Currently, Summarize It runs on top of Slack as a plugin.
4 |
5 | The original relied on an HP Cloud concept-extraction API. We've removed that dependency to avoid relying on third-party APIs, and we plan to improve the summarizations.
6 |
7 | ## Installing the Summarize It plugin for your Slack
8 |
9 | To install the summary package:
10 |
11 | First, create a token for your team at `https://api.slack.com/web`
12 |
13 |     pip install flask requests slacker wsgiref jupyter mock pbr spacy numpy
14 |
15 | Then run
16 |
17 |     python -m spacy.en.download all
18 |
19 | Edit the `config.py` file so that it includes the lines
20 |
21 |     keys = {
22 |         "slack": "your-token-here"
23 |     }
24 |
25 | Then edit the `ts_config.py` file to adjust the debugging options
26 |
27 |     SUMMARY_INTERVALS = [{'minutes': 10, 'size': 1}, {'hours': 12, 'size': 2}]
28 |     TS_DEBUG = True
29 |     TS_LOG = "./ts_summ.log"
30 |     DEBUG = True
31 |     LOG_FILE = "./summary.log"
32 |
33 | Here `LOG_FILE` is the path where notices of users accessing the server are
34 | stored, and the value of `DEBUG` determines whether detailed logging is enabled.
35 |
36 | The plugin is executed by running
37 |
38 |     python main.py
39 |
40 |
41 | Tests are currently set up to run in a Python `virtualenv`. They are executed by
42 | running
43 |
44 |     make check
45 |
46 | but note that the tests will install and run inside a virtualenv.
47 |
48 |
49 | To complete the installation
50 |
51 | 1. Visit `https://.slack.com/services/new/slash-commands`
52 |
53 | 2. Enter the command name you wish to use
54 |
55 | 3. Enter the request URL as `/slack`
56 |
57 | ## Using the Summarize It plugin with Slack
58 |
59 | Let's assume that the plugin is named `summary`. The plugin supports a small
60 | command-line syntax which allows you to specify how far back in time to look.
61 | Currently this can be specified in `minutes, days, or weeks`. Keyword search is
62 | coming soon.
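As an illustration of the window syntax above, here is a minimal sketch of how a spec such as `5 days` could be turned into a `datetime.timedelta`. The `parse_window` helper is hypothetical, for illustration only; it is not the plugin's actual parser.

```python
from datetime import timedelta

# Hypothetical helper (not the plugin's actual code): parse a look-back
# window spec such as "5 days" or "2 weeks" into a timedelta.
ALLOWED_UNITS = {"minutes", "days", "weeks"}

def parse_window(spec):
    """Turn a spec such as '5 days' into timedelta(days=5)."""
    count, unit = spec.split()
    if not unit.endswith("s"):  # accept singular forms such as "1 day"
        unit += "s"
    if unit not in ALLOWED_UNITS:
        raise ValueError("unit must be one of: minutes, days, weeks")
    return timedelta(**{unit: int(count)})
```

For example, `parse_window("2 weeks")` yields `timedelta(weeks=2)`, which can then be subtracted from the current time to bound the Slack history query.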
63 |
64 | So to get the key messages from the last 5 days:
65 |
66 |     /summary 5 days
67 |
68 | Or to get a summary of the important messages over the last two weeks:
69 |
70 |     /summary 2 weeks
71 |
72 |
73 | ## Screenshots
74 |
75 | #### Hackathon Discussion
76 | ![Hackathon Discussion](img/hackathon-discussion.png)
77 |
78 | #### Meeting Discussion
79 | ![Meeting Discussion](img/meeting-discussion.png)
80 |
81 | ## Authors and Contributors
82 | Yask Srivastava (Developer), [Ketan Bhatt](https://github.com/ketanbhatt) (Developer), [Pranu Sarna](https://github.com/psarna94) (Developer) and [Vinayak Mehta](https://github.com/vortex-ape) (Data Scientist).
83 |
84 | ## Support or Contact
85 | Having trouble with Summarize It? Create an issue in the project's GitHub repository.
86 |
--------------------------------------------------------------------------------
/base_summarizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division, print_function, unicode_literals
5 |
6 | from collections import namedtuple
7 | from operator import attrgetter
8 | from utils import ItemsCount
9 | import logging
10 | logging.basicConfig(level=logging.INFO)
11 |
12 | SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rating",))
13 |
14 | class BaseSummarizer(object):
15 |     def __init__(self):
16 |         self.logger = logging.getLogger(__name__)
17 |
18 |     def __call__(self, document, sentences_count):
19 |         raise NotImplementedError("This method should be overridden in a subclass")
20 |
21 |     def normalize_word(self, word):
22 |         return word.lower()
23 |
24 |     def _get_best_sentences(self, sentences, count, rating, *args, **kwargs):
25 |         rate = rating
26 |         self.logger.info("Sentences are %s", sentences)
27 |
28 |         infos = (SentenceInfo(s, o, rate(s, *args, **kwargs))
29 |                  for o, s in enumerate(sentences))
30 |         # sort sentences by rating in descending
order 31 | infos = sorted(infos, key=attrgetter("rating"), reverse=True) 32 | # get `count` first best rated sentences 33 | count = ItemsCount(count) 34 | # if not isinstance(count, ItemsCount): 35 | # count = ItemsCount(count) 36 | infos = count(infos) 37 | # sort sentences by their order in document 38 | infos = sorted(infos, key=attrgetter("order")) 39 | 40 | return tuple(i.sentence for i in infos) 41 | -------------------------------------------------------------------------------- /compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division, print_function, unicode_literals 5 | 6 | from sys import version_info 7 | 8 | 9 | PY3 = version_info[0] == 3 10 | 11 | if PY3: 12 | bytes = bytes 13 | unicode = str 14 | else: 15 | bytes = str 16 | unicode = unicode 17 | string_types = (bytes, unicode,) 18 | -------------------------------------------------------------------------------- /data/test-events-channel2.json: -------------------------------------------------------------------------------- 1 | {"messages": [{u'text': u'thank you', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445970190.000167'}, {u'text': u'I will do that. Thanks.', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445970177.000166'}, {u'text': u'i think this `&clearmemcache=brisket` trick still works , so the correct number appears to be 130. could you please post on with this issue?', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445970158.000165'}, {u'text': u'Sorry, not 135. 
Just the 145 and 130.', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969964.000164'}, {u'text': u'', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969663.000162', u'attachments': [{u'thumb_height': 172, u'image_bytes': 25818, u'thumb_width': 288, u'title': u'Screen Shot', u'service_name': u'Cloudup', u'image_width': 288, u'author_name': u'Chrissie Pollock', u'title_link': u'https://cloudup.com/cA3JBxJRVUX', u'image_height': 172, u'service_url': u'https://cloudup.com', u'id': 1, u'image_url': u'https://cldup.com/srhnwNU4Yf-3000x3000.png', u'fallback': u'Cloudup Photo: Screen Shot', u'thumb_url': u'https://cldup.com/srhnwNU4Yf-3000x3000.png', u'from_url': u'https://cloudup.com/cA3JBxJRVUX'}]}, {u'text': u'', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969632.000160', u'attachments': [{u'thumb_height': 34, u'image_bytes': 1809, u'thumb_width': 50, u'title': u'Screen Shot', u'service_name': u'Cloudup', u'image_width': 50, u'author_name': u'Chrissie Pollock', u'title_link': u'https://cloudup.com/cZJMWS6ikXx', u'image_height': 34, u'service_url': u'https://cloudup.com', u'id': 1, u'image_url': u'https://cldup.com/IJiJlHtWPr-50x50.png', u'fallback': u'Cloudup Photo: Screen Shot', u'thumb_url': u'https://cldup.com/IJiJlHtWPr-50x50.png', u'from_url': u'https://cloudup.com/cZJMWS6ikXx'}]}, {u'subtype': u'channel_leave', u'type': u'message', u'user': u'U029KGD6U', u'ts': u'1445969621.000159', u'text': u'<@U029KGD6U|andrewspittle> has left the channel'}, {u'text': u'k. Thanks. I have a mobile support user whose Best Views Ever stats keep going down. I have screenshots showing 145 then 135 and now it\u2019s 130. All for the same day in Sept. ', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969589.000158'}, {u'text': u"hi <@U02FKB7KR> i'm in a hangout. what's the question, i can try to answer async. 
i might ask you to post on a datap2 if it's a tricky question :simple_smile:", u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445969535.000157'}, {u'text': u'Hello! Can someone help me with a stats question?', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969496.000156'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U02FKB7KR', u'ts': u'1445969470.000155', u'text': u'<@U02FKB7KR|chrissiepollock> has joined the channel'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U02CJ23LK', u'ts': u'1445918013.000154', u'text': u'<@U02CJ23LK|hafiz> has joined the channel'}, {u'text': u'<@U029DBR0N> I\u2019m not sure if you\u2019ve seen that ^^^, but it is relevant to the Guided Transfer part of the question.', u'type': u'message', u'user': u'U029BJ4TH', u'ts': u'1445899059.000153'}, {u'text': u'', u'type': u'message', u'user': u'U029BJ4TH', u'ts': u'1445898870.000152'}, {u'text': u'I\u2019m headed off, Joe, but I\u2019m interested and happy to help :simple_smile:', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894785.000151'}, {u'text': u'Opening a chat with \u201cPLUGINS"', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894666.000150'}, {u'text': u'Purchasing a Guided Transfer :smile:', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894654.000149'}, {u'text': u'That\u2019s my particular interest, but I figure since we are collecting data, what else might be usefull to gather?', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445894646.000148'}, {u'text': u'A content export?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894646.000147'}, {u'text': u'How could we know without doing that? 
What could indicate an intent to go self hosted?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894642.000146'}, {u'text': u'The easiest way is to ask during the cancellation', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894629.000145'}, {u'text': u'To flip \u2018em', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894564.000144'}, {u'text': u'Ooooh interesting', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445894560.000143'}, {u'text': u'What % of refunds are potential JPOP customers?', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445894547.000142'}, {u'text': u'One example would be the potential funnel from to Jetpack plans.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445894462.000141'}, {u'text': u'When we encounter someone looking to cancel/refund. We can collect data about why.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445894434.000140'}, {u'text': u'Data in live chat being something like tags that commonly occur with folks who then go on to cancel a product? Or something else?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445893884.000139'}, {u'text': u'I was looking for a list of reasons that we currently track, so that we might correlate the data collected in live chat with broader trends.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445893769.000138'}, {u'text': u'Or reasons that we sort of discern and label ourselves?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445892248.000137'}, {u'text': u'Hm, you mean user-defined reasons?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445892239.000136'}, {u'text': u'At this point, my main concern was making sure we\u2019re speaking the same language. 
Do we track any cancelation \u2018reasons\u201d other than:\n- No longer use\n- Don\u2019t know what this is\n- Does not work as expected\n- I want to delete my site\n- Other', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445891882.000134', u'edited': {u'user': u'U029DBR0N', u'ts': u'1445891894.000000'}}, {u'text': u'There\u2019s some in-progress work on the self-service part and the gathering of refund reasons here. - ', u'type': u'message', u'user': u'U029KGD6U', u'ts': u'1445891607.000133'}, {u'text': u'I\u2019m about to jump into a 3210, happy to catch up async', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891595.000132'}, {u'text': u'Like what percentage of our volume is going toward helping people with actions that should be self-serve and, maybe?, are actions that aren\u2019t associated with higher engagement.', u'type': u'message', u'user': u'U029KGD6U', u'ts': u'1445891587.000131'}, {u'text': u'Part of the idea, I think, was to get a sense of what portion of chats are currently serving refunds and cancellations.', u'type': u'message', u'user': u'U029KGD6U', u'ts': u'1445891551.000130'}, {u'text': u'Is that what you\u2019re investigating?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891489.000129'}, {u'text': u'Is there something special about folks who refund through chats?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891484.000128'}, {u'text': u'<@U029KGD6U|andrewspittle> has joined the channel', u'ts': u'1445891431.000127', u'subtype': u'channel_join', u'inviter': u'U03CGFPKV', u'type': u'message', u'user': u'U029KGD6U'}, {u'text': u'I believe <@U029KGD6U> made the tagging suggestion. 
Maybe he has a slightly less-fuzzy recollection of the idea.', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445891421.000126'}, {u'text': u'If we have all the data we need, then perhaps the tagging is not needed.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445891317.000125'}, {u'text': u'The details of the town hall are slightly\u2026 fuzzy\u2026 in my head. :simple_smile: But I recall a question about why the refund rate was so high. Particularly with the Business plan.', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445891239.000124'}, {u'text': u'To see if there\u2019s a meaningful pattern of behavior pre-dating a refund?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891175.000123'}, {u'text': u'Hm, what\u2019s the Big End Goal Joe?', u'type': u'message', u'user': u'U029AS109', u'ts': u'1445891165.000122'}, {u'text': u'Hello, data :simple_smile: I\u2019m following up on the idea floated at the TownHall about tracking cancelations/refunds initiated via live chat. The idea is: we would tag chats that result in a refund with some kind of meaningful data.\n\nAre there some specific data points that it would be beneficial for us to capture? 
Perhaps matching the \u201creason\u201d to the same categories you\u2019re tracking now?', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445891121.000121'}, {u'text': u'so it can\u2019t be the code used to track', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445879023.000120'}, {u'text': u'thanks for the link but to add to that Jetpack hasn\u2019t had an update since this problem started', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445879009.000119'}, {u'text': u"but now that I think of it, that doesn't seem relevant", u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445878978.000118'}, {u'text': u'I was thinking of this: ', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445878958.000117'}, {u'text': u'<@U03CGFPKV> who was in that conversation?', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878888.000116'}, {u'text': u'<@U02H4GV02>: there was some recent conversation about switching these stats to Tracks. That could be the reason', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445878860.000115'}, {u'text': u":data: team hangout starting, i'll take a look later", u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878778.000114'}, {u'username': u'<@U02A15G2E|timmyc>', u'display_as_bot': False, u'text': u'<@U02A15G2E|timmyc> uploaded a file: ', u'upload': True, u'ts': u'1445878774.000113', u'subtype': u'file_share', u'user': u'U02A15G2E', u'file': {u'thumb_480_w': 480, u'groups': [], u'filetype': u'png', u'thumb_480': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_480.png', u'display_as_bot': False, u'id': u'F0D78TW1X', u'size': 62027, u'url_download': u'https://slack-files.com/files-pub/T024FN1V2-F0D78TW1X-714fea3df5/download/jetpack_nux_-_metrimattic.png', u'thumb_360_w': 360, u'title': u'Jetpack_Nux_-_Metrimattic.png', u'url_private': u'https://files.slack.com/files-pri/T024FN1V2-F0D78TW1X/jetpack_nux_-_metrimattic.png', u'thumb_720_h': 436, u'thumb_360': 
u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_360.png', u'thumb_64': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_64.png', u'ims': [], u'thumb_720_w': 720, u'thumb_80': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_80.png', u'comments_count': 0, u'thumb_360_h': 218, u'thumb_480_h': 290, u'external_type': u'', u'username': u'', u'timestamp': 1445878771, u'public_url_shared': False, u'editable': False, u'original_h': 934, u'thumb_160': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_160.png', u'url_private_download': u'https://files.slack.com/files-pri/T024FN1V2-F0D78TW1X/download/jetpack_nux_-_metrimattic.png', u'thumb_1024': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_1024.png', u'user': u'U02A15G2E', u'image_exif_rotation': 1, u'thumb_960': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_960.png', u'is_public': True, u'pretty_type': u'PNG', u'name': u'Jetpack_Nux_-_Metrimattic.png', u'mimetype': u'image/png', u'permalink_public': u'https://slack-files.com/T024FN1V2-F0D78TW1X-714fea3df5', u'permalink': u'https://a8c.slack.com/files/timmyc/F0D78TW1X/jetpack_nux_-_metrimattic.png', u'is_external': False, u'created': 1445878771, u'url': u'https://slack-files.com/files-pub/T024FN1V2-F0D78TW1X-714fea3df5/jetpack_nux_-_metrimattic.png', u'thumb_1024_h': 619, u'thumb_960_h': 581, u'original_w': 1544, u'thumb_960_w': 960, u'thumb_1024_w': 1024, u'mode': u'hosted', u'thumb_720': u'https://slack-files.com/files-tmb/T024FN1V2-F0D78TW1X-7d8ca1788c/jetpack_nux_-_metrimattic_720.png', u'channels': [u'C029ENR23']}, u'type': u'message', u'bot_id': None}, {u'text': u'mysterious', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878767.000112'}, {u'text': u'fwiw if you toggle to \u201cYearly\u201d 
on any of the jetpack links above, I do see data on the charts', u'type': u'message', u'user': u'U02A15G2E', u'ts': u'1445878750.000111'}, {u'text': u'will do Martin, thanks', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878739.000110'}, {u'text': u'can you please post on datap2? and mark unresolved?', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878685.000108', u'edited': {u'user': u'U027KUZUY', u'ts': u'1445878697.000000'}}, {u'text': u'hmm', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878656.000107'}, {u'text': u'', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878645.000106'}, {u'text': u'', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878639.000105'}, {u'text': u'data seems to be coming in', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878637.000104'}, {u'text': u'all the Jetpack one\u2019s I\u2019m looking at are gaffed', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878633.000103'}, {u'text': u'interesting', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878624.000102'}, {u'text': u"hey <@U02H4GV02> this doesn't seem to be a MC-wide thing: ", u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445878595.000101'}, {u'text': u'Anyone know why stats graphs aren\u2019t working on Mission Control currently? ', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878558.000100'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U02H4GV02', u'ts': u'1445878555.000099', u'text': u'<@U02H4GV02|jesse> has joined the channel'}, {u'text': u'<@U027KUZUY|martin> set the channel topic: On duty this week: @martin. Not around? Not answering? Post on Data P2.', u'ts': u'1445869753.000098', u'topic': u'On duty this week: @martin. Not around? Not answering? 
Post on Data P2.', u'subtype': u'channel_topic', u'user': u'U027KUZUY', u'type': u'message'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U029DBR0N', u'ts': u'1445843480.000097', u'text': u'<@U029DBR0N|joe> has joined the channel'}, {u'text': u':data: ++', u'type': u'message', u'user': u'U027KR1M5', u'ts': u'1445535851.000096'}, {u'text': u'We should be getting some new devs on the Tracks squad soon and will have more resources to implement these things', u'type': u'message', u'user': u'U029CED9T', u'ts': u'1445535774.000095'}, {u'text': u'Thanks :y:', u'type': u'message', u'user': u'U024FNH8S', u'ts': u'1445535753.000094'}, {u'text': u'awesome, thanks :simple_smile:', u'type': u'message', u'user': u'U027KR1M5', u'ts': u'1445535749.000093'}, {u'text': u'Added a card here: ', u'type': u'message', u'user': u'U029CED9T', u'ts': u'1445535745.000092'}, {u'text': u'It\u2019s <@U024FNH8S>\u2019s not mine :simple_smile: Just wasn\u2019t sure if it was on your backlog', u'type': u'message', u'user': u'U027KR1M5', u'ts': u'1445535716.000091'}, {u'text': u"<@U027KR1M5>: It's a good idea :simple_smile:", u'type': u'message', u'user': u'U029CED9T', u'ts': u'1445535679.000090'}, {u'text': u'<@U029CED9T>: ^ any plans to add annotations to these pages? ', u'type': u'message', u'user': u'U027KR1M5', u'ts': u'1445531113.000089'}, {u'text': u'Hey hey :simple_smile:\n\nIs there a plan for adding annotations, so I could link the event with the P2 post with the explanation for the variation? 
\n', u'type': u'message', u'user': u'U024FNH8S', u'ts': u'1445530662.000088'}, {u'subtype': u'channel_join', u'type': u'message', u'user': u'U024FNH8S', u'ts': u'1445530624.000087', u'text': u'<@U024FNH8S|folletto> has joined the channel'}, {u'text': u'yup', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466565.000086'}, {u'text': u'Phil will be working on some stuff in , so he needs a missioncontrol checkout, which I think is part of a wpcom sandbox.', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445466383.000085'}, {u'text': u'no worries at all. thanks for the quick reply!', u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445466332.000084'}, {u'text': u'sorry to bother', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466327.000083'}, {u'text': u'ok great, thanks <@U027KUZUY>', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466324.000082'}, {u'text': u"hey <@U033TML75> i'm off & on today. yes, wpcom/MC sandbox.", u'type': u'message', u'user': u'U027KUZUY', u'ts': u'1445466316.000081'}, {u'text': u'k', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466311.000080'}, {u'text': u'I can take care of that if he needs it', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445466278.000079'}, {u'text': u'He might need Hue access on Nosara.', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445466260.000078'}, {u'text': u'I think Martin meant wpcom sandbox', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445466238.000077'}, {u'text': u"I don't think he should need a hadoop sandbox, at least not yet.", u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445466224.000076'}, {u'text': u"but looks like <@U027KUZUY> is AFK and i dont want to hold up phil if he's ready to get started tonight", u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466175.000075'}, {u'text': u'but just want to be sure its a regular sandbox vs. 
a hadoop sandbox', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466153.000074'}, {u'text': u'and <@U027KUZUY> asked for a sandbox for him', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466140.000073'}, {u'text': u'<@U03CGFPKV> or <@U027LSDDA> - i see pcrumm starting to login to stuff', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445466133.000072'}, {u'text': u'thanks <@U03CGFPKV>', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445384748.000071'}, {u'text': u'sounds good - yeah i need to go eat and sort of pay attention to the mrs. (especially after being gone a week) :wink:', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445384745.000070'}, {u'text': u'I can try to help some more tomorrow. If Andy is back, he is certainly the expert here.', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384642.000069'}, {u'text': u'I see.', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384610.000068'}, {u'text': u'well i believe it was, which is how barry noticed in the first place (we got alerts for too many logs in queue because the same ones would fail and get requeued over and over)', u'type': u'message', u'user': u'U033TML75', u'ts': u'1445384538.000067'}, {u'text': u"since they were previously getting requeued, and the the queue wasn't filling up (as far as I know)", u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384502.000066'}, {u'text': u'My guess is that there may be some sort of database timeout or something', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384480.000065'}, {u'text': u'I have to get going', u'type': u'message', u'user': u'U03CGFPKV', u'ts': u'1445384456.000064'}]} 2 | -------------------------------------------------------------------------------- /data/test-events-elastic.json: -------------------------------------------------------------------------------- 1 | {"messages": [ { 2 | "type": "message", 3 | "user": "U029CL0GJ", 4 | "text": "presumably those have been 
resolved by now however it seems the ES query i need to do has since changed as i'm getting an error", 5 | "ts": "1414028017.000313" 6 | }, 7 | { 8 | "type": "message", 9 | "user": "U029CL0GJ", 10 | "text": "```Exception(Elastica\\Exception\\ResponseException): SearchPhaseExecutionException[Failed to execute phase [query_fetch], all shards failed; shardFailures {[br5TQBVNRr-pVXXW9crQeA][blog_network_7-6][0]: RemoteTransportException[[][inet[\/192.0.80.174:9300]][search\/phase\/query+fetch]]; nested: ElasticsearchIllegalArgumentException[field [tag] isn't a leaf field]; }]```", 11 | "edited": { 12 | "user": "U029CL0GJ", 13 | "ts": "1414028035.000000" 14 | }, 15 | "ts": "1414028030.000314" 16 | }, 17 | { 18 | "type": "message", 19 | "user": "U027LSDDA", 20 | "text": "hmmm...", 21 | "ts": "1414028037.000317" 22 | }, 23 | { 24 | "type": "message", 25 | "user": "U029CL0GJ", 26 | "text": "here's my code. rather than stumbling through it, can someone take a look at see if anything sticks out as being wrong?", 27 | "ts": "1414028064.000318" 28 | }, 29 | { 30 | "type": "message", 31 | "user": "U029CL0GJ", 32 | "text": "", 33 | "ts": "1414028064.000319" 34 | }, 35 | { 36 | "type": "message", 37 | "user": "U027LSDDA", 38 | "text": "its probably a change to how ES handles it", 39 | "ts": "1414028067.000320" 40 | }, 41 | { 42 | "type": "message", 43 | "user": "U027LSDDA", 44 | "text": "looking...", 45 | "ts": "1414028069.000321" 46 | }, 47 | { 48 | "type": "message", 49 | "user": "U029CL0GJ", 50 | "text": "thanks", 51 | "ts": "1414028072.000322" 52 | }, 53 | { 54 | "type": "message", 55 | "user": "U027LSDDA", 56 | "text": "oh, i think the 'tag' in your 'fields' list is not specific enough", 57 | "ts": "1414028159.000323" 58 | }, 59 | { 60 | "type": "message", 61 | "user": "U027LSDDA", 62 | "text": "you need say tag.term_id or tag.name, tag.slug", 63 | "ts": "1414028174.000324" 64 | }, 65 | { 66 | "type": "message", 67 | "user": "U027LSDDA", 68 | "text": "the goal is to get all 
posts\/comments a user subscribes to?", 69 | "ts": "1414028231.000325" 70 | }, 71 | { 72 | "type": "message", 73 | "user": "U029CL0GJ", 74 | "text": "all of the tabs", 75 | "ts": "1414028253.000326" 76 | }, 77 | { 78 | "type": "message", 79 | "user": "U029CL0GJ", 80 | "text": "although i see nick did some commits since i worked on it", 81 | "ts": "1414028287.000327" 82 | }, 83 | { 84 | "type": "message", 85 | "user": "U029CL0GJ", 86 | "text": "hmm, maybe he beat me to it", 87 | "ts": "1414028290.000328" 88 | }, 89 | { 90 | "type": "message", 91 | "user": "U029CL0GJ", 92 | "text": "meh, still going to commit", 93 | "ts": "1414028311.000329" 94 | }, 95 | { 96 | "type": "message", 97 | "user": "U029CL0GJ", 98 | "text": "i ditched all the ajax since ES is so fast", 99 | "ts": "1414028318.000330" 100 | }, 101 | { 102 | "type": "message", 103 | "user": "U027LSDDA", 104 | "text": "cool :simple_smile:", 105 | "ts": "1414028331.000331" 106 | }, 107 | { 108 | "type": "message", 109 | "user": "U027LSDDA", 110 | "text": "getting 250 results is probably fine right now. 
Someday our index may get too big", 111 | "ts": "1414028371.000332" 112 | }, 113 | { 114 | "type": "message", 115 | "user": "U027LSDDA", 116 | "text": "but we'll deal with that when it breaks :simple_smile:", 117 | "ts": "1414028396.000333" 118 | }, 119 | { 120 | "type": "message", 121 | "user": "U029CL0GJ", 122 | "text": "for tags, it looks like i wanted a meta object: ", 123 | "ts": "1414028427.000334" 124 | }, 125 | { 126 | "type": "message", 127 | "user": "U029CL0GJ", 128 | "text": "all the tags", 129 | "ts": "1414028466.000335" 130 | }, 131 | { 132 | "type": "message", 133 | "user": "U027LSDDA", 134 | "text": "i think you have specify each field individually now", 135 | "ts": "1414028471.000336" 136 | }, 137 | { 138 | "type": "message", 139 | "user": "U027LSDDA", 140 | "text": "i vaguely remember that change in ES at some point, but don't recall why", 141 | "ts": "1414028487.000337" 142 | }, 143 | { 144 | "type": "message", 145 | "user": "U027LSDDA", 146 | "text": "so you need tag.slug and tag.name", 147 | "ts": "1414028521.000338" 148 | }, 149 | { 150 | "type": "message", 151 | "user": "U027LSDDA", 152 | "text": "i think its because they don't actually get stored together", 153 | "ts": "1414028552.000339" 154 | }, 155 | { 156 | "type": "message", 157 | "user": "U029CL0GJ", 158 | "text": "i just removed tag for now, i'll circle back. 
no errors but no results either.", 159 | "ts": "1414028562.000340" 160 | }, 161 | { 162 | "type": "message", 163 | "user": "U027LSDDA", 164 | "text": "did you get empty results, false, or WP_Error back?", 165 | "ts": "1414028628.000341" 166 | }, 167 | { 168 | "type": "message", 169 | "user": "U029CL0GJ", 170 | "text": "empty results", 171 | "ts": "1414028635.000342" 172 | }, 173 | { 174 | "type": "message", 175 | "user": "U027LSDDA", 176 | "text": "oh, you probably also need to set 'blog_id' = null", 177 | "ts": "1414028684.000343" 178 | }, 179 | { 180 | "type": "message", 181 | "user": "U027LSDDA", 182 | "text": "in $es_query_args", 183 | "ts": "1414028693.000344" 184 | }, 185 | { 186 | "type": "message", 187 | "user": "U029CL0GJ", 188 | "text": "query in json form: ", 189 | "ts": "1414028697.000345" 190 | }, 191 | { 192 | "type": "message", 193 | "user": "U027LSDDA", 194 | "text": "that was a change we had to make to our api", 195 | "ts": "1414028702.000346" 196 | }, 197 | { 198 | "type": "message", 199 | "user": "U027LSDDA", 200 | "text": "otherwise it will autofilter by the current blog_id", 201 | "ts": "1414028772.000347" 202 | }, 203 | { 204 | "type": "message", 205 | "user": "U029CL0GJ", 206 | "text": "that worked i think", 207 | "ts": "1414028774.000348" 208 | }, 209 | { 210 | "type": "message", 211 | "user": "U029CL0GJ", 212 | "text": "yeah, loads in a fraction of a second :simple_smile:", 213 | "ts": "1414028860.000349" 214 | }, 215 | { 216 | "type": "message", 217 | "user": "U027LSDDA", 218 | "text": "as it should :simple_smile:", 219 | "ts": "1414028871.000350" 220 | }, 221 | { 222 | "type": "message", 223 | "user": "U029CL0GJ", 224 | "text": "cool. 
now to list out post tags and i'll be good to go.", 225 | "ts": "1414028907.000351" 226 | }, 227 | { 228 | "type": "message", 229 | "user": "U029CL0GJ", 230 | "text": "thanks for you help!", 231 | "ts": "1414028909.000352" 232 | }, 233 | { 234 | "type": "message", 235 | "user": "U027LSDDA", 236 | "text": "anytime", 237 | "ts": "1414028917.000353" 238 | }, 239 | { 240 | "type": "message", 241 | "user": "U027LSDDA", 242 | "text": "that delete-by-query image may just have been a random spike", 243 | "ts": "1414030195.000354" 244 | }, 245 | { 246 | "type": "message", 247 | "user": "U029CL0GJ", 248 | "text": "this tag format is annoying :disappointed:", 249 | "ts": "1414030571.000355" 250 | }, 251 | { 252 | "type": "message", 253 | "user": "U027LSDDA", 254 | "text": "what about it?", 255 | "ts": "1414030584.000356" 256 | }, 257 | { 258 | "type": "message", 259 | "user": "U029CL0GJ", 260 | "text": "if there's one tag, it's a string, if there's more than one, it's an array", 261 | "ts": "1414030587.000357" 262 | }, 263 | { 264 | "type": "message", 265 | "user": "U029CL0GJ", 266 | "text": "i also have to match up keys between the two fields", 267 | "ts": "1414030595.000358" 268 | }, 269 | { 270 | "type": "message", 271 | "user": "U027LSDDA", 272 | "text": "ya", 273 | "ts": "1414030602.000359" 274 | }, 275 | { 276 | "type": "message", 277 | "user": "U027LSDDA", 278 | "text": "ya, its not really a full object", 279 | "ts": "1414030618.000360" 280 | }, 281 | { 282 | "type": "message", 283 | "user": "U029CL0GJ", 284 | "text": "the string vs array is particularly annoying", 285 | "ts": "1414030623.000361" 286 | }, 287 | { 288 | "type": "message", 289 | "user": "U027LSDDA", 290 | "text": "its more for searching against than as a doc store", 291 | "ts": "1414030629.000362" 292 | }, 293 | { 294 | "type": "message", 295 | "user": "U029CL0GJ", 296 | "text": "ah", 297 | "ts": "1414030636.000363" 298 | }, 299 | { 300 | "type": "message", 301 | "user": "U027LSDDA", 302 | "text": 
"that's why we usually just get the blog_id, post_id and then use the DB", 303 | "ts": "1414030645.000364" 304 | }, 305 | { 306 | "type": "message", 307 | "user": "U029CL0GJ", 308 | "text": "gotcha. that'd be slow here though -- too many `switch_to_blog()`", 309 | "ts": "1414030660.000365" 310 | }, 311 | { 312 | "type": "message", 313 | "user": "U027LSDDA", 314 | "text": "the string vs array thing is a pain", 315 | "ts": "1414030663.000366" 316 | }, 317 | { 318 | "type": "message", 319 | "user": "U027LSDDA", 320 | "text": "you'll probably hit the cache most of the time with get_blog_post(), but that is a lot of posts", 321 | "ts": "1414030783.000367" 322 | }, 323 | { 324 | "type": "message", 325 | "user": "U027LSDDA", 326 | "text": "<@U029CL0GJ>: if you return \"fields\": [ \"_source\" ] that should be the orig doc that we indexed. Might be more consistently formatted for your needs", 327 | "ts": "1414030941.000368" 328 | }, 329 | { 330 | "type": "message", 331 | "user": "U029CL0GJ", 332 | "text": "too late, already got it working :simple_smile:", 333 | "ts": "1414031030.000369" 334 | }, 335 | { 336 | "type": "message", 337 | "user": "U027LSDDA", 338 | "text": ":simple_smile:", 339 | "ts": "1414031035.000370" 340 | }, 341 | { 342 | "type": "message", 343 | "user": "U029CL0GJ", 344 | "text": "but thanks", 345 | "ts": "1414031051.000371" 346 | }, 347 | { 348 | "type": "message", 349 | "user": "U027LSDDA", 350 | "text": "np", 351 | "ts": "1414031055.000372" 352 | }, 353 | { 354 | "type": "message", 355 | "user": "U029CL0GJ", 356 | "text": "launched: ", 357 | "ts": "1414031171.000373" 358 | }, 359 | { 360 | "type": "message", 361 | "user": "U029CL0GJ", 362 | "text": "dat speed :smile:", 363 | "ts": "1414031173.000374" 364 | }, 365 | { 366 | "type": "message", 367 | "user": "U029CL0GJ", 368 | "text": "literally took 10+ seconds of spinner wheel to load before", 369 | "ts": "1414031188.000375" 370 | }, 371 | { 372 | "type": "message", 373 | "user": "U027LSDDA", 374 | 
"text": "dang that's fast", 375 | "ts": "1414031188.000376" 376 | }, 377 | { 378 | "type": "message", 379 | "user": "U027LSDDA", 380 | "text": "nice", 381 | "ts": "1414031190.000377" 382 | }, 383 | { 384 | "type": "message", 385 | "user": "U029CL0GJ", 386 | "text": "oops, that's posts only from my tag debugging", 387 | "ts": "1414031209.000378" 388 | }, 389 | { 390 | "type": "message", 391 | "user": "U029CL0GJ", 392 | "text": "fixed", 393 | "ts": "1414031263.000379" 394 | }]} 395 | -------------------------------------------------------------------------------- /img/hackathon-discussion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Automattic/Slack-Summary/92ee08557c68728b4aee15ec45d070ac206695a9/img/hackathon-discussion.png -------------------------------------------------------------------------------- /img/meeting-discussion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Automattic/Slack-Summary/92ee08557c68728b4aee15ec45d070ac206695a9/img/meeting-discussion.png -------------------------------------------------------------------------------- /interval_summarizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import namedtuple 3 | from datetime import (timedelta, datetime) 4 | import re 5 | import logging 6 | import logging.handlers 7 | import sys 8 | import json 9 | import io 10 | from ts_config import TS_DEBUG, TS_LOG 11 | import glob 12 | from utils import get_msg_text 13 | from slacker import Slacker 14 | from config import keys 15 | 16 | logging.basicConfig(level=logging.INFO) 17 | 18 | class IntervalSpec(object): 19 | slk_ts = re.compile(r'(?P[1-9][^\.]+).*') 20 | 21 | class TsSummarizer(object): 22 | """Constructs summaries over a set of ranges""" 23 | flrg = re.compile(r'[\n\r\.]|\&[a-z]+;|]+>|\:[^: ]+\:|`{3}[^`]*`{3}') 24 | 
archive_link = u'https://a8c.slack.com/archives/{}/p{}' 25 | def __init__(self, ): 26 | self.logger = logging.getLogger(__name__) 27 | self.channel = None 28 | self.slack = None 29 | log_level = logging.DEBUG if TS_DEBUG else logging.INFO 30 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 31 | fh = logging.handlers.RotatingFileHandler('./interval_'+TS_LOG, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5) 32 | fh.setLevel(log_level) 33 | fh.setFormatter(formatter) 34 | self.logger = logging.getLogger('interval_summarizer') 35 | self.logger.handlers = [] 36 | self.logger.addHandler(fh) 37 | 38 | def summarize(self, messages, range_spec=None): 39 | """ Produce the input """ 40 | return messages 41 | 42 | def report_summary(self, messages, range_spec=None): 43 | """The interval summaries are joined.""" 44 | return '\n'.join(self.summarize(messages, range_spec=range_spec)) 45 | 46 | def set_channel(self, channel): 47 | self.channel = channel 48 | 49 | def set_slack(self, conn): 50 | self.slack = conn 51 | 52 | def tagged_sum(self, msg): 53 | user = "USER UNKNOWN" 54 | if 'user' in msg: 55 | user = msg['user'] 56 | elif 'bot_id' in msg: 57 | user = msg['bot_id'] 58 | elif 'username' in msg and msg['username'] == u'bot': 59 | user = 'bot' 60 | split_text = get_msg_text(msg).split() 61 | text = u' '.join(split_text[:30])+u'...' 
if len(split_text) > 30 else u' '.join(split_text) 62 | if self.channel: 63 | link = TsSummarizer.archive_link.format(self.channel, re.sub(r'\.',u'',msg['ts'])) 64 | text = u'<'+link+'|'+text+'>' 65 | return u'@{} <@{}>: {}'.format(ts_to_time(msg['ts']).strftime("%a-%b-%-m-%Y %H:%M:%S"), user, text) 66 | 67 | 68 | def ts_to_time(slack_ts): 69 | """ 70 | Parameters 71 | slack_ts : string EPOCH.ID 72 | Return 73 | datetime 74 | """ 75 | return datetime.utcfromtimestamp(long(IntervalSpec.slk_ts.search(slack_ts).group('epoch'))) 76 | 77 | def tspec_to_delta(seconds=0, minutes= 0, hours= 0, days= 0, weeks=0, **args): 78 | return timedelta(seconds= seconds, minutes= minutes, hours= hours, days= days, weeks=weeks) 79 | 80 | def canonicalize(txt): 81 | """Filter and change text to sentence form""" 82 | ntxt = TsSummarizer.flrg.sub(u'', txt) 83 | return ntxt.strip() if re.match(r'.*[\.\?\!]\s*$', ntxt) else u'{}.'.format(ntxt.strip()) 84 | #return ntxt if re.match(r'.*[\.\?]$', ntxt) else u'{}.'.format(ntxt) 85 | 86 | -------------------------------------------------------------------------------- /lsa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division, print_function, unicode_literals 5 | 6 | import math 7 | import re 8 | from warnings import warn 9 | 10 | try: 11 | import numpy 12 | except ImportError: 13 | numpy = None 14 | 15 | try: 16 | from numpy.linalg import svd as singular_value_decomposition 17 | except ImportError: 18 | singular_value_decomposition = None 19 | from base_summarizer import BaseSummarizer 20 | import spacy.en 21 | from spacy.parts_of_speech import VERB, NOUN, PROPN, PRON, PUNCT 22 | from spacy.en import STOPWORDS 23 | import logging 24 | logging.basicConfig(level=logging.DEBUG) 25 | logger = logging.getLogger(__name__) 26 | 27 | class LsaSummarizer(BaseSummarizer): 28 | MIN_DIMENSIONS = 3 29 | REDUCTION_RATIO = 1/1 30
| _stop_words = frozenset() 31 | 32 | 33 | def __init__(self, ): 34 | BaseSummarizer.__init__(self, ) 35 | self.nlp = spacy.en.English(entity=False, matcher=False) 36 | self.nlp_doc = None 37 | 38 | @property 39 | def stop_words(self): 40 | return self._stop_words 41 | 42 | @stop_words.setter 43 | def stop_words(self, words): 44 | self._stop_words = frozenset(map(self.normalize_word, words)) 45 | 46 | def __call__(self, document, sentences_count, user_dict): 47 | self._ensure_dependecies_installed() 48 | self.nlp_doc = self.nlp(document) 49 | self.user_dict = user_dict 50 | logger.info("Created doc") 51 | 52 | dictionary = self._create_dictionary() 53 | # empty document 54 | if not dictionary: 55 | return () 56 | matrix = self._create_matrix(dictionary) 57 | matrix = self._compute_term_frequency(matrix) 58 | u, sigma, v = singular_value_decomposition(matrix, full_matrices=False) 59 | 60 | ranks = iter(self._compute_ranks(sigma, v)) 61 | sents = [s.text for s in self.nlp_doc.sents] 62 | logger.info("Sentences generated by spacy are %s, count %s", sents, len(sents)) 63 | new_sents = self._get_best_sentences(sents, sentences_count*2, 64 | lambda s: next(ranks)) 65 | filt_sents = [sent for sent in new_sents if self.better_question(sent)] 66 | additional_sents = set(new_sents) - set(filt_sents) 67 | to_add = sentences_count - len(filt_sents) 68 | final_sents = filt_sents 69 | if to_add > 0: 70 | final_sents += sorted(list(additional_sents)[:to_add], key=lambda x: len(x), reverse=True) 71 | logger.info("Filtered sentences %s", filt_sents) 72 | logger.info("Final recommendations are %s", final_sents[:sentences_count]) 73 | return final_sents 74 | 75 | 76 | def better_question(self, txt): 77 | if len(txt.split()) > 5: 78 | parse = self.nlp(txt) 79 | for sent in parse.sents: 80 | if len(sent) > 5: 81 | p2 = self.nlp(sent.text) 82 | for (i, wd) in enumerate(p2): 83 | if wd.lemma_ in (u'can', u'should', u'will', u'could', u'why', u'what', u'how', u'is'): 84 | return u'ROOT' 
in [x.dep_ for x in p2[i+1:]] and u'?' in [x.orth_ for x in p2[i+1:]] 85 | 86 | 87 | def _ensure_dependecies_installed(self): 88 | if numpy is None: 89 | raise ValueError("LSA summarizer requires NumPy. Please, install it by command 'pip install numpy'.") 90 | 91 | def _create_dictionary(self, ): 92 | """Creates mapping key = word, value = row index""" 93 | words = [wd.orth_ for wd in self.nlp_doc if wd.pos != PUNCT] 94 | unique_words = frozenset(w.lemma_ for w in self.nlp_doc if w not in STOPWORDS and w.tag_ != "PRP" and (w.pos == VERB or w.pos == NOUN)) 95 | unique_users = frozenset(self.user_dict.values()) 96 | logger.info("Have %s unique words" % len(unique_words)) 97 | logger.info("Have %s unique users" % len(unique_users)) 98 | return dict((w, i) for i, w in enumerate(unique_words|unique_users)) 99 | 100 | def collect_bow(self, txt): 101 | sents = nlp(txt).sents 102 | return [x for x in [retrieve_main_bow(sent) for sent in sents] if x] 103 | 104 | def _create_matrix(self, dictionary): 105 | """ 106 | Creates matrix of shape |unique words|×|sentences| where cells 107 | contains number of occurences of words (rows) in senteces (cols). 108 | """ 109 | sentences = list(self.nlp_doc.sents) 110 | words_count = len(dictionary) 111 | sentences_count = len(sentences) 112 | logger.info ("Have %s sentences " % sentences_count) 113 | if words_count < sentences_count: 114 | message = ( 115 | "Number of words (%d) is lower than number of sentences (%d). " 116 | "LSA algorithm may not work properly." 
117 | ) 118 | logger.warn(message % (words_count, sentences_count)) 119 | # create matrix |unique words|×|sentences| filled with zeroes 120 | matrix = numpy.zeros((words_count, sentences_count)) 121 | for col, sentence in enumerate(sentences): 122 | for word in [wd.lemma_ for wd in sentence if wd.lemma_ in dictionary]: 123 | matrix[dictionary[word], col] += 1 124 | if sentence.text in self.user_dict and len(self.user_dict[sentence.text]) > 1: 125 | logger.info("Matching sentence %s with user %s", sentence.text, self.user_dict[sentence.text]) 126 | matrix[dictionary[self.user_dict[sentence.text]], col] += 1 127 | return matrix 128 | 129 | def _compute_term_frequency(self, matrix, smooth=0.4): 130 | """ 131 | Computes TF metrics for each sentence (column) in the given matrix. 132 | You can read more about smoothing parameter at URL below: 133 | http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html 134 | """ 135 | assert 0.0 <= smooth < 1.0 136 | 137 | max_word_frequencies = numpy.max(matrix, axis=0) 138 | rows, cols = matrix.shape 139 | for row in range(rows): 140 | for col in range(cols): 141 | max_word_frequency = max_word_frequencies[col] 142 | if max_word_frequency != 0: 143 | frequency = matrix[row, col]/max_word_frequency 144 | matrix[row, col] = smooth + (1.0 - smooth)*frequency 145 | return matrix 146 | 147 | def _compute_ranks(self, sigma, v_matrix): 148 | assert len(sigma) == v_matrix.shape[0], "Matrices should be multiplicable" 149 | 150 | dimensions = max(LsaSummarizer.MIN_DIMENSIONS, 151 | int(len(sigma)*LsaSummarizer.REDUCTION_RATIO)) 152 | powered_sigma = tuple(s**2 if i < dimensions else 0.0 153 | for i, s in enumerate(sigma)) 154 | ranks = [] 155 | # iterate over columns of matrix (rows of transposed matrix) 156 | for column_vector in v_matrix.T: 157 | rank = sum(s*v**2 for s, v in zip(powered_sigma, column_vector)) 158 | ranks.append(math.sqrt(rank)) 159 | return ranks 160 | 161 | def retrieve_main_bow(tokens): 162 | bow = 
set() 163 | for tok in tokens: 164 | if tok.pos != PUNCT: 165 | if tok.dep_ == 'advcl' or tok.dep == 'xcomp': 166 | bow.add(' '.join([ti.lower_ for ti in list(tok.children) if tok.tag_ != "PRP" and ti.lower_ not in STOPWORDS])) 167 | bow.add(tok.lower_) 168 | if tok.pos == NOUN or tok.pos == VERB: 169 | if tok.tag_ != "PRP" and tok.lower_ not in STOPWORDS: 170 | bow.add(tok.lower_) 171 | mt = re.sub(r'[\n\t\n]', u'', u' '.join(list(bow))+u'.') 172 | return mt if len(mt.strip().split()) > 2 else None 173 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, jsonify, request 2 | import requests 3 | import json 4 | import os 5 | from config import * 6 | from ts_config import SUMMS 7 | from slack_summary import SlackRouter 8 | app = Flask(__name__) 9 | from utils import maybe_get 10 | global lsa_summ 11 | lsa_summ = None 12 | if "spacy" in SUMMS: 13 | import lsa 14 | import spacy.en 15 | import spacy 16 | lsa_summ = lsa.LsaSummarizer() 17 | 18 | 19 | @app.route("/slack", methods=['POST']) 20 | def slackReq(): 21 | global lsa_summ 22 | if "spacy" in SUMMS: 23 | if not lsa_summ: 24 | lsa_summ = lsa.LsaSummarizer() 25 | req_data = request.form 26 | req = { 27 | 'channel_id' : req_data.getlist('channel_id'), 28 | 'channel_name' : maybe_get(req_data, 'channel_name', default=''), 29 | 'user_id' : maybe_get(req_data, 'user_id', default=''), 30 | 'user_name' : maybe_get(req_data, 'user_name', default=''), 31 | 'params' : maybe_get(req_data, 'text', default=''), 32 | 'summ' : lsa_summ 33 | } 34 | if "gensim" in SUMMS and "gensim" in req['params'].split(): 35 | req['summ'] = None 36 | return (SlackRouter().get_summary(**req)) 37 | 38 | 39 | @app.route("/slacktest", methods=['POST']) 40 | def slackTestReq(): 41 | global lsa_summ 42 | if "spacy" in SUMMS: 43 | if not lsa_summ: 44 | lsa_summ = lsa.LsaSummarizer() 45 | req_data = 
request.form 46 | req = { 47 | 'channel_id' : req_data.getlist('channel_id'), 48 | 'channel_name' : maybe_get(req_data, 'channel_name', default=''), 49 | 'user_id' : maybe_get(req_data, 'user_id', default=''), 50 | 'user_name' : maybe_get(req_data, 'user_name', default=''), 51 | 'params' : maybe_get(req_data, 'text', default=''), 52 | 'summ' : lsa_summ, 53 | 'test' : True 54 | } 55 | if "gensim" in SUMMS and "gensim" in req['params'].split(): 56 | req['summ'] = None 57 | return (SlackRouter(test=True).get_summary(**req)) 58 | 59 | def main(): 60 | port = int(os.environ.get('PORT', 5000)) 61 | app.run(host='0.0.0.0', port=port, debug=False) 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.10.1 2 | Jinja2==2.7.3 3 | MarkupSafe==0.23 4 | Werkzeug==0.10.4 5 | clusterpoint-api==0.3.0 6 | decorator==3.4.2 7 | itsdangerous==0.24 8 | pycps==0.3.0 9 | requests[security]==2.8.1 10 | slacker==0.6.2 11 | wsgiref==0.1.2 12 | gensim==0.12.2 13 | ipython 14 | jupyter 15 | mock==1.3.0 16 | pbr==1.8.1 17 | spacy==0.99 18 | hypothesis 19 | -------------------------------------------------------------------------------- /slack_summary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests 3 | import json 4 | from config import * 5 | from ts_config import DEBUG, LOG_FILE, SUMMARY_INTERVALS, TEST_JSON, SUMMS 6 | from slacker import Slacker 7 | import slacker 8 | import logging 9 | import logging.handlers 10 | import uuid 11 | import re 12 | import io 13 | from datetime import timedelta, datetime 14 | if "gensim" in SUMMS: 15 | from ts_summarizer import TextRankTsSummarizer 16 | if "spacy" in SUMMS: 17 | from sp_summarizer import SpacyTsSummarizer 18 | 19 | class SlackRouter(object): 20 | expr = 
re.compile(r'-?(\d{1,3}?)\s+(\S{1,8})\s*(.*)$') 21 | plural = re.compile(r'([^s]+)s$') 22 | temporals = ['minute', 'min', 'hour', 'day', 'week'] 23 | 24 | 25 | def __init__(self, test=False): 26 | self.test = test 27 | self.slack = None if self.test else slacker.Slacker(keys["slack"]) 28 | log_level = logging.DEBUG if DEBUG else logging.INFO 29 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 30 | fh = logging.handlers.RotatingFileHandler('./slack_summary_'+LOG_FILE, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5,) 31 | fh.setLevel(log_level) 32 | fh.setFormatter(formatter) 33 | self.logger = logging.getLogger('slack_summary') 34 | self.logger.handlers = [] 35 | self.logger.setLevel(log_level) 36 | self.logger.addHandler(fh) 37 | 38 | def get_response(self, channel_id): 39 | self.logger.debug(u'Generating summary for channel: %s', channel_id) 40 | return self.slack.channels.history(channel_id) 41 | 42 | def get_messages(self, channel_id, params): 43 | """Get messages based upon the interval""" 44 | tdelt = self.build_delta(params) 45 | earliest_time = datetime.now()-tdelt 46 | self.logger.debug(u'Earliest time %s', earliest_time) 47 | ts = u'{}.999999'.format(earliest_time.strftime("%s")) 48 | self.logger.debug(u'Channel id %s, TS string %s', channel_id, ts) 49 | response = self.slack.channels.history(channel_id, oldest=ts, count=999) 50 | res = (response.body) 51 | add_more = True 52 | msgs = [] 53 | msg_ids = set() 54 | while add_more: 55 | if 'max_msgs' in params and params['max_msgs'] <= len(msgs): 56 | return msgs 57 | if u'messages' in res: 58 | new_set = set([msg['ts'] for msg in res['messages']]) 59 | if len(new_set.intersection(msg_ids)) > 0: 60 | self.logger.debug(u'Overlap in messages') 61 | return msgs 62 | msgs.extend(res['messages']) 63 | msg_ids.update(new_set) 64 | self.logger.debug(u'Got %s messages', len(msgs)) 65 | else: 66 | return msgs 67 | if 'has_more' in res and res['has_more']: 68 | 
self.logger.debug(u'Paging for more messages.') 69 | response = self.slack.channels.history(channel_id, oldest=ts, latest=res['messages'][-1]['ts'], count=999) 70 | res = (response.body) 71 | else: 72 | self.logger.debug(u'No more messages.') 73 | add_more = False 74 | return msgs 75 | 76 | def get_summary(self, **args): 77 | channel_id = args['channel_id'] if 'channel_id' in args else None 78 | channel_name = args['channel_name'] if 'channel_name' in args else None 79 | user_id = args['user_id'] if 'user_id' in args else None 80 | user_name = args['user_name'] if 'user_name' in args else None 81 | params = args['params'] if 'params' in args else None 82 | request_id = uuid.uuid1() 83 | response = None 84 | msgs = None 85 | if self.test: 86 | with io.open(TEST_JSON, encoding='utf-8') as iot: 87 | msgs = json.load(iot)[u'messages'] 88 | else: 89 | msgs = self.get_messages(channel_id, params) 90 | summ_object = args['summ'] 91 | summ_impl = None 92 | summary = u'' 93 | if summ_object and "spacy" in SUMMS: 94 | self.logger.info(u'Using spacy') 95 | summ_impl = SpacyTsSummarizer() 96 | summ_impl.set_summarizer(summ_object) 97 | elif "gensim" in SUMMS: 98 | self.logger.info(u'Using gensim') 99 | summ_impl = TextRankTsSummarizer() 100 | if summ_impl: 101 | summ_impl.set_channel(channel_name) 102 | summary = summ_impl.summarize(msgs) 103 | else: 104 | self.logger.warn(u'No summarizer was set!') 105 | self.logger.info(u'Summary request %s user_id: %s', request_id, user_id) 106 | self.logger.info(u'Summary request %s channel_name: %s', request_id, channel_name) 107 | self.logger.info(u'Summary request %s parameters: %s', request_id, params) 108 | self.logger.debug(u'Summary request %s messages: %s', request_id, msgs) 109 | self.logger.info(u'Summary request %s summary:\n %s', request_id, summary) 110 | res = u"*Chat Summary:* \n " + summary + "\n \n" 111 | return res 112 | 113 | def _parse_args(self, commands): 114 | units = None 115 | unit = None 116 | keywords = None 117 
| if commands and len(commands.strip()) > 1: 118 | match = SlackRouter.expr.match(commands) 119 | if match: 120 | units, unit, keywords = match.groups() 121 | unit = unit.lower() 122 | umatch = SlackRouter.plural.match(unit) 123 | unit = umatch.groups()[0] if umatch else unit 124 | unit = unit if unit in SlackRouter.temporals else None 125 | if unit and unit == 'min': 126 | unit = 'minute' 127 | units = int(units) if unit else None 128 | else: 129 | keywords = commands 130 | if not unit: 131 | units = None 132 | keywords = commands 133 | return unit, units, keywords 134 | 135 | def build_interval(self, commands): 136 | """Return a single interval for the summarization""" 137 | unit, units, keywords = self._parse_args(commands) 138 | interval = {'size': 3} 139 | if unit: 140 | interval[unit+'s'] = units 141 | interval['txt'] = u"Summary for last {} {}:\n".format(units, unit) 142 | else: 143 | interval['days'] = 5 144 | interval['txt'] = u"Summary for last 5 days:\n" 145 | return [interval] 146 | 147 | def build_delta(self, commands): 148 | """Return a single interval for the summarization""" 149 | unit, units, keywords = self._parse_args(commands) 150 | interval = {'seconds':0, 'minutes': 0, 'hours': 0, 'days': 0, 'weeks': 0} 151 | if unit: 152 | interval[unit+'s'] = units 153 | else: 154 | interval['days'] = 5 155 | return timedelta(**interval) 156 | 157 | -------------------------------------------------------------------------------- /sp_summarizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import namedtuple 3 | from datetime import (timedelta, datetime) 4 | import re 5 | import logging 6 | import logging.handlers 7 | import sys 8 | import json 9 | import io 10 | from ts_config import TS_DEBUG, TS_LOG 11 | import glob 12 | from utils import get_msg_text 13 | from interval_summarizer import (IntervalSpec, TsSummarizer, 14 | canonicalize, ts_to_time, tspec_to_delta) 15 | 
logging.basicConfig(level=logging.INFO) 16 | 17 | class SpacyTsSummarizer(TsSummarizer): 18 | 19 | def __init__(self, ): 20 | TsSummarizer.__init__(self, ) 21 | log_level = logging.DEBUG if TS_DEBUG else logging.INFO 22 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 23 | fh = logging.handlers.RotatingFileHandler('./spacy_'+TS_LOG, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5) 24 | fh.setLevel(log_level) 25 | fh.setFormatter(formatter) 26 | self.logger = logging.getLogger('sp_summarizer') 27 | self.logger.handlers = [] 28 | self.logger.addHandler(fh) 29 | 30 | def set_summarizer(self, spacy_summ): 31 | self.sumr = spacy_summ 32 | 33 | def summarize(self, msgs, range_spec=None): 34 | """Return a summary of the text 35 | TODO: 1. Looks like spacy is not getting the main sentence from the message. 36 | 2. Load times for the spacy summarizer won't cut it. Commenting out now 37 | until this can be fixed 38 | """ 39 | size = range_spec['size'] if range_spec and 'size' in range_spec else 3 40 | if not msgs or len(msgs) == 0: 41 | self.logger.warn("No messages to form summary") 42 | return u"\n Unable to form summary here.\n" 43 | txt = range_spec['txt'] if range_spec else u'Summary is' 44 | if range_spec: 45 | self.logger.info("First 10 messages %s of %s", msgs[:10], len(msgs)) 46 | self.logger.info("Using time range spec %s", range_spec) 47 | start_time = datetime.strptime(range_spec['start'], "%B %d %Y") if 'start' in range_spec else ts_to_time(min(msgs, key=lambda m: m['ts'])['ts']) 48 | self.logger.info("Start time is %s", start_time) 49 | delt = tspec_to_delta(**range_spec) 50 | end_time = start_time + delt 51 | self.logger.info("End time is %s", end_time) 52 | msgs = [msg for msg in msgs if ts_to_time(msg['ts']) >= start_time and ts_to_time(msg['ts']) <= end_time] 53 | self.logger.info("First 10 messages %s of %s", msgs[:10], len(msgs)) 54 | summ = txt + u' ' 55 | summ_list = [] 56 | can_dict =
{canonicalize(get_msg_text(msg)) : msg for msg in msgs} 57 | top_keys = sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True) 58 | can_dict = {key: can_dict[key] for key in top_keys} 59 | self.logger.info("Length of can_dict is %s", len(can_dict)) 60 | simple_sum_list = [can_dict[ss] for ss in sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True)[:size]] 61 | simple_sum = u'\n'.join([self.tagged_sum(can_dict[ss]) for ss in sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True)[:size]]) 62 | #simple_sum = u'\n'.join([self.tagged_sum(ss) for ss in simple_sum_list]) 63 | assert(len(simple_sum_list) <= size) 64 | #simple_sum = self.tagged_sum(can_dict[max(can_dict.keys(), key=lambda x: len(x))]) 65 | if len(msgs) < 10: 66 | #return the longest 67 | summ += u'\n'.join([self.tagged_sum(ss) for ss in sorted(simple_sum_list, key=lambda x: x['ts'])]) 68 | else: 69 | max_sents = {} 70 | user_sents = {} 71 | for (txt, msg) in can_dict.items(): 72 | if len(txt.split()) > 3: 73 | sl = list(self.sumr.nlp(txt).sents) 74 | max_sents[max(sl, key = lambda x: len(x)).text] = msg 75 | user_sents[max(sl, key = lambda x: len(x)).text] = msg['user'] if 'user' in msg else u'' 76 | txt_sum = [v for v in self.sumr(u' '.join(max_sents.keys()), size, user_sents)] 77 | self.logger.info("Canonical keys are \n%s", u' '.join(can_dict.keys())) 78 | self.logger.info("Spacy summ %s", txt_sum) 79 | nlp_summ = u'\n'.join([self.tagged_sum(max_sents[ss]) for ss in txt_sum if len(ss) > 1 and ss in max_sents]) 80 | nlp_list = [max_sents[ss] for ss in txt_sum if len(ss) > 1 and ss in max_sents] 81 | for ss in txt_sum: 82 | if ss not in max_sents and len(ss.split()) > 5: 83 | self.logger.info("Searching for: %s", ss) 84 | for (ky, msg) in max_sents.items(): 85 | if ss in ky or (len(ky.split()) > 10 and ky in ss) and len(nlp_list) <= size: 86 | nlp_summ += u'\n' + self.tagged_sum(msg) 87 | nlp_list.append(msg) 88 | if len(nlp_list) < 2: 89 | self.logger.info("Failed 
to find nlp summary using heuristic") 90 | summ += u'\n'.join([self.tagged_sum(ss) for ss in sorted(simple_sum_list, key=lambda x: x['ts'])]) 91 | else: 92 | self.logger.info("First msg is %s, %s", nlp_list[0], nlp_list[0]['ts']) 93 | self.logger.info("Sorted is %s", sorted(nlp_list, key=lambda x: x['ts'])) 94 | summ += u'\n'.join([self.tagged_sum(ss) for ss in sorted(nlp_list, key=lambda x: x['ts'])]) 95 | self.logger.info("Summary for segment %s is %s", msgs, summ) 96 | return summ 97 | 98 | def parify_text(self, msg_segment): 99 | ptext = u'. '.join([SpacyTsSummarizer.flrg.sub(u'', msg['text']) for msg in msg_segment if 'text' in msg]) 100 | self.logger.debug("Parified text is %s", ptext) 101 | return ptext 102 | 103 | def main(): 104 | asd = [{'minutes': 30, 'txt' : u'Summary for first 30 minutes:\n', 'size' : 2}, {'hours':36, 'txt' : u'Summary for next 36 hours:\n', 'size': 3}] 105 | logger = logging.getLogger(__name__) 106 | tr_summ = SpacyTsSummarizer() 107 | all_msgs = [] 108 | for msg_file in glob.glob('./data/*.json'): 109 | with io.open(msg_file, encoding='utf-8',) as mf: 110 | all_msgs += json.load(mf) 111 | for filt in asd: 112 | logger.info(tr_summ.summarize(all_msgs, range_spec=filt)) 113 | 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /test-events.json: -------------------------------------------------------------------------------- 1 | {"messages": [ 2 | { 3 | "type": "message", 4 | "user": "U029LMSEC", 5 | "text": "i\u2019m wondering if in the future we would like some kind of heatmap option for homepage, like\n\n `wpcom_homepage_link_click` with properties: signup_top, signup_bottom, xyz, abc, etc", 6 | "ts": "1441909889.000130" 7 | }, 8 | { 9 | "type": "message", 10 | "user": "U0EBEC5T5", 11 | "text": "because i imagine the places we link people will vary quite a bit with tests", 12 | "ts": "1441909928.000131" 13 | }, 14 | { 15 | "type": "message", 16 | 
"user": "U029LMSEC", 17 | "text": "> If there are 2 buttons on the page going to the same link, you could differentiate them by putting in a query parameter to the url", 18 | "ts": "1441910041.000132" 19 | }, 20 | { 21 | "type": "message", 22 | "user": "U029LMSEC", 23 | "text": "wondering why that is better than adding a tracks event", 24 | "ts": "1441910059.000133" 25 | }, 26 | { 27 | "type": "message", 28 | "user": "U03CGFPKV", 29 | "text": "once we have user properties, we should be able to tell if a specific user is a paid user at a given point in time", 30 | "ts": "1441925382.000186" 31 | }, 32 | { 33 | "type": "message", 34 | "user": "U03CGFPKV", 35 | "text": "but we can't really do that efficiently yet", 36 | "ts": "1441925388.000187" 37 | }, 38 | { 39 | "type": "message", 40 | "user": "U029LMSEC", 41 | "text": "perf. understood", 42 | "ts": "1441925394.000188" 43 | }, 44 | { 45 | "type": "message", 46 | "user": "U029LMSEC", 47 | "text": "thanks for the debrief", 48 | "ts": "1441925398.000189" 49 | } 50 | ]} 51 | -------------------------------------------------------------------------------- /test_hypothesis_summarizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import json 3 | import io 4 | from ts_summarizer import (TextRankTsSummarizer) 5 | from interval_summarizer import (IntervalSpec, TsSummarizer, 6 | ts_to_time) 7 | from datetime import datetime 8 | import logging 9 | import sys 10 | import config 11 | from ts_config import DEBUG 12 | from hypothesis import given 13 | from hypothesis.strategies import (sampled_from, lists, just, integers) 14 | import glob 15 | import random 16 | logger = logging.getLogger() 17 | logger.level = logging.DEBUG if DEBUG else logging.INFO 18 | test_json_msgs = json.load(io.open("./test-events.json", encoding='utf-8'))['messages'] 19 | test_json_msgs_c2 = json.load(io.open("./data/test-events-elastic.json", encoding='utf-8'))['messages'] 20 | 
test_json_msgs_c3 = [] 21 | 22 | def read_dir(fdir): 23 | coll = [] 24 | for jfile in glob.glob('./data/slack-logs-2/{}/*.json'.format(fdir)): 25 | coll += json.load(io.open(jfile, encoding='utf-8')) 26 | return coll 27 | 28 | test_json_msgs_c3 = [(fdir, read_dir(fdir)) for fdir in ['api-test', 'calypso', 'games', 'happiness', 'hg', 'jetpack', 'jetpackfuel', 'livechat', 'tickets', 'vip']] 29 | 30 | print len(test_json_msgs_c3) 31 | 32 | class TestSummarize(unittest.TestCase): 33 | 34 | test_msgs = test_json_msgs 35 | 36 | @given( 37 | lists(elements=sampled_from(test_json_msgs), min_size=3), 38 | integers(min_value=1, max_value=20) 39 | ) 40 | def test_text_rank_summarization_ds1_days(self, smp_msgs, days): 41 | """Generate something for N day interval""" 42 | logger.info("Input is %s", smp_msgs) 43 | asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)} 44 | summ = TextRankTsSummarizer() 45 | summ.set_channel('elasticsearch') 46 | sumry = summ.summarize(smp_msgs, range_spec=asd) 47 | logger.debug("Summary is %s", sumry) 48 | # Length of summary is at least 1 and no greater than 3 49 | self.assertTrue(len(sumry) >= 1) 50 | self.assertTrue(len(sumry) <= 3) 51 | # Length of summary is less than or equal to the original length 52 | self.assertTrue(len(sumry) <= len(smp_msgs)) 53 | # Each message in the summary must correspond to a message 54 | 55 | 56 | @given( 57 | lists(elements=sampled_from(test_json_msgs_c2), min_size=12), 58 | integers(min_value=1, max_value=20) 59 | ) 60 | def test_text_rank_summarization_ds2_days(self, smp_msgs, days): 61 | """Generate something for N day interval""" 62 | logger.info("Input is %s", smp_msgs) 63 | asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)} 64 | summ = TextRankTsSummarizer() 65 | summ.set_channel('elasticsearch') 66 | sumry = summ.summarize(smp_msgs, range_spec=asd) 67 | logger.debug("Summary is %s", sumry) 68 | # Length of summary is at least 1 and no 
greater than 3 69 | self.assertTrue(len(sumry) >= 1) 70 | self.assertTrue(len(sumry) <= 3) 71 | # Length of summary is less than or equal to the original length 72 | self.assertTrue(len(sumry) <= len(smp_msgs)) 73 | # Each message in the summary must correspond to a message 74 | 75 | 76 | @given( 77 | integers(min_value=1, max_value=1000), 78 | integers(min_value=1, max_value=20) 79 | ) 80 | def test_text_rank_summarization_ds3_days(self, sampsize, days): 81 | """Generate something for N day interval""" 82 | channel, ssamp = random.choice(test_json_msgs_c3) 83 | samp = ssamp[random.randint(1,len(ssamp)-2):] 84 | logger.info("Input is segment is %s", samp) 85 | asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)} 86 | summ = TextRankTsSummarizer() 87 | summ.set_channel(channel) 88 | sumry = summ.summarize(samp, range_spec=asd) 89 | logger.debug("Summary is %s", sumry) 90 | # Length of summary is at least 1 and no greater than 3 91 | self.assertTrue(len(sumry) >= 1) 92 | self.assertTrue(len(sumry) <= 3) 93 | # Length of summary is less than or equal to the original length 94 | #self.assertTrue(len(sumry) <= len(samp)) 95 | # Each message in the summary must correspond to a message 96 | 97 | 98 | @given(lists(elements=sampled_from(test_json_msgs), min_size=1), 99 | integers(min_value=1, max_value=24) 100 | ) 101 | def test_text_rank_summarization_ds1_hours(self, smp_msgs, hours): 102 | """Generate something for N hour intervals""" 103 | logger.info("Input is %s", smp_msgs) 104 | asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)} 105 | summ = TextRankTsSummarizer() 106 | summ.set_channel('elasticsearch') 107 | sumry = summ.summarize(smp_msgs, range_spec=asd) 108 | logger.debug("Summary is %s", sumry) 109 | # Length of summary is at least 1 and no greater than 3 110 | self.assertTrue(len(sumry) >= 1) 111 | self.assertTrue(len(sumry) <= 3) 112 | # Length of summary is less than or equal to the 
original length 113 | self.assertTrue(len(sumry) <= len(smp_msgs)) 114 | # Each message in the summary must correspond to a message 115 | 116 | 117 | @given(lists(elements=sampled_from(test_json_msgs_c2), min_size=1), 118 | integers(min_value=1, max_value=24) 119 | ) 120 | def test_text_rank_summarization_ds2_hours(self, smp_msgs, hours): 121 | """Generate something for N hour intervals""" 122 | logger.info("Input is %s", smp_msgs) 123 | asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)} 124 | summ = TextRankTsSummarizer() 125 | summ.set_channel('elasticsearch') 126 | sumry = summ.summarize(smp_msgs, range_spec=asd) 127 | logger.debug("Summary is %s", sumry) 128 | # Length of summary is at least 1 and no greater than 3 129 | self.assertTrue(len(sumry) >= 1) 130 | self.assertTrue(len(sumry) <= 3) 131 | # Length of summary is less than or equal to the original length 132 | self.assertTrue(len(sumry) <= len(smp_msgs)) 133 | # Each message in the summary must correspond to a message 134 | 135 | 136 | @given( 137 | integers(min_value=2, max_value=1000), 138 | integers(min_value=1, max_value=24) 139 | ) 140 | def test_text_rank_summarization_ds3_hours(self, sampsize, hours): 141 | """Generate something for N hour intervals""" 142 | channel, ssamp = random.choice(test_json_msgs_c3) 143 | samp = ssamp[random.randint(1,len(ssamp)-2):] 144 | logger.info("Input is segment is %s", samp) 145 | asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)} 146 | summ = TextRankTsSummarizer() 147 | summ.set_channel(channel) 148 | sumry = summ.summarize(samp, range_spec=asd) 149 | logger.debug("Summary is %s", sumry) 150 | # Length of summary is at least 1 and no greater than 3 151 | self.assertTrue(len(sumry) >= 1) 152 | self.assertTrue(len(sumry) <= 3) 153 | # Length of summary is less than or equal to the original length 154 | #self.assertTrue(len(sumry) <= len(samp)) 155 | # Each message in the summary must 
correspond to a message 156 | 157 | 158 | if __name__ == '__main__': 159 | unittest.main() 160 | 161 | -------------------------------------------------------------------------------- /test_service_components.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import mock 3 | from mock import MagicMock, patch 4 | from slacker import Slacker 5 | import slacker 6 | import main 7 | from slack_summary import SlackRouter 8 | from requests import Response 9 | import config 10 | from ts_config import DEBUG, LOG_FILE 11 | import sys 12 | import logging 13 | import logging.handlers 14 | import json 15 | import io 16 | 17 | class Test(unittest.TestCase): 18 | def setUp(self): 19 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 20 | log_level = logging.DEBUG if DEBUG else logging.INFO 21 | self.logger = logging.getLogger(__name__) 22 | self.fh = logging.handlers.RotatingFileHandler('./testing_'+LOG_FILE, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5,) 23 | self.fh.setLevel(log_level) 24 | self.fh.setFormatter(formatter) 25 | self.logger.handlers = [] 26 | self.logger.addHandler(self.fh) 27 | self.expected = {u'has_more': True, u'messages': [{u'text': u'hmmm...', 28 | u'ts': u'1414028037.000317', 29 | u'type': u'message', 30 | u'user': u'U027LSDDA'}], u'ok': True} 31 | with io.open('./data/test-events-elastic.json', encoding='utf-8') as jf: 32 | self.larger_expected = json.load(jf) 33 | self.myresponse = Response() 34 | self.myresponse.body = self.expected 35 | self.myresponse.status_code = 200 36 | attrs = {'history.return_value': self.myresponse,} 37 | self.channel_mock = MagicMock(**attrs) 38 | self.large_response = Response() 39 | self.large_response.body = self.larger_expected 40 | self.large_response.status_code = 200 41 | attrs2 = {'history.return_value': self.large_response,} 42 | self.channel_mock2 = MagicMock(**attrs2) 43 | main.app.config['TESTING'] = True 
44 | self.app = main.app.test_client() 45 | 46 | def tearDown(self): 47 | pass 48 | 49 | @mock.patch('slacker.Slacker') 50 | def test_summary(self, mock_slack): 51 | mock_slack.return_value.channels = self.channel_mock 52 | sr = SlackRouter() 53 | self.assertTrue(sr.get_response('elasticsearch') == self.myresponse) 54 | 55 | @mock.patch('slacker.Slacker') 56 | def test_service(self, mock_slack): 57 | mock_slack.return_value.channels = self.channel_mock 58 | rv = self.app.post('/slack', data=dict( 59 | channel_id='elasticsearch', 60 | channel_name='elasticsearch', 61 | user_id='user123', 62 | user_name='bob', 63 | text='-5 days @bob' 64 | ), follow_redirects=True) 65 | self.logger.handlers = [] 66 | self.logger.addHandler(self.fh) 67 | self.logger.info("Response is %s", rv.data) 68 | self.assertTrue(rv.status_code == 200) 69 | 70 | @mock.patch('slacker.Slacker') 71 | def test_service_lr(self, mock_slack): 72 | mock_slack.return_value.channels = self.channel_mock2 73 | rv = self.app.post('/slack', data=dict( 74 | channel_id='elasticsearch', 75 | channel_name='elasticsearch', 76 | user_id='user123456', 77 | user_name='bob2', 78 | text='-2 days @bob' 79 | ), follow_redirects=True) 80 | self.logger.handlers = [] 81 | self.logger.addHandler(self.fh) 82 | self.logger.info("Response is %s", rv.data) 83 | self.assertTrue(rv.status_code == 200) 84 | 85 | @mock.patch('slacker.Slacker') 86 | def test_service_no_command(self, mock_slack): 87 | mock_slack.return_value.channels = self.channel_mock2 88 | rv = self.app.post('/slack', data=dict( 89 | channel_id='elasticsearch', 90 | channel_name='elasticsearch', 91 | user_id='user123456', 92 | user_name='bob2', 93 | text='' 94 | ), follow_redirects=True) 95 | self.logger.handlers = [] 96 | self.logger.addHandler(self.fh) 97 | self.logger.info("Response is %s", rv.data) 98 | self.assertTrue(rv.status_code == 200) 99 | 100 | @mock.patch('slacker.Slacker') 101 | def test_service_no_text(self, mock_slack): 102 | 
mock_slack.return_value.channels = self.channel_mock2 103 | rv = self.app.post('/slack', data=dict( 104 | channel_id='elasticsearch', 105 | channel_name='elasticsearch', 106 | user_id='user123456', 107 | user_name='bob2' 108 | ), follow_redirects=True) 109 | self.logger.handlers = [] 110 | self.logger.addHandler(self.fh) 111 | self.logger.info("Response is %s", rv.data) 112 | self.assertTrue(rv.status_code == 200) 113 | 114 | @mock.patch('slacker.Slacker') 115 | def test_service_bad_text(self, mock_slack): 116 | mock_slack.return_value.channels = self.channel_mock2 117 | rv = self.app.post('/slack', data=dict( 118 | channel_id='elasticsearch', 119 | channel_name='elasticsearch', 120 | user_id='user123456', 121 | user_name='bob2', 122 | text='adjfalkjldkj adfajldkajflkjadh ndnakdjlkjlkjd' 123 | ), follow_redirects=True) 124 | self.logger.handlers = [] 125 | self.logger.addHandler(self.fh) 126 | self.logger.info("Response is %s", rv.data) 127 | self.assertTrue(rv.status_code == 200) 128 | 129 | @mock.patch('slacker.Slacker') 130 | def test_service_bad_units(self, mock_slack): 131 | mock_slack.return_value.channels = self.channel_mock2 132 | rv = self.app.post('/slack', data=dict( 133 | channel_id='elasticsearch', 134 | channel_name='elasticsearch', 135 | user_id='user123456', 136 | user_name='bob2', 137 | text='2 adjfalkjldkj adfajldkajflkjadh ndnakdjlkjlkjd' 138 | ), follow_redirects=True) 139 | self.logger.handlers = [] 140 | self.logger.addHandler(self.fh) 141 | self.logger.info("Response is %s", rv.data) 142 | self.assertTrue(rv.status_code == 200) 143 | 144 | @mock.patch('slacker.Slacker') 145 | def test_gensim(self, mock_slack): 146 | mock_slack.return_value.channels = self.channel_mock2 147 | rv = self.app.post('/slack', data=dict( 148 | channel_id='elasticsearch', 149 | channel_name='elasticsearch', 150 | user_id='user123456', 151 | user_name='bob2', 152 | text='2 days gensim' 153 | ), follow_redirects=True) 154 | self.logger.handlers = [] 155 | 
self.logger.addHandler(self.fh) 156 | self.logger.info("Response is %s", rv.data) 157 | self.assertTrue(rv.status_code == 200) 158 | 159 | 160 | if __name__ == '__main__': 161 | unittest.main() 162 | -------------------------------------------------------------------------------- /test_spacy_with_hypothesis.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import json 3 | import io 4 | from sp_summarizer import (SpacyTsSummarizer) 5 | import hypothesis.settings as hs 6 | from interval_summarizer import (IntervalSpec, TsSummarizer, 7 | ts_to_time) 8 | import lsa 9 | from datetime import datetime 10 | import logging 11 | import sys 12 | import config 13 | from ts_config import DEBUG 14 | from hypothesis import given 15 | from hypothesis.strategies import (sampled_from, lists, just, integers) 16 | import glob 17 | import random 18 | logger = logging.getLogger() 19 | logger.level = logging.DEBUG if DEBUG else logging.INFO 20 | test_json_msgs = json.load(io.open("./test-events.json", encoding='utf-8'))['messages'] 21 | test_json_msgs_c2 = json.load(io.open("./data/test-events-elastic.json", encoding='utf-8'))['messages'] 22 | test_json_msgs_c3 = [] 23 | 24 | def read_dir(fdir): 25 | coll = [] 26 | for jfile in glob.glob('./data/slack-logs-2/{}/*.json'.format(fdir)): 27 | coll += json.load(io.open(jfile, encoding='utf-8')) 28 | return coll 29 | 30 | test_json_msgs_c3 = [(fdir, read_dir(fdir)) for fdir in ['api-test', 'calypso', 'games', 'happiness', 'hg', 'jetpack', 'jetpackfuel', 'livechat', 'tickets', 'vip']] 31 | 32 | class TestSummarize(unittest.TestCase): 33 | 34 | test_msgs = test_json_msgs 35 | summ = SpacyTsSummarizer() 36 | summ.set_summarizer(lsa.LsaSummarizer()) 37 | 38 | 39 | @given( 40 | lists(elements=sampled_from(test_json_msgs), min_size=3), 41 | integers(min_value=1, max_value=20), settings=hs.Settings(timeout=1000) 42 | ) 43 | def test_text_rank_summarization_ds1_days(self, smp_msgs, days): 44 | 
"""Generate something for N day interval""" 45 | logger.info("Input is %s", smp_msgs) 46 | asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)} 47 | #TestSummarize.summ.set_interval() 48 | TestSummarize.summ.set_channel('elasticsearch') 49 | sumry = TestSummarize.summ.summarize(smp_msgs, range_spec=asd) 50 | logger.debug("Summary is %s", sumry) 51 | # Length of summary is at least 1 and no greater than 3 52 | self.assertTrue(len(sumry) >= 1) 53 | #self.assertTrue(len(sumry) <= 3) 54 | # Length of summary is less than or equal to the original length 55 | #self.assertTrue(len(sumry) <= len(smp_msgs)) 56 | # Each message in the summary must correspond to a message 57 | 58 | 59 | @given( 60 | lists(elements=sampled_from(test_json_msgs_c2), min_size=12), 61 | integers(min_value=1, max_value=20), settings=hs.Settings(timeout=1000) 62 | ) 63 | def test_text_rank_summarization_ds2_days(self, smp_msgs, days): 64 | """Generate something for N day interval""" 65 | logger.info("Input is %s", smp_msgs) 66 | asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)} 67 | #TestSummarize.summ.set_interval(asd) 68 | TestSummarize.summ.set_channel('elasticsearch') 69 | sumry = TestSummarize.summ.summarize(smp_msgs, range_spec=asd) 70 | logger.debug("Summary is %s", sumry) 71 | # Length of summary is at least 1 and no greater than 3 72 | self.assertTrue(len(sumry) >= 1) 73 | #self.assertTrue(len(sumry) <= 3) 74 | # Length of summary is less than or equal to the original length 75 | #self.assertTrue(len(sumry) <= len(smp_msgs)) 76 | # Each message in the summary must correspond to a message 77 | 78 | 79 | @given( 80 | integers(min_value=1, max_value=1000), 81 | integers(min_value=1, max_value=20), settings=hs.Settings(timeout=1000) 82 | ) 83 | def test_text_rank_summarization_ds3_days(self, sampsize, days): 84 | """Generate something for N day interval""" 85 | channel, ssamp = random.choice(test_json_msgs_c3) 86 | samp = 
ssamp[random.randint(1,len(ssamp)-2):] 87 | logger.info("Input is segment is %s", samp) 88 | asd = {'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)} 89 | #TestSummarize.summ.set_interval() 90 | TestSummarize.summ.set_channel(channel) 91 | sumry = TestSummarize.summ.summarize(samp, range_spec=asd) 92 | logger.debug("Summary is %s", sumry) 93 | # Length of summary is at least 1 and no greater than 3 94 | self.assertTrue(len(sumry) >= 1) 95 | #self.assertTrue(len(sumry) <= 3) 96 | # Length of summary is less than or equal to the original length 97 | #self.assertTrue(len(sumry) <= len(samp)) 98 | # Each message in the summary must correspond to a message 99 | 100 | 101 | @given(lists(elements=sampled_from(test_json_msgs), min_size=1), 102 | integers(min_value=1, max_value=24), settings=hs.Settings(timeout=1000) 103 | ) 104 | def test_text_rank_summarization_ds1_hours(self, smp_msgs, hours): 105 | """Generate something for N hour intervals""" 106 | logger.info("Input is %s", smp_msgs) 107 | asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)} 108 | #TestSummarize.summ.set_interval() 109 | TestSummarize.summ.set_channel('elasticsearch') 110 | sumry = TestSummarize.summ.summarize(smp_msgs, range_spec=asd) 111 | logger.debug("Summary is %s", sumry) 112 | # Length of summary is at least 1 and no greater than 3 113 | self.assertTrue(len(sumry) >= 1) 114 | #self.assertTrue(len(sumry) <= 3) 115 | # Length of summary is less than or equal to the original length 116 | #self.assertTrue(len(sumry) <= len(smp_msgs)) 117 | # Each message in the summary must correspond to a message 118 | 119 | 120 | @given(lists(elements=sampled_from(test_json_msgs_c2), min_size=1), 121 | integers(min_value=1, max_value=24), settings=hs.Settings(timeout=1000) 122 | ) 123 | def test_text_rank_summarization_ds2_hours(self, smp_msgs, hours): 124 | """Generate something for N hour intervals""" 125 | logger.info("Input is %s", smp_msgs) 
126 | asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)} 127 | #TestSummarize.summ.set_interval() 128 | TestSummarize.summ.set_channel('elasticsearch') 129 | sumry = TestSummarize.summ.summarize(smp_msgs, range_spec=asd) 130 | logger.debug("Summary is %s", sumry) 131 | # Length of summary is at least 1 and no greater than 3 132 | self.assertTrue(len(sumry) >= 1) 133 | #self.assertTrue(len(sumry) <= 3) 134 | # Length of summary is less than or equal to the original length 135 | #self.assertTrue(len(sumry) <= len(smp_msgs)) 136 | # Each message in the summary must correspond to a message 137 | 138 | 139 | @given( 140 | integers(min_value=2, max_value=1000), 141 | integers(min_value=1, max_value=24), settings=hs.Settings(timeout=1000) 142 | ) 143 | def test_text_rank_summarization_ds3_hours(self, sampsize, hours): 144 | """Generate something for N hour intervals""" 145 | channel, ssamp = random.choice(test_json_msgs_c3) 146 | samp = ssamp[random.randint(1,len(ssamp)-2):] 147 | TestSummarize.summ.set_channel(channel) 148 | logger.info("Input is segment is %s", samp) 149 | asd = {'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)} 150 | sumry = TestSummarize.summ.summarize(samp, range_spec=asd) 151 | logger.debug("Summary is %s", sumry) 152 | # Length of summary is at least 1 and no greater than 3 153 | self.assertTrue(len(sumry) >= 1) 154 | #self.assertTrue(len(sumry) <= 3) 155 | # Length of summary is less than or equal to the original length 156 | #self.assertTrue(len(sumry) <= len(samp)) 157 | # Each message in the summary must correspond to a message 158 | 159 | 160 | if __name__ == '__main__': 161 | unittest.main() 162 | 163 | -------------------------------------------------------------------------------- /test_summarizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import json 3 | import io 4 | import config 5 | from ts_config 
import SUMMS 6 | from interval_summarizer import (IntervalSpec, TsSummarizer, 7 | ts_to_time) 8 | from datetime import datetime 9 | import logging 10 | import logging.handlers 11 | import sys 12 | from ts_config import DEBUG 13 | if "spacy" in SUMMS: 14 | from sp_summarizer import (SpacyTsSummarizer) 15 | import lsa 16 | if "gensim" in SUMMS: 17 | from ts_summarizer import (TextRankTsSummarizer) 18 | 19 | logger = logging.getLogger() 20 | logger.level = logging.DEBUG if DEBUG else logging.INFO 21 | 22 | class TestSummarize(unittest.TestCase): 23 | 24 | test_msgs = json.load(io.open("./test-events.json", encoding='utf-8'))['messages'] 25 | 26 | def test_interval_conversion(self): 27 | self.assertTrue(ts_to_time("1441925382.000186") == datetime.utcfromtimestamp(1441925382)) 28 | 29 | 30 | def test_summarizer_tag_display(self): 31 | """Make sure that the display of the tag is correct""" 32 | logger.info("Running the taggger test") 33 | asd = {'minutes': 60, 'size' : 2, 'txt' : u'Summary for first 60 minutes:\n'} 34 | summ = TsSummarizer() 35 | summ.set_channel("elasticsearch") 36 | summ_msg = summ.tagged_sum(TestSummarize.test_msgs[1]) 37 | logger.debug("Test summ msg is %s", summ_msg) 38 | self.assertTrue(summ_msg == "@Thu-Sep-9-2015 18:32:08 <@U0EBEC5T5>: ") 39 | 40 | 41 | def test_gensim_summarization(self): 42 | """Pass the intervals to summarizer""" 43 | if "gensim" in SUMMS: 44 | asd = [{'minutes': 60, 'size' : 2, 'txt' : u'Summary for first 60 minutes:\n'}, {'hours':12, 'size' : 1, 'txt' : u'Summary for last 12 hours:\n'}] 45 | summ = None 46 | summ = TextRankTsSummarizer() 47 | summ.set_channel('elasticsearch') 48 | logger.debug("Testing gensim summarizer") 49 | sumry = summ.summarize(TestSummarize.test_msgs, range_spec=asd) 50 | logger.debug("Summary is %s", sumry) 51 | self.assertTrue(len(sumry) > 1) 52 | else: 53 | pass 54 | 55 | def test_spacy_summarization(self): 56 | """Pass the intervals to summarizer""" 57 | if "spacy" in SUMMS: 58 | asd = [{'minutes': 
60, 'size' : 2, 'txt' : u'Summary for first 60 minutes:\n'}, {'hours':12, 'size' : 1, 'txt' : u'Summary for last 12 hours:\n'}] 59 | summ = None 60 | lsa_summ = lsa.LsaSummarizer() 61 | summ = SpacyTsSummarizer() 62 | for rs in asd: 63 | summ.set_summarizer(lsa_summ) 64 | summ.set_channel('elasticsearch') 65 | logger.debug("Testing spacy summarizer") 66 | sumry = summ.summarize(TestSummarize.test_msgs, range_spec=rs) 67 | logger.debug("Summary is %s, length %s", sumry, len(sumry)) 68 | self.assertTrue(len(sumry) > 1) 69 | else: 70 | pass 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | 76 | -------------------------------------------------------------------------------- /ts_config.py: -------------------------------------------------------------------------------- 1 | SUMMARY_INTERVALS = [{'days': 5, 'size': 2}, ] 2 | TS_DEBUG = True 3 | TS_LOG = "ts_summ.log" 4 | DEBUG=True 5 | LOG_FILE="summary.log" 6 | TEST_JSON="./data/test-events-elastic.json" 7 | SUMMS=["spacy"] 8 | 9 | 10 | -------------------------------------------------------------------------------- /ts_summarizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import namedtuple 3 | from datetime import (timedelta, datetime) 4 | import re 5 | import logging 6 | import logging.handlers 7 | import sys 8 | import json 9 | import io 10 | import lsa 11 | import utils 12 | import base_summarizer 13 | import compat 14 | from gensim.summarization import summarize as gs_sumrz 15 | from gensim.summarization.textcleaner import split_sentences 16 | from gensim.models.word2vec import LineSentence 17 | from ts_config import TS_DEBUG, TS_LOG 18 | import glob 19 | from interval_summarizer import (IntervalSpec, TsSummarizer, 20 | ts_to_time) 21 | from utils import get_msg_text 22 | logging.basicConfig(level=logging.INFO) 23 | 24 | class TextRankTsSummarizer(TsSummarizer): 25 | 26 | def __init__(self, ): 27 | 
TsSummarizer.__init__(self, ) 28 | log_level = logging.DEBUG if TS_DEBUG else logging.INFO 29 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 30 | fh = logging.handlers.RotatingFileHandler('./text_rank_'+TS_LOG, mode='a', encoding='utf-8', maxBytes=1000000, backupCount=5) 31 | fh.setLevel(log_level) 32 | fh.setFormatter(formatter) 33 | self.logger = logging.getLogger('ts_summarizer') 34 | self.logger.handlers = [] 35 | self.logger.addHandler(fh) 36 | 37 | def set_summarizer(self, val): 38 | pass 39 | 40 | def summarize(self, msgs, range_spec=None): 41 | """Return a summary of the text 42 | TODO: 1. Looks like spacy is not getting the main sentence from the message. 43 | 2. Load times for the spacy summarizer won't cut it. Commenting out now 44 | until this can be fixed 45 | """ 46 | if not msgs or len(msgs) == 0: 47 | self.logger.warn("No messages to form summary") 48 | return u"\n Unable to form summary here.\n" 49 | txt = range_spec['txt'] if range_spec else u'Summary is' 50 | size = range_spec['size'] if range_spec and 'size' in range_spec else 3 51 | summ = txt + u' ' 52 | #limit canonical dictionary to the 300 longest docs 53 | can_dict = {canonicalize(get_msg_text(msg)) : msg for msg in msgs} 54 | top_keys = sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True)[:300] 55 | can_dict = {key: can_dict[key] for key in top_keys} 56 | self.logger.info("Length of can_dict is %s", len(can_dict)) 57 | simple_sum = u'\n'.join([self.tagged_sum(can_dict[ss]) for ss in sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True)[:size]]) 58 | # If the number of messages or vocabulary is too low, just look for a 59 | # promising set of messages 60 | if len(msgs) < 11 or len(can_dict) < 11: 61 | #return the longest 62 | self.logger.warn("Too few messages for NLP.") 63 | summ += simple_sum 64 | else: 65 | max_sents = {} 66 | for (txt, msg) in can_dict.items(): 67 | if len(txt.split()) > 3: 68 | #Use the same splitting that 
gensim does 69 | for snt in split_sentences(txt): 70 | if len(snt.split()) > 100: 71 | snt = u' '.join(snt.split()[:100]) 72 | max_sents[snt] = msg 73 | ratio = (size * 2) / float(max(1, len(max_sents)))  # guard: max_sents can be empty when all texts are short 74 | #ratio = 0.3 75 | sent1 = u' '.join(can_dict.keys()) 76 | sent2 = u' '.join(max_sents.keys()) 77 | gn_sum = gs_sumrz(sent1, ratio=ratio, split=True)[:size] 78 | mx_sum = gs_sumrz(sent2, ratio=ratio, split=True)[:size] 79 | self.logger.info("Gensim sum %s", gn_sum) 80 | gs_summ = u'\n'.join([self.tagged_sum(can_dict[ss] if ss in can_dict else max_sents[ss]) for ss in gn_sum if len(ss) > 1 and (ss in max_sents or ss in can_dict)]) 81 | for ss in mx_sum: 82 | if ss not in max_sents and ss not in can_dict and len(ss.split()) > 5: 83 | self.logger.info("Searching for: %s", ss) 84 | for (ky, msg) in max_sents.items(): 85 | if ss in ky or (len(ky.split()) > 10 and ky in ss): 86 | gs_summ += u'\n' + self.tagged_sum(msg) 87 | if len(gn_sum) > 1: 88 | summ += gs_summ 89 | else: 90 | self.logger.warn("NLP Summarizer produced null output %s", gs_summ) 91 | summ += simple_sum 92 | self.logger.info("Summary for segment %s is %s", msgs, summ) 93 | return summ 94 | 95 | def parify_text(self, msg_segment): 96 | ptext = u'. 
'.join([TextRankTsSummarizer.flrg.sub(u'', get_msg_text(msg)) for msg in msg_segment]) 97 | self.logger.debug("Parified text is %s", ptext) 98 | return ptext 99 | 100 | def canonicalize(txt): 101 | """Change the messages so that each ends with punctuation""" 102 | ntxt = TsSummarizer.flrg.sub(u'', txt) 103 | ntxt = ntxt.strip() if re.match(r'.*[\.\?\!]\s*$', ntxt) else u'{}.'.format(ntxt.strip()) 104 | return ntxt if len(ntxt.split()) < 100 else u' '.join(ntxt.split()[:100]) 105 | #return ntxt if re.match(r'.*[\.\?]$', ntxt) else u'{}.'.format(ntxt) 106 | 107 | def main(): 108 | asd = [{'minutes': 30, 'txt' : u'Summary for first 30 minutes:\n', 'size' : 2}, {'hours':36, 'txt' : u'Summary for next 36 hours:\n', 'size': 3}] 109 | logger = logging.getLogger(__name__) 110 | tr_summ = TextRankTsSummarizer() 111 | all_msgs = [] 112 | for msg_file in glob.glob('./data/*.json'): 113 | with io.open(msg_file, encoding='utf-8',) as mf: 114 | all_msgs += json.load(mf) 115 | for filt in asd: 116 | logger.info(tr_summ.summarize(all_msgs, range_spec=filt)) 117 | 118 | if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division, print_function, unicode_literals 5 | 6 | class ItemsCount(object): 7 | def __init__(self, value): 8 | self._value = value 9 | self.string_types = (str, unicode) 10 | 11 | def __call__(self, sequence): 12 | if isinstance(self._value, self.string_types): 13 | if self._value.endswith("%"): 14 | total_count = len(sequence) 15 | percentage = int(self._value[:-1]) 16 | # at least one sentence should be chosen 17 | count = max(1, total_count*percentage // 100) 18 | return sequence[:count] 19 | else: 20 | return sequence[:int(self._value)] 21 | elif isinstance(self._value, (int, float)): 22 | 
return sequence[:int(self._value)] 23 | else: 24 | raise ValueError("Unsupported value of items count '%s'." % self._value) 25 | 26 | def __repr__(self): 27 | return "<ItemsCount: %r>" % self._value 28 | 29 | def maybe_get(cont, key, default=None): 30 | return cont[key] if key in cont else default 31 | 32 | def get_msg_text(msg): 33 | """Pull the appropriate text from the message""" 34 | if 'text' in msg and len(msg['text']) > 0: 35 | return msg['text'] 36 | if 'attachments' in msg: 37 | ats = msg['attachments'] 38 | if len(ats) > 0: 39 | at = ats[0] 40 | att_text = [] 41 | if 'title' in at: 42 | att_text.append(at['title']) 43 | if 'text' in at: 44 | att_text.append(at['text']) 45 | max_text = max(att_text, key=lambda txt: len(txt)) if att_text else u"" 46 | if len(max_text) > 0: 47 | return max_text 48 | return u"" 49 | 50 | --------------------------------------------------------------------------------
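As a quick illustration of `get_msg_text`'s fallback order (prefer the message's own `text`, then the longest of the first attachment's `title`/`text`, else the empty string), here is a standalone sketch with the function inlined; the guard for an attachment carrying neither field is a safety addition, not part of the original:

```python
def get_msg_text(msg):
    """Pull the appropriate text from the message."""
    if 'text' in msg and len(msg['text']) > 0:
        return msg['text']
    if 'attachments' in msg:
        ats = msg['attachments']
        if len(ats) > 0:
            at = ats[0]
            att_text = []
            if 'title' in at:
                att_text.append(at['title'])
            if 'text' in at:
                att_text.append(at['text'])
            if att_text:  # guard: attachment may carry neither title nor text
                max_text = max(att_text, key=lambda txt: len(txt))
                if len(max_text) > 0:
                    return max_text
    return u""

# A plain message: the 'text' field wins.
print(get_msg_text({'text': u'perf. understood'}))  # perf. understood
# Empty 'text' falls back to the longest attachment field.
print(get_msg_text({'text': u'', 'attachments': [{'title': u'T', 'text': u'a longer body'}]}))  # a longer body
```

This fallback matters for the summarizers above: canonicalization keys every message by this text, so a bot message whose payload lives only in attachments still contributes to the summary.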
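`ItemsCount.__call__` accepts either an absolute count or a percentage string such as `"40%"`. A minimal Python 3 re-sketch of the same selection rule (the function name here is illustrative, not part of the module):

```python
def take_items(value, sequence):
    """Mirror ItemsCount: percentage strings keep at least one item;
    numbers (or numeric strings) act as an absolute cutoff."""
    if isinstance(value, str):
        if value.endswith("%"):
            percentage = int(value[:-1])
            # at least one sentence should be chosen
            count = max(1, len(sequence) * percentage // 100)
            return sequence[:count]
        return sequence[:int(value)]
    if isinstance(value, (int, float)):
        return sequence[:int(value)]
    raise ValueError("Unsupported value of items count '%s'." % value)

print(take_items("50%", ['a', 'b', 'c', 'd']))  # ['a', 'b']
print(take_items("10%", ['a', 'b', 'c', 'd']))  # ['a'] - floored, but never empty
print(take_items(3, ['a', 'b', 'c', 'd']))      # ['a', 'b', 'c']
```

The `max(1, ...)` floor is the important design choice: a percentage that rounds down to zero still yields one sentence, so a summary is never silently empty.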