├── .gitignore
├── .gitattributes
├── assets
    ├── topically-name_cluster.png
    ├── topically_name_topics.png
    ├── topically-name_topics-example.png
    └── topic-modeling-picture-thousand-texts.png
├── tests
    ├── __init__.py
    └── test_cluster_namer.py
├── topically
    ├── __init__.py
    ├── app.py
    ├── cluster_namers.py
    └── prompts
    │   └── prompts.py
├── CONTRIBUTORS.md
├── pyproject.toml
├── LICENSE
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | .gitattributes
3 | *.pyc
4 | .DS_Store
5 | dist/*
6 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/assets/topically-name_cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/sandbox-topically/main/assets/topically-name_cluster.png


--------------------------------------------------------------------------------
/assets/topically_name_topics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/sandbox-topically/main/assets/topically_name_topics.png


--------------------------------------------------------------------------------
/assets/topically-name_topics-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/sandbox-topically/main/assets/topically-name_topics-example.png


--------------------------------------------------------------------------------
/assets/topic-modeling-picture-thousand-texts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/sandbox-topically/main/assets/topic-modeling-picture-thousand-texts.png


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Cohere Inc. and its affiliates.
2 | #
3 | # Licensed under the MIT License (the "License");
4 | # you may not use this file except in compliance with the License.
5 | #
6 | # You may obtain a copy of the License in the LICENSE file at the top
7 | # level of this repository.
8 | 


--------------------------------------------------------------------------------
/topically/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022 Cohere Inc. and its affiliates.
 2 | #
 3 | # Licensed under the MIT License (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | #
 6 | # You may obtain a copy of the License in the LICENSE file at the top
 7 | # level of this repository.
 8 | 
 9 | from .app import Topically
10 | 
11 | __version__ = "0.0.4"
12 | 


--------------------------------------------------------------------------------
/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
 1 | Thank you for your interest in contributing to this repository. To help maintain
 2 | the quality of the codebase and ensure a quick review of your pull request, you
 3 | should:
 4 | 1. Write clear, clean code and format it in line with the style used in the 
 5 | repository.
 6 | 2. Leave comments, and use docstrings where appropriate.
 7 | 3. Add unit tests for any new functionality you introduce, if a set of test cases
 8 | is already set up in the repository.
 9 | 4. Use git commit messages to leave an informative trace of what additions and
10 | changes were made.
11 | 5. Write an informative high level description of the pull request, changes made,
12 | and the reason for these changes before submitting the pull request.
13 | 
14 | If you have not signed our Contributor License Agreement, you will be asked to
15 | sign one by our automated system when you submit your first pull request to
16 | a Cohere repository.


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "topically"
 3 | version = "0.0.4"
 4 | description = ""
 5 | authors = ["Jay Alammar <jay@cohere.ai>"]
 6 | license = "MIT"
 7 | 
 8 | [tool.poetry.dependencies]
 9 | python = "^3.7"
10 | cohere = "^2.1"
11 | pandas = "^1.2"
12 | # numpy and scikit-learn are imported directly by the package (topically/app.py,
13 | # topically/cluster_namers.py), so declare them instead of relying on other
14 | # packages to pull them in. The version floors are conservative; adjust if needed.
15 | numpy = "^1.19"
16 | scikit-learn = "^1.0"
17 | bertopic = {version = "*", optional = true}
18 | 
19 | [tool.poetry.extras]
20 | bertopic = ["bertopic"]
21 | 
22 | [tool.poetry.dev-dependencies]
23 | yapf = "^0.32"
24 | toml = "^0.10"
25 | flake8 = "^5.0"
26 | isort = "^5.10"
27 | autoflake = "^1.6"
28 | 
29 | 
30 | [build-system]
31 | requires = ["poetry-core>=1.0.0"]
32 | build-backend = "poetry.core.masonry.api"
33 | 
34 | [tool.isort]
35 | profile = "google"
36 | skip_glob = "**/__init__.py"
37 | line_length = 120
38 | force_grid_wrap = 0
39 | use_parentheses = true
40 | multi_line_output = 0
41 | float_to_top = true
42 | 
43 | [tool.yapf]
44 | based_on_style = "google"
45 | indent_width = 4
46 | column_limit = 120
47 | 
48 | [tool.yapfignore]
49 | ignore_patterns = [
50 |     ".eggs/*",
51 |     ".git/*"
52 | ]


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Cohere Inc. and its affiliates.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/test_cluster_namer.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022 Cohere Inc. and its affiliates.
 2 | #
 3 | # Licensed under the MIT License (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | #
 6 | # You may obtain a copy of the License in the LICENSE file at the top
 7 | # level of this repository.
 8 | 
 9 | import cohere
10 | import numpy as np
11 | 
12 | from topically import cluster_namers
13 | from topically import Topically
14 | from topically.app import MockCohereAPI
15 | from topically.cluster_namers import ClusterNamer
16 | 
17 | 
18 | 
19 | def test_reranker():
20 |     """ Test that reranker properly orders generations by likelihood."""
21 | 
22 |     api = MockCohereAPI()
23 |     mock_gens_list = api.generate().generations
24 | 
25 |     ranked_generations = cluster_namers.rerank_by_likelihood(mock_gens_list)
26 |     assert list(ranked_generations) == ['one', 'two']
27 | 
28 | 
29 | def test_cluster_namer():
30 |     """ Test that test_cluster_namer() returns n_samples names."""
31 | 
32 |     app = Topically('', mockAPI=True)
33 | 
34 |     cluster_names, _ = app.name_topics(([0, 0, 0], [1, 1, 1]), num_generations=5)
35 | 
36 |     print(cluster_names)
37 |     assert cluster_names == ['one', 'one', 'one']
38 | 


--------------------------------------------------------------------------------
/topically/app.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022 Cohere Inc. and its affiliates.
  2 | #
  3 | # Licensed under the MIT License (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | #
  6 | # You may obtain a copy of the License in the LICENSE file at the top
  7 | # level of this repository.
  8 | 
  9 | from concurrent.futures import ThreadPoolExecutor
 10 | import getpass
 11 | import logging
 12 | 
 13 | import cohere
 14 | import numpy as np
 15 | 
 16 | from .cluster_namers import ClusterNamer
 17 | from .prompts.prompts import generic_cluster_naming_prompt
 18 | 
 19 | 
 20 | class Topically(object):
 21 | 
 22 |     def __init__(self, api_key: str = None, mockAPI: bool = False):
 23 |         if mockAPI:
 24 |             self.co = MockCohereAPI()
 25 |         else:
 26 |             if api_key is None:
 27 |                 api_key = getpass.getpass('Enter your Cohere API Key')
 28 | 
 29 |             self.co = cohere.Client(api_key)
 30 | 
 31 |     #TODO: Encapsulate this functionality into cluter_namers 
 32 |     def name_topics(self, X, prompt: str = '', num_generations=1, num_sample_texts=10):
 33 |         """
 34 |         Name clusters using the default prompt. For each cluster, calls the Cohere generate end-point to assign a name to the cluster.
 35 |         Example: If we have ten samples clustered into two clusters (0,1), this makes two generation API calls. That results in two cluster names. We return n_samples,
 36 | 
 37 |         Parameters
 38 |         ----------
 39 |             X: array-like of shape (n_samples, 2)
 40 |               A tuple of two arrays. One is the texts (str), the second is their cluster assignments (int).
 41 | 
 42 |         Returns
 43 |         -------
 44 |             assigned_cluster_names: array-like of length n_samples
 45 |                The cluster name assigned to each text
 46 | 
 47 |         """
 48 | 
 49 |         # Get the texts and their cluster assignments
 50 |         texts, cluster_assignments = X
 51 | 
 52 |         if isinstance(cluster_assignments, list):
 53 |             cluster_assignments = np.array(cluster_assignments)
 54 | 
 55 |         if isinstance(texts, list):
 56 |             texts = np.array(texts)
 57 | 
 58 |         if prompt == '':
 59 |             prompt = generic_cluster_naming_prompt
 60 | 
 61 |         # Instantiate ClusterNamer
 62 |         cluster_namer = ClusterNamer(self.co, prompt, num_generations=num_generations)
 63 | 
 64 |         # Get the unique cluster assignments
 65 |         unique_cluster_assignments = np.unique(cluster_assignments)
 66 | 
 67 |         # Create a dictionary to store the cluster names for each cluster
 68 |         cluster_names = {}
 69 | 
 70 |         extracted = []
 71 |         cluster_names = {}
 72 | 
 73 |         def name_cluster(cluster_number):
 74 |             # Get the texts in this cluster, sample from them
 75 |             cluster_texts = texts[cluster_assignments == cluster_number]
 76 | 
 77 |             if len(cluster_texts) > num_sample_texts:
 78 |                 sample_texts_from_cluster = np.random.choice(cluster_texts, num_sample_texts, replace=False)
 79 |             else:
 80 |                 sample_texts_from_cluster = cluster_texts
 81 | 
 82 |             cluster_name = cluster_namer.predict(sample_texts_from_cluster)
 83 | 
 84 |             logging.info(f'naming cluster {cluster_number}: {cluster_name}')
 85 | 
 86 |             return cluster_number, cluster_name
 87 | 
 88 |         # Name all clusters in parallel
 89 |         with ThreadPoolExecutor(max_workers=8) as executor:
 90 |             for (cluster_number, cluster_name) in executor.map(name_cluster, unique_cluster_assignments):
 91 |                 cluster_names[cluster_number] = cluster_name
 92 | 
 93 | 
 94 |         # Create a list to store the cluster assignments per sample
 95 |         assigned_cluster_names = [cluster_names[cluster_number] for cluster_number in cluster_assignments]
 96 | 
 97 |         return assigned_cluster_names, cluster_names
 98 | 
 99 |     def name_cluster(self, cluster_texts, temperature=0.6, num_generations=1):
100 |         """
101 |         Name a cluster using the default prompt. Calls the Cohere generate end-point to assign a name to the cluster.
102 | 
103 |         Parameters
104 |         ----------
105 |             cluster_texts: array-like of shape (n_samples,)
106 |               The texts in the cluster to be named.
107 | 
108 |         Returns
109 |         -------
110 |             cluster_name: str
111 |                The cluster name assigned to the cluster
112 | 
113 |         """
114 | 
115 |         # Create the prompt, starting with the global task description
116 |         prompt = 'The following texts are from the same cluster. Please name the cluster.'
117 | 
118 |         # Add the data of the current cluster we want to label
119 |         prompt += self.construct_example_for_prompt(cluster_texts)
120 | 
121 |         # Generate the cluster name
122 |         cluster_name = self.generate(prompt, temperature=temperature, num_generations=num_generations)[0]
123 | 
124 |         return cluster_name
125 | 
126 | 
127 | class MockCohereAPI:
128 |     """Mock Cohere API for testing."""
129 | 
130 |     def __init__(self):
131 |         pass
132 | 
133 |     def generate(self, **kwargs):
134 |         mock_gens_list = [
135 |             cohere.generation.Generation(text='two', likelihood=-10, token_likelihoods=None),
136 |             cohere.generation.Generation(text='one', likelihood=-5, token_likelihoods=None)
137 |         ]
138 |         mock_gens = cohere.generation.Generations(generations=mock_gens_list, return_likelihoods=None)
139 | 
140 |         return mock_gens
141 | 


--------------------------------------------------------------------------------
/topically/cluster_namers.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022 Cohere Inc. and its affiliates.
  2 | #
  3 | # Licensed under the MIT License (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | #
  6 | # You may obtain a copy of the License in the LICENSE file at the top
  7 | # level of this repository.
  8 | 
  9 | import cohere.generation
 10 | import numpy as np
 11 | from sklearn.base import BaseEstimator
 12 | 
 13 | 
 14 | class ClusterNamer(BaseEstimator):
 15 |     """ Assign names to clusters of text based on their content using managed Language Models from Cohere."""
 16 |     def __init__(self, co, prompt: str = '', num_generations: int = 1, temperature=0.6):
 17 |         """
 18 |         Name a cluster using the default prompt. Calls the Cohere generate end-point to assign a name to the cluster.
 19 | 
 20 |         Parameters
 21 |         ----------
 22 |             co: Python cohere SDK object
 23 |             prompt: str
 24 |                 The text prompt the generative model uses to name a cluster
 25 |             num_generations: int
 26 |                 The number of candidate generations to get for each cluster. Multiple generations can enhance the quality of cluster names.
 27 |             temperature: float
 28 |                 Typically between 0-1, the temperature value used to control the randomness of the generation model. Lower values lead to more predictable, less "creative" names.
 29 | 
 30 |         """
 31 |         self.co = co
 32 |         self.prompt = prompt
 33 |         self.num_generations = num_generations
 34 |         self.temperature = temperature
 35 | 
 36 |     def make_prompt(self, cluster_example_texts):
 37 |         """
 38 |         Prepare the naming prompt by adding examples from a single cluster to the prompt.
 39 | 
 40 |         Parameters
 41 |         ----------
 42 |             cluster_example_texts: array-like of strings
 43 |                 A collection of texts belonging to a single cluster/topic. They are added to the naming prompt
 44 | 
 45 |         Returns
 46 |         -------
 47 |             prompt: str
 48 |                The naming prompt including the examples from this cluster
 49 | 
 50 |         """
 51 |         # Add the data of the current cluster we want to label
 52 |         return self.prompt + construct_example_for_prompt(cluster_example_texts)
 53 | 
 54 |     def generate(self, cluster_example_texts):
 55 |         """
 56 |         Generate suggest topic name(s)
 57 | 
 58 |         Parameters
 59 |         ----------
 60 |             cluster_example_texts: array-like of strings
 61 |                 A collection of texts belonging to a single cluster/topic. They are added to the naming prompt
 62 | 
 63 |         Returns
 64 |         -------
 65 |             generations: list of Cohere Generation objects.
 66 |                The cluster names suggest by the generative model
 67 | 
 68 |         """
 69 |         # Add the data of the current cluster we want to label
 70 |         prompt = self.make_prompt(cluster_example_texts)
 71 | 
 72 |         # Generate using the language model
 73 |         request = self.co.generate(model='xlarge',
 74 |                                    prompt=prompt,
 75 |                                    max_tokens=50,
 76 |                                    num_generations=self.num_generations,
 77 |                                    return_likelihoods='GENERATION',
 78 |                                    stop_sequences=["\n"])
 79 | 
 80 |         return request.generations
 81 | 
 82 |     def predict(self, texts):
 83 |         """
 84 |         Generate a name for a single topic
 85 | 
 86 |         Parameters
 87 |         ----------
 88 |             texts: array-like of strings
 89 |                 A collection of texts belonging to a single cluster/topic. They are used to help the model suggest a name for the clsuter.
 90 | 
 91 |         Returns
 92 |         -------
 93 |             topic_name: str
 94 |                The suggest name for the topic/cluster.
 95 | 
 96 |         """
 97 |         gens = self.generate(texts)
 98 | 
 99 |         if self.num_generations > 1:
100 |             gens = rerank_by_likelihood(gens)
101 |             return gens[0]
102 | 
103 |         return gens[0].text.strip()
104 | 
105 | 
106 | def rerank_by_likelihood(generations: cohere.generation.Generations):
107 |     """
108 |     Get a list of generations from Cohere's Generate endpoint, order them by highest likelihood score.
109 | 
110 |     Parameters
111 |     ----------
112 |         generations: cohere.generation.Generations object
113 |           Contains a list of generations (length: n_samples) from Cohere's Generate endpoint containing generated texts and token likelihoods.
114 | 
115 |     Returns
116 |     -------
117 |         ordered_generations: array-like of length n_samples
118 |            The same list of generations, except ordered by highest likelihoods
119 | 
120 |     """
121 | 
122 |     # Sort by most likely, most likely generations first
123 |     likelihoods = np.array([gen.likelihood for gen in generations])
124 |     texts = np.array([gen.text.strip() for gen in generations])
125 |     sorted_indexes = likelihoods.argsort()[::-1]
126 |     new_indices = likelihoods[sorted_indexes]
127 |     ordered_generations = texts[sorted_indexes]
128 |     print(ordered_generations)
129 | 
130 |     return ordered_generations
131 | 
132 | 
133 | def construct_example_for_prompt(cluster_example_texts):
134 |     """
135 |     Prepare a single portion of the prompt by stitching the texts as an example
136 | 
137 |     Parameters
138 |     ----------
139 |         cluster_example_texts: array-like of strings
140 |             A collection of texts belonging to a single cluster/topic. They are used to help the model suggest a name for the clsuter.
141 | 
142 |     Returns
143 |     -------
144 |         example_prompt_text: str
145 |            A portion of a naming prompt
146 | 
147 |     """
148 |     example_prompt_text = f'\nCluster:\nSample texts from this cluster:\n'
149 |     for text in cluster_example_texts:
150 |         example_prompt_text += f'- {text}\n'
151 | 
152 |     example_prompt_text += f'Cluster name:'
153 | 
154 |     return example_prompt_text
155 | 


--------------------------------------------------------------------------------
/topically/prompts/prompts.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2022 Cohere Inc. and its affiliates.
  2 | #
  3 | # Licensed under the MIT License (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | #
  6 | # You may obtain a copy of the License in the LICENSE file at the top
  7 | # level of this repository.
  8 | 
  9 | generic_cluster_naming_prompt = """
 10 | This is a list of clusters of messages . Each cluster contains a collection of messages about the same topic. Then a name of collection is mentioned. The name of each cluster is a short, highly-descriptive title
 11 | ---
 12 | Cluster:
 13 | Sample texts from this cluster:
 14 | - Yosemite - Merced River
 15 | - Just before jump into the Merced River wallpaper
 16 | - silvermirage: First Snow on the Merced River [Photographer: Chris Cabot]
 17 | - Three Brothers - Merced River - Yosemite - 2010
 18 | - Merced River in Winter
 19 | 
 20 | Cluster name: Merced River
 21 | ---
 22 | Cluster:
 23 | Sample texts from this cluster:
 24 | - Sara Coolidge - Half Dome Sunset
 25 | - Orange Shades over Half Dome
 26 | - Descent from Mt Hoffman, vew of Half Dome at sunset
 27 | - Half_Dome_at_Sunset_Sentinel_Bridge_3691
 28 | - Gary Hart Photography: Last Light, Half Dome, Yosemite
 29 | 
 30 | Cluster name: Half Dome at sunset
 31 | ---
 32 | Cluster:
 33 | Sample texts from this cluster:
 34 | - The Domes of the Yosemite: 1870 by Albert Bierstadt (Amon Carter Museum of American Art, Fort Worth, TX) - Hudson River School
 35 | - Valley of the Yosemite by Albert Bierstadt
 36 | - ""Domes of Yosemite" by Albert Bierstadt - Uploaded by Vermont Humanities"
 37 | - Albert Bierstadt - Domes of the Yosemite
 38 | - Valley of the Yosemite (1864), Albert Bierstadt
 39 | 
 40 | Cluster name: The Domes of the Yosemite
 41 | ---
 42 | Cluster:
 43 | Sample texts from this cluster:
 44 | - Hi I need your assistance - client ordered the wrong bathroom hardware and I'm trying to return it - it says beyond the return time but we'd appreciate it (unopened)
 45 | - ordered a JBL CHARGE3 on Black Friday. Unplugged it the first time it does this help! maybe you can help too? I don't want a return just a new cord.
 46 | - I just wanted to know what do you mean by "working condition" for old phone exchange??
 47 | - Hi I have a defective product and the seller is being astonishingly unhelpful, please could you advise?
 48 | - How do I report a missing item? Like not in the box with everything else. Your site is useless for this. Thanks!
 49 | 
 50 | Cluster name: Returns and exchanges of defective, damaged, or missing products
 51 | ---
 52 | Cluster:
 53 | Sample texts from this cluster:
 54 | - I paid extra for Next day delivery (to be delivered today: saturday) however on the delivery tracking timeline it says expected delivery Monday. Can you look into why this is the case please? And when it actually will arrive so I can make sure I'm home.
 55 | - isn't prime two day shipping supposed to be guaranteed? what happens when its late?
 56 | - what do you mean by 2-day shipping? customer care executive is saying that 2-day shipping means we can ship anytime and 2 days are counted after shipping. why is he fooling ?
 57 | - the products in my cart have free shipping. Does this mean there is free shipping to all countries? (That are available for shipping ofc)
 58 | 
 59 | Cluster name: Shipping and delivery issues
 60 | ---
 61 | """
 62 | 
 63 | customer_service_tweets_prompt = """
 64 | This is a list of clusters of messages sent by customers to the customer service department of ecommerce company. Each cluster contains a collection of messages about the same topic. In addition to a sample of the messages, a list of keywords describing the collection is mentioned in addition to the name of the collection. The name of each cluster is a short, highly-descriptive title
 65 | ---
 66 | Cluster #0
 67 | Sample messages from this cluster:
 68 | - I paid extra for Next day delivery (to be delivered today: saturday) however on the delivery tracking timeline it says expected delivery Monday. Can you look into why this is the case please? And when it actually will arrive so I can make sure I'm home.
 69 | - isn't prime two day shipping supposed to be guaranteed? what happens when its late?
 70 | - what do you mean by 2-day shipping? customer care executive is saying that 2-day shipping means we can ship anytime and 2 days are counted after shipping. why is he fooling ?
 71 | - the products in my cart have free shipping. Does this mean there is free shipping to all countries? (That are available for shipping ofc)
 72 | 
 73 | Keywords for messages in this cluster: ups, late, monday, tomorrow, paid day, extra, arrive, delivered, day delivery, day shipping
 74 | Cluster name: Shipping and delivery issues
 75 | 
 76 | ---
 77 | Cluster #1
 78 | Sample messages from this cluster:
 79 | - I filled the form and submitted. I hope you solve my issue as I spent 40k and got nothing in return
 80 | - Who can I contact with a complaint. I've used Chat and been offered an unsatisfactory resolution.
 81 | - I have called 4 times and each time I am told the investigation is ongoing - seems to be no sense of urgency
 82 | - hey folks.. My Icoffee that I bought died on me and I want to get it fixed but website is down and nobody answers the phone
 83 | - I contacted you last week and filled in a form as requested but i still haven't had anybody contact me to resolve the issue. please can somebody contact me ASAP
 84 | 
 85 | Keywords for messages in the cluster: mail address, blocked, reply, email address, speak, response, mail, fax, fraud, complaint
 86 | Cluster name: New and unresolved complaints
 87 | ---
 88 | Cluster #2
 89 | Sample messages from this cluster:
 90 | - Hi I need your assistance - client ordered the wrong bathroom hardware and I'm trying to return it - it says beyond the return time but we'd appreciate it (unopened)
 91 | - ordered a JBL CHARGE3 on Black Friday. Unplugged it the first time it does this help! maybe you can help too? I don't want a return just a new cord.
 92 | - I just wanted to know what do you mean by "working condition" for old phone exchange??
 93 | - Hi I have a defective product and the seller is being astonishingly unhelpful, please could you advise?
 94 | - How do I report a missing item? Like not in the box with everything else. Your site is useless for this. Thanks!
 95 | Keywords for messages in the cluster: warranty, returned, exchange, return item, label, defective, refund, replacement, damaged, return
 96 | Cluster name: Returns and exchanges of defective, damaged, or missing products
 97 | ---"""
 98 | 
 99 | news_article_headlines_prompt = """
100 | This is a list of clusters of news article headlines. Each cluster contains a collection of headlines about the same topic. In addition to a sample of the headlines, the name of the cluster is mentioned. The name of each cluster is a short, highly-descriptive title.
101 | ---
102 | Cluster #0
103 | Sample headlines from this cluster:
104 | - 
105 | 
106 | Cluster name:
107 | 
108 | ---
109 | Cluster #1
110 | Sample headlines from this cluster:
111 | - 
112 | 
113 | Cluster name:
114 | 
115 | ---
116 | Cluster #2
117 | Sample headlines from this cluster:
118 | - 
119 | 
120 | Cluster name:
121 | 
122 | ---
123 | """
124 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ```
  2 | ################################################################################
  3 | #    ____      _                     ____                  _ _                 #
  4 | #   / ___|___ | |__   ___ _ __ ___  / ___|  __ _ _ __   __| | |__   _____  __  #
  5 | #  | |   / _ \| '_ \ / _ \ '__/ _ \ \___ \ / _` | '_ \ / _` | '_ \ / _ \ \/ /  #
  6 | #  | |__| (_) | | | |  __/ | |  __/  ___) | (_| | | | | (_| | |_) | (_) >  <   #
  7 | #   \____\___/|_| |_|\___|_|  \___| |____/ \__,_|_| |_|\__,_|_.__/ \___/_/\_\  #
  8 | #                                                                              #
  9 | # This project is part of Cohere Sandbox, Cohere's Experimental Open Source    #
 10 | # offering. This project provides a library, tooling, or demo making use of    #
 11 | # the Cohere Platform. You should expect (self-)documented, high quality code  #
 12 | # but be warned that this is EXPERIMENTAL. Therefore, also expect rough edges, #
 13 | # non-backwards compatible changes, or potential changes in functionality as   #
 14 | # the library, tool, or demo evolves. Please consider referencing a specific   #
 15 | # git commit or version if depending upon the project in any mission-critical  #
 16 | # code as part of your own projects.                                           #
 17 | #                                                                              #
 18 | # Please don't hesitate to raise issues or submit pull requests, and thanks    #
 19 | # for checking out this project!                                               #
 20 | #                                                                              #
 21 | ################################################################################
 22 | ```
 23 | 
 24 | **Maintainer:** [jalammar](https://github.com/jalammar) \
 25 | **Project maintained until at least:** 2023-04-30
 26 | 
 27 | # A picture is worth a thousand sentences
 28 | 
 29 | <img src="./assets/topic-modeling-picture-thousand-texts.png" />
 30 | When you want to explore thousands or millions of texts (messages, emails, news headlines), topic modeling tools help you make sense of them rapidly and visually.
 31 | 
 32 | # Topically
 33 | 
 34 | Topically is a \[work-in-progress\] suite of tools that help make sense of text collections (messages, articles, emails, news headlines) using large language models.
 35 | 
 36 | Topically's first feature is to name clusters of short texts based on their content. For example, here are news headlines from the machinelearning and investing subreddits, and the names suggested for them by topically:
 37 | 
 38 | <img src="./assets/topically-name_cluster.png" />
 39 | 
 40 | 
 41 | # Usage Example
 42 | Use Topically to name clusters in the course of topic modeling
 43 | 
 44 | ```python
 45 | import topically
 46 | 
 47 | app = topically.Topically('cohere_api_key')
 48 | 
 49 | example_texts = [
 50 | # Three headlines from the machine learning subreddit
 51 | "[Project] From books to presentations in 10s with AR + ML",
 52 | "[D] A Demo from 1993 of 32-year-old Yann LeCun showing off the World's first Convolutional Network for Text Recognition",
 53 | "[R] First Order Motion Model applied to animate paintings",
 54 | 
 55 | # Three headlines from the investing subreddit
 56 | "Robinhood and other brokers literally blocking purchase of $GME, $NOK, $BB, $AMC; allow sells",
 57 | "United Airlines stock down over 5% premarket trading",
 58 | "Bitcoin was nearly $20,000 a year ago today"]
 59 | 
 60 | # We know the first three texts belong to one topic (topic 0), the last three belong to another topic (topic 1)
 61 | example_topics = [0, 0, 0, 1, 1, 1]
 62 | 
 63 | topics_of_examples, topic_names_dict = app.name_topics((example_texts, example_topics)) #Optional:  num_generations=5
 64 | topics_of_examples # Run again to get new suggested names. More text examples should result in better names.
 65 | 
 66 | ```
 67 | 
 68 | Output:
 69 | ```
 70 | ['Text recognition',
 71 |  'Text recognition',
 72 |  'Text recognition',
 73 |  'Stock Market Closing Bell',
 74 |  'Stock Market Closing Bell',
 75 |  'Stock Market Closing Bell']
 76 |  ```
 77 |  
 78 | In this simple example, we know the cluster assignments. In actual applications, a topic modeling library like BERTopic can cluster the texts for us, and then we can name them with topically. 
 79 | 
 80 | # Usage Example: Topically + BERTopic
 81 | Use Topically to name clusters in the course of topic modeling with tools like BERTopic. Get the cluster assignments from BERTopic, and name the clusters with topically. This improves on the keyword topic labels (and can build upon them).
 82 | 
 83 | 
 84 | <img src="./assets/topically_name_topics.png" />
 85 | 
 86 | 
 87 | Here's example code and a colab notebook demonstrating this.
 88 | 
 89 | <a href="https://colab.research.google.com/github/cohere-ai/sandbox-topically/blob/main/notebooks/Intro%20-%20Topically%20with%20BERTopic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
 90 | 
 91 | Code excerpt:
 92 | 
 93 | ```python
 94 | 
 95 | from bertopic import BERTopic
 96 | from topically import Topically
 97 | 
 98 | # Load and initialize BERTopic to use KMeans clustering with 8 clusters only.
 99 | cluster_model = KMeans(n_clusters=8)
100 | topic_model = BERTopic(hdbscan_model=cluster_model)
101 | 
102 | # df is a dataframe. df['title'] is the column of text we're modeling
103 | df['topic'], probabilities = topic_model.fit_transform(df['title'], embeds)
104 | 
105 | # Load topically
106 | app = Topically('cohere_api_key')
107 | 
108 | # name clusters
109 | df['topic_name'], topic_names = app.name_topics((df['title'], df['topic']))
110 | 
111 | df[['title', 'topic', 'topic_name']]
112 | ```
113 | 
114 | 
115 | <img src="./assets/topically-name_topics-example.png" />
116 | 
117 | # Installation
118 | 
119 | You can install topically from pypi:
120 | 
121 | `pip install topically`
122 | 
123 | Optionally, you can also install topically with BERTopic:
124 | 
125 | `pip install topically[bertopic]`
126 | 
127 | 
128 | # How it works
129 | 
130 | Topically uses a generative language model (GPT) to assign a name to the text cluster. It sends a request to [Cohere](https://cohere.ai/)'s managed model (get an [API key](https://dashboard.cohere.ai/welcome/register?utm_source=github&utm_medium=content&utm_campaign=sandbox&utm_content=topically) and use it for free for prototyping).
131 | 
132 | To generate the titles, topically uses a couple of bundled prompts. To get the best names for your use case, it's best to edit the prompt to add more information about the context, and add good cluster names for 3-5 of your clusters.
133 | 
134 | This works best on short texts (given the context length limitations of GPT models). If you're working with long texts, you may experiment with excerpts or summaries of the texts.
135 | 
136 | # Architecture Overview
137 | Topically is pretty simple and early in its life. At the moment, it's made up of two main classes:
138 | 
139 | ### `Topically`
140 | This class maintains the client to the [Cohere](https://cohere.ai/) platform, and exposes the main interaction point with Topically (name_topics, at the moment). It lives in app.py.
141 | 
142 | ### `ClusterNamer`
143 | This class deals with preparing the prompts and calling the Generate endpoint to generate suggested topic names. It lives in cluster_namers.py.
144 | 
145 | # Get support
146 | If you have any questions or comments, please file an issue or reach out to us on [Discord](https://discord.gg/co-mmunity).
147 | 
148 | # Contributors
149 | If you would like to contribute to this project, please read `CONTRIBUTORS.md`
150 | in this repository, and sign the Contributor License Agreement before submitting
151 | any pull requests. A link to sign the Cohere CLA will be generated the first time 
152 | you make a pull request to a Cohere repository.
153 | 
154 | # License
155 | Topically has an MIT license, as found in the LICENSE file.
156 | 


--------------------------------------------------------------------------------