├── .devcontainer └── devcontainer.json ├── .editorconfig ├── .gitattributes ├── .gitignore ├── .idea ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── other.xml ├── streamlit_topic_modeling.iml └── vcs.xml ├── Dockerfile ├── MANIFEST.in ├── Makefile ├── README.rst ├── data ├── Inkfree.ttf ├── Tweets.csv.zip ├── elonmusk.csv.zip ├── favicon.png ├── is-this-a-topic-modeling.jpg └── mf.png ├── docs ├── Makefile ├── _static │ └── .gitignore ├── _templates │ └── .gitignore ├── conf.py ├── index.rst ├── make.bat └── readme.rst ├── newsfragments └── .gitignore ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── setup.cfg ├── setup.py ├── streamlit_topic_modeling ├── __init__.py ├── app.py └── tests │ ├── __init__.py │ └── test_app.py ├── towncrier.toml └── tox.ini /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.rst", 9 | "streamlit_topic_modeling/app.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | LABEL maintainer="Bryan Patrick Wood <bpw1621@gmail.com>" 3 | 4 | WORKDIR /usr/src/app 5 | COPY . . 6 | RUN pip install -U pip && pip install --no-cache-dir -e . 7 | EXPOSE 8501 8 | ENTRYPOINT streamlit run ./streamlit_topic_modeling/app.py -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include pyproject.toml 3 | recursive-include data *.png *.zip *.ttf -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | sphinx-apidoc: 2 | sphinx-apidoc -f -o docs . 
setup.py -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | README 2 | ****** 3 | 4 | A topic modeling GUI application using Streamlit deployed on Streamlit Sharing `here `_. 5 | 6 | .. image:: ./data/is-this-a-topic-modeling.jpg 7 | -------------------------------------------------------------------------------- /data/Inkfree.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/Inkfree.ttf -------------------------------------------------------------------------------- /data/Tweets.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/Tweets.csv.zip -------------------------------------------------------------------------------- /data/elonmusk.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/elonmusk.csv.zip -------------------------------------------------------------------------------- /data/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/favicon.png -------------------------------------------------------------------------------- /data/is-this-a-topic-modeling.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/is-this-a-topic-modeling.jpg -------------------------------------------------------------------------------- /data/mf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/mf.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore -------------------------------------------------------------------------------- /docs/_templates/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../../streamlit_topic_modeling')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'Streamlit Topic Modeling' 21 | copyright = '2021, Bryan Patrick Wood' 22 | author = 'Bryan Patrick Wood' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '0.0a0' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.autosummary', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.doctest', 38 | 'sphinx.ext.todo', 39 | 'sphinx.ext.viewcode' 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This pattern also affects html_static_path and html_extra_path. 48 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | # The theme to use for HTML and HTML Help pages. See the documentation for 54 | # a list of builtin themes. 55 | # 56 | html_theme = 'sphinx_rtd_theme' 57 | 58 | # Add any paths that contain custom static files (such as style sheets) here, 59 | # relative to this directory. They are copied after the builtin static files, 60 | # so a file named "default.css" will overwrite the builtin "default.css". 61 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Streamlit Topic Modeling documentation master file, created by 2 | sphinx-quickstart on Sat Jan 9 11:24:07 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 
5 | 6 | Welcome to Streamlit Topic Modeling's documentation! 7 | ========================================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | readme 14 | 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst -------------------------------------------------------------------------------- /newsfragments/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 51.1.1", "wheel"] 3 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 6.0 3 | addopts = -ra -q 4 | testpaths = 5 | streamlit_topic_modeling/tests -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | altair==5.4.1 2 | attrs==24.2.0 3 | blinker==1.8.2 4 | cachetools==5.5.0 5 | certifi==2024.8.30 6 | charset-normalizer==3.3.2 7 | click==8.1.7 8 | colorama==0.4.6 9 | contourpy==1.3.0 10 | cycler==0.12.1 11 | fonttools==4.53.1 12 | funcy==2.0 13 | gensim==4.3.3 14 | gitdb==4.0.11 15 | GitPython==3.1.43 16 | idna==3.8 17 | Jinja2==3.1.4 18 | joblib==1.4.2 19 | jsonschema==4.23.0 20 | jsonschema-specifications==2023.12.1 21 | kiwisolver==1.4.7 22 | llvmlite==0.43.0 23 | markdown-it-py==3.0.0 24 | MarkupSafe==2.1.5 25 | matplotlib==3.9.2 26 | mdurl==0.1.2 27 | narwhals==1.6.2 28 | nltk==3.9.1 29 | numba==0.60.0 30 | numexpr==2.10.1 31 | numpy==1.26.4 32 | packaging==24.1 33 | pandas==2.2.2 34 | patsy==0.5.6 35 | pillow==10.4.0 36 | plotly==5.24.0 37 | plotly-express==0.4.1 38 | protobuf==5.28.0 39 | pyarrow==17.0.0 40 | pydeck==0.9.1 41 | Pygments==2.18.0 42 | pyLDAvis==3.4.1 43 | pynndescent==0.5.13 44 | pyparsing==3.1.4 45 | 
python-dateutil==2.9.0.post0 46 | pytz==2024.1 47 | referencing==0.35.1 48 | regex==2024.7.24 49 | requests==2.32.3 50 | rich==13.8.0 51 | rpds-py==0.20.0 52 | scikit-learn==1.5.1 53 | scipy==1.13.1 54 | seaborn==0.13.2 55 | six==1.16.0 56 | smart-open==7.0.4 57 | smmap==5.0.1 58 | statsmodels==0.14.2 59 | streamlit==1.38.0 60 | tenacity==8.5.0 61 | threadpoolctl==3.5.0 62 | toml==0.10.2 63 | tornado==6.4.1 64 | tqdm==4.66.5 65 | typing_extensions==4.12.2 66 | tzdata==2024.1 67 | umap-learn==0.5.6 68 | urllib3==2.2.2 69 | watchdog==4.0.2 70 | wordcloud==1.9.3 71 | wrapt==1.16.0 72 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = streamlit_topic_modeling 3 | version = attr:streamlit_topic_modeling.__version__ 4 | description = A topic modeling GUI application using Streamlit. 5 | description-file = README.rst 6 | long_description = file:README.rst 7 | long_description_content_type = text/x-rst 8 | author = Bryan Patrick Wood 9 | author_email = bpw1621@gmail.com 10 | url = https://github.com/bpw1621/streamlit_topic_modeling 11 | download_url = https://github.com/bpw1621/streamlit_topic_modeling/archive/master.zip 12 | project_urls = 13 | Homepage = https://bpw1621.github.io/streamlit_topic_modeling 14 | Source Code = https://github.com/bpw1621/streamlit_topic_modeling 15 | Documentation = https://streamlit_topic_modeling.readthedocs.io/en/latest/ 16 | Bug Tracker = https://github.com/bpw1621/streamlit_topic_modeling/issues 17 | classifiers = 18 | Development Status :: 3 - Alpha 19 | Programming Language :: Python 20 | Programming Language :: Python :: 3 21 | Programming Language :: Python :: 3.11 22 | Programming Language :: Python :: 3 :: Only 23 | 24 | ;keywords = ... 25 | ;license = ... 
26 | 27 | [options] 28 | zip_safe = False 29 | include_package_data = True 30 | packages = find: 31 | install_requires = 32 | gensim 33 | matplotlib 34 | nltk 35 | numpy 36 | pandas 37 | plotly-express 38 | plotly 39 | pyldavis 40 | regex 41 | scikit-learn 42 | seaborn 43 | streamlit 44 | umap-learn 45 | wordcloud 46 | tests_require = 47 | pytest 48 | pytest-mock 49 | pytest-cov 50 | setup_requires = 51 | setuptools 52 | pytest-runner 53 | 54 | [bdist_wheel] 55 | universal = true 56 | 57 | [options.extras_require] 58 | dev = 59 | flake8 60 | tox 61 | pretty_errors 62 | twine 63 | doc = 64 | sphinx 65 | sphinx_rtd_theme 66 | towncrier 67 | 68 | [aliases] 69 | test = pytest 70 | 71 | [tool:pytest] 72 | collect_ignore = ['setup.py'] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup 3 | 4 | setup(setup_cfg=True) 5 | -------------------------------------------------------------------------------- /streamlit_topic_modeling/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level streamlit_topic_modeling package.""" 2 | 3 | import logging 4 | from logging import NullHandler 5 | 6 | __author__ = 'Bryan Patrick Wood' 7 | __email__ = 'bpw1621@gmail.com' 8 | __version__ = '0.0a0' 9 | 10 | logging.getLogger(__name__).addHandler(NullHandler()) 11 | -------------------------------------------------------------------------------- /streamlit_topic_modeling/app.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import gensim 4 | import matplotlib.colors as mcolors 5 | import matplotlib.pyplot as plt 6 | import nltk 7 | import numpy as np 8 | import pandas as pd 9 | import plotly.express as px 10 | import pyLDAvis.gensim_models 11 | import regex 12 | import seaborn as sns 13 | import streamlit as st 14 | import streamlit.components.v1 as components 15 | from gensim import corpora 16 | from gensim.models import CoherenceModel 17 | from gensim.utils import simple_preprocess 18 | from nltk.corpus import stopwords 19 | from sklearn.decomposition import PCA 20 | from sklearn.manifold import TSNE 21 | from umap import UMAP 22 | from wordcloud import WordCloud 23 | 24 | DEFAULT_HIGHLIGHT_PROBABILITY_MINIMUM = 0.001 25 | DEFAULT_NUM_TOPICS = 6 26 | 27 | nltk.download("stopwords") 28 | 29 | DATASETS = { 30 | 'Five Years of Elon Musk Tweets': { 31 | 'path': './data/elonmusk.csv.zip', 32 | 'column': 'tweet', 33 | 'url': 'https://www.kaggle.com/vidyapb/elon-musk-tweets-2015-to-2020', 34 | 'description': ( 35 | 'I scraped Elon Musk\'s tweets from the last 5 years using twint library. My inspiration behind this is to ' 36 | 'see how public personalities are influencing common people on Social Media Platforms. I would love to see ' 37 | 'some notebooks around this dataset, giving us insights like what are the topics which Tesla mostly tweets ' 38 | 'about? How are Tesla\'s stocks being influenced by his tweets?' 39 | ) 40 | }, 41 | 'Airline Tweets': { 42 | 'path': './data/Tweets.csv.zip', 43 | 'column': 'text', 44 | 'url': 'https://www.kaggle.com/crowdflower/twitter-airline-sentiment', 45 | 'description': ( 46 | 'A sentiment analysis job about the problems of each major U.S. airline. 
Twitter data was scraped from ' 47 | 'February of 2015 and contributors were asked to first classify positive, negative, and neutral tweets, ' 48 | 'followed by categorizing negative reasons (such as "late flight" or "rude service").' 49 | ) 50 | } 51 | } 52 | 53 | 54 | def lda_options(): 55 | return { 56 | 'num_topics': st.number_input('Number of Topics', min_value=1, value=9, 57 | help='The number of requested latent topics to be extracted from the training corpus.'), 58 | 'chunksize': st.number_input('Chunk Size', min_value=1, value=2000, 59 | help='Number of documents to be used in each training chunk.'), 60 | 'passes': st.number_input('Passes', min_value=1, value=1, 61 | help='Number of passes through the corpus during training.'), 62 | 'update_every': st.number_input('Update Every', min_value=1, value=1, 63 | help='Number of documents to be iterated through for each update. Set to 0 for batch learning, > 1 for online iterative learning.'), 64 | 'alpha': st.selectbox('𝛼', ('symmetric', 'asymmetric', 'auto'), 65 | help='A priori belief on document-topic distribution.'), 66 | 'eta': st.selectbox('𝜂', (None, 'symmetric', 'auto'), help='A-priori belief on topic-word distribution'), 67 | 'decay': st.number_input('𝜅', min_value=0.5, max_value=1.0, value=0.5, 68 | help='A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten when each new document is examined.'), 69 | 'offset': st.number_input('𝜏_0', value=1.0, 70 | help='Hyper-parameter that controls how much we will slow down the first steps the first few iterations.'), 71 | 'eval_every': st.number_input('Evaluate Every', min_value=1, value=10, 72 | help='Log perplexity is estimated every that many updates.'), 73 | 'iterations': st.number_input('Iterations', min_value=1, value=50, 74 | help='Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.'), 75 | 'gamma_threshold': st.number_input('𝛾', min_value=0.0, value=0.001, 76 | help='Minimum change in the value of the gamma parameters to continue iterating.'), 77 | 'minimum_probability': st.number_input('Minimum Probability', min_value=0.0, max_value=1.0, value=0.01, 78 | help='Topics with a probability lower than this threshold will be filtered out.'), 79 | 'minimum_phi_value': st.number_input('𝜑', min_value=0.0, value=0.01, 80 | help='if per_word_topics is True, this represents a lower bound on the term probabilities.'), 81 | 'per_word_topics': st.checkbox('Per Word Topics', 82 | help='If True, the model also computes a list of topics, sorted in descending order of most likely topics for each word, along with their phi values multiplied by the feature length (i.e. word count).') 83 | } 84 | 85 | 86 | def nmf_options(): 87 | return { 88 | 'num_topics': st.number_input('Number of Topics', min_value=1, value=9, help='Number of topics to extract.'), 89 | 'chunksize': st.number_input('Chunk Size', min_value=1, value=2000, 90 | help='Number of documents to be used in each training chunk.'), 91 | 'passes': st.number_input('Passes', min_value=1, value=1, 92 | help='Number of full passes over the training corpus.'), 93 | 'kappa': st.number_input('𝜅', min_value=0.0, value=1.0, help='Gradient descent step size.'), 94 | 'minimum_probability': st.number_input('Minimum Probability', min_value=0.0, max_value=1.0, value=0.01, 95 | help='If normalize is True, topics with smaller probabilities are filtered out. If normalize is False, topics with smaller factors are filtered out. 
If set to None, a value of 1e-8 is used to prevent 0s.'), 96 | 'w_max_iter': st.number_input('W max iter', min_value=1, value=200, 97 | help='Maximum number of iterations to train W per each batch.'), 98 | 'w_stop_condition': st.number_input('W stop cond', min_value=0.0, value=0.0001, 99 | help=' If error difference gets less than that, training of W stops for the current batch.'), 100 | 'h_max_iter': st.number_input('H max iter', min_value=1, value=50, 101 | help='Maximum number of iterations to train h per each batch.'), 102 | 'h_stop_condition': st.number_input('W stop cond', min_value=0.0, value=0.001, 103 | help='If error difference gets less than that, training of h stops for the current batch.'), 104 | 'eval_every': st.number_input('Evaluate Every', min_value=1, value=10, 105 | help='Number of batches after which l2 norm of (v - Wh) is computed.'), 106 | 'normalize': st.selectbox('Normalize', (True, False, None), help='Whether to normalize the result.') 107 | } 108 | 109 | 110 | MODELS = { 111 | 'Latent Dirichlet Allocation': { 112 | 'options': lda_options, 113 | 'class': gensim.models.LdaModel, 114 | 'help': 'https://radimrehurek.com/gensim/models/ldamodel.html' 115 | }, 116 | 'Non-Negative Matrix Factorization': { 117 | 'options': nmf_options, 118 | 'class': gensim.models.Nmf, 119 | 'help': 'https://radimrehurek.com/gensim/models/nmf.html' 120 | } 121 | } 122 | 123 | COLORS = [color for color in mcolors.XKCD_COLORS.values()] 124 | 125 | WORDCLOUD_FONT_PATH = r'./data/Inkfree.ttf' 126 | 127 | EMAIL_REGEX_STR = r'\S*@\S*' 128 | MENTION_REGEX_STR = r'@\S*' 129 | HASHTAG_REGEX_STR = r'#\S+' 130 | URL_REGEX_STR = r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*' 131 | 132 | 133 | @st.cache_data() 134 | def generate_texts_df(selected_dataset: str): 135 | dataset = DATASETS[selected_dataset] 136 | return pd.read_csv(f'{dataset["path"]}') 137 | 138 | 139 | @st.cache_data() 140 | def denoise_docs(texts_df: pd.DataFrame, text_column: str): 141 | texts = texts_df[text_column].values.tolist() 142 | remove_regex = regex.compile(f'({EMAIL_REGEX_STR}|{MENTION_REGEX_STR}|{HASHTAG_REGEX_STR}|{URL_REGEX_STR})') 143 | texts = [regex.sub(remove_regex, '', text) for text in texts] 144 | docs = [[w for w in simple_preprocess(doc, deacc=True) if w not in stopwords.words('english')] for doc in texts] 145 | return docs 146 | 147 | 148 | @st.cache_data() 149 | def create_bigrams(docs): 150 | bigram_phrases = gensim.models.Phrases(docs) 151 | bigram_phraser = gensim.models.phrases.Phraser(bigram_phrases) 152 | docs = [bigram_phraser[doc] for doc in docs] 153 | return docs 154 | 155 | 156 | @st.cache_data() 157 | def create_trigrams(docs): 158 | bigram_phrases = gensim.models.Phrases(docs) 159 | bigram_phraser = gensim.models.phrases.Phraser(bigram_phrases) 160 | trigram_phrases = gensim.models.Phrases(bigram_phrases[docs]) 161 | trigram_phraser = gensim.models.phrases.Phraser(trigram_phrases) 162 | docs = [trigram_phraser[bigram_phraser[doc]] for doc in docs] 163 | return docs 164 | 165 | 166 | @st.cache_data() 167 | def generate_docs(texts_df: pd.DataFrame, text_column: str, ngrams: str = None): 168 | docs = denoise_docs(texts_df, text_column) 169 | if ngrams == 'bigrams': 170 | docs = create_bigrams(docs) 171 | if ngrams == 'trigrams': 172 | docs = create_trigrams(docs) 173 | return docs 174 | 175 | 176 | @st.cache_data() 177 | def generate_wordcloud(docs, collocations: bool = False): 178 | wordcloud_text = (' '.join(' '.join(doc) for doc in docs)) 179 | 
wordcloud = WordCloud(font_path=WORDCLOUD_FONT_PATH, width=700, height=600, 180 | background_color='white', collocations=collocations).generate(wordcloud_text) 181 | return wordcloud 182 | 183 | 184 | @st.cache_data() 185 | def prepare_training_data(docs): 186 | id2word = corpora.Dictionary(docs) 187 | corpus = [id2word.doc2bow(doc) for doc in docs] 188 | return id2word, corpus 189 | 190 | 191 | @st.cache_data() 192 | def train_model(docs, base_model, **kwargs): 193 | id2word, corpus = prepare_training_data(docs) 194 | model = base_model(corpus=corpus, id2word=id2word, **kwargs) 195 | return id2word, corpus, model 196 | 197 | 198 | def clear_session_state(): 199 | for key in ('model_kwargs', 'id2word', 'corpus', 'model', 'previous_perplexity', 'previous_coherence_model_value'): 200 | if key in st.session_state: 201 | del st.session_state[key] 202 | 203 | 204 | def calculate_perplexity(model, corpus): 205 | return np.exp2(-model.log_perplexity(corpus)) 206 | 207 | 208 | def calculate_coherence(model, corpus, coherence): 209 | coherence_model = CoherenceModel(model=model, corpus=corpus, coherence=coherence) 210 | return coherence_model.get_coherence() 211 | 212 | 213 | @st.cache_data() 214 | def white_or_black_text(background_color): 215 | # https://stackoverflow.com/questions/3942878/how-to-decide-font-color-in-white-or-black-depending-on-background-color 216 | red = int(background_color[1:3], 16) 217 | green = int(background_color[3:5], 16) 218 | blue = int(background_color[5:], 16) 219 | return 'black' if (red * 0.299 + green * 0.587 + blue * 0.114) > 186 else 'white' 220 | 221 | 222 | def perplexity_section(): 223 | with st.spinner('Calculating Perplexity ...'): 224 | perplexity = calculate_perplexity(st.session_state.model, st.session_state.corpus) 225 | key = 'previous_perplexity' 226 | delta = f'{perplexity - st.session_state[key]:.4}' if key in st.session_state else None 227 | st.metric(label='Perplexity', value=f'{perplexity:.4f}', delta=delta, delta_color='inverse') 228 | st.session_state[key] = perplexity 229 | st.markdown('Viz., https://en.wikipedia.org/wiki/Perplexity') 230 | st.latex(r'Perplexity = \exp\left(-\frac{\sum_d \log(p(w_d|\Phi, \alpha))}{N}\right)') 231 | 232 | 233 | def coherence_section(): 234 | with st.spinner('Calculating Coherence Score ...'): 235 | coherence = calculate_coherence(st.session_state.model, st.session_state.corpus, 'u_mass') 236 | key = 'previous_coherence_model_value' 237 | delta = f'{coherence - st.session_state[key]:.4f}' if key in st.session_state else None 238 | st.metric(label='Coherence Score', value=f'{coherence:.4f}', delta=delta) 239 | st.session_state[key] = coherence 240 | st.markdown('Viz., http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf') 241 | st.latex( 242 | r'C_{UMass} = \frac{2}{N \cdot (N - 1)}\sum_{i=2}^N\sum_{j=1}^{i-1}\log\frac{P(w_i, w_j) + \epsilon}{P(w_j)}') 243 | 244 | 245 | @st.cache_data() 246 | def train_projection(projection, n_components, df): 247 | if projection == 'PCA': 248 | projection_model = PCA(n_components=n_components) 249 | elif projection == 'T-SNE': 250 | projection_model = TSNE(n_components=n_components) 251 | elif projection == 'UMAP': 252 | projection_model = UMAP(n_components=n_components) 253 | else: 254 | raise ValueError(f'Unknown projection: {projection}') 255 | return projection_model.fit_transform(df) 256 | 257 | 258 | if __name__ == '__main__': 259 | st.set_page_config(page_title='Topic Modeling', page_icon='./data/favicon.png', layout='wide') 260 | 261 | preprocessing_options = 
st.sidebar.form('preprocessing-options') 262 | with preprocessing_options: 263 | st.header('Preprocessing Options') 264 | ngrams = st.selectbox('N-grams', [None, 'bigrams', 'trigrams'], help='TODO ...') # TODO ... 265 | st.form_submit_button('Preprocess') 266 | 267 | visualization_options = st.sidebar.form('visualization-options') 268 | with visualization_options: 269 | st.header('Visualization Options') 270 | collocations = st.checkbox('Enable WordCloud Collocations', 271 | help='Collocations in word clouds enable the display of phrases.') 272 | highlight_probability_minimum = st.select_slider('Highlight Probability Minimum', 273 | options=[10 ** exponent for exponent in range(-10, 1)], 274 | value=DEFAULT_HIGHLIGHT_PROBABILITY_MINIMUM, 275 | help='Minimum topic probability in order to color highlight a word in the _Topic Highlighted Sentences_ visualization.') 276 | st.form_submit_button('Apply') 277 | 278 | st.title('Topic Modeling') 279 | st.header('What is topic modeling?') 280 | with st.expander('Hero Image'): 281 | st.image('./data/is-this-a-topic-modeling.jpg', caption='No ... no it\'s not ...', use_column_width=True) 282 | st.markdown( 283 | 'Topic modeling is a broad term. It encompasses a number of specific statistical learning methods. ' 284 | 'These methods explain documents in terms of a set of topics and those topics in terms of ' 285 | 'a set of words. Two commonly used methods are Latent Dirichlet Allocation (LDA) and Non-Negative ' 286 | 'Matrix Factorization (NMF). Used without additional qualifiers, the approach is usually assumed ' 287 | 'to be unsupervised, although there are semi-supervised and supervised variants.' 288 | ) 289 | 290 | with st.expander('Additional Details'): 291 | st.markdown('The objective can be viewed as a matrix factorization.') 292 | st.image('./data/mf.png', use_column_width=True) 293 | st.markdown('This factorization makes the methods much more efficient than directly characterizing documents ' 294 | 'in terms of words.') 295 | st.markdown('More information on LDA and NMF can be found at ' 296 | 'https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation and ' 297 | 'https://en.wikipedia.org/wiki/Non-negative_matrix_factorization, respectively.') 298 | 299 | st.header('Datasets') 300 | st.markdown('A couple of small example datasets are preloaded to illustrate.') 301 | selected_dataset = st.selectbox('Dataset', [None, *sorted(list(DATASETS.keys()))], on_change=clear_session_state) 302 | if not selected_dataset: 303 | st.write('Choose a Dataset to Continue ...') 304 | st.stop() 305 | 306 | with st.expander('Dataset Description'): 307 | st.markdown(DATASETS[selected_dataset]['description']) 308 | st.markdown(DATASETS[selected_dataset]['url']) 309 | 310 | text_column = DATASETS[selected_dataset]['column'] 311 | texts_df = generate_texts_df(selected_dataset) 312 | docs = generate_docs(texts_df, text_column, ngrams=ngrams) 313 | 314 | with st.expander('Sample Documents'): 315 | sample_texts = texts_df[text_column].sample(5).values.tolist() 316 | for index, text in enumerate(sample_texts): 317 | st.markdown(f'**{index + 1}**: _{text}_') 318 | 319 | with st.expander('Frequency Sized Corpus Wordcloud'): 320 | wc = generate_wordcloud(docs) 321 | st.image(wc.to_image(), caption='Dataset Wordcloud (Not A Topic Model)', use_column_width=True) 322 | st.markdown('These are the remaining words after document preprocessing.') 323 | 324 | with st.expander('Document Word Count Distribution'): 325 | len_docs = [len(doc) for doc in 
docs] 326 | fig, ax = plt.subplots() 327 | sns.histplot(data=pd.DataFrame(len_docs, columns=['Words In Document']), discrete=True, ax=ax) 328 | st.pyplot(fig) 329 | 330 | model_key = st.sidebar.selectbox('Model', [None, *list(MODELS.keys())], on_change=clear_session_state) 331 | model_options = st.sidebar.form('model-options') 332 | if not model_key: 333 | with st.sidebar: 334 | st.write('Choose a Model to Continue ...') 335 | st.stop() 336 | with model_options: 337 | st.header('Model Options') 338 | model_kwargs = MODELS[model_key]['options']() 339 | st.session_state['model_kwargs'] = model_kwargs 340 | train_model_clicked = st.form_submit_button('Train Model') 341 | 342 | if train_model_clicked: 343 | with st.spinner('Training Model ...'): 344 | id2word, corpus, model = train_model(docs, MODELS[model_key]['class'], **st.session_state.model_kwargs) 345 | st.session_state.id2word = id2word 346 | st.session_state.corpus = corpus 347 | st.session_state.model = model 348 | 349 | if 'model' not in st.session_state: 350 | st.stop() 351 | 352 | st.header('Model') 353 | st.write(type(st.session_state.model).__name__) 354 | st.write(st.session_state.model_kwargs) 355 | 356 | st.header('Model Results') 357 | 358 | topics = st.session_state.model.show_topics(formatted=False, num_words=50, 359 | num_topics=st.session_state.model_kwargs['num_topics'], log=False) 360 | with st.expander('Topic Word-Weighted Summaries'): 361 | topic_summaries = {} 362 | for topic in topics: 363 | topic_index = topic[0] 364 | topic_word_weights = topic[1] 365 | topic_summaries[topic_index] = ' + '.join( 366 | f'{weight:.3f} * {word}' for word, weight in topic_word_weights[:10]) 367 | for topic_index, topic_summary in topic_summaries.items(): 368 | st.markdown(f'**Topic {topic_index}**: _{topic_summary}_') 369 | 370 | colors = random.sample(COLORS, k=model_kwargs['num_topics']) 371 | with st.expander('Top N Topic Keywords Wordclouds'): 372 | cols = st.columns(3) 373 | for index, topic in enumerate(topics): 374 | wc = WordCloud(font_path=WORDCLOUD_FONT_PATH, width=700, height=600, 375 | background_color='white', collocations=collocations, prefer_horizontal=1.0, 376 | color_func=lambda *args, **kwargs: colors[index]) 377 | with cols[index % 3]: 378 | wc.generate_from_frequencies(dict(topic[1])) 379 | st.image(wc.to_image(), caption=f'Topic #{index}', use_column_width=True) 380 | 381 | with st.expander('Topic Highlighted Sentences'): 382 | sample = texts_df.sample(10) 383 | for index, row in sample.iterrows(): 384 | html_elements = [] 385 | for token in row[text_column].split(): 386 | if st.session_state.id2word.token2id.get(token) is None: 387 | html_elements.append(f'<span style="text-decoration: line-through;">{token}</span>') # strike through tokens that were dropped from the dictionary during preprocessing 388 | else: 389 | term_topics = st.session_state.model.get_term_topics(token, minimum_probability=0) 390 | topic_probabilities = [term_topic[1] for term_topic in term_topics] 391 | max_topic_probability = max(topic_probabilities) if topic_probabilities else 0 392 | if max_topic_probability < highlight_probability_minimum: 393 | html_elements.append(token) 394 | else: 395 | max_topic_index = topic_probabilities.index(max_topic_probability) 396 | max_topic = term_topics[max_topic_index] 397 | background_color = colors[max_topic[0]] 398 | # color = 'white' 399 | color = white_or_black_text(background_color) 400 | html_elements.append( 401 | f'<span style="background-color: {background_color}; color: {color};">{token}</span>') # highlight the token with its dominant topic's color 402 | st.markdown(f'Document #{index}: {" ".join(html_elements)}', unsafe_allow_html=True) 403 | 404 | has_log_perplexity = hasattr(st.session_state.model, 'log_perplexity') 405 | with 
st.expander('Metrics'): 406 | if has_log_perplexity: 407 | left_column, right_column = st.columns(2) 408 | with left_column: 409 | perplexity_section() 410 | with right_column: 411 | coherence_section() 412 | else: 413 | coherence_section() 414 | 415 | with st.expander('Low Dimensional Projections'): 416 | with st.form('projections-form'): 417 | left_column, right_column = st.columns(2) 418 | projection = left_column.selectbox('Projection', ['PCA', 'T-SNE', 'UMAP'], help='TODO ...') 419 | plot_type = right_column.selectbox('Plot', ['2D', '3D'], help='TODO ...') 420 | n_components = 3 421 | columns = [f'proj{i}' for i in range(1, 4)] 422 | generate_projection_clicked = st.form_submit_button('Generate Projection') 423 | 424 | if generate_projection_clicked: 425 | topic_weights = [] 426 | for index, topic_weight in enumerate(st.session_state.model[st.session_state.corpus]): 427 | weight_vector = [0] * int(st.session_state.model_kwargs['num_topics']) 428 | for topic, weight in topic_weight: 429 | weight_vector[topic] = weight 430 | topic_weights.append(weight_vector) 431 | df = pd.DataFrame(topic_weights) 432 | dominant_topic = df.idxmax(axis='columns').astype('string') 433 | dominant_topic_percentage = df.max(axis='columns') 434 | df = df.assign(dominant_topic=dominant_topic, dominant_topic_percentage=dominant_topic_percentage, 435 | text=texts_df[text_column]) 436 | with st.spinner('Training Projection'): 437 | projections = train_projection(projection, n_components, df.drop(columns=['dominant_topic', 'dominant_topic_percentage', 'text']).add_prefix('topic_')) 438 | data = pd.concat([df, pd.DataFrame(projections, columns=columns)], axis=1) 439 | 440 | px_options = {'color': 'dominant_topic', 'size': 'dominant_topic_percentage', 441 | 'hover_data': ['dominant_topic', 'dominant_topic_percentage', 'text']} 442 | if plot_type == '2D': 443 | fig = px.scatter(data, x='proj1', y='proj2', **px_options) 444 | st.plotly_chart(fig) 445 | fig = px.scatter(data, x='proj1', y='proj3', **px_options) 446 | st.plotly_chart(fig) 447 | fig = px.scatter(data, x='proj2', y='proj3', **px_options) 448 | st.plotly_chart(fig) 449 | elif plot_type == '3D': 450 | fig = px.scatter_3d(data, x='proj1', y='proj2', z='proj3', **px_options) 451 | st.plotly_chart(fig) 452 | 453 | if hasattr(st.session_state.model, 'inference'): # gensim Nmf has no 'inference' attribute so pyLDAvis fails 454 | if st.button('Generate pyLDAvis'): 455 | with st.spinner('Creating pyLDAvis Visualization ...'): 456 | py_lda_vis_data = pyLDAvis.gensim_models.prepare(st.session_state.model, st.session_state.corpus, 457 | st.session_state.id2word) 458 | py_lda_vis_html = pyLDAvis.prepared_data_to_html(py_lda_vis_data) 459 | with st.expander('pyLDAvis', expanded=True): 460 | st.markdown('pyLDAvis is designed to help users interpret the topics in a topic model that has been ' 461 | 'fit to a corpus of text data. 
The package extracts information from a fitted LDA topic ' 462 | 'model to inform an interactive web-based visualization.') 463 | st.markdown('https://github.com/bmabey/pyLDAvis') 464 | components.html(py_lda_vis_html, width=1300, height=800) 465 | -------------------------------------------------------------------------------- /streamlit_topic_modeling/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the streamlit_topic_modeling package.""" 2 | -------------------------------------------------------------------------------- /streamlit_topic_modeling/tests/test_app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/streamlit_topic_modeling/tests/test_app.py -------------------------------------------------------------------------------- /towncrier.toml: -------------------------------------------------------------------------------- 1 | [tool.towncrier] 2 | directory = "newsfragments" -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # tox (https://tox.readthedocs.io/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py36, py37, py38, py39 8 | 9 | [testenv] 10 | deps = 11 | pytest 12 | pytest-cov 13 | pytest-mock 14 | commands = 15 | pytest 16 | --------------------------------------------------------------------------------
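
Everything above wires one small gensim pipeline into Streamlit forms, caching, and expanders. As a rough orientation aid, the sketch below condenses what app.py's denoise_docs, prepare_training_data, and train_model helpers do into a single Streamlit-free script. It is only an approximation of the app's flow, not part of the repository: it assumes the packages pinned in requirements.txt are installed, reuses the bundled ./data/elonmusk.csv.zip dataset and its 'tweet' column from the DATASETS table, and replaces the app's longer URL regex with a shorter pattern.

import gensim
import nltk
import pandas as pd
import regex
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

nltk.download('stopwords')

# Noise patterns the app strips before tokenizing: emails, mentions, hashtags, and
# (here, simplified) URLs.
REMOVE_REGEX = regex.compile(r'(\S*@\S*|@\S*|#\S+|https?://\S+)')

# Load the bundled dataset and keep only the text column the app uses for it.
texts = pd.read_csv('./data/elonmusk.csv.zip')['tweet'].astype(str).tolist()

# Denoise and tokenize, dropping English stopwords (condenses denoise_docs).
stops = set(stopwords.words('english'))
docs = [[w for w in simple_preprocess(REMOVE_REGEX.sub('', text), deacc=True) if w not in stops]
        for text in texts]

# Build the dictionary and bag-of-words corpus (mirrors prepare_training_data).
id2word = corpora.Dictionary(docs)
corpus = [id2word.doc2bow(doc) for doc in docs]

# Train a small LDA model (mirrors train_model with a few of lda_options' defaults).
model = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=6, passes=1, chunksize=2000)

# Inspect the topics and score them the same way the Metrics expander does.
for index, summary in model.show_topics(num_topics=6, num_words=10):
    print(f'Topic {index}: {summary}')
print('u_mass coherence:', CoherenceModel(model=model, corpus=corpus, coherence='u_mass').get_coherence())

Swapping gensim.models.Nmf in for gensim.models.LdaModel at the same call site is essentially what the app's Non-Negative Matrix Factorization option does, with the additional keyword arguments exposed in nmf_options.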