├── README.md
├── pretrained_pipelines
    ├── readme.md
    ├── sparknlp_pretrained_pipeline_playground.mp4
    └── sparknlp_pretrained_pipeline_playground.py
└── requirements.txt


/README.md:
--------------------------------------------------------------------------------
 1 | ### spark-nlp-streamlit
 2 | 
 3 | ## Deprecated in favor of [Streamlit in NLU](https://nlu.johnsnowlabs.com/docs/en/streamlit_viz_examples)
 4 | 
 5 | Code samples to run Spark NLP within Streamlit.io
 6 | 
 7 | ```bash
 8 | $ git clone https://github.com/JohnSnowLabs/spark-nlp-streamlit.git
 9 | $ cd spark-nlp-streamlit
10 | $ pip install -r requirements.txt
11 | $ streamlit run pretrained_pipelines/sparknlp_pretrained_pipeline_playground.py
12 | ```
13 | 


--------------------------------------------------------------------------------
/pretrained_pipelines/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/pretrained_pipelines/sparknlp_pretrained_pipeline_playground.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-streamlit/917fc32e8f15984561283c3f16de8573423e7077/pretrained_pipelines/sparknlp_pretrained_pipeline_playground.mp4


--------------------------------------------------------------------------------
/pretrained_pipelines/sparknlp_pretrained_pipeline_playground.py:
--------------------------------------------------------------------------------
  1 | from pyspark.sql import SparkSession
  2 | from pyspark.ml import Pipeline
  3 | 
  4 | from sparknlp.annotator import *
  5 | from sparknlp.common import *
  6 | from sparknlp.base import *
  7 | from sparknlp.pretrained import PretrainedPipeline
  8 | 
  9 | import pandas as pd
 10 | 
 11 | import sparknlp
 12 | 
 13 | #spark = sparknlp.start()
 14 | 
 15 | 
 16 | #%%writefile streamlit_healthcare.py
 17 | from pyspark.sql import SparkSession
 18 | 
 19 | import os
 20 | import sys
 21 | 
 22 | import streamlit as st 
 23 | 
 24 | import os
 25 | 
 26 | import pandas as pd
 27 | 
 28 | jar_path = "../jars/"
 29 | 
 30 | spark = SparkSession.builder \
 31 |         .appName("Spark NLP 2.4.5") \
 32 |         .master("local[8]") \
 33 |         .config("spark.driver.memory","16G") \
 34 |         .config("spark.driver.maxResultSize", "1G") \
 35 |         .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
 36 |         .config("spark.kryoserializer.buffer.max", "1000M")\
 37 |         .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5") \
 38 |         .getOrCreate()
 39 | 
 40 | 
 41 | SPARK_NLP_PIPELINES = ['explain_document_ml',
 42 | 'explain_document_dl',
 43 | 'recognize_entities_dl', 
 44 | 'explain_document_dl_fast',
 45 | 'onto_recognize_entities_sm',
 46 | 'onto_recognize_entities_lg',
 47 | 'match_datetime',
 48 | 'match_pattern',
 49 | 'match_chunks',
 50 | 'match_phrases',
 51 | 'clean_stop',
 52 | 'clean_pattern',
 53 | 'clean_slang',
 54 | 'check_spelling',
 55 | 'analyze_sentiment',
 56 | 'dependency_parse']
 57 | 
 58 | DEFAULT_TEXT = "Other than being the king of the north, John Snow is a an english physician and a leader in the development of anaesthesia and medical hygiene. He is considered for being the first one using data to cure cholera outbreak in 1834."
 59 | 
 60 | HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
 61 | 
 62 | 
 63 | @st.cache(allow_output_mutation=True)
 64 | def load_pipeline(name):
 65 |     #if name=='match_datetime':
 66 |         #return light_Datetime
 67 |     #else:
 68 |     return PretrainedPipeline(name, lang='en')
 69 | 
 70 | 
 71 | @st.cache(allow_output_mutation=True)
 72 | def process_text(model_name, text, mode='slim'):
 73 |     
 74 |     pipeline = load_pipeline(model_name)
 75 | 
 76 |     if mode=='slim':
 77 |         return pipeline.annotate(text)
 78 |     else:
 79 |         return pipeline.fullAnnotate(text)
 80 | 
 81 | st.sidebar.title("Interactive Spark NLP UI")
 82 | st.sidebar.markdown(
 83 |     """
 84 | Process text with Spark NLP pretrained pipelines and more. Using Spark NLP LightPipelines under the hood.
 85 | """
 86 | )
 87 | 
 88 | sparknlp_model = st.sidebar.selectbox("Pipeline name", SPARK_NLP_PIPELINES)
 89 | model_load_state = st.info(f"Loading pretrained pipeline '{sparknlp_model}'...")
 90 | #pipeline = load_pipeline(sparknlp_model)
 91 | model_load_state.empty()
 92 | 
 93 | #st.markdown("Text to analyze")
 94 | 
 95 | text = st.text_area("Text to analyze", DEFAULT_TEXT)
 96 | 
 97 | try:
 98 |     annotated_text = process_text(sparknlp_model, text, mode='slim')
 99 |     full_annotated_text = process_text(sparknlp_model, text, mode='full')[0]
100 | except Exception as e:
101 |     st.write('error in loading the pipeline !')
102 |     annotated_text={}
103 |     full_annotated_text={}
104 |     pass
105 | #stages = pretrained_pipeline.model.stages
106 | 
107 | #stages = ['_'.join(s.split('_')[:-1]) for s in stages]
108 | 
109 | #stages = [s['name'] for s in pretrained_pipeline.model.stages]
110 | 
111 | 
112 | import random
113 | 
114 | def get_color():
115 |     r = lambda: random.randint(100,255)
116 |     return '#%02X%02X%02X' % (r(),r(),r())
117 | 
118 |     
119 | def get_onto_NER_html (annotated_text, labels):
120 |     
121 |     light_data=annotated_text
122 |     
123 |     #html_output = '<center><h3>Results of NER Annotation Pipeline</h3></center>'
124 |     #html_output += '<div style="border:2px solid #747474; margin: 5px; padding: 10px">'
125 |     html_output=''
126 |     
127 |     problem_flag = False
128 |     new_problem = []
129 |     problem_list = []
130 |     
131 |     label_list = list(set([i.split('-')[1] for i in light_data['ner'] if i!='O']))
132 |     
133 |     label_color={}
134 |     
135 |     for l in label_list:
136 |         
137 |         label_color[l]=get_color()
138 |             
139 |     for index, this_token in enumerate(light_data['token']):
140 | 
141 |         try:
142 |             ent = light_data['ner'][index].split('-')[1]
143 |         except:
144 |             ent = light_data['ner'][index]
145 |         
146 |        
147 |         if ent in labels:
148 |             color = label_color[ent]
149 |             html_output+='<SPAN style="background-color: {}">'.format(color) + this_token + " </SPAN>"
150 |         else:
151 |             html_output+=this_token + " "
152 |         
153 | 
154 |     html_output += '</div>'
155 |     html_output += '<div>Color codes:'
156 | 
157 |     for l in labels:
158 |         
159 |         html_output += '<SPAN style="background-color: {}">{}</SPAN>, '.format(label_color[l],l)
160 |    
161 |     return html_output
162 |     
163 | 
164 | 
165 | def show_html(annotated_text):
166 | 
167 |     st.header("Named Entities ({})".format(sparknlp_model))
168 |     st.sidebar.header("Named Entities")
169 | 
170 |     #st.write(annotated_text['ner'])
171 |     label_set = list(set([i.split('-')[1] for i in annotated_text['ner'] if i!='O']))
172 | 
173 |     labels = st.sidebar.multiselect(
174 |             "Entity labels", options=label_set, default=list(label_set)
175 |         )
176 |         
177 |     html = get_onto_NER_html (annotated_text, labels) 
178 |         # Newlines seem to mess with the rendering
179 |     html = html.replace("\n", " ")
180 |     st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
181 | 
182 |     st.write('')
183 |     st.write('')
184 | 
185 | 
186 |     
187 | if sparknlp_model == 'explain_document_dl':
188 |     
189 |     df = pd.DataFrame({'token':annotated_text['token'], 'label':annotated_text['ner'],
190 |                       'corrected':annotated_text['checked'], 'POS':annotated_text['pos'],
191 |                       'lemmas':annotated_text['lemma'], 'stems':annotated_text['stem']})
192 |     st.dataframe(df)
193 |     
194 | 
195 | elif sparknlp_model == 'explain_document_ml':
196 |     
197 |     #st.write(annotated_text)
198 |     
199 |     df = pd.DataFrame({'token':annotated_text['token'], 
200 |                       'corrected':annotated_text['spell'], 'POS':annotated_text['pos'],
201 |                       'lemmas':annotated_text['lemmas'], 'stems':annotated_text['stems']})
202 |     st.dataframe(df)
203 |     
204 |     
205 | elif sparknlp_model in ['recognize_entities_dl','onto_recognize_entities_sm']:
206 |     
207 |     df = pd.DataFrame({'token':annotated_text['token'], 'label':annotated_text['ner']})
208 |     st.dataframe(df)
209 |     
210 |     
211 | elif sparknlp_model == 'check_spelling':
212 |     
213 |     df = pd.DataFrame({'token':annotated_text['token'],
214 |                       'corrected':annotated_text['checked']})
215 |     st.dataframe(df)
216 |     
217 | elif sparknlp_model == 'check_spelling':
218 |     
219 |     df = pd.DataFrame({'token':annotated_text['token'],
220 |                       'corrected':annotated_text['checked']})
221 |     st.dataframe(df)
222 |     
223 |     
224 | elif sparknlp_model == 'dependency_parse':
225 |     
226 |     df = pd.DataFrame({'token':annotated_text['token'],
227 |                        'pos':annotated_text['pos'],
228 |                        'dep_mod':annotated_text['dep_mod'],
229 |                       'dep_root':annotated_text['dep_root']})
230 |     st.dataframe(df)
231 |     
232 |     
233 | elif sparknlp_model == 'clean_slang':
234 |     
235 |     try:
236 |         df = pd.DataFrame({'token':annotated_text['token'],
237 |                        'normal':annotated_text['normal']})
238 |         st.dataframe(df)
239 |     except:
240 |         pass
241 |     
242 | if 'entities' in annotated_text.keys():
243 |     st.write('')
244 |     st.write('Named Entities')
245 |     st.write('')
246 |     
247 |     chunks=[]
248 |     entities=[]
249 | 
250 |     #html = get_onto_NER_html (annotated_text)
251 |     #html = html.replace("\n", " ")
252 |     #st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
253 |     show_html(annotated_text)
254 | 
255 |     for n in full_annotated_text['entities']:
256 |         
257 |         chunks.append(n.result)
258 |         entities.append(n.metadata['entity']) 
259 |     
260 |     st.write('')
261 |     st.write('Entities')
262 |     st.dataframe(pd.DataFrame({'chunks':chunks, 'entities':entities}))
263 |     #st.write(annotated_text['entities'])
264 |     
265 |     
266 |     
267 | if 'sentence' in annotated_text.keys():
268 |     st.write('')
269 |     st.write('Sentences')
270 |     st.write('')
271 |     st.write(annotated_text['sentence'])
272 |     #st.dataframe(pd.DataFrame({'sentences':annotated_text['sentence']}))
273 | 
274 | if 'sentiment' in annotated_text.keys():
275 |     
276 |     st.write('')
277 |     st.write('Sentiment')
278 |     st.write('')
279 |     st.dataframe(pd.DataFrame({'sentence':annotated_text['sentence'], 'sentiment':annotated_text['sentiment']}))
280 |     
281 |     
282 | st.subheader('Model Output')
283 | st.write(annotated_text)
284 | 
285 | st.sidebar.markdown("Spark NLP version: {}".format(sparknlp.version()))
286 | st.sidebar.markdown("Apache Spark version: {}".format(spark.version))
287 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pyspark
2 | spark-nlp
3 | streamlit
4 | 


--------------------------------------------------------------------------------