├── README.md
├── pretrained_pipelines
│   ├── readme.md
│   ├── sparknlp_pretrained_pipeline_playground.mp4
│   └── sparknlp_pretrained_pipeline_playground.py
└── requirements.txt
/README.md:
--------------------------------------------------------------------------------
# spark-nlp-streamlit

## Deprecated in favor of [Streamlit in NLU](https://nlu.johnsnowlabs.com/docs/en/streamlit_viz_examples)

Code samples for running Spark NLP inside [Streamlit](https://streamlit.io) apps.

```bash
$ git clone https://github.com/JohnSnowLabs/spark-nlp-streamlit.git
$ cd spark-nlp-streamlit
$ pip install -r requirements.txt
$ streamlit run pretrained_pipelines/sparknlp_pretrained_pipeline_playground.py
```
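For context, the playground's core pattern is just loading a pretrained pipeline and annotating text with it; a minimal sketch (the pipeline name and sample text here are illustrative):

```python
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

spark = sparknlp.start()  # local SparkSession with Spark NLP on the classpath

# Downloads the pipeline on first use and caches it locally.
pipeline = PretrainedPipeline('explain_document_ml', lang='en')

# annotate() runs a LightPipeline and returns {output_column: [strings]}.
print(pipeline.annotate('John Snow was an English physician.'))
```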
--------------------------------------------------------------------------------
/pretrained_pipelines/readme.md:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/pretrained_pipelines/sparknlp_pretrained_pipeline_playground.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-streamlit/917fc32e8f15984561283c3f16de8573423e7077/pretrained_pipelines/sparknlp_pretrained_pipeline_playground.mp4
--------------------------------------------------------------------------------
/pretrained_pipelines/sparknlp_pretrained_pipeline_playground.py:
--------------------------------------------------------------------------------
import random

import pandas as pd
import streamlit as st

from pyspark.sql import SparkSession

import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# Alternative to the explicit SparkSession below:
# spark = sparknlp.start()

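# Local Spark session with the Spark NLP 2.4.5 jar pulled from Maven.
# The Kryo serializer settings follow the session configuration Spark NLP's
# documentation suggests for its annotator models.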
spark = SparkSession.builder \
    .appName("Spark NLP 2.4.5") \
    .master("local[8]") \
    .config("spark.driver.memory", "16G") \
    .config("spark.driver.maxResultSize", "1G") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "1000M") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5") \
    .getOrCreate()

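# English pretrained pipelines offered in the sidebar dropdown; each is
# downloaded from the John Snow Labs model repository on first use and
# cached locally.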
SPARK_NLP_PIPELINES = ['explain_document_ml',
                       'explain_document_dl',
                       'recognize_entities_dl',
                       'explain_document_dl_fast',
                       'onto_recognize_entities_sm',
                       'onto_recognize_entities_lg',
                       'match_datetime',
                       'match_pattern',
                       'match_chunks',
                       'match_phrases',
                       'clean_stop',
                       'clean_pattern',
                       'clean_slang',
                       'check_spelling',
                       'analyze_sentiment',
                       'dependency_parse']

DEFAULT_TEXT = "Other than being the king of the north, John Snow is an English physician and a leader in the development of anaesthesia and medical hygiene. He is considered the first to have used data to trace a cholera outbreak, in 1854."

# Wrapper div for the rendered NER markup; the exact inline style is an
# assumed reconstruction (the original tags were stripped from the source).
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""

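# Cache the loaded pipeline across Streamlit reruns so it is not re-downloaded
# on every widget interaction; allow_output_mutation disables the output-hash
# check, which would choke on the pipeline object.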
@st.cache(allow_output_mutation=True)
def load_pipeline(name):
    return PretrainedPipeline(name, lang='en')

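# 'slim' uses annotate(), which returns plain strings per output column;
# 'full' uses fullAnnotate(), which returns Annotation objects that also
# carry metadata (e.g. the entity type of each chunk).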
@st.cache(allow_output_mutation=True)
def process_text(model_name, text, mode='slim'):

    pipeline = load_pipeline(model_name)

    if mode == 'slim':
        return pipeline.annotate(text)
    else:
        return pipeline.fullAnnotate(text)

st.sidebar.title("Interactive Spark NLP UI")
st.sidebar.markdown(
    """
Process text with Spark NLP pretrained pipelines and more, using Spark NLP LightPipelines under the hood.
"""
)

sparknlp_model = st.sidebar.selectbox("Pipeline name", SPARK_NLP_PIPELINES)
model_load_state = st.info(f"Loading pretrained pipeline '{sparknlp_model}'...")
model_load_state.empty()

text = st.text_area("Text to analyze", DEFAULT_TEXT)

try:
    annotated_text = process_text(sparknlp_model, text, mode='slim')
    full_annotated_text = process_text(sparknlp_model, text, mode='full')[0]
except Exception as e:
    st.error(f"Error while running the pipeline: {e}")
    annotated_text = {}
    full_annotated_text = {}

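# Random light color (RGB channels in 100-255) so highlighted tokens stay
# readable on a white background.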
def get_color():
    r = lambda: random.randint(100, 255)
    return '#%02X%02X%02X' % (r(), r(), r())

def get_onto_NER_html(annotated_text, labels):

    light_data = annotated_text

    html_output = ''

    # Assign one highlight color per entity label present in the NER output.
    label_list = list(set([i.split('-')[1] for i in light_data['ner'] if i != 'O']))

    label_color = {}

    for l in label_list:
        label_color[l] = get_color()

    # Wrap each entity token in a colored span. NOTE: the exact span markup
    # below is an assumed reconstruction; the original tags were stripped
    # from the source.
    for index, this_token in enumerate(light_data['token']):

        try:
            ent = light_data['ner'][index].split('-')[1]
        except IndexError:
            ent = light_data['ner'][index]

        if ent in labels:
            color = label_color[ent]
            html_output += '<span style="background-color: {}">'.format(color) + this_token + '</span> '
        else:
            html_output += this_token + " "

    html_output += '<br><br>'
    html_output += '<b>Color codes:</b> '

    for l in labels:
        html_output += '<span style="background-color: {}">{}</span>, '.format(label_color[l], l)

    return html_output

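# Renders the colored NER highlights plus a sidebar multiselect for choosing
# which entity labels get highlighted.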
def show_html(annotated_text):

    st.header("Named Entities ({})".format(sparknlp_model))
    st.sidebar.header("Named Entities")

    label_set = list(set([i.split('-')[1] for i in annotated_text['ner'] if i != 'O']))

    labels = st.sidebar.multiselect(
        "Entity labels", options=label_set, default=list(label_set)
    )

    html = get_onto_NER_html(annotated_text, labels)
    # Newlines seem to mess with the rendering
    html = html.replace("\n", " ")
    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    st.write('')
    st.write('')

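# Pipeline-specific token tables: each pretrained pipeline exposes different
# output columns, so the DataFrame layout is chosen per pipeline name.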
if sparknlp_model == 'explain_document_dl':

    df = pd.DataFrame({'token': annotated_text['token'], 'label': annotated_text['ner'],
                       'corrected': annotated_text['checked'], 'POS': annotated_text['pos'],
                       'lemmas': annotated_text['lemma'], 'stems': annotated_text['stem']})
    st.dataframe(df)


elif sparknlp_model == 'explain_document_ml':

    df = pd.DataFrame({'token': annotated_text['token'],
                       'corrected': annotated_text['spell'], 'POS': annotated_text['pos'],
                       'lemmas': annotated_text['lemmas'], 'stems': annotated_text['stems']})
    st.dataframe(df)


elif sparknlp_model in ['recognize_entities_dl', 'onto_recognize_entities_sm']:

    df = pd.DataFrame({'token': annotated_text['token'], 'label': annotated_text['ner']})
    st.dataframe(df)


elif sparknlp_model == 'check_spelling':

    df = pd.DataFrame({'token': annotated_text['token'],
                       'corrected': annotated_text['checked']})
    st.dataframe(df)

elif sparknlp_model == 'dependency_parse':

    df = pd.DataFrame({'token': annotated_text['token'],
                       'pos': annotated_text['pos'],
                       'dep_mod': annotated_text['dep_mod'],
                       'dep_root': annotated_text['dep_root']})
    st.dataframe(df)


elif sparknlp_model == 'clean_slang':

    try:
        df = pd.DataFrame({'token': annotated_text['token'],
                           'normal': annotated_text['normal']})
        st.dataframe(df)
    except Exception:
        # token and normal lists can differ in length, which DataFrame rejects
        pass

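# The sections below appear only when the selected pipeline actually produced
# the corresponding output column.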
if 'entities' in annotated_text.keys():
    st.write('')
    st.write('Named Entities')
    st.write('')

    chunks = []
    entities = []

    show_html(annotated_text)

    # fullAnnotate output keeps each chunk together with its metadata,
    # including the entity type.
    for n in full_annotated_text['entities']:
        chunks.append(n.result)
        entities.append(n.metadata['entity'])

    st.write('')
    st.write('Entities')
    st.dataframe(pd.DataFrame({'chunks': chunks, 'entities': entities}))

if 'sentence' in annotated_text.keys():
    st.write('')
    st.write('Sentences')
    st.write('')
    st.write(annotated_text['sentence'])

if 'sentiment' in annotated_text.keys():

    st.write('')
    st.write('Sentiment')
    st.write('')
    st.dataframe(pd.DataFrame({'sentence': annotated_text['sentence'], 'sentiment': annotated_text['sentiment']}))


st.subheader('Model Output')
st.write(annotated_text)

st.sidebar.markdown("Spark NLP version: {}".format(sparknlp.version()))
st.sidebar.markdown("Apache Spark version: {}".format(spark.version))
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pyspark
spark-nlp
streamlit
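# Assumption: the playground pins com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5,
# which targets Spark 2.4.x (Scala 2.11), so pyspark==2.4.* and spark-nlp==2.4.5
# are likely the matching versions to install.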
--------------------------------------------------------------------------------