├── README.md ├── pretrained_pipelines ├── readme.md ├── sparknlp_pretrained_pipeline_playground.mp4 └── sparknlp_pretrained_pipeline_playground.py └── requirements.txt /README.md: -------------------------------------------------------------------------------- 1 | ### spark-nlp-streamlit 2 | 3 | ## Deprecated in favor of [Streamlit in NLU](https://nlu.johnsnowlabs.com/docs/en/streamlit_viz_examples) 4 | 5 | Code samples to run Spark NLP within Streamlit.io 6 | 7 | ```bash 8 | $ git clone https://github.com/JohnSnowLabs/spark-nlp-streamlit.git 9 | $ cd spark-nlp-streamlit 10 | $ pip install -r requirements.txt 11 | $ streamlit run pretrained_pipelines/sparknlp_pretrained_pipeline_playground.py 12 | ``` 13 | -------------------------------------------------------------------------------- /pretrained_pipelines/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pretrained_pipelines/sparknlp_pretrained_pipeline_playground.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-streamlit/917fc32e8f15984561283c3f16de8573423e7077/pretrained_pipelines/sparknlp_pretrained_pipeline_playground.mp4 -------------------------------------------------------------------------------- /pretrained_pipelines/sparknlp_pretrained_pipeline_playground.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.ml import Pipeline 3 | 4 | from sparknlp.annotator import * 5 | from sparknlp.common import * 6 | from sparknlp.base import * 7 | from sparknlp.pretrained import PretrainedPipeline 8 | 9 | import pandas as pd 10 | 11 | import sparknlp 12 | 13 | #spark = sparknlp.start() 14 | 15 | 16 | #%%writefile streamlit_healthcare.py 17 | from pyspark.sql import SparkSession 18 | 19 | import os 20 | import sys 21 | 22 | import streamlit as st 23 | 24 | import os 25 | 26 | import pandas as pd 27 | 28 | jar_path = "../jars/" 29 | 30 | spark = SparkSession.builder \ 31 | .appName("Spark NLP 2.4.5") \ 32 | .master("local[8]") \ 33 | .config("spark.driver.memory","16G") \ 34 | .config("spark.driver.maxResultSize", "1G") \ 35 | .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \ 36 | .config("spark.kryoserializer.buffer.max", "1000M")\ 37 | .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5") \ 38 | .getOrCreate() 39 | 40 | 41 | SPARK_NLP_PIPELINES = ['explain_document_ml', 42 | 'explain_document_dl', 43 | 'recognize_entities_dl', 44 | 'explain_document_dl_fast', 45 | 'onto_recognize_entities_sm', 46 | 'onto_recognize_entities_lg', 47 | 'match_datetime', 48 | 'match_pattern', 49 | 'match_chunks', 50 | 'match_phrases', 51 | 'clean_stop', 52 | 'clean_pattern', 53 | 'clean_slang', 54 | 'check_spelling', 55 | 'analyze_sentiment', 56 | 'dependency_parse'] 57 | 58 | DEFAULT_TEXT = "Other than being the king of the north, John Snow is a an english physician and a leader in the development of anaesthesia and medical hygiene. He is considered for being the first one using data to cure cholera outbreak in 1834." 59 | 60 | HTML_WRAPPER = """
{}
""" 61 | 62 | 63 | @st.cache(allow_output_mutation=True) 64 | def load_pipeline(name): 65 | #if name=='match_datetime': 66 | #return light_Datetime 67 | #else: 68 | return PretrainedPipeline(name, lang='en') 69 | 70 | 71 | @st.cache(allow_output_mutation=True) 72 | def process_text(model_name, text, mode='slim'): 73 | 74 | pipeline = load_pipeline(model_name) 75 | 76 | if mode=='slim': 77 | return pipeline.annotate(text) 78 | else: 79 | return pipeline.fullAnnotate(text) 80 | 81 | st.sidebar.title("Interactive Spark NLP UI") 82 | st.sidebar.markdown( 83 | """ 84 | Process text with Spark NLP pretrained pipelines and more. Using Spark NLP LightPipelines under the hood. 85 | """ 86 | ) 87 | 88 | sparknlp_model = st.sidebar.selectbox("Pipeline name", SPARK_NLP_PIPELINES) 89 | model_load_state = st.info(f"Loading pretrained pipeline '{sparknlp_model}'...") 90 | #pipeline = load_pipeline(sparknlp_model) 91 | model_load_state.empty() 92 | 93 | #st.markdown("Text to analyze") 94 | 95 | text = st.text_area("Text to analyze", DEFAULT_TEXT) 96 | 97 | try: 98 | annotated_text = process_text(sparknlp_model, text, mode='slim') 99 | full_annotated_text = process_text(sparknlp_model, text, mode='full')[0] 100 | except Exception as e: 101 | st.write('error in loading the pipeline !') 102 | annotated_text={} 103 | full_annotated_text={} 104 | pass 105 | #stages = pretrained_pipeline.model.stages 106 | 107 | #stages = ['_'.join(s.split('_')[:-1]) for s in stages] 108 | 109 | #stages = [s['name'] for s in pretrained_pipeline.model.stages] 110 | 111 | 112 | import random 113 | 114 | def get_color(): 115 | r = lambda: random.randint(100,255) 116 | return '#%02X%02X%02X' % (r(),r(),r()) 117 | 118 | 119 | def get_onto_NER_html (annotated_text, labels): 120 | 121 | light_data=annotated_text 122 | 123 | #html_output = '

Results of NER Annotation Pipeline

' 124 | #html_output += '
' 125 | html_output='' 126 | 127 | problem_flag = False 128 | new_problem = [] 129 | problem_list = [] 130 | 131 | label_list = list(set([i.split('-')[1] for i in light_data['ner'] if i!='O'])) 132 | 133 | label_color={} 134 | 135 | for l in label_list: 136 | 137 | label_color[l]=get_color() 138 | 139 | for index, this_token in enumerate(light_data['token']): 140 | 141 | try: 142 | ent = light_data['ner'][index].split('-')[1] 143 | except: 144 | ent = light_data['ner'][index] 145 | 146 | 147 | if ent in labels: 148 | color = label_color[ent] 149 | html_output+=''.format(color) + this_token + " " 150 | else: 151 | html_output+=this_token + " " 152 | 153 | 154 | html_output += '
' 155 | html_output += '
Color codes:' 156 | 157 | for l in labels: 158 | 159 | html_output += '{}, '.format(label_color[l],l) 160 | 161 | return html_output 162 | 163 | 164 | 165 | def show_html(annotated_text): 166 | 167 | st.header("Named Entities ({})".format(sparknlp_model)) 168 | st.sidebar.header("Named Entities") 169 | 170 | #st.write(annotated_text['ner']) 171 | label_set = list(set([i.split('-')[1] for i in annotated_text['ner'] if i!='O'])) 172 | 173 | labels = st.sidebar.multiselect( 174 | "Entity labels", options=label_set, default=list(label_set) 175 | ) 176 | 177 | html = get_onto_NER_html (annotated_text, labels) 178 | # Newlines seem to mess with the rendering 179 | html = html.replace("\n", " ") 180 | st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) 181 | 182 | st.write('') 183 | st.write('') 184 | 185 | 186 | 187 | if sparknlp_model == 'explain_document_dl': 188 | 189 | df = pd.DataFrame({'token':annotated_text['token'], 'label':annotated_text['ner'], 190 | 'corrected':annotated_text['checked'], 'POS':annotated_text['pos'], 191 | 'lemmas':annotated_text['lemma'], 'stems':annotated_text['stem']}) 192 | st.dataframe(df) 193 | 194 | 195 | elif sparknlp_model == 'explain_document_ml': 196 | 197 | #st.write(annotated_text) 198 | 199 | df = pd.DataFrame({'token':annotated_text['token'], 200 | 'corrected':annotated_text['spell'], 'POS':annotated_text['pos'], 201 | 'lemmas':annotated_text['lemmas'], 'stems':annotated_text['stems']}) 202 | st.dataframe(df) 203 | 204 | 205 | elif sparknlp_model in ['recognize_entities_dl','onto_recognize_entities_sm']: 206 | 207 | df = pd.DataFrame({'token':annotated_text['token'], 'label':annotated_text['ner']}) 208 | st.dataframe(df) 209 | 210 | 211 | elif sparknlp_model == 'check_spelling': 212 | 213 | df = pd.DataFrame({'token':annotated_text['token'], 214 | 'corrected':annotated_text['checked']}) 215 | st.dataframe(df) 216 | 217 | elif sparknlp_model == 'check_spelling': 218 | 219 | df = pd.DataFrame({'token':annotated_text['token'], 220 | 'corrected':annotated_text['checked']}) 221 | st.dataframe(df) 222 | 223 | 224 | elif sparknlp_model == 'dependency_parse': 225 | 226 | df = pd.DataFrame({'token':annotated_text['token'], 227 | 'pos':annotated_text['pos'], 228 | 'dep_mod':annotated_text['dep_mod'], 229 | 'dep_root':annotated_text['dep_root']}) 230 | st.dataframe(df) 231 | 232 | 233 | elif sparknlp_model == 'clean_slang': 234 | 235 | try: 236 | df = pd.DataFrame({'token':annotated_text['token'], 237 | 'normal':annotated_text['normal']}) 238 | st.dataframe(df) 239 | except: 240 | pass 241 | 242 | if 'entities' in annotated_text.keys(): 243 | st.write('') 244 | st.write('Named Entities') 245 | st.write('') 246 | 247 | chunks=[] 248 | entities=[] 249 | 250 | #html = get_onto_NER_html (annotated_text) 251 | #html = html.replace("\n", " ") 252 | #st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) 253 | show_html(annotated_text) 254 | 255 | for n in full_annotated_text['entities']: 256 | 257 | chunks.append(n.result) 258 | entities.append(n.metadata['entity']) 259 | 260 | st.write('') 261 | st.write('Entities') 262 | st.dataframe(pd.DataFrame({'chunks':chunks, 'entities':entities})) 263 | #st.write(annotated_text['entities']) 264 | 265 | 266 | 267 | if 'sentence' in annotated_text.keys(): 268 | st.write('') 269 | st.write('Sentences') 270 | st.write('') 271 | st.write(annotated_text['sentence']) 272 | #st.dataframe(pd.DataFrame({'sentences':annotated_text['sentence']})) 273 | 274 | if 'sentiment' in annotated_text.keys(): 275 | 276 | st.write('') 277 | st.write('Sentiment') 278 | st.write('') 279 | st.dataframe(pd.DataFrame({'sentence':annotated_text['sentence'], 'sentiment':annotated_text['sentiment']})) 280 | 281 | 282 | st.subheader('Model Output') 283 | st.write(annotated_text) 284 | 285 | st.sidebar.markdown("Spark NLP version: {}".format(sparknlp.version())) 286 | st.sidebar.markdown("Apache Spark version: {}".format(spark.version)) 287 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark 2 | spark-nlp 3 | streamlit 4 | --------------------------------------------------------------------------------