├── environment.yml ├── .gitignore ├── utils.py ├── transcripts ├── transcribed_speech_hf_pipelines.txt └── transcribed_speech_w_corrected_punctuation.txt ├── README.md ├── LICENSE ├── 2_biomedical_ner_summarization.ipynb ├── 0_speech_recognition.ipynb └── 1_grammar_punctuation_correction.ipynb /environment.yml: -------------------------------------------------------------------------------- 1 | # To use: 2 | # $ conda env create -f environment.yml 3 | # $ conda activate machinelearnear-asr-clinical-biomedical-ehr 4 | name: machinelearnear-asr-clinical-biomedical-ehr 5 | dependencies: 6 | - python=3.9 7 | - pip 8 | - nb_conda_kernels 9 | - ipykernel 10 | - ipywidgets 11 | - gh 12 | - ffmpeg 13 | - libsndfile 14 | - pip: 15 | - streamlit 16 | - gradio 17 | - torch 18 | - torchaudio 19 | - transformers 20 | - librosa 21 | - thunder-speech 22 | - pyctcdecode 23 | - https://github.com/kpu/kenlm/archive/master.zip 24 | - deepmultilingualpunctuation 25 | - https://github.com/pyannote/pyannote-audio/archive/develop.zip 26 | - stanza 27 | - youtube-dl 28 | - fasttext-langdetect 29 | - wget 30 | - pyspellchecker 31 | - autocorrect -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Repo 2 | downloaded_model/ 3 | latest_silero_models.yml 4 | test_audio_youtube.m4a 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # source: https://skeptric.com/python-diffs/ 2 | 3 | from typing import List, Any, Callable, Tuple, Union 4 | from IPython.display import HTML, display 5 | import re 6 | import difflib 7 | 8 | Token = str 9 | TokenList = List[Token] 10 | 11 | whitespace = re.compile('\s+') 12 | end_sentence = re.compile('[.!?]\s+') 13 | 14 | def tokenize(s:str) -> TokenList: 15 | '''Split a string into tokens''' 16 | return whitespace.split(s) 17 | 18 | def untokenize(ts:TokenList) -> str: 19 | '''Join a list of tokens into a string''' 20 | return ' '.join(ts) 21 | 22 | def sentencize(s:str) -> TokenList: 23 | '''Split a string into a list of sentences''' 24 | return end_sentence.split(s) 25 | 26 | def unsentencise(ts:TokenList) -> str: 27 | '''Join a list of sentences into a string''' 28 | return '. '.join(ts) 29 | 30 | def html_unsentencise(ts:TokenList) -> str: 31 | '''Joing a list of sentences into HTML for display''' 32 | return ''.join(f'

{t}

' for t in ts) 33 | 34 | def mark_text(text:str) -> str: 35 | return f'{text}' 36 | 37 | def mark_span(text:TokenList) -> TokenList: 38 | return [mark_text(token) for token in text] 39 | 40 | def mark_span(text:TokenList) -> TokenList: 41 | if len(text) > 0: 42 | text[0] = '' + text[0] 43 | text[-1] += '' 44 | return text 45 | 46 | def markup_diff(a:TokenList, b:TokenList, 47 | mark:Callable[TokenList, TokenList]=mark_span, 48 | default_mark: Callable[TokenList, TokenList] = lambda x: x, 49 | isjunk:Union[None, Callable[[Token], bool]]=None) -> Tuple[TokenList, TokenList]: 50 | """Returns a and b with any differences processed by mark 51 | 52 | Junk is ignored by the differ 53 | """ 54 | seqmatcher = difflib.SequenceMatcher(isjunk=isjunk, a=a, b=b, autojunk=False) 55 | out_a, out_b = [], [] 56 | for tag, a0, a1, b0, b1 in seqmatcher.get_opcodes(): 57 | markup = default_mark if tag == 'equal' else mark 58 | out_a += markup(a[a0:a1]) 59 | out_b += markup(b[b0:b1]) 60 | assert len(out_a) == len(a) 61 | assert len(out_b) == len(b) 62 | return out_a, out_b 63 | 64 | def align_seqs(a: TokenList, b: TokenList, fill:Token='') -> Tuple[TokenList, TokenList]: 65 | out_a, out_b = [], [] 66 | seqmatcher = difflib.SequenceMatcher(a=a, b=b, autojunk=False) 67 | for tag, a0, a1, b0, b1 in seqmatcher.get_opcodes(): 68 | delta = (a1 - a0) - (b1 - b0) 69 | out_a += a[a0:a1] + [fill] * max(-delta, 0) 70 | out_b += b[b0:b1] + [fill] * max(delta, 0) 71 | assert len(out_a) == len(out_b) 72 | return out_a, out_b 73 | 74 | from itertools import zip_longest 75 | def html_sidebyside(a, b): 76 | # Set the panel display 77 | out = '

' 78 | # There's some CSS in Jupyter notebooks that makes the first pair unalign. This is a workaround 79 | out += '

' 80 | for left, right in zip_longest(a, b, fillvalue=''): 81 | out += f'

{left}

' 82 | out += f'

{right}

' 83 | out += '

' 84 | return out 85 | 86 | import html 87 | def html_diffs(a, b): 88 | a = html.escape(a) 89 | b = html.escape(b) 90 | 91 | out_a, out_b = [], [] 92 | for sent_a, sent_b in zip(*align_seqs(sentencize(a), sentencize(b))): 93 | mark_a, mark_b = markup_diff(tokenize(sent_a), tokenize(sent_b)) 94 | out_a.append(untokenize(mark_a)) 95 | out_b.append(untokenize(mark_b)) 96 | 97 | return html_sidebyside(out_a, out_b) 98 | 99 | def show_diffs(a, b): 100 | display(HTML(html_diffs(a,b))) -------------------------------------------------------------------------------- /transcripts/transcribed_speech_hf_pipelines.txt: -------------------------------------------------------------------------------- 1 | estamos con marcelo ferre es gastro enterólogo cómo le va a doctor buenas noches unasa ver hoy charlaba con nuestros compañeros de trabajo y decía no pero murió de un cáncer y había otra enfermedad previa y cómo es estocuéntenos la enfermedad de croncque esla enfermedad de cron correcto fernando la enfermedad de cron es una enfermedad inflamatoria del intestino sque entra dentro del grupo de las enfermedades inflamatorias intestinales que son enfermedades crónicas son autoinmunes tienen una base genética son raras en el sentido del diagnóstico son poco frecuentes hay pacientes en todo el país empacientes con esta enfermedad nada más nada más en todo el país o al menos diagnosticados se diagnosticado y conocido sicon algún seguimiento por parte de sus médicos uno cada chabitantes puede tener esta enfermedad aunque no la diagnostiquemos tan frecuentemente y en realidadcuesta diagnosticarla en el sentido al no verla y al no estar en contacto con ellos el diagnóstico es de difícil acceso eldoctor cuéntenos por ejemplo esa imagen que estamos observando nosotrosaimagen completa por favor dire vamos a agradecer porqe eldoctor vamos a utilizar una cuestión didáctica esto es un video que nosotros bajamos de las redes como para entender cuálesporahí estamos viendo el aparato digestio 
correcto en ese aparato digestivo la lupa se está posando en dónde en el intestino delgado yahí va hacía el grueso ahí va ahacía el gruesoeesta enfermedad se manifiesta en el grueso en el delgado o en ambos enambos desde la boca alano aopor eso las complicaciones tremendas porlas complicaciones que tiene la enfermedad cuando no se le diagnosticuae y no se la atrata esta enfermedad es mortal en realidad es una enfermedad crónica y como todas las enfermedades crónicas yo se la comparo con una diabetes por ejemploes una enfermedad que puede terminar siendo mortal pero si no se la controla primero si no se la diagnostica no se la controla y no tiene un seguimiento yo puedo intuir que tengo la enfermedad debien digo de cron por qué síntomas lossíntomas más comunes generalmente gente entre taños esla edad del debut y tienen dolor abdominal pueden tener diarrea pueden tener fiebre pueden tener algún sangrado intestinal pero a su vez tienen manifestaciones extraintestinales como pueden ser dolores articulares es decir es una enfermedad sistémica es una enfermedad que no solamente puede afectar el intestino delgado quees msu parte más frecuente lugar que lo afecta puede afectar el colón pero también hay enfermedad de crón de estómago de sófago son más raras dentro de la rareza de la propia enfermedad estas también son más raras las localizacione nosotrosentendemos que en estos espacios lo que vemos hacer es no llenar de miedo sino que instruir a través de la palabra de utdsye los profesionales de cómo no puede estar advertido esto porque por ejemplo si tengo diarrea esta noche no significa que estoy enfermo para itimatología puede responder un montón de cosas me comió un lechón y me cayó mal pero lo que te quiero decir des estos elementos qsonaquellos que se conjugan para establecer un primer diagnóstico una aproximación yenel caso de hendler aél le demandó como trebarracuatro años para diagnosticarla porque son difíciles de diagnosticar al verse poco son difíciles de 
diagnosticar todo depende por donde uno vaya a consultar si vos me dices yo voy con estos síntomas que acabamos de hablar a un médico clínico por ahí es más gamodifícil para el clínico que para un gastro enterólogo que las vemos más pero no significa que el clínico no las pueda diagnosticar también las puede diagnosticalguna preguntaesta enfermedad podemos decir que decanta en el cáncer que luego termina con la vida de genre o no necesariamente una cosa vaatada a la otra no necesariamente la enfermedad de crón es una enfermedad repito crónica autinmune que si tiene un seguimiento y tiene un tratamiento no debería terminar en un cancertres patas del tratamiento que usted por ejemplo me diagnosticaría a mí si usted dice mireidalgo usted tiene o padece esta patología tres patas del tratamiento enel tratamiento básicamente tiene que haber un tratamiento psicológicoporque ese paciente tiene que tener una contención tiene que tener un seguimiento y tiene que entender de que esta enfermedad la puede llevar hasta el final de sus días y morir de cualquier otra cosa menos de a enfermedad de crom es muy importante el enfoque y el apoyo psicológico después el tratamiento médico hoy endía hay tratamiento farmacológico un tratamiento farmacológico con anticuerpos monoclonales es un tratamiento realmente que hoy lo curre en todas las obras sociales es decir tiene tratamiento no tiene cura tiene trataiteneso cambia los hábitos de alimentación también los hábitos de alimentación también estos pacientes tienen una dieta a seguir una dieta más conveniente una dieta saludablerica en fibras con un cuidado especial del intestino no es una dieta que no la pueda hacer o sea no es algo extraordinario si vienes una enfermedad rara cuando se detecta se la puede tratar dctor le ha tocado diagnosticar sísss claro no son muchos los pacientes que hahy el mendoza menos site imagina que a nivel país hay ade los que están documentados estadificados y demás es poco frecuente usted ha podido llevar 
adelante buenos tratamientos sitenés buenos resultados por qué me habloó el tratamiento psicológico porque es una parte muy importante del paciéntte todas las enfermedades inflamatorias del intestino tienen que tener un acompañamiento psicológico porque están influenciando se considera quel intestino es el segundo cerebro y tiene comunicación con nuestro primer cerebro que es el que la geneque todos conocemoshabitualmente exactamente y de la armonía de los dos cerebros el resultado también del seguimiento deestresfastidios ira cibilidad alibrmentaciónmacedentaría mostavaquismo alcolismo bueno todo lo que te puedas imaginar agrede y tiene sus consecuencias doctorcomo mensaje final no hay que temerles estas cosas sino que hay que enfrentarlas y tratarlas totalmente estos pacientes tienen un seguimiento tienen un tratamiento y no es sinónimo ypara tranquilidad de todos estos pacientes que pueden tener o nos pueden estar escuchando no es sinónimo que van a padecer un cáncer deintestino ni mucho menos tienen exactamente estos pacientes las mismas posibilidades de hacer un cáncer de colón que cualquier otro paciente siempre y cuando estén en tratamiento y en seguimiento rhasido umuy amabl gracias por su tiempo buenas noches -------------------------------------------------------------------------------- /transcripts/transcribed_speech_w_corrected_punctuation.txt: -------------------------------------------------------------------------------- 1 | Estamos con Marcelo Ferre. es gastro enterólogo. cómo le va a doctor? buenas noches, Unasa ver. hoy charlaba con nuestros compañeros de trabajo y decía: no, pero murió de un cáncer y había otra enfermedad previa. y cómo es estocuéntenos La enfermedad de Croncque? esla enfermedad de Cron, correcto, Fernando? 
La enfermedad de Cron es una enfermedad inflamatoria del intestino Sque entra dentro del grupo de las enfermedades inflamatorias intestinales, que son enfermedades crónicas, son autoinmunes, tienen una base genética, Son raras en el sentido del diagnóstico, Son poco frecuentes. Hay pacientes en todo el país empacientes con esta enfermedad, nada más, nada más en todo el país, o al menos diagnosticados, Se diagnosticado y conocido, sicon algún seguimiento por parte de sus médicos. Uno cada chabitantes puede tener esta enfermedad, aunque no la diagnostiquemos tan frecuentemente y en realidadcuesta diagnosticarla en el sentido, al no verla y al no estar en contacto con ellos, el diagnóstico es de difícil acceso. Eldoctor, cuéntenos, por ejemplo, esa imagen que estamos observando nosotrosaimagen completa. por favor dire: vamos a agradecer porqe Eldoctor, vamos a utilizar una cuestión didáctica. Esto es un video que nosotros bajamos de las redes, como para entender cuálesporahí Estamos viendo el aparato digestio correcto. en ese aparato digestivo, La lupa se está posando en dónde, en el intestino delgado, yahí va hacía el grueso. ahí va ahacía el gruesoeesta enfermedad se manifiesta en el grueso, en el delgado o en ambos enambos, desde la boca alano aopor eso las complicaciones tremendas porlas complicaciones que tiene la enfermedad cuando no se le diagnosticuae y no se la atrata. Esta enfermedad es mortal. en realidad es una enfermedad crónica y, como todas las enfermedades crónicas, Yo se la comparo con una diabetes, por ejemploes una enfermedad que puede terminar siendo mortal, pero si no se la controla primero, si no se la diagnostica, no se la controla y no tiene un seguimiento. Yo puedo intuir que tengo la enfermedad, debien digo de cron. por qué síntomas? lossíntomas más comunes, generalmente gente entre taños, esla edad del debut y tienen dolor abdominal. 
Pueden tener diarrea, Pueden tener fiebre, Pueden tener algún sangrado intestinal, pero a su vez tienen manifestaciones extraintestinales, como pueden ser dolores articulares. Es decir, es una enfermedad sistémica. Es una enfermedad que no solamente puede afectar el intestino delgado, Quees MsU, parte más frecuente, lugar que lo afecta. puede afectar el colón, pero también hay enfermedad de crón, de estómago, de sófago. Son más raras. dentro de la rareza de la propia enfermedad, Estas también son más raras Las localizacione. nosotrosentendemos que en estos espacios, lo que vemos hacer es no llenar de miedo, sino que instruir a través de la palabra de UtDsye Los profesionales, de cómo no puede estar advertido esto? porque, por ejemplo, si tengo diarrea esta noche, no significa que estoy enfermo para itimatología. puede responder un montón de cosas. Me comió un lechón y me cayó mal. pero lo que te quiero decir des estos elementos qsonaquellos que se conjugan para establecer un primer diagnóstico, Una aproximación yenel caso de Hendler Aél le demandó como trebarracuatro años para diagnosticarla, porque son difíciles de diagnosticar. al verse poco, son difíciles de diagnosticar. todo depende por donde uno vaya a consultar. Si vos me dices, Yo voy con estos síntomas que acabamos de hablar a un médico clínico Por ahí, es más gamodifícil para el clínico que para un gastro enterólogo, que las vemos más. Pero no significa que el clínico no las pueda diagnosticar. También las puede diagnosticalguna preguntaesta enfermedad podemos decir que decanta en el cáncer, que luego termina con la vida de genre o no necesariamente una cosa vaatada a la otra? no necesariamente. La enfermedad de crón es una enfermedad, repito, crónica, autinmune, que, si tiene un seguimiento y tiene un tratamiento, no debería terminar en un cancertres patas del tratamiento que usted, por ejemplo, me diagnosticaría a mí. 
Si usted dice, mireidalgo, usted tiene o padece esta patología, tres patas del tratamiento enel tratamiento básicamente tiene que haber un tratamiento psicológicoporque Ese paciente tiene que tener una contención, tiene que tener un seguimiento y tiene que entender de que esta enfermedad la puede llevar hasta el final de sus días y morir de cualquier otra cosa, menos de a enfermedad de Crom. es muy importante el enfoque y el apoyo psicológico. después el tratamiento médico. hoy endía Hay tratamiento farmacológico, Un tratamiento farmacológico con anticuerpos monoclonales. Es un tratamiento realmente que hoy lo curre en todas las obras sociales. Es decir: tiene tratamiento, no tiene cura, tiene trataiteneso cambia los hábitos de alimentación, También los hábitos de alimentación. También estos pacientes tienen una dieta a seguir, una dieta más conveniente, Una dieta saludablerica en fibras, con un cuidado especial del intestino. No es una dieta que no la pueda hacer, o sea no es algo extraordinario. si vienes una enfermedad rara, cuando se detecta, se la puede tratar. dctor le ha tocado diagnosticar sísss. claro, No son muchos los pacientes que hahy el mendoza menos site imagina que a nivel país hay ade, los que están documentados, estadificados y demás. es poco frecuente. Usted ha podido llevar adelante buenos tratamientos sitenés buenos resultados. por qué me habloó el tratamiento psicológico? Porque es una parte muy importante del paciéntte. Todas las enfermedades inflamatorias del intestino tienen que tener un acompañamiento psicológico, porque están influenciando. se considera quel intestino Es el segundo cerebro y tiene comunicación con nuestro primer cerebro, que es el que la geneque todos conocemoshabitualmente exactamente, y de la armonía de los dos cerebros, El resultado también del seguimiento deestresfastidios, ira, cibilidad, alibrmentaciónmacedentaría, mostavaquismo, alcolismo. bueno, todo lo que te puedas imaginar agrede y tiene sus consecuencias. 
Doctorcomo mensaje final, No hay que temerles estas cosas, sino que hay que enfrentarlas y tratarlas totalmente. Estos pacientes tienen un seguimiento, tienen un tratamiento, y no es sinónimo ypara tranquilidad De todos estos pacientes que pueden tener o nos pueden estar escuchando. No es sinónimo que van a padecer un cáncer deintestino, ni mucho menos tienen exactamente estos pacientes Las mismas posibilidades de hacer un cáncer de colón que cualquier otro Paciente, siempre y cuando estén en tratamiento y en seguimiento. rhasido Umuy Amabl, gracias por su tiempo. buenas noches. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Automatically detect medical entities from speech 2 | 3 | Through this guide, you will learn how to do automatic speech recognition in your language, fix the grammar from that transcribed speech, restore its punctuation, detect biomedical or clinical entities from that text, get a summary of it, and finally how to put everything together. 
4 | 5 | ## Getting started with Automatic Speech Recognition 6 | - [Intro to Automatic Speech Recognition on 🤗](https://huggingface.co/tasks/automatic-speech-recognition) 7 | - [Robust Speech Challenge Results on 🤗](https://huggingface.co/spaces/speech-recognition-community-v2/FinalLeaderboard) 8 | - [Mozilla Common Voice 9.0](https://huggingface.co/datasets/mozilla-foundation/common_voice_9_0) 9 | - [Thunder-speech, A Hackable speech recognition library](https://scart97.github.io/thunder-speech/Ultimate%20guide/) 10 | - [SpeechBrain - PyTorch powered speech toolkit](https://speechbrain.github.io/) 11 | - [Neural building blocks for speaker diarization: speech activity detection, speaker change detection, overlapped speech detection, speaker embedding](https://github.com/pyannote/pyannote-audio) 12 | - [SPEECH RECOGNITION WITH WAV2VEC2](https://pytorch.org/tutorials/intermediate/speech_recognition_pipeline_tutorial.html) 13 | - [How to add timestamps to ASR output](https://github.com/huggingface/transformers/issues/11307) 14 | 15 | ## Video Tutorial in YouTube 16 | [![Como hacer tu propia solución de dictado automático de informes médicos (+ repo)](https://img.youtube.com/vi/_0KGck2JU0w/0.jpg)](https://www.youtube.com/watch?v=_0KGck2JU0w) 17 | 18 | ## Requirements 19 | - [SageMaker Studio Lab](https://studiolab.sagemaker.aws/) account. See this [explainer video](https://www.youtube.com/watch?v=FUEIwAsrMP4) to learn more about this. 
20 | - Python=3.9 21 | - PyTorch>=1.10 22 | - Hugging Face Transformers 23 | - Several audio processing libraries (see `environment.yml`) 24 | 25 | ## Step by step tutorial 26 | 27 | ### Clone repo and install dependencies 28 | 29 | There are 3 main notebooks to follow; start with `0_speech_recognition.ipynb` [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/machinelearnear/asr-restore-punctuation-summarization-biomedical-ehr/blob/main/0_speech_recognition.ipynb) 30 | 31 | Click on `Copy to project` in the top right corner. This will open the Studio Lab web interface and ask you whether you want to clone the entire repo or just the Notebook. Clone the entire repo and click `Yes` when asked about building the `Conda` environment automatically. You will now be running on top of a `Python` environment with all libraries already installed. 32 | 33 | ### Transcribe speech from audio (YouTube) 34 | 35 | Open `0_speech_recognition.ipynb` and run all steps. For more information, please refer back to [this other repo from machinelearnear](https://github.com/machinelearnear/long-audio-transcription-spanish). You will download the audio from a YouTube video by providing its VideoID and then generate a transcript that is saved locally in the repo's `transcripts/` folder. 36 | 37 | ### Fix grammar and restore punctuation 38 | 39 | Open `1_grammar_punctuation_correction.ipynb` and load your transcribed speech. What we want to do now is first fix the spelling and grammar errors and then, working from that corrected text, restore the punctuation. This order is arbitrary; try the reverse order on your own data to see which gives better results. 40 | 41 | I have tested a number of libraries to do spellchecking and ended up with `autocorrect` and `pyspellchecker`. 
Both of them allow custom vocabularies to be added to the spell checker (see [this for example](https://github.com/filyp/autocorrect/issues/17)), so this is where you can plug in your own list of words relevant to your domain, e.g. radiology, pathology, etc. You would run them as follows: 42 | 43 | ```python 44 | from spellchecker import SpellChecker 45 | spell_py = SpellChecker(language='es', distance=2) # Spanish dictionary 46 | processed_text = spell_py.correction(input_text) 47 | ``` 48 | 49 | ```python 50 | from autocorrect import Speller 51 | spell_autocorrect = Speller(lang='es',only_replacements=True) 52 | processed_text = spell_autocorrect(input_text) 53 | ``` 54 | 55 | Once we have our corrected text, we apply a model to restore punctuation. There are a number of them, and you can see many links at the bottom of the notebook, but I shortlisted two: [deepmultilingualpunctuation](https://github.com/oliverguhr/deepmultilingualpunctuation) and [Silero](https://github.com/snakers4/silero-models#text-enhancement). Both of them can be fine-tuned for a specific language. The first library is the one that performs best, even though it was not trained on Spanish; I'm using its multilingual model. 56 | 57 | 58 | ```python 59 | from deepmultilingualpunctuation import PunctuationModel 60 | model = PunctuationModel(model='oliverguhr/fullstop-punctuation-multilingual-base') 61 | result = model.restore_punctuation(output_text) 62 | ``` 63 | 64 | ### Detect medical entities (NER) and run summarisation 65 | 66 | To detect medical entities, we are going to be using [Stanza](https://stanfordnlp.github.io/stanza/), "a collection of accurate and efficient tools for the linguistic analysis of many human languages. Starting from raw text to syntactic analysis and entity recognition, Stanza brings state-of-the-art NLP models to languages of your choosing". 
There are medical NLP models available in Hugging Face through [the Spanish Government's National NLP Plan](https://huggingface.co/PlanTL-GOB-ES) but they are not yet fine-tuned to detect clinical entities such as `disease`, `treatment`, etc. 67 | 68 | ```python 69 | import stanza 70 | # download and initialize a mimic pipeline with an i2b2 NER model 71 | stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) 72 | nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) 73 | # annotate clinical text 74 | doc = nlp(input_text) 75 | # print out all entities 76 | for ent in doc.entities: 77 | print(f'{ent.text}\t{ent.type}') 78 | ``` 79 | 80 | Summarisation example 81 | 82 | ```python 83 | # model_name = "google/pegasus-large" 84 | model_name = "google/pegasus-xsum" 85 | # model_name = "csebuetnlp/mT5_multilingual_XLSum" 86 | # model_name = "sshleifer/distilbart-cnn-12-6" 87 | # model_name = 'ELiRF/NASES' 88 | from transformers import pipeline 89 | pipe = pipeline(model=model_name) 90 | summary = pipe(input_text,truncation=True) 91 | print(summary[0]['summary_text']) 92 | ``` 93 | 94 | ## Keep reading 95 | - [Pyctcdecode & Speech2text decoding](https://www.youtube.com/watch?v=mp7fHMTnK9A&t=5s) 96 | - [XLS-R: Large-Scale Cross-lingual Speech Representation Learning on 128 Languages](https://www.youtube.com/watch?v=ic_J7ZCROBM) 97 | - [Unlocking global speech with Mozilla Common Voice](https://www.youtube.com/watch?v=Vvn984QmAVg) 98 | - [Reconocimiento automático de voz con Python y HuggingFace en segundos (+ Repo)](https://www.youtube.com/watch?v=wFjPxz22MEs) 99 | - [“SomosNLP”, red internacional de estudiantes, profesionales e investigadores acelerando el avance del NLP en español](https://somosnlp.org/) 100 | - [How to Write a Spelling Corrector](https://norvig.com/spell-correct.html ) 101 | - [Build Spell Checking Models For Any Language In 
Python](https://medium.com/mlearning-ai/build-spell-checking-models-for-any-language-in-python-aa4489df0a5f ) 102 | - [Grammatical Error Correction](http://nlpprogress.com/english/grammatical_error_correction.html ) 103 | - [FullStop: Multilingual Deep Models for Punctuation Prediction](http://ceur-ws.org/Vol-2957/sepp_paper4.pdf) 104 | - [BioMedIA: Abstractive Question Answering for the BioMedical Domain in Spanish](https://huggingface.co/spaces/hackathon-pln-es/BioMedIA) 105 | - [PlanTL-GOB-ES/bsc-bio-ehr-es-pharmaconer](https://huggingface.co/PlanTL-GOB-ES/bsc-bio-ehr-es-pharmaconer ) 106 | - [Host Hugging Face transformer models using Amazon SageMaker Serverless Inference](https://aws.amazon.com/de/blogs/machine-learning/host-hugging-face-transformer-models-using-amazon-sagemaker-serverless-inference/) 107 | 108 | ## Citations 109 | ```bibtex 110 | Peng Qi, Yuhao Zhang, Yuhui Zhang, Jason Bolton and Christopher D. Manning. 2020. Stanza: A Python Natural Language Processing Toolkit for Many Human Languages. In Association for Computational Linguistics (ACL) System Demonstrations. 2020. [pdf][bib] 111 | 112 | Yuhao Zhang, Yuhui Zhang, Peng Qi, Christopher D. Manning, Curtis P. Langlotz. Biomedical and Clinical English Model Packages in the Stanza Python NLP Library, Journal of the American Medical Informatics Association. 2021. 113 | ``` 114 | 115 | ## Disclaimer 116 | - The content provided in this repository is for demonstration purposes and not meant for production. You should use your own discretion when using the content. 117 | - The ideas and opinions outlined in these examples are my own and do not represent the opinions of AWS. 
118 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /2_biomedical_ner_summarization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "47d4231e-aefd-439a-83df-3dbbab9f30ec", 6 | "metadata": {}, 7 | "source": [ 8 | "# Detect medical entities from transcribed speech" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "efbc0a00-7696-4cf7-a167-270d112dd532", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "input_text = open('transcripts/transcribed_speech_w_corrected_punctuation.txt','r').readlines()[0]" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "0c3cd15a-e205-402a-939d-7018d757dbe9", 24 | "metadata": { 25 | "tags": [] 26 | }, 27 | "source": [ 28 | "## Use `stanza-nlp`\n", 29 | "- https://stanfordnlp.github.io/stanza/\n", 30 | "- https://stanfordnlp.github.io/stanza/available_biomed_models.html\n", 31 | "- https://stanfordnlp.github.io/stanza/biomed_model_usage.html" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | 
"execution_count": 2, 37 | "id": "dda4e5f8-59d5-4508-ae38-27bcc1f99671", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import stanza" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "id": "ded69277-626f-4130-ab75-568f96a0c166", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "application/vnd.jupyter.widget-view+json": { 53 | "model_id": "ba4ada694f3f443dbc7e7fd24e3bfb6a", 54 | "version_major": 2, 55 | "version_minor": 0 56 | }, 57 | "text/plain": [ 58 | "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 0%| …" 59 | ] 60 | }, 61 | "metadata": {}, 62 | "output_type": "display_data" 63 | }, 64 | { 65 | "name": "stderr", 66 | "output_type": "stream", 67 | "text": [ 68 | "2022-05-15 14:04:47 WARNING: Can not find ner: i2b2 from official model list. Ignoring it.\n", 69 | "2022-05-15 14:04:47 WARNING: Can not find package: mimic.\n", 70 | "2022-05-15 14:04:47 INFO: Downloading these customized packages for language: es (Spanish)...\n", 71 | "=======================\n", 72 | "| Processor | Package |\n", 73 | "-----------------------\n", 74 | "=======================\n", 75 | "\n", 76 | "2022-05-15 14:04:47 INFO: Finished downloading models and saved to /home/studio-lab-user/stanza_resources.\n" 77 | ] 78 | }, 79 | { 80 | "data": { 81 | "application/vnd.jupyter.widget-view+json": { 82 | "model_id": "22ce69e1b0674bd8b255136955254bb0", 83 | "version_major": 2, 84 | "version_minor": 0 85 | }, 86 | "text/plain": [ 87 | "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 0%| …" 88 | ] 89 | }, 90 | "metadata": {}, 91 | "output_type": "display_data" 92 | }, 93 | { 94 | "data": { 95 | "application/vnd.jupyter.widget-view+json": { 96 | "model_id": "dce9cd738329440589358d33fff7a38e", 97 | "version_major": 2, 98 | "version_minor": 0 99 | }, 100 | "text/plain": [ 101 | "Downloading 
https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/tokenize/mimic.pt: 0%| …" 102 | ] 103 | }, 104 | "metadata": {}, 105 | "output_type": "display_data" 106 | }, 107 | { 108 | "data": { 109 | "application/vnd.jupyter.widget-view+json": { 110 | "model_id": "eef7e690383d48dcba0712504d91fc09", 111 | "version_major": 2, 112 | "version_minor": 0 113 | }, 114 | "text/plain": [ 115 | "Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/pos/mimic.pt: 0%| | …" 116 | ] 117 | }, 118 | "metadata": {}, 119 | "output_type": "display_data" 120 | }, 121 | { 122 | "data": { 123 | "application/vnd.jupyter.widget-view+json": { 124 | "model_id": "3b132c2d881b4751a2f9dff4353b0628", 125 | "version_major": 2, 126 | "version_minor": 0 127 | }, 128 | "text/plain": [ 129 | "Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/lemma/mimic.pt: 0%| …" 130 | ] 131 | }, 132 | "metadata": {}, 133 | "output_type": "display_data" 134 | }, 135 | { 136 | "data": { 137 | "application/vnd.jupyter.widget-view+json": { 138 | "model_id": "dce96d9c0d5e4eefa17842a50e2e9079", 139 | "version_major": 2, 140 | "version_minor": 0 141 | }, 142 | "text/plain": [ 143 | "Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/depparse/mimic.pt: 0%| …" 144 | ] 145 | }, 146 | "metadata": {}, 147 | "output_type": "display_data" 148 | }, 149 | { 150 | "data": { 151 | "application/vnd.jupyter.widget-view+json": { 152 | "model_id": "e2358c2b81b44f318ee9199a5d2dbdc3", 153 | "version_major": 2, 154 | "version_minor": 0 155 | }, 156 | "text/plain": [ 157 | "Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/ner/i2b2.pt: 0%| | 0…" 158 | ] 159 | }, 160 | "metadata": {}, 161 | "output_type": "display_data" 162 | }, 163 | { 164 | "data": { 165 | "application/vnd.jupyter.widget-view+json": { 166 | "model_id": "3f8fd8c0fe6f49e1adf7e4f8bb4efbb3", 167 | "version_major": 2, 168 | "version_minor": 0 169 | }, 170 | 
"text/plain": [ 171 | "Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/backward_charlm/mimic.pt: 0%|…" 172 | ] 173 | }, 174 | "metadata": {}, 175 | "output_type": "display_data" 176 | }, 177 | { 178 | "data": { 179 | "application/vnd.jupyter.widget-view+json": { 180 | "model_id": "c6f56a05412f401a9f83e9f11f83b621", 181 | "version_major": 2, 182 | "version_minor": 0 183 | }, 184 | "text/plain": [ 185 | "Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/forward_charlm/mimic.pt: 0%| …" 186 | ] 187 | }, 188 | "metadata": {}, 189 | "output_type": "display_data" 190 | }, 191 | { 192 | "data": { 193 | "application/vnd.jupyter.widget-view+json": { 194 | "model_id": "3569144aebf04447a4e26bc0fb81e616", 195 | "version_major": 2, 196 | "version_minor": 0 197 | }, 198 | "text/plain": [ 199 | "Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/pretrain/mimic.pt: 0%| …" 200 | ] 201 | }, 202 | "metadata": {}, 203 | "output_type": "display_data" 204 | }, 205 | { 206 | "name": "stderr", 207 | "output_type": "stream", 208 | "text": [ 209 | "2022-05-15 14:04:56 INFO: Loading these models for language: en (English):\n", 210 | "=======================\n", 211 | "| Processor | Package |\n", 212 | "-----------------------\n", 213 | "| tokenize | mimic |\n", 214 | "| pos | mimic |\n", 215 | "| lemma | mimic |\n", 216 | "| depparse | mimic |\n", 217 | "| ner | i2b2 |\n", 218 | "=======================\n", 219 | "\n", 220 | "2022-05-15 14:04:56 INFO: Use device: cpu\n", 221 | "2022-05-15 14:04:56 INFO: Loading: tokenize\n", 222 | "2022-05-15 14:04:56 INFO: Loading: pos\n", 223 | "2022-05-15 14:04:56 INFO: Loading: lemma\n", 224 | "2022-05-15 14:04:56 INFO: Loading: depparse\n", 225 | "2022-05-15 14:04:56 INFO: Loading: ner\n", 226 | "2022-05-15 14:04:57 INFO: Done loading processors!\n" 227 | ] 228 | }, 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | 
"previa\tPROBLEM\n", 234 | "intestinales\tPROBLEM\n", 235 | "Eldoctor\tTEST\n", 236 | "esa\tTREATMENT\n", 237 | "una diabetes\tPROBLEM\n", 238 | "qué síntomas\tPROBLEM\n", 239 | "lossíntomas\tTEST\n", 240 | "a su vez tienen manifestaciones extraintestinales\tTREATMENT\n", 241 | "También\tTREATMENT\n", 242 | "necesariamente\tPROBLEM\n", 243 | "cibilidad\tTREATMENT\n", 244 | "alibrmentaciónmacedentaría\tTREATMENT\n", 245 | "mostavaquismo\tTREATMENT\n", 246 | "alcolismo\tTREATMENT\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "# download and initialize a mimic pipeline with an i2b2 NER model\n", 252 | "stanza.download('en', package='mimic', processors={'ner': 'i2b2'})\n", 253 | "nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'})" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 7, 259 | "id": "be3ec37f-869c-4fd1-bdd6-a47b4adf457a", 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stdout", 264 | "output_type": "stream", 265 | "text": [ 266 | "previa\tPROBLEM\n", 267 | "intestinales\tPROBLEM\n", 268 | "Eldoctor\tTEST\n", 269 | "esa\tTREATMENT\n", 270 | "una diabetes\tPROBLEM\n", 271 | "qué síntomas\tPROBLEM\n", 272 | "lossíntomas\tTEST\n", 273 | "a su vez tienen manifestaciones extraintestinales\tTREATMENT\n", 274 | "También\tTREATMENT\n", 275 | "necesariamente\tPROBLEM\n", 276 | "cibilidad\tTREATMENT\n", 277 | "alibrmentaciónmacedentaría\tTREATMENT\n", 278 | "mostavaquismo\tTREATMENT\n", 279 | "alcolismo\tTREATMENT\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "# annotate clinical text\n", 285 | "doc = nlp(input_text)\n", 286 | "# print out all entities\n", 287 | "for ent in doc.entities:\n", 288 | " print(f'{ent.text}\\t{ent.type}')" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "id": "a826affe-307a-4fcf-bab0-f2c24252c6e9", 294 | "metadata": {}, 295 | "source": [ 296 | "## Summarization with Hugging Face Transformers" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | 
"execution_count": 4, 302 | "id": "56178d30-10e7-4530-9971-ebd616fc73a4", 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "# model_name = \"google/pegasus-large\"\n", 307 | "model_name = \"google/pegasus-xsum\"\n", 308 | "# model_name = \"csebuetnlp/mT5_multilingual_XLSum\"\n", 309 | "# model_name = \"sshleifer/distilbart-cnn-12-6\"\n", 310 | "# model_name = 'ELiRF/NASES'" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 5, 316 | "id": "fb667d41-1308-45bc-9cd1-a9b1aaa79964", 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "application/vnd.jupyter.widget-view+json": { 322 | "model_id": "1ba297fc85484859b983e12a1eb5698d", 323 | "version_major": 2, 324 | "version_minor": 0 325 | }, 326 | "text/plain": [ 327 | "Downloading: 0%| | 0.00/1.67k [00:00