├── img ├── 1.jpeg └── img.png ├── README.md ├── AI-病历文件提取.py └── file_parsing.py /img/1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianchiguaixia/medical_records_extract/HEAD/img/1.jpeg -------------------------------------------------------------------------------- /img/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianchiguaixia/medical_records_extract/HEAD/img/img.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 病历文件信息抽取 2 | ### 背景 3 | 该项目是为了解决医疗行业中的病历文件信息抽取问题,主要包括病历文件的解析和信息抽取两个部分。 4 | 5 | 6 | 7 | ### 数据形式 8 | ![](img/1.jpeg) 9 | 10 | ### 代码结构 11 | 12 | ``` 13 | ├── AI-病历文件提取.py # 前端展示 14 | ├── file_parsing.py # 文件解析 15 | ├── tempDir # 暂存文件夹 16 | ├── static 17 | │ ├── ocr_files # 保存上传的文件 18 | ``` 19 | 20 | 21 | ### 项目启动 22 | 23 | ``` 24 | streamlit run AI-病历文件提取.py 25 | ``` 26 | 27 | 28 | 29 | ### 前端展示 30 | * 前端展示地址:http://ip:8501 31 | ![](img/img.png) 32 | 33 | 34 | -------------------------------------------------------------------------------- /AI-病历文件提取.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # time: 2022/10/17 15:03 3 | # file: AI-病历文件提取.py 4 | 5 | 6 | import streamlit as st 7 | import requests 8 | import streamlit.components.v1 as components 9 | from spacy import displacy 10 | from file_parsing import * 11 | from paddlenlp import Taskflow 12 | 13 | st.set_page_config( 14 | page_title="关键元素抽取", 15 | page_icon="🧊", 16 | menu_items={ 17 | 'Get Help': 'https://www.extremelycoolapp.com/help', 18 | 'Report a bug': "https://www.extremelycoolapp.com/bug", 19 | 'About': "# This is a header. This is an *extremely* cool app!" 20 | } 21 | ) 22 | 23 | st.title("# 病例文件提取 👋\n") 24 | 25 | uploaded_file = st.file_uploader("支持图片,pdf,扫描件,word文件等信息提取") 26 | if uploaded_file is not None: 27 | with open(os.path.join("tempDir", uploaded_file.name), "wb") as f: 28 | f.write(uploaded_file.getbuffer()) 29 | text=file_translate("tempDir"+"/"+uploaded_file.name) 30 | 31 | 32 | col1, col2 = st.columns(2) 33 | with col1: 34 | if uploaded_file is not None: 35 | with st.form(key="my_form"): 36 | example_schema="疾病;药品;年龄;性别" 37 | schema_inputs = st.text_input(label="输入模板", value=example_schema) 38 | submit_button = st.form_submit_button(label="✨ 启动!") 39 | 40 | 41 | if not submit_button: 42 | st.stop() 43 | else: 44 | schema = schema_inputs.split(";") 45 | ie_model = Taskflow('information_extraction', schema=schema) 46 | try: 47 | results = ie_model(text)[0] 48 | standard_list = [] 49 | # [{"start": 4, "end": 10, "label": "ORG"}], 50 | for i,j in results.items(): 51 | tmp_dict = {} 52 | tmp_dict["start"] = j[0]["start"] 53 | tmp_dict["end"] = j[0]["end"] 54 | tmp_dict["label"] = i 55 | standard_list.append(tmp_dict) 56 | 57 | doc = [{ 58 | "text": text, 59 | "ents": standard_list, 60 | "title": None 61 | }] 62 | 63 | html = displacy.render(doc, style="ent", manual=True) 64 | components.html(html, width=350, height=2000, scrolling=True) 65 | except: 66 | pass 67 | 68 | with col2: 69 | try: 70 | if uploaded_file is not None: 71 | st.title("内容提取") 72 | st.write("\n"*3) 73 | for element in standard_list: 74 | st.write(element["label"]) 75 | st.info(text[element["start"]:element["end"]]) 76 | except: 77 | pass 78 | -------------------------------------------------------------------------------- /file_parsing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # time: 2022/10/17 15:04 3 | # file: file_parsing.py 4 | 5 | import json 6 | import requests 7 | from flask import Flask, request, render_template, jsonify, session 8 | from docx import Document 9 | from docx.shared import Inches 10 | import os 11 | import pdfplumber 12 | 13 | app = Flask(__name__) 14 | import requests 15 | import fitz 16 | import shutil 17 | import cv2 18 | import paddlehub as hub 19 | import traceback 20 | 21 | # 加载移动端预训练模型 22 | ocr = hub.Module(name="chinese_ocr_db_crnn_mobile") 23 | 24 | 25 | 26 | def ocr_result(image_path): 27 | np_images = [cv2.imread(image_path)] 28 | results = ocr.recognize_text( 29 | images=np_images, # 图片数据,ndarray.shape 为 [H, W, C],BGR格式; 30 | use_gpu=False, # 是否使用 GPU;若使用GPU,请先设置CUDA_VISIBLE_DEVICES环境变量 31 | output_dir='ocr_result', # 图片的保存路径,默认设为 ocr_result; 32 | visualization=True, # 是否将识别结果保存为图片文件; 33 | box_thresh=0.5, # 检测文本框置信度的阈值; 34 | text_thresh=0.5) # 识别中文文本置信度的阈值; 35 | text=[] 36 | for result in results: 37 | data = result['data'] 38 | save_path = result['save_path'] 39 | for infomation in data: 40 | text.append(infomation['text']) 41 | text="".join(text) 42 | return text 43 | 44 | def get_file_content(filePath): 45 | with open(filePath, 'rb') as fp: 46 | return fp.read() 47 | 48 | def file_translate(file_path): 49 | try: 50 | file=file_path.split("/")[-1] 51 | file_name = file.split(".")[0] 52 | file_type = file.split(".")[-1] 53 | if file_type in ["docx", "doc"]: 54 | document = Document(file_path) 55 | text = "" 56 | for paragraph in document.paragraphs: 57 | text += paragraph.text 58 | text = text.replace("\n", "").replace(" ", "") 59 | return text 60 | 61 | elif file_type.lower() in ["jpg","jpeg","bmp","png"]: 62 | text=ocr_result(file_path) 63 | return text 64 | 65 | elif file_type == "pdf": 66 | pdf = pdfplumber.open(file_path) 67 | text = "" 68 | for page in pdf.pages: 69 | # 获取当前页面的全部文本信息,包括表格中的文字 70 | text += page.extract_text() 71 | text = text.replace("\n", "").replace(" ", "") 72 | 73 | if text != "": 74 | return text 75 | elif text == "": 76 | file_save = "static/ocr_files/" 77 | isExists = os.path.exists(file_save) 78 | # 判断结果 79 | if not isExists: 80 | os.makedirs(file_save) 81 | 82 | pdf = fitz.open(file_path) 83 | print(pdf.pageCount) 84 | text = '' 85 | for pg in range(pdf.pageCount): 86 | page = pdf[pg] 87 | rotate = int(0) 88 | # 每个尺寸的缩放系数为1.3,这将为我们生成分辨率提高2.6的图像。 89 | # 此处若是不做设置,默认图片大小为:792X612, dpi=96 90 | zoom_x = 1.33333333 # (1.33333333-->1056x816) (2-->1584x1224) 91 | zoom_y = 1.33333333 92 | mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate) 93 | pix = page.getPixmap(matrix=mat, alpha=False) 94 | 95 | file_isExists = os.path.exists("static/ocr_files/" + file_name) 96 | # 判断结果 97 | if not file_isExists: 98 | os.makedirs("static/ocr_files/" + file_name) 99 | 100 | pix.writePNG("static/ocr_files/" + file_name + "/" + '%s.PNG' % pg) 101 | response = ocr_result("static/ocr_files/" + file_name + "/" + '%s.PNG' % pg) 102 | text +=response 103 | return text 104 | except: 105 | print(traceback.print_exc()) 106 | text="未提取出内容" 107 | 108 | if __name__=="__main__": 109 | # print(file_translate('tempDir/test1.docx')) 110 | # print(file_translate('tempDir/1.jpeg')) 111 | print(file_translate("img/1.jpeg")) 112 | --------------------------------------------------------------------------------