├── img
    ├── 1.jpeg
    └── img.png
├── README.md
├── AI-病历文件提取.py
└── file_parsing.py


/img/1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tianchiguaixia/medical_records_extract/HEAD/img/1.jpeg


--------------------------------------------------------------------------------
/img/img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tianchiguaixia/medical_records_extract/HEAD/img/img.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## 病历文件信息抽取
 2 | ### 背景
 3 | 该项目是为了解决医疗行业中的病历文件信息抽取问题，主要包括病历文件的解析和信息抽取两个部分。
 4 | 
 5 | 
 6 | 
 7 | ### 数据形式
 8 | ![](img/1.jpeg)
 9 | 
10 | ### 代码结构
11 | 
12 | ```
13 | ├── AI-病历文件提取.py     # 前端展示
14 | ├── file_parsing.py       # 文件解析
15 | ├── tempDir               # 暂存文件夹
16 | ├── static
17 | │   ├── ocr_files         # 保存上传的文件
18 | ```
19 | 
20 | 
21 | ### 项目启动
22 | 
23 | ```
24 | streamlit run  AI-病历文件提取.py
25 | ```
26 | 
27 | 
28 | 
29 | ### 前端展示
30 | * 前端展示地址：http://ip:8501 （将 ip 替换为运行服务的机器地址，本机运行时为 localhost）
31 | ![](img/img.png)
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/AI-病历文件提取.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # time: 2022/10/17 15:03
 3 | # file: AI-病历文件提取.py
 4 | 
 5 | 
 6 | import streamlit as st
 7 | import requests
 8 | import streamlit.components.v1 as components
 9 | from spacy import displacy
10 | from file_parsing import *
11 | from paddlenlp import Taskflow
12 | 
# Streamlit front end: upload a medical-record file, convert it to plain text
# with file_translate(), run PaddleNLP information extraction with a
# user-supplied schema, and show the highlighted text (left column) next to
# the extracted values (right column).
st.set_page_config(
    page_title="关键元素抽取",
    page_icon="🧊",
    menu_items={
        'Get Help': 'https://www.extremelycoolapp.com/help',
        'Report a bug': "https://www.extremelycoolapp.com/bug",
        'About': "# This is a header. This is an *extremely* cool app!"
    }
)

st.title("# 病例文件提取 👋\n")

# Defined up front so the right-hand column can always read them, even when
# nothing was uploaded or extraction failed part-way through.
text = None
standard_list = []

uploaded_file = st.file_uploader("支持图片，pdf，扫描件，word文件等信息提取")
if uploaded_file is not None:
    # `os` is in scope through `from file_parsing import *`.
    # Keep only the last path component of the client-supplied name so the
    # upload cannot be written outside tempDir.
    safe_name = os.path.basename(uploaded_file.name)
    os.makedirs("tempDir", exist_ok=True)
    # file_translate() derives the file name from a "/"-separated path.
    saved_path = "tempDir" + "/" + safe_name
    with open(saved_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    text = file_translate(saved_path)


col1, col2 = st.columns(2)
with col1:
    if uploaded_file is not None:
        with st.form(key="my_form"):
            example_schema = "疾病;药品;年龄;性别"
            schema_inputs = st.text_input(label="输入模板", value=example_schema)
            submit_button = st.form_submit_button(label="✨ 启动!")

        if not submit_button:
            # Nothing to extract until the form is submitted.
            st.stop()

        # Ignore empty items such as the one produced by a trailing ";".
        schema = [item.strip() for item in schema_inputs.split(";") if item.strip()]

        if not text or text == "未提取出内容":
            # file_translate() could not produce any text for this file.
            st.warning("未提取出内容")
        elif not schema:
            st.warning("请输入模板")
        else:
            try:
                ie_model = Taskflow('information_extraction', schema=schema)
                results = ie_model(text)[0]
                # displaCy manual format: [{"start": 4, "end": 10, "label": "ORG"}]
                for label, spans in results.items():
                    if not spans:
                        continue
                    # Only the first span returned for each label is shown.
                    first_span = spans[0]
                    standard_list.append({
                        "start": first_span["start"],
                        "end": first_span["end"],
                        "label": label,
                    })

                doc = [{
                    "text": text,
                    # Entities are passed in text order for rendering; the
                    # right-hand column keeps the schema order.
                    "ents": sorted(standard_list, key=lambda ent: (ent["start"], ent["end"])),
                    "title": None
                }]

                html = displacy.render(doc, style="ent", manual=True)
                components.html(html, width=350, height=2000, scrolling=True)
            except Exception as exc:
                # Report the failure in the page instead of hiding it.
                st.error(f"信息抽取失败: {exc}")

with col2:
    if uploaded_file is not None:
        st.title("内容提取")
        st.write("\n"*3)
        for element in standard_list:
            st.write(element["label"])
            st.info(text[element["start"]:element["end"]])
78 | 


--------------------------------------------------------------------------------
/file_parsing.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # time: 2022/10/17 15:04
  3 | # file: file_parsing.py
  4 | 
  5 | import json
  6 | import requests
  7 | from flask import Flask, request, render_template, jsonify, session
  8 | from docx import Document
  9 | from docx.shared import Inches
 10 | import os
 11 | import pdfplumber
 12 | 
# NOTE(review): this Flask application object is not referenced anywhere else
# in the visible source — confirm whether it is still needed.
app = Flask(__name__)
 14 | import requests
 15 | import fitz
 16 | import shutil
 17 | import cv2
 18 | import paddlehub as hub
 19 | import traceback
 20 | 
# Load the mobile pretrained OCR model once at import time; ocr_result()
# below uses this module-level object.
ocr = hub.Module(name="chinese_ocr_db_crnn_mobile")
 23 | 
 24 | 
 25 | 
 26 | def ocr_result(image_path):
 27 |     np_images = [cv2.imread(image_path)]
 28 |     results = ocr.recognize_text(
 29 |     images=np_images,  # 图片数据，ndarray.shape 为 [H, W, C]，BGR格式；
 30 |     use_gpu=False,  # 是否使用 GPU；若使用GPU，请先设置CUDA_VISIBLE_DEVICES环境变量
 31 |     output_dir='ocr_result',  # 图片的保存路径，默认设为 ocr_result；
 32 |     visualization=True,  # 是否将识别结果保存为图片文件；
 33 |     box_thresh=0.5,  # 检测文本框置信度的阈值；
 34 |     text_thresh=0.5)  # 识别中文文本置信度的阈值；
 35 |     text=[]
 36 |     for result in results:
 37 |         data = result['data']
 38 |         save_path = result['save_path']
 39 |         for infomation in data:
 40 |             text.append(infomation['text'])
 41 |     text="".join(text)
 42 |     return text
 43 | 
 44 | def get_file_content(filePath):
 45 |     with open(filePath, 'rb') as fp:
 46 |         return fp.read()
 47 | 
 48 | def file_translate(file_path):
 49 |     try:
 50 |         file=file_path.split("/")[-1]
 51 |         file_name = file.split(".")[0]
 52 |         file_type = file.split(".")[-1]
 53 |         if file_type in ["docx", "doc"]:
 54 |             document = Document(file_path)
 55 |             text = ""
 56 |             for paragraph in document.paragraphs:
 57 |                 text += paragraph.text
 58 |             text = text.replace("\n", "").replace(" ", "")
 59 |             return text
 60 | 
 61 |         elif file_type.lower() in ["jpg","jpeg","bmp","png"]:
 62 |             text=ocr_result(file_path)
 63 |             return text
 64 | 
 65 |         elif file_type == "pdf":
 66 |             pdf = pdfplumber.open(file_path)
 67 |             text = ""
 68 |             for page in pdf.pages:
 69 |                 # 获取当前页面的全部文本信息，包括表格中的文字
 70 |                 text += page.extract_text()
 71 |             text = text.replace("\n", "").replace(" ", "")
 72 | 
 73 |             if text != "":
 74 |                 return text
 75 |             elif text == "":
 76 |                 file_save = "static/ocr_files/"
 77 |                 isExists = os.path.exists(file_save)
 78 |                 # 判断结果
 79 |                 if not isExists:
 80 |                     os.makedirs(file_save)
 81 | 
 82 |                 pdf = fitz.open(file_path)
 83 |                 print(pdf.pageCount)
 84 |                 text = ''
 85 |                 for pg in range(pdf.pageCount):
 86 |                     page = pdf[pg]
 87 |                     rotate = int(0)
 88 |                     # 每个尺寸的缩放系数为1.3，这将为我们生成分辨率提高2.6的图像。
 89 |                     # 此处若是不做设置，默认图片大小为：792X612, dpi=96
 90 |                     zoom_x = 1.33333333  # (1.33333333-->1056x816)   (2-->1584x1224)
 91 |                     zoom_y = 1.33333333
 92 |                     mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
 93 |                     pix = page.getPixmap(matrix=mat, alpha=False)
 94 | 
 95 |                     file_isExists = os.path.exists("static/ocr_files/" + file_name)
 96 |                     # 判断结果
 97 |                     if not file_isExists:
 98 |                         os.makedirs("static/ocr_files/" + file_name)
 99 | 
100 |                     pix.writePNG("static/ocr_files/" + file_name + "/" + '%s.PNG' % pg)
101 |                     response = ocr_result("static/ocr_files/" + file_name + "/" + '%s.PNG' % pg)
102 |                     text +=response
103 |                 return text
104 |     except:
105 |         print(traceback.print_exc())
106 |         text="未提取出内容"
107 | 
if __name__ == "__main__":
    # Manual smoke test: run the extraction on the sample image shipped in
    # the repository and print whatever comes back.
    sample_image = "img/1.jpeg"
    print(file_translate(sample_image))
112 | 


--------------------------------------------------------------------------------