├── .gitignore ├── README.md ├── archive ├── identifying_circles.ipynb └── identifying_shapes.ipynb ├── brainstorming_n_planning ├── figma_link.txt └── vision_statement.txt ├── identify_circles.py ├── image_database └── butterfly_valve.png ├── image_tests ├── output1.jpg ├── output2.jpg ├── output3.jpg ├── pid_box_test.png ├── pid_test.JPG ├── pid_test2.JPG ├── pid_test3.JPG ├── pid_test4.JPG ├── pid_test5.JPG └── text_test.JPG ├── img_output.jpg ├── pdf2png.py ├── requirements.txt └── test1.py /.gitignore: -------------------------------------------------------------------------------- 1 | /venv 2 | /.vscode 3 | */.ipynb_chekpoints/*.ipynb 4 | .ipynb_chekpoints 5 | /.ipynb_checkpoints 6 | /debug.log 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pid-scraper 2 | Process Piping and Instrumentation Diagram (P&ID) images to extract relevant data 3 | 4 | # Road Map of this development 5 | 1. Identify circles on P&IDs 6 | 2. Identify letters and any type of information on circles 7 | 3. Identify other shapes 8 | 4. Identify other types of information on P&IDs 9 | 5. Extract complete information 10 | 6. Create a report 11 | 7. 
Identify differences between P&ID versions 12 | -------------------------------------------------------------------------------- /archive/identifying_shapes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%config IPCompleter.greedy=True\n", 10 | "import cv2\n", 11 | "import numpy as np\n", 12 | "import pytesseract\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "ename": "AttributeError", 23 | "evalue": "'NoneType' object has no attribute 'copy'", 24 | "output_type": "error", 25 | "traceback": [ 26 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 27 | "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", 28 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mimage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimg_path\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mimg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcv2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mimage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mimg_orig\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[0mimg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcv2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcvtColor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mimg\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mcv2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCOLOR_BGR2GRAY\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 29 | "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'copy'" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "img_path = '.\\images_tests\\output1.jpg'\n", 35 | "image = img_path\n", 36 | "img = cv2.imread(image, 1)\n", 37 | "img_orig = img.copy()\n", 38 | "img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 5, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "Requirement already satisfied: opencv-python in c:\\users\\diego.giraldo\\pythonscripts\\lcm\\venv\\lib\\site-packages (4.4.0.46)\n", 51 | "Requirement already satisfied: numpy>=1.17.3 in c:\\users\\diego.giraldo\\pythonscripts\\lcm\\venv\\lib\\site-packages (from opencv-python) (1.18.1)\n" 52 | ] 53 | }, 54 | { 55 | "name": "stderr", 56 | "output_type": "stream", 57 | "text": [ 58 | "WARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.\n", 59 | "You should consider upgrading via the 'c:\\users\\diego.giraldo\\pythonscripts\\lcm\\venv\\scripts\\python.exe -m pip install --upgrade pip' command.\n" 60 | ] 61 | } 62 | ], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 3.8.1 64-bit ('venv': venv)", 69 | "language": "python", 70 | "name": "python38164bitvenvvenv12168ff2dfab4907b7f3779333feb305" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.8.1" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 4 87 | } 88 | -------------------------------------------------------------------------------- 
/brainstorming_n_planning/figma_link.txt: -------------------------------------------------------------------------------- 1 | figma link: 2 | https://www.figma.com/file/RZ3N1hiuigI6oMfCzrKSbI/pnid_scraper -------------------------------------------------------------------------------- /brainstorming_n_planning/vision_statement.txt: -------------------------------------------------------------------------------- 1 | PROJECT VISION STATEMENT 2 | 3 | VISION: 4 | o- What's the purpose of creating the product? => The main purpose is to reduce the time Asset Management Engineers spend on extracting important information from engineering 5 | diagrams, in this case, piping and instrumentation diagrams (P&IDs) 6 | o- Which positive change will it bring about? => It increases data extraction quality and speeds up the process of scraping data from diagrams 7 | 8 | TARGET GROUP: 9 | o- Which market or market segment does the product address? => Any company that implements Asset Management Engineering, and asset management engineering consulting companies 10 | o- Who are the target customers and users? => Asset Management Engineers 11 | 12 | NEEDS: 13 | o- What problem does the product solve? => The excessive time consumed by engineers to extract data manually and the quality of the data extracted 14 | o- Which benefit does it provide? => Makes Asset Management Engineers' lives easier 15 | 16 | PRODUCT: 17 | o- What product is it? => A web-based application 18 | o- What makes it stand out? => Simple design and use, and high accuracy in identifying all elements in diagrams 19 | o- Is it feasible to develop the product? => It is feasible, even though the major challenge is the ML algorithm for the shape recognition 20 | 21 | BUSINESS GOALS: 22 | o- How is the product going to benefit the company? => This will be one of various services focused on increasing data extraction quality and management 23 | o- What are the business goals? 
=> MAIN GOAL Q1 2021: Develop a Minimum Viable Product (MVP) able to identify different shapes in a P&ID by first quarter of 2021 24 | -------------------------------------------------------------------------------- /identify_circles.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import pytesseract 4 | import matplotlib.pyplot as plt 5 | import imutils 6 | import re 7 | 8 | # load image 9 | img_path = r".\image_tests\output1.jpg" 10 | img = cv2.imread(img_path, 1) 11 | 12 | # height, width, depth and ratio 13 | h, w, d = img.shape 14 | # resized_w = 1400 15 | # ratio = resized_w / w 16 | 17 | resize_factor = 1 18 | 19 | # resize image 20 | resized = imutils.resize(img, width=int(w / resize_factor)) 21 | gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY) 22 | 23 | # blurr the image 24 | blurred = cv2.GaussianBlur(gray, (5, 5), 0) 25 | 26 | # Identify circles 27 | all_circles = cv2.HoughCircles( 28 | blurred, 29 | method=cv2.HOUGH_GRADIENT, 30 | dp=0.1, 31 | minDist=int(100 / resize_factor), 32 | param1=int(20 / resize_factor), 33 | param2=int(10 / resize_factor), 34 | minRadius=int(115 / resize_factor), 35 | maxRadius=int(120 / resize_factor), 36 | ) 37 | circles = np.uint16(np.around(all_circles)) 38 | print("It found " + str(circles.shape[1]) + " circles on the pi&d") 39 | 40 | print(circles) 41 | 42 | count = 0 43 | img_circles = resized.copy() 44 | for circle in circles[0]: 45 | # Annotate circle and centroid 46 | cv2.circle(img_circles, (circle[0], circle[1]), circle[2], (0, 255, 0), 2) 47 | cv2.circle(img_circles, (circle[0], circle[1]), 2, (255, 0, 0), -2) 48 | 49 | # Annotate text 50 | offset_txt = int(circle[2] * 1.2) 51 | cv2.putText( 52 | img_circles, 53 | "Circle " + str(count), 54 | (circle[0] - offset_txt, circle[1] + offset_txt), 55 | cv2.FONT_HERSHEY_SIMPLEX, 56 | 0.3, 57 | (255, 0, 0), 58 | 1, 59 | ) 60 | count += 1 61 | 62 | cv2.imshow("Image", img_circles) 63 | 
cv2.waitKey(0) 64 | 65 | # Read information in every circle 66 | cropped_imgs = [] 67 | cropped_imgs_txt = [] 68 | img_circle_txt = img.copy() 69 | 70 | pytesseract.pytesseract.tesseract_cmd = ( 71 | r"C:\Users\diego.giraldo\AppData\Local\Tesseract-OCR\Tesseract.exe" 72 | ) 73 | 74 | circles_int = circles[0] * resize_factor 75 | 76 | for circle in circles_int: 77 | x_offset_right = np.uint16(circle[2] * 0.75) 78 | x_offset_left = np.uint16(circle[2] * 0.75) 79 | y_offset_low = np.uint16(circle[2] * 0.75) 80 | y_offset_up = np.uint16(circle[2] * 0.75) 81 | cropped_img_lower = img_circle_txt[ 82 | circle[1] : circle[1] + y_offset_low, 83 | circle[0] - x_offset_left : circle[0] + x_offset_right, 84 | ] 85 | cropped_img_upper = img_circle_txt[ 86 | circle[1] - y_offset_up : circle[1], 87 | circle[0] - x_offset_left : circle[0] + x_offset_right, 88 | ] 89 | # cropped_img = cv2.threshold(cropped_img, 100, 255, cv2.THRESH_BINARY) 90 | cropped_imgs.append(np.append(cropped_img_upper, cropped_img_lower, axis=0)) 91 | 92 | upper = pytesseract.image_to_string( 93 | cropped_img_upper, 94 | lang="eng", 95 | config="--psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ", 96 | ) 97 | lower = pytesseract.image_to_string( 98 | cropped_img_lower, 99 | config="--psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", 100 | ) 101 | 102 | r_upper = re.compile(r"[A-Z]+") 103 | r_lower = re.compile(r"[A-Z0-9]+") 104 | 105 | if r_upper.match(str(upper)) is not None: 106 | up = r_upper.match(str(upper)).group(0) 107 | 108 | if r_lower.match(str(lower)) is not None: 109 | low = r_lower.match(str(lower)).group(0) 110 | 111 | cropped_imgs_txt.append(up + "-" + low) 112 | 113 | 114 | plt.imsave(fname="img_output.jpg", arr=img_circles) 115 | 116 | for idx, c_img in enumerate(cropped_imgs): 117 | cv2.putText( 118 | c_img, 119 | cropped_imgs_txt[idx], 120 | (50, 75), 121 | cv2.FONT_HERSHEY_SIMPLEX, 122 | 0.7, 123 | (0, 0, 255), 124 | 1, 125 | ) 126 | cv2.imshow("Image", 
c_img) 127 | cv2.waitKey(0) 128 | -------------------------------------------------------------------------------- /image_database/butterfly_valve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_database/butterfly_valve.png -------------------------------------------------------------------------------- /image_tests/output1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/output1.jpg -------------------------------------------------------------------------------- /image_tests/output2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/output2.jpg -------------------------------------------------------------------------------- /image_tests/output3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/output3.jpg -------------------------------------------------------------------------------- /image_tests/pid_box_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_box_test.png -------------------------------------------------------------------------------- /image_tests/pid_test.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test.JPG 
-------------------------------------------------------------------------------- /image_tests/pid_test2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test2.JPG -------------------------------------------------------------------------------- /image_tests/pid_test3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test3.JPG -------------------------------------------------------------------------------- /image_tests/pid_test4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test4.JPG -------------------------------------------------------------------------------- /image_tests/pid_test5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test5.JPG -------------------------------------------------------------------------------- /image_tests/text_test.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/text_test.JPG -------------------------------------------------------------------------------- /img_output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/img_output.jpg -------------------------------------------------------------------------------- /pdf2png.py: 
-------------------------------------------------------------------------------- 1 | from pdf2image import convert_from_path 2 | from PIL import Image 3 | import os 4 | 5 | Image.MAX_IMAGE_PIXELS = None 6 | output = "image_tests/" 7 | 8 | 9 | def convert(file, output): 10 | if not os.path.exists(output): 11 | os.makedirs(output) 12 | 13 | pages = convert_from_path(file, 500) 14 | counter = 3 15 | 16 | for page in pages: 17 | my_file = output + "output" + str(counter) + ".jpg" 18 | counter += 1 19 | page.save(my_file, "JPEG") 20 | print(my_file) 21 | 22 | 23 | file = r"C:\Users\diego.giraldo\OneDrive - LOGICAMMS LTD\Downloads\2-8800-A-0625.pdf" 24 | 25 | convert(file, output) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/requirements.txt -------------------------------------------------------------------------------- /test1.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import imutils 5 | 6 | # load image 7 | img_path = r".\image_tests\output3.jpg" 8 | img = cv2.imread(img_path, 1) 9 | 10 | # height, width, depth and ratio 11 | h, w, d = img.shape 12 | resized_w = 1400 13 | ratio = resized_w / w 14 | 15 | # resize image 16 | resized = imutils.resize(img, width=resized_w) 17 | gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY) 18 | 19 | # blurr the image 20 | blurred = cv2.GaussianBlur(gray, (5, 5), 0) 21 | 22 | # Identify circles 23 | all_circles = cv2.HoughCircles( 24 | blurred, 25 | method=cv2.HOUGH_GRADIENT, 26 | dp=0.1, 27 | minDist=10, 28 | param1=20, 29 | param2=10, 30 | minRadius=11, 31 | maxRadius=12, 32 | ) 33 | circles = np.uint16(np.around(all_circles)) 34 | print("It found " + str(circles.shape[1]) + " circles 
on the pi&d") 35 | 36 | print(all_circles) 37 | 38 | count = 0 39 | img_circles = resized.copy() 40 | for circle in circles[0]: 41 | # Annotate circle and centroid 42 | cv2.circle(img_circles, (circle[0], circle[1]), circle[2], (0, 255, 0), 2) 43 | cv2.circle(img_circles, (circle[0], circle[1]), 2, (255, 0, 0), -2) 44 | 45 | # Annotate text 46 | offset_txt = int(circle[2] * 1.5) 47 | cv2.putText( 48 | img_circles, 49 | "Circle " + str(count), 50 | (circle[0] - offset_txt, circle[1] + offset_txt), 51 | cv2.FONT_HERSHEY_SIMPLEX, 52 | 0.3, 53 | (255, 0, 0), 54 | 1, 55 | ) 56 | count += 1 57 | 58 | cv2.imshow("Image", img_circles) 59 | cv2.waitKey(0) --------------------------------------------------------------------------------