├── .gitignore
├── README.md
├── archive
    ├── identifying_circles.ipynb
    └── identifying_shapes.ipynb
├── brainstorming_n_planning
    ├── figma_link.txt
    └── vision_statement.txt
├── identify_circles.py
├── image_database
    └── butterfly_valve.png
├── image_tests
    ├── output1.jpg
    ├── output2.jpg
    ├── output3.jpg
    ├── pid_box_test.png
    ├── pid_test.JPG
    ├── pid_test2.JPG
    ├── pid_test3.JPG
    ├── pid_test4.JPG
    ├── pid_test5.JPG
    └── text_test.JPG
├── img_output.jpg
├── pdf2png.py
├── requirements.txt
└── test1.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /venv
2 | /.vscode
3 | */.ipynb_checkpoints/*.ipynb
4 | .ipynb_checkpoints
5 | /.ipynb_checkpoints
6 | /debug.log
7 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pid-scraper
 2 | Process Piping and Instrumentation Diagram (P&ID) images to extract relevant data
 3 | 
 4 | # Road Map of this development
 5 | 1. Identify circles on P&IDs
 6 | 2. Identify letters and any type of information on circles
 7 | 3. Identify other shapes
 8 | 4. Identify other types of information on P&IDs
 9 | 5. Extract complete information
10 | 6. Create a report
11 | 7. Identify differences between P&ID versions
12 | 


--------------------------------------------------------------------------------
/archive/identifying_shapes.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 3,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "%config IPCompleter.greedy=True\n",
10 |     "import cv2\n",
11 |     "import numpy as np\n",
12 |     "import pytesseract\n",
13 |     "import matplotlib.pyplot as plt"
14 |    ]
15 |   },
16 |   {
17 |    "cell_type": "code",
18 |    "execution_count": 4,
19 |    "metadata": {},
20 |    "outputs": [
21 |     {
22 |      "ename": "AttributeError",
23 |      "evalue": "'NoneType' object has no attribute 'copy'",
24 |      "output_type": "error",
25 |      "traceback": [
26 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
27 |       "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
28 |       "\u001b[1;32m<ipython-input-4-d843b1d4fda0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mimage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimg_path\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[0mimg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcv2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mimage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mimg_orig\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m \u001b[0mimg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcv2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcvtColor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mimg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcv2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCOLOR_BGR2GRAY\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
29 |       "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'copy'"
30 |      ]
31 |     }
32 |    ],
33 |    "source": [
34 |     "img_path = '.\\images_tests\\output1.jpg'\n",
35 |     "image = img_path\n",
36 |     "img = cv2.imread(image, 1)\n",
37 |     "img_orig = img.copy()\n",
38 |     "img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)"
39 |    ]
40 |   },
41 |   {
42 |    "cell_type": "code",
43 |    "execution_count": 5,
44 |    "metadata": {},
45 |    "outputs": [
46 |     {
47 |      "name": "stdout",
48 |      "output_type": "stream",
49 |      "text": [
50 |       "Requirement already satisfied: opencv-python in c:\\users\\diego.giraldo\\pythonscripts\\lcm\\venv\\lib\\site-packages (4.4.0.46)\n",
51 |       "Requirement already satisfied: numpy>=1.17.3 in c:\\users\\diego.giraldo\\pythonscripts\\lcm\\venv\\lib\\site-packages (from opencv-python) (1.18.1)\n"
52 |      ]
53 |     },
54 |     {
55 |      "name": "stderr",
56 |      "output_type": "stream",
57 |      "text": [
58 |       "WARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.\n",
59 |       "You should consider upgrading via the 'c:\\users\\diego.giraldo\\pythonscripts\\lcm\\venv\\scripts\\python.exe -m pip install --upgrade pip' command.\n"
60 |      ]
61 |     }
62 |    ],
63 |    "source": []
64 |   }
65 |  ],
66 |  "metadata": {
67 |   "kernelspec": {
68 |    "display_name": "Python 3.8.1 64-bit ('venv': venv)",
69 |    "language": "python",
70 |    "name": "python38164bitvenvvenv12168ff2dfab4907b7f3779333feb305"
71 |   },
72 |   "language_info": {
73 |    "codemirror_mode": {
74 |     "name": "ipython",
75 |     "version": 3
76 |    },
77 |    "file_extension": ".py",
78 |    "mimetype": "text/x-python",
79 |    "name": "python",
80 |    "nbconvert_exporter": "python",
81 |    "pygments_lexer": "ipython3",
82 |    "version": "3.8.1"
83 |   }
84 |  },
85 |  "nbformat": 4,
86 |  "nbformat_minor": 4
87 | }
88 | 


--------------------------------------------------------------------------------
/brainstorming_n_planning/figma_link.txt:
--------------------------------------------------------------------------------
1 | figma link:
2 | https://www.figma.com/file/RZ3N1hiuigI6oMfCzrKSbI/pnid_scraper


--------------------------------------------------------------------------------
/brainstorming_n_planning/vision_statement.txt:
--------------------------------------------------------------------------------
 1 | PROJECT VISION STATEMENT
 2 | 
 3 | VISION:
 4 | o- What's the purpose of creating the product? => The main purpose is to reduce the time Asset Management Engineers spend extracting important information from engineering 
 5 | diagrams, in this case, piping and instrumentation diagrams (P&IDs)
 6 | o- Which positive change will it bring about? => It increases data extraction quality and speeds up the process of scraping data from diagrams
 7 | 
 8 | TARGET GROUP:
 9 | o- Which market or market segment does the product address? => Any company that practices Asset Management Engineering, as well as asset management engineering consulting companies
10 | o- Who are the target customers and users? => Asset Management Engineers
11 | 
12 | NEEDS:
13 | o- What problem does the product solve? => the excessive time consumed by engineers to extract data manually and the quality of the data extracted
14 | o- Which benefit does it provide? => It makes Asset Management Engineers' lives easier
15 | 
16 | PRODUCT:
17 | o- What product is it? => A web-based application
18 | o- What makes it stand out? => Simple design and ease of use, and high accuracy in identifying all elements in diagrams
19 | o- Is it feasible to develop the product? => It is feasible, although the major challenge is the ML algorithm for shape recognition
20 | 
21 | BUSINESS GOALS:
22 | o- How is the product going to benefit the company? => This will be one of various services focused on improving the quality of data extraction and management
23 | o- What are the business goals? => MAIN GOAL Q1 2021: Develop a Minimum Viable Product (MVP) able to identify different shapes in a P&ID by first quarter of 2021
24 | 


--------------------------------------------------------------------------------
/identify_circles.py:
--------------------------------------------------------------------------------
  1 | import cv2
  2 | import numpy as np
  3 | import pytesseract
  4 | import matplotlib.pyplot as plt
  5 | import imutils
  6 | import re
  7 | 
  8 | # load image
  9 | img_path = r".\image_tests\output1.jpg"
 10 | img = cv2.imread(img_path, 1)
 11 | 
 12 | # height, width, depth and ratio
 13 | h, w, d = img.shape
 14 | # resized_w = 1400
 15 | # ratio = resized_w / w
 16 | 
 17 | resize_factor = 1
 18 | 
 19 | # resize image
 20 | resized = imutils.resize(img, width=int(w / resize_factor))
 21 | gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
 22 | 
 23 | # blurr the image
 24 | blurred = cv2.GaussianBlur(gray, (5, 5), 0)
 25 | 
 26 | # Identify circles
 27 | all_circles = cv2.HoughCircles(
 28 |     blurred,
 29 |     method=cv2.HOUGH_GRADIENT,
 30 |     dp=0.1,
 31 |     minDist=int(100 / resize_factor),
 32 |     param1=int(20 / resize_factor),
 33 |     param2=int(10 / resize_factor),
 34 |     minRadius=int(115 / resize_factor),
 35 |     maxRadius=int(120 / resize_factor),
 36 | )
 37 | circles = np.uint16(np.around(all_circles))
 38 | print("It found " + str(circles.shape[1]) + " circles on the pi&d")
 39 | 
 40 | print(circles)
 41 | 
 42 | count = 0
 43 | img_circles = resized.copy()
 44 | for circle in circles[0]:
 45 |     # Annotate circle and centroid
 46 |     cv2.circle(img_circles, (circle[0], circle[1]), circle[2], (0, 255, 0), 2)
 47 |     cv2.circle(img_circles, (circle[0], circle[1]), 2, (255, 0, 0), -2)
 48 | 
 49 |     # Annotate text
 50 |     offset_txt = int(circle[2] * 1.2)
 51 |     cv2.putText(
 52 |         img_circles,
 53 |         "Circle " + str(count),
 54 |         (circle[0] - offset_txt, circle[1] + offset_txt),
 55 |         cv2.FONT_HERSHEY_SIMPLEX,
 56 |         0.3,
 57 |         (255, 0, 0),
 58 |         1,
 59 |     )
 60 |     count += 1
 61 | 
 62 | cv2.imshow("Image", img_circles)
 63 | cv2.waitKey(0)
 64 | 
 65 | # Read information in every circle
 66 | cropped_imgs = []
 67 | cropped_imgs_txt = []
 68 | img_circle_txt = img.copy()
 69 | 
 70 | pytesseract.pytesseract.tesseract_cmd = (
 71 |     r"C:\Users\diego.giraldo\AppData\Local\Tesseract-OCR\Tesseract.exe"
 72 | )
 73 | 
 74 | circles_int = circles[0] * resize_factor
 75 | 
 76 | for circle in circles_int:
 77 |     x_offset_right = np.uint16(circle[2] * 0.75)
 78 |     x_offset_left = np.uint16(circle[2] * 0.75)
 79 |     y_offset_low = np.uint16(circle[2] * 0.75)
 80 |     y_offset_up = np.uint16(circle[2] * 0.75)
 81 |     cropped_img_lower = img_circle_txt[
 82 |         circle[1] : circle[1] + y_offset_low,
 83 |         circle[0] - x_offset_left : circle[0] + x_offset_right,
 84 |     ]
 85 |     cropped_img_upper = img_circle_txt[
 86 |         circle[1] - y_offset_up : circle[1],
 87 |         circle[0] - x_offset_left : circle[0] + x_offset_right,
 88 |     ]
 89 |     #     cropped_img = cv2.threshold(cropped_img, 100, 255, cv2.THRESH_BINARY)
 90 |     cropped_imgs.append(np.append(cropped_img_upper, cropped_img_lower, axis=0))
 91 | 
 92 |     upper = pytesseract.image_to_string(
 93 |         cropped_img_upper,
 94 |         lang="eng",
 95 |         config="--psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ",
 96 |     )
 97 |     lower = pytesseract.image_to_string(
 98 |         cropped_img_lower,
 99 |         config="--psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
100 |     )
101 | 
102 |     r_upper = re.compile(r"[A-Z]+")
103 |     r_lower = re.compile(r"[A-Z0-9]+")
104 | 
105 |     if r_upper.match(str(upper)) is not None:
106 |         up = r_upper.match(str(upper)).group(0)
107 | 
108 |     if r_lower.match(str(lower)) is not None:
109 |         low = r_lower.match(str(lower)).group(0)
110 | 
111 |     cropped_imgs_txt.append(up + "-" + low)
112 | 
113 | 
114 | plt.imsave(fname="img_output.jpg", arr=img_circles)
115 | 
116 | for idx, c_img in enumerate(cropped_imgs):
117 |     cv2.putText(
118 |         c_img,
119 |         cropped_imgs_txt[idx],
120 |         (50, 75),
121 |         cv2.FONT_HERSHEY_SIMPLEX,
122 |         0.7,
123 |         (0, 0, 255),
124 |         1,
125 |     )
126 |     cv2.imshow("Image", c_img)
127 |     cv2.waitKey(0)
128 | 


--------------------------------------------------------------------------------
/image_database/butterfly_valve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_database/butterfly_valve.png


--------------------------------------------------------------------------------
/image_tests/output1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/output1.jpg


--------------------------------------------------------------------------------
/image_tests/output2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/output2.jpg


--------------------------------------------------------------------------------
/image_tests/output3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/output3.jpg


--------------------------------------------------------------------------------
/image_tests/pid_box_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_box_test.png


--------------------------------------------------------------------------------
/image_tests/pid_test.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test.JPG


--------------------------------------------------------------------------------
/image_tests/pid_test2.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test2.JPG


--------------------------------------------------------------------------------
/image_tests/pid_test3.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test3.JPG


--------------------------------------------------------------------------------
/image_tests/pid_test4.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test4.JPG


--------------------------------------------------------------------------------
/image_tests/pid_test5.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/pid_test5.JPG


--------------------------------------------------------------------------------
/image_tests/text_test.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/image_tests/text_test.JPG


--------------------------------------------------------------------------------
/img_output.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/img_output.jpg


--------------------------------------------------------------------------------
/pdf2png.py:
--------------------------------------------------------------------------------
 1 | from pdf2image import convert_from_path
 2 | from PIL import Image
 3 | import os
 4 | 
 5 | Image.MAX_IMAGE_PIXELS = None
 6 | output = "image_tests/"
 7 | 
 8 | 
 9 | def convert(file, output):
10 |     if not os.path.exists(output):
11 |         os.makedirs(output)
12 | 
13 |     pages = convert_from_path(file, 500)
14 |     counter = 3
15 | 
16 |     for page in pages:
17 |         my_file = output + "output" + str(counter) + ".jpg"
18 |         counter += 1
19 |         page.save(my_file, "JPEG")
20 |         print(my_file)
21 | 
22 | 
23 | file = r"C:\Users\diego.giraldo\OneDrive - LOGICAMMS LTD\Downloads\2-8800-A-0625.pdf"
24 | 
25 | convert(file, output)


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diegogiraldog/pid-scraper/8ade8c62c76684952f1c487f4057121e5c40a720/requirements.txt


--------------------------------------------------------------------------------
/test1.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import imutils
 5 | 
 6 | # load image
 7 | img_path = r".\image_tests\output3.jpg"
 8 | img = cv2.imread(img_path, 1)
 9 | 
10 | # height, width, depth and ratio
11 | h, w, d = img.shape
12 | resized_w = 1400
13 | ratio = resized_w / w
14 | 
15 | # resize image
16 | resized = imutils.resize(img, width=resized_w)
17 | gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
18 | 
19 | # blurr the image
20 | blurred = cv2.GaussianBlur(gray, (5, 5), 0)
21 | 
22 | # Identify circles
23 | all_circles = cv2.HoughCircles(
24 |     blurred,
25 |     method=cv2.HOUGH_GRADIENT,
26 |     dp=0.1,
27 |     minDist=10,
28 |     param1=20,
29 |     param2=10,
30 |     minRadius=11,
31 |     maxRadius=12,
32 | )
33 | circles = np.uint16(np.around(all_circles))
34 | print("It found " + str(circles.shape[1]) + " circles on the pi&d")
35 | 
36 | print(all_circles)
37 | 
38 | count = 0
39 | img_circles = resized.copy()
40 | for circle in circles[0]:
41 |     # Annotate circle and centroid
42 |     cv2.circle(img_circles, (circle[0], circle[1]), circle[2], (0, 255, 0), 2)
43 |     cv2.circle(img_circles, (circle[0], circle[1]), 2, (255, 0, 0), -2)
44 | 
45 |     # Annotate text
46 |     offset_txt = int(circle[2] * 1.5)
47 |     cv2.putText(
48 |         img_circles,
49 |         "Circle " + str(count),
50 |         (circle[0] - offset_txt, circle[1] + offset_txt),
51 |         cv2.FONT_HERSHEY_SIMPLEX,
52 |         0.3,
53 |         (255, 0, 0),
54 |         1,
55 |     )
56 |     count += 1
57 | 
58 | cv2.imshow("Image", img_circles)
59 | cv2.waitKey(0)


--------------------------------------------------------------------------------