├── README.md ├── LICENSE ├── .gitignore └── notebooks └── generate_masks.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # data-processing-work 2 | My work in data processing 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Shashi Gharti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[codz] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | #poetry.toml 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 114 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control 115 | #pdm.lock 116 | #pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # pixi 121 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 122 | #pixi.lock 123 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 124 | # in the .venv directory. It is recommended not to include this directory in version control. 125 | .pixi 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .envrc 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 176 | #.idea/ 177 | 178 | # Abstra 179 | # Abstra is an AI-powered process automation framework. 180 | # Ignore directories containing user credentials, local state, and settings. 181 | # Learn more at https://abstra.io/docs 182 | .abstra/ 183 | 184 | # Visual Studio Code 185 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 186 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 187 | # and can be added to the global gitignore or merged into this file. 
However, if you prefer, 188 | # you could uncomment the following to ignore the entire vscode folder 189 | # .vscode/ 190 | 191 | # Ruff stuff: 192 | .ruff_cache/ 193 | 194 | # PyPI configuration file 195 | .pypirc 196 | 197 | # Cursor 198 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to 199 | # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data 200 | # refer to https://docs.cursor.com/context/ignore-files 201 | .cursorignore 202 | .cursorindexingignore 203 | 204 | # Marimo 205 | marimo/_static/ 206 | marimo/_lsp/ 207 | __marimo__/ 208 | -------------------------------------------------------------------------------- /notebooks/generate_masks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "30bf446d", 6 | "metadata": {}, 7 | "source": [ 8 | "This notebook covers the image processing steps for breast images obtained through mammography in the CC and MLO views. Here I identify the mass area in each image and locate the landmarks for the nipple and the chest line (top, bottom).\n", 9 | "\n", 10 | "Using the landmarks in pairs of images from the same patient taken at different time points, I compute the affine matrix. The matrix is then applied in reverse to project masks onto the earlier mammograms. This helps to identify locations which might show early signs." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "cb1f97b9-7960-4ba0-b3df-33c3165ce087", 17 | "metadata": { 18 | "id": "e947c340-03a1-44c8-b831-b88812881f7b" 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import csv\n", 23 | "import os\n", 24 | "import cv2\n", 25 | "\n", 26 | "import pickle\n", 27 | "import pandas as pd\n", 28 | "import numpy as np\n", 29 | "from PIL import Image\n", 30 | "from math import sqrt\n", 31 | "\n", 32 | "import matplotlib.pyplot as plt\n", 33 | "from aln_tools.gmic import common" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "id": "8a3f1d80-ecdb-4d2f-81a3-f75f5e77e995", 39 | "metadata": {}, 40 | "source": [ 41 | "#### Find the biggest contour and draw the marker around the contour to find the mass. 
Identify landmarks for nipple and chest points" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "5bc09223-dc9c-419b-ab6d-64616cd98d50", 48 | "metadata": { 49 | "id": "5bc09223-dc9c-419b-ab6d-64616cd98d50" 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "def process_path(image_path, folder):\n", 54 | " paths = image_path.split(\"/\")\n", 55 | " return os.path.join(\"/\".join(paths[:-1]), folder, paths[-1])\n", 56 | "\n", 57 | "def find_contours(binary_image, image, polygon_image):\n", 58 | " y_max = 0\n", 59 | " y_min = 0\n", 60 | " max_area = 0\n", 61 | " biggest_contour = None\n", 62 | " # Scan the breast to see how many bows there are on the image\n", 63 | " # Find contours in the thresholded image\n", 64 | " contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n", 65 | " polygon_image = image.copy()\n", 66 | "\n", 67 | " # Iterate through the contours and draw polygons\n", 68 | " for contour_idx, contour in enumerate(contours):\n", 69 | " # Calculate the area of the polygon\n", 70 | " area = int(cv2.contourArea(contour))\n", 71 | "\n", 72 | " # Calculate the middle point of the polygon\n", 73 | " M = cv2.moments(contour)\n", 74 | " if M[\"m00\"] != 0:\n", 75 | " cx = int(M[\"m10\"] / M[\"m00\"])\n", 76 | " cy = int(M[\"m01\"] / M[\"m00\"])\n", 77 | " middle_point = (cx, cy)\n", 78 | " # Draw the middle point on the image\n", 79 | " #cv2.circle(polygon_image, middle_point, 5, (0, 0, 255), -1)\n", 80 | "\n", 81 | " if area >= 150:\n", 82 | " if area>=max_area:\n", 83 | " biggest_contour = contour\n", 84 | " max_area = area\n", 85 | "\n", 86 | " return biggest_contour\n", 87 | "\n", 88 | "def draw(biggest_contour, polygon_image, image, orig_path, view):\n", 89 | " # Draw the biggest polygon on the image\n", 90 | " epsilon = 0.001 * cv2.arcLength(biggest_contour, True)\n", 91 | " approx_polygon = cv2.approxPolyDP(biggest_contour, epsilon, True)\n", 92 | " cv2.polylines(polygon_image, [approx_polygon], isClosed=True, color=(0, 255, 0), thickness=2)\n", 93 | " # Calculate the area of the biggest polygon\n", 94 | " breast_volume = int(cv2.contourArea(biggest_contour))\n", 95 | "\n", 96 | " # Iterate through the contour points to find y_min and y_max\n", 97 | " y_min = float('inf') # Initialize to positive infinity\n", 98 | " y_max = -1 # Initialize to a negative value\n", 99 | " x_min = -1\n", 100 | " x_max = -1\n", 101 | " y_nipple = -1\n", 102 | " x_nipple = -1\n", 103 | "\n", 104 | " if view == \"RCC\":\n", 105 | " x_nipple = float('inf') # Initialize to positive infinity\n", 106 | " nipple_point = (x_nipple, y_nipple)\n", 107 | "\n", 108 | " for point in biggest_contour:\n", 109 | " x, y = point[0] # Extract x and y coordinates from the point\n", 110 | " if y < y_min:\n", 111 | " # print(y_min)\n", 112 | " y_min = y # Update y_min if a smaller y-coordinate is encountered\n", 113 | " x_min = x\n", 114 | " if y > y_max:\n", 115 | " y_max = y # Update y_max if a larger y-coordinate is encountered\n", 116 | " x_max = x\n", 117 | " if view == \"LCC\":\n", 118 | " if x > x_nipple:\n", 119 | " x_nipple = x\n", 120 | " y_nipple = y\n", 121 | " if view == \"RCC\":\n", 122 | " if x < x_nipple:\n", 123 | " x_nipple = x\n", 124 | " y_nipple = y\n", 125 | "\n", 126 | " # #Draw the nipple from the polygon\n", 127 | " # nipple = (x_nipple, y_nipple)\n", 128 | " # cv2.circle(polygon_image, nipple, 15, (255, 0, 0), -1)\n", 129 | "\n", 130 | " image_height = image.shape[0]\n", 131 | " image_width = image.shape[1]\n", 132 | " 
image_width_threshold = image_width - 10\n", 133 | " image_width_threshold = 10 if view == \"LCC\" else image_width_threshold\n", 134 | " image_width = 0 if view == \"LCC\" else image_width\n", 135 | "\n", 136 | " # If top coordinate is not on the side line but the bottom coordinate\n", 137 | " if (x_min != image_width):\n", 138 | " if (y_min == 0):\n", 139 | " y_min = float('inf') # Initialize to positive infinity\n", 140 | " x_min = -1\n", 141 | "\n", 142 | " for point in biggest_contour:\n", 143 | " x, y = point[0] # Extract x and y coordinates from the point\n", 144 | " if (y<=y_nipple and y>=10):\n", 145 | " if x> x_min:\n", 146 | " y_min = y # Update y_min if a smaller y-coordinate is encountered\n", 147 | " x_min = x\n", 148 | "\n", 149 | "\n", 150 | "\n", 151 | " # If bottom coordinate is not on the side line but the top coordinate is\n", 152 | " if (x_max != image_width):\n", 153 | " if (y_max == image_height - 1):\n", 154 | " y_max = y_nipple # Initialize to positive infinity\n", 155 | " x_max = x_nipple\n", 156 | "\n", 157 | " for point in biggest_contour:\n", 158 | " x, y = point[0] # Extract x and y coordinates from the point\n", 159 | " if (y >= y_nipple and y <= (image_height - 100) and x <= image_width_threshold):\n", 160 | " if (x >= x_max):\n", 161 | " y_max = y # Update y_max if a bigger y-coordinate is encountered\n", 162 | " x_max = x\n", 163 | " if view == \"LCC\":\n", 164 | " x_min = 0\n", 165 | "\n", 166 | " #Draw the nipple marker on the polygon\n", 167 | " nipple = (x_nipple, y_nipple)\n", 168 | " # print(f\"nipple: {nipple}\")\n", 169 | " cv2.circle(polygon_image, nipple, 15, (255, 0, 0), -1)\n", 170 | "\n", 171 | " #Draw the middle_point on side opposite of breast(chest wall)\n", 172 | " y_middle_point = int(0.5*y_min + 0.5*y_max)\n", 173 | " x_middle_point = x_min\n", 174 | " middle_point = (x_middle_point, y_middle_point)\n", 175 | " cv2.circle(polygon_image, middle_point, 15, (255, 0, 0), -1)\n", 176 | "\n", 177 | " #Draw the points on chest wall\n", 178 | " min_point = (x_min, y_min)\n", 179 | " max_point = (x_min, y_max)\n", 180 | " cv2.circle(polygon_image, min_point, 15, (255, 0, 0), -1)\n", 181 | " cv2.circle(polygon_image, max_point, 15, (255, 0, 0), -1)\n", 182 | "\n", 183 | " #Draw the side line\n", 184 | " cv2.line(polygon_image, min_point,max_point,(255, 0, 0),5)\n", 185 | " #Draw the center line\n", 186 | " cv2.line(polygon_image, middle_point,nipple,(255, 0, 0),5)\n", 187 | "\n", 188 | " # Calculate the angle in radians between the two points\n", 189 | " delta_x = nipple[0] - middle_point[0]\n", 190 | " delta_y = nipple[1] - middle_point[1]\n", 191 | " angle_rad = np.arctan2(delta_y, delta_x)\n", 192 | "\n", 193 | " # Convert the angle from radians to degrees\n", 194 | " angle_deg = int(np.degrees(angle_rad))\n", 195 | "\n", 196 | " # Ensure the angle is positive (0 to 360 degrees)\n", 197 | " if angle_deg < 0:\n", 198 | " angle_deg += 360\n", 199 | "\n", 200 | " #Calculate the dist until the nipple\n", 201 | " dist = int(abs(sqrt( (x_nipple - x_min)**2 + (y_nipple - y_middle_point)**2 )))\n", 202 | " #width = y_max-y_min\n", 203 | "\n", 204 | " # Write image to the file\n", 205 | " image_path = process_path(orig_path, \"processed\")\n", 206 | " cv2.imwrite(image_path, polygon_image)\n", 207 | "\n", 208 | " # print(f\"image shape: {image.shape}\")\n", 209 | " # return [x_min, y_min, x_max, y_max, x_nipple, y_nipple, x_middle_point, y_middle_point, dist, angle_deg, breast_volume]\n", 210 | " return [*min_point, *max_point, *nipple, *middle_point, 
image.shape[1], image.shape[0]]\n", 211 | "\n", 212 | "\n", 213 | "#50452008_L-CC\n", 214 | "def read_dim(view, image_path=None, image=None):\n", 215 | "\n", 216 | " orig_path = image_path\n", 217 | " gray_image = image\n", 218 | " if image is None:\n", 219 | " # Load the image\n", 220 | " image = cv2.imread(image_path)\n", 221 | "\n", 222 | " # Convert the cropped image to grayscale\n", 223 | " gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n", 224 | "\n", 225 | " # Threshold the grayscale image to create a binary image\n", 226 | " _, binary_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY)\n", 227 | "\n", 228 | " # Save the binary image\n", 229 | " image_path = process_path(orig_path, \"binary\")\n", 230 | " cv2.imwrite(image_path, binary_image)\n", 231 | " polygon_image = image.copy()\n", 232 | "\n", 233 | " if view in ['LMLO', 'RMLO']:\n", 234 | " return [1,1]\n", 235 | "\n", 236 | " biggest_contour = find_contours(binary_image, image, polygon_image)\n", 237 | " if view == \"LCC\":\n", 238 | " # print(f\"\\n view: {view}\")\n", 239 | " if biggest_contour is not None:\n", 240 | " return draw(biggest_contour, polygon_image, image, orig_path, view)\n", 241 | " if view == \"RCC\":\n", 242 | " # print(f\"\\n view: {view}\")\n", 243 | " if biggest_contour is not None:\n", 244 | " return draw(biggest_contour, polygon_image, image, orig_path, view)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "id": "cd2ed48b-8d38-402d-84a3-7dfbf8a892c2", 251 | "metadata": { 252 | "id": "cd2ed48b-8d38-402d-84a3-7dfbf8a892c2" 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "def process_view(image_path):\n", 257 | " view = ''\n", 258 | " if 'L-CC' in image_path:\n", 259 | " view = 'L-CC'\n", 260 | " if 'L-MLO' in image_path:\n", 261 | " view = 'L-MLO'\n", 262 | " if 'R-CC' in image_path:\n", 263 | " view = 'R-CC'\n", 264 | " if 'R-MLO' in image_path:\n", 265 | " view = 'R-MLO'\n", 266 | " return view\n", 267 | "\n", 268 | "def process_file(folder_path, images):\n", 269 | " processed_list = []\n", 270 | " for image_name in images:\n", 271 | " image_path = os.path.join(folder_path, image_name)\n", 272 | " view = process_view(image_path)\n", 273 | " view = view.replace(\"-\", \"\")\n", 274 | " meta_data = read_dim(view, image_path=image_path)\n", 275 | " data = [image_name] + list(meta_data)\n", 276 | " processed_list.append(data)\n", 277 | " return processed_list\n", 278 | "\n", 279 | "def save_processed_file(processed_file_path, processed_list):\n", 280 | " columns = [\"name\", \"chestline_top_x\", \"chestline_top_y\", \"chestline_bottom_x\", \"chestline_bottom_y\", \"nipple_x\", \"nipple_y\", \"chestline_center_x\", \"chestline_center_y\", \"image_w\", \"image_h\"]\n", 281 | " processed_data_df = pd.DataFrame(processed_list, columns=columns)\n", 282 | " processed_data_df.to_csv(processed_file_path)\n", 283 | "\n", 284 | "\n", 285 | "def get_best_center(file_name, pkl_file_path, laterality='L-CC'):\n", 286 | " with open(pkl_file_path, 'rb') as file:\n", 287 | " data = pickle.load(file)\n", 288 | "\n", 289 | " for exam in data:\n", 290 | " found = exam[laterality][0] == file_name\n", 291 | " if found:\n", 292 | " return exam['best_center'][laterality][0]" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "06894d75-411f-45f3-98ac-119ee1ccde29", 299 | "metadata": { 300 | "id": "06894d75-411f-45f3-98ac-119ee1ccde29" 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "base_path = 
\"/home/shashi/Desktop/aileen-health-docs/calcification/cropped\"\n", 305 | "\n", 306 | "\n", 307 | "# images\n", 308 | "images_path = os.path.join(base_path, \"images\")\n", 309 | "# images/test.csv\n", 310 | "data_file_path = os.path.join(images_path, \"test.csv\")\n", 311 | "# images/processed\n", 312 | "processed_images_path = os.path.join(images_path, \"processed\")\n", 313 | "# images/processed/processed.csv\n", 314 | "processed_file_path = os.path.join(processed_images_path, 'processed.csv')\n", 315 | "\n", 316 | "# images/resized\n", 317 | "resized_images_path = os.path.join(images_path, \"resized\")\n", 318 | "# images/resized/processed\n", 319 | "resized_processed_images_path = os.path.join(resized_images_path, \"processed\")\n", 320 | "# images/resized/processed/processed.csv\n", 321 | "resized_processed_file_path = os.path.join(resized_processed_images_path, 'processed.csv')\n", 322 | "# images/resized/generated\n", 323 | "resized_generated_images_path = os.path.join(resized_images_path, \"generated\")\n", 324 | "\n", 325 | "# segmentation\n", 326 | "segmentation_path = os.path.join(base_path, \"segmentation\")\n", 327 | "# segmentation/resized\n", 328 | "resized_segmentation_path = os.path.join(segmentation_path, \"resized\")\n", 329 | "# segmentation/resized/generated\n", 330 | "resized_generated_segmentation_path = os.path.join(resized_segmentation_path, \"generated\")\n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "id": "d374aa5a-22a4-460a-ac4c-335345143afc", 336 | "metadata": { 337 | "id": "d374aa5a-22a4-460a-ac4c-335345143afc" 338 | }, 339 | "source": [ 340 | "#### Utility functions to load, resize and process image." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "id": "3a86de7b-f85f-4602-8b22-5cd20e004042", 347 | "metadata": { 348 | "id": "3a86de7b-f85f-4602-8b22-5cd20e004042" 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "def pad_image_and_mask_to_max_size(image, max_width, max_height, laterality, mask=None):\n", 353 | " \"\"\"Pads an image and its mask to the given max_width and max_height.\"\"\"\n", 354 | " h, w = image.shape[:2]\n", 355 | " # print(h, w, max_width, max_height)\n", 356 | " # Calculate padding required for width and height\n", 357 | " pad_w = max_width - w\n", 358 | " pad_h = max_height - h\n", 359 | " # print(\"pad_w, pad_h\", pad_w, pad_h)\n", 360 | "\n", 361 | " # Calculate padding for each side\n", 362 | " top, bottom = pad_h // 2, pad_h - pad_h // 2\n", 363 | " if laterality == \"L\":\n", 364 | " left, right = 0, pad_w\n", 365 | " if laterality == \"R\":\n", 366 | " left, right = pad_w, 0\n", 367 | "\n", 368 | " # print(\"top, bottom\", top, bottom)\n", 369 | " # print(\"left, right\", left, right)\n", 370 | "\n", 371 | " # Pad the image with black pixels\n", 372 | " padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=[0, 0, 0])\n", 373 | " # Pad the mask with zeros (assuming the mask is binary or grayscale)\n", 374 | " padded_mask = None\n", 375 | " if mask is not None:\n", 376 | " padded_mask = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=[0])\n", 377 | " return padded_image, padded_mask\n", 378 | "\n", 379 | "def get_exam_years_of_patients(data):\n", 380 | " exam_years_of_patients = {}\n", 381 | " for _, patient in data.iterrows():\n", 382 | " patient_id = int(patient[\"anon_patientid\"])\n", 383 | " exam_year = int(patient[\"exam_year\"])\n", 384 | "\n", 385 | " if patient_id not in 
exam_years_of_patients:\n", 386 | " exam_years_of_patients[patient_id] = []\n", 387 | "\n", 388 | " if exam_year not in exam_years_of_patients[patient_id]:\n", 389 | " exam_years_of_patients[patient_id].append(exam_year)\n", 390 | " return exam_years_of_patients\n", 391 | "\n", 392 | "def get_common_image_size(patient, years, lt_view):\n", 393 | " max_width = 0\n", 394 | " max_height = 0\n", 395 | " images = []\n", 396 | " for year in years:\n", 397 | " image = cv2.imread(os.path.join(base_path, 'images', f'{patient}{year}_{lt_view}.png'))\n", 398 | " height, width = image.shape[:2]\n", 399 | " images.append(image)\n", 400 | "\n", 401 | " if max_height < height:\n", 402 | " max_height = height\n", 403 | " if max_width < width:\n", 404 | " max_width = width\n", 405 | " return images, max_height, max_width\n", 406 | "\n", 407 | "def get_mask_path(image_filename, folder=None):\n", 408 | " segmentation_file_path = \"segmentation\"\n", 409 | " if folder:\n", 410 | " segmentation_file_path = folder\n", 411 | "\n", 412 | " malignant_path = os.path.join(segmentation_file_path, image_filename.replace(\".png\", \"_malignant.png\"))\n", 413 | " benign_path = os.path.join(segmentation_file_path, image_filename.replace(\".png\", \"_benign.png\"))\n", 414 | " path = os.path.join(segmentation_file_path, image_filename)\n", 415 | "\n", 416 | " has_malignant = os.path.exists(malignant_path)\n", 417 | " has_benign = os.path.exists(benign_path)\n", 418 | " has_path = os.path.exists(path)\n", 419 | "\n", 420 | " if has_malignant:\n", 421 | " return malignant_path\n", 422 | "\n", 423 | " if has_benign:\n", 424 | " return benign_path\n", 425 | "\n", 426 | " if has_path:\n", 427 | " return path\n" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "id": "f330043c-b661-427d-9909-318e92232e97", 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "# load last two images starting from the later years\n", 438 | "data_df = pd.read_csv(data_file_path)\n", 439 | "patients = get_exam_years_of_patients(data_df)\n", 440 | "lateralities = ['L-CC', 'L-MLO', 'R-MLO', 'R-CC']\n", 441 | "for patient_id, years in patients.items():\n", 442 | " for lt_view in lateralities:\n", 443 | " source_image_year = years[-1]\n", 444 | " laterality = lt_view[0]\n", 445 | " images, max_height, max_width = get_common_image_size(patient_id, years, lt_view)\n", 446 | " for image, year in zip(images, years): \n", 447 | " image_file_name = f\"{patient_id}{year}_{lt_view}.png\"\n", 448 | " mask_file_name = get_mask_path(image_file_name, segmentation_path)\n", 449 | " mask = None \n", 450 | " if mask_file_name:\n", 451 | " # print(mask_file_name)\n", 452 | " mask = cv2.imread(mask_file_name, cv2.IMREAD_GRAYSCALE)\n", 453 | " image_padded, mask_padded = pad_image_and_mask_to_max_size(image, max_width, max_height, laterality, mask)\n", 454 | " image_file_name = os.path.join(base_path, 'images/resized/', image_file_name)\n", 455 | " cv2.imwrite(image_file_name, image_padded)\n", 456 | " if mask_file_name:\n", 457 | " mask_file_name = mask_file_name.replace(\"segmentation\", \"segmentation/resized\")\n", 458 | " cv2.imwrite(mask_file_name, mask_padded)\n", 459 | " print(f\"Saving: {image_file_name} {mask_file_name}\")" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "id": "67faea08-40a3-4f6f-afa0-5dbb8cad230f", 465 | "metadata": {}, 466 | "source": [ 467 | "#### Process masks to generate the marker points" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 
473 | "id": "de423293-1fa7-473c-af2b-b152f2249d9d", 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "# apply to all files\n", 478 | "images = os.listdir(images_path)\n", 479 | "images = [image for image in images if '.png' in image]\n", 480 | "processed_list = []\n", 481 | "for image_name in images:\n", 482 | " image_path = os.path.join(base_path, \"images/resized\", image_name)\n", 483 | " # print(image_path)\n", 484 | " \n", 485 | " view = process_view(image_path) \n", 486 | " view = view.replace(\"-\", \"\")\n", 487 | " meta_data = read_dim(view, image_path=image_path)\n", 488 | " # print(meta_data)\n", 489 | " data = [image_name] + list(meta_data)\n", 490 | " processed_list.append(data)\n", 491 | "\n", 492 | "\n", 493 | "# Save file\n", 494 | "save_processed_file(resized_processed_file_path, processed_list)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "id": "48c242e1-5cb2-4f25-a2a6-aa021717d3a2", 501 | "metadata": { 502 | "id": "48c242e1-5cb2-4f25-a2a6-aa021717d3a2", 503 | "outputId": "08764233-22d5-48ce-e2f2-5e4375bfa4e9" 504 | }, 505 | "outputs": [ 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "loading file path: /home/shashi/Desktop/aileen-health-docs/calcification/cropped/images/resized/processed/processed.csv\n" 511 | ] 512 | }, 513 | { 514 | "data": { 515 | "text/html": [ 516 | "
|   | Unnamed: 0 | name | chestline_top_x | chestline_top_y | chestline_bottom_x | chestline_bottom_y | nipple_x | nipple_y | chestline_center_x | chestline_center_y | image_w | image_h |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 53992009_L-CC.png | 0 | 46 | 0.0 | 2661.0 | 1520.0 | 1276.0 | 0.0 | 1353.0 | 1664.0 | 2718.0 |
| 1 | 1 | 50452010_L-CC.png | 0 | 79 | 0.0 | 2717.0 | 1667.0 | 1427.0 | 0.0 | 1398.0 | 1718.0 | 2812.0 |
| 2 | 2 | 52882008_R-MLO.png | 1 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 3 | 53992011_R-CC.png | 1605 | 12 | 1605.0 | 2596.0 | 48.0 | 1293.0 | 1605.0 | 1304.0 | 1606.0 | 2630.0 |
| 4 | 4 | 1812013_L-CC.png | 0 | 20 | 0.0 | 2612.0 | 1215.0 | 1264.0 | 0.0 | 1316.0 | 1266.0 | 2644.0 |
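The intro cell of this notebook describes computing an affine matrix from landmark pairs and applying it in reverse to project masks onto the earlier mammograms, but that step is not shown in this part of the notebook. Below is a minimal sketch of how such a projection could be done with OpenCV, assuming three corresponding landmarks per view (nipple, chest-line top, chest-line bottom); the landmark coordinates and file names are hypothetical placeholders, not values from this project.

```python
import cv2
import numpy as np

# Three corresponding landmarks per image (nipple, chest-line top, chest-line bottom).
# Hypothetical coordinates for illustration only.
earlier_pts = np.float32([[1520, 1276], [0, 46], [0, 2661]])
later_pts = np.float32([[1667, 1427], [0, 79], [0, 2717]])

# Affine matrix mapping earlier-exam coordinates onto the later exam.
M = cv2.getAffineTransform(earlier_pts, later_pts)

# "Applied in reverse": invert the transform and warp the later exam's mask
# into the earlier exam's coordinate frame.
M_inv = cv2.invertAffineTransform(M)

later_mask = cv2.imread("later_mask.png", cv2.IMREAD_GRAYSCALE)        # placeholder path
earlier_image = cv2.imread("earlier_image.png", cv2.IMREAD_GRAYSCALE)  # placeholder path
h, w = earlier_image.shape[:2]

# INTER_NEAREST keeps the warped mask binary instead of interpolating gray values.
projected_mask = cv2.warpAffine(later_mask, M_inv, (w, h), flags=cv2.INTER_NEAREST)
cv2.imwrite("projected_mask.png", projected_mask)
```

With exactly three landmark pairs the affine fit is exact; if more correspondences were available, cv2.estimateAffine2D would give a least-squares estimate instead.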