├── .gitignore
├── averageface
│   ├── averageface.py
│   ├── frontalface_default.xml
│   ├── images
│   │   ├── airflow-dag-tree.png
│   │   ├── averageface-csgrad.jpg
│   │   ├── barak-obama-landmarks.png
│   │   └── barak-obama.jpg
│   └── landmark.py
├── cspeople
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── scraped
│   │   └── scraped_images_go_here.txt
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── cs_grad_people_spider.py
├── dags
│   └── average_faces_pipeline.py
├── readme.md
├── requirements.txt
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------
.idea/
cspeople/scraped/full
cspeople/scraped/full/*
__pycache__/
shape_predictor_68_face_landmarks.dat
.DS_Store
--------------------------------------------------------------------------------
/averageface/averageface.py:
--------------------------------------------------------------------------------
import os
import cv2
import numpy as np
import math

# adapted from: http://www.learnopencv.com/average-face-opencv-c-python-tutorial/


# Read landmark points from the text files in a directory
def read_points(txt_path):
    # Create an array of arrays of points
    points_array = []

    # List all files in the directory and read points from the text files one by one
    for filePath in sorted(os.listdir(txt_path)):

        if filePath.endswith(".txt"):

            # Create an array of points
            tmp_points = []

            # Read points from filePath
            with open(os.path.join(txt_path, filePath)) as file:
                for line in file:
                    x, y = line.split()
                    if len(x) > 0 and len(y) > 0:
                        tmp_points.append((int(x), int(y)))

            # Store the array of points
            points_array.append(tmp_points)

    return points_array


# Read all jpg images in a folder
def read_images(dir_path):
    # Create an array of images
    images_array = []

    # List all files in the directory and read the jpg images one by one
    for filePath in sorted(os.listdir(dir_path)):

        if filePath.endswith(".jpg"):
            # Read the image
            tmp_img = cv2.imread(os.path.join(dir_path, filePath))

            # Convert to floating point
            tmp_img = np.float32(tmp_img) / 255.0

            # Add to the array of images
            images_array.append(tmp_img)

    return images_array


# Compute a similarity transform given two sets of two points.
# OpenCV requires 3 pairs of corresponding points,
# so we fake the third one.
def similarityTransform(inPoints, outPoints):
    s60 = math.sin(60 * math.pi / 180)
    c60 = math.cos(60 * math.pi / 180)

    inPts = np.copy(inPoints).tolist()
    outPts = np.copy(outPoints).tolist()

    xin = c60 * (inPts[0][0] - inPts[1][0]) - s60 * (inPts[0][1] - inPts[1][1]) + inPts[1][0]
    yin = s60 * (inPts[0][0] - inPts[1][0]) + c60 * (inPts[0][1] - inPts[1][1]) + inPts[1][1]

    inPts.append([int(xin), int(yin)])

    xout = c60 * (outPts[0][0] - outPts[1][0]) - s60 * (outPts[0][1] - outPts[1][1]) + outPts[1][0]
    yout = s60 * (outPts[0][0] - outPts[1][0]) + c60 * (outPts[0][1] - outPts[1][1]) + outPts[1][1]

    outPts.append([int(xout), int(yout)])

    # Note: cv2.estimateRigidTransform was removed in OpenCV 4.x;
    # cv2.estimateAffinePartial2D is its replacement there.
    tform = cv2.estimateRigidTransform(np.array([inPts]), np.array([outPts]), False)

    return tform


# Check if a point is inside a rectangle
def rectContains(rect, point):
    if point[0] < rect[0]:
        return False
    elif point[1] < rect[1]:
        return False
    elif point[0] > rect[2]:
        return False
    elif point[1] > rect[3]:
        return False
    return True


# Calculate the Delaunay triangulation
def calculateDelaunayTriangles(rect, points):
    # Create the subdivision
    subdiv = cv2.Subdiv2D(rect)

    # Insert points into subdiv
    for p in points:
        subdiv.insert((p[0], p[1]))

    # List of triangles. Each triangle is a list of 3 points (6 numbers)
    triangleList = subdiv.getTriangleList()

    # Find the indices of the triangles in the points array
    delaunayTri = []

    for t in triangleList:
        pt = []
        pt.append((t[0], t[1]))
        pt.append((t[2], t[3]))
        pt.append((t[4], t[5]))

        pt1 = (t[0], t[1])
        pt2 = (t[2], t[3])
        pt3 = (t[4], t[5])

        if rectContains(rect, pt1) and rectContains(rect, pt2) and rectContains(rect, pt3):
            ind = []
            for j in range(0, 3):
                for k in range(0, len(points)):
                    if abs(pt[j][0] - points[k][0]) < 1.0 and abs(pt[j][1] - points[k][1]) < 1.0:
                        ind.append(k)
            if len(ind) == 3:
                delaunayTri.append((ind[0], ind[1], ind[2]))

    return delaunayTri


# Clamp a point to the image boundaries
def constrainPoint(p, w, h):
    p = (min(max(p[0], 0), w - 1), min(max(p[1], 0), h - 1))
    return p


# Apply the affine transform calculated using srcTri and dstTri to src and
# output an image of the given size.
def applyAffineTransform(src, srcTri, dstTri, size):
    # Given a pair of triangles, find the affine transform.
    warpMat = cv2.getAffineTransform(np.float32(srcTri), np.float32(dstTri))

    # Apply the affine transform just found to the src image
    dst = cv2.warpAffine(src, warpMat, (size[0], size[1]), None, flags=cv2.INTER_LINEAR,
                         borderMode=cv2.BORDER_REFLECT_101)

    return dst


# Warp and alpha-blend triangular regions from img1 into img2
def warpTriangle(img1, img2, t1, t2):
    # Find the bounding rectangle for each triangle
    r1 = cv2.boundingRect(np.float32([t1]))
    r2 = cv2.boundingRect(np.float32([t2]))

    # Offset points by the top-left corner of the respective rectangles
    t1Rect = []
    t2Rect = []
    t2RectInt = []

    for i in range(0, 3):
        t1Rect.append(((t1[i][0] - r1[0]), (t1[i][1] - r1[1])))
        t2Rect.append(((t2[i][0] - r2[0]), (t2[i][1] - r2[1])))
        t2RectInt.append(((t2[i][0] - r2[0]), (t2[i][1] - r2[1])))

    # Get a mask by filling the triangle
    mask = np.zeros((r2[3], r2[2], 3), dtype=np.float32)
    cv2.fillConvexPoly(mask, np.int32(t2RectInt), (1.0, 1.0, 1.0), 16, 0)

    # Apply the warp to small rectangular patches
    img1Rect = img1[r1[1]:r1[1] + r1[3], r1[0]:r1[0] + r1[2]]

    size = (r2[2], r2[3])

    img2Rect = applyAffineTransform(img1Rect, t1Rect, t2Rect, size)

    img2Rect = img2Rect * mask

    # Copy the triangular region of the rectangular patch to the output image
    img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] = img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] * (
        (1.0, 1.0, 1.0) - mask)

    img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] = img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] + img2Rect


if __name__ == '__main__':

    avgface_proj_root = os.path.abspath(os.path.dirname(__file__))
    path = avgface_proj_root + '/../cspeople/scraped/full'

    # Dimensions of the output image
    w = 600
    h = 600

    # Read points for all images
    allPoints = read_points(path)

    # Read all images
    images = read_images(path)

    # Eye corners in the output image
    eyecornerDst = [(int(0.3 * w), int(h / 3)), (int(0.7 * w), int(h / 3))]

    imagesNorm = []
    pointsNorm = []

    # Add boundary points for the Delaunay triangulation
    boundaryPts = np.array(
        [(0, 0), (w / 2, 0), (w - 1, 0), (w - 1, h / 2), (w - 1, h - 1), (w / 2, h - 1), (0, h - 1), (0, h / 2)])

    # Initialize the locations of the average points to zeros
    pointsAvg = np.array([(0, 0)] * (len(allPoints[0]) + len(boundaryPts)), np.float32())

    n = len(allPoints[0])

    numImages = len(images)

    # Warp images and transform landmarks to the output coordinate system,
    # and find the average of the transformed landmarks.

    for i in range(0, numImages):
        points1 = allPoints[i]

        # Corners of the eyes in the input image
        eyecornerSrc = [allPoints[i][36], allPoints[i][45]]

        # Compute the similarity transform
        tform = similarityTransform(eyecornerSrc, eyecornerDst)

        # Apply the similarity transform to the image
        img = cv2.warpAffine(images[i], tform, (w, h))

        # Apply the similarity transform to the points
        points2 = np.reshape(np.array(points1), (68, 1, 2))

        points = cv2.transform(points2, tform)

        points = np.float32(np.reshape(points, (68, 2)))

        # Append boundary points; they will be used in the Delaunay triangulation.
        points = np.append(points, boundaryPts, axis=0)

        # Calculate the location of the average landmark points.
        pointsAvg = pointsAvg + points / numImages

        pointsNorm.append(points)
        imagesNorm.append(img)

    # Delaunay triangulation
    rect = (0, 0, w, h)
    dt = calculateDelaunayTriangles(rect, np.array(pointsAvg))

    # Output image
    output = np.zeros((h, w, 3), np.float32())

    # Warp the input images to the average image landmarks
    for i in range(0, len(imagesNorm)):
        img = np.zeros((h, w, 3), np.float32())
        # Transform triangles one by one
        for j in range(0, len(dt)):
            tin = []
            tout = []

            for k in range(0, 3):
                pIn = pointsNorm[i][dt[j][k]]
                pIn = constrainPoint(pIn, w, h)

                pOut = pointsAvg[dt[j][k]]
                pOut = constrainPoint(pOut, w, h)

                tin.append(pIn)
                tout.append(pOut)

            warpTriangle(imagesNorm[i], img, tin, tout)

        # Add image intensities for averaging
        output = output + img

    # Divide by numImages to get the average
    output = output / numImages

    # Save the result
    # cv2.imshow('image', output)
    avg_img = "images/averageface.jpg"
    cv2.imwrite(avg_img, 255 * output)
    print('Saved image to {}'.format(avg_img))
    # cv2.waitKey(0)
--------------------------------------------------------------------------------
/averageface/images/airflow-dag-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/airflow-dag-tree.png
--------------------------------------------------------------------------------
/averageface/images/averageface-csgrad.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/averageface-csgrad.jpg
--------------------------------------------------------------------------------
/averageface/images/barak-obama-landmarks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/barak-obama-landmarks.png
--------------------------------------------------------------------------------
/averageface/images/barak-obama.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/barak-obama.jpg
--------------------------------------------------------------------------------
/averageface/landmark.py:
--------------------------------------------------------------------------------
import numpy as np
import cv2
import dlib
import os


class LandmarkClassifier:

    cascade_path = "frontalface_default.xml"
    predictor_path = "shape_predictor_68_face_landmarks.dat"
    landmark_proj_root = os.path.abspath(os.path.dirname(__file__))

    def __init__(self, image_dir=landmark_proj_root + "/../cspeople/scraped/full"):
        # Create the Haar cascade
        self.faceCascade = cv2.CascadeClassifier(self.cascade_path)
        # Create the landmark predictor
        self.predictor = dlib.shape_predictor(self.predictor_path)
        # Set the image directory
        self.image_dir = image_dir
        print(self.image_dir)

    def face_detection(self, img):
        # Convert the image to gray-scale
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Detect faces in the image
        faces = self.faceCascade.detectMultiScale(
            img_gray,
            scaleFactor=1.05,
            minNeighbors=5,
            minSize=(100, 100),
            flags=cv2.CASCADE_SCALE_IMAGE
        )

        return faces

    def classify(self):
        folder = self.image_dir
        images = []
        for filename in os.listdir(folder):
            if filename.endswith('.jpg'):
                print('Processing {}.'.format(filename))
                img_path = os.path.join(folder, filename)
                img = cv2.imread(img_path)
                landmarks = self.get_landmarks(img)
                if landmarks is not None:
                    self.write_to_file(landmarks, img_path)
                else:
                    print('{} has no faces'.format(filename))
                    # Remove the file because it has no faces
                    os.remove(img_path)

        return images

    def get_landmarks(self, image):

        faces = self.face_detection(image)

        for (x, y, w, h) in faces:

            # Convert the OpenCV rectangle coordinates to a dlib rectangle
            dlib_rect = dlib.rectangle(int(x), int(y), int(x + w), int(y + h))
            detected_landmarks = self.predictor(image, dlib_rect).parts()
            landmarks = np.matrix([[p.x, p.y] for p in detected_landmarks])

            # There should only be one face per image
            return landmarks

    def write_to_file(self, landmarks, img_path):

        # Append .txt to the image file name to denote the landmark representation of the image
        with open(img_path + '.txt', "w") as file:
            for idx, point in enumerate(landmarks):
                file.write(str(point[0, 0]))
                file.write(" ")
                file.write(str(point[0, 1]))
                file.write("\n")

        return True


LC = LandmarkClassifier()
LC.classify()
--------------------------------------------------------------------------------
/cspeople/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/cspeople/__init__.py
--------------------------------------------------------------------------------
/cspeople/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapytestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class ProfilePictureItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
--------------------------------------------------------------------------------
/cspeople/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class CspeopleSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the spider middleware does not modify the
    # passed objects.
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/cspeople/pipelines.py:
--------------------------------------------------------------------------------
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request


class CSGradPeoplePipeline(ImagesPipeline):

    # The default file_path implementation (a sha1 hash of the URL) is used,
    # so file_path is not overridden here.
    # def file_path(self, request, response=None, info=None):
    #     image_guid = hash(request.url)
    #     return 'cspeople/%s' % image_guid + '.jpg'

    def get_media_requests(self, item, info):
        yield Request(item['image_urls'][0], meta=item)
--------------------------------------------------------------------------------
/cspeople/scraped/scraped_images_go_here.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/cspeople/scraped/scraped_images_go_here.txt
--------------------------------------------------------------------------------
/cspeople/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os

PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
BOT_NAME = 'cspeople'

SPIDER_MODULES = ['cspeople.spiders']
NEWSPIDER_MODULE = 'cspeople.spiders'

ITEM_PIPELINES = {
    'cspeople.pipelines.CSGradPeoplePipeline': 1
}
IMAGES_STORE = PROJECT_ROOT + '/scraped/'
FILES_STORE = PROJECT_ROOT + '/scraped/'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cspeople (+http://www.yourdomain.com)'

# Obey robots.txt rules, disabled because I am such a rebel.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'cspeople.middlewares.CspeopleSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'cspeople.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'cspeople.pipelines.CspeoplePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/cspeople/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/cspeople/spiders/cs_grad_people_spider.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy.selector import Selector
from cspeople.items import ProfilePictureItem


class CSGradPeople(scrapy.Spider):
    name = "csgrad"
    # list of allowed domains (domains only, without scheme or path)
    allowed_domains = ['cs.princeton.edu']
    start_urls = [
        'http://www.cs.princeton.edu/people/grad',
    ]

    def parse(self, response):
        sel = Selector(response)

        xpath = '//div[@class="person-photo"]/img/@src | //div[@class="person-photo"]/a/img/@src'
        images = sel.xpath(xpath).extract()

        for image in images:
            item = ProfilePictureItem()
            item['image_urls'] = [response.urljoin(image)]
            yield item
--------------------------------------------------------------------------------
/dags/average_faces_pipeline.py:
--------------------------------------------------------------------------------
from airflow import DAG
from airflow.operators import PythonOperator, BashOperator
from datetime import datetime, timedelta
import os

# The following are defaults which can be overridden per task
default_args = {
    'owner': 'Dan M',
    'depends_on_past': False,
    'start_date': datetime(2017, 10, 11),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=1),
}

# Update these paths to point at your clone of this project (see readme.md)
cspeople_scraper = '/Users/dmo/Documents/python/airflow'
cspeople_scraper_path = '/Users/dmo/Documents/python/airflow/cspeople/scraped/full'
averageface_path = '/Users/dmo/Documents/python/airflow/averageface/'


dag = DAG('AverageFacePipeline', default_args=default_args)


def clear_folder(dir_path=cspeople_scraper_path):

    file_list = os.listdir(dir_path)
    for file_name in file_list:
        if file_name.endswith('.jpg') or file_name.endswith('.txt'):
            os.remove(dir_path + "/" + file_name)


def print_scrape_in_progress():
    print('Scrape is in progress!')


# Delete all jpg and txt files in the scraped folder
t1 = PythonOperator(
    task_id='clear_scrape_folder',
    python_callable=clear_folder,
    dag=dag)

# TODO properly import python classes
t2 = BashOperator(
    task_id='scrape_profile_images',
    bash_command='cd {} && scrapy crawl csgrad'.format(cspeople_scraper),
    dag=dag)

t3 = PythonOperator(
    task_id='scrape_progress',
    python_callable=print_scrape_in_progress,
    dag=dag)

t4 = BashOperator(
    task_id='create_landmarks',
    bash_command='cd {} && python landmark.py'.format(averageface_path),
    dag=dag)

t5 = BashOperator(
    task_id='create_average_face',
    bash_command='cd {} && python averageface.py'.format(averageface_path),
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Airflow pipeline

This project uses Airflow, Scrapy, and OpenCV to build a data pipeline that scrapes profile images from a website
and creates an averaged image representation of all the profiles. The steps are:
1. Clear the scraped files folder in cspeople/scraped/full
2. Scrape images from the target site and save them in cspeople/scraped/full
3. Use the facial landmark library to determine 68 key features in each face and save the coordinates in a text file in cspeople/scraped/full
4. Determine the average face of all pictures using the landmarks and OpenCV, and save it in averageface/images

The Airflow DAG graph is below. "scrape_progress" was added just to experiment with dependencies.

The facial landmark (dlib) library detects facial features (68 points) as shown below and saves the points into a text file.

(Simplified) The points are averaged to produce a composite of all of the profile images. Below is the actual
output for the averaged face of a Princeton CS graduate student.

1. Copy the DAG file ```dags/average_faces_pipeline.py``` to your Airflow directory (wherever you initialized it)
2. Change the ```cspeople_scraper, cspeople_scraper_path, averageface_path``` variables to target the directory
that you cloned this project into.

Start Airflow:
```
airflow webserver -p 8080
airflow scheduler
```

Run the AverageFacePipeline task.

##### Customizing the data source

Note: the scraper ```cspeople/spiders/cs_grad_people_spider.py``` is currently set to scrape ```http://www.cs.princeton.edu/people/grad```
and uses XPath to extract images. To average a different data set, change the scrape URL and update the XPath (//img/@src extracts all images). The application
ignores and deletes any image in which it cannot detect a face.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
scrapy==1.4.0
opencv-python
dlib==19.4
apache-airflow
numpy
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = cspeople.settings

[deploy]
#url = http://localhost:6800/
project = cspeople
--------------------------------------------------------------------------------
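
A minimal sketch of the data-source customization described in readme.md, for reference. It mirrors cspeople/spiders/cs_grad_people_spider.py and changes only the target page and the selector; the spider name, domain, URL, and XPath below are hypothetical placeholders, not part of this project.

```python
import scrapy
from scrapy.selector import Selector
from cspeople.items import ProfilePictureItem


class ExamplePeopleSpider(scrapy.Spider):
    # Hypothetical spider: point it at whatever listing page you want to average.
    name = "example_people"
    allowed_domains = ['example.edu']                    # domain only, no scheme or path
    start_urls = ['https://example.edu/people/faculty']  # placeholder URL

    def parse(self, response):
        # A broad selector such as //img/@src pulls every image on the page;
        # landmark.py later deletes any image in which no face is detected.
        for src in Selector(response).xpath('//img/@src').extract():
            item = ProfilePictureItem()
            item['image_urls'] = [response.urljoin(src)]
            yield item
```

Run it the same way as the existing spider (e.g. ```scrapy crawl example_people```), or update the DAG's scrape_profile_images task to use the new spider name.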