├── .gitignore
├── averageface
│   ├── averageface.py
│   ├── frontalface_default.xml
│   ├── images
│   │   ├── airflow-dag-tree.png
│   │   ├── averageface-csgrad.jpg
│   │   ├── barak-obama-landmarks.png
│   │   └── barak-obama.jpg
│   └── landmark.py
├── cspeople
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── scraped
│   │   └── scraped_images_go_here.txt
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── cs_grad_people_spider.py
├── dags
│   └── average_faces_pipeline.py
├── readme.md
├── requirements.txt
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------
.idea/
cspeople/scraped/full
cspeople/scraped/full/*
__pycache__/
shape_predictor_68_face_landmarks.dat
.DS_Store
--------------------------------------------------------------------------------
/averageface/averageface.py:
--------------------------------------------------------------------------------
import os
import cv2
import numpy as np
import math

# adapted from: http://www.learnopencv.com/average-face-opencv-c-python-tutorial/


# Read landmark points from the text files in a directory
def read_points(txt_path):
    # Create an array of arrays of points
    points_array = []

    # List all files in the directory and read points from the text files one by one
    for filePath in sorted(os.listdir(txt_path)):

        if filePath.endswith(".txt"):

            # Create an array of points
            tmp_points = []

            # Read points from filePath
            with open(os.path.join(txt_path, filePath)) as file:
                for line in file:
                    x, y = line.split()
                    if len(x) > 0 and len(y) > 0:
                        tmp_points.append((int(x), int(y)))

            # Store the array of points
            points_array.append(tmp_points)

    return points_array


# Read all jpg images in a folder
def read_images(dir_path):
    # Create an array of images
    images_array = []

    # List all files in the directory and read the jpg images one by one
    for filePath in sorted(os.listdir(dir_path)):

        if filePath.endswith(".jpg"):
            # Read the image
            tmp_img = cv2.imread(os.path.join(dir_path, filePath))

            # Convert to floating point
            tmp_img = np.float32(tmp_img) / 255.0

            # Add to the array of images
            images_array.append(tmp_img)

    return images_array


# Compute a similarity transform given two sets of two points.
# OpenCV requires 3 pairs of corresponding points,
# so we fake the third one.
def similarityTransform(inPoints, outPoints):
    s60 = math.sin(60 * math.pi / 180)
    c60 = math.cos(60 * math.pi / 180)

    inPts = np.copy(inPoints).tolist()
    outPts = np.copy(outPoints).tolist()

    xin = c60 * (inPts[0][0] - inPts[1][0]) - s60 * (inPts[0][1] - inPts[1][1]) + inPts[1][0]
    yin = s60 * (inPts[0][0] - inPts[1][0]) + c60 * (inPts[0][1] - inPts[1][1]) + inPts[1][1]

    inPts.append([int(xin), int(yin)])

    xout = c60 * (outPts[0][0] - outPts[1][0]) - s60 * (outPts[0][1] - outPts[1][1]) + outPts[1][0]
    yout = s60 * (outPts[0][0] - outPts[1][0]) + c60 * (outPts[0][1] - outPts[1][1]) + outPts[1][1]

    outPts.append([int(xout), int(yout)])

    # Note: cv2.estimateRigidTransform was removed in OpenCV 4.x;
    # cv2.estimateAffinePartial2D is its replacement there.
    tform = cv2.estimateRigidTransform(np.array([inPts]), np.array([outPts]), False)

    return tform


# Check if a point is inside a rectangle
def rectContains(rect, point):
    if point[0] < rect[0]:
        return False
    elif point[1] < rect[1]:
        return False
    elif point[0] > rect[2]:
        return False
    elif point[1] > rect[3]:
        return False
    return True


# Calculate the Delaunay triangulation
def calculateDelaunayTriangles(rect, points):
    # Create the subdivision
    subdiv = cv2.Subdiv2D(rect)

    # Insert points into subdiv
    for p in points:
        subdiv.insert((p[0], p[1]))

    # List of triangles. Each triangle is a list of 3 points (6 numbers)
    triangleList = subdiv.getTriangleList()

    # Find the indices of the triangles in the points array
    delaunayTri = []

    for t in triangleList:
        pt = []
        pt.append((t[0], t[1]))
        pt.append((t[2], t[3]))
        pt.append((t[4], t[5]))

        pt1 = (t[0], t[1])
        pt2 = (t[2], t[3])
        pt3 = (t[4], t[5])

        if rectContains(rect, pt1) and rectContains(rect, pt2) and rectContains(rect, pt3):
            ind = []
            for j in range(0, 3):
                for k in range(0, len(points)):
                    if abs(pt[j][0] - points[k][0]) < 1.0 and abs(pt[j][1] - points[k][1]) < 1.0:
                        ind.append(k)
            if len(ind) == 3:
                delaunayTri.append((ind[0], ind[1], ind[2]))

    return delaunayTri


# Clamp a point to the image boundaries
def constrainPoint(p, w, h):
    p = (min(max(p[0], 0), w - 1), min(max(p[1], 0), h - 1))
    return p


# Apply the affine transform calculated using srcTri and dstTri to src and
# output an image of the given size.
def applyAffineTransform(src, srcTri, dstTri, size):
    # Given a pair of triangles, find the affine transform.
    warpMat = cv2.getAffineTransform(np.float32(srcTri), np.float32(dstTri))

    # Apply the affine transform just found to the src image
    dst = cv2.warpAffine(src, warpMat, (size[0], size[1]), None, flags=cv2.INTER_LINEAR,
                         borderMode=cv2.BORDER_REFLECT_101)

    return dst


# Warp and alpha-blend triangular regions from img1 into img2
def warpTriangle(img1, img2, t1, t2):
    # Find the bounding rectangle for each triangle
    r1 = cv2.boundingRect(np.float32([t1]))
    r2 = cv2.boundingRect(np.float32([t2]))

    # Offset points by the top-left corner of the respective rectangles
    t1Rect = []
    t2Rect = []
    t2RectInt = []

    for i in range(0, 3):
        t1Rect.append(((t1[i][0] - r1[0]), (t1[i][1] - r1[1])))
        t2Rect.append(((t2[i][0] - r2[0]), (t2[i][1] - r2[1])))
        t2RectInt.append(((t2[i][0] - r2[0]), (t2[i][1] - r2[1])))

    # Get a mask by filling the triangle
    mask = np.zeros((r2[3], r2[2], 3), dtype=np.float32)
    cv2.fillConvexPoly(mask, np.int32(t2RectInt), (1.0, 1.0, 1.0), 16, 0)

    # Apply the warp to small rectangular patches
    img1Rect = img1[r1[1]:r1[1] + r1[3], r1[0]:r1[0] + r1[2]]

    size = (r2[2], r2[3])

    img2Rect = applyAffineTransform(img1Rect, t1Rect, t2Rect, size)

    img2Rect = img2Rect * mask

    # Copy the triangular region of the rectangular patch to the output image
    img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] = img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] * (
        (1.0, 1.0, 1.0) - mask)

    img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] = img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] + img2Rect


if __name__ == '__main__':

    avgface_proj_root = os.path.abspath(os.path.dirname(__file__))
    path = avgface_proj_root + '/../cspeople/scraped/full'

    # Dimensions of the output image
    w = 600
    h = 600

    # Read points for all images
    allPoints = read_points(path)

    # Read all images
    images = read_images(path)

    # Eye corners in the output image
    eyecornerDst = [(int(0.3 * w), int(h / 3)), (int(0.7 * w), int(h / 3))]

    imagesNorm = []
    pointsNorm = []

    # Add boundary points for the Delaunay triangulation
    boundaryPts = np.array(
        [(0, 0), (w / 2, 0), (w - 1, 0), (w - 1, h / 2), (w - 1, h - 1), (w / 2, h - 1), (0, h - 1), (0, h / 2)])

    # Initialize the locations of the average points to zeros
    pointsAvg = np.array([(0, 0)] * (len(allPoints[0]) + len(boundaryPts)), np.float32())

    n = len(allPoints[0])

    numImages = len(images)

    # Warp images and transform landmarks to the output coordinate system,
    # and find the average of the transformed landmarks.

    for i in range(0, numImages):
        points1 = allPoints[i]

        # Corners of the eyes in the input image
        eyecornerSrc = [allPoints[i][36], allPoints[i][45]]

        # Compute the similarity transform
        tform = similarityTransform(eyecornerSrc, eyecornerDst)

        # Apply the similarity transform to the image
        img = cv2.warpAffine(images[i], tform, (w, h))

        # Apply the similarity transform to the points
        points2 = np.reshape(np.array(points1), (68, 1, 2))

        points = cv2.transform(points2, tform)

        points = np.float32(np.reshape(points, (68, 2)))

        # Append boundary points; they will be used in the Delaunay triangulation.
        points = np.append(points, boundaryPts, axis=0)

        # Calculate the location of the average landmark points.
        pointsAvg = pointsAvg + points / numImages

        pointsNorm.append(points)
        imagesNorm.append(img)

    # Delaunay triangulation
    rect = (0, 0, w, h)
    dt = calculateDelaunayTriangles(rect, np.array(pointsAvg))

    # Output image
    output = np.zeros((h, w, 3), np.float32())

    # Warp the input images to the average image landmarks
    for i in range(0, len(imagesNorm)):
        img = np.zeros((h, w, 3), np.float32())
        # Transform triangles one by one
        for j in range(0, len(dt)):
            tin = []
            tout = []

            for k in range(0, 3):
                pIn = pointsNorm[i][dt[j][k]]
                pIn = constrainPoint(pIn, w, h)

                pOut = pointsAvg[dt[j][k]]
                pOut = constrainPoint(pOut, w, h)

                tin.append(pIn)
                tout.append(pOut)

            warpTriangle(imagesNorm[i], img, tin, tout)

        # Add image intensities for averaging
        output = output + img

    # Divide by numImages to get the average
    output = output / numImages

    # Save the result
    # cv2.imshow('image', output)
    avg_img = "images/averageface.jpg"
    cv2.imwrite(avg_img, 255 * output)
    print('Saved image to {}'.format(avg_img))
    # cv2.waitKey(0)
--------------------------------------------------------------------------------
/averageface/images/airflow-dag-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/airflow-dag-tree.png
--------------------------------------------------------------------------------
/averageface/images/averageface-csgrad.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/averageface-csgrad.jpg
--------------------------------------------------------------------------------
/averageface/images/barak-obama-landmarks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/barak-obama-landmarks.png
--------------------------------------------------------------------------------
/averageface/images/barak-obama.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/barak-obama.jpg
--------------------------------------------------------------------------------
/averageface/landmark.py:
--------------------------------------------------------------------------------
import numpy as np
import cv2
import dlib
import os


class LandmarkClassifier:

    cascade_path = "frontalface_default.xml"
    predictor_path = "shape_predictor_68_face_landmarks.dat"
    landmark_proj_root = os.path.abspath(os.path.dirname(__file__))

    def __init__(self, image_dir=landmark_proj_root + "/../cspeople/scraped/full"):
        # Create the Haar cascade
        self.faceCascade = cv2.CascadeClassifier(self.cascade_path)
        # Create the landmark predictor
        self.predictor = dlib.shape_predictor(self.predictor_path)
        # Set the image directory
        self.image_dir = image_dir
        print(self.image_dir)

    def face_detection(self, img):
        # Convert the image to gray-scale
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Detect faces in the image
        faces = self.faceCascade.detectMultiScale(
            img_gray,
            scaleFactor=1.05,
            minNeighbors=5,
            minSize=(100, 100),
            flags=cv2.CASCADE_SCALE_IMAGE
        )

        return faces

    def classify(self):
        folder = self.image_dir
        images = []
        for filename in os.listdir(folder):
            if filename.endswith('.jpg'):
                print('Processing {}.'.format(filename))
                img_path = os.path.join(folder, filename)
                img = cv2.imread(img_path)
                landmarks = self.get_landmarks(img)
                if landmarks is not None:
                    self.write_to_file(landmarks, img_path)
                else:
                    print('{} has no faces'.format(filename))
                    # Remove the file because it has no faces
                    os.remove(img_path)

        return images

    def get_landmarks(self, image):

        faces = self.face_detection(image)

        for (x, y, w, h) in faces:

            # Convert the OpenCV rectangle coordinates to a dlib rectangle
            dlib_rect = dlib.rectangle(int(x), int(y), int(x + w), int(y + h))
            detected_landmarks = self.predictor(image, dlib_rect).parts()
            landmarks = np.matrix([[p.x, p.y] for p in detected_landmarks])

            # There should only be one face per image
            return landmarks

    def write_to_file(self, landmarks, img_path):

        # Append .txt to the image file name to denote the landmark representation of the image
        with open(img_path + '.txt', "w") as file:
            for idx, point in enumerate(landmarks):
                file.write(str(point[0, 0]))
                file.write(" ")
                file.write(str(point[0, 1]))
                file.write("\n")

        return True


LC = LandmarkClassifier()
LC.classify()
--------------------------------------------------------------------------------
/cspeople/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/cspeople/__init__.py
--------------------------------------------------------------------------------
/cspeople/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapytestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class ProfilePictureItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
--------------------------------------------------------------------------------
/cspeople/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class CspeopleSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the spider middleware does not modify the
    # passed objects.
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/cspeople/pipelines.py:
--------------------------------------------------------------------------------
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request


class CSGradPeoplePipeline(ImagesPipeline):

    # The default file_path implementation (a sha1 hash of the URL) is used,
    # so file_path is not overridden here.
    # def file_path(self, request, response=None, info=None):
    #     image_guid = hash(request.url)
    #     return 'cspeople/%s' % image_guid + '.jpg'

    def get_media_requests(self, item, info):
        yield Request(item['image_urls'][0], meta=item)
--------------------------------------------------------------------------------
/cspeople/scraped/scraped_images_go_here.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/cspeople/scraped/scraped_images_go_here.txt
--------------------------------------------------------------------------------
/cspeople/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os

PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
BOT_NAME = 'cspeople'

SPIDER_MODULES = ['cspeople.spiders']
NEWSPIDER_MODULE = 'cspeople.spiders'

ITEM_PIPELINES = {
    'cspeople.pipelines.CSGradPeoplePipeline': 1
}
IMAGES_STORE = PROJECT_ROOT + '/scraped/'
FILES_STORE = PROJECT_ROOT + '/scraped/'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cspeople (+http://www.yourdomain.com)'

# Obey robots.txt rules, disabled because I am such a rebel.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'cspeople.middlewares.CspeopleSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'cspeople.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'cspeople.pipelines.CspeoplePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/cspeople/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/cspeople/spiders/cs_grad_people_spider.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy.selector import Selector
from cspeople.items import ProfilePictureItem


class CSGradPeople(scrapy.Spider):
    name = "csgrad"
    # list of allowed domains (domains only, without scheme or path)
    allowed_domains = ['cs.princeton.edu']
    start_urls = [
        'http://www.cs.princeton.edu/people/grad',
    ]

    def parse(self, response):
        sel = Selector(response)

        xpath = '//div[@class="person-photo"]/img/@src | //div[@class="person-photo"]/a/img/@src'
        images = sel.xpath(xpath).extract()

        for image in images:
            item = ProfilePictureItem()
            item['image_urls'] = [response.urljoin(image)]
            yield item
--------------------------------------------------------------------------------
/dags/average_faces_pipeline.py:
--------------------------------------------------------------------------------
from airflow import DAG
from airflow.operators import PythonOperator, BashOperator
from datetime import datetime, timedelta
import os

# The following are defaults which can be overridden per task
default_args = {
    'owner': 'Dan M',
    'depends_on_past': False,
    'start_date': datetime(2017, 10, 11),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=1),
}

# Update these paths to point at your clone of this project (see readme.md)
cspeople_scraper = '/Users/dmo/Documents/python/airflow'
cspeople_scraper_path = '/Users/dmo/Documents/python/airflow/cspeople/scraped/full'
averageface_path = '/Users/dmo/Documents/python/airflow/averageface/'


dag = DAG('AverageFacePipeline', default_args=default_args)


def clear_folder(dir_path=cspeople_scraper_path):

    file_list = os.listdir(dir_path)
    for file_name in file_list:
        if file_name.endswith('.jpg') or file_name.endswith('.txt'):
            os.remove(dir_path + "/" + file_name)


def print_scrape_in_progress():
    print('Scrape is in progress!')


# Delete all jpg and txt files in the scraped folder
t1 = PythonOperator(
    task_id='clear_scrape_folder',
    python_callable=clear_folder,
    dag=dag)

# TODO properly import python classes
t2 = BashOperator(
    task_id='scrape_profile_images',
    bash_command='cd {} && scrapy crawl csgrad'.format(cspeople_scraper),
    dag=dag)

t3 = PythonOperator(
    task_id='scrape_progress',
    python_callable=print_scrape_in_progress,
    dag=dag)

t4 = BashOperator(
    task_id='create_landmarks',
    bash_command='cd {} && python landmark.py'.format(averageface_path),
    dag=dag)

t5 = BashOperator(
    task_id='create_average_face',
    bash_command='cd {} && python averageface.py'.format(averageface_path),
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Airflow pipeline

This project uses Airflow, Scrapy, and OpenCV to build a data pipeline that scrapes profile images from a website
and creates an averaged image representation of all the profiles. The steps are:
1. Clear the scraped files folder in cspeople/scraped/full
2. Scrape images from the target site and save them in cspeople/scraped/full
3. Use the facial landmark library to determine 68 key features in each face and save the coordinates in a text file in cspeople/scraped/full
4. Determine the average face of all pictures using the landmarks and OpenCV, and save it in averageface/images

The Airflow DAG graph is below. "scrape_progress" was added just to experiment with dependencies.

The facial landmark (dlib) library detects facial features (68 points) as shown below and saves the points into a text file.

(Simplified) The points are averaged to produce a composite of all of the profile images. Below is the actual
output for the averaged face of a Princeton CS graduate student.

1. Copy the DAG file ```dags/average_faces_pipeline.py``` to your Airflow directory (wherever you initialized it)
2. Change the ```cspeople_scraper, cspeople_scraper_path, averageface_path``` variables to target the directory
that you cloned this project into.

Start Airflow:
```
airflow webserver -p 8080
airflow scheduler
```

Run the AverageFacePipeline task.

##### Customizing the data source

Note: the scraper ```cspeople/spiders/cs_grad_people_spider.py``` is currently set to scrape ```http://www.cs.princeton.edu/people/grad```
and uses XPath to extract images. To average a different data set, change the scrape URL and update the XPath (//img/@src extracts all images). The application
ignores and deletes any image in which it cannot detect a face.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
scrapy==1.4.0
opencv-python
dlib==19.4
apache-airflow
numpy
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = cspeople.settings

[deploy]
#url = http://localhost:6800/
project = cspeople
--------------------------------------------------------------------------------
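
A minimal sketch of the data-source customization described in readme.md, for reference. It mirrors cspeople/spiders/cs_grad_people_spider.py and changes only the target page and the selector; the spider name, domain, URL, and XPath below are hypothetical placeholders, not part of this project.

```python
import scrapy
from scrapy.selector import Selector
from cspeople.items import ProfilePictureItem


class ExamplePeopleSpider(scrapy.Spider):
    # Hypothetical spider: point it at whatever listing page you want to average.
    name = "example_people"
    allowed_domains = ['example.edu']                    # domain only, no scheme or path
    start_urls = ['https://example.edu/people/faculty']  # placeholder URL

    def parse(self, response):
        # A broad selector such as //img/@src pulls every image on the page;
        # landmark.py later deletes any image in which no face is detected.
        for src in Selector(response).xpath('//img/@src').extract():
            item = ProfilePictureItem()
            item['image_urls'] = [response.urljoin(src)]
            yield item
```

Run it the same way as the existing spider (e.g. ```scrapy crawl example_people```), or update the DAG's scrape_profile_images task to use the new spider name.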