├── .gitignore
├── averageface
│   ├── averageface.py
│   ├── frontalface_default.xml
│   ├── images
│   │   ├── airflow-dag-tree.png
│   │   ├── averageface-csgrad.jpg
│   │   ├── barak-obama-landmarks.png
│   │   └── barak-obama.jpg
│   └── landmark.py
├── cspeople
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── scraped
│   │   └── scraped_images_go_here.txt
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── cs_grad_people_spider.py
├── dags
│   └── average_faces_pipeline.py
├── readme.md
├── requirements.txt
└── scrapy.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | cspeople/scraped/full
3 | cspeople/scraped/full/*
4 | __pycache__/
5 | shape_predictor_68_face_landmarks.dat
6 | .DS_store
--------------------------------------------------------------------------------
/averageface/averageface.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import numpy as np
4 | import math
5 |
6 | # adapted from: http://www.learnopencv.com/average-face-opencv-c-python-tutorial/
7 |
8 |
9 | # Read points from text files in directory
10 | def read_points(txt_path):
11 | # Create an array of array of points.
12 | points_array = []
13 |
14 | # List all files in the directory and read points from text files one by one
15 | for filePath in sorted(os.listdir(txt_path)):
16 |
17 | if filePath.endswith(".txt"):
18 |
19 | # Create an array of points.
20 | tmp_points = []
21 |
22 | # Read points from filePath
23 |             with open(os.path.join(txt_path, filePath)) as file:
24 | for line in file:
25 | x, y = line.split()
26 | if len(x) > 0 and len(y) > 0:
27 | tmp_points.append((int(x), int(y)))
28 |
29 | # Store array of points
30 | points_array.append(tmp_points)
31 |
32 | return points_array
33 |
34 |
35 | # Read all jpg images in folder.
36 | def read_images(dir_path):
37 | # Create array of array of images.
38 | images_array = []
39 |
40 |     # List all files in the directory and read images from jpg files one by one
41 | for filePath in sorted(os.listdir(dir_path)):
42 |
43 | if filePath.endswith(".jpg"):
44 | # Read image found.
45 |             tmp_img = cv2.imread(os.path.join(dir_path, filePath))
46 |
47 | # Convert to floating point
48 | tmp_img = np.float32(tmp_img) / 255.0
49 |
50 | # Add to array of images
51 | images_array.append(tmp_img)
52 |
53 | return images_array
54 |
55 |
56 | # Compute similarity transform given two sets of two points.
57 | # OpenCV requires 3 pairs of corresponding points.
58 | # We are faking the third one.
59 |
60 | def similarityTransform(inPoints, outPoints):
61 | s60 = math.sin(60 * math.pi / 180)
62 | c60 = math.cos(60 * math.pi / 180)
63 |
64 | inPts = np.copy(inPoints).tolist()
65 | outPts = np.copy(outPoints).tolist()
66 |
67 | xin = c60 * (inPts[0][0] - inPts[1][0]) - s60 * (inPts[0][1] - inPts[1][1]) + inPts[1][0]
68 | yin = s60 * (inPts[0][0] - inPts[1][0]) + c60 * (inPts[0][1] - inPts[1][1]) + inPts[1][1]
69 |
70 | inPts.append([np.int(xin), np.int(yin)])
71 |
72 | xout = c60 * (outPts[0][0] - outPts[1][0]) - s60 * (outPts[0][1] - outPts[1][1]) + outPts[1][0]
73 | yout = s60 * (outPts[0][0] - outPts[1][0]) + c60 * (outPts[0][1] - outPts[1][1]) + outPts[1][1]
74 |
75 | outPts.append([np.int(xout), np.int(yout)])
76 |
77 | tform = cv2.estimateRigidTransform(np.array([inPts]), np.array([outPts]), False)
78 |
79 | return tform
80 |
81 |
82 | # Check if a point is inside a rectangle
83 | def rectContains(rect, point):
84 | if point[0] < rect[0]:
85 | return False
86 | elif point[1] < rect[1]:
87 | return False
88 | elif point[0] > rect[2]:
89 | return False
90 | elif point[1] > rect[3]:
91 | return False
92 | return True
93 |
94 |
95 | # Calculate Delaunay triangles
96 | def calculateDelaunayTriangles(rect, points):
97 | # Create subdiv
98 | subdiv = cv2.Subdiv2D(rect)
99 |
100 | # Insert points into subdiv
101 | for p in points:
102 | subdiv.insert((p[0], p[1]))
103 |
104 | # List of triangles. Each triangle is a list of 3 points ( 6 numbers )
105 | triangleList = subdiv.getTriangleList()
106 |
107 | # Find the indices of triangles in the points array
108 |
109 | delaunayTri = []
110 |
111 | for t in triangleList:
112 | pt = []
113 | pt.append((t[0], t[1]))
114 | pt.append((t[2], t[3]))
115 | pt.append((t[4], t[5]))
116 |
117 | pt1 = (t[0], t[1])
118 | pt2 = (t[2], t[3])
119 | pt3 = (t[4], t[5])
120 |
121 | if rectContains(rect, pt1) and rectContains(rect, pt2) and rectContains(rect, pt3):
122 | ind = []
123 | for j in range(0, 3):
124 | for k in range(0, len(points)):
125 | if abs(pt[j][0] - points[k][0]) < 1.0 and abs(pt[j][1] - points[k][1]) < 1.0:
126 | ind.append(k)
127 | if len(ind) == 3:
128 | delaunayTri.append((ind[0], ind[1], ind[2]))
129 |
130 | return delaunayTri
131 |
132 |
133 | def constrainPoint(p, w, h):
134 | p = (min(max(p[0], 0), w - 1), min(max(p[1], 0), h - 1))
135 | return p
136 |
137 |
138 | # Apply affine transform calculated using srcTri and dstTri to src and
139 | # output an image of size.
140 | def applyAffineTransform(src, srcTri, dstTri, size):
141 | # Given a pair of triangles, find the affine transform.
142 | warpMat = cv2.getAffineTransform(np.float32(srcTri), np.float32(dstTri))
143 |
144 | # Apply the Affine Transform just found to the src image
145 | dst = cv2.warpAffine(src, warpMat, (size[0], size[1]), None, flags=cv2.INTER_LINEAR,
146 | borderMode=cv2.BORDER_REFLECT_101)
147 |
148 | return dst
149 |
150 |
151 | # Warps and alpha blends triangular regions from img1 and img2 to img
152 | def warpTriangle(img1, img2, t1, t2):
153 | # Find bounding rectangle for each triangle
154 | r1 = cv2.boundingRect(np.float32([t1]))
155 | r2 = cv2.boundingRect(np.float32([t2]))
156 |
157 | # Offset points by left top corner of the respective rectangles
158 | t1Rect = []
159 | t2Rect = []
160 | t2RectInt = []
161 |
162 | for i in range(0, 3):
163 | t1Rect.append(((t1[i][0] - r1[0]), (t1[i][1] - r1[1])))
164 | t2Rect.append(((t2[i][0] - r2[0]), (t2[i][1] - r2[1])))
165 | t2RectInt.append(((t2[i][0] - r2[0]), (t2[i][1] - r2[1])))
166 |
167 | # Get mask by filling triangle
168 | mask = np.zeros((r2[3], r2[2], 3), dtype=np.float32)
169 | cv2.fillConvexPoly(mask, np.int32(t2RectInt), (1.0, 1.0, 1.0), 16, 0)
170 |
171 | # Apply warpImage to small rectangular patches
172 | img1Rect = img1[r1[1]:r1[1] + r1[3], r1[0]:r1[0] + r1[2]]
173 |
174 | size = (r2[2], r2[3])
175 |
176 | img2Rect = applyAffineTransform(img1Rect, t1Rect, t2Rect, size)
177 |
178 | img2Rect = img2Rect * mask
179 |
180 | # Copy triangular region of the rectangular patch to the output image
181 | img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] = img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] * (
182 | (1.0, 1.0, 1.0) - mask)
183 |
184 | img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] = img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] + img2Rect
185 |
186 |
187 | if __name__ == '__main__':
188 |
189 | avgface_proj_root = os.path.abspath(os.path.dirname(__file__))
190 | path = avgface_proj_root+'/../cspeople/scraped/full'
191 |
192 | # Dimensions of output image
193 | w = 600
194 | h = 600
195 |
196 | # Read points for all images
197 | allPoints = read_points(path)
198 |
199 | # Read all images
200 | images = read_images(path)
201 |
202 | # Eye corners
203 | eyecornerDst = [(np.int(0.3 * w), np.int(h / 3)), (np.int(0.7 * w), np.int(h / 3))]
204 |
205 | imagesNorm = []
206 | pointsNorm = []
207 |
208 | # Add boundary points for delaunay triangulation
209 | boundaryPts = np.array(
210 | [(0, 0), (w / 2, 0), (w - 1, 0), (w - 1, h / 2), (w - 1, h - 1), (w / 2, h - 1), (0, h - 1), (0, h / 2)])
211 |
212 | # Initialize location of average points to 0s
213 | pointsAvg = np.array([(0, 0)] * (len(allPoints[0]) + len(boundaryPts)), np.float32())
214 |
215 | n = len(allPoints[0])
216 |
217 | numImages = len(images)
218 |
219 | # Warp images and transform landmarks to output coordinate system,
220 | # and find average of transformed landmarks.
221 |
222 | for i in range(0, numImages):
223 | points1 = allPoints[i]
224 |
225 | # Corners of the eye in input image
226 | eyecornerSrc = [allPoints[i][36], allPoints[i][45]]
227 |
228 | # Compute similarity transform
229 | tform = similarityTransform(eyecornerSrc, eyecornerDst)
230 |
231 | # Apply similarity transformation
232 | img = cv2.warpAffine(images[i], tform, (w, h))
233 |
234 | # Apply similarity transform on points
235 | points2 = np.reshape(np.array(points1), (68, 1, 2))
236 |
237 | points = cv2.transform(points2, tform)
238 |
239 | points = np.float32(np.reshape(points, (68, 2)))
240 |
241 | # Append boundary points. Will be used in Delaunay Triangulation
242 | points = np.append(points, boundaryPts, axis=0)
243 |
244 | # Calculate location of average landmark points.
245 | pointsAvg = pointsAvg + points / numImages
246 |
247 | pointsNorm.append(points)
248 | imagesNorm.append(img)
249 |
250 | # Delaunay triangulation
251 | rect = (0, 0, w, h)
252 | dt = calculateDelaunayTriangles(rect, np.array(pointsAvg))
253 |
254 | # Output image
255 | output = np.zeros((h, w, 3), np.float32())
256 |
257 | # Warp input images to average image landmarks
258 | for i in range(0, len(imagesNorm)):
259 | img = np.zeros((h, w, 3), np.float32())
260 | # Transform triangles one by one
261 | for j in range(0, len(dt)):
262 | tin = []
263 | tout = []
264 |
265 | for k in range(0, 3):
266 | pIn = pointsNorm[i][dt[j][k]]
267 | pIn = constrainPoint(pIn, w, h)
268 |
269 | pOut = pointsAvg[dt[j][k]]
270 | pOut = constrainPoint(pOut, w, h)
271 |
272 | tin.append(pIn)
273 | tout.append(pOut)
274 |
275 | warpTriangle(imagesNorm[i], img, tin, tout)
276 |
277 | # Add image intensities for averaging
278 | output = output + img
279 |
280 | # Divide by numImages to get average
281 | output = output / numImages
282 |
283 | # Display result
284 | # cv2.imshow('image', output)
285 | avg_img = "images/averageface.jpg"
286 | cv2.imwrite(avg_img, 255 * output)
287 | print('Saved image to {}'.format(avg_img))
288 | # cv2.waitKey(0)
289 |
--------------------------------------------------------------------------------
/averageface/images/airflow-dag-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/airflow-dag-tree.png
--------------------------------------------------------------------------------
/averageface/images/averageface-csgrad.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/averageface-csgrad.jpg
--------------------------------------------------------------------------------
/averageface/images/barak-obama-landmarks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/barak-obama-landmarks.png
--------------------------------------------------------------------------------
/averageface/images/barak-obama.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/averageface/images/barak-obama.jpg
--------------------------------------------------------------------------------
/averageface/landmark.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | import dlib
4 | import os
5 |
6 |
7 | class LandmarkClassifier:
8 |
9 | cascade_path = "frontalface_default.xml"
10 | predictor_path = "shape_predictor_68_face_landmarks.dat"
11 | landmark_proj_root = os.path.abspath(os.path.dirname(__file__))
12 |
13 | def __init__(self, image_dir=landmark_proj_root+"/../cspeople/scraped/full"):
14 | # Create the haar cascade
15 | self.faceCascade = cv2.CascadeClassifier(self.cascade_path)
16 | # create the landmark predictor
17 | self.predictor = dlib.shape_predictor(self.predictor_path)
18 | # set image directory
19 | self.image_dir = image_dir
20 | print(self.image_dir)
21 |
22 | def face_detection(self, img):
23 | # convert the image to gray-scale
24 | img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
25 |
26 | # Detect faces in the image
27 | faces = self.faceCascade.detectMultiScale(
28 | img_gray,
29 | scaleFactor=1.05,
30 | minNeighbors=5,
31 | minSize=(100, 100),
32 | flags=cv2.CASCADE_SCALE_IMAGE
33 | )
34 |
35 | return faces
36 |
37 | def classify(self):
38 | folder = self.image_dir
39 | images = []
40 | for filename in os.listdir(folder):
41 | if filename.endswith('.jpg'):
42 | print('Processing {}.'.format(filename))
43 | img_path = os.path.join(folder, filename)
44 | img = cv2.imread(img_path)
45 | landmarks = self.get_landmarks(img)
46 | if landmarks is not None:
47 | self.write_to_file(landmarks, img_path)
48 | else:
49 | print('{} has no faces'.format(filename))
50 | # remove file as it has no faces
51 | os.remove(img_path)
52 |
53 | return images
54 |
55 | def get_landmarks(self, image):
56 |
57 | faces = self.face_detection(image)
58 |
59 | for (x, y, w, h) in faces:
60 |
61 | # Converting the OpenCV rectangle coordinates to Dlib rectangle
62 | dlib_rect = dlib.rectangle(int(x), int(y), int(x + w), int(y + h))
63 | detected_landmarks = self.predictor(image, dlib_rect).parts()
64 | landmarks = np.matrix([[p.x, p.y] for p in detected_landmarks])
65 |
66 | # should only have one face per image
67 | return landmarks
68 |
69 | def write_to_file(self, landmarks, img_path):
70 |
71 | # add txt to end of file to denote landmark representation of image
72 | file = open(img_path + '.txt', "w")
73 |
74 | for idx, point in enumerate(landmarks):
75 |
76 | file.write(str(point[0, 0]))
77 | file.write(" ")
78 | file.write(str(point[0, 1]))
79 | file.write("\n")
80 |
81 | file.close()
82 |
83 | return True
84 |
85 |
86 | if __name__ == '__main__':
87 |     LC = LandmarkClassifier()
88 |     LC.classify()
--------------------------------------------------------------------------------
/cspeople/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/cspeople/__init__.py
--------------------------------------------------------------------------------
/cspeople/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class ScrapytestItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
16 |
17 | class ProfilePictureItem(scrapy.Item):
18 | image_urls = scrapy.Field()
19 | images = scrapy.Field()
20 |
21 |
--------------------------------------------------------------------------------
/cspeople/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class CspeopleSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/cspeople/pipelines.py:
--------------------------------------------------------------------------------
1 | from scrapy.pipelines.images import ImagesPipeline
2 | from scrapy.http import Request
3 | from scrapy import log
4 | import hashlib
5 |
6 |
7 | class CSGradPeoplePipeline(ImagesPipeline):
8 |
9 | # use default file_path function for sha1
10 | # def file_path(self, request, response=None, info=None):
11 | # just use default
12 | # image_guid = hash(request.url)
13 | # log.msg(image_guid, level=log.DEBUG)
14 | # return 'cspeople/%s' % image_guid + '.jpg'
15 |
16 | def get_media_requests(self, item, info):
17 | yield Request(item['image_urls'][0], meta=item)
--------------------------------------------------------------------------------
/cspeople/scraped/scraped_images_go_here.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DansProjects/airflow-averageface/7f641fc498941c587b03c5c7130de86abf05baee/cspeople/scraped/scraped_images_go_here.txt
--------------------------------------------------------------------------------
/cspeople/settings.py:
--------------------------------------------------------------------------------
1 | import os
2 | # -*- coding: utf-8 -*-
3 |
4 | PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
5 | BOT_NAME = 'cspeople'
6 |
7 | SPIDER_MODULES = ['cspeople.spiders']
8 | NEWSPIDER_MODULE = 'cspeople.spiders'
9 |
10 | ITEM_PIPELINES = {
11 | 'cspeople.pipelines.CSGradPeoplePipeline': 1
12 | }
13 | IMAGES_STORE = PROJECT_ROOT + '/scraped/'
14 | FILES_STORE = PROJECT_ROOT + '/scraped/'
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'cspeople (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules, disabled because I am such a rebel.
20 | ROBOTSTXT_OBEY = False
21 |
22 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
23 | #CONCURRENT_REQUESTS = 32
24 |
25 | # Configure a delay for requests for the same website (default: 0)
26 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
27 | # See also autothrottle settings and docs
28 | #DOWNLOAD_DELAY = 3
29 | # The download delay setting will honor only one of:
30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
31 | #CONCURRENT_REQUESTS_PER_IP = 16
32 |
33 | # Disable cookies (enabled by default)
34 | #COOKIES_ENABLED = False
35 |
36 | # Disable Telnet Console (enabled by default)
37 | #TELNETCONSOLE_ENABLED = False
38 |
39 | # Override the default request headers:
40 | #DEFAULT_REQUEST_HEADERS = {
41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 | # 'Accept-Language': 'en',
43 | #}
44 |
45 | # Enable or disable spider middlewares
46 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
47 | #SPIDER_MIDDLEWARES = {
48 | # 'cspeople.middlewares.cspeopleSpiderMiddleware': 543,
49 | #}
50 |
51 | # Enable or disable downloader middlewares
52 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
53 | #DOWNLOADER_MIDDLEWARES = {
54 | # 'cspeople.middlewares.MyCustomDownloaderMiddleware': 543,
55 | #}
56 |
57 | # Enable or disable extensions
58 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
59 | #EXTENSIONS = {
60 | # 'scrapy.extensions.telnet.TelnetConsole': None,
61 | #}
62 |
63 | # Configure item pipelines
64 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
65 | #ITEM_PIPELINES = {
66 | # 'cspeople.pipelines.cspeoplePipeline': 300,
67 | #}
68 |
69 | # Enable and configure the AutoThrottle extension (disabled by default)
70 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
71 | #AUTOTHROTTLE_ENABLED = True
72 | # The initial download delay
73 | #AUTOTHROTTLE_START_DELAY = 5
74 | # The maximum download delay to be set in case of high latencies
75 | #AUTOTHROTTLE_MAX_DELAY = 60
76 | # The average number of requests Scrapy should be sending in parallel to
77 | # each remote server
78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG = False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED = True
85 | #HTTPCACHE_EXPIRATION_SECS = 0
86 | #HTTPCACHE_DIR = 'httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES = []
88 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
--------------------------------------------------------------------------------
/cspeople/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/cspeople/spiders/cs_grad_people_spider.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy.selector import Selector
3 | from cspeople.items import ProfilePictureItem
4 |
5 |
6 | class CSGradPeople(scrapy.Spider):
7 | name = "csgrad"
8 | # list of allowed domains
9 |     allowed_domains = ['www.cs.princeton.edu']
10 | start_urls = [
11 | 'http://www.cs.princeton.edu/people/grad',
12 | ]
13 |
14 | def parse(self, response):
15 | sel = Selector(response)
16 |
17 | xpath = '//div[@class="person-photo"]/img/@src | //div[@class="person-photo"]/a/img/@src'
18 | images = sel.xpath(xpath).extract()
19 |
20 | for image in images:
21 | item = ProfilePictureItem()
22 | item['image_urls'] = [response.urljoin(image)]
23 | yield item
24 |
--------------------------------------------------------------------------------
/dags/average_faces_pipeline.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.operators import PythonOperator, BashOperator
3 | from datetime import datetime, timedelta
4 | import os
5 |
6 | # Following are defaults which can be overridden later on
7 | default_args = {
8 | 'owner': 'Dan M',
9 | 'depends_on_past': False,
10 | 'start_date': datetime(2017, 10, 11),
11 | 'email_on_failure': False,
12 | 'email_on_retry': False,
13 | 'retries': 2,
14 | 'retry_delay': timedelta(minutes=1),
15 | }
16 |
17 | cspeople_scraper = '/Users/dmo/Documents/python/airflow'
18 | cspeople_scraper_path = '/Users/dmo/Documents/python/airflow/cspeople/scraped/full'
19 | averageface_path = '/Users/dmo/Documents/python/airflow/averageface/'
20 |
21 |
22 | dag = DAG('AverageFacePipeline', default_args=default_args)
23 |
24 |
25 | def clear_folder(dir_path=cspeople_scraper_path):
26 |
27 | file_list = os.listdir(dir_path)
28 | for file_name in file_list:
29 | if file_name.endswith('.jpg') or file_name.endswith('.txt'):
30 | os.remove(dir_path + "/" + file_name)
31 |
32 |
33 | def print_scrape_in_progress():
34 |     print('Scrape is in progress!')
35 |
36 | # delete all jpg and txt files in the scraped folder
37 | t1 = PythonOperator(
38 | task_id='clear_scrape_folder',
39 | python_callable=clear_folder,
40 | dag=dag)
41 |
42 | # TODO properly import python classes
43 | t2 = BashOperator(
44 | task_id='scrape_profile_images',
45 | bash_command='cd {} && scrapy crawl csgrad'.format(cspeople_scraper),
46 | dag=dag)
47 |
48 | t3 = PythonOperator(
49 | task_id='scrape_progress',
50 | python_callable=print_scrape_in_progress,
51 | dag=dag)
52 |
53 | t4 = BashOperator(
54 | task_id='create_landmarks',
55 | bash_command='cd {} && python landmark.py'.format(averageface_path),
56 | dag=dag)
57 |
58 | t5 = BashOperator(
59 | task_id='create_average_face',
60 | bash_command='cd {} && python averageface.py'.format(averageface_path),
61 | dag=dag)
62 |
63 | t2.set_upstream(t1)
64 | t3.set_upstream(t1)
65 | t4.set_upstream(t2)
66 | t4.set_upstream(t3)
67 | t5.set_upstream(t4)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Airflow pipeline
2 |
3 | This project uses Airflow, Scrapy, dlib, and OpenCV to build a data pipeline that scrapes profile images from a website
4 | and creates an averaged image of all the profiles. The steps are:
5 | 1. Clear the scraped-files folder ```cspeople/scraped/full```
6 | 2. Scrape images from the target site and save them in ```cspeople/scraped/full```
7 | 3. Use the dlib facial landmark predictor to detect 68 key points in each face and save the coordinates to a text file in ```cspeople/scraped/full```
8 | 4. Compute the average face of all pictures from the landmarks with OpenCV and save it in ```averageface/images```
9 |
10 | The Airflow DAG graph is below. "scrape_progress" was added just to experiment with dependencies.
11 | ![Airflow DAG tree](averageface/images/airflow-dag-tree.png)
12 |
13 | The facial landmark library (dlib) detects 68 feature points on each face, as shown below, and saves the coordinates to a text file.
14 |
15 | ![dlib facial landmarks](averageface/images/barak-obama-landmarks.png)
16 |
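For reference, ```landmark.py``` writes one landmark file per photo, named ```<image>.jpg.txt```, with one ```x y``` pair per line (68 lines). A made-up sample:

```
231 188
233 210
238 231
...
```
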
17 | (Simplified) the averaged landmark points and pixel intensities produce a composite of all of the profile images. Below is the actual
18 | output for the averaged face of a Princeton CS graduate student.
19 |
20 | ![Average face of a Princeton CS grad student](averageface/images/averageface-csgrad.jpg)
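
Under the hood, ```averageface.py``` aligns each face by its eye corners, warps it triangle by triangle onto the average landmark positions, and then takes the pixel-wise mean. A minimal sketch of that last step (the ```average_images``` helper and ```warped_images``` list are illustrative names, not the script's actual structure):

```python
import numpy as np

def average_images(warped_images):
    # pixel-wise mean of equally sized float32 images,
    # mirroring the final accumulation loop in averageface.py
    output = np.zeros_like(warped_images[0], dtype=np.float32)
    for img in warped_images:
        output += img
    return output / len(warped_images)
```
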
21 | 1. Copy the DAG file ```dags/average_faces_pipeline.py``` to your Airflow DAGs folder (wherever you initialized Airflow)
22 | 2. Change the ```cspeople_scraper```, ```cspeople_scraper_path```, and ```averageface_path``` variables to point at the directory
23 | that you cloned this project into (see the sketch below)
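
A minimal sketch of those variables (the ```/path/to/...``` value is a placeholder for your own clone location):

```python
# in dags/average_faces_pipeline.py -- adjust to wherever this repo was cloned
cspeople_scraper = '/path/to/airflow-averageface'                    # scrapy project root
cspeople_scraper_path = cspeople_scraper + '/cspeople/scraped/full'  # scraped images + landmark files
averageface_path = cspeople_scraper + '/averageface/'                # landmark.py and averageface.py
```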
24 |
25 | Start Airflow:
26 | ```
27 | airflow webserver -p 8080
28 | airflow scheduler
29 | ```
30 |
31 | Then run the ```AverageFacePipeline``` DAG, either from the Airflow UI or from the CLI as shown below.
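
For example, with the Airflow 1.x CLI (syntax assumes that major version):

```
airflow unpause AverageFacePipeline
airflow trigger_dag AverageFacePipeline
```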
32 |
33 | ##### Customizing data source
34 |
35 | Note: the scraper ```cspeople/spiders/cs_grad_people_spider.py``` is currently set to scrape ```http://www.cs.princeton.edu/people/grad```
36 | and uses an XPath expression to extract image URLs. To average a different data set, change the start URL and update the XPath
37 | (```//img/@src``` extracts every image on a page); see the example below. The application deletes any image in which it cannot detect a face.
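
For example (the URL below is purely illustrative, not a real target), the relevant lines in the spider would become:

```python
# cspeople/spiders/cs_grad_people_spider.py -- illustrative values only
allowed_domains = ['www.example.edu']
start_urls = ['https://www.example.edu/people/faculty']

# inside parse(): broaden the XPath to grab every image on the page
xpath = '//img/@src'
```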
38 |
39 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scrapy==1.4.0
2 | opencv-python~=3.3.0
3 | dlib==19.4
4 | apache-airflow
5 | numpy
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = cspeople.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = cspeople
12 |
--------------------------------------------------------------------------------