├── .gitignore ├── LICENSE.md ├── README.md ├── code ├── apple_device_data.csv ├── cam2screen.m ├── cm2pts.m ├── cropRepeatingEdge.m ├── faceGridFromFaceRect.m ├── faceGridFromParams.m ├── generateCrops.m ├── loadAllSubjects.m ├── loadAllSubjectsVectors.m ├── loadAppleDeviceData.m ├── loadSubject.m ├── pts2cm.m └── screen2cam.m ├── models ├── itracker_deploy.prototxt ├── itracker_solver.prototxt ├── itracker_train_val.prototxt ├── mean_images │ ├── mean_face_224.binaryproto │ ├── mean_face_224.mat │ ├── mean_left_224.binaryproto │ ├── mean_left_224.mat │ ├── mean_left_224_new.binaryproto │ ├── mean_right_224.binaryproto │ └── mean_right_224.mat └── snapshots │ ├── itracker25x_iter_92000.caffemodel │ └── itracker_iter_92000.caffemodel └── pytorch ├── ITrackerData.py ├── ITrackerModel.py ├── LICENSE.md ├── README.md ├── checkpoint.pth.tar ├── main.py ├── mean_face_224.mat ├── mean_left_224.mat ├── mean_right_224.mat ├── prepareDataset.py ├── reference_metadata.mat └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | /pytorch/__pycache__/*.pyc 2 | /pytorch/__pycache__ 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ### Copyright (c) 2017 - Kyle Krafka, Aditya Khosla, Petr Kellnhofer, Harini Kannan, Suchendra Bhandarkar, Wojciech Matusik, and Antonio Torralba. 2 | 3 | ## LICENSE AGREEMENT FOR USE OF GAZECAPTURE DATABASE AND ITRACKER MODELS 4 | 5 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this License Agreement for Use of GazeCapture Database and iTracker Models ("Research License"). To the extent this Research License may be interpreted as a contract, You are granted the rights mentioned below in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 6 | 7 | ### Section 1 – Definitions 8 | 9 | a. __Licensor__ means the individual(s) or entity(ies) granting rights under this Research License. 10 | 11 | b. __You__ means the individual or entity exercising the Licensed Rights under this Research License. Your has a corresponding meaning. 12 | 13 | c. __Licensed Material__ refers to the GazeCapture database, iTracker models, and any related source. These contain eye-tracking data captured on mobile devices and machine learning models to predict where individuals are looking on these devices. 14 | 15 | ### Section 2 – Scope 16 | 17 | 1. Licensor desires to grant a license to You for the use of the Licensed Material. This license will in no case be considered a transfer of the Licensed Material. 18 | 19 | 2. You shall have no rights with respect to the Licensed Material or any portion thereof and shall not use the Licensed Material except as expressly set forth in this Agreement. 20 | 21 | 3. Subject to the terms and conditions of this Agreement, Licensor hereby grants to You for research use only, a royalty-free, non-exclusive, non-transferable, license subject to the following conditions: 22 | 23 | * The Licensed Material is only for Your research use and, in a need-to-know basis, of those direct research colleagues who belong to the same research institution as You and have adhered to the terms of this license. 
24 | 25 | * The Licensed Material will not be copied nor distributed in any form other than for Your backup. 26 | * The Licensed Material will only be used for research purposes and will not be used nor included in commercial applications in any form (such as original files, encrypted files, files containing extracted features, models trained on dataset, other derivative works, etc). 27 | * Any work made public, whatever the form, based directly or indirectly on any part of the Licensed Material must include the following reference: 28 | 29 | > Kyle Krafka, Aditya Khosla, Petr Kellnhofer, Harini Kannan, Suchi Bhandarkar, Wojciech Matusik and Antonio Torralba. “Eye Tracking for Everyone”. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2016. 30 | 31 | 4. Licensor complies with the State of Massachusetts legislation in force. It is Your, and only yours, to comply with all the data protection laws that may affect You. 32 | 33 | ### Section 3 – Disclaimer of Warranties and Limitation of Liability 34 | a. Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. 35 | 36 | b. To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Research License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. 37 | 38 | c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 39 | 40 | ### Section 4 – Term and Termination 41 | a. If You fail to comply with this Research License, then Your rights under this Research License terminate automatically. 42 | 43 | b. Where Your right to use the Licensed Material has terminated under Section 4(a), it reinstates: 44 | 45 | 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 46 | 47 | 2. upon express reinstatement by the Licensor. 48 | 49 | For the avoidance of doubt, this Section 4(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Research License. 50 | 51 | c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Research License. 52 | 53 | d. Sections 1, 3, 4, 5 and 6 survive termination of this Research License. 54 | 55 | ### Section 5 – Other Terms and Conditions 56 | 57 | a. 
The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 58 | 59 | b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Research License. 60 | 61 | ### Section 6 – Interpretation 62 | 63 | a. For the avoidance of doubt, this Research License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Research License. 64 | 65 | b. To the extent possible, if any provision of this Research License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Research License without affecting the enforceability of the remaining terms and conditions. 66 | 67 | c. No term or condition of this Research License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 68 | 69 | d. Nothing in this Research License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 70 | 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Eye Tracking for Everyone Code, Dataset and Models 2 | 3 | ## Introduction 4 | This is the README file for the official code, dataset and model release associated with the 2016 CVPR paper, "Eye Tracking for Everyone". 5 | 6 | The dataset release is broken up into three parts: 7 | 8 | * **Data** (image files and associated metadata) 9 | * **Models** (Caffe model definitions) 10 | * **Code** (some essential scripts to make use of the data) 11 | 12 | Continue reading for more information on each part. 13 | 14 | ## History 15 | Any necessary changes to the dataset will be documented here. 16 | 17 | * **March 2017**: Original code, dataset and models released. 18 | 19 | ## Usage 20 | Usage of this dataset (including all data, models, and code) is subject to the associated license, found in [LICENSE.md](LICENSE.md). The license permits the use of released code, dataset and models for research purposes only. 21 | 22 | We also ask that you cite the associated paper if you make use of this dataset; following is the BibTeX entry: 23 | 24 | ``` 25 | @inproceedings{cvpr2016_gazecapture, 26 | Author = {Kyle Krafka and Aditya Khosla and Petr Kellnhofer and Harini Kannan and Suchendra Bhandarkar and Wojciech Matusik and Antonio Torralba}, 27 | Title = {Eye Tracking for Everyone}, 28 | Year = {2016}, 29 | Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)} 30 | } 31 | ``` 32 | 33 | ## Data 34 | The dataset can be downloaded at the [project website](http://gazecapture.csail.mit.edu/download.php). In the dataset, we include data for 1474 unique subjects. Each numbered directory represents a recording session from one of those subjects. Numbers were assigned sequentially, although some numbers are missing for various reasons (e.g., test recordings, duplicate subjects, or incomplete uploads). 
35 | 36 | Inside each directory is a collection of sequentially-numbered images (in the `frames` subdirectory) and JSON files for different pieces of metadata, described below. Many of the variables in the JSON files are arrays, where each element is associated with the frame numbered the same as the index. 37 | 38 | In training our iTracker model, we only made use of frames where the subject's device was able to detect both the user's [face](https://developer.apple.com/reference/avfoundation/avcapturemetadataoutputobjectsdelegate) and [eyes](https://developer.apple.com/reference/coreimage/cidetector) using Apple's built-in libraries. Some subjects had *no* frames with face and eye detections at all. There are 2,445,504 total frames and 1,490,959 with complete Apple detections. For this reason, some frames will be "missing" generated data. 39 | 40 | The dataset is split into three pieces, by subject (i.e., recording number): training, validation, and test. 41 | 42 | Following is a description of each variable: 43 | 44 | ### appleFace.json, appleLeftEye.json, appleRightEye.json 45 | These files describe bounding boxes around the detected face and eyes, logged at recording time using Apple libraries. "Left eye" refers to the subject's physical left eye, which appears on the right side of the image. 46 | 47 | - `X`, `Y`: Position of the top-left corner of the bounding box (in pixels). In `appleFace.json`, this value is relative to the top-left corner of the full frame; in `appleLeftEye.json` and `appleRightEye.json`, it is relative to the top-left corner of the *face crop*. 48 | - `W`, `H`: Width and height of the bounding box (in pixels). 49 | - `IsValid`: Whether or not there was actually a detection. 1 = detection; 0 = no detection. 50 | 51 | ### dotInfo.json 52 | - `DotNum`: Sequence number of the dot (starting from 0) being displayed during that frame. 53 | - `XPts`, `YPts`: Position of the center of the dot (in points; see `screen.json` documentation below for more information on this unit) from the top-left corner of the screen. 54 | - `XCam`, `YCam`: Position of the center of the dot in our prediction space. The position is measured in centimeters and is relative to the camera center, assuming the camera remains in a fixed position in space across all device orientations. I.e., `YCam` values will be negative for portrait mode frames (`Orientation` == 1) since the screen is below the camera, but values will be positive in upside-down portrait mode (`Orientation` == 2) since the screen is above the camera. See Section 4.1 and Figure 6 for more information. 55 | - `Time`: Time (in seconds) since the displayed dot first appeared on the screen. 56 | 57 | ### faceGrid.json 58 | These values describe the "face grid" input features, which were generated from the Apple face detections. Within a 25 x 25 grid of 0 values, these parameters describe where to draw in a box of 1 values to represent the position and size of the face within the frame. 59 | 60 | - `X`, `Y`: Position of the top-left corner of the face box (1-indexed, within a 25 x 25 grid). 61 | - `W`, `H`: Width and height of the face box. 62 | - `IsValid`: Whether the data is valid (1) or not (0). This is equivalent to the intersection of the associated `IsValid` arrays in the apple*.json files (since we required samples to have Apple face and eye detections). 63 | 64 | ### frames.json 65 | The filenames of the frames in the `frames` directory. 
This information may also be generated from a sequence number counting from 0 to `TotalFrames` - 1 (see `info.json`). 66 | 67 | ### info.json 68 | - `TotalFrames`: The total number of frames for this subject. 69 | - `NumFaceDetections`: The number of frames in which a face was detected. 70 | - `NumEyeDetections`: The number of frames in which eyes were detected. 71 | - `Dataset`: "train," "val," or "test." 72 | - `DeviceName`: The name of the device used in the recording. 73 | 74 | ### motion.json 75 | A stream of motion data (accelerometer, gyroscope, and magnetometer) recorded at 60 Hz, only while frames were being recorded. See Apple's [CMDeviceMotion](https://developer.apple.com/reference/coremotion/cmdevicemotion) class for a description of the values. `DotNum` (counting from 0) and `Time` (in seconds, from the beginning of that dot's recording) are recorded as well. 76 | 77 | ### screen.json 78 | - `H`, `W`: Height and width of the active screen area of the app (in points). This allows us to account for the iOS "Display Zoom" feature (which was used by some subjects) as well as larger status bars (e.g., when a Personal Hotspot is enabled) and split screen views (which was not used by any subjects). See [this](https://developer.apple.com/library/content/documentation/2DDrawing/Conceptual/DrawingPrintingiOS/GraphicsDrawingOverview/GraphicsDrawingOverview.html) and [this](https://www.paintcodeapp.com/news/ultimate-guide-to-iphone-resolutions) page for more information on the unit "points." 79 | - `Orientation`: The orientation of the interface, as described by the enumeration [UIInterfaceOrientation](https://developer.apple.com/reference/uikit/uiinterfaceorientation), where: 80 | - 1: portrait 81 | - 2: portrait, upside down (iPad only) 82 | - 3: landscape, with home button on the right 83 | - 4: landscape, with home button on the left 84 | 85 | ## Models 86 | In the `models` directory, we provide files compatible with [Caffe](http://caffe.berkeleyvision.org/), the deep learning framework. Following are descriptions of the included files: 87 | 88 | - *itracker_train_val.prototxt*: The iTracker architecture. See comments in the file for more information. 89 | - *itracker_deploy.prototxt*: The iTracker architecture expressed in a format suitable for inference (whereas itracker_train_val.prototxt is used for training). 90 | - *itracker_solver.prototxt*: The solver configuration describing how to train the model. 91 | - *mean_images/*: Directory containing 224x224 mean images (in Caffe binaryproto format and MATLAB mat format). These were produced by averaging all training images for each of the left eye, right eye, and face images. 92 | - *snapshots/itracker_iter_92000.caffemodel*: Model parameters after having trained 92,000 iterations, using the original dataset. 93 | - *snapshots/itracker25x_iter_92000.caffemodel*: Model parameters after having trained 92,000 iterations, using the 25x augmented dataset. 94 | 95 | ## Code 96 | We provide some sample code to help you get started using the dataset. Below is a high-level overview, but see individual files for more documentation. Most files are MATLAB scripts/functions. 97 | 98 | - `loadSubject.m`, `loadAllSubjects.m`: Loads metadata from JSON files into MATLAB structs. This requires the [gason MATLAB wrapper](https://github.com/pdollar/coco/tree/master/MatlabAPI) to parse JSON. 
Note that this struct format is currently only used in a few scripts; others expect same-sized vectors for each piece of metadata and will require some data processing. 99 | - `generateCrops.m`: This will generate the cropped face and eye images required to train iTracker. You must edit the script path to point to the root of the dataset. New images will be saved in subdirectories under each subject. 100 | - `cropRepeatingEdge.m`: Crops an image, repeating edges if the cropped area goes outside of the original image bounds. (Face bounding boxes sometimes extend beyond the frame.) We use this script to mimic the behavior of [imageByClampingToExtent](https://developer.apple.com/reference/coreimage/ciimage/1437628-imagebyclampingtoextent), which we used in the GazeCapture app, and to provide something more natural than black pixels when training the network with fixed-size centered face images. 101 | - `cam2screen.m`, `screen2cam.m`, `cm2pts.m`, `pts2cm.m`: Transformation functions to move between iOS measurements (points), metric measurements (centimeters), and our prediction space. Measurements in the GazeCapture dataset are already included in different formats, but these will be useful for additional processing. 102 | - `apple_device_data.csv`, `loadAppleDeviceData.m`: The CSV file includes measurements we use to determine the position of the center of the camera relative to the screen. We derived these measurements from Apple's Device Dimensional Drawings in their [Accessory Design Guidelines (PDF)](https://developer.apple.com/accessories/Accessory-Design-Guidelines.pdf). The script can be used to load this CSV into your MATLAB workspace. 103 | - `faceGridFromParams.m`: Transform the compact, parameterized version of the face grid (included in metadata) into the actual feature representation (flattened binary mask) used in iTracker. 104 | - `faceGridFromFaceRect.m`: Generate a face grid (either parameterized or the full representation) given a face bounding box within a frame. Parameterized face grids are already included in the metadata, but this is useful if you have new face detections to use. 105 | 106 | Please feel free to contact us if you find any issues with these scripts or would like to request any additional code. 107 | 108 | ## Contact 109 | 110 | Please email any questions or comments to [gazecapture@gmail.com](mailto:gazecapture@gmail.com). 
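## Example Usage

The following is a small, hedged sketch of how the scripts above can be combined. It assumes the `code` directory is on your MATLAB path, the gason wrapper is installed, and `loadAllSubjectsVectors.m` has been run (with `baseDirectory` edited) so that the `gc`-prefixed vectors are in your workspace; it is an illustration only, not part of the original release.

```
% Assumes loadAllSubjectsVectors.m has already been run, so the gc* vectors exist.
% Reproduce the XCam/YCam labels (cm from the camera) from the dot positions in points.
[xCam, yCam] = screen2cam(gcDotXPts, gcDotYPts, gcOrientation, ...
    gcDeviceName, gcScreenW, gcScreenH);
err = hypot(xCam - gcDotXCam, yCam - gcDotYCam);
valid = gcAppleValid & ~isnan(err);  % err is NaN where the device was not recognized
fprintf('Median round-trip error: %.4f cm\n', median(err(valid)));

% Turn the [X Y W H] face grid parameters into the flattened 25 x 25 binary
% masks that iTracker uses as its face grid input.
gridValid = all(~isnan(gcFaceGridParams), 2);
faceGrids = faceGridFromParams(gcFaceGridParams(gridValid, :));
```

Because `screen2cam.m` applies the same device geometry that was used to produce `XCam`/`YCam`, the round-trip error should be close to zero for recognized devices.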
111 | -------------------------------------------------------------------------------- /code/apple_device_data.csv: -------------------------------------------------------------------------------- 1 | DeviceName,DeviceCameraToScreenXMm,DeviceCameraToScreenYMm,DeviceCameraXMm,DeviceCameraYMm,DevicePixelsPerInch,DeviceScreenXMm,DeviceScreenYMm,DeviceScreenWidthMm,DeviceScreenWidthPoints,DeviceScreenWidthPointsZoomed,DeviceScreenHeightMm,DeviceScreenHeightPoints,DeviceScreenHeightPointsZoomed 2 | "iPhone 6s Plus",23.54,8.66,28.33,9.68,401,4.79,18.34,68.36,414,375,121.54,736,667 3 | "iPhone 6s",18.61,8.04,22.92,9.08,326,4.31,17.12,58.49,375,320,104.05,667,568 4 | "iPhone 6 Plus",23.54,8.65,28.25,9.61,401,4.71,18.26,68.36,414,375,121.54,736,667 5 | "iPhone 6",18.61,8.03,22.85,9.01,326,4.24,17.04,58.5,375,320,104.05,667,568 6 | "iPhone 5s",25.85,10.65,29.28,6.07,326,3.43,16.72,51.7,320,NaN,90.39,568,NaN 7 | "iPhone 5c",25.85,10.64,29.59,6.38,326,3.74,17.02,51.7,320,NaN,90.39,568,NaN 8 | "iPhone 5",25.85,10.65,29.28,6.07,326,3.43,16.72,51.7,320,NaN,90.39,568,NaN 9 | "iPhone 4s",14.96,9.78,19.27,10.35,326,4.31,20.13,49.92,320,NaN,74.88,480,NaN 10 | "iPad Mini",60.7,8.7,67.4,10.7,326,6.7,19.4,121.3,768,NaN,161.2,1024,NaN 11 | "iPad Air 2",76.86,7.37,84.74,11.07,264,7.88,18.44,153.71,768,NaN,203.11,1024,NaN 12 | "iPad Air",74.4,9.9,84.7,11.1,264,10.3,21,149,768,NaN,198.1,1024,NaN 13 | "iPad 4",74.5,10.5,92.9,11.1,264,18.4,21.6,149,768,NaN,198.1,1024,NaN 14 | "iPad 3",74.5,10.5,92.9,11.1,132,18.4,21.6,149,768,NaN,198.1,1024,NaN 15 | "iPad 2",74.5,10.5,92.9,11.1,132,18.4,21.6,149,768,NaN,198.1,1024,NaN 16 | "iPad Pro",98.31,10.69,110.29,11.08,264,11.99,21.77,196.61,1024,768,262.15,1366,1024 -------------------------------------------------------------------------------- /code/cam2screen.m: -------------------------------------------------------------------------------- 1 | % cam2screen.m 2 | % 3 | % Convert physcial coordinates (in centimeters) from the camera (i.e., in 4 | % our prediction space) to screen coordinates (in points, or centimeters 5 | % depending on the useCm argument). useCm defaults to false, but in 6 | % practice, we typically set it to true. See screen2cam.m for more 7 | % information; this function behaves similarly. 8 | function [xScreen, yScreen] = cam2screen(xCam, yCam, orientation, ... 9 | device, screenW, screenH, useCm) 10 | 11 | if nargin < 7 12 | useCm = false; 13 | end 14 | 15 | loadAppleDeviceData; 16 | 17 | processed = false(size(xCam)); 18 | xScreen = NaN(size(xCam)); 19 | yScreen = NaN(size(yCam)); 20 | 21 | % First, convert input to millimeters to be compatible with 22 | % apple_device_data.csv. 23 | xCam = xCam * 10; 24 | yCam = yCam * 10; 25 | 26 | % Process device by device. 27 | for i = 1:length(deviceName) 28 | curr = strcmpi(device, deviceName(i)); 29 | xCurr = xCam(curr); 30 | yCurr = yCam(curr); 31 | oCurr = orientation(curr); 32 | o1 = oCurr == 1; 33 | o2 = oCurr == 2; 34 | o3 = oCurr == 3; 35 | o4 = oCurr == 4; 36 | if ~useCm 37 | screenWCurr = screenW(curr); 38 | screenHCurr = screenH(curr); 39 | end 40 | 41 | % Transform so that measurements are relative to the device's origin 42 | % (depending on its orientation). 
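    % Orientation follows UIInterfaceOrientation as logged in screen.json:
    % 1 = portrait, 2 = portrait upside down, 3 = landscape with the home
    % button on the right, 4 = landscape with the home button on the left.
    % dX/dY are the camera-to-screen offsets and dW/dH the physical screen
    % dimensions, all in millimeters, from apple_device_data.csv.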
43 | dX = deviceCameraToScreenXMm(i); 44 | dY = deviceCameraToScreenYMm(i); 45 | dW = deviceScreenWidthMm(i); 46 | dH = deviceScreenHeightMm(i); 47 | xCurr(o1) = xCurr(o1) + dX; 48 | yCurr(o1) = -yCurr(o1) - dY; 49 | xCurr(o2) = xCurr(o2) - dX + dW; 50 | yCurr(o2) = -yCurr(o2) + dY + dH; 51 | xCurr(o3) = xCurr(o3) - dY; 52 | yCurr(o3) = -yCurr(o3) - dX + dW; 53 | xCurr(o4) = xCurr(o4) + dY + dH; 54 | yCurr(o4) = -yCurr(o4) + dX; 55 | 56 | if ~useCm 57 | % Convert from mm to screen points. 58 | xCurr(o1 | o2) = xCurr(o1 | o2) .* (screenWCurr(o1 | o2) ./ dW); 59 | yCurr(o1 | o2) = yCurr(o1 | o2) .* (screenHCurr(o1 | o2) ./ dH); 60 | xCurr(o3 | o4) = xCurr(o3 | o4) .* (screenWCurr(o3 | o4) ./ dH); 61 | yCurr(o3 | o4) = yCurr(o3 | o4) .* (screenHCurr(o3 | o4) ./ dW); 62 | end 63 | 64 | % Store the results. 65 | xScreen(curr) = xCurr; 66 | yScreen(curr) = yCurr; 67 | 68 | processed = processed | curr; 69 | end 70 | 71 | if ~all(processed) 72 | warning(['The following devices were not recognized. Expect NaN ' ... 73 | 'return values.']); 74 | disp(unique(device(~processed))); 75 | end 76 | 77 | if useCm 78 | % Convert from mm to centimeters. 79 | xScreen = xScreen / 10; 80 | yScreen = yScreen / 10; 81 | end 82 | 83 | end 84 | -------------------------------------------------------------------------------- /code/cm2pts.m: -------------------------------------------------------------------------------- 1 | % cm2pts.m 2 | % 3 | % Convert screen coordinates in centimeters to screen coordinates in 4 | % points. See the documentation in screen2cam.m for more information; this 5 | % function behaves similarly. 6 | function [xPts, yPts] = cm2pts(xCm, yCm, orientation, device, screenW, ... 7 | screenH) 8 | 9 | loadAppleDeviceData; 10 | 11 | % First, convert cm to mm. 12 | xCm = xCm * 10; 13 | yCm = yCm * 10; 14 | 15 | processed = false(size(xCm)); 16 | xPts = NaN(size(xCm)); 17 | yPts = NaN(size(yCm)); 18 | 19 | % Process device by device. 20 | for i = 1:length(deviceName) 21 | curr = strcmpi(device, deviceName(i)); 22 | xCurr = xCm(curr); 23 | yCurr = yCm(curr); 24 | oCurr = orientation(curr); 25 | o1 = oCurr == 1; 26 | o2 = oCurr == 2; 27 | o3 = oCurr == 3; 28 | o4 = oCurr == 4; 29 | screenWCurr = screenW(curr); 30 | screenHCurr = screenH(curr); 31 | 32 | % NOTE: This assumes the active screen area is the full screen. This is 33 | % always the case in GazeCapture. Using the active screen area allows us 34 | % to account for Display Zoom. 35 | xCurr(o1 | o2) = ... 36 | xCurr(o1 | o2) .* (screenWCurr(o1 | o2) ./ deviceScreenWidthMm(i)); 37 | yCurr(o1 | o2) = ... 38 | yCurr(o1 | o2) .* (screenHCurr(o1 | o2)) ./ deviceScreenHeightMm(i); 39 | xCurr(o3 | o4) = ... 40 | xCurr(o3 | o4) .* (screenWCurr(o3 | o4)) ./ deviceScreenHeightMm(i); 41 | yCurr(o3 | o4) = ... 42 | yCurr(o3 | o4) .* (screenHCurr(o3 | o4)) ./ deviceScreenWidthMm(i); 43 | 44 | % Store the results. 45 | xPts(curr) = xCurr; 46 | yPts(curr) = yCurr; 47 | 48 | processed = processed | curr; 49 | end 50 | 51 | if ~all(processed) 52 | warning('The following devices were not recognized. Expect NaN return values.'); 53 | disp(unique(device(~processed))); 54 | end 55 | 56 | end 57 | -------------------------------------------------------------------------------- /code/cropRepeatingEdge.m: -------------------------------------------------------------------------------- 1 | % cropRepeatingEdge.m 2 | % 3 | % Discretely crop an image and allow for going beyond image boundaries by 4 | % repeating edge pixels. 
Images beyond the corners (i.e., beyond both the 5 | % width and the height of the image) will have the closest corner color. 6 | % The crop rectangle should have the format: [x y w h]. If no content is 7 | % visible, a warning is issued and NaN is returned. Note: This could also 8 | % be done with padarray but this interface is nice for our purposes. 9 | 10 | function output = cropRepeatingEdge(image, rect) 11 | 12 | cropX = rect(1); 13 | cropY = rect(2); 14 | cropW = rect(3); 15 | cropH = rect(4); 16 | 17 | output = uint8(zeros(cropH, cropW, size(image, 3))); 18 | 19 | leftPadding = max(0, 1 - cropX); 20 | topPadding = max(0, 1 - cropY); 21 | rightPadding = max((cropX + cropW - 1) - size(image, 2), 0); 22 | bottomPadding = max((cropY + cropH - 1) - size(image, 1), 0); 23 | 24 | % Copy content. 25 | contentOutPixelsY = 1 + topPadding : cropH - bottomPadding; 26 | contentOutPixelsX = 1 + leftPadding : cropW - rightPadding; 27 | contentInPixelsY = cropY + topPadding : cropY + cropH - 1 - bottomPadding; 28 | contentInPixelsX = cropX + leftPadding : cropX + cropW - 1 - rightPadding; 29 | output(contentOutPixelsY, contentOutPixelsX, :) ... 30 | = image(contentInPixelsY, contentInPixelsX, :); 31 | 32 | % Check that the crop overlaps the source image; warn and return NaN if not. 33 | if numel(contentOutPixelsX) == 0 34 | warning('No out pixels in x direction.'); 35 | output = NaN; 36 | return; 37 | end 38 | if numel(contentOutPixelsY) == 0 39 | warning('No out pixels in y direction.'); 40 | output = NaN; 41 | return; 42 | end 43 | 44 | % Pad directly above and below image. 45 | output(1:topPadding, contentOutPixelsX, :) = ... 46 | repmat(output(contentOutPixelsY(1), contentOutPixelsX, :), ... 47 | [topPadding, 1, 1]); 48 | output(end + 1 - bottomPadding:end, contentOutPixelsX, :) = ... 49 | repmat(output(contentOutPixelsY(end), contentOutPixelsX, :), ... 50 | [bottomPadding, 1, 1]); 51 | 52 | % Pad to the left and right. 53 | output(:, 1:leftPadding, :) = ... 54 | repmat(output(:, contentOutPixelsX(1), :), [1, leftPadding, 1]); 55 | output(:, end + 1 - rightPadding:end, :) = ... 56 | repmat(output(:, contentOutPixelsX(end), :), [1, rightPadding, 1]); 57 | 58 | end 59 | 60 | -------------------------------------------------------------------------------- /code/faceGridFromFaceRect.m: -------------------------------------------------------------------------------- 1 | % faceGridFromFaceRect.m 2 | % 3 | % Given face detection data, generate face grid data. 4 | % 5 | % Input Parameters: 6 | % - frameW/H: The frame in which the detections exist 7 | % - gridW/H: The size of the grid (typically same aspect ratio as the 8 | % frame, but much smaller) 9 | % - labelFaceX/Y/W/H: The face detection (x and y are 0-based image 10 | % coordinates) 11 | % - parameterized: Whether to actually output the grid or just the 12 | % [x y w h] of the 1s square within the gridW x gridH grid. 13 | 14 | function labelFaceGrid = faceGridFromFaceRect(frameW, frameH, gridW, ... 15 | gridH, labelFaceX, labelFaceY, labelFaceW, labelFaceH, parameterized) 16 | 17 | scaleX = gridW / frameW; 18 | scaleY = gridH / frameH; 19 | numSamples = length(labelFaceW); 20 | if parameterized 21 | labelFaceGrid = zeros(numSamples, 4); 22 | else 23 | labelFaceGrid = zeros(numSamples, gridW * gridH); 24 | end 25 | 26 | for i=1:numSamples 27 | grid = zeros(gridH, gridW); 28 | 29 | % Use one-based image coordinates. 
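    % (labelFaceX/Y are 0-based pixel coordinates in the frame, so scaling
    % by gridW/frameW and gridH/frameH and then adding 1 maps the face box
    % into 1-based cells of the gridW x gridH grid.)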
30 | xLo = round(labelFaceX(i) * scaleX) + 1; 31 | yLo = round(labelFaceY(i) * scaleY) + 1; 32 | w = round(labelFaceW(i) * scaleX); 33 | h = round(labelFaceH(i) * scaleY); 34 | 35 | if parameterized 36 | labelFaceGrid(i, :) = [xLo yLo w h]; 37 | else 38 | xHi = xLo + w - 1; 39 | yHi = yLo + h - 1; 40 | 41 | % Clamp the values in the range. 42 | xLo = min(gridW, max(1, xLo)); 43 | xHi = min(gridW, max(1, xHi)); 44 | yLo = min(gridH, max(1, yLo)); 45 | yHi = min(gridH, max(1, yHi)); 46 | 47 | grid(yLo:yHi, xLo:xHi) = ones(yHi - yLo + 1, xHi - xLo + 1); 48 | 49 | % Flatten the grid. 50 | grid = grid'; 51 | grid = grid(:)'; 52 | labelFaceGrid(i, :) = grid; 53 | end 54 | end 55 | 56 | end 57 | -------------------------------------------------------------------------------- /code/faceGridFromParams.m: -------------------------------------------------------------------------------- 1 | % faceGridFromParams.m 2 | % 3 | % Given face grid parameters (and optionally size, which is 25 for 4 | % GazeCapture data), return the flattened face grid. 5 | 6 | function labelFaceGrid = ... 7 | faceGridFromParams(labelFaceGridParams, gridW, gridH) 8 | 9 | if nargin < 2 10 | gridW = 25; 11 | gridH = 25; 12 | end 13 | 14 | numSamples = size(labelFaceGridParams, 1); 15 | labelFaceGrid = zeros(numSamples, gridW * gridH); 16 | for i = 1:numSamples 17 | grid = zeros(gridH, gridW); 18 | 19 | xLo = labelFaceGridParams(i, 1); 20 | yLo = labelFaceGridParams(i, 2); 21 | w = labelFaceGridParams(i, 3); 22 | h = labelFaceGridParams(i, 4); 23 | 24 | xHi = xLo + w - 1; 25 | yHi = yLo + h - 1; 26 | 27 | % Clip the values to the range. 28 | xLo = min(gridW, max(1, xLo)); 29 | xHi = min(gridW, max(1, xHi)); 30 | yLo = min(gridH, max(1, yLo)); 31 | yHi = min(gridH, max(1, yHi)); 32 | 33 | grid(yLo:yHi, xLo:xHi) = ones(yHi - yLo + 1, xHi - xLo + 1); 34 | 35 | % Flatten the grid. 36 | grid = grid'; 37 | grid = grid(:)'; 38 | labelFaceGrid(i, :) = grid; 39 | end 40 | 41 | end 42 | -------------------------------------------------------------------------------- /code/generateCrops.m: -------------------------------------------------------------------------------- 1 | % generateCrops.m 2 | % This script generates all of the image crops required to train iTracker. 3 | % It will create three subdirectories in each subject directory: 4 | % "appleFace," "appleLeftEye," and "appleRightEye." 5 | 6 | baseDirectory = '/path/to/data'; 7 | if ~exist(baseDirectory, 'dir') 8 | error(['The specified base directory does not exist. Please edit ' ... 9 | 'the script to specify the root of the numbered subject ' ... 10 | 'directories.']); 11 | end 12 | 13 | subjectDirs = dir(baseDirectory); 14 | for currSubject = subjectDirs' 15 | % Valid subject directories have five-digit numbers. 16 | if ~currSubject.isdir || length(currSubject.name) ~= 5 || ... 
17 | ~all(isstrprop(currSubject.name, 'digit')) 18 | continue; 19 | end 20 | disp(['Processing subject ' currSubject.name '...']) 21 | subjectDir = fullfile(baseDirectory, currSubject.name); 22 | s = loadSubject(subjectDir); 23 | appleFaceDir = fullfile(subjectDir, 'appleFace'); 24 | appleLeftEyeDir = fullfile(subjectDir, 'appleLeftEye'); 25 | appleRightEyeDir = fullfile(subjectDir, 'appleRightEye'); 26 | mkdir(appleFaceDir); 27 | mkdir(appleLeftEyeDir); 28 | mkdir(appleRightEyeDir); 29 | 30 | for i = 1:length(s.frames) 31 | frameFilename = s.frames{i}; 32 | frame = imread(fullfile(subjectDir, 'frames', frameFilename)); 33 | % iTracker requires we have face and eye detections; we don't save 34 | % any if we don't have all three. 35 | if isnan(s.appleFace.x(i)) || isnan(s.appleLeftEye.x(i)) || isnan(s.appleRightEye.x(i)) 36 | continue; 37 | end 38 | faceImage = cropRepeatingEdge(frame, round([s.appleFace.x(i) s.appleFace.y(i) s.appleFace.w(i) s.appleFace.h(i)])); 39 | leftEyeImage = cropRepeatingEdge(faceImage, round([s.appleLeftEye.x(i) s.appleLeftEye.y(i) s.appleLeftEye.w(i) s.appleLeftEye.h(i)])); 40 | rightEyeImage = cropRepeatingEdge(faceImage, round([s.appleRightEye.x(i) s.appleRightEye.y(i) s.appleRightEye.w(i) s.appleRightEye.h(i)])); 41 | imwrite(faceImage, fullfile(appleFaceDir, frameFilename)); 42 | imwrite(leftEyeImage, fullfile(appleLeftEyeDir, frameFilename)); 43 | imwrite(rightEyeImage, fullfile(appleRightEyeDir, frameFilename)); 44 | end 45 | end -------------------------------------------------------------------------------- /code/loadAllSubjects.m: -------------------------------------------------------------------------------- 1 | % loadAllSubjects.m 2 | % 3 | % Loads all subject metadata into memory given the base path. 4 | 5 | function subjects = loadAllSubjects(base_data_path) 6 | 7 | subjects = []; 8 | 9 | subjectDirs = dir(base_data_path); 10 | for subjectDir = subjectDirs' 11 | % Valid subject directories have five-digit numbers. 12 | if ~subjectDir.isdir || length(subjectDir.name) ~= 5 || ... 13 | ~all(isstrprop(subjectDir.name, 'digit')) 14 | continue; 15 | end 16 | s = loadSubject(fullfile(base_data_path,subjectDir.name)); 17 | subjects = [subjects; s]; 18 | end 19 | 20 | end 21 | 22 | -------------------------------------------------------------------------------- /code/loadAllSubjectsVectors.m: -------------------------------------------------------------------------------- 1 | % loadAllSubjectsVectors.m 2 | % 3 | % This script loads subject data into same-sized vectors, with a row for 4 | % each sample. Variables will be placed into your workspace and prefixed 5 | % with "gc." This format is not particularly compact (compared to the 6 | % struct format in loadAllSubjects.m) since many values will be repeated 7 | % for a subject, but you may find this convenient, particularly for 8 | % compatibility with some of the provided scripts. This requires the MATLAB 9 | % gason wrapper to read JSON files. You can get it from 10 | % https://github.com/pdollar/coco/tree/master/MatlabAPI. 11 | 12 | %% Don't overwrite workspace variables. 13 | if exist('baseDirectory', 'var') || exist('currSubject', 'var') || ... 14 | exist('input', 'var') || exist('subjectDir', 'var') || ... 15 | exist('subjectDirs', 'var') 16 | error(['A workspace variable in this script will overwrite ' ... 17 | 'existing variables.']); 18 | end 19 | 20 | %% Configuration. 21 | baseDirectory = '/path/to/data'; 22 | 23 | %% Initialize variables. 
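% Each gc* variable below ends up with one row per frame, concatenated
% across subjects in directory order, so all of the vectors stay
% index-aligned and can be passed together to functions such as
% screen2cam.m and faceGridFromParams.m.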
24 | 25 | % Subject and frame number are sufficient to reconstruct filenames, which 26 | % we avoid loading into memory. 27 | gcSbjNum = []; 28 | gcFrmNum = []; 29 | 30 | % True if face and eyes were detected. Only true samples were used to train 31 | % iTracker. 32 | gcAppleValid = []; 33 | 34 | % On-device face and eye detections (using Apple's detectors). 35 | gcAppleFaceX = []; 36 | gcAppleFaceY = []; 37 | gcAppleFaceW = []; 38 | gcAppleFaceH = []; 39 | gcAppleLeftEyeX = []; 40 | gcAppleLeftEyeY = []; 41 | gcAppleLeftEyeW = []; 42 | gcAppleLeftEyeH = []; 43 | gcAppleRightEyeX = []; 44 | gcAppleRightEyeY = []; 45 | gcAppleRightEyeW = []; 46 | gcAppleRightEyeH = []; 47 | 48 | % Parameterized face grid as [X Y W H]. 49 | gcFaceGridParams = []; 50 | 51 | % String describing the device type. 52 | gcDeviceName = {}; 53 | 54 | % "Active screen area" in points. 55 | gcScreenW = []; 56 | gcScreenH = []; 57 | 58 | gcDotNum = []; 59 | gcDotXPts = []; 60 | gcDotYPts = []; 61 | gcDotXCam = []; 62 | gcDotYCam = []; 63 | gcDotStartTime = []; 64 | 65 | % 1 = portrait; 2 = portrait upside down; 3 = landscape with home button on 66 | % the right; 4 = landscape with home button on the left. 67 | gcOrientation = []; 68 | 69 | % Dataset. 70 | gcTrain = []; 71 | gcVal = []; 72 | gcTest = []; 73 | 74 | %% Load from JSON files. 75 | 76 | if ~exist(baseDirectory, 'dir') 77 | error(['The specified base directory does not exist. Please edit ' ... 78 | 'the script to specify the root of the numbered subject ' ... 79 | 'directories.']); 80 | end 81 | 82 | subjectDirs = dir(baseDirectory); 83 | for currSubject = subjectDirs' 84 | % Valid subject directories have five-digit numbers. 85 | if ~currSubject.isdir || length(currSubject.name) ~= 5 || ... 86 | ~all(isstrprop(currSubject.name, 'digit')) 87 | continue; 88 | end 89 | disp(['Processing subject ' currSubject.name '...']) 90 | subjectDir = fullfile(baseDirectory, currSubject.name); 91 | 92 | % Apple Face Detections 93 | input = gason(fileread(fullfile(subjectDir, 'appleFace.json'))); 94 | input.X(~input.IsValid) = NaN; 95 | input.Y(~input.IsValid) = NaN; 96 | input.W(~input.IsValid) = NaN; 97 | input.H(~input.IsValid) = NaN; 98 | gcAppleFaceX = [gcAppleFaceX; input.X']; 99 | gcAppleFaceY = [gcAppleFaceY; input.Y']; 100 | gcAppleFaceW = [gcAppleFaceW; input.W']; 101 | gcAppleFaceH = [gcAppleFaceH; input.H']; 102 | 103 | % Apple Left Eye Detections 104 | input = gason(fileread(fullfile(subjectDir, 'appleLeftEye.json'))); 105 | input.X(~input.IsValid) = NaN; 106 | input.Y(~input.IsValid) = NaN; 107 | input.W(~input.IsValid) = NaN; 108 | input.H(~input.IsValid) = NaN; 109 | gcAppleLeftEyeX = [gcAppleLeftEyeX; input.X']; 110 | gcAppleLeftEyeY = [gcAppleLeftEyeY; input.Y']; 111 | gcAppleLeftEyeW = [gcAppleLeftEyeW; input.W']; 112 | gcAppleLeftEyeH = [gcAppleLeftEyeH; input.H']; 113 | 114 | % Apple Right Eye Detections 115 | input = gason(fileread(fullfile(subjectDir, 'appleRightEye.json'))); 116 | input.X(~input.IsValid) = NaN; 117 | input.Y(~input.IsValid) = NaN; 118 | input.W(~input.IsValid) = NaN; 119 | input.H(~input.IsValid) = NaN; 120 | gcAppleRightEyeX = [gcAppleRightEyeX; input.X']; 121 | gcAppleRightEyeY = [gcAppleRightEyeY; input.Y']; 122 | gcAppleRightEyeW = [gcAppleRightEyeW; input.W']; 123 | gcAppleRightEyeH = [gcAppleRightEyeH; input.H']; 124 | 125 | % Dot Information 126 | input = gason(fileread(fullfile(subjectDir, 'dotInfo.json'))); 127 | gcDotNum = [gcDotNum; input.DotNum']; 128 | gcDotXPts = [gcDotXPts; input.XPts']; 129 | gcDotYPts = [gcDotYPts; 
input.YPts']; 130 | gcDotXCam = [gcDotXCam; input.XCam']; 131 | gcDotYCam = [gcDotYCam; input.YCam']; 132 | gcDotStartTime = [gcDotStartTime; input.Time']; 133 | 134 | % Face Grid 135 | input = gason(fileread(fullfile(subjectDir, 'faceGrid.json'))); 136 | input.X(~input.IsValid) = NaN; 137 | input.Y(~input.IsValid) = NaN; 138 | input.W(~input.IsValid) = NaN; 139 | input.H(~input.IsValid) = NaN; 140 | gcFaceGridParams = [gcFaceGridParams; ... 141 | [input.X' input.Y' input.W' input.H']]; 142 | 143 | % Frames 144 | input = gason(fileread(fullfile(subjectDir, 'frames.json'))); 145 | gcFrmNum = [gcFrmNum; cellfun(@(x) str2num(x(1:5)), input)']; 146 | 147 | % Info 148 | input = gason(fileread(fullfile(subjectDir, 'info.json'))); 149 | gcTrain = [gcTrain; repmat(strcmp(input.Dataset, 'train'), input.TotalFrames, 1)]; 150 | gcVal = [gcVal; repmat(strcmp(input.Dataset, 'val'), input.TotalFrames, 1)]; 151 | gcTest = [gcTest; repmat(strcmp(input.Dataset, 'test'), input.TotalFrames, 1)]; 152 | 153 | gcDeviceName = [gcDeviceName; repmat({input.DeviceName}, input.TotalFrames, 1)]; 154 | 155 | gcSbjNum = [gcSbjNum; repmat(str2double(currSubject.name), input.TotalFrames, 1)]; 156 | 157 | % Motion data omitted. Add it if you need it! 158 | 159 | % Screen 160 | input = gason(fileread(fullfile(subjectDir, 'screen.json'))); 161 | gcScreenW = [gcScreenW; input.W']; 162 | gcScreenH = [gcScreenH; input.H']; 163 | gcOrientation = [gcOrientation; input.Orientation']; 164 | 165 | end 166 | 167 | gcAppleValid = ~isnan(gcAppleFaceX) & ~isnan(gcAppleLeftEyeX); 168 | 169 | clear baseDirectory currSubject input subjectDir subjectDirs 170 | -------------------------------------------------------------------------------- /code/loadAppleDeviceData.m: -------------------------------------------------------------------------------- 1 | % loadAppleDeviceData.m 2 | % 3 | % Loads the apple_device_data.csv file into the workspace as variables 4 | % prefixed with "device." 5 | 6 | %% Import data from text file. 7 | % Script for importing data from apple_device_data.csv. 8 | % Auto-generated by MATLAB and tweaked. 9 | 10 | %% If the variables are already in the workspace, terminate early. 11 | if exist('deviceName','var') && ... 12 | exist('deviceCameraToScreenXMm','var') && ... 13 | exist('deviceCameraToScreenYMm','var') && ... 14 | exist('deviceCameraXMm','var') && ... 15 | exist('deviceCameraYMm','var') && ... 16 | exist('devicePixelsPerInch','var') && ... 17 | exist('deviceScreenXMm','var') && ... 18 | exist('deviceScreenYMm','var') && ... 19 | exist('deviceScreenWidthMm','var') && ... 20 | exist('deviceScreenWidthPoints','var') && ... 21 | exist('deviceScreenWidthPointsZoomed','var') && ... 22 | exist('deviceScreenHeightMm','var') && ... 23 | exist('deviceScreenHeightPoints','var') && ... 24 | exist('deviceScreenHeightPointsZoomed','var'); 25 | return; 26 | end 27 | 28 | %% Initialize variables. 29 | filename = 'apple_device_data.csv'; 30 | delimiter = ','; 31 | startRow = 2; 32 | 33 | %% Read columns of data as strings: 34 | % For more information, see the TEXTSCAN documentation. 35 | formatSpec = '%q%q%q%q%q%q%q%q%q%q%q%q%q%q%[^\n\r]'; 36 | 37 | %% Open the text file. 38 | fileID = fopen(filename,'r'); 39 | 40 | %% Read columns of data according to format string. 41 | % This call is based on the structure of the file used to generate this 42 | % code. If an error occurs for a different file, try regenerating the code 43 | % from the Import Tool. 
44 | dataArray = textscan(fileID, formatSpec, 'Delimiter', delimiter, 'HeaderLines' ,startRow-1, 'ReturnOnError', false); 45 | 46 | %% Close the text file. 47 | fclose(fileID); 48 | 49 | %% Convert the contents of columns containing numeric strings to numbers. 50 | % Replace non-numeric strings with NaN. 51 | raw = repmat({''},length(dataArray{1}),length(dataArray)-1); 52 | for col=1:length(dataArray)-1 53 | raw(1:length(dataArray{col}),col) = dataArray{col}; 54 | end 55 | numericData = NaN(size(dataArray{1},1),size(dataArray,2)); 56 | 57 | for col=[2,3,4,5,6,7,8,9,10,11,12,13,14] 58 | % Converts strings in the input cell array to numbers. Replaced non-numeric 59 | % strings with NaN. 60 | rawData = dataArray{col}; 61 | for row=1:size(rawData, 1); 62 | % Create a regular expression to detect and remove non-numeric prefixes and 63 | % suffixes. 64 | regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)'; 65 | try 66 | result = regexp(rawData{row}, regexstr, 'names'); 67 | numbers = result.numbers; 68 | 69 | % Detected commas in non-thousand locations. 70 | invalidThousandsSeparator = false; 71 | if any(numbers==','); 72 | thousandsRegExp = '^\d+?(\,\d{3})*\.{0,1}\d*$'; 73 | if isempty(regexp(thousandsRegExp, ',', 'once')); 74 | numbers = NaN; 75 | invalidThousandsSeparator = true; 76 | end 77 | end 78 | % Convert numeric strings to numbers. 79 | if ~invalidThousandsSeparator; 80 | numbers = textscan(strrep(numbers, ',', ''), '%f'); 81 | numericData(row, col) = numbers{1}; 82 | raw{row, col} = numbers{1}; 83 | end 84 | catch me 85 | end 86 | end 87 | end 88 | 89 | 90 | %% Split data into numeric and cell columns. 91 | rawNumericColumns = raw(:, [2,3,4,5,6,7,8,9,10,11,12,13,14]); 92 | rawCellColumns = raw(:, 1); 93 | 94 | 95 | %% Replace non-numeric cells with NaN 96 | R = cellfun(@(x) ~isnumeric(x) && ~islogical(x),rawNumericColumns); % Find non-numeric cells 97 | rawNumericColumns(R) = {NaN}; % Replace non-numeric cells 98 | 99 | %% Allocate imported array to column variable names 100 | deviceName = rawCellColumns(:, 1); 101 | deviceCameraToScreenXMm = cell2mat(rawNumericColumns(:, 1)); 102 | deviceCameraToScreenYMm = cell2mat(rawNumericColumns(:, 2)); 103 | deviceCameraXMm = cell2mat(rawNumericColumns(:, 3)); 104 | deviceCameraYMm = cell2mat(rawNumericColumns(:, 4)); 105 | devicePixelsPerInch = cell2mat(rawNumericColumns(:, 5)); 106 | deviceScreenXMm = cell2mat(rawNumericColumns(:, 6)); 107 | deviceScreenYMm = cell2mat(rawNumericColumns(:, 7)); 108 | deviceScreenWidthMm = cell2mat(rawNumericColumns(:, 8)); 109 | deviceScreenWidthPoints = cell2mat(rawNumericColumns(:, 9)); 110 | deviceScreenWidthPointsZoomed = cell2mat(rawNumericColumns(:, 10)); 111 | deviceScreenHeightMm = cell2mat(rawNumericColumns(:, 11)); 112 | deviceScreenHeightPoints = cell2mat(rawNumericColumns(:, 12)); 113 | deviceScreenHeightPointsZoomed = cell2mat(rawNumericColumns(:, 13)); 114 | 115 | 116 | %% Clear temporary variables 117 | clearvars filename delimiter startRow formatSpec fileID dataArray ans raw col numericData rawData row regexstr result numbers invalidThousandsSeparator thousandsRegExp me rawNumericColumns rawCellColumns R; -------------------------------------------------------------------------------- /code/loadSubject.m: -------------------------------------------------------------------------------- 1 | % loadSubject.m 2 | % 3 | % This loads subject data into a struct given a path to a subject 4 | % directory. 
This requires the MATLAB gason wrapper to read JSON files. You 5 | % can get it from https://github.com/pdollar/coco/tree/master/MatlabAPI. 6 | 7 | function output = loadSubject(path) 8 | 9 | % Apple Face Detections 10 | input = gason(fileread(fullfile(path, 'appleFace.json'))); 11 | output.appleFace.x = input.X; 12 | output.appleFace.x(~input.IsValid) = NaN; 13 | output.appleFace.y = input.Y; 14 | output.appleFace.y(~input.IsValid) = NaN; 15 | output.appleFace.w = input.W; 16 | output.appleFace.w(~input.IsValid) = NaN; 17 | output.appleFace.h = input.H; 18 | output.appleFace.h(~input.IsValid) = NaN; 19 | 20 | % Apple Left Eye Detections 21 | input = gason(fileread(fullfile(path, 'appleLeftEye.json'))); 22 | output.appleLeftEye.x = input.X; 23 | output.appleLeftEye.x(~input.IsValid) = NaN; 24 | output.appleLeftEye.y = input.Y; 25 | output.appleLeftEye.y(~input.IsValid) = NaN; 26 | output.appleLeftEye.w = input.W; 27 | output.appleLeftEye.w(~input.IsValid) = NaN; 28 | output.appleLeftEye.h = input.H; 29 | output.appleLeftEye.h(~input.IsValid) = NaN; 30 | 31 | % Apple Right Eye Detections 32 | input = gason(fileread(fullfile(path, 'appleRightEye.json'))); 33 | output.appleRightEye.x = input.X; 34 | output.appleRightEye.x(~input.IsValid) = NaN; 35 | output.appleRightEye.y = input.Y; 36 | output.appleRightEye.y(~input.IsValid) = NaN; 37 | output.appleRightEye.w = input.W; 38 | output.appleRightEye.w(~input.IsValid) = NaN; 39 | output.appleRightEye.h = input.H; 40 | output.appleRightEye.h(~input.IsValid) = NaN; 41 | 42 | % Dot Information 43 | input = gason(fileread(fullfile(path, 'dotInfo.json'))); 44 | output.dot.num = input.DotNum; 45 | output.dot.xPts = input.XPts; 46 | output.dot.yPts = input.YPts; 47 | output.dot.xCam = input.XCam; 48 | output.dot.yCam = input.YCam; 49 | output.dot.time = input.Time; 50 | 51 | % Face Grid 52 | input = gason(fileread(fullfile(path, 'faceGrid.json'))); 53 | output.faceGrid.x = input.X; 54 | output.faceGrid.x(~input.IsValid) = NaN; 55 | output.faceGrid.y = input.Y; 56 | output.faceGrid.y(~input.IsValid) = NaN; 57 | output.faceGrid.w = input.W; 58 | output.faceGrid.w(~input.IsValid) = NaN; 59 | output.faceGrid.h = input.H; 60 | output.faceGrid.h(~input.IsValid) = NaN; 61 | 62 | % Frames 63 | input = gason(fileread(fullfile(path, 'frames.json'))); 64 | output.frames = input; 65 | 66 | % Info 67 | input = gason(fileread(fullfile(path, 'info.json'))); 68 | output.info.totalFrames = input.TotalFrames; 69 | output.info.numFaceDetections = input.NumFaceDetections; 70 | output.info.numEyeDetections = input.NumEyeDetections; 71 | output.info.dataset = input.Dataset; 72 | output.info.deviceName = input.DeviceName; 73 | 74 | % Motion data omitted. Add it if you need it! 75 | 76 | % Screen 77 | input = gason(fileread(fullfile(path, 'screen.json'))); 78 | output.screen.w = input.W; 79 | output.screen.h = input.H; 80 | output.screen.orientation = input.Orientation; 81 | 82 | end 83 | 84 | -------------------------------------------------------------------------------- /code/pts2cm.m: -------------------------------------------------------------------------------- 1 | % pts2cm.m 2 | % 3 | % Convert screen coordinates in points to screen coordinates in 4 | % centimeters. See the documentation in screen2cam.m for more information; 5 | % this function behaves similarly. 6 | function [xCm, yCm] = pts2cm(xPts, yPts, orientation, device, screenW, ... 
7 | screenH) 8 | 9 | loadAppleDeviceData; 10 | 11 | processed = false(size(xPts)); 12 | xCm = NaN(size(xPts)); 13 | yCm = NaN(size(yPts)); 14 | 15 | % Process device by device. 16 | for i = 1:length(deviceName) 17 | curr = strcmpi(device, deviceName(i)); 18 | xCurr = xPts(curr); 19 | yCurr = yPts(curr); 20 | oCurr = orientation(curr); 21 | o1 = oCurr == 1; 22 | o2 = oCurr == 2; 23 | o3 = oCurr == 3; 24 | o4 = oCurr == 4; 25 | screenWCurr = screenW(curr); 26 | screenHCurr = screenH(curr); 27 | 28 | xCurr(o1 | o2) = ... 29 | xCurr(o1 | o2) .* (deviceScreenWidthMm(i) ./ screenWCurr(o1 | o2)); 30 | yCurr(o1 | o2) = ... 31 | yCurr(o1 | o2) .* (deviceScreenHeightMm(i) ./ screenHCurr(o1 | o2)); 32 | xCurr(o3 | o4) = ... 33 | xCurr(o3 | o4) .* (deviceScreenHeightMm(i) ./ screenWCurr(o3 | o4)); 34 | yCurr(o3 | o4) = ... 35 | yCurr(o3 | o4) .* (deviceScreenWidthMm(i) ./ screenHCurr(o3 | o4)); 36 | 37 | % Store the results. 38 | xCm(curr) = xCurr; 39 | yCm(curr) = yCurr; 40 | 41 | processed = processed | curr; 42 | end 43 | 44 | if ~all(processed) 45 | warning(['The following devices were not recognized. Expect NaN ' ... 46 | 'return values.']); 47 | disp(unique(device(~processed))); 48 | end 49 | 50 | % Finally, convert mm to cm. 51 | xCm = xCm / 10; 52 | yCm = yCm / 10; 53 | 54 | end 55 | -------------------------------------------------------------------------------- /code/screen2cam.m: -------------------------------------------------------------------------------- 1 | % screen2cam.m 2 | % 3 | % Convert screen coordinates (in points, or centimeters if useCm is true) 4 | % to physical coordinates (in centimeters) from the camera (our prediction 5 | % space). The device data is pulled from apple_device_data.csv via 6 | % loadAppleDeviceData.m. cam2screen.m is the inverse. 7 | % 8 | % Input Parameters 9 | % ================ 10 | % With the exception of useCm (which is a logical scalar), all input 11 | % vectors should be the same size, with an element for each sample to be 12 | % processed. 13 | % - xScreen/yScreen: Screen coordinates from the top-left corner of the 14 | % screen. Positive x is right and positive y is down. Units are 15 | % determined by useCm. 16 | % - orientation: The orientation of the device as an integer (1-4). See the 17 | % README for more information. 18 | % - device: Cell array of strings describing the device name. 19 | % - screenW/screenH: Size of the active screen area. This allows us to 20 | % account for Display Zoom. This assumes the active screen area covers 21 | % the entire screen (which is the case in GazeCapture). 22 | % - useCm: Whether to interpret xScreen/yScreen as points or centimeters. 23 | % Default: points. 24 | % 25 | % Output Parameters 26 | % ================= 27 | % - xCam/yCam: xScreen/yScreen transformed to our prediction space, 28 | % measured in centimeters from the center of the camera on the device, 29 | % dependent on the orientation of the device. 30 | function [xCam, yCam] = screen2cam(xScreen, yScreen, orientation, ... 31 | device, screenW, screenH, useCm) 32 | 33 | if nargin < 7 34 | useCm = false; 35 | end 36 | 37 | loadAppleDeviceData; 38 | 39 | processed = false(size(xScreen)); 40 | xCam = NaN(size(xScreen)); 41 | yCam = NaN(size(yScreen)); 42 | 43 | % Process device by device. 
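% Device names are matched case-insensitively against the DeviceName column
% of apple_device_data.csv; samples from unlisted devices are left as NaN
% and reported by the warning below.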
44 | for i = 1:length(deviceName) 45 | curr = strcmpi(device, deviceName(i)); 46 | xCurr = xScreen(curr); 47 | yCurr = yScreen(curr); 48 | oCurr = orientation(curr); 49 | o1 = oCurr == 1; 50 | o2 = oCurr == 2; 51 | o3 = oCurr == 3; 52 | o4 = oCurr == 4; 53 | if ~useCm 54 | screenWCurr = screenW(curr); 55 | screenHCurr = screenH(curr); 56 | end 57 | dX = deviceCameraToScreenXMm(i); 58 | dY = deviceCameraToScreenYMm(i); 59 | dW = deviceScreenWidthMm(i); 60 | dH = deviceScreenHeightMm(i); 61 | 62 | if ~useCm 63 | xCurr(o1 | o2) = xCurr(o1 | o2) .* (dW ./ screenWCurr(o1 | o2)); 64 | yCurr(o1 | o2) = yCurr(o1 | o2) .* (dH ./ screenHCurr(o1 | o2)); 65 | xCurr(o3 | o4) = xCurr(o3 | o4) .* (dH ./ screenWCurr(o3 | o4)); 66 | yCurr(o3 | o4) = yCurr(o3 | o4) .* (dW ./ screenHCurr(o3 | o4)); 67 | else 68 | % Convert cm to mm. 69 | xCurr(o1 | o2) = xCurr(o1 | o2) .* 10; 70 | yCurr(o1 | o2) = yCurr(o1 | o2) .* 10; 71 | xCurr(o3 | o4) = xCurr(o3 | o4) .* 10; 72 | yCurr(o3 | o4) = yCurr(o3 | o4) .* 10; 73 | end 74 | 75 | % Transform to camera space, depending on the orientation. 76 | xCurr(o1) = xCurr(o1) - dX; 77 | yCurr(o1) = -dY - yCurr(o1); 78 | xCurr(o2) = dX - dW + xCurr(o2); 79 | yCurr(o2) = dY + dH - yCurr(o2); 80 | xCurr(o3) = dY + xCurr(o3); 81 | yCurr(o3) = dW - dX - yCurr(o3); 82 | xCurr(o4) = -dY - dH + xCurr(o4); 83 | yCurr(o4) = dX - yCurr(o4); 84 | 85 | % Store the results. 86 | xCam(curr) = xCurr; 87 | yCam(curr) = yCurr; 88 | 89 | processed = processed | curr; 90 | end 91 | 92 | if ~all(processed) 93 | warning(['The following devices were not recognized. Expect NaN ' ... 94 | 'return values.']); 95 | disp(unique(device(~processed))); 96 | end 97 | 98 | % Finally, convert mm to cm. 99 | xCam = xCam / 10; 100 | yCam = yCam / 10; 101 | 102 | end 103 | -------------------------------------------------------------------------------- /models/itracker_deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "iTracker_test" 2 | input: "image_left" 3 | input_dim: 256 4 | input_dim: 3 5 | input_dim: 224 6 | input_dim: 224 7 | input: "image_right" 8 | input_dim: 256 9 | input_dim: 3 10 | input_dim: 224 11 | input_dim: 224 12 | input: "image_face" 13 | input_dim: 256 14 | input_dim: 3 15 | input_dim: 224 16 | input_dim: 224 17 | input: "facegrid" 18 | input_dim: 256 19 | input_dim: 625 20 | input_dim: 1 21 | input_dim: 1 22 | layer { 23 | name: "conv1" 24 | type: "Convolution" 25 | bottom: "image_left" 26 | top: "conv1" 27 | param { 28 | name: "conv1_w" 29 | lr_mult: 1 30 | decay_mult: 1 31 | } 32 | param { 33 | name: "conv1_b" 34 | lr_mult: 2 35 | decay_mult: 0 36 | } 37 | convolution_param { 38 | num_output: 96 39 | kernel_size: 11 40 | stride: 4 41 | } 42 | } 43 | layer { 44 | name: "relu1" 45 | type: "ReLU" 46 | bottom: "conv1" 47 | top: "conv1" 48 | } 49 | layer { 50 | name: "pool1" 51 | type: "Pooling" 52 | bottom: "conv1" 53 | top: "pool1" 54 | pooling_param { 55 | pool: MAX 56 | kernel_size: 3 57 | stride: 2 58 | } 59 | } 60 | layer { 61 | name: "norm1" 62 | type: "LRN" 63 | bottom: "pool1" 64 | top: "norm1" 65 | lrn_param { 66 | local_size: 5 67 | alpha: 0.0001 68 | beta: 0.75 69 | } 70 | } 71 | layer { 72 | name: "conv2" 73 | type: "Convolution" 74 | bottom: "norm1" 75 | top: "conv2" 76 | param { 77 | name: "conv2_w" 78 | lr_mult: 1 79 | decay_mult: 1 80 | } 81 | param { 82 | name: "conv2_b" 83 | lr_mult: 2 84 | decay_mult: 0 85 | } 86 | convolution_param { 87 | num_output: 256 88 | pad: 2 89 | kernel_size: 5 90 | group: 2 91 | } 92 | } 93 
| layer { 94 | name: "relu2" 95 | type: "ReLU" 96 | bottom: "conv2" 97 | top: "conv2" 98 | } 99 | layer { 100 | name: "pool2" 101 | type: "Pooling" 102 | bottom: "conv2" 103 | top: "pool2" 104 | pooling_param { 105 | pool: MAX 106 | kernel_size: 3 107 | stride: 2 108 | } 109 | } 110 | layer { 111 | name: "norm2" 112 | type: "LRN" 113 | bottom: "pool2" 114 | top: "norm2" 115 | lrn_param { 116 | local_size: 5 117 | alpha: 0.0001 118 | beta: 0.75 119 | } 120 | } 121 | layer { 122 | name: "conv3" 123 | type: "Convolution" 124 | bottom: "norm2" 125 | top: "conv3" 126 | param { 127 | name: "conv3_w" 128 | lr_mult: 1 129 | decay_mult: 1 130 | } 131 | param { 132 | name: "conv3_b" 133 | lr_mult: 2 134 | decay_mult: 0 135 | } 136 | convolution_param { 137 | num_output: 384 138 | pad: 1 139 | kernel_size: 3 140 | } 141 | } 142 | layer { 143 | name: "relu3" 144 | type: "ReLU" 145 | bottom: "conv3" 146 | top: "conv3" 147 | } 148 | layer { 149 | name: "conv4_l" 150 | type: "Convolution" 151 | bottom: "conv3" 152 | top: "conv4_l" 153 | param { 154 | lr_mult: 1 155 | decay_mult: 1 156 | } 157 | param { 158 | lr_mult: 2 159 | decay_mult: 0 160 | } 161 | convolution_param { 162 | num_output: 64 163 | pad: 0 164 | kernel_size: 1 165 | } 166 | } 167 | layer { 168 | name: "relu4_l" 169 | type: "ReLU" 170 | bottom: "conv4_l" 171 | top: "conv4_l" 172 | } 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | layer { 184 | name: "conv1_r" 185 | type: "Convolution" 186 | bottom: "image_right" 187 | top: "conv1_r" 188 | param { 189 | name: "conv1_w" 190 | lr_mult: 1 191 | decay_mult: 1 192 | } 193 | param { 194 | name: "conv1_b" 195 | lr_mult: 2 196 | decay_mult: 0 197 | } 198 | convolution_param { 199 | num_output: 96 200 | kernel_size: 11 201 | stride: 4 202 | } 203 | } 204 | layer { 205 | name: "relu1_r" 206 | type: "ReLU" 207 | bottom: "conv1_r" 208 | top: "conv1_r" 209 | } 210 | layer { 211 | name: "pool1_r" 212 | type: "Pooling" 213 | bottom: "conv1_r" 214 | top: "pool1_r" 215 | pooling_param { 216 | pool: MAX 217 | kernel_size: 3 218 | stride: 2 219 | } 220 | } 221 | layer { 222 | name: "norm1_r" 223 | type: "LRN" 224 | bottom: "pool1_r" 225 | top: "norm1_r" 226 | lrn_param { 227 | local_size: 5 228 | alpha: 0.0001 229 | beta: 0.75 230 | } 231 | } 232 | layer { 233 | name: "conv2_r" 234 | type: "Convolution" 235 | bottom: "norm1_r" 236 | top: "conv2_r" 237 | param { 238 | name: "conv2_w" 239 | lr_mult: 1 240 | decay_mult: 1 241 | } 242 | param { 243 | name: "conv2_b" 244 | lr_mult: 2 245 | decay_mult: 0 246 | } 247 | convolution_param { 248 | num_output: 256 249 | pad: 2 250 | kernel_size: 5 251 | group: 2 252 | } 253 | } 254 | layer { 255 | name: "relu2_r" 256 | type: "ReLU" 257 | bottom: "conv2_r" 258 | top: "conv2_r" 259 | } 260 | layer { 261 | name: "pool2_r" 262 | type: "Pooling" 263 | bottom: "conv2_r" 264 | top: "pool2_r" 265 | pooling_param { 266 | pool: MAX 267 | kernel_size: 3 268 | stride: 2 269 | } 270 | } 271 | layer { 272 | name: "norm2_r" 273 | type: "LRN" 274 | bottom: "pool2_r" 275 | top: "norm2_r" 276 | lrn_param { 277 | local_size: 5 278 | alpha: 0.0001 279 | beta: 0.75 280 | } 281 | } 282 | layer { 283 | name: "conv3_r" 284 | type: "Convolution" 285 | bottom: "norm2_r" 286 | top: "conv3_r" 287 | param { 288 | name: "conv3_w" 289 | lr_mult: 1 290 | decay_mult: 1 291 | } 292 | param { 293 | name: "conv3_b" 294 | lr_mult: 2 295 | decay_mult: 0 296 | } 297 | convolution_param { 298 | num_output: 384 299 | pad: 1 300 | kernel_size: 3 301 | } 302 | } 303 | layer { 304 | name: 
"relu3_r" 305 | type: "ReLU" 306 | bottom: "conv3_r" 307 | top: "conv3_r" 308 | } 309 | layer { 310 | name: "conv4_r" 311 | type: "Convolution" 312 | bottom: "conv3_r" 313 | top: "conv4_r" 314 | param { 315 | lr_mult: 1 316 | decay_mult: 1 317 | } 318 | param { 319 | lr_mult: 2 320 | decay_mult: 0 321 | } 322 | convolution_param { 323 | num_output: 64 324 | pad: 0 325 | kernel_size: 1 326 | } 327 | } 328 | layer { 329 | name: "relu4_r" 330 | type: "ReLU" 331 | bottom: "conv4_r" 332 | top: "conv4_r" 333 | } 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | layer { 345 | name: "conv1_f" 346 | type: "Convolution" 347 | bottom: "image_face" 348 | top: "conv1_f" 349 | param { 350 | name: "conv1_f_w" 351 | lr_mult: 1 352 | decay_mult: 1 353 | } 354 | param { 355 | name: "conv1_f_b" 356 | lr_mult: 2 357 | decay_mult: 0 358 | } 359 | convolution_param { 360 | num_output: 96 361 | kernel_size: 11 362 | stride: 4 363 | } 364 | } 365 | layer { 366 | name: "relu1_f" 367 | type: "ReLU" 368 | bottom: "conv1_f" 369 | top: "conv1_f" 370 | } 371 | layer { 372 | name: "pool1_f" 373 | type: "Pooling" 374 | bottom: "conv1_f" 375 | top: "pool1_f" 376 | pooling_param { 377 | pool: MAX 378 | kernel_size: 3 379 | stride: 2 380 | } 381 | } 382 | layer { 383 | name: "norm1_f" 384 | type: "LRN" 385 | bottom: "pool1_f" 386 | top: "norm1_f" 387 | lrn_param { 388 | local_size: 5 389 | alpha: 0.0001 390 | beta: 0.75 391 | } 392 | } 393 | layer { 394 | name: "conv2_f" 395 | type: "Convolution" 396 | bottom: "norm1_f" 397 | top: "conv2_f" 398 | param { 399 | name: "conv2_f_w" 400 | lr_mult: 1 401 | decay_mult: 1 402 | } 403 | param { 404 | name: "conv2_f_b" 405 | lr_mult: 2 406 | decay_mult: 0 407 | } 408 | convolution_param { 409 | num_output: 256 410 | pad: 2 411 | kernel_size: 5 412 | group: 2 413 | } 414 | } 415 | layer { 416 | name: "relu2_f" 417 | type: "ReLU" 418 | bottom: "conv2_f" 419 | top: "conv2_f" 420 | } 421 | layer { 422 | name: "pool2_f" 423 | type: "Pooling" 424 | bottom: "conv2_f" 425 | top: "pool2_f" 426 | pooling_param { 427 | pool: MAX 428 | kernel_size: 3 429 | stride: 2 430 | } 431 | } 432 | layer { 433 | name: "norm2_f" 434 | type: "LRN" 435 | bottom: "pool2_f" 436 | top: "norm2_f" 437 | lrn_param { 438 | local_size: 5 439 | alpha: 0.0001 440 | beta: 0.75 441 | } 442 | } 443 | layer { 444 | name: "conv3_f" 445 | type: "Convolution" 446 | bottom: "norm2_f" 447 | top: "conv3_f" 448 | param { 449 | name: "conv3_f_w" 450 | lr_mult: 1 451 | decay_mult: 1 452 | } 453 | param { 454 | name: "conv3_f_b" 455 | lr_mult: 2 456 | decay_mult: 0 457 | } 458 | convolution_param { 459 | num_output: 384 460 | pad: 1 461 | kernel_size: 3 462 | } 463 | } 464 | layer { 465 | name: "relu3_f" 466 | type: "ReLU" 467 | bottom: "conv3_f" 468 | top: "conv3_f" 469 | } 470 | layer { 471 | name: "conv4_f" 472 | type: "Convolution" 473 | bottom: "conv3_f" 474 | top: "conv4_f" 475 | param { 476 | lr_mult: 1 477 | decay_mult: 1 478 | } 479 | param { 480 | lr_mult: 2 481 | decay_mult: 0 482 | } 483 | convolution_param { 484 | num_output: 64 485 | pad: 0 486 | kernel_size: 1 487 | } 488 | } 489 | layer { 490 | name: "relu4_f" 491 | type: "ReLU" 492 | bottom: "conv4_f" 493 | top: "conv4_f" 494 | } 495 | layer { 496 | name: "fc1_f" 497 | type: "InnerProduct" 498 | bottom: "conv4_f" 499 | top: "fc1_f" 500 | param { 501 | lr_mult: 1 502 | } 503 | param { 504 | lr_mult: 2 505 | } 506 | inner_product_param { 507 | num_output: 128 508 | } 509 | } 510 | layer { 511 | name: "relufc1_f" 512 | type: "ReLU" 513 | bottom: "fc1_f" 
514 | top: "fc1_f" 515 | } 516 | layer { 517 | name: "fc2_f" 518 | type: "InnerProduct" 519 | bottom: "fc1_f" 520 | top: "fc2_f" 521 | param { 522 | lr_mult: 1 523 | } 524 | param { 525 | lr_mult: 2 526 | } 527 | inner_product_param { 528 | num_output: 64 529 | } 530 | } 531 | layer { 532 | name: "relufc2_f" 533 | type: "ReLU" 534 | bottom: "fc2_f" 535 | top: "fc2_f" 536 | } 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | layer { 548 | name: "concat1" 549 | bottom: "conv4_l" 550 | bottom: "conv4_r" 551 | top: "concat1" 552 | type: "Concat" 553 | concat_param { 554 | axis: 1 555 | } 556 | } 557 | layer { 558 | name: "fc1" 559 | type: "InnerProduct" 560 | bottom: "concat1" 561 | top: "fc1" 562 | param { 563 | lr_mult: 1 564 | } 565 | param { 566 | lr_mult: 2 567 | } 568 | inner_product_param { 569 | num_output: 128 570 | } 571 | } 572 | layer { 573 | name: "relufc1" 574 | type: "ReLU" 575 | bottom: "fc1" 576 | top: "fc1" 577 | } 578 | layer { 579 | name: "flatten" 580 | type: "Flatten" 581 | bottom: "facegrid" 582 | top: "flatten" 583 | } 584 | layer { 585 | name: "fg_fc1" 586 | type: "InnerProduct" 587 | bottom: "flatten" 588 | top: "fg_fc1" 589 | param { 590 | lr_mult: 1 591 | } 592 | param { 593 | lr_mult: 2 594 | } 595 | inner_product_param { 596 | num_output: 256 597 | weight_filler { 598 | type: "gaussian" 599 | std: 0.005 600 | } 601 | bias_filler { 602 | type: "constant" 603 | value: 1 604 | } 605 | } 606 | } 607 | layer { 608 | name: "relufgfc1" 609 | type: "ReLU" 610 | bottom: "fg_fc1" 611 | top: "fg_fc1" 612 | } 613 | layer { 614 | name: "fg_fc2" 615 | type: "InnerProduct" 616 | bottom: "fg_fc1" 617 | top: "fg_fc2" 618 | param { 619 | lr_mult: 1 620 | } 621 | param { 622 | lr_mult: 2 623 | } 624 | inner_product_param { 625 | num_output: 128 626 | } 627 | } 628 | layer { 629 | name: "relufgfc2" 630 | type: "ReLU" 631 | bottom: "fg_fc2" 632 | top: "fg_fc2" 633 | } 634 | layer { 635 | name: "concat2" 636 | bottom: "fc1" 637 | bottom: "fg_fc2" 638 | bottom: "fc2_f" 639 | top: "concat2" 640 | type: "Concat" 641 | concat_param { 642 | axis: 1 643 | } 644 | } 645 | layer { 646 | name: "fc2" 647 | type: "InnerProduct" 648 | bottom: "concat2" 649 | top: "fc2" 650 | param { 651 | lr_mult: 1 652 | } 653 | param { 654 | lr_mult: 2 655 | } 656 | inner_product_param { 657 | num_output: 128 658 | } 659 | } 660 | layer { 661 | name: "relufc2" 662 | type: "ReLU" 663 | bottom: "fc2" 664 | top: "fc2" 665 | } 666 | layer { 667 | name: "fc3" 668 | type: "InnerProduct" 669 | bottom: "fc2" 670 | top: "fc3" 671 | param { 672 | lr_mult: 1 673 | } 674 | param { 675 | lr_mult: 2 676 | } 677 | inner_product_param { 678 | num_output: 2 679 | } 680 | } 681 | -------------------------------------------------------------------------------- /models/itracker_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "itracker_train_val.prototxt" 2 | test_iter: 1402 3 | test_interval: 1000 4 | base_lr: 0.001 5 | momentum: 0.9 6 | weight_decay: 0.0005 7 | lr_policy: "step" 8 | gamma: 0.1 9 | stepsize: 75000 10 | display: 20 11 | max_iter: 150000 12 | snapshot: 1000 13 | snapshot_prefix: "snapshots/itracker" 14 | solver_mode: GPU 15 | -------------------------------------------------------------------------------- /models/itracker_train_val.prototxt: -------------------------------------------------------------------------------- 1 | # Input layers 2 | # TODO: Update the "lmdb" paths. You will need to generate these databases from 3 | # the data. 
For non-image LMDBs, we set "channels" to the number of data 4 | # elements (e.g., 625 [25 * 25] for face grid, 2 for gaze [x and y]) leaving 5 | # "width" and "height" set to 1. 6 | 7 | name: "iTracker" 8 | layer { 9 | name: "itracker" 10 | type: "Data" 11 | top: "image_left" 12 | include { 13 | phase: TRAIN 14 | } 15 | transform_param { 16 | mean_file: "mean_images/mean_left_224.binaryproto" 17 | mirror: false 18 | } 19 | data_param { 20 | source: "lmdb/train_left_224_image_lmdb" 21 | batch_size: 256 22 | backend: LMDB 23 | } 24 | } 25 | layer { 26 | name: "itracker" 27 | type: "Data" 28 | top: "image_left" 29 | include { 30 | phase: TEST 31 | } 32 | transform_param { 33 | mean_file: "mean_images/mean_left_224.binaryproto" 34 | mirror: false 35 | } 36 | data_param { 37 | source: "lmdb/val_left_224_image_lmdb" 38 | batch_size: 256 39 | backend: LMDB 40 | } 41 | } 42 | layer { 43 | name: "itracker" 44 | type: "Data" 45 | top: "image_right" 46 | include { 47 | phase: TRAIN 48 | } 49 | transform_param { 50 | mean_file: "mean_images/mean_right_224.binaryproto" 51 | mirror: false 52 | } 53 | data_param { 54 | source: "lmdb/train_right_224_image_lmdb" 55 | batch_size: 256 56 | backend: LMDB 57 | } 58 | } 59 | layer { 60 | name: "itracker" 61 | type: "Data" 62 | top: "image_right" 63 | include { 64 | phase: TEST 65 | } 66 | transform_param { 67 | mean_file: "mean_images/mean_right_224.binaryproto" 68 | mirror: false 69 | } 70 | data_param { 71 | source: "lmdb/val_right_224_image_lmdb" 72 | batch_size: 256 73 | backend: LMDB 74 | } 75 | } 76 | layer { 77 | name: "itracker" 78 | type: "Data" 79 | top: "image_face" 80 | include { 81 | phase: TRAIN 82 | } 83 | transform_param { 84 | mean_file: "mean_images/mean_face_224.binaryproto" 85 | mirror: false 86 | } 87 | data_param { 88 | source: "lmdb/train_face_224_image_lmdb" 89 | batch_size: 256 90 | backend: LMDB 91 | } 92 | } 93 | layer { 94 | name: "itracker" 95 | type: "Data" 96 | top: "image_face" 97 | include { 98 | phase: TEST 99 | } 100 | transform_param { 101 | mean_file: "mean_images/mean_face_224.binaryproto" 102 | mirror: false 103 | } 104 | data_param { 105 | source: "lmdb/val_face_224_image_lmdb" 106 | batch_size: 256 107 | backend: LMDB 108 | } 109 | } 110 | layer { 111 | name: "itracker" 112 | type: "Data" 113 | top: "facegrid" 114 | data_param { 115 | source: "lmdb/train_facegrid_lmdb" 116 | backend: LMDB 117 | batch_size: 256 118 | } 119 | include { 120 | phase: TRAIN 121 | } 122 | } 123 | layer { 124 | name: "itracker" 125 | type: "Data" 126 | top: "facegrid" 127 | data_param { 128 | source: "lmdb/val_facegrid_lmdb" 129 | backend: LMDB 130 | batch_size: 256 131 | } 132 | include { 133 | phase: TEST 134 | } 135 | } 136 | layer { 137 | name: "gaze" 138 | type: "Data" 139 | top: "gaze" 140 | data_param { 141 | source: "lmdb/train_gaze_lmdb" 142 | backend: LMDB 143 | batch_size: 256 144 | } 145 | include { 146 | phase: TRAIN 147 | } 148 | } 149 | layer { 150 | name: "gaze" 151 | type: "Data" 152 | top: "gaze" 153 | data_param { 154 | source: "lmdb/val_gaze_lmdb" 155 | backend: LMDB 156 | batch_size: 256 157 | } 158 | include { 159 | phase: TEST 160 | } 161 | } 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | # Note: Layer names here differ from those used in the paper (Figure 5). See 173 | # comments below for more details. 
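# Overall data flow: both eye crops run through eye towers whose conv1-conv3
# weights are tied via the shared param names ("CONV-E1".."CONV-E4"), are
# concatenated, and feed "FC-E1" (fc1). The face crop runs through
# "CONV-F1".."CONV-F4" and "FC-F1"/"FC-F2"; the 25x25 face grid is flattened and
# passes "FC-FG1"/"FC-FG2". The three branches are concatenated and regressed
# through "FC1" (fc2) and "FC2" (fc3, 2 outputs) to the gaze point in cm,
# trained with a Euclidean loss against the "gaze" blob.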
174 | 175 | 176 | # Layer "CONV-E1" (left eye copy) 177 | layer { 178 | name: "conv1" 179 | type: "Convolution" 180 | bottom: "image_left" 181 | top: "conv1" 182 | # Note: Parameter names are the same for left- and right-eye layers to enable 183 | # weight sharing. 184 | param { 185 | name: "conv1_w" 186 | lr_mult: 1 187 | decay_mult: 1 188 | } 189 | param { 190 | name: "conv1_b" 191 | lr_mult: 2 192 | decay_mult: 0 193 | } 194 | convolution_param { 195 | num_output: 96 196 | kernel_size: 11 197 | stride: 4 198 | weight_filler { 199 | type: "gaussian" 200 | std: 0.01 201 | } 202 | bias_filler { 203 | type: "constant" 204 | value: 0 205 | } 206 | } 207 | } 208 | layer { 209 | name: "relu1" 210 | type: "ReLU" 211 | bottom: "conv1" 212 | top: "conv1" 213 | } 214 | layer { 215 | name: "pool1" 216 | type: "Pooling" 217 | bottom: "conv1" 218 | top: "pool1" 219 | pooling_param { 220 | pool: MAX 221 | kernel_size: 3 222 | stride: 2 223 | } 224 | } 225 | layer { 226 | name: "norm1" 227 | type: "LRN" 228 | bottom: "pool1" 229 | top: "norm1" 230 | lrn_param { 231 | local_size: 5 232 | alpha: 0.0001 233 | beta: 0.75 234 | } 235 | } 236 | 237 | # Layer "CONV-E2" (left eye copy) 238 | layer { 239 | name: "conv2" 240 | type: "Convolution" 241 | bottom: "norm1" 242 | top: "conv2" 243 | param { 244 | name: "conv2_w" 245 | lr_mult: 1 246 | decay_mult: 1 247 | } 248 | param { 249 | name: "conv2_b" 250 | lr_mult: 2 251 | decay_mult: 0 252 | } 253 | convolution_param { 254 | num_output: 256 255 | pad: 2 256 | kernel_size: 5 257 | group: 2 258 | weight_filler { 259 | type: "gaussian" 260 | std: 0.01 261 | } 262 | bias_filler { 263 | type: "constant" 264 | value: 1 265 | } 266 | } 267 | } 268 | layer { 269 | name: "relu2" 270 | type: "ReLU" 271 | bottom: "conv2" 272 | top: "conv2" 273 | } 274 | layer { 275 | name: "pool2" 276 | type: "Pooling" 277 | bottom: "conv2" 278 | top: "pool2" 279 | pooling_param { 280 | pool: MAX 281 | kernel_size: 3 282 | stride: 2 283 | } 284 | } 285 | layer { 286 | name: "norm2" 287 | type: "LRN" 288 | bottom: "pool2" 289 | top: "norm2" 290 | lrn_param { 291 | local_size: 5 292 | alpha: 0.0001 293 | beta: 0.75 294 | } 295 | } 296 | 297 | # Layer "CONV-E3" (left eye copy) 298 | layer { 299 | name: "conv3" 300 | type: "Convolution" 301 | bottom: "norm2" 302 | top: "conv3" 303 | param { 304 | name: "conv3_w" 305 | lr_mult: 1 306 | decay_mult: 1 307 | } 308 | param { 309 | name: "conv3_b" 310 | lr_mult: 2 311 | decay_mult: 0 312 | } 313 | convolution_param { 314 | num_output: 384 315 | pad: 1 316 | kernel_size: 3 317 | weight_filler { 318 | type: "gaussian" 319 | std: 0.01 320 | } 321 | bias_filler { 322 | type: "constant" 323 | value: 0 324 | } 325 | } 326 | } 327 | layer { 328 | name: "relu3" 329 | type: "ReLU" 330 | bottom: "conv3" 331 | top: "conv3" 332 | } 333 | 334 | # Layer "CONV-E4" (left eye copy) 335 | layer { 336 | # Note: "_l" is not used on other left layers above to enable easier 337 | # initialization from ImageNet parameters. 
338 | name: "conv4_l" 339 | type: "Convolution" 340 | bottom: "conv3" 341 | top: "conv4_l" 342 | param { 343 | lr_mult: 1 344 | decay_mult: 1 345 | } 346 | param { 347 | lr_mult: 2 348 | decay_mult: 0 349 | } 350 | convolution_param { 351 | num_output: 64 352 | pad: 0 353 | kernel_size: 1 354 | weight_filler { 355 | type: "gaussian" 356 | std: 0.01 357 | } 358 | bias_filler { 359 | type: "constant" 360 | value: 0 361 | } 362 | } 363 | } 364 | layer { 365 | name: "relu4_l" 366 | type: "ReLU" 367 | bottom: "conv4_l" 368 | top: "conv4_l" 369 | } 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | # Layer "CONV-E1" (right eye copy) 381 | layer { 382 | name: "conv1_r" 383 | type: "Convolution" 384 | bottom: "image_right" 385 | top: "conv1_r" 386 | param { 387 | name: "conv1_w" 388 | lr_mult: 1 389 | decay_mult: 1 390 | } 391 | param { 392 | name: "conv1_b" 393 | lr_mult: 2 394 | decay_mult: 0 395 | } 396 | convolution_param { 397 | num_output: 96 398 | kernel_size: 11 399 | stride: 4 400 | weight_filler { 401 | type: "gaussian" 402 | std: 0.01 403 | } 404 | bias_filler { 405 | type: "constant" 406 | value: 0 407 | } 408 | } 409 | } 410 | layer { 411 | name: "relu1_r" 412 | type: "ReLU" 413 | bottom: "conv1_r" 414 | top: "conv1_r" 415 | } 416 | layer { 417 | name: "pool1_r" 418 | type: "Pooling" 419 | bottom: "conv1_r" 420 | top: "pool1_r" 421 | pooling_param { 422 | pool: MAX 423 | kernel_size: 3 424 | stride: 2 425 | } 426 | } 427 | layer { 428 | name: "norm1_r" 429 | type: "LRN" 430 | bottom: "pool1_r" 431 | top: "norm1_r" 432 | lrn_param { 433 | local_size: 5 434 | alpha: 0.0001 435 | beta: 0.75 436 | } 437 | } 438 | 439 | # Layer "CONV-E2" (right eye copy) 440 | layer { 441 | name: "conv2_r" 442 | type: "Convolution" 443 | bottom: "norm1_r" 444 | top: "conv2_r" 445 | param { 446 | name: "conv2_w" 447 | lr_mult: 1 448 | decay_mult: 1 449 | } 450 | param { 451 | name: "conv2_b" 452 | lr_mult: 2 453 | decay_mult: 0 454 | } 455 | convolution_param { 456 | num_output: 256 457 | pad: 2 458 | kernel_size: 5 459 | group: 2 460 | weight_filler { 461 | type: "gaussian" 462 | std: 0.01 463 | } 464 | bias_filler { 465 | type: "constant" 466 | value: 1 467 | } 468 | } 469 | } 470 | layer { 471 | name: "relu2_r" 472 | type: "ReLU" 473 | bottom: "conv2_r" 474 | top: "conv2_r" 475 | } 476 | layer { 477 | name: "pool2_r" 478 | type: "Pooling" 479 | bottom: "conv2_r" 480 | top: "pool2_r" 481 | pooling_param { 482 | pool: MAX 483 | kernel_size: 3 484 | stride: 2 485 | } 486 | } 487 | layer { 488 | name: "norm2_r" 489 | type: "LRN" 490 | bottom: "pool2_r" 491 | top: "norm2_r" 492 | lrn_param { 493 | local_size: 5 494 | alpha: 0.0001 495 | beta: 0.75 496 | } 497 | } 498 | 499 | # Layer "CONV-E3" (right eye copy) 500 | layer { 501 | name: "conv3_r" 502 | type: "Convolution" 503 | bottom: "norm2_r" 504 | top: "conv3_r" 505 | param { 506 | name: "conv3_w" 507 | lr_mult: 1 508 | decay_mult: 1 509 | } 510 | param { 511 | name: "conv3_b" 512 | lr_mult: 2 513 | decay_mult: 0 514 | } 515 | convolution_param { 516 | num_output: 384 517 | pad: 1 518 | kernel_size: 3 519 | weight_filler { 520 | type: "gaussian" 521 | std: 0.01 522 | } 523 | bias_filler { 524 | type: "constant" 525 | value: 0 526 | } 527 | } 528 | } 529 | layer { 530 | name: "relu3_r" 531 | type: "ReLU" 532 | bottom: "conv3_r" 533 | top: "conv3_r" 534 | } 535 | 536 | # Layer "CONV-E4" (right eye copy) 537 | layer { 538 | name: "conv4_r" 539 | type: "Convolution" 540 | bottom: "conv3_r" 541 | top: "conv4_r" 542 | param { 543 | lr_mult: 1 544 | 
decay_mult: 1 545 | } 546 | param { 547 | lr_mult: 2 548 | decay_mult: 0 549 | } 550 | convolution_param { 551 | num_output: 64 552 | pad: 0 553 | kernel_size: 1 554 | weight_filler { 555 | type: "gaussian" 556 | std: 0.01 557 | } 558 | bias_filler { 559 | type: "constant" 560 | value: 0 561 | } 562 | } 563 | } 564 | layer { 565 | name: "relu4_r" 566 | type: "ReLU" 567 | bottom: "conv4_r" 568 | top: "conv4_r" 569 | } 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | # Layer "CONV-F1" 581 | layer { 582 | name: "conv1_f" 583 | type: "Convolution" 584 | bottom: "image_face" 585 | top: "conv1_f" 586 | param { 587 | name: "conv1_f_w" 588 | lr_mult: 1 589 | decay_mult: 1 590 | } 591 | param { 592 | name: "conv1_f_b" 593 | lr_mult: 2 594 | decay_mult: 0 595 | } 596 | convolution_param { 597 | num_output: 96 598 | kernel_size: 11 599 | stride: 4 600 | weight_filler { 601 | type: "gaussian" 602 | std: 0.01 603 | } 604 | bias_filler { 605 | type: "constant" 606 | value: 0 607 | } 608 | } 609 | } 610 | layer { 611 | name: "relu1_f" 612 | type: "ReLU" 613 | bottom: "conv1_f" 614 | top: "conv1_f" 615 | } 616 | layer { 617 | name: "pool1_f" 618 | type: "Pooling" 619 | bottom: "conv1_f" 620 | top: "pool1_f" 621 | pooling_param { 622 | pool: MAX 623 | kernel_size: 3 624 | stride: 2 625 | } 626 | } 627 | layer { 628 | name: "norm1_f" 629 | type: "LRN" 630 | bottom: "pool1_f" 631 | top: "norm1_f" 632 | lrn_param { 633 | local_size: 5 634 | alpha: 0.0001 635 | beta: 0.75 636 | } 637 | } 638 | 639 | # Layer "CONV-F2" 640 | layer { 641 | name: "conv2_f" 642 | type: "Convolution" 643 | bottom: "norm1_f" 644 | top: "conv2_f" 645 | param { 646 | name: "conv2_f_w" 647 | lr_mult: 1 648 | decay_mult: 1 649 | } 650 | param { 651 | name: "conv2_f_b" 652 | lr_mult: 2 653 | decay_mult: 0 654 | } 655 | convolution_param { 656 | num_output: 256 657 | pad: 2 658 | kernel_size: 5 659 | group: 2 660 | weight_filler { 661 | type: "gaussian" 662 | std: 0.01 663 | } 664 | bias_filler { 665 | type: "constant" 666 | value: 1 667 | } 668 | } 669 | } 670 | layer { 671 | name: "relu2_f" 672 | type: "ReLU" 673 | bottom: "conv2_f" 674 | top: "conv2_f" 675 | } 676 | layer { 677 | name: "pool2_f" 678 | type: "Pooling" 679 | bottom: "conv2_f" 680 | top: "pool2_f" 681 | pooling_param { 682 | pool: MAX 683 | kernel_size: 3 684 | stride: 2 685 | } 686 | } 687 | layer { 688 | name: "norm2_f" 689 | type: "LRN" 690 | bottom: "pool2_f" 691 | top: "norm2_f" 692 | lrn_param { 693 | local_size: 5 694 | alpha: 0.0001 695 | beta: 0.75 696 | } 697 | } 698 | 699 | # Layer "CONV-F3" 700 | layer { 701 | name: "conv3_f" 702 | type: "Convolution" 703 | bottom: "norm2_f" 704 | top: "conv3_f" 705 | param { 706 | name: "conv3_f_w" 707 | lr_mult: 1 708 | decay_mult: 1 709 | } 710 | param { 711 | name: "conv3_f_b" 712 | lr_mult: 2 713 | decay_mult: 0 714 | } 715 | convolution_param { 716 | num_output: 384 717 | pad: 1 718 | kernel_size: 3 719 | weight_filler { 720 | type: "gaussian" 721 | std: 0.01 722 | } 723 | bias_filler { 724 | type: "constant" 725 | value: 0 726 | } 727 | } 728 | } 729 | layer { 730 | name: "relu3_f" 731 | type: "ReLU" 732 | bottom: "conv3_f" 733 | top: "conv3_f" 734 | } 735 | 736 | # Layer "CONV-F4" 737 | layer { 738 | name: "conv4_f" 739 | type: "Convolution" 740 | bottom: "conv3_f" 741 | top: "conv4_f" 742 | param { 743 | lr_mult: 1 744 | decay_mult: 1 745 | } 746 | param { 747 | lr_mult: 2 748 | decay_mult: 0 749 | } 750 | convolution_param { 751 | num_output: 64 752 | pad: 0 753 | kernel_size: 1 754 | weight_filler { 
755 | type: "gaussian" 756 | std: 0.01 757 | } 758 | bias_filler { 759 | type: "constant" 760 | value: 0 761 | } 762 | } 763 | } 764 | layer { 765 | name: "relu4_f" 766 | type: "ReLU" 767 | bottom: "conv4_f" 768 | top: "conv4_f" 769 | } 770 | 771 | # Layer "FC-F1" 772 | layer { 773 | name: "fc1_f" 774 | type: "InnerProduct" 775 | bottom: "conv4_f" 776 | top: "fc1_f" 777 | param { 778 | lr_mult: 1 779 | } 780 | param { 781 | lr_mult: 2 782 | } 783 | inner_product_param { 784 | num_output: 128 785 | weight_filler { 786 | type: "gaussian" 787 | std: 0.001 788 | } 789 | bias_filler { 790 | type: "constant" 791 | value: 1 792 | } 793 | } 794 | } 795 | layer { 796 | name: "relufc1_f" 797 | type: "ReLU" 798 | bottom: "fc1_f" 799 | top: "fc1_f" 800 | } 801 | 802 | # Layer "FC-F2" 803 | layer { 804 | name: "fc2_f" 805 | type: "InnerProduct" 806 | bottom: "fc1_f" 807 | top: "fc2_f" 808 | param { 809 | lr_mult: 1 810 | } 811 | param { 812 | lr_mult: 2 813 | } 814 | inner_product_param { 815 | num_output: 64 816 | weight_filler { 817 | type: "gaussian" 818 | std: 0.001 819 | } 820 | bias_filler { 821 | type: "constant" 822 | value: 1 823 | } 824 | } 825 | } 826 | layer { 827 | name: "relufc2_f" 828 | type: "ReLU" 829 | bottom: "fc2_f" 830 | top: "fc2_f" 831 | } 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | layer { 843 | name: "concat1" 844 | bottom: "conv4_l" 845 | bottom: "conv4_r" 846 | top: "concat1" 847 | type: "Concat" 848 | concat_param { 849 | axis: 1 850 | } 851 | } 852 | 853 | # Layer "FC-E1" 854 | layer { 855 | name: "fc1" 856 | type: "InnerProduct" 857 | bottom: "concat1" 858 | top: "fc1" 859 | param { 860 | lr_mult: 1 861 | } 862 | param { 863 | lr_mult: 2 864 | } 865 | inner_product_param { 866 | num_output: 128 867 | weight_filler { 868 | type: "gaussian" 869 | std: 0.001 870 | } 871 | bias_filler { 872 | type: "constant" 873 | value: 1 874 | } 875 | } 876 | } 877 | layer { 878 | name: "relufc1" 879 | type: "ReLU" 880 | bottom: "fc1" 881 | top: "fc1" 882 | } 883 | layer { 884 | name: "flatten" 885 | type: "Flatten" 886 | bottom: "facegrid" 887 | top: "flatten" 888 | } 889 | 890 | # Layer "FC-FG1" 891 | layer { 892 | name: "fg_fc1" 893 | type: "InnerProduct" 894 | bottom: "flatten" 895 | top: "fg_fc1" 896 | param { 897 | lr_mult: 1 898 | } 899 | param { 900 | lr_mult: 2 901 | } 902 | inner_product_param { 903 | num_output: 256 904 | weight_filler { 905 | type: "gaussian" 906 | std: 0.001 907 | } 908 | bias_filler { 909 | type: "constant" 910 | value: 1 911 | } 912 | } 913 | } 914 | layer { 915 | name: "relufgfc1" 916 | type: "ReLU" 917 | bottom: "fg_fc1" 918 | top: "fg_fc1" 919 | } 920 | 921 | # Layer "FC-FG2" 922 | layer { 923 | name: "fg_fc2" 924 | type: "InnerProduct" 925 | bottom: "fg_fc1" 926 | top: "fg_fc2" 927 | param { 928 | lr_mult: 1 929 | } 930 | param { 931 | lr_mult: 2 932 | } 933 | inner_product_param { 934 | num_output: 128 935 | weight_filler { 936 | type: "gaussian" 937 | std: 0.001 938 | } 939 | bias_filler { 940 | type: "constant" 941 | value: 1 942 | } 943 | } 944 | } 945 | layer { 946 | name: "relufgfc2" 947 | type: "ReLU" 948 | bottom: "fg_fc2" 949 | top: "fg_fc2" 950 | } 951 | 952 | layer { 953 | name: "concat2" 954 | bottom: "fc1" 955 | bottom: "fg_fc2" 956 | bottom: "fc2_f" 957 | top: "concat2" 958 | type: "Concat" 959 | concat_param { 960 | axis: 1 961 | } 962 | } 963 | 964 | # Layer "FC1" 965 | layer { 966 | name: "fc2" 967 | type: "InnerProduct" 968 | bottom: "concat2" 969 | top: "fc2" 970 | param { 971 | lr_mult: 1 972 | } 973 | param { 974 
| lr_mult: 2 975 | } 976 | inner_product_param { 977 | num_output: 128 978 | weight_filler { 979 | type: "gaussian" 980 | std: 0.001 981 | } 982 | bias_filler { 983 | type: "constant" 984 | value: 1 985 | } 986 | } 987 | } 988 | layer { 989 | name: "relufc2" 990 | type: "ReLU" 991 | bottom: "fc2" 992 | top: "fc2" 993 | } 994 | 995 | # Layer "FC2" 996 | layer { 997 | name: "fc3" 998 | type: "InnerProduct" 999 | bottom: "fc2" 1000 | top: "fc3" 1001 | param { 1002 | lr_mult: 1 1003 | } 1004 | param { 1005 | lr_mult: 2 1006 | } 1007 | inner_product_param { 1008 | num_output: 2 1009 | weight_filler { 1010 | type: "gaussian" 1011 | std: 0.001 1012 | } 1013 | bias_filler { 1014 | type: "constant" 1015 | value: 0 1016 | } 1017 | } 1018 | } 1019 | layer { 1020 | name: "loss" 1021 | type: "EuclideanLoss" 1022 | bottom: "fc3" 1023 | bottom: "gaze" 1024 | top: "loss" 1025 | loss_weight: 1 1026 | } 1027 | -------------------------------------------------------------------------------- /models/mean_images/mean_face_224.binaryproto: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/models/mean_images/mean_face_224.binaryproto -------------------------------------------------------------------------------- /models/mean_images/mean_face_224.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/models/mean_images/mean_face_224.mat -------------------------------------------------------------------------------- /models/mean_images/mean_left_224.binaryproto: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/models/mean_images/mean_left_224.binaryproto -------------------------------------------------------------------------------- /models/mean_images/mean_left_224.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/models/mean_images/mean_left_224.mat -------------------------------------------------------------------------------- /models/mean_images/mean_left_224_new.binaryproto: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/models/mean_images/mean_left_224_new.binaryproto -------------------------------------------------------------------------------- /models/mean_images/mean_right_224.binaryproto: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/models/mean_images/mean_right_224.binaryproto -------------------------------------------------------------------------------- /models/mean_images/mean_right_224.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/models/mean_images/mean_right_224.mat -------------------------------------------------------------------------------- /models/snapshots/itracker25x_iter_92000.caffemodel: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/models/snapshots/itracker25x_iter_92000.caffemodel -------------------------------------------------------------------------------- /models/snapshots/itracker_iter_92000.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/models/snapshots/itracker_iter_92000.caffemodel -------------------------------------------------------------------------------- /pytorch/ITrackerData.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | import scipy.io as sio 3 | from PIL import Image 4 | import os 5 | import os.path 6 | import torchvision.transforms as transforms 7 | import torch 8 | import numpy as np 9 | import re 10 | 11 | ''' 12 | Data loader for the iTracker. 13 | Use prepareDataset.py to convert the dataset from http://gazecapture.csail.mit.edu/ to proper format. 14 | 15 | Author: Petr Kellnhofer ( pkel_lnho (at) gmai_l.com // remove underscores and spaces), 2018. 16 | 17 | Website: http://gazecapture.csail.mit.edu/ 18 | 19 | Cite: 20 | 21 | Eye Tracking for Everyone 22 | K.Krafka*, A. Khosla*, P. Kellnhofer, H. Kannan, S. Bhandarkar, W. Matusik and A. Torralba 23 | IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2016 24 | 25 | @inproceedings{cvpr2016_gazecapture, 26 | Author = {Kyle Krafka and Aditya Khosla and Petr Kellnhofer and Harini Kannan and Suchendra Bhandarkar and Wojciech Matusik and Antonio Torralba}, 27 | Title = {Eye Tracking for Everyone}, 28 | Year = {2016}, 29 | Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)} 30 | } 31 | 32 | ''' 33 | 34 | MEAN_PATH = './' 35 | 36 | def loadMetadata(filename, silent = False): 37 | try: 38 | # http://stackoverflow.com/questions/6273634/access-array-contents-from-a-mat-file-loaded-using-scipy-io-loadmat-python 39 | if not silent: 40 | print('\tReading metadata from %s...' % filename) 41 | metadata = sio.loadmat(filename, squeeze_me=True, struct_as_record=False) 42 | except: 43 | print('\tFailed to read the meta file "%s"!' % filename) 44 | return None 45 | return metadata 46 | 47 | class SubtractMean(object): 48 | """Normalize an tensor image with mean. 49 | """ 50 | 51 | def __init__(self, meanImg): 52 | self.meanImg = transforms.ToTensor()(meanImg / 255) 53 | 54 | def __call__(self, tensor): 55 | """ 56 | Args: 57 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 58 | Returns: 59 | Tensor: Normalized image. 60 | """ 61 | return tensor.sub(self.meanImg) 62 | 63 | 64 | class ITrackerData(data.Dataset): 65 | def __init__(self, dataPath, split = 'train', imSize=(224,224), gridSize=(25, 25)): 66 | 67 | self.dataPath = dataPath 68 | self.imSize = imSize 69 | self.gridSize = gridSize 70 | 71 | print('Loading iTracker dataset...') 72 | metaFile = os.path.join(dataPath, 'metadata.mat') 73 | #metaFile = 'metadata.mat' 74 | if metaFile is None or not os.path.isfile(metaFile): 75 | raise RuntimeError('There is no such file %s! Provide a valid dataset path.' % metaFile) 76 | self.metadata = loadMetadata(metaFile) 77 | if self.metadata is None: 78 | raise RuntimeError('Could not read metadata file %s! Provide a valid dataset path.' 
% metaFile) 79 | 80 | self.faceMean = loadMetadata(os.path.join(MEAN_PATH, 'mean_face_224.mat'))['image_mean'] 81 | self.eyeLeftMean = loadMetadata(os.path.join(MEAN_PATH, 'mean_left_224.mat'))['image_mean'] 82 | self.eyeRightMean = loadMetadata(os.path.join(MEAN_PATH, 'mean_right_224.mat'))['image_mean'] 83 | 84 | self.transformFace = transforms.Compose([ 85 | transforms.Resize(self.imSize), 86 | transforms.ToTensor(), 87 | SubtractMean(meanImg=self.faceMean), 88 | ]) 89 | self.transformEyeL = transforms.Compose([ 90 | transforms.Resize(self.imSize), 91 | transforms.ToTensor(), 92 | SubtractMean(meanImg=self.eyeLeftMean), 93 | ]) 94 | self.transformEyeR = transforms.Compose([ 95 | transforms.Resize(self.imSize), 96 | transforms.ToTensor(), 97 | SubtractMean(meanImg=self.eyeRightMean), 98 | ]) 99 | 100 | 101 | if split == 'test': 102 | mask = self.metadata['labelTest'] 103 | elif split == 'val': 104 | mask = self.metadata['labelVal'] 105 | else: 106 | mask = self.metadata['labelTrain'] 107 | 108 | self.indices = np.argwhere(mask)[:,0] 109 | print('Loaded iTracker dataset split "%s" with %d records...' % (split, len(self.indices))) 110 | 111 | def loadImage(self, path): 112 | try: 113 | im = Image.open(path).convert('RGB') 114 | except OSError: 115 | raise RuntimeError('Could not read image: ' + path) 116 | #im = Image.new("RGB", self.imSize, "white") 117 | 118 | return im 119 | 120 | 121 | def makeGrid(self, params): 122 | gridLen = self.gridSize[0] * self.gridSize[1] 123 | grid = np.zeros([gridLen,], np.float32) 124 | 125 | indsY = np.array([i // self.gridSize[0] for i in range(gridLen)]) 126 | indsX = np.array([i % self.gridSize[0] for i in range(gridLen)]) 127 | condX = np.logical_and(indsX >= params[0], indsX < params[0] + params[2]) 128 | condY = np.logical_and(indsY >= params[1], indsY < params[1] + params[3]) 129 | cond = np.logical_and(condX, condY) 130 | 131 | grid[cond] = 1 132 | return grid 133 | 134 | def __getitem__(self, index): 135 | index = self.indices[index] 136 | 137 | imFacePath = os.path.join(self.dataPath, '%05d/appleFace/%05d.jpg' % (self.metadata['labelRecNum'][index], self.metadata['frameIndex'][index])) 138 | imEyeLPath = os.path.join(self.dataPath, '%05d/appleLeftEye/%05d.jpg' % (self.metadata['labelRecNum'][index], self.metadata['frameIndex'][index])) 139 | imEyeRPath = os.path.join(self.dataPath, '%05d/appleRightEye/%05d.jpg' % (self.metadata['labelRecNum'][index], self.metadata['frameIndex'][index])) 140 | 141 | imFace = self.loadImage(imFacePath) 142 | imEyeL = self.loadImage(imEyeLPath) 143 | imEyeR = self.loadImage(imEyeRPath) 144 | 145 | imFace = self.transformFace(imFace) 146 | imEyeL = self.transformEyeL(imEyeL) 147 | imEyeR = self.transformEyeR(imEyeR) 148 | 149 | gaze = np.array([self.metadata['labelDotXCam'][index], self.metadata['labelDotYCam'][index]], np.float32) 150 | 151 | faceGrid = self.makeGrid(self.metadata['labelFaceGrid'][index,:]) 152 | 153 | # to tensor 154 | row = torch.LongTensor([int(index)]) 155 | faceGrid = torch.FloatTensor(faceGrid) 156 | gaze = torch.FloatTensor(gaze) 157 | 158 | return row, imFace, imEyeL, imEyeR, faceGrid, gaze 159 | 160 | 161 | def __len__(self): 162 | return len(self.indices) 163 | -------------------------------------------------------------------------------- /pytorch/ITrackerModel.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import time, math 5 | from collections import OrderedDict 6 | import torch 7 | import 
torch.nn as nn 8 | import torch.nn.parallel 9 | import torch.backends.cudnn as cudnn 10 | import torch.optim 11 | import torch.utils.data 12 | import torchvision.transforms as transforms 13 | import torchvision.datasets as datasets 14 | import torchvision.models as models 15 | import numpy as np 16 | import torch.utils.model_zoo as model_zoo 17 | from torch.autograd.variable import Variable 18 | 19 | ''' 20 | Pytorch model for the iTracker. 21 | 22 | Author: Petr Kellnhofer ( pkel_lnho (at) gmai_l.com // remove underscores and spaces), 2018. 23 | 24 | Website: http://gazecapture.csail.mit.edu/ 25 | 26 | Cite: 27 | 28 | Eye Tracking for Everyone 29 | K.Krafka*, A. Khosla*, P. Kellnhofer, H. Kannan, S. Bhandarkar, W. Matusik and A. Torralba 30 | IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2016 31 | 32 | @inproceedings{cvpr2016_gazecapture, 33 | Author = {Kyle Krafka and Aditya Khosla and Petr Kellnhofer and Harini Kannan and Suchendra Bhandarkar and Wojciech Matusik and Antonio Torralba}, 34 | Title = {Eye Tracking for Everyone}, 35 | Year = {2016}, 36 | Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)} 37 | } 38 | 39 | ''' 40 | 41 | 42 | class ItrackerImageModel(nn.Module): 43 | # Used for both eyes (with shared weights) and the face (with unqiue weights) 44 | def __init__(self): 45 | super(ItrackerImageModel, self).__init__() 46 | self.features = nn.Sequential( 47 | nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0), 48 | nn.ReLU(inplace=True), 49 | nn.MaxPool2d(kernel_size=3, stride=2), 50 | nn.CrossMapLRN2d(size=5, alpha=0.0001, beta=0.75, k=1.0), 51 | nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2, groups=2), 52 | nn.ReLU(inplace=True), 53 | nn.MaxPool2d(kernel_size=3, stride=2), 54 | nn.CrossMapLRN2d(size=5, alpha=0.0001, beta=0.75, k=1.0), 55 | nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1), 56 | nn.ReLU(inplace=True), 57 | nn.Conv2d(384, 64, kernel_size=1, stride=1, padding=0), 58 | nn.ReLU(inplace=True), 59 | 60 | ) 61 | 62 | def forward(self, x): 63 | x = self.features(x) 64 | x = x.view(x.size(0), -1) 65 | return x 66 | 67 | class FaceImageModel(nn.Module): 68 | 69 | def __init__(self): 70 | super(FaceImageModel, self).__init__() 71 | self.conv = ItrackerImageModel() 72 | self.fc = nn.Sequential( 73 | nn.Linear(12*12*64, 128), 74 | nn.ReLU(inplace=True), 75 | nn.Linear(128, 64), 76 | nn.ReLU(inplace=True), 77 | ) 78 | 79 | def forward(self, x): 80 | x = self.conv(x) 81 | x = self.fc(x) 82 | return x 83 | 84 | class FaceGridModel(nn.Module): 85 | # Model for the face grid pathway 86 | def __init__(self, gridSize = 25): 87 | super(FaceGridModel, self).__init__() 88 | self.fc = nn.Sequential( 89 | nn.Linear(gridSize * gridSize, 256), 90 | nn.ReLU(inplace=True), 91 | nn.Linear(256, 128), 92 | nn.ReLU(inplace=True), 93 | ) 94 | 95 | def forward(self, x): 96 | x = x.view(x.size(0), -1) 97 | x = self.fc(x) 98 | return x 99 | 100 | 101 | 102 | class ITrackerModel(nn.Module): 103 | 104 | 105 | def __init__(self): 106 | super(ITrackerModel, self).__init__() 107 | self.eyeModel = ItrackerImageModel() 108 | self.faceModel = FaceImageModel() 109 | self.gridModel = FaceGridModel() 110 | # Joining both eyes 111 | self.eyesFC = nn.Sequential( 112 | nn.Linear(2*12*12*64, 128), 113 | nn.ReLU(inplace=True), 114 | ) 115 | # Joining everything 116 | self.fc = nn.Sequential( 117 | nn.Linear(128+64+128, 128), 118 | nn.ReLU(inplace=True), 119 | nn.Linear(128, 2), 120 | ) 121 | 122 | def forward(self, faces, eyesLeft, eyesRight, 
faceGrids): 123 | # Eye nets 124 | xEyeL = self.eyeModel(eyesLeft) 125 | xEyeR = self.eyeModel(eyesRight) 126 | # Cat and FC 127 | xEyes = torch.cat((xEyeL, xEyeR), 1) 128 | xEyes = self.eyesFC(xEyes) 129 | 130 | # Face net 131 | xFace = self.faceModel(faces) 132 | xGrid = self.gridModel(faceGrids) 133 | 134 | # Cat all 135 | x = torch.cat((xEyes, xFace, xGrid), 1) 136 | x = self.fc(x) 137 | 138 | return x 139 | -------------------------------------------------------------------------------- /pytorch/LICENSE.md: -------------------------------------------------------------------------------- 1 | ### Copyright (c) 2017 - Kyle Krafka, Aditya Khosla, Petr Kellnhofer, Harini Kannan, Suchendra Bhandarkar, Wojciech Matusik, and Antonio Torralba. 2 | 3 | ## LICENSE AGREEMENT FOR USE OF GAZECAPTURE DATABASE AND ITRACKER MODELS 4 | 5 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this License Agreement for Use of GazeCapture Database and iTracker Models ("Research License"). To the extent this Research License may be interpreted as a contract, You are granted the rights mentioned below in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 6 | 7 | ### Section 1 – Definitions 8 | 9 | a. __Licensor__ means the individual(s) or entity(ies) granting rights under this Research License. 10 | 11 | b. __You__ means the individual or entity exercising the Licensed Rights under this Research License. Your has a corresponding meaning. 12 | 13 | c. __Licensed Material__ refers to the GazeCapture database, iTracker models, and any related source. These contain eye-tracking data captured on mobile devices and machine learning models to predict where individuals are looking on these devices. 14 | 15 | ### Section 2 – Scope 16 | 17 | 1. Licensor desires to grant a license to You for the use of the Licensed Material. This license will in no case be considered a transfer of the Licensed Material. 18 | 19 | 2. You shall have no rights with respect to the Licensed Material or any portion thereof and shall not use the Licensed Material except as expressly set forth in this Agreement. 20 | 21 | 3. Subject to the terms and conditions of this Agreement, Licensor hereby grants to You for research use only, a royalty-free, non-exclusive, non-transferable, license subject to the following conditions: 22 | 23 | * The Licensed Material is only for Your research use and, in a need-to-know basis, of those direct research colleagues who belong to the same research institution as You and have adhered to the terms of this license. 24 | 25 | * The Licensed Material will not be copied nor distributed in any form other than for Your backup. 26 | * The Licensed Material will only be used for research purposes and will not be used nor included in commercial applications in any form (such as original files, encrypted files, files containing extracted features, models trained on dataset, other derivative works, etc). 27 | * Any work made public, whatever the form, based directly or indirectly on any part of the Licensed Material must include the following reference: 28 | 29 | > Kyle Krafka, Aditya Khosla, Petr Kellnhofer, Harini Kannan, Suchi Bhandarkar, Wojciech Matusik and Antonio Torralba. “Eye Tracking for Everyone”. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2016. 
30 | 31 | 4. Licensor complies with the State of Massachusetts legislation in force. It is Your, and only yours, to comply with all the data protection laws that may affect You. 32 | 33 | ### Section 3 – Disclaimer of Warranties and Limitation of Liability 34 | a. Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. 35 | 36 | b. To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Research License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. 37 | 38 | c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 39 | 40 | ### Section 4 – Term and Termination 41 | a. If You fail to comply with this Research License, then Your rights under this Research License terminate automatically. 42 | 43 | b. Where Your right to use the Licensed Material has terminated under Section 4(a), it reinstates: 44 | 45 | 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 46 | 47 | 2. upon express reinstatement by the Licensor. 48 | 49 | For the avoidance of doubt, this Section 4(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Research License. 50 | 51 | c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Research License. 52 | 53 | d. Sections 1, 3, 4, 5 and 6 survive termination of this Research License. 54 | 55 | ### Section 5 – Other Terms and Conditions 56 | 57 | a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 58 | 59 | b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Research License. 60 | 61 | ### Section 6 – Interpretation 62 | 63 | a. For the avoidance of doubt, this Research License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Research License. 64 | 65 | b. 
To the extent possible, if any provision of this Research License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Research License without affecting the enforceability of the remaining terms and conditions. 66 | 67 | c. No term or condition of this Research License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 68 | 69 | d. Nothing in this Research License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 70 | 71 | -------------------------------------------------------------------------------- /pytorch/README.md: -------------------------------------------------------------------------------- 1 | # Eye Tracking for Everyone Pytorch re-implementation 2 | 3 | ## Introduction 4 | This is a Pytorch re-implementation of the 2016 CVPR paper, "Eye Tracking for Everyone". 5 | 6 | It is a simplified version without fine tuning and augmentations which may result to lower performance. It is provided for convenience without any guarantee. For original results please refer to the Caffe version of the code which was used for the CVPR 2016 submission. 7 | 8 | * The combined test L2 error of the provided checkpoint is 2.46 cm. That is the display error distance in cm for both iPad and iPhone together. 9 | * The format of dataset for the loader may differ from the dataset provided at http://gazecapture.csail.mit.edu . Please modify the data loader to fit your needs. 10 | 11 | Implemented by Petr Kellnhofer ( https://people.csail.mit.edu/pkellnho/ ). Refer to the main repository https://github.com/CSAILVision/GazeCapture for more info. 12 | 13 | ## How to use: 14 | 15 | ### Dataset preparation 16 | 17 | 1. Download the GazeCapture dataset from http://gazecapture.csail.mit.edu/download.php 18 | 2. Extract the files (including the sub-archives) to a folder A. The resulting structure should be something like this: 19 | ``` 20 | GazeCapture 21 | \--00002 22 | \--frames 23 | \--appleFace.json 24 | \--00003 25 | ``` 26 | 3. Process the dataset using prepareDataset.py: 27 | ``` 28 | python prepareDataset.py --dataset_path [A = where extracted] --output_path [B = where to save new data] 29 | ``` 30 | It should output: 31 | ```` 32 | ====================== 33 | Summary 34 | ====================== 35 | Total added 1490959 frames from 1471 recordings. 36 | There are no missing files. 37 | There are no extra files that were not in the reference dataset. 38 | The new metadata.mat is an exact match to the reference from GitHub (including ordering) 39 | ```` 40 | and create a file structure in the directory B: 41 | ``` 42 | \---00002 43 | \---appleFace 44 | \---00000.jpg 45 | \---appleLeftEye 46 | \---appleRightEye 47 | \---00003 48 | ... 49 | \---metadata.mat 50 | ``` 51 | 52 | ### Training 53 | ``` 54 | python main.py --data_path [path B] --reset 55 | ``` 56 | 57 | ### Testing 58 | ``` 59 | python main.py --data_path [path B] --sink 60 | ``` 61 | 62 | ## History 63 | Any necessary changes to the dataset will be documented here. 64 | 65 | * **January 2019**: A dataset preprocessing code for an easier deployment. A conversion to pytorch 0.4.1. 66 | * **March 2018**: Original code release. 
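## Loading the trained model

A minimal sketch of loading the provided `checkpoint.pth.tar` into `ITrackerModel` and running a forward pass on dummy inputs. Shapes follow `ITrackerData.py`; real crops should additionally go through the mean-subtraction transforms defined there, and the `module.` prefix stripping assumes the checkpoint was saved from the `DataParallel` wrapper used in `main.py`.

```
import torch
from ITrackerModel import ITrackerModel

model = ITrackerModel()
saved = torch.load('checkpoint.pth.tar', map_location='cpu')
# main.py saves the DataParallel state_dict, so keys carry a "module." prefix.
state = {k.replace('module.', '', 1): v for k, v in saved['state_dict'].items()}
model.load_state_dict(state)
model.eval()

face = torch.zeros(1, 3, 224, 224)   # face crop
eyeL = torch.zeros(1, 3, 224, 224)   # left-eye crop
eyeR = torch.zeros(1, 3, 224, 224)   # right-eye crop
grid = torch.zeros(1, 625)           # flattened 25x25 face grid

with torch.no_grad():
    gaze = model(face, eyeL, eyeR, grid)  # (x, y) in cm relative to the camera
print(gaze.shape)  # torch.Size([1, 2])
```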
67 | 68 | ## Terms 69 | Usage of this dataset (including all data, models, and code) is subject to the associated license, found in [LICENSE.md](LICENSE.md). The license permits the use of released code, dataset and models for research purposes only. 70 | 71 | We also ask that you cite the associated paper if you make use of this dataset; following is the BibTeX entry: 72 | 73 | ``` 74 | @inproceedings{cvpr2016_gazecapture, 75 | Author = {Kyle Krafka and Aditya Khosla and Petr Kellnhofer and Harini Kannan and Suchendra Bhandarkar and Wojciech Matusik and Antonio Torralba}, 76 | Title = {Eye Tracking for Everyone}, 77 | Year = {2016}, 78 | Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)} 79 | } 80 | ``` 81 | 82 | ## Code 83 | 84 | Requires CUDA and Python 3.6+ with following packages (exact version may not be necessary): 85 | 86 | * numpy (1.16.4) 87 | * Pillow (6.1.0) 88 | * torch (1.1.0) 89 | * torchfile (0.1.0) 90 | * torchvision (0.3.0a0) 91 | * scipy (1.3.0) 92 | 93 | 94 | ## Contact 95 | 96 | Please email any questions or comments to [gazecapture@gmail.com](mailto:gazecapture@gmail.com). 97 | -------------------------------------------------------------------------------- /pytorch/checkpoint.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/pytorch/checkpoint.pth.tar -------------------------------------------------------------------------------- /pytorch/main.py: -------------------------------------------------------------------------------- 1 | import math, shutil, os, time, argparse 2 | import numpy as np 3 | import scipy.io as sio 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.parallel 8 | import torch.backends.cudnn as cudnn 9 | import torch.optim 10 | import torch.utils.data 11 | import torchvision.transforms as transforms 12 | import torchvision.datasets as datasets 13 | import torchvision.models as models 14 | 15 | from ITrackerData import ITrackerData 16 | from ITrackerModel import ITrackerModel 17 | 18 | ''' 19 | Train/test code for iTracker. 20 | 21 | Author: Petr Kellnhofer ( pkel_lnho (at) gmai_l.com // remove underscores and spaces), 2018. 22 | 23 | Website: http://gazecapture.csail.mit.edu/ 24 | 25 | Cite: 26 | 27 | Eye Tracking for Everyone 28 | K.Krafka*, A. Khosla*, P. Kellnhofer, H. Kannan, S. Bhandarkar, W. Matusik and A. Torralba 29 | IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2016 30 | 31 | @inproceedings{cvpr2016_gazecapture, 32 | Author = {Kyle Krafka and Aditya Khosla and Petr Kellnhofer and Harini Kannan and Suchendra Bhandarkar and Wojciech Matusik and Antonio Torralba}, 33 | Title = {Eye Tracking for Everyone}, 34 | Year = {2016}, 35 | Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)} 36 | } 37 | 38 | ''' 39 | 40 | def str2bool(v): 41 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 42 | return True 43 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 44 | return False 45 | else: 46 | raise argparse.ArgumentTypeError('Boolean value expected.') 47 | 48 | parser = argparse.ArgumentParser(description='iTracker-pytorch-Trainer.') 49 | parser.add_argument('--data_path', help="Path to processed dataset. It should contain metadata.mat. 
Use prepareDataset.py.") 50 | parser.add_argument('--sink', type=str2bool, nargs='?', const=True, default=False, help="Just sink and terminate.") 51 | parser.add_argument('--reset', type=str2bool, nargs='?', const=True, default=False, help="Start from scratch (do not load).") 52 | args = parser.parse_args() 53 | 54 | # Change there flags to control what happens. 55 | doLoad = not args.reset # Load checkpoint at the beginning 56 | doTest = args.sink # Only run test, no training 57 | 58 | workers = 16 59 | epochs = 25 60 | batch_size = torch.cuda.device_count()*100 # Change if out of cuda memory 61 | 62 | base_lr = 0.0001 63 | momentum = 0.9 64 | weight_decay = 1e-4 65 | print_freq = 10 66 | prec1 = 0 67 | best_prec1 = 1e20 68 | lr = base_lr 69 | 70 | count_test = 0 71 | count = 0 72 | 73 | 74 | 75 | def main(): 76 | global args, best_prec1, weight_decay, momentum 77 | 78 | model = ITrackerModel() 79 | model = torch.nn.DataParallel(model) 80 | model.cuda() 81 | imSize=(224,224) 82 | cudnn.benchmark = True 83 | 84 | epoch = 0 85 | if doLoad: 86 | saved = load_checkpoint() 87 | if saved: 88 | print('Loading checkpoint for epoch %05d with loss %.5f (which is the mean squared error not the actual linear error)...' % (saved['epoch'], saved['best_prec1'])) 89 | state = saved['state_dict'] 90 | try: 91 | model.module.load_state_dict(state) 92 | except: 93 | model.load_state_dict(state) 94 | epoch = saved['epoch'] 95 | best_prec1 = saved['best_prec1'] 96 | else: 97 | print('Warning: Could not read checkpoint!') 98 | 99 | 100 | dataTrain = ITrackerData(dataPath = args.data_path, split='train', imSize = imSize) 101 | dataVal = ITrackerData(dataPath = args.data_path, split='test', imSize = imSize) 102 | 103 | train_loader = torch.utils.data.DataLoader( 104 | dataTrain, 105 | batch_size=batch_size, shuffle=True, 106 | num_workers=workers, pin_memory=True) 107 | 108 | val_loader = torch.utils.data.DataLoader( 109 | dataVal, 110 | batch_size=batch_size, shuffle=False, 111 | num_workers=workers, pin_memory=True) 112 | 113 | 114 | criterion = nn.MSELoss().cuda() 115 | 116 | optimizer = torch.optim.SGD(model.parameters(), lr, 117 | momentum=momentum, 118 | weight_decay=weight_decay) 119 | 120 | # Quick test 121 | if doTest: 122 | validate(val_loader, model, criterion, epoch) 123 | return 124 | 125 | for epoch in range(0, epoch): 126 | adjust_learning_rate(optimizer, epoch) 127 | 128 | for epoch in range(epoch, epochs): 129 | adjust_learning_rate(optimizer, epoch) 130 | 131 | # train for one epoch 132 | train(train_loader, model, criterion, optimizer, epoch) 133 | 134 | # evaluate on validation set 135 | prec1 = validate(val_loader, model, criterion, epoch) 136 | 137 | # remember best prec@1 and save checkpoint 138 | is_best = prec1 < best_prec1 139 | best_prec1 = min(prec1, best_prec1) 140 | save_checkpoint({ 141 | 'epoch': epoch + 1, 142 | 'state_dict': model.state_dict(), 143 | 'best_prec1': best_prec1, 144 | }, is_best) 145 | 146 | 147 | def train(train_loader, model, criterion,optimizer, epoch): 148 | global count 149 | batch_time = AverageMeter() 150 | data_time = AverageMeter() 151 | losses = AverageMeter() 152 | 153 | # switch to train mode 154 | model.train() 155 | 156 | end = time.time() 157 | 158 | for i, (row, imFace, imEyeL, imEyeR, faceGrid, gaze) in enumerate(train_loader): 159 | 160 | # measure data loading time 161 | data_time.update(time.time() - end) 162 | imFace = imFace.cuda() 163 | imEyeL = imEyeL.cuda() 164 | imEyeR = imEyeR.cuda() 165 | faceGrid = faceGrid.cuda() 166 | gaze = 

        imFace = torch.autograd.Variable(imFace, requires_grad = True)
        imEyeL = torch.autograd.Variable(imEyeL, requires_grad = True)
        imEyeR = torch.autograd.Variable(imEyeR, requires_grad = True)
        faceGrid = torch.autograd.Variable(faceGrid, requires_grad = True)
        gaze = torch.autograd.Variable(gaze, requires_grad = False)

        # compute output
        output = model(imFace, imEyeL, imEyeR, faceGrid)

        loss = criterion(output, gaze)

        losses.update(loss.data.item(), imFace.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        count = count + 1

        print('Epoch (train): [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
               epoch, i, len(train_loader), batch_time=batch_time,
               data_time=data_time, loss=losses))

def validate(val_loader, model, criterion, epoch):
    global count_test
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    lossesLin = AverageMeter()

    # switch to evaluate mode
    model.eval()
    end = time.time()


    oIndex = 0
    for i, (row, imFace, imEyeL, imEyeR, faceGrid, gaze) in enumerate(val_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        imFace = imFace.cuda()
        imEyeL = imEyeL.cuda()
        imEyeR = imEyeR.cuda()
        faceGrid = faceGrid.cuda()
        gaze = gaze.cuda()

        imFace = torch.autograd.Variable(imFace, requires_grad = False)
        imEyeL = torch.autograd.Variable(imEyeL, requires_grad = False)
        imEyeR = torch.autograd.Variable(imEyeR, requires_grad = False)
        faceGrid = torch.autograd.Variable(faceGrid, requires_grad = False)
        gaze = torch.autograd.Variable(gaze, requires_grad = False)

        # compute output
        with torch.no_grad():
            output = model(imFace, imEyeL, imEyeR, faceGrid)

        loss = criterion(output, gaze)

        lossLin = output - gaze
        lossLin = torch.mul(lossLin, lossLin)
        lossLin = torch.sum(lossLin, 1)
        lossLin = torch.mean(torch.sqrt(lossLin))

        losses.update(loss.data.item(), imFace.size(0))
        lossesLin.update(lossLin.item(), imFace.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()


        print('Epoch (val): [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Error L2 {lossLin.val:.4f} ({lossLin.avg:.4f})\t'.format(
               epoch, i, len(val_loader), batch_time=batch_time,
               loss=losses, lossLin=lossesLin))

    return lossesLin.avg

CHECKPOINTS_PATH = '.'

def load_checkpoint(filename='checkpoint.pth.tar'):
    filename = os.path.join(CHECKPOINTS_PATH, filename)
    print(filename)
    if not os.path.isfile(filename):
        return None
    state = torch.load(filename)
    return state

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    if not os.path.isdir(CHECKPOINTS_PATH):
        os.makedirs(CHECKPOINTS_PATH, 0o777)
    bestFilename = os.path.join(CHECKPOINTS_PATH, 'best_' + filename)
    filename = os.path.join(CHECKPOINTS_PATH, filename)
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, bestFilename)


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = base_lr * (0.1 ** (epoch // 30))
    # Update the live param groups; optimizer.state_dict() returns a copy, so
    # writing into it would silently leave the learning rate unchanged.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


if __name__ == "__main__":
    main()
    print('DONE')

--------------------------------------------------------------------------------
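A note on the numbers `main.py` prints: `Loss` is the MSE criterion being optimized, while `Error L2` — what `validate()` returns and what drives `is_best` in `main()` — is the mean Euclidean distance between predicted and ground-truth dot positions, i.e. an error in the units of `labelDotXCam`/`labelDotYCam`. A toy snippet with made-up numbers spells out that computation:

```
# Sketch of the 'Error L2' metric computed in validate(): the mean Euclidean
# distance between predicted and true 2-D gaze points. Values are hypothetical.
import torch

output = torch.tensor([[1.0, 2.0], [0.0, 0.0]])  # model predictions (x, y)
gaze   = torch.tensor([[1.0, 5.0], [3.0, 4.0]])  # ground truth (x, y)

diff = output - gaze
errL2 = torch.sqrt((diff * diff).sum(dim=1)).mean()  # per-sample L2, then averaged
print(errL2)  # tensor(4.) here: the distances are 3 and 5, so the mean is 4
```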
--------------------------------------------------------------------------------
/pytorch/mean_face_224.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/pytorch/mean_face_224.mat
--------------------------------------------------------------------------------
/pytorch/mean_left_224.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/pytorch/mean_left_224.mat
--------------------------------------------------------------------------------
/pytorch/mean_right_224.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/pytorch/mean_right_224.mat
--------------------------------------------------------------------------------
/pytorch/prepareDataset.py:
--------------------------------------------------------------------------------
import math, shutil, os, time, argparse, json, re, sys
import numpy as np
import scipy.io as sio
from PIL import Image


'''
Prepares the GazeCapture dataset for use with the pytorch code. Crops images, compiles JSONs into metadata.mat

Author: Petr Kellnhofer ( pkel_lnho (at) gmai_l.com // remove underscores and spaces), 2018.

Website: http://gazecapture.csail.mit.edu/

Cite:

Eye Tracking for Everyone
K.Krafka*, A. Khosla*, P. Kellnhofer, H. Kannan, S. Bhandarkar, W. Matusik and A. Torralba
IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2016

@inproceedings{cvpr2016_gazecapture,
  Author = {Kyle Krafka and Aditya Khosla and Petr Kellnhofer and Harini Kannan and Suchendra Bhandarkar and Wojciech Matusik and Antonio Torralba},
  Title = {Eye Tracking for Everyone},
  Year = {2016},
  Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}
}

'''

parser = argparse.ArgumentParser(description='iTracker-pytorch-PrepareDataset.')
parser.add_argument('--dataset_path', help="Path to extracted files. It should have folders called '%%05d' in it.")
parser.add_argument('--output_path', default=None, help="Where to write the output. Can be the same as dataset_path if you wish (=default).")
args = parser.parse_args()



def main():
    if args.output_path is None:
        args.output_path = args.dataset_path

    if args.dataset_path is None or not os.path.isdir(args.dataset_path):
        raise RuntimeError('No such dataset folder %s!' % args.dataset_path)

    preparePath(args.output_path)

    # list recordings
    recordings = os.listdir(args.dataset_path)
    recordings = np.array(recordings, np.object)
    recordings = recordings[[os.path.isdir(os.path.join(args.dataset_path, r)) for r in recordings]]
    recordings.sort()

    # Output structure
    meta = {
        'labelRecNum': [],
        'frameIndex': [],
        'labelDotXCam': [],
        'labelDotYCam': [],
        'labelFaceGrid': [],
    }

    for i,recording in enumerate(recordings):
        print('[%d/%d] Processing recording %s (%.2f%%)' % (i, len(recordings), recording, i / len(recordings) * 100))
        recDir = os.path.join(args.dataset_path, recording)
        recDirOut = os.path.join(args.output_path, recording)

        # Read JSONs
        appleFace = readJson(os.path.join(recDir, 'appleFace.json'))
        if appleFace is None:
            continue
        appleLeftEye = readJson(os.path.join(recDir, 'appleLeftEye.json'))
        if appleLeftEye is None:
            continue
        appleRightEye = readJson(os.path.join(recDir, 'appleRightEye.json'))
        if appleRightEye is None:
            continue
        dotInfo = readJson(os.path.join(recDir, 'dotInfo.json'))
        if dotInfo is None:
            continue
        faceGrid = readJson(os.path.join(recDir, 'faceGrid.json'))
        if faceGrid is None:
            continue
        frames = readJson(os.path.join(recDir, 'frames.json'))
        if frames is None:
            continue
        # info = readJson(os.path.join(recDir, 'info.json'))
        # if info is None:
        #     continue
        # screen = readJson(os.path.join(recDir, 'screen.json'))
        # if screen is None:
        #     continue

        facePath = preparePath(os.path.join(recDirOut, 'appleFace'))
        leftEyePath = preparePath(os.path.join(recDirOut, 'appleLeftEye'))
        rightEyePath = preparePath(os.path.join(recDirOut, 'appleRightEye'))

        # Preprocess
        allValid = np.logical_and(np.logical_and(appleFace['IsValid'], appleLeftEye['IsValid']), np.logical_and(appleRightEye['IsValid'], faceGrid['IsValid']))
        if not np.any(allValid):
            continue

        frames = np.array([int(re.match('(\d{5})\.jpg$', x).group(1)) for x in frames])

        bboxFromJson = lambda data: np.stack((data['X'], data['Y'], data['W'],data['H']), axis=1).astype(int)
        faceBbox = bboxFromJson(appleFace) + [-1,-1,1,1] # for compatibility with matlab code
        leftEyeBbox = bboxFromJson(appleLeftEye) + [0,-1,0,0]
        rightEyeBbox = bboxFromJson(appleRightEye) + [0,-1,0,0]
        leftEyeBbox[:,:2] += faceBbox[:,:2] # relative to face
        rightEyeBbox[:,:2] += faceBbox[:,:2]
        faceGridBbox = bboxFromJson(faceGrid)


        for j,frame in enumerate(frames):
            # Can we use it?
            if not allValid[j]:
                continue

            # Load image
            imgFile = os.path.join(recDir, 'frames', '%05d.jpg' % frame)
            if not os.path.isfile(imgFile):
                logError('Warning: Could not read image file %s!' % imgFile)
                continue
            img = Image.open(imgFile)
            if img is None:
                logError('Warning: Could not read image file %s!' % imgFile)
                continue
            img = np.array(img.convert('RGB'))

            # Crop images
            imFace = cropImage(img, faceBbox[j,:])
            imEyeL = cropImage(img, leftEyeBbox[j,:])
            imEyeR = cropImage(img, rightEyeBbox[j,:])

            # Save images
            Image.fromarray(imFace).save(os.path.join(facePath, '%05d.jpg' % frame), quality=95)
            Image.fromarray(imEyeL).save(os.path.join(leftEyePath, '%05d.jpg' % frame), quality=95)
            Image.fromarray(imEyeR).save(os.path.join(rightEyePath, '%05d.jpg' % frame), quality=95)

            # Collect metadata
            meta['labelRecNum'] += [int(recording)]
            meta['frameIndex'] += [frame]
            meta['labelDotXCam'] += [dotInfo['XCam'][j]]
            meta['labelDotYCam'] += [dotInfo['YCam'][j]]
            meta['labelFaceGrid'] += [faceGridBbox[j,:]]


    # Integrate
    meta['labelRecNum'] = np.stack(meta['labelRecNum'], axis = 0).astype(np.int16)
    meta['frameIndex'] = np.stack(meta['frameIndex'], axis = 0).astype(np.int32)
    meta['labelDotXCam'] = np.stack(meta['labelDotXCam'], axis = 0)
    meta['labelDotYCam'] = np.stack(meta['labelDotYCam'], axis = 0)
    meta['labelFaceGrid'] = np.stack(meta['labelFaceGrid'], axis = 0).astype(np.uint8)

    # Load reference metadata
    print('Will compare to the reference GitHub dataset metadata.mat...')
    reference = sio.loadmat('./reference_metadata.mat', struct_as_record=False)
    reference['labelRecNum'] = reference['labelRecNum'].flatten()
    reference['frameIndex'] = reference['frameIndex'].flatten()
    reference['labelDotXCam'] = reference['labelDotXCam'].flatten()
    reference['labelDotYCam'] = reference['labelDotYCam'].flatten()
    reference['labelTrain'] = reference['labelTrain'].flatten()
    reference['labelVal'] = reference['labelVal'].flatten()
    reference['labelTest'] = reference['labelTest'].flatten()

    # Find mapping
    mKey = np.array(['%05d_%05d' % (rec, frame) for rec, frame in zip(meta['labelRecNum'], meta['frameIndex'])], np.object)
    rKey = np.array(['%05d_%05d' % (rec, frame) for rec, frame in zip(reference['labelRecNum'], reference['frameIndex'])], np.object)
    mIndex = {k: i for i,k in enumerate(mKey)}
    rIndex = {k: i for i,k in enumerate(rKey)}
    mToR = np.zeros((len(mKey,)),int) - 1
    for i,k in enumerate(mKey):
        if k in rIndex:
            mToR[i] = rIndex[k]
        else:
            logError('Did not find rec_frame %s from the new dataset in the reference dataset!' % k)
    rToM = np.zeros((len(rKey,)),int) - 1
    for i,k in enumerate(rKey):
        if k in mIndex:
            rToM[i] = mIndex[k]
        else:
            logError('Did not find rec_frame %s from the reference dataset in the new dataset!' % k, critical = False)
            #break

    # Copy split from reference
    meta['labelTrain'] = np.zeros((len(meta['labelRecNum'],)),np.bool)
    meta['labelVal'] = np.ones((len(meta['labelRecNum'],)),np.bool) # default choice
    meta['labelTest'] = np.zeros((len(meta['labelRecNum'],)),np.bool)

    validMappingMask = mToR >= 0
    meta['labelTrain'][validMappingMask] = reference['labelTrain'][mToR[validMappingMask]]
    meta['labelVal'][validMappingMask] = reference['labelVal'][mToR[validMappingMask]]
    meta['labelTest'][validMappingMask] = reference['labelTest'][mToR[validMappingMask]]

    # Write out metadata
    metaFile = os.path.join(args.output_path, 'metadata.mat')
    print('Writing out the metadata.mat to %s...' % metaFile)
    sio.savemat(metaFile, meta)

    # Statistics
    nMissing = np.sum(rToM < 0)
    nExtra = np.sum(mToR < 0)
    totalMatch = len(mKey) == len(rKey) and np.all(np.equal(mKey, rKey))
    print('======================\n\tSummary\n======================')
    print('Total added %d frames from %d recordings.' % (len(meta['frameIndex']), len(np.unique(meta['labelRecNum']))))
    if nMissing > 0:
        print('There are %d frames missing in the new dataset. This may affect the results. Check the log to see which files are missing.' % nMissing)
    else:
        print('There are no missing files.')
    if nExtra > 0:
        print('There are %d extra frames in the new dataset. This is generally ok as they were marked for validation split only.' % nExtra)
    else:
        print('There are no extra files that were not in the reference dataset.')
    if totalMatch:
        print('The new metadata.mat is an exact match to the reference from GitHub (including ordering)')

    #import pdb; pdb.set_trace()
    input("Press Enter to continue...")




def readJson(filename):
    if not os.path.isfile(filename):
        logError('Warning: No such file %s!' % filename)
        return None

    with open(filename) as f:
        try:
            data = json.load(f)
        except:
            data = None

    if data is None:
        logError('Warning: Could not read file %s!' % filename)
        return None

    return data

def preparePath(path, clear = False):
    if not os.path.isdir(path):
        os.makedirs(path, 0o777)
    if clear:
        files = os.listdir(path)
        for f in files:
            fPath = os.path.join(path, f)
            if os.path.isdir(fPath):
                shutil.rmtree(fPath)
            else:
                os.remove(fPath)

    return path

def logError(msg, critical = False):
    print(msg)
    if critical:
        sys.exit(1)


def cropImage(img, bbox):
    bbox = np.array(bbox, int)

    aSrc = np.maximum(bbox[:2], 0)
    bSrc = np.minimum(bbox[:2] + bbox[2:], (img.shape[1], img.shape[0]))

    aDst = aSrc - bbox[:2]
    bDst = aDst + (bSrc - aSrc)

    res = np.zeros((bbox[3], bbox[2], img.shape[2]), img.dtype)
    res[aDst[1]:bDst[1],aDst[0]:bDst[0],:] = img[aSrc[1]:bSrc[1],aSrc[0]:bSrc[0],:]

    return res


if __name__ == "__main__":
    main()
    print('DONE')

--------------------------------------------------------------------------------
/pytorch/reference_metadata.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSAILVision/GazeCapture/e4af90da3c4c9068fbce433fa648d1f9bc7a394b/pytorch/reference_metadata.mat
--------------------------------------------------------------------------------
/pytorch/requirements.txt:
--------------------------------------------------------------------------------
numpy==1.16.4
Pillow==8.2.0
pkg-resources==0.0.0
scipy==1.3.0
six==1.12.0
torch==1.1.0
torchfile==0.1.0
torchvision==0.3.0a0

--------------------------------------------------------------------------------
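After `prepareDataset.py` finishes, the `metadata.mat` it writes (the file `main.py` expects under `--data_path`) can be sanity-checked with the same `scipy`/`numpy` stack pinned in `requirements.txt`. A short sketch, with `'<output_path>'` standing in for whatever `--output_path` was used:

```
# Sketch: inspect the metadata.mat produced by prepareDataset.py.
# '<output_path>' is a placeholder for the --output_path given above.
import numpy as np
import scipy.io as sio

meta = sio.loadmat('<output_path>/metadata.mat', squeeze_me=True)

print(meta['labelRecNum'].shape, meta['frameIndex'].shape)  # one entry per valid frame
print(meta['labelFaceGrid'].shape)                          # N x 4 face-grid boxes (X, Y, W, H)
for split in ('labelTrain', 'labelVal', 'labelTest'):
    print(split, int(np.sum(meta[split])))                  # frames per split, copied from the reference
```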