├── .babelrc ├── .eslintrc ├── .gitignore ├── README.md ├── config └── .gitkeep ├── functions ├── alerting │ └── stats-health.js ├── api │ ├── alexa-ranking.js │ ├── color-bar.js │ ├── color-detection.js │ ├── css-stats.js │ ├── html-stats.js │ ├── http-headers.js │ ├── latest-screenshot.js │ ├── lighthouse.js │ ├── optimize-image.js │ └── technology-detection.js └── daily │ ├── alexa-ranking.js │ ├── color-bar.js │ ├── color-detection.js │ ├── convert-images.js │ ├── css-stats.js │ ├── html-stats.js │ ├── http-headers.js │ ├── lighthouse.js │ └── technology-detection.js ├── layers ├── canvas-lib64-layer.zip └── curl-settings │ └── nodejs │ ├── curl-format.json │ ├── package-lock.json │ └── package.json ├── package-lock.json ├── package.json ├── serverless.yml ├── utils └── index.js └── webpack.config.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | ["@babel/env", {"modules": false}] 4 | ] 5 | } -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "rules": { 3 | // common 4 | "indent": ["warn", 2, {"SwitchCase": 1}], // specify tab or space width for your code 5 | "quotes": ["error", "single", { "allowTemplateLiterals": true }], // specify whether backticks, double or single quotes should be used 6 | "linebreak-style": ["error", "unix"], // disallow mixed 'LF' and 'CRLF' as linebreaks 7 | "semi": ["error", "always"], // require or disallow use of semicolons instead of ASI 8 | // errors 9 | "no-extra-parens": ["error", "functions"], // disallow unnecessary parentheses 10 | "no-unexpected-multiline": ["error"], // Avoid code that looks like two expressions but is actually one 11 | "valid-jsdoc": ["warn"], // Ensure JSDoc comments are valid 12 | // best practices 13 | "block-scoped-var": ["error"], // treat var statements as if they were block scoped 14 | "curly": 
["error", "all"], // specify curly brace conventions for all control statements 15 | "default-case": ["error"], // require default case in switch statements 16 | "dot-notation": ["error"], // encourages use of dot notation whenever possible 17 | "dot-location": ["error", "property"], // enforces consistent newlines before or after dots 18 | "eqeqeq": ["error", "smart"], // require the use of === and !== 19 | "guard-for-in": ["error"], // make sure for-in loops have an if statement 20 | "no-alert": ["error"], // disallow the use of alert, confirm, and prompt 21 | "no-caller": ["error"], // disallow use of arguments.caller or arguments.callee 22 | "no-eq-null": ["error"], // disallow comparisons to null without a type-checking operator 23 | "no-eval": ["error"], // disallow use of eval() 24 | "no-extend-native": ["error"], // disallow adding to native types 25 | "no-extra-bind": ["error"], // disallow unnecessary function binding 26 | "no-fallthrough": ["error"], // disallow fallthrough of case statements (recommended) 27 | "no-floating-decimal": ["error"], // disallow the use of leading or trailing decimal points in numeric literals 28 | "no-implied-eval": ["error"], // disallow use of eval()-like methods 29 | "no-iterator": ["error"], // disallow usage of __iterator__ property 30 | "no-labels": ["error"], // disallow use of labeled statements 31 | "no-lone-blocks": ["error"], // disallow unnecessary nested blocks 32 | "no-loop-func": ["error"], // disallow creation of functions within loops 33 | "no-multi-spaces": ["error"], // disallow use of multiple spaces 34 | "no-multi-str": ["error"], // disallow use of multiline strings 35 | "no-native-reassign": ["error"], // disallow reassignments of native objects 36 | "no-new-func": ["error"], // disallow use of new operator for Function object 37 | "no-new-wrappers": ["error"], // disallows creating new instances of String,Number, and Boolean 38 | "no-new": ["error"], // disallow use of the new operator when not part of 
an assignment or comparison 39 | "no-octal-escape": ["error"], // disallow use of octal escape sequences in string literals, such as var foo = "Copyright \251"; 40 | "no-octal": ["error"], // disallow use of octal literals (recommended) 41 | "no-param-reassign": ["warn", {"props": false}], // disallow reassignment of function parameters 42 | "no-proto": ["error"], // disallow usage of __proto__ property 43 | "no-redeclare": ["error", {"builtinGlobals": true}], // disallow declaring the same variable more than once (recommended) 44 | "no-return-assign": ["error"], // disallow use of assignment in return statement 45 | "no-script-url": ["error"], // disallow use of javascript: urls. 46 | "no-self-compare": ["error"], // disallow comparisons where both sides are exactly the same 47 | "no-sequences": ["error"], // disallow use of the comma operator 48 | "no-throw-literal": ["error"], // restrict what can be thrown as an exception 49 | "no-unused-expressions": ["error"], // disallow usage of expressions in statement position 50 | "no-useless-call": ["error"], // disallow unnecessary .call() and .apply() 51 | "no-useless-concat": ["error"], // disallow unnecessary concatenation of literals or template literals 52 | "no-void": ["error"], // disallow use of the void operator 53 | "no-with": ["error"], // disallow use of the with statement 54 | "radix": ["error"], // require use of the second argument for parseInt() 55 | "wrap-iife": ["error"], // require immediate function invocation to be wrapped in parentheses 56 | // Variables 57 | "no-delete-var": ["error"], // disallow deletion of variables (recommended) 58 | "no-label-var": ["error"], // disallow labels that share a name with a variable 59 | "no-shadow-restricted-names": ["error"], // disallow shadowing of names such as arguments 60 | "no-undef-init": ["error"], // disallow use of undefined when initializing variables 61 | "no-undef": ["error"], // disallow use of undeclared variables unless mentioned in a
/*global */ block (recommended) 62 | "no-unused-vars": ["error"], // disallow declaration of variables that are not used in the code (recommended) 63 | // nodejs 64 | "callback-return": ["error", ["callback", "cb", "next"]], // enforce return after a callback 65 | "handle-callback-err": ["error", "^(err\\d?|error\\d?|^.+Err$|^.+Error$)$"], // enforce error handling in callbacks 66 | "no-mixed-requires": ["error", false], // disallow mixing regular variable and require declarations 67 | "no-new-require": ["error"], // disallow use of new operator with the require function 68 | "no-path-concat": ["error"], // disallow string concatenation with __dirname and __filename 69 | "no-sync": ["error"], // disallow use of synchronous methods 70 | // Stylistic 71 | "array-bracket-spacing": ["warn", "never"], // enforce spacing inside array brackets 72 | "block-spacing": ["warn", "never"], // disallow or enforce spaces inside of single line blocks 73 | "brace-style": ["warn", "1tbs", { "allowSingleLine": false }], // enforce one true brace style 74 | "camelcase": ["warn", {"properties": "always"}], // require camel case names 75 | "comma-spacing": ["warn", {"before": false, "after": true}], // enforce spacing before and after comma 76 | "comma-style": ["warn", "last"], // enforce one true comma style 77 | "computed-property-spacing": ["warn", "never"], // require or disallow padding inside computed properties 78 | "consistent-this": ["warn", "self"], // enforce consistent naming when capturing the current execution context 79 | "eol-last": ["off"], // enforce newline at the end of file, with no multiple empty lines 80 | "key-spacing": ["warn", {"beforeColon": false, "afterColon": true}], // enforce spacing between keys and values in object literal properties 81 | "max-nested-callbacks": ["warn", 6], // specify the maximum depth callbacks can be nested 82 | "new-cap": ["warn", {"capIsNewExceptions": ["Router"]}], // require a capital letter for constructors 83 | "new-parens": 
["warn"], // disallow the omission of parentheses when invoking a constructor with no arguments 84 | "newline-after-var": ["off", "always"], // require or disallow an empty newline after variable declarations 85 | "no-array-constructor": ["warn"], // disallow use of the Array constructor 86 | "no-lonely-if": ["warn"], // disallow if as the only statement in an else block 87 | "no-mixed-spaces-and-tabs": ["error"], // disallow mixed spaces and tabs for indentation (recommended) 88 | "no-multiple-empty-lines": ["warn", {"max": 1}], // disallow multiple empty lines 89 | "no-nested-ternary": ["warn"], // disallow nested ternary expressions 90 | "no-new-object": ["warn"], // disallow the use of the Object constructor 91 | "no-spaced-func": ["warn"], // disallow space between function identifier and application 92 | "no-trailing-spaces": ["warn", { "skipBlankLines": false }], // disallow trailing whitespace at the end of lines 93 | "no-unneeded-ternary": ["error"], // disallow the use of Boolean literals in conditional expressions 94 | "operator-linebreak": ["warn", "after"], // enforce operators to be placed before or after line breaks 95 | "quote-props": ["error", "as-needed"], // require quotes around object literal property names 96 | "semi-spacing": ["warn", {"before": false, "after": true}], // enforce spacing before and after semicolons 97 | "space-in-parens": ["warn", "never"], // require or disallow spaces inside parentheses 98 | "space-unary-ops": ["warn", {"words": true, "nonwords": false}], // require or disallow spaces before/after unary operators 99 | "spaced-comment": ["warn", "always"], // require or disallow a space immediately following the // or /* in a comment 100 | "wrap-regex": ["warn"], // require regex literals to be wrapped in parentheses 101 | "no-console": ["off"] 102 | }, 103 | "parserOptions": { 104 | "ecmaVersion": 2018, 105 | "sourceType": "module" 106 | }, 107 | "env": { 108 | "node": true, 109 | "es6": true, 110 | "mocha": true 111 | }, 
112 | "extends": "eslint:recommended", 113 | "globals": { 114 | "should": true 115 | } 116 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Optional REPL history 57 | .node_repl_history 58 | 59 | # Output of 'npm pack' 60 | *.tgz 61 | 62 | # Yarn Integrity file 63 | .yarn-integrity 64 | 65 | # dotenv environment variables file 66 | .env 67 | .env.test 68 | 69 | # parcel-bundler cache (https://parceljs.org/) 70 | .cache 71 | 72 | # next.js build output 73 | .next 74 | 75 | # nuxt.js build output 76 | .nuxt 77 | 78 | # vuepress build output 79 | .vuepress/dist 80 | 81 | # Serverless directories 82 | .serverless/ 83 | 84 | # FuseBox cache 85 | .fusebox/ 86 | 87 | # DynamoDB Local files 88 | .dynamodb/ 89 | 90 | labs/ 
91 | tmp/ 92 | 93 | config/* 94 | 95 | # keep serverless configuration folder 96 | !config/.gitkeep 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # webcodex 2 | A set of functions to collect data and metrics from websites. 3 | 4 | ## Architecture 5 | The idea behind all functions is to act as thin layers to accommodate some input parameters, do some action and transform the result to satisfy some output (or to have a specific side-effect, like daily functions). Most of the functions are designed to focus on a single metric but there are a few exceptions (like the color detection). 6 | ### HTTP functions 7 | 8 |

9 | 10 |

11 | 12 | ### Daily functions 13 | These functions are persisting the metrics into DynamoDB and are triggered by an SNS message originated externally. 14 |

15 | 16 |

17 | 18 | ## Project structure 19 | There are two important types of functions included in this project: functions that run on a daily basis and functions that are triggered by HTTP requests. The goal is to be able to re-use as much shared logic as possible between these two different types. The `utils` module is where the shared logic between these two lives. In order to bundle just the dependencies needed for a given function the project uses Webpack and defines the necessary imports on a per-function basis, passing these dependencies back to the `utils` module. 20 | 21 | ## Metrics 22 | ### Colors defined in the CSS 23 | To get the data for this metric a request is made to the website and all the stylesheets are aggregated. After that, the following regular expression is used to match hexadecimal (e.g. both `#AAA` and `#AAAAAA`), rgb(a) and hsl(a) definitions: 24 | ``` 25 | /(#([\da-f]{3}){1,2}|(rgb|hsl)a\((\d{1,3}%?,\s?){3}(1|0?\.\d+)\)|(rgb|hsl)\(\d{1,3}%?(,\s?\d{1,3}%?){2}\))/ig 26 | ``` 27 | Finally after the list of colors is obtained the list is sorted using [color-sorter](https://github.com/bartveneman/color-sorter). 28 | 29 | ### Color detection 30 | This function uses a headless browser to capture a screenshot and utilizes imagemagick to extract predominant colors from a 50x50 matrix. 31 | ### HTML 32 | This function traverses the DOM tree and aggregates the frequency of DOM elements and of attributes. 33 | ### CSS 34 | The function first tries to retrieve the CSS content of the website using [axios](https://github.com/axios/axios) and if that fails then retries using cURL with a specific set of headers (due to browser fingerprinting). After getting the whole CSS content of the website the function uses Project Wallace's [css-analyzer](https://github.com/projectwallace/css-analyzer) to get the CSS metrics. 35 | ### HTTP Headers 36 | The purpose of this function is to collect the HTTP headers using cURL.
37 | ### Lighthouse 38 | The function spins up a headless browser to then connect Lighthouse to it and run basic audits. After getting the results back, it uploads the results to GitHub gists and returns the 39 | ### Technologies detection 40 | The function focuses on identifying libraries or frameworks that are loaded in a website. To achieve this, the function runs a headless browser and collects all the `