├── .editorconfig ├── .eslintrc.json ├── .github ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── FUNDING.yml ├── ISSUE_TEMPLATE.md ├── ISSUE_TEMPLATE │ └── feature_request.md ├── LICENSE.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ └── lint-and-run.yml ├── .gitignore ├── .release-it.json ├── .vim └── coc-settings.json ├── .vscode ├── dataproofer.code-workspace └── extensions.json ├── README.md ├── package.json ├── packages ├── core-suite │ ├── README.md │ ├── index.js │ ├── package.json │ └── src │ │ ├── checkDuplicateRows.js │ │ ├── columnsContainNothing.js │ │ ├── columnsContainsSpecialChars.js │ │ ├── maxBigInteger.js │ │ ├── maxInteger.js │ │ ├── maxSmallInteger.js │ │ ├── maxSummedInteger.js │ │ ├── numberOfRowsIs65k.js │ │ └── stringsHaveExactly255Characters.js ├── dataproofertest-js │ ├── README.md │ ├── index.js │ ├── package.json │ └── util.js ├── geo-suite │ ├── README.md │ ├── index.js │ ├── package.json │ └── src │ │ ├── invalidLngLat.js │ │ └── voidLngLat.js ├── info-suite │ ├── README.md │ ├── index.js │ ├── package.json │ └── src │ │ ├── columnsContainNumbers.js │ │ └── columnsContainStrings.js └── stats-suite │ ├── README.md │ ├── index.js │ ├── package.json │ └── src │ ├── medianAbsoluteDeviationOutliers.js │ └── standardDeviationOutliers.js ├── sample-datasets ├── 65k.csv ├── bad-encoding-census.csv ├── bad-encoding-countries.csv ├── ballpark-prices.csv ├── cigarette-sales.csv ├── darknet-passport-prices.csv ├── defense-contracts.csv ├── foreignfighters.csv ├── geo-test.csv ├── isis-attack-sites.csv ├── killed-by-police-2014-2015.csv ├── lab-animals-by-state.csv ├── large_tweet_data.csv ├── max-integer-test.csv ├── nc-travel-bans.csv ├── nhl-fighting.csv ├── sf-police-salaries.csv ├── sf-police-salaries.psv ├── sf-police-salaries.tsv ├── sf-police-salaries.xls ├── sf-police-salaries.xlsx ├── silk-road-arrests.csv ├── snake-oil-supplements.csv ├── snl-cast-members.csv ├── state-judicial-elections-donations.csv ├── 
state_table.csv ├── un-presidents.csv ├── vegas-hotel-prices.csv └── weapons-traded-between-countries.csv ├── src ├── README.md ├── index.js ├── package.json ├── processing.js └── rendering.js └── yarn.lock /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = space 6 | indent_size = 2 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | 12 | [*.md] 13 | trim_trailing_whitespace = false 14 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "parser": "@babel/eslint-parser", 4 | "plugins": [ 5 | "@babel", 6 | "prettier" 7 | ], 8 | "parserOptions": { 9 | "ecmaVersion": 12, 10 | "requireConfigFile": false 11 | }, 12 | "env": { 13 | "es6": true, 14 | "browser": false, 15 | "commonjs": true, 16 | "node": true 17 | }, 18 | // Start with rules from eslint recommended setting, and override below. 19 | "extends": [ 20 | "eslint:recommended", 21 | "plugin:prettier/recommended" 22 | ], 23 | // Linting rules are enforced based on the following levels: 24 | // (Reference: http://eslint.org/docs/user-guide/configuring) 25 | // 0: turn the rule off 26 | // 1: turn the rule on as a warning (doesn't affect exit code) 27 | // 2: turn the rule on as an error (exit code is 1 when triggered) 28 | "rules": { 29 | // Disallow littering with unused variables (except function args). 
30 | "no-unused-vars": [ 31 | 2, 32 | { 33 | "vars": "all", 34 | "args": "none" 35 | } 36 | ], 37 | "prettier/prettier": [ 38 | "error", 39 | { 40 | "endOfLine": "auto" 41 | } 42 | ] 43 | }, 44 | "ignorePatterns": [ 45 | "dist/", 46 | "node_modules/" 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, and in the interest of 4 | fostering an open and welcoming community, we pledge to respect all people who 5 | contribute through reporting issues, posting feature requests, updating 6 | documentation, submitting pull requests or patches, and other activities. 7 | 8 | We are committed to making participation in this project a harassment-free 9 | experience for everyone, regardless of level of experience, gender, gender 10 | identity and expression, sexual orientation, disability, personal appearance, 11 | body size, race, ethnicity, age, religion, or nationality. 12 | 13 | Examples of unacceptable behavior by participants include: 14 | 15 | * The use of sexualized language or imagery 16 | * Personal attacks 17 | * Trolling or insulting/derogatory comments 18 | * Public or private harassment 19 | * Publishing other's private information, such as physical or electronic 20 | addresses, without explicit permission 21 | * Other unethical or unprofessional conduct 22 | 23 | Project maintainers have the right and responsibility to remove, edit, or 24 | reject comments, commits, code, wiki edits, issues, and other contributions 25 | that are not aligned to this Code of Conduct, or to ban temporarily or 26 | permanently any contributor for other behaviors that they deem inappropriate, 27 | threatening, offensive, or harmful. 
28 | 29 | By adopting this Code of Conduct, project maintainers commit themselves to 30 | fairly and consistently applying these principles to every aspect of managing 31 | this project. Project maintainers who do not follow or enforce the Code of 32 | Conduct may be permanently removed from the project team. 33 | 34 | This Code of Conduct applies both within project spaces and in public spaces 35 | when an individual is representing the project or its community. 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 38 | reported by contacting a project maintainer at grich@vocativ.com. All 39 | complaints will be reviewed and investigated and will result in a response that 40 | is deemed necessary and appropriate to the circumstances. Maintainers are 41 | obligated to maintain confidentiality with regard to the reporter of an 42 | incident. 43 | 44 | 45 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 46 | version 1.3.0, available at 47 | [http://contributor-covenant.org/version/1/3/0/][version] 48 | 49 | [homepage]: http://contributor-covenant.org 50 | [version]: http://contributor-covenant.org/version/1/3/0/ 51 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We welcome contributions and suggestions to help us improve this project. If you are unfamiliar with Github, here is their official guide to start [contributing to open source projects](https://guides.github.com/activities/contributing-to-open-source/). 4 | 5 | ## Workflow 6 | 7 | 1. [Fork this repository](https://help.github.com/articles/fork-a-repo) 8 | 2. Create a branch `git checkout -b my-branch` 9 | 3. Stage and commit your changes `git commit -am 'description of my changes'` 10 | 4. Push the changes to your fork `git push origin my-branch` 11 | 5. 
[Submit a pull request to the parent repo](https://help.github.com/articles/creating-a-pull-request). Please read our [guide to submitting pull requests](https://github.com/inn/docs/blob/master/how-to-work-with-us/pull-requests.md) to see what we expect in a good pull request message. 12 | 6. Pull request should be assigned to: 13 | - [@ejfox](http://github.com/ejfox) (primary) 14 | - [@geraldarthur](http://github.com/geraldarthur) 15 | - [@enjalot](http://github.com/enjalot) 16 | 17 | Additionally, you can [create issues](https://github.com/dataproofer/Dataproofer/issues) on this repo to suggest changes or improvements. If you believe there’s a bug, we encourage you to use [best practices](http://polite.technology/reportabug.html), and please tell us the following: 18 | 19 | * Describe your computer and the file (e.g. What’s your operating system? How big is your dataset?) 20 | * Describe what you were doing (e.g. Running tests? Which test are you running?) 21 | * Describe what you expected to happen (e.g. Did you expect a particular test to fail?) 22 | * Describe what actually happened (e.g. Did the test in question pass or cause Dataproofer to break?) 23 | 24 | And of course you can always email us: [dataproofer@dataproofer.org](mailto:dataproofer@dataproofer.org). 25 | 26 | ## Code Standards 27 | 28 | - Follow our code styleguide (coming soon) and use our [linter configuration file](https://github.com/dataproofer/Dataproofer/blob/master/.eslintrc) with your preferred text editor 29 | - Follow our [Code of Conduct](https://github.com/dataproofer/Dataproofer/CODE_OF_CONDUCT.md). 30 | - Use [markdown syntax](http://daringfireball.net/projects/markdown/syntax) for all text documents. 
31 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [dataproofer] 2 | open_collective: dataproofer 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Summary 2 | What's on your mind? 3 | 4 | --- 5 | ### Steps to reproduce 6 | Walk us through what happened so we can identify the issue. 7 | 8 | ### Expected behavior 9 | What was supposed to happen? What happened instead? 10 | 11 | ### Relevant logs and/or screenshots 12 | Drag and drop photos or copy and paste any errors you saw into this section. 13 | 14 | ### Possible fixes? 15 | Want to help us out? Let us know if you might know a better solution. We're an open source project, and we'd like to know what you think. 16 | 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes #[INSERT GITHUB ISSUE NUMBER HERE] 2 | 3 | ## Summary 4 | 5 | Tell us what you are thinking and how we can best review whether your change is working. 6 | 7 | ### Proposed changes 8 | 9 | Optionally, provide a few more details like what files were modified, added or removed. 10 | 11 | * What was modified: 12 | * What was added: 13 | * What was removed: 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "npm" # See documentation for possible values 4 | directory: "/" # Location of package manifests 5 | schedule: 6 | interval: "monthly" 7 | -------------------------------------------------------------------------------- /.github/workflows/lint-and-run.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | pull_request: 8 | branches: [dev] 9 | 10 | jobs: 11 | build: 12 | runs-on: ${{ matrix.os }} 13 | env: 14 | CI: true 15 | TZ: America/New_York 16 | strategy: 17 | matrix: 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | - windows-latest 22 | node_version: 23 | - 10 24 | - 12 25 | - 14 26 | architecture: 27 | - x64 28 | # an extra windows-x86 run: 29 | include: 30 | - os: windows-2016 31 | node_version: 12 32 | architecture: x86 33 | name: Node ${{ matrix.node_version }} - ${{ matrix.architecture }} on ${{ matrix.os }} 34 | steps: 35 | - name: Checkout 36 | uses: actions/checkout@v2 37 | 38 | - name: Setup Node.js environment 39 | uses: actions/setup-node@v2 40 | with: 41 | node-version: ${{ matrix.node_version }} 42 | architecture: ${{ matrix.architecture }} 43 | 
44 | - run: yarn install --frozen-lockfile 45 | - run: yarn lint 46 | - run: yarn dataproofer 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules/ 3 | electron/node_modules/ 4 | executables/ 5 | npm-debug.log 6 | .yarn/* 7 | !.yarn/releases 8 | !.yarn/plugins 9 | !.yarn/sdks 10 | !.yarn/versions 11 | .pnp.* 12 | -------------------------------------------------------------------------------- /.release-it.json: -------------------------------------------------------------------------------- 1 | { 2 | "npm": false, 3 | "github": { 4 | "release": true 5 | }, 6 | "plugins": { 7 | "release-it-yarn-workspaces": true 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /.vim/coc-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "eslint.packageManager": "yarn" 3 | } 4 | -------------------------------------------------------------------------------- /.vscode/dataproofer.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 
5 | } 6 | ], 7 | "settings": { 8 | "editor.formatOnSave": true, 9 | "editor.formatOnPaste": true, 10 | "eslint.workingDirectories": ["Dataproofer"], 11 | "prettier.requireConfig": false 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "arcanis.vscode-zipfs", 4 | "dbaeumer.vscode-eslint", 5 | "esbenp.prettier-vscode" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataproofer 2 | 3 | ![""](http://i.imgur.com/n38R14S.png) 4 | 5 | ## A proofreader for your data 6 | 7 | Every day, more and more data is created. Journalists, analysts, and data visualizers turn that data into stories and insights. 8 | 9 | But before you can make use of any data, you need to know if it’s reliable. Is it weird? Is it clean? Can I use it to write or make a viz? 10 | 11 | This used to be a long manual process, using valuable time and introducing the possibility for human error. People can’t always spot every mistake every time, no matter how hard they try. 12 | 13 | Data proofer is built to automate this process of checking a dataset for errors or potential mistakes. 14 | 15 | ## Getting Started (Desktop) 16 | 17 | Download a .zip of the latest release [from the Dataproofer releases page](https://github.com/dataproofer/Dataproofer/releases). 18 | 19 | Drag the app into your applications folder. 20 | 21 | Select your dataset, which can be either a CSV on your computer, or a Google Sheet that you’ve published to the web. 22 | 23 | Once you select your dataset, you can choose which suites and tests run by turning them on or off. 24 | 25 | Proof your data, get your results, and feel confident about your dataset. 
26 | 27 | ## Getting Started (Command Line) 28 | 29 | ```sh 30 | npm install -g dataproofer 31 | ``` 32 | 33 | Read the documentation 34 | 35 | ```sh 36 | dataproofer --help 37 | > Usage: dataproofer 38 | 39 | A proofreader for your data 40 | 41 | Options: 42 | 43 | -h, --help output usage information 44 | -V, --version output the version number 45 | -o, --out file to output results. default stdout 46 | -c, --core run tests from the core suite 47 | -i, --info run tests from the info suite 48 | -a, --stats run tests from the statistical suite 49 | -g, --geo run tests from the geographic suite 50 | -t, --tests comma-separated list to use 51 | -j, --json output JSON of test results 52 | -J, --json-pretty output an indented JSON of test results 53 | -S, --summary output overall test results, excluding pass/fail results 54 | -v, --verbose include descriptions about each column 55 | -x, --exclude exclude tests that passed 56 | 57 | Examples: 58 | 59 | $ dataproofer my_data.csv 60 | ``` 61 | 62 | Run a test 63 | 64 | ```sh 65 | dataproofer data.csv 66 | ``` 67 | 68 | Save the results 69 | 70 | ```sh 71 | dataproofer --json data.csv --out data.json 72 | ``` 73 | 74 | To learn how to run specific test suites or tests and output longer or shorter summaries, use the `--help` flag. 75 | 76 | Found a bug? [Let us know](https://github.com/dataproofer/Dataproofer/issues/new). 
77 | 78 | ## Table of Contents 79 | 80 | - [Getting Started (Desktop)](https://github.com/dataproofer/Dataproofer#getting-started-desktop) 81 | - [Getting Started (Command Line)](https://github.com/dataproofer/Dataproofer#getting-started-command-line) 82 | - [Test Suites](https://github.com/dataproofer/Dataproofer#test-suites) 83 | - [Info](https://github.com/dataproofer/Dataproofer#information--diagnostics) 84 | - [Core](https://github.com/dataproofer/Dataproofer#core-suite) 85 | - [Geo](https://github.com/dataproofer/Dataproofer#geo-suite) 86 | - [Stats](https://github.com/dataproofer/Dataproofer#stats-suite) 87 | - [Development](https://github.com/dataproofer/Dataproofer#development) 88 | - [How You Can Help](https://github.com/dataproofer/Dataproofer#how-you-can-help) 89 | - [Modifying a test suite](https://github.com/dataproofer/Dataproofer#modifying-a-test-suite) 90 | - [Create a new test](https://github.com/dataproofer/Dataproofer#creating-a-new-test) 91 | - [name](https://github.com/dataproofer/Dataproofer#name) 92 | - [description](https://github.com/dataproofer/Dataproofer#description) 93 | - [methodology](https://github.com/dataproofer/Dataproofer#methodology) 94 | - [helper scripts](https://github.com/dataproofer/Dataproofer#helper-scripts) 95 | - [Troubleshooting](https://github.com/dataproofer/Dataproofer#troubleshooting-a-test-that-wont-run) 96 | - [Test iteration](https://github.com/dataproofer/Dataproofer#iterating-on-tests) 97 | - [Packaging the Desktop App](https://github.com/dataproofer/Dataproofer#packaging-an-executable) 98 | - [Releasing new versions](https://github.com/dataproofer/Dataproofer#release) 99 | - [Sources](https://github.com/dataproofer/Dataproofer#sources) 100 | - [Thank You](https://github.com/dataproofer/Dataproofer#thank-you) 101 | 102 | ![""](http://i.imgur.com/3YekdjW.png) 103 | 104 | ## Test Suites 105 | 106 | ### [Information & Diagnostics](https://github.com/dataproofer/Dataproofer/tree/dev/packages/info-suite) 107 | 108 
| A set of tests that infer descriptive information based on the contents of a table's cells. 109 | 110 | - Check for numeric values in columns 111 | - Check for strings in columns 112 | 113 | ### [Core Suite](https://github.com/dataproofer/Dataproofer/tree/dev/packages/core-suite) 114 | 115 | A set of tests related to common problems and data checks — namely, making sure data has not been truncated by looking for specific cut-off indicators. 116 | 117 | - Check for duplicate rows 118 | - Check for empty columns (no values) 119 | - Check for special, non-typical Latin characters/letters in strings 120 | - Check for **big integer** cut-offs as defined by MySQL and PostgreSQL, common database programs 121 | - Check for **integer** cut-offs as defined by MySQL and PostgreSQL, common database programs 122 | - Check for **small integer** cut-offs as defined by MySQL and PostgreSQL, common database programs 123 | - Check for whether there are exactly 65k rows — an indication there may be missing rows lost when the 124 | data was exported from a database 125 | - Check for strings that are exactly 255 characters — an indication there may be missing data lost when the data was exported from MySQL 126 | 127 | ### [Geo Suite](https://github.com/dataproofer/Dataproofer/tree/dev/packages/geo-suite) 128 | 129 | A set of tests related to common geographic data problems. 130 | 131 | - Check for invalid latitude and longitude values (values outside the range of -180º to 180º) 132 | - Check for void latitude and longitude values (values at 0º,0º) 133 | 134 | ### [Stats Suite](https://github.com/dataproofer/Dataproofer/tree/dev/packages/stats-suite) 135 | 136 | A set of tests based on common statistical methods used to detect outlying data. 
137 | 138 | - Check for outliers within a column relative to the column's median 139 | - Check for outliers within a column relative to the column's mean 140 | 141 | ![""](http://i.imgur.com/3YekdjW.png) 142 | 143 | ## Development 144 | 145 | ```sh 146 | git clone https://github.com/dataproofer/Dataproofer.git 147 | cd Dataproofer 148 | yarn 149 | ``` 150 | 151 | ### How You Can Help 152 | 153 | #### Write a test 154 | 155 | See our [test to-do list](https://github.com/dataproofer/Dataproofer/issues?q=is%3Aissue+is%3Aopen+label%3Atest) and leave a comment 156 | 157 | #### Add a feature 158 | 159 | See our [features list](https://github.com/dataproofer/Dataproofer/issues?utf8=✓&q=is%3Aissue+is%3Aopen+-label%3Atest+) and leave a comment 160 | 161 | #### Short on time? 162 | 163 | See our [smaller issues](https://github.com/dataproofer/Dataproofer/issues?q=is%3Aopen+is%3Aissue+label%3Asmall) and leave a comment 164 | 165 | #### Got more time? 166 | 167 | See our [medium-sized issues](https://github.com/dataproofer/Dataproofer/issues?utf8=%E2%9C%93&q=is%3Aopen+is%3Aissue+label%3Amedium) and leave a comment 168 | 169 | #### Plenty of time? 170 | 171 | See our [larger issues](https://github.com/dataproofer/Dataproofer/issues?utf8=%E2%9C%93&q=is%3Aopen+is%3Aissue+label%3Alarge) and leave a comment 172 | 173 | ![](http://i.imgur.com/3YekdjW.png) 174 | 175 | ### Creating a new test 176 | 177 | - Make a copy of the [basic test template](https://github.com/dataproofer/suite-template/blob/master/src/myTest.js) 178 | - Read the comments and follow along with links 179 | - Let us know if you're running into trouble dataproofer [at] dataproofer.org 180 | - `require` that test in a suite's [index.js](https://github.com/dataproofer/suite-template/blob/master/index.js) 181 | - Add that test to the `exports` in [index.js](https://github.com/dataproofer/suite-template/blob/master/index.js) 182 | 183 | Tests are made up of a few parts. Here's a brief over-view. 
For a more in-depth look, dive into the [documentation](https://github.com/dataproofer/Dataproofer/tree/dev/packages/dataproofertest-js). 184 | 185 | #### .name() 186 | 187 | This is the name of your test. It shows up in the test-selection screen as well as on the results page 188 | 189 | #### .description() 190 | 191 | This is a text-only description of what the test does, and what it is meant to check. Imagine you are explaining it to a remarkably intelligent 5-year-old. 192 | 193 | #### .methodology() 194 | 195 | This is where the code your test executes lives. Pass it a function that takes in **rows** and **columnHeads** 196 | 197 | **rows** is an array of objects from the data. The object uses column headers as the key, and the row’s value as the value. 198 | 199 | So if your data looks like this: 200 | 201 | | President | Year | 202 | | ----------------- | ---- | 203 | | George Washington | 1789 | 204 | | John Adams | 1797 | 205 | | Thomas Jefferson | 1801 | 206 | 207 | Then the first object in your array of rows will look like this: 208 | 209 | ```json 210 | { "President": "George Washington", "Year": "1789" } 211 | ``` 212 | 213 | and so on. 214 | 215 | Generally, to run a test, you are going to want to loop over each row and do some operations on it — counting cells and using conditionals to detect unwanted values. 216 | 217 | #### Helper Scripts 218 | 219 | Helper scripts help you test and display the results of Dataproofer tests. These are a small set of functions we've found ourselves reusing. 
220 | 221 | - isEmpty: detect if a cell is empty 222 | - isNumeric: detect if a cell contains a number 223 | - stripNumeric: remove number formatting like "$" or "%" 224 | - percent: return a number with a "%" sign 225 | 226 | For more information, please see the full `util` [documentation](https://github.com/dataproofer/dataproofertest-js/blob/master/DOCUMENTATION.md#util) 227 | 228 | ![""](http://i.imgur.com/3YekdjW.png) 229 | 230 | ### Troubleshooting a test that won't run 231 | 232 | Tests are run inside a try catch loop in `src/processing.js`. You may wish to temporarily remove the try/catch while iterating on a test. 233 | Otherwise, for now we recommend heavy doses of console.log and the Chrome debugger. 234 | 235 | ### Iterating on tests 236 | 237 | Dataproofer saves a copy of the most recently loaded file in the Application Data directory provided to it by the OS. 238 | You can quickly load the file and run the tests by typing `loadLastFile()` in the console. This saves you several clicks for loading the file and clicking the run button while you are iterating on a test. 
239 | If you want to temporarily avoid any clicks you can add the function call to the `ipc.on("last-file-selected",` event handler in `electron/js/controller.js` 240 | 241 | ### Release a new version 242 | 243 | We can push releases to GitHub manually for now: 244 | 245 | ```sh 246 | git tag -a 'v0.1.1' -m "first release" 247 | git push && git push --tags 248 | ``` 249 | 250 | The binary (Dataproofer.app) can be uploaded to the [releases page](https://github.com/dataproofer/Dataproofer/releases) for the tag you pushed, and should be zipped up first (Right click and choose "Compress Dataproofer") 251 | 252 | ![""](http://i.imgur.com/3YekdjW.png) 253 | 254 | ## Sources 255 | 256 | - [A Guide to Bulletproofing Your Data](https://github.com/propublica/guides/blob/master/data-bulletproofing.md), by [ProPublica](https://www.propublica.org/) 257 | - [Checklist to bulletproof your data work](http://www.tommeagher.com/blog/2012/06/checklist.html), by [Tom Meagher](http://www.tommeagher.com/blog/2012/06/checklist.html) (Data Editor, [The Marshall Project](https://www.themarshallproject.org)) 258 | - [The Quartz guide to bad data](https://github.com/Quartz/bad-data-guide), by [Chris Groskopf](https://github.com/onyxfish) (Things Reporter, [Quartz](http://qz.com)) 259 | - [OpenNewsLabs Data Smells](https://github.com/OpenNewsLabs/datasmells), by [Aurelia Moser](https://github.com/auremoser) ([Mozilla Science Lab](https://www.mozillascience.org/)) 260 | - SRCCON panel notes 261 | - [Handguns and tequila: Avoiding data disasters](https://old.etherpad-mozilla.org/MmSOTIOIDg) 262 | - [How NOT to Skew with Statistics](https://old.etherpad-mozilla.org/bOwBSAeLe5) 263 | 264 | ## Thank You 265 | 266 | ![vocativ-logo](https://cloud.githubusercontent.com/assets/1578563/14050100/e23d531e-f276-11e5-920a-b882eca5933a.png)
267 | ![knight-logo](https://cloud.githubusercontent.com/assets/1578563/14050167/4b12f330-f277-11e5-9773-1f69b9c2484f.png) 268 | 269 | A huge thank you to the [Vocativ](http://vocativ.com) and the [Knight Foundation](http://knightfoundation.org/). This project was funded in part by the Knight Foundation's [Prototype Fund](http://knightfoundation.org/funding-initiatives/knight-prototype-fund/). 270 | 271 | ### Special Thanks 272 | 273 | - Alex Koppelman (interviewee), Editorial Director @ Vocativ 274 | - Allee Manning (interviewee), Data Reporter @ Vocativ 275 | - Allegra Denton (design consulting), Designer @ Vocativ 276 | - Brian Byrne (interviewee), Data Reporter @ Vocativ 277 | - Daniel Littlewood (video producer), Special Projects Producer @ Vocativ 278 | - EJ Fox (project lead), Dataviz Editor @ Vocativ 279 | - Gerald Rich (lead developer), Interactive Producer @ Vocativ 280 | - Ian Johnson (lead developer), Dataproofer 281 | - Jason Das (UX and design), Dataproofer 282 | - Joe Presser (video producer), Dataproofer 283 | - Julia Kastner (concept & name consulting), Project Manager @ Vocativ 284 | - Kelli Vanover (design consulting), Product Manager @ Vocativ 285 | - Nicu Calcea (developer), Data Projects Editor @ GlobalData Media / New Statesman 286 | - Markham Nolan (interviewee), Visuals Editor @ Vocativ 287 | - Rob Di Ieso (design consulting), Art Director @ Vocativ 288 | 289 | ... and the countless journalists who've encouraged us along the way. Thank you! 290 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataproofer", 3 | "version": "2.1.0", 4 | "description": "A proofreader for your data", 5 | "private": true, 6 | "workspaces": [ 7 | "src", 8 | "packages/*" 9 | ], 10 | "main": "./src/index.js", 11 | "bin": { 12 | "dataproofer": "./src/index.js" 13 | }, 14 | "scripts": { 15 | "lint": "yarn eslint . 
--ext .js", 16 | "release": "release-it" 17 | }, 18 | "devDependencies": { 19 | "@babel/core": "^7.13.15", 20 | "@babel/eslint-parser": "^7.12.16", 21 | "@babel/eslint-plugin": "^7.12.13", 22 | "eslint": "^7.20.0", 23 | "eslint-config-prettier": "^7.2.0", 24 | "eslint-plugin-prettier": "^3.3.1", 25 | "prettier": "^2.2.1", 26 | "release-it": "^14.6.1", 27 | "release-it-yarn-workspaces": "^2.0.1" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /packages/core-suite/README.md: -------------------------------------------------------------------------------- 1 | # core-suite 2 | 3 | Core suite of tests for Dataproofer. These tests relate to common problems and data checks — namely, making sure data has not been truncated by looking for specific cut-off indicators. 4 | 5 | - [Documentation](https://github.com/dataproofer/core-suite/blob/master/README.md) 6 | - [Repository](https://github.com/dataproofer/core-suite/) 7 | - [Issues](https://github.com/dataproofer/core-suite/issues) 8 | 9 | ## Table of Contents 10 | 11 | - [Tests](https://github.com/dataproofer/core-suite#tests) 12 | - [columnsContainNothing.js](https://github.com/dataproofer/core-suite#columnsContainNothing.js) 13 | - [columnsContainsSpecialChars](https://github.com/dataproofer/core-suite#columnsContainsSpecialChars.js) 14 | - [stringsHaveExactly255Characters.js](https://github.com/dataproofer/core-suite#stringsHaveExactly255Charactersjs) 15 | - [maxBigInteger.js](https://github.com/dataproofer/core-suite#maxBigIntegerjs) 16 | - [maxInteger.js](https://github.com/dataproofer/core-suite#maxIntegerjs) 17 | - [maxSmallInteger.js](https://github.com/dataproofer/core-suite#maxSmallIntegerjs) 18 | - [maxSummedInteger.js](https://github.com/dataproofer/core-suite#maxSummedIntegerjs) 19 | - [checkDuplicateRows.js](https://github.com/dataproofer/core-suite#checkDuplicateRowsjs) 20 | - 
[numberOfRowsIs65k.js](https://github.com/dataproofer/core-suite#numberOfRowsIs65kjs) 21 | - [Development](https://github.com/dataproofer/core-suite#development) 22 | - [Getting Started](https://github.com/dataproofer/core-suite#getting-started) 23 | - [Writing Tests](https://github.com/dataproofer/stats-suite#writing-tests) 24 | - [Building Docs](https://github.com/dataproofer/core-suite#building-docs) 25 | 26 | ## Tests 27 | 28 | # columnsContainNothing.js 29 | 30 | Calculates the percentage of rows that are empty for each column 31 | 32 | **Parameters** 33 | 34 | - `rows` **[Array](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array)** an array of objects representing rows in the spreadsheet 35 | - `columnHeads` **[Array](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array)** an array of strings for column names of the spreadsheet 36 | 37 | Returns **[Object](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object)** describing the result 38 | 39 | # columnsContainsSpecialChars.js 40 | 41 | Calculates the percentage of rows that contain special, non-typical Latin characters for each column 42 | Source: 43 | 44 | **Parameters** 45 | 46 | - `rows` **[Array](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array)** an array of objects representing rows in the spreadsheet 47 | - `columnHeads` **[Array](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array)** an array of strings for column names of the spreadsheet 48 | 49 | Returns **[Object](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object)** describing the result 50 | 51 | # stringsHaveExactly255Characters.js 52 | 53 | [src/stringsHaveExactly255Characters.js:14-66](https://github.com/dataproofer/core-suite/blob/master/src/stringsHaveExactly255Characters.js#L14-L66 "Source code on GitHub") 54 | 55 | Determine the 
cells that have exactly 255 characters (SQL upper limit error). See ProPublica's bad data guide for further information 56 | 57 | 58 | **Parameters** 59 | 60 | - `rows` **Array** an array of objects representing rows in the spreadsheet 61 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 62 | 63 | Returns **Object** describing the result 64 | 65 | # maxBigInteger.js 66 | 67 | [src/maxBigInteger.js:15-71](https://github.com/dataproofer/core-suite/blob/master/src/maxBigInteger.js#L15-L71 "Source code on GitHub") 68 | 69 | Indicates a `bigint` at its upper signed limit (MySQL or PostgreSQL) of 9,223,372,036,854,775,807 or its upper unsigned limit (MySQL) of 18,446,744,073,709,551,615. 70 | Common database programs, like MySQL, have a cap on how big of a number it can save. 71 | Please see the [MySQL documentation](https://dev.mysql.com/doc/refman/5.7/en/integer-types.html) or [PostgreSQL documentation](http://www.postgresql.org/docs/9.5/interactive/datatype-numeric.html) for more information. 72 | 73 | **Parameters** 74 | 75 | - `rows` **Array** an array of objects representing rows in the spreadsheet 76 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 77 | 78 | Returns **Object** describing the result 79 | 80 | # maxInteger.js 81 | 82 | [src/maxInteger.js:15-71](https://github.com/dataproofer/core-suite/blob/master/src/maxInteger.js#L15-L71 "Source code on GitHub") 83 | 84 | Indicates an integer at its upper signed limit of 2,147,483,647 (MySQL or PostgreSQL) or its upper unsigned limit (MySQL) of 4,294,967,295. 85 | Common database programs, like MySQL, have a cap on how big of a number it can save. 86 | Please see the [MySQL documentation](https://dev.mysql.com/doc/refman/5.7/en/integer-types.html) for more information.
87 | 88 | **Parameters** 89 | 90 | - `rows` **Array** an array of objects representing rows in the spreadsheet 91 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 92 | 93 | Returns **Object** describing the result 94 | 95 | # maxSmallInteger.js 96 | 97 | [src/maxSmallInteger.js:15-71](https://github.com/dataproofer/core-suite/blob/master/src/maxSmallInteger.js#L15-L71 "Source code on GitHub") 98 | 99 | Indicates a `smallint` at its upper signed limit (MySQL or PostgreSQL) of 32,767 or its upper unsigned limit (MySQL) of 65,535. 100 | Common database programs, like MySQL, have a cap on how big of a number it can save. 101 | Please see the [MySQL documentation](https://dev.mysql.com/doc/refman/5.7/en/integer-types.html) or [PostgreSQL documentation](http://www.postgresql.org/docs/9.5/interactive/datatype-numeric.html) for more information. 102 | 103 | **Parameters** 104 | 105 | - `rows` **Array** an array of objects representing rows in the spreadsheet 106 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 107 | 108 | Returns **Object** describing the result 109 | 110 | # maxSummedInteger.js 111 | 112 | [src/maxSummedInteger.js:15-71](https://github.com/dataproofer/core-suite/blob/master/src/maxSummedInteger.js#L15-L71 "Source code on GitHub") 113 | 114 | Indicates a summed integer at its upper limit of 2,097,152. 115 | Please see the [Integrity Checks](https://github.com/propublica/guides/blob/master/data-bulletproofing.md#integrity-checks-for-every-data-set) section of the ProPublica [Data Bulletproofing Guide](https://github.com/propublica/guides/blob/master/data-bulletproofing.md) for more information.
116 | 117 | **Parameters** 118 | 119 | - `rows` **Array** an array of objects representing rows in the spreadsheet 120 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 121 | 122 | Returns **Object** describing the result 123 | 124 | # checkDuplicateRows.js 125 | 126 | [src/checkDuplicateRows.js:13-73](https://github.com/dataproofer/core-suite/blob/master/src/checkDuplicateRows.js#L13-L73 "Source code on GitHub") 127 | 128 | Check for any duplicate rows in the spreadsheet. Optionally 129 | 130 | **Parameters** 131 | 132 | - `rows` **Array** an array of objects representing rows in the spreadsheet 133 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 134 | - `input` **Object** accept user input, such as selected Columns 135 | 136 | Returns **Object** describing the result 137 | 138 | # numberOfRowsIs65k.js 139 | 140 | [src/numberOfRowsIs65k.js:12-31](https://github.com/dataproofer/core-suite/blob/master/src/numberOfRowsIs65k.js#L12-L31 "Source code on GitHub") 141 | 142 | Test to see if number of rows is exactly 65,536 rows (cutoff by Excel) 143 | 144 | **Parameters** 145 | 146 | - `rows` **Array** an array of objects representing rows in the spreadsheet 147 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 148 | 149 | Returns **Object** describing the result 150 | 151 | ## Development 152 | 153 | ### Getting Started 154 | 155 | git clone git@github.com:dataproofer/core-suite.git 156 | cd core-suite 157 | npm install 158 | 159 | ### Writing Tests 160 | 161 | - [How to](https://github.com/dataproofer/Dataproofer#creating-a-new-test) 162 | - [Helper Scripts](https://github.com/dataproofer/dataproofertest-js/blob/master/DOCUMENTATION.md#util) 163 | - Templates 164 | - [Basic Test](https://github.com/dataproofer/suite-template/blob/master/src/myTest.js) 165 | - [Advanced Test](https://github.com/dataproofer/suite-template/blob/master/src/myAdvancedTest.js) 166 | 167 | 
### Building Docs 168 | 169 | We use [documentation.js](https://github.com/documentationjs/documentation), but have created a handy script for regenerating documentation. 170 | 171 | npm run docs 172 | 173 | Then open up and check your docs in [DOCUMENTATION.md](https://github.com/dataproofer/info-suite/blob/master/DOCUMENTATION.md) 174 | -------------------------------------------------------------------------------- /packages/core-suite/index.js: -------------------------------------------------------------------------------- 1 | // All test suites will have a name and a list 2 | exports = module.exports = { 3 | name: "dataproofer-core-suite", 4 | fullName: "Core Data Tests", 5 | tests: [], // the list of main tests to be run in the suite 6 | subtests: [], // a list of tests that can be triggered by the main tests but wont be run automatically 7 | }; 8 | 9 | var columnsContainNothing = require("./src/columnsContainNothing"); 10 | var columnsContainsSpecialChars = require("./src/columnsContainsSpecialChars"); 11 | var checkDuplicateRows = require("./src/checkDuplicateRows"); 12 | var maxInteger = require("./src/maxInteger"); 13 | var maxSmallInteger = require("./src/maxSmallInteger"); 14 | var maxBigInteger = require("./src/maxBigInteger"); 15 | var maxSummedInteger = require("./src/maxSummedInteger"); 16 | var numberOfRowsIs65k = require("./src/numberOfRowsIs65k"); 17 | var stringsHaveExactly255Characters = require("./src/stringsHaveExactly255Characters"); 18 | 19 | exports.tests.push( 20 | columnsContainNothing, 21 | columnsContainsSpecialChars, 22 | checkDuplicateRows, 23 | numberOfRowsIs65k, 24 | stringsHaveExactly255Characters, 25 | maxInteger, 26 | maxSummedInteger, 27 | maxSmallInteger, 28 | maxBigInteger 29 | ); 30 | -------------------------------------------------------------------------------- /packages/core-suite/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataproofer-core-suite", 3 
var DataprooferTest = require("dataproofertest-js");
var checkDuplicateRows = new DataprooferTest();

/**
 * Check for any duplicate rows in the spreadsheet. Optionally restricts the
 * comparison to the columns listed in `input.selectedColumns`; when no
 * selection is supplied, every column in `columnHeads` is compared.
 *
 * A row is a duplicate when an earlier row produced the same signature, i.e.
 * the same string values in all compared columns. The first occurrence is
 * never highlighted; every later occurrence has its compared columns marked.
 *
 * @param {Array} rows - an array of objects representing rows in the spreadsheet
 * @param {Array} columnHeads - an array of strings for column names of the spreadsheet
 * @param {Object} [input] - accept user input, such as selected Columns
 * @return {Object} describing the result
 */
checkDuplicateRows
  .name("Duplicate Rows")
  .description("Check for any identical rows in the spreadsheet")
  .conclusion(
    "This data may be unreliable if you weren't expecting things to show up twice in exactly the same way. Consult your source."
  )
  .methodology(function (rows, columnHeads, input) {
    var testState = "passed";
    // Guard so a missing `input` falls back to all columns instead of throwing.
    var selectedColumns = input ? input.selectedColumns : null;
    var columns =
      selectedColumns && selectedColumns.length ? selectedColumns : columnHeads;

    // Signatures of the rows seen so far. Prototype-less so that no inherited
    // property can ever be mistaken for a previously seen row.
    var seen = Object.create(null);
    // a mirror of the dataset, but with a 1 or 0 for each cell if it should be
    // highlighted or not
    var cellsToHighlight = [];

    rows.forEach(function (row) {
      // start with nothing highlighted in this row
      var currentRow = {};
      columnHeads.forEach(function (columnHead) {
        currentRow[columnHead] = 0;
      });

      // Join the compared cells with a separator unlikely to occur in data.
      var signature = "";
      columns.forEach(function (columnHead) {
        signature += row[columnHead] + "-|o.O|-";
      });

      if (seen[signature]) {
        columns.forEach(function (columnHead) {
          currentRow[columnHead] = 1;
          // Set inside the loop so that an empty column list never fails the
          // test (there would be nothing to highlight).
          testState = "failed";
        });
      } else {
        seen[signature] = true;
      }

      // push our marking row onto our cells array
      cellsToHighlight.push(currentRow);
    });

    return {
      testState: testState,
      highlightCells: cellsToHighlight,
    };
  });

module.exports = checkDuplicateRows;
percentage of rows that are empty for each column 8 | * 9 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 10 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 11 | * @return {Object} describing the result 12 | */ 13 | columnsContainNothing 14 | .name("Empty Cells") 15 | .description( 16 | "Calculates the percentage of rows that are empty for each column" 17 | ) 18 | .methodology(function (rows, columnHeads) { 19 | var testState = "passed"; 20 | // we will want to mark cells to be highlighted here 21 | var cellsToHighlight = []; 22 | // look through the rows 23 | rows.forEach(function (row) { 24 | // we make a row to keep track of cells we want to highlight 25 | var currentRow = {}; 26 | columnHeads.forEach(function (columnHead) { 27 | var cell = row[columnHead]; 28 | if (util.isEmpty(cell)) { 29 | currentRow[columnHead] = 1; 30 | testState = "warn"; 31 | } else { 32 | currentRow[columnHead] = 0; 33 | } 34 | }); 35 | // push our marking row onto our cells array 36 | cellsToHighlight.push(currentRow); 37 | }); 38 | 39 | var result = { 40 | testState: testState, 41 | highlightCells: cellsToHighlight, 42 | }; 43 | return result; 44 | }) 45 | .conclusion(function (result) { 46 | var conclusionStr = ""; 47 | var columns = Object.keys(result.columnWise); 48 | columns.forEach(function (column) { 49 | // Column foo: 50 | var currCount = result.columnWise[column]; 51 | if (currCount > 0) { 52 | conclusionStr += 'column "' + column + '": '; 53 | conclusionStr += result.columnWise[column] + " cells, "; 54 | conclusionStr += util.percent( 55 | result.columnWise[column] / result.highlightCells.length 56 | ); 57 | conclusionStr += "
"; 58 | } 59 | }); 60 | return conclusionStr; 61 | }); 62 | 63 | module.exports = columnsContainNothing; 64 | -------------------------------------------------------------------------------- /packages/core-suite/src/columnsContainsSpecialChars.js: -------------------------------------------------------------------------------- 1 | var DataprooferTest = require("dataproofertest-js"); 2 | var util = require("dataproofertest-js/util"); 3 | var columnsContainsSpecialChars = new DataprooferTest(); 4 | 5 | /** 6 | * Calculates the percentage of rows that contain special, non-typical Latin characters for each column 7 | * Source: http://www.w3schools.com/charsets/ref_html_utf8.asp 8 | * 9 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 10 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 11 | * @return {Object} describing the result 12 | */ 13 | columnsContainsSpecialChars 14 | .name("Special Letters & Characters") 15 | .description( 16 | "Determine which cells contain wingdings, boxes, or accented characters. These can cause errors with some visualization & analysis tools." 
17 | ) 18 | .methodology(function (rows, columnHeads) { 19 | var testState = "passed"; 20 | // we will want to mark cells to be highlighted here 21 | var cellsToHighlight = []; 22 | 23 | function containsSpecialChar(str) { 24 | var result = false; 25 | // look for characters outside typical Latin 26 | // character codes 27 | // http://www.w3schools.com/charsets/ref_html_utf8.asp 28 | for (var i = 0; i < str.length; i++) { 29 | if (str.charCodeAt(i) > 127) result = true; 30 | } 31 | return result; 32 | } 33 | // look through the rows 34 | rows.forEach(function (row) { 35 | // we make a row to keep track of cells we want to highlight 36 | var currentRow = {}; 37 | columnHeads.forEach(function (columnHead) { 38 | var cell = row[columnHead]; 39 | if (util.isString(cell) && containsSpecialChar(cell)) { 40 | currentRow[columnHead] = 1; 41 | testState = "warn"; 42 | } else { 43 | currentRow[columnHead] = 0; 44 | } 45 | }); 46 | // push our marking row onto our cells array 47 | cellsToHighlight.push(currentRow); 48 | }); 49 | var result = { 50 | testState: testState, 51 | highlightCells: cellsToHighlight, 52 | }; 53 | return result; 54 | }) 55 | .conclusion(function (result) { 56 | var conclusionStr = ""; 57 | var columns = Object.keys(result.columnWise); 58 | columns.forEach(function (column) { 59 | // generate result string 60 | var currCount = result.columnWise[column]; 61 | if (currCount > 0) { 62 | conclusionStr += column + ": "; 63 | conclusionStr += result.columnWise[column] + " cells, "; 64 | conclusionStr += util.percent( 65 | result.columnWise[column] / result.highlightCells.length 66 | ); 67 | conclusionStr += " of column
"; 68 | } 69 | }); 70 | return conclusionStr; 71 | }); 72 | 73 | module.exports = columnsContainsSpecialChars; 74 | -------------------------------------------------------------------------------- /packages/core-suite/src/maxBigInteger.js: -------------------------------------------------------------------------------- 1 | var DataprooferTest = require("dataproofertest-js"); 2 | var util = require("dataproofertest-js/util"); 3 | var maxBigInteger = new DataprooferTest(); 4 | 5 | /** 6 | * Indicates an `bigint` at its upper signed limit (MySQL or PostgreSQL) of 9,223,372,036,854,775,807 or its upper unsigned limit (MySQL) of 18,446,744,073,709,551,616. 7 | * Common database programs, like MySQL, have a cap on how big of a number it can save. 8 | * Please see the [MySQL documentation](https://dev.mysql.com/doc/refman/5.7/en/integer-types.html) or [PostgreSQL documentation](http://www.postgresql.org/docs/9.5/interactive/datatype-numeric.html) for more information. 9 | * 10 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 11 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 12 | * @return {Object} describing the result 13 | */ 14 | maxBigInteger 15 | .name("Big integer at its SQL upper limit") 16 | .description( 17 | "If a column contains numbers, make sure it's not 9,223,372,036,854,775,807 or 18,446,744,073,709,551,616. Common database programs like MySQL and PostgreSQL limit to the size of numbers it can store." 18 | ) 19 | .conclusion( 20 | "It's possible this data was exported from SQL improperly. Consult your source." 
21 | ) 22 | .methodology(function (rows, columnHeads) { 23 | var maxBigInts = {}; 24 | columnHeads.forEach(function (columnHead) { 25 | maxBigInts[columnHead] = 0; 26 | }); 27 | // we will want to mark cells to be highlighted here 28 | var cellsToHighlight = []; 29 | var testState = "passed"; 30 | // look through the rows 31 | rows.forEach(function (row) { 32 | // we make a row to keep track of cells we want to highlight 33 | var currentRow = {}; 34 | columnHeads.forEach(function (columnHead) { 35 | var cell = row[columnHead]; 36 | var strippedCell = util.stripNumeric(cell); 37 | var f = parseFloat(strippedCell); 38 | // this will only be true if the cell is a number 39 | if ( 40 | typeof f === "number" && 41 | (f === 9223372036854775807 || f === 18446744073709551615) 42 | ) { 43 | maxBigInts[columnHead] += 1; 44 | currentRow[columnHead] = 1; 45 | testState = "failed"; 46 | } else { 47 | currentRow[columnHead] = 0; 48 | } 49 | }); 50 | // push our marking row onto our cells array 51 | cellsToHighlight.push(currentRow); 52 | }); 53 | 54 | var result = { 55 | testState: testState, 56 | highlightCells: cellsToHighlight, // a mirror of the dataset, but with a 1 or 0 for each cell if it should be highlighted or not 57 | }; 58 | return result; 59 | }); 60 | 61 | module.exports = maxBigInteger; 62 | -------------------------------------------------------------------------------- /packages/core-suite/src/maxInteger.js: -------------------------------------------------------------------------------- 1 | var DataprooferTest = require("dataproofertest-js"); 2 | var util = require("dataproofertest-js/util"); 3 | var maxInteger = new DataprooferTest(); 4 | 5 | /** 6 | * Indicates a integer at its upper signed limit is 2,147,483,647 (MySQL or PostgreSQL) or its upper unsigned limit (MySQL) of 4,294,967,295. 7 | * Common database programs, like MySQL, have a cap on how big of a number it can save. 
8 | * Please see the [MySQL documentation](https://dev.mysql.com/doc/refman/5.7/en/integer-types.html) for more information. 9 | * 10 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 11 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 12 | * @return {Object} describing the result 13 | */ 14 | maxInteger 15 | .name("Integer at its SQL upper limit") 16 | .description( 17 | "If a column contains numbers, make sure it's not 2,147,483,647 or 4,294,967,295. Common database programs like like MySQL and PostgreSQL limit to the size of numbers it can calculate." 18 | ) 19 | .conclusion( 20 | "It's possible this data was exported from SQL improperly. Consult your source." 21 | ) 22 | .methodology(function (rows, columnHeads) { 23 | var testState = "passed"; 24 | var maxInts = {}; 25 | columnHeads.forEach(function (columnHead) { 26 | maxInts[columnHead] = 0; 27 | }); 28 | // we will want to mark cells to be highlighted here 29 | var cellsToHighlight = []; 30 | // look through the rows 31 | rows.forEach(function (row) { 32 | // we make a row to keep track of cells we want to highlight 33 | var currentRow = {}; 34 | columnHeads.forEach(function (columnHead) { 35 | var cell = row[columnHead]; 36 | var strippedCell = util.stripNumeric(cell); 37 | var f = parseFloat(strippedCell); 38 | // this will only be true if the cell is a number 39 | if (typeof f === "number" && (f === 2147483647 || f === 4294967295)) { 40 | maxInts[columnHead] += 1; 41 | currentRow[columnHead] = 1; 42 | testState = "failed"; 43 | } else { 44 | currentRow[columnHead] = 0; 45 | } 46 | }); 47 | // push our marking row onto our cells array 48 | cellsToHighlight.push(currentRow); 49 | }); 50 | var result = { 51 | testState: testState, 52 | highlightCells: cellsToHighlight, // a mirror of the dataset, but with a 1 or 0 for each cell if it should be highlighted or not 53 | }; 54 | return result; 55 | }); 56 | 57 | module.exports = 
var DataprooferTest = require("dataproofertest-js");
var util = require("dataproofertest-js/util");
var maxSmallInteger = new DataprooferTest();

/**
 * Flags every cell whose numeric value sits exactly on a `smallint` ceiling:
 * 32,767 (signed, MySQL or PostgreSQL) or 65,535 (unsigned, MySQL).
 * Common database programs, like MySQL, have a cap on how big of a number it can save.
 * Please see the [MySQL documentation](https://dev.mysql.com/doc/refman/5.7/en/integer-types.html) or [PostgreSQL documentation](http://www.postgresql.org/docs/9.5/interactive/datatype-numeric.html) for more information.
 *
 * @param {Array} rows - an array of objects representing rows in the spreadsheet
 * @param {Array} columnHeads - an array of strings for column names of the spreadsheet
 * @return {Object} describing the result
 */
maxSmallInteger
  .name("Small integer at its SQL upper limit")
  .description(
    "If a column contains numbers, make sure it's not 65,535 or 32,767. Common database programs like MySQL limit to the size of numbers it can store."
  )
  .conclusion(
    "It's possible this data was exported from SQL improperly. Consult your source."
  )
  .methodology(function (rows, columnHeads) {
    var outcome = "passed";
    // one marker object per spreadsheet row, in row order
    var markers = [];

    rows.forEach(function (record) {
      var flags = {};
      columnHeads.forEach(function (heading) {
        // non-numeric cells parse to NaN, which never equals a limit
        var value = parseFloat(util.stripNumeric(record[heading]));
        var atLimit = value === 32767 || value === 65535;
        flags[heading] = atLimit ? 1 : 0;
        if (atLimit) {
          outcome = "failed";
        }
      });
      markers.push(flags);
    });

    return {
      testState: outcome,
      // a mirror of the dataset, but with a 1 or 0 for each cell if it should be highlighted or not
      highlightCells: markers,
    };
  });

module.exports = maxSmallInteger;
7 | * Please see the [Integrity Checks](https://github.com/propublica/guides/blob/master/data-bulletproofing.md#integrity-checks-for-every-data-set) section of the ProPublica [Data Bulletproofing Guide](https://github.com/propublica/guides/blob/master/data-bulletproofing.md) for more information. 8 | * 9 | * 10 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 11 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 12 | * @return {Object} describing the result 13 | */ 14 | maxSummedInteger 15 | .name("Summed integer at its upper limit") 16 | .description( 17 | "If a column contains numbers, make sure it's not 2,097,152. Common database programs like MySQL limit to the size of numbers it can calculate." 18 | ) 19 | .conclusion( 20 | "It's possible this data was exported from SQL improperly. Consult your source." 21 | ) 22 | .methodology(function (rows, columnHeads) { 23 | var testState = "passed"; 24 | var maxSummedInts = {}; 25 | columnHeads.forEach(function (columnHead) { 26 | maxSummedInts[columnHead] = 0; 27 | }); 28 | // we will want to mark cells to be highlighted here 29 | var cellsToHighlight = []; 30 | // look through the rows 31 | rows.forEach(function (row) { 32 | // we make a row to keep track of cells we want to highlight 33 | var currentRow = {}; 34 | columnHeads.forEach(function (columnHead) { 35 | var cell = row[columnHead]; 36 | var strippedCell = util.stripNumeric(cell); 37 | var f = parseFloat(strippedCell); 38 | // this will only be true if the cell is a number 39 | if (typeof f === "number" && f === 2097152) { 40 | maxSummedInts[columnHead] += 1; 41 | currentRow[columnHead] = 1; 42 | testState = "failed"; 43 | } else { 44 | currentRow[columnHead] = 0; 45 | } 46 | }); 47 | // push our marking row onto our cells array 48 | cellsToHighlight.push(currentRow); 49 | }); 50 | 51 | var result = { 52 | testState: testState, 53 | highlightCells: cellsToHighlight, // a mirror of the 
var DataprooferTest = require("dataproofertest-js");
var numberOfRowsIs65k = new DataprooferTest();

/**
 * Fails when the spreadsheet holds exactly 65,536 rows, the row count at
 * which Excel cuts an export off; any other row count passes.
 *
 * @param {Array} rows - an array of objects representing rows in the spreadsheet
 * @param {Array} columnHeads - an array of strings for column names of the spreadsheet
 * @return {Object} describing the result
 */
numberOfRowsIs65k
  .name("Potentially missing rows")
  .description(
    "Test to see if number of rows is exactly 65,536 rows (cutoff by Excel)"
  )
  .conclusion(
    "This dataset has exactly 65,536 rows, which is an export cutoff in Excel. Double-check with your source that you have all the data."
  )
  .methodology(function (rows, columnHeads) {
    var hitsExcelCutoff = rows.length === 65536;
    return {
      testState: hitsExcelCutoff ? "failed" : "passed",
    };
  });

module.exports = numberOfRowsIs65k;
See ProPublica's bad data guide for further information 6 | * https://github.com/propublica/guides/blob/master/data-bulletproofing.md#integrity-checks-for-every-data-set 7 | * 8 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 9 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 10 | * @return {Object} describing the result 11 | */ 12 | stringsHaveExactly255Characters 13 | .name("Words at their character limit") 14 | .description( 15 | "Determine the cells that have exactly 255 characters. Database programs like SQL have a limit to the length of words it can output." 16 | ) 17 | .conclusion( 18 | "Strings that are exactly 255 characters are suspicious because it could be an export problem. Double-check with your source that you have all the data." 19 | ) 20 | .methodology(function (rows, columnHeads) { 21 | var testState = "passed"; 22 | var strings = {}; 23 | columnHeads.forEach(function (columnHead) { 24 | strings[columnHead] = 0; 25 | }); 26 | var cellsToHighlight = []; // we will want to mark cells to be highlighted here 27 | var has255 = false; 28 | // look through the rows 29 | rows.forEach(function (row) { 30 | var currentRow = {}; // we make a row to keep track of cells we want to highlight 31 | columnHeads.forEach(function (columnHead) { 32 | var cell = row[columnHead]; 33 | if (cell && cell.length === 255) { 34 | currentRow[columnHead] = 1; 35 | strings[columnHead] += 1; 36 | has255 = true; // we want to know if it occurrs at least once 37 | } else { 38 | currentRow[columnHead] = 0; 39 | } 40 | }); 41 | cellsToHighlight.push(currentRow); // push our marking row onto our cells array 42 | }); 43 | 44 | if (has255) testState = "failed"; 45 | 46 | var result = { 47 | testState: testState, 48 | highlightCells: cellsToHighlight, // a mirror of the dataset, but with a 1 or 0 for each cell if it should be highlighted or not 49 | }; 50 | return result; 51 | }); 52 | 53 | module.exports = 
stringsHaveExactly255Characters; 54 | -------------------------------------------------------------------------------- /packages/dataproofertest-js/README.md: -------------------------------------------------------------------------------- 1 | # DataprooferTest 2 | 3 | Creates an individual test instance to be used with the [Dataproofer app](https://github.com/dataproofer/Dataproofer/), or as a standalone data check in your JavaScript. 4 | 5 | `var myTest = new DataprooferTest()` 6 | 7 | * [API Documentation](https://github.com/dataproofer/dataproofertest-js/blob/master/DOCUMENTATION.md) 8 | * [Repository](https://github.com/dataproofer/dataproofertest-js/) 9 | * [Issues](https://github.com/dataproofer/dataproofertest-js/issues) 10 | 11 | # Table of Contents 12 | 13 | * [Development](https://github.com/dataproofer/suite-template-suite#development) 14 | * [Getting Started](https://github.com/dataproofer/stats-suite#getting-started) 15 | * [Writing Tests](https://github.com/dataproofer/stats-suite#writing-tests) 16 | * [Building Docs](https://github.com/dataproofer/suite-template#building-docs) 17 | 18 | ## Development 19 | 20 | ### Getting Started 21 | ``` 22 | git clone git@github.com:dataproofer/dataproofertest-js.git 23 | cd dataproofertest-js 24 | npm run bootstrap 25 | ``` 26 | 27 | ### Writing Tests 28 | 29 | * [How To](https://github.com/dataproofer/Dataproofer#creating-a-new-test) 30 | * [Helper scripts](https://github.com/dataproofer/dataproofertest-js/blob/master/DOCUMENTATION.md#util) 31 | * Templates 32 | * [Basic Test](https://github.com/dataproofer/suite-template/blob/master/src/myTest.js) 33 | * [Advanced Test](https://github.com/dataproofer/suite-template/blob/master/src/myAdvancedTest.js) 34 | 35 | ### Building Docs 36 | 37 | We use [documentation.js](https://github.com/documentationjs/documentation), but have created a handy script for regenerating documentation. 
38 | 39 | ``` 40 | npm run docs 41 | ``` 42 | 43 | Then open up and check your docs in [DOCUMENTATION.md](https://github.com/dataproofer/info-suite/blob/master/DOCUMENTATION.md) -------------------------------------------------------------------------------- /packages/dataproofertest-js/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Creates an individual test instance. 3 | * 4 | * @class DataprooferTest 5 | * @param {Object} options contains testing functions 6 | * @param options.methodology **MUST BE INCLUDED TO WORK.** Begins testing and defaults to empty result object 7 | * @example 8 | * var DataprooferTest = require('DataprooferTest'); 9 | * var myTest = new DataprooferTest({ 10 | * methodology: function(rows, columnHeads, input) { 11 | * var testState; 12 | * if (rows.length > 0) { 13 | * testState = "passed"; 14 | * } else { 15 | * testState = "failed"; 16 | * } 17 | * // RESULTS MUST INCLUDE THESE FIVE KEY VALUES 18 | * return { 19 | * testState: testState, // the result of the test 20 | * highlightedCells: [...] // array of cell objects to highlight 21 | * } 22 | * } 23 | * }); 24 | * 25 | * // { 26 | * // testState: "failed", 27 | * // highlightedCells: [...] 
/**
 * Fallback methodology used when a test has no proofing function of its own.
 * It is invoked with the test instance as `this` (see `proof`) and reports a
 * failed result carrying the instance's current name, description and
 * conclusion.
 *
 * @returns {Object} default "failed" result object
 */
function defaultMethodology() {
  return {
    testState: "failed",
    name: this._name,
    description: this._description,
    conclusion: this._conclusion,
    highlightedCells: [],
  };
}

/**
 * Creates an individual test instance.
 *
 * Every option is optional; anything missing or of the wrong type falls back
 * to an empty string (name, description, conclusion) or to a methodology that
 * always reports "failed".
 *
 * @class DataprooferTest
 * @param {Object} [options] initial configuration of the test
 * @param {String} [options.name] display name of the test
 * @param {String} [options.description] what the test checks
 * @param {String|Function} [options.conclusion] fixed conclusion text, or a
 *   function that receives a result object and returns the conclusion text
 * @param {Function} [options.methodology] function(rows, columnHeads, input)
 *   returning a result object with at least a `testState` key
 */
var DataprooferTest = function (options) {
  // Read from the argument itself; the instance has no `options` property.
  var settings = options || {};
  var name = settings.name;
  var description = settings.description;
  var conclusion = settings.conclusion;
  var methodology = settings.methodology;

  this._name = typeof name === "string" ? name : "";
  this._description = typeof description === "string" ? description : "";

  if (typeof conclusion === "function") {
    // A conclusion factory is evaluated later, once a result is available.
    this._conclusionFactory = conclusion;
    this._conclusion = "";
  } else {
    this._conclusion = typeof conclusion === "string" ? conclusion : "";
  }

  this._methodology =
    typeof methodology === "function" ? methodology : defaultMethodology;
};

DataprooferTest.prototype = {
  /**
   * Get or set a test's full name.
   *
   * @param {String} [nameString] new name; omit to read the current name
   * @returns {String|DataprooferTest|undefined} the name when called without
   *   arguments, the test itself (for chaining) after a successful set, or
   *   undefined when the argument is not a string
   * @example
   * var myTest = new DataprooferTest({...});
   * myTest.name("My test");
   * myTest.name();
   *
   * // "My test"
   */
  name: function (nameString) {
    if (arguments.length === 0) {
      return this._name;
    }
    if (typeof nameString === "string") {
      this._name = nameString;
      return this;
    }
    console.error("Must provide a string as the name");
    return undefined;
  },

  /**
   * Get or set a test's description.
   *
   * @param {String} [descriptionString] new description; omit to read it
   * @returns {String|DataprooferTest|undefined} the description when called
   *   without arguments, the test itself after a successful set, or undefined
   *   when the argument is not a string
   * @example
   * var myTest = new DataprooferTest({...});
   * myTest.description("Counts the rows of a spreadsheet");
   */
  description: function (descriptionString) {
    if (arguments.length === 0) {
      return this._description;
    }
    if (typeof descriptionString === "string") {
      this._description = descriptionString;
      return this;
    }
    console.error("Must provide a string as the description");
    return undefined;
  },

  /**
   * Get or set a test's proofing function, aka its methodology.
   *
   * @param {Function} [methodologyFunction] function(rows, columnHeads, input)
   *   returning a result object; omit to read the current function
   * @returns {Function|DataprooferTest|undefined} the current methodology when
   *   called without arguments, the test itself after a successful set, or
   *   undefined when the argument is not a function
   * @example
   * var myTest = new DataprooferTest({...});
   * myTest.methodology(function (rows, columnHeads, input) {...});
   */
  methodology: function (methodologyFunction) {
    if (arguments.length === 0) {
      return this._methodology;
    }
    if (typeof methodologyFunction === "function") {
      this._methodology = methodologyFunction;
      return this;
    }
    console.error("Must provide a function to proof data");
    return undefined;
  },

  /**
   * Runs the test's methodology. If none was configured, the default
   * "failed" result object is returned.
   *
   * @param {Array} [rows=[]] row objects; column heads are keys, cells are values
   * @param {Array} [columnHeads=[]] column head names
   * @param {Object} [input={}] user input passed through to the methodology
   * @returns {Object} whatever the methodology returns
   * @example
   * var myTest = new DataprooferTest({...});
   * myTest.proof(rows, columnHeads, input);
   */
  proof: function (rows, columnHeads, input) {
    return this._methodology(rows || [], columnHeads || [], input || {});
  },

  /**
   * Runs this test and returns only its `testState`.
   *
   * @param {Array} [rows=[]] row objects; column heads are keys, cells are values
   * @param {Array} [columnHeads=[]] column head names
   * @param {Object} [input={}] user input passed through to the methodology
   * @returns {String} the result's `testState`, or an error message when the
   *   methodology did not return an object with a truthy `testState`
   * @example
   * var myTest = new DataprooferTest({...});
   * myTest.doesPass(rows, columnHeads, input);
   *
   * // "failed"
   */
  doesPass: function (rows, columnHeads, input) {
    // Proof with this instance; a fresh blank test would always report the
    // default "failed" regardless of the configured methodology.
    var outcome = this.proof(rows, columnHeads, input);
    return (
      (outcome && outcome.testState) ||
      "Error: Methodology must return an object with a 'testState' key!"
    );
  },

  /**
   * Get, generate or set a test's conclusion, i.e. the next steps someone
   * should take if a test does not pass. The behavior depends on the argument:
   *
   * - no argument: returns the current conclusion string
   * - object (a test result): runs the conclusion factory, if one is set,
   *   stores its output and returns the resulting conclusion string
   * - string: stores it as the conclusion and returns the test for chaining
   * - function: stores it as the conclusion factory and returns the test
   *
   * @param {Object|String|Function} [input] see above
   * @returns {String|DataprooferTest|undefined} see above; undefined for any
   *   other argument type
   * @example
   * var myTest = new DataprooferTest({...});
   * myTest.conclusion(function (result) {
   *   if (result.testState === "passed") {
   *     return "Passed. You may use this dataset";
   *   } else {
   *     return "You may not use this dataset";
   *   }
   * });
   */
  conclusion: function (input) {
    if (arguments.length === 0) {
      return this._conclusion;
    }
    if (typeof input === "object") {
      // input is a test result: let the factory, if any, build the conclusion
      if (typeof this._conclusionFactory === "function") {
        this._conclusion = this._conclusionFactory(input);
      }
      return this._conclusion;
    }
    if (typeof input === "string") {
      this._conclusion = input;
      return this;
    }
    if (typeof input === "function") {
      this._conclusionFactory = input;
      return this;
    }
    console.error("Must provide a string as the conclusion");
    return undefined;
  },
};
/**
 * Check if a value is numeric (a cell's value could be a number in string form).
 *
 * Numbers count as numeric unless they are NaN. Strings count when they are
 * not blank and coerce to a number; blank and whitespace-only strings are
 * rejected explicitly because they would otherwise coerce to 0.
 *
 * @param {*} value A string, number or null value
 * @returns {boolean} true when the value is a number or a numeric string
 */
function isNumeric(value) {
  if (typeof value === "number") return !Number.isNaN(value);
  if (typeof value === "string") {
    if (value.trim() === "") return false;
    // the coercing global isNaN is intentional: it accepts numeric strings
    return !isNaN(value);
  }
  return false;
}

/**
 * Check if a value can be treated as a string.
 *
 * Strings always qualify. Numbers other than NaN qualify as well.
 * NOTE(review): accepting numbers looks like a leftover from isNumeric, but
 * callers may rely on it, so the behavior is kept as-is.
 *
 * @param {*} value A string, number or null value
 * @returns {boolean} true for strings and non-NaN numbers
 */
function isString(value) {
  if (typeof value === "string") return true;
  if (typeof value === "number") return !isNaN(value);
  return false;
}

/**
 * Check if a cell's value is empty: null, undefined, the empty string, or
 * anything whose string form contains no non-whitespace character.
 *
 * @param {*} value A string, number or null value
 * @returns {boolean} true when the cell holds nothing meaningful
 */
function isEmpty(value) {
  if (value === null) return true;
  if (typeof value === "undefined") return true;
  if (value === "") return true;
  // RegExp#test coerces its argument to a string
  if (/\S/.test(value) === false) return true;
  return false;
}

/**
 * Strip a cell of the following characters: "$", ",", "%" and whitespace.
 *
 * @param {*} value A string, number or null value
 * @returns {String|Number|undefined} numbers are returned untouched, strings
 *   are returned stripped, falsy values become "", and any other type yields
 *   undefined
 */
function stripNumeric(value) {
  if (typeof value === "number") return value;
  if (!value) return "";
  if (typeof value === "string") {
    return value.replace(/[$,%\s]/g, "");
  }
  return undefined;
}

/**
 * Return a string representing a fraction as a percentage with one decimal.
 *
 * @param {Number} fraction the fraction to format, e.g. 0.25
 * @returns {String} the formatted percentage, e.g. "25.0%"
 */
function percent(fraction) {
  var formatPercent = d3.format(".1f");
  return formatPercent(100 * fraction) + "%";
}
[src/voidLngLat.js:15-142](https://github.com/dataproofer/geo-suite/blob/2a337e71dc8e216b6351bb88a788524f28104441/src/voidLngLat.js#L15-L142 "Source code on GitHub") 23 | 24 | Verify that columns assumed to contain longitude or latitudes have non-zero values. 25 | Void values are coordinates at 0º,0º. 26 | 27 | **Parameters** 28 | 29 | - `rows` **Array** an array of objects representing rows in the spreadsheet 30 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 31 | 32 | Returns **Object** describing the result 33 | 34 | ### validLngLat.js 35 | 36 | [src/validLngLat.js:14-146](https://github.com/dataproofer/geo-suite/blob/2a337e71dc8e216b6351bb88a788524f28104441/src/validLngLat.js#L14-L146 "Source code on GitHub") 37 | 38 | Verify that columns assumed to contain longitude or latitudes have valid values. 39 | Invalid values are longitudes above 180º or below -180º, and latitudes above 90º or below -90º. 40 | 41 | **Parameters** 42 | 43 | - `rows` **Array** an array of objects representing rows in the spreadsheet 44 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 45 | 46 | Returns **Object** describing the result 47 | 48 | 49 | **Parameters** 50 | 51 | - `rows` **Array** an array of objects representing rows in the spreadsheet 52 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 53 | 54 | Returns **Object** result an object describing the result 55 | 56 | 57 | ## Development 58 | 59 | ### Getting Started 60 | 61 | ``` 62 | git clone git@github.com:dataproofer/geo-suite.git 63 | cd geo-suite 64 | npm install 65 | ``` 66 | 67 | ### Writing Tests 68 | 69 | * [How To](https://github.com/dataproofer/Dataproofer#creating-a-new-test) 70 | * [Helper Scripts](https://github.com/dataproofer/dataproofertest-js/blob/master/DOCUMENTATION.md#util) 71 | * Templates 72 | * [Basic Test](https://github.com/dataproofer/suite-template/blob/master/src/myTest.js) 73 | * [Advanced 
Test](https://github.com/dataproofer/suite-template/blob/master/src/myAdvancedTest.js) 74 | 75 | ### Building Docs 76 | 77 | We use [documentation.js](https://github.com/documentationjs/documentation), but have created a handy script for regenerating documentation. 78 | 79 | ``` 80 | npm run docs 81 | ``` 82 | 83 | Then open up and check your docs in [DOCUMENTATION.md](https://github.com/dataproofer/info-suite/blob/master/DOCUMENTATION.md) -------------------------------------------------------------------------------- /packages/geo-suite/index.js: -------------------------------------------------------------------------------- 1 | // All test suites will have a name and a list 2 | exports = module.exports = { 3 | name: "dataproofer-geo-suite", 4 | fullName: "Geographic Data Tests", 5 | tests: [], // the list of main tests to be run in the suite 6 | }; 7 | 8 | var invalidLngLat = require("./src/invalidLngLat"); 9 | var voidLngLat = require("./src/voidLngLat"); 10 | 11 | exports.tests.push(invalidLngLat, voidLngLat); 12 | -------------------------------------------------------------------------------- /packages/geo-suite/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataproofer-geo-suite", 3 | "version": "2.1.0", 4 | "description": "Suite of geographic and mapping related tests for dataproofer", 5 | "main": "index.js", 6 | "scripts": { 7 | "docs": "yarn run documentation/bin/documentation.js ./src/*.js -f md -g -o DOCUMENTATION.md" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/dataproofer/geo-suite" 12 | }, 13 | "keywords": [ 14 | "data", 15 | "csv", 16 | "excel" 17 | ], 18 | "author": "DataProofer ", 19 | "license": "GPL-3.0", 20 | "dependencies": { 21 | "dataproofertest-js": "2.1.0", 22 | "lodash": "^4.17.20" 23 | }, 24 | "devDependencies": { 25 | "@babel/core": "^7.13.15", 26 | "@babel/eslint-parser": "^7.12.16", 27 | "@babel/eslint-plugin": "^7.12.13", 28 | 
var invalidLngLat = new DataprooferTest();

/**
 * Verify that columns assumed to contain longitude or latitudes have valid values.
 * Invalid values are longitudes above 180º or below -180º and latitudes above
 * 90º or below -90º. Columns are picked by header name only.
 *
 * @param {Array} rows - an array of objects representing rows in the spreadsheet
 * @param {Array} columnHeads - an array of strings for column names of the spreadsheet
 * @return {Object} describing the result: `testState` ("passed" or "failed"),
 *   `highlightCells` (one object per row with a 1 or 0 per column) and
 *   `columnWise` (number of flagged cells per column)
 */
invalidLngLat
  .name("Invalid coordinates")
  .description(
    "Check for invalid longitude and latitude values in columns presumed to contain geographic coordinates"
  )
  .methodology(function (rows, columnHeads) {
    // Header names that suggest longitude and/or latitude values
    var potentialDoubleCoordinates = [
      "latlon",
      "latitude/longitude",
      "longitude/latitude",
      "lonlat",
      "lnglat",
    ];
    var potentialLats = ["latitude", "lat"];
    var potentialLons = ["longitude", "lng", "lon", "long"];

    // keep track of the columns which match our criteria
    var doubleColumns = [];
    var latColumns = [];
    var lonColumns = [];
    // NOTE: in the future the selectedColumns might override this
    columnHeads.forEach(function (column) {
      var lower = String(column).toLowerCase();
      if (potentialDoubleCoordinates.indexOf(lower) >= 0) {
        doubleColumns.push(column);
      } else if (
        potentialLats.indexOf(lower) >= 0 ||
        lower.indexOf("latitude") >= 0
      ) {
        latColumns.push(column);
      } else if (
        potentialLons.indexOf(lower) >= 0 ||
        lower.indexOf("longitude") >= 0
      ) {
        lonColumns.push(column);
      }
    });

    // number of flagged cells per column
    var invalidCoords = {};
    columnHeads.forEach(function (column) {
      invalidCoords[column] = 0;
    });
    var cellsToHighlight = [];
    var testState = "passed";

    if (latColumns.length || lonColumns.length || doubleColumns.length) {
      rows.forEach(function (row) {
        var highlightRow = {};
        columnHeads.forEach(function (column) {
          highlightRow[column] = 0;
        });

        // record one invalid cell in the current row
        function markInvalid(column) {
          testState = "failed";
          invalidCoords[column] += 1;
          highlightRow[column] = 1;
        }

        // columns holding "first,second" coordinate pairs
        doubleColumns.forEach(function (column) {
          var cell = row[column];
          if (typeof cell !== "string") {
            // this isn't in a format we recognize
            markInvalid(column);
            return;
          }
          var parts = cell.split(",");
          var first = parseFloat(parts[0]);
          var second = parseFloat(parts[1]);
          // the header tells the order: "latlon" and "latitude/longitude"
          // start with the latitude, every other accepted name starts with
          // the longitude
          var latFirst = String(column).toLowerCase().indexOf("lat") === 0;
          var lat = latFirst ? first : second;
          var lng = latFirst ? second : first;
          if (
            isNaN(lat) ||
            isNaN(lng) ||
            lng > 180 ||
            lng < -180 ||
            lat > 90 ||
            lat < -90
          ) {
            markInvalid(column);
          }
        });

        // checks a single column (either lat or lon) against its valid range
        function checkColumn(column, limit) {
          var cell = row[column];
          if (util.isEmpty(cell)) {
            // if the cell is empty its definitely not a valid lat/lon
            markInvalid(column);
          } else if (util.isNumeric(cell)) {
            var num = parseFloat(cell);
            if (num > limit || num < -limit) {
              markInvalid(column);
            }
          } else {
            // this test could be overly aggressive if we wrongly guess
            // that a column contains lat/lon by name only
            markInvalid(column);
          }
        }
        lonColumns.forEach(function (column) {
          checkColumn(column, 180);
        });
        latColumns.forEach(function (column) {
          checkColumn(column, 90);
        });

        cellsToHighlight.push(highlightRow);
      });
    }

    return {
      testState: testState,
      highlightCells: cellsToHighlight,
      // read by the conclusion below
      columnWise: invalidCoords,
    };
  })
  .conclusion(function (result) {
    // NOTE(review): the "<br>" separators are reconstructed on the assumption
    // that the conclusion is rendered as HTML; confirm against the renderer.
    var conclusionStr =
      "You have latitudes and longitudes that fall outside of the earth's coordinates. They may have been encoded improperly, or it's a mistake.<br>";
    var columnWise = (result && result.columnWise) || {};
    var rowCount =
      result && result.highlightCells ? result.highlightCells.length : 0;
    Object.keys(columnWise).forEach(function (column) {
      var currCount = columnWise[column];
      if (currCount > 0) {
        conclusionStr += 'column "' + column + '": ';
        conclusionStr += currCount + " cells, ";
        conclusionStr += util.percent(currCount / rowCount);
        conclusionStr += "<br>";
      }
    });
    return conclusionStr;
  });

var voidLngLat = new DataprooferTest();

/**
 * Verify that columns assumed to contain longitude or latitudes have non-zero values.
 * Flagged values are coordinates at 0º,0º as well as empty or non-numeric
 * cells. Columns are picked by header name only.
 *
 * @param {Array} rows - an array of objects representing rows in the spreadsheet
 * @param {Array} columnHeads - an array of strings for column names of the spreadsheet
 * @return {Object} describing the result: `testState` ("passed" or "failed"),
 *   `highlightCells` (one object per row with a 1 or 0 per column) and
 *   `columnWise` (number of flagged cells per column)
 */
voidLngLat
  .name("Void coordinates")
  .description(
    "Check for non-existent longitude and latitude values in columns presumed to contain geographic coordinates"
  )
  .methodology(function (rows, columnHeads) {
    // Header names that suggest longitude and/or latitude values
    var potentialDoubleCoordinates = [
      "latlon",
      "latitude/longitude",
      "longitude/latitude",
      "lonlat",
      "lnglat",
    ];
    var potentialLats = ["latitude", "lat"];
    var potentialLons = ["longitude", "lng", "lon", "long"];

    // keep track of the columns which match our criteria
    var doubleColumns = [];
    var latColumns = [];
    var lonColumns = [];
    // NOTE: in the future the selectedColumns might override this
    columnHeads.forEach(function (column) {
      var lower = String(column).toLowerCase();
      if (potentialDoubleCoordinates.indexOf(lower) >= 0) {
        doubleColumns.push(column);
      } else if (
        potentialLats.indexOf(lower) >= 0 ||
        lower.indexOf("latitude") >= 0
      ) {
        latColumns.push(column);
      } else if (
        potentialLons.indexOf(lower) >= 0 ||
        lower.indexOf("longitude") >= 0
      ) {
        lonColumns.push(column);
      }
    });

    // number of flagged cells per column
    var voidCoords = {};
    columnHeads.forEach(function (column) {
      voidCoords[column] = 0;
    });
    var cellsToHighlight = [];
    var testState = "passed";

    if (latColumns.length || lonColumns.length || doubleColumns.length) {
      rows.forEach(function (row) {
        var highlightRow = {};
        columnHeads.forEach(function (column) {
          highlightRow[column] = 0;
        });

        // record one void cell in the current row
        function markVoid(column) {
          testState = "failed";
          voidCoords[column] += 1;
          highlightRow[column] = 1;
        }

        // columns holding "first,second" coordinate pairs
        doubleColumns.forEach(function (column) {
          var cell = row[column];
          if (typeof cell !== "string") {
            // this isn't in a format we recognize
            markVoid(column);
            return;
          }
          var parts = cell.split(",");
          if (parseFloat(parts[0]) === 0 && parseFloat(parts[1]) === 0) {
            // null island
            markVoid(column);
          }
        });

        // we want to know if both columns are zero to detect null island;
        // each entry holds the name of the last zero-valued column of its kind
        var zeros = {
          lat: false,
          lon: false,
        };
        // checks a single column (either lat or lon)
        function checkColumn(column, latlon) {
          var cell = row[column];
          if (util.isEmpty(cell)) {
            // if the cell is empty its definitely not a valid lat/lon
            markVoid(column);
          } else if (util.isNumeric(cell)) {
            // a zero alone is fine; it only counts together with its partner
            if (parseFloat(cell) === 0) {
              zeros[latlon] = column;
            }
          } else {
            // this test could be overly aggressive if we wrongly guess
            // that a column contains lat/lon by name only
            markVoid(column);
          }
        }
        lonColumns.forEach(function (column) {
          checkColumn(column, "lon");
        });
        latColumns.forEach(function (column) {
          checkColumn(column, "lat");
        });
        if (zeros.lon && zeros.lat) {
          markVoid(zeros.lon);
          markVoid(zeros.lat);
        }

        cellsToHighlight.push(highlightRow);
      });
    }

    return {
      testState: testState,
      highlightCells: cellsToHighlight,
      // read by the conclusion below
      columnWise: voidCoords,
    };
  })
  .conclusion(function (result) {
    // NOTE(review): the "<br>" separators are reconstructed on the assumption
    // that the conclusion is rendered as HTML; confirm against the renderer.
    var conclusionStr =
      'You have latitudes and longitudes that are "0,0" - which is the ocean off west Africa. This is likely a mistake.<br>';
    var columnWise = (result && result.columnWise) || {};
    var rowCount =
      result && result.highlightCells ? result.highlightCells.length : 0;
    Object.keys(columnWise).forEach(function (column) {
      var currCount = columnWise[column];
      if (currCount > 0) {
        conclusionStr += 'column "' + column + '": ';
        conclusionStr += currCount + " cells, ";
        conclusionStr += util.percent(currCount / rowCount);
        conclusionStr += "<br>";
      }
    });
    return conclusionStr;
  });
[src/columnsContainNumbers.js:16-66](https://github.com/dataproofer/info-suite/blob/e302a25d3f139124e69ad779c22195ec977861c4/src/columnsContainNumbers.js#L16-L66 "Source code on GitHub") 37 | 38 | Determine the percentage of rows that are numbers for each column 39 | 40 | **Parameters** 41 | 42 | - `rows` **Array** an array of objects representing rows in the spreadsheet 43 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 44 | 45 | Returns **Object** describing the result 46 | 47 | ### columnsContainNothing.js 48 | 49 | [src/columnsContainNothing.js:16-67](https://github.com/dataproofer/info-suite/blob/e302a25d3f139124e69ad779c22195ec977861c4/src/columnsContainNothing.js#L16-L67 "Source code on GitHub") 50 | 51 | Calculates the percentage of rows that are empty for each column 52 | 53 | **Parameters** 54 | 55 | - `rows` **Array** an array of objects representing rows in the spreadsheet 56 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 57 | 58 | Returns **Object** describing the result 59 | 60 | ## Development 61 | 62 | ### Getting Started 63 | 64 | ``` 65 | git clone git@github.com:dataproofer/info-suite.git 66 | cd info-suite 67 | npm install 68 | ``` 69 | 70 | ### Writing Tests 71 | 72 | * [How To](https://github.com/dataproofer/Dataproofer#creating-a-new-test) 73 | * [Helper Scripts](https://github.com/dataproofer/dataproofertest-js/blob/master/DOCUMENTATION.md#util) 74 | * Templates 75 | * [Basic Test](https://github.com/dataproofer/suite-template/blob/master/src/myTest.js) 76 | * [Advanced Test](https://github.com/dataproofer/suite-template/blob/master/src/myAdvancedTest.js) 77 | 78 | ### Building Docs 79 | 80 | We use [documentation.js](https://github.com/documentationjs/documentation), but have created a handy script for regenerating documentation. 
81 | 82 | ``` 83 | npm run docs 84 | ``` 85 | 86 | Then open up and check your docs in [DOCUMENTATION.md](https://github.com/dataproofer/info-suite/blob/master/DOCUMENTATION.md) 87 | -------------------------------------------------------------------------------- /packages/info-suite/index.js: -------------------------------------------------------------------------------- 1 | // All test suites will have a name and a list 2 | exports = module.exports = { 3 | name: "dataproofer-info-suite", // a hyphenated, unique name 4 | fullName: "Information & Diagnostics", // a full name used for display in the desktop app 5 | tests: [], // the list of main tests to be run in the suite 6 | }; 7 | 8 | var columnsContainNumbers = require("./src/columnsContainNumbers"); 9 | var columnsContainStrings = require("./src/columnsContainStrings"); 10 | 11 | exports.tests.push(columnsContainNumbers, columnsContainStrings); 12 | -------------------------------------------------------------------------------- /packages/info-suite/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataproofer-info-suite", 3 | "version": "2.1.0", 4 | "description": "A set of informational scripts about Dataproofer spreadsheets", 5 | "main": "index.js", 6 | "scripts": { 7 | "docs": "yarn run documentation/bin/documentation.js ./src/*.js -f md -g -o DOCUMENTATION.md" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/dataproofer/info-suite" 12 | }, 13 | "keywords": [ 14 | "data", 15 | "csv", 16 | "excel" 17 | ], 18 | "author": "DataProofer ", 19 | "license": "GPL-3.0", 20 | "dependencies": { 21 | "dataproofertest-js": "2.1.0", 22 | "lodash": "^4.17.20" 23 | }, 24 | "devDependencies": { 25 | "@babel/core": "^7.13.15", 26 | "@babel/eslint-parser": "^7.12.16", 27 | "@babel/eslint-plugin": "^7.12.13", 28 | "documentation": "^13.1.1", 29 | "eslint": "^7.20.0", 30 | "eslint-config-prettier": "^7.2.0", 31 | 
"eslint-plugin-prettier": "^3.3.1", 32 | "prettier": "^2.2.1" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /packages/info-suite/src/columnsContainNumbers.js: -------------------------------------------------------------------------------- 1 | var _ = require("lodash"); 2 | var DataprooferTest = require("dataproofertest-js"); 3 | var util = require("dataproofertest-js/util"); 4 | 5 | var columnsContainNumbers = new DataprooferTest(); 6 | 7 | /** 8 | * Determine the percentage of rows that are numbers for each column 9 | * 10 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 11 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 12 | * @return {Object} describing the result 13 | */ 14 | columnsContainNumbers 15 | .name("Numeric cells") 16 | .description( 17 | "Determine the percentage of rows that are numbers for each column" 18 | ) 19 | .methodology(function (rows, columnHeads) { 20 | var numbers = {}; 21 | var testState = "info"; 22 | columnHeads.forEach(function (columnHead) { 23 | numbers[columnHead] = 0; 24 | }); 25 | var cellsToHighlight = []; // we will want to mark cells to be highlighted here 26 | // look through the rows 27 | rows.forEach(function (row) { 28 | // we make a row to keep track of cells we want to highlight 29 | var currentRow = {}; 30 | columnHeads.forEach(function (columnHead) { 31 | var cell = row[columnHead]; 32 | var strippedCell = util.stripNumeric(cell); 33 | // this will only be true if the cell is a number 34 | if (util.isNumeric(strippedCell)) { 35 | numbers[columnHead] += 1; 36 | currentRow[columnHead] = 1; 37 | } else { 38 | currentRow[columnHead] = 0; 39 | } 40 | }); 41 | // push our marking row onto our cells array 42 | cellsToHighlight.push(currentRow); 43 | }); 44 | 45 | var result = { 46 | testState: testState, 47 | highlightCells: cellsToHighlight, 48 | }; 49 | return result; 50 | }) 51 | 
.conclusion(function (result) { 52 | var conclusionStr = ""; 53 | var columns = _.keys(result.columnWise); 54 | columns.forEach(function (column) { 55 | // Column foo: 56 | var currCount = result.columnWise[column]; 57 | if (currCount > 0) { 58 | conclusionStr += column + ": "; 59 | conclusionStr += result.columnWise[column] + " cells, "; 60 | conclusionStr += util.percent( 61 | result.columnWise[column] / result.highlightCells.length 62 | ); 63 | conclusionStr += " of column
"; 64 | } 65 | }); 66 | return conclusionStr; 67 | }); 68 | 69 | module.exports = columnsContainNumbers; 70 | -------------------------------------------------------------------------------- /packages/info-suite/src/columnsContainStrings.js: -------------------------------------------------------------------------------- 1 | var _ = require("lodash"); 2 | var DataprooferTest = require("dataproofertest-js"); 3 | var util = require("dataproofertest-js/util"); 4 | 5 | var columnsContainStrings = new DataprooferTest(); 6 | 7 | /** 8 | * Determine the percentage of rows that are strings for each column 9 | * 10 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 11 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 12 | * @return {Object} describing the result 13 | */ 14 | columnsContainStrings 15 | .name("String cells") 16 | .description( 17 | "Determine the percentage of rows that are strings for each column" 18 | ) 19 | .methodology(function (rows, columnHeads) { 20 | var strings = {}; 21 | var testState = "info"; 22 | columnHeads.forEach(function (columnHead) { 23 | strings[columnHead] = 0; 24 | }); 25 | var cellsToHighlight = []; // we will want to mark cells to be highlighted here 26 | // look through the rows 27 | rows.forEach(function (row) { 28 | // we make a row to keep track of cells we want to highlight 29 | var currentRow = {}; 30 | columnHeads.forEach(function (columnHead) { 31 | var cell = row[columnHead]; 32 | var strippedCell = cell; 33 | // first we check to make sure the string isn't a number, and then if its a string 34 | // this is because data always comes as a string from the spreadsheet 35 | if ( 36 | !util.isNumeric(util.stripNumeric(strippedCell)) && 37 | // TODO: check for date when we get it 38 | util.isString(strippedCell) 39 | ) { 40 | strings[columnHead] += 1; 41 | currentRow[columnHead] = 1; 42 | } else { 43 | currentRow[columnHead] = 0; 44 | } 45 | }); 46 | // push our 
marking row onto our cells array 47 | cellsToHighlight.push(currentRow); 48 | }); 49 | 50 | var result = { 51 | testState: testState, 52 | highlightCells: cellsToHighlight, 53 | }; 54 | return result; 55 | }) 56 | .conclusion(function (result) { 57 | var conclusionStr = ""; 58 | var columns = _.keys(result.columnWise); 59 | columns.forEach(function (column) { 60 | // Column foo: 61 | var currCount = result.columnWise[column]; 62 | if (currCount > 0) { 63 | conclusionStr += column + ": "; 64 | conclusionStr += result.columnWise[column] + " cells, "; 65 | conclusionStr += util.percent( 66 | result.columnWise[column] / result.highlightCells.length 67 | ); 68 | conclusionStr += " of column
"; 69 | } 70 | }); 71 | return conclusionStr; 72 | }); 73 | 74 | module.exports = columnsContainStrings; 75 | -------------------------------------------------------------------------------- /packages/stats-suite/README.md: -------------------------------------------------------------------------------- 1 | # stats-suite 2 | Suite of statistical tests for [Dataproofer](https://github.com/dataproofer/Dataproofer) 3 | 4 | * [Documentation](https://github.com/dataproofer/stats-suite/blob/master/README.md) 5 | * [Repository](https://github.com/dataproofer/stats-suite/) 6 | * [Issues](https://github.com/dataproofer/stats-suite/issues) 7 | 8 | ## Table of Contents 9 | 10 | * [Tests](https://github.com/dataproofer/stats-suite#tests) 11 | * [standardDeviationOutliers.js](https://github.com/dataproofer/stats-suite#standarddeviationoutliersjs) 12 | * [medianAbsoluteDeviationOutliers.js](https://github.com/dataproofer/stats-suite#medianabsolutedeviationoutliersjs) 13 | * [Development](https://github.com/dataproofer/stats-suite#development) 14 | * [Getting Started](https://github.com/dataproofer/stats-suite#getting-started) 15 | * [Writing Tests](https://github.com/dataproofer/stats-suite#writing-tests) 16 | * [Building Docs](https://github.com/dataproofer/stats-suite#documentation) 17 | 18 | ## Tests 19 | 20 | ### standardDeviationOutliers.js 21 | 22 | [src/standardDeviationOutliers.js:18-132](https://github.com/dataproofer/stats-suite/blob/3bf0ba467787a998d1b5436e9212342708cc2d11/src/standardDeviationOutliers.js#L18-L132 "Source code on GitHub") 23 | 24 | Outlier detection using [standard deviation](https://en.wikipedia.org/wiki/standard_deviation) 25 | Examples in comparison with median absolute deviations: 26 | 27 | - [simple-statistics.js](http://simplestatistics.org/docs/#samplestandarddeviation) 28 | - [agate.py](http://agate-stats.readthedocs.org/en/0.3.1/index.html) 29 | 30 | **Parameters** 31 | 32 | - `rows` **Array** an array of objects representing rows in the 
spreadsheet 33 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 34 | 35 | Returns **Object** describing the result 36 | 37 | ### medianAbsoluteDeviationOutliers.js 38 | 39 | [src/medianAbsoluteDeviationOutliers.js:18-136](https://github.com/dataproofer/stats-suite/blob/3bf0ba467787a998d1b5436e9212342708cc2d11/src/medianAbsoluteDeviationOutliers.js#L18-L136 "Source code on GitHub") 40 | 41 | Outlier detection using [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) 42 | Examples in comparison with standard deviation: 43 | 44 | - [simple-statistics.js](http://simplestatistics.org/docs/#mad) 45 | - [agate.py](http://agate-stats.readthedocs.org/en/0.3.1/index.html) 46 | 47 | **Parameters** 48 | 49 | - `rows` **Array** an array of objects representing rows in the spreadsheet 50 | - `columnHeads` **Array** an array of strings for column names of the spreadsheet 51 | 52 | Returns **Object** describing the result 53 | 54 | ## Development 55 | 56 | ### Getting Started 57 | 58 | ``` 59 | git clone git@github.com:dataproofer/stats-suite.git 60 | cd stats-suite 61 | npm install 62 | ``` 63 | 64 | ### Writing Tests 65 | 66 | * [How to](https://github.com/dataproofer/Dataproofer#creating-a-new-test) 67 | * [Helper scripts documentation](https://github.com/dataproofer/dataproofertest-js/blob/master/DOCUMENTATION.md#util) 68 | * Templates 69 | * [Basic test](https://github.com/dataproofer/suite-template/blob/master/src/myTest.js) 70 | * [Advanced test](https://github.com/dataproofer/suite-template/blob/master/src/myAdvancedTest.js) 71 | 72 | ### Building Docs 73 | 74 | We use [documentation.js](https://github.com/documentationjs/documentation), but have created a handy script for regenerating documentation. 
75 | 76 | ``` 77 | npm run docs 78 | ``` 79 | 80 | Then open up and check your docs in [DOCUMENTATION.md](https://github.com/dataproofer/stats-suite/blob/master/DOCUMENTATION.md) -------------------------------------------------------------------------------- /packages/stats-suite/index.js: -------------------------------------------------------------------------------- 1 | // All test suites will have a name and a list 2 | exports = module.exports = { 3 | name: "dataproofer-stats-suite", 4 | fullName: "Statistical Data Tests", 5 | tests: [], // the list of main tests to be run in the suite 6 | }; 7 | 8 | var medianAbsoluteDeviationOutliers = require("./src/medianAbsoluteDeviationOutliers"); 9 | var standardDeviationOutliers = require("./src/standardDeviationOutliers"); 10 | 11 | exports.tests.push(standardDeviationOutliers, medianAbsoluteDeviationOutliers); 12 | -------------------------------------------------------------------------------- /packages/stats-suite/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataproofer-stats-suite", 3 | "version": "2.1.0", 4 | "description": "Suite of advanced statistics tests for dataproofer", 5 | "main": "index.js", 6 | "scripts": { 7 | "docs": "yarn run documentation/bin/documentation.js ./src/*.js -f md -g -o DOCUMENTATION.md" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/dataproofer/stats-suite" 12 | }, 13 | "keywords": [ 14 | "data", 15 | "csv", 16 | "excel" 17 | ], 18 | "author": "DataProofer ", 19 | "license": "GPL-3.0", 20 | "dependencies": { 21 | "dataproofertest-js": "2.1.0", 22 | "lodash": "^4.17.20", 23 | "simple-statistics": "^1.0.1" 24 | }, 25 | "devDependencies": { 26 | "@babel/core": "^7.13.15", 27 | "@babel/eslint-parser": "^7.12.16", 28 | "@babel/eslint-plugin": "^7.12.13", 29 | "documentation": "^13.1.1", 30 | "eslint": "^7.20.0", 31 | "eslint-config-prettier": "^7.2.0", 32 | "eslint-plugin-prettier": "^3.3.1", 
33 | "prettier": "^2.2.1" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /packages/stats-suite/src/medianAbsoluteDeviationOutliers.js: -------------------------------------------------------------------------------- 1 | var DataprooferTest = require("dataproofertest-js"); 2 | var util = require("dataproofertest-js/util"); 3 | var ss = require("simple-statistics"); 4 | var _ = require("lodash"); 5 | var medianAbsoluteDeviationOutliers = new DataprooferTest(); 6 | 7 | /** 8 | * Outlier detection using [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) 9 | * Examples in comparison with standard deviation: 10 | * * [simple-statistics.js](http://simplestatistics.org/docs/#mad) 11 | * * [agate.py](http://agate-stats.readthedocs.org/en/0.3.1/index.html) 12 | * 13 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 14 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 15 | * @return {Object} describing the result 16 | */ 17 | medianAbsoluteDeviationOutliers 18 | .name("Outliers from the median") 19 | .description( 20 | "Outliers are numbers more than three median absolute deviations from the median. Note: this is a more robust test for detecting potential outliers." 21 | ) 22 | .methodology(function (rows, columnHeads) { 23 | var outliersCount = 0; 24 | var testState = "info"; 25 | var deviations = 3; 26 | var columnsAsArraysObj = {}; 27 | // we will count number of numeric + empty cells to see if a column is 28 | // predominantly numeric. 
We wont count empty cells in outliers calculation 29 | var columnsNumericCount = {}; 30 | var columnsEmptyCount = {}; 31 | var columnsOutliersCount = {}; 32 | 33 | // we will want to keep track of columns which have a mad of 0, which indicates 34 | // most elements are the same (and results in bad distance calculation) 35 | var madZeroColumns = []; 36 | 37 | // convert rows to columns to calculate the deviation 38 | _.each(columnHeads, function (columnHead) { 39 | columnsAsArraysObj[columnHead] = []; 40 | columnsNumericCount[columnHead] = 0; 41 | columnsEmptyCount[columnHead] = 0; 42 | columnsOutliersCount[columnHead] = 0; 43 | }); 44 | 45 | var cellsToHighlight = []; 46 | // add values from selected columns' cells row by row to the new columns objects 47 | _.each(rows, function (row, rowIndex) { 48 | var cellsRow = {}; 49 | _.each(columnHeads, function (currColumn) { 50 | cellsRow[currColumn] = 0; 51 | var value = util.stripNumeric(row[currColumn]); 52 | if (util.isNumeric(value)) { 53 | columnsAsArraysObj[currColumn].push(parseFloat(value)); 54 | columnsNumericCount[currColumn]++; 55 | } else if (util.isEmpty(row[currColumn])) { 56 | columnsEmptyCount[currColumn]++; 57 | } 58 | }); 59 | cellsToHighlight.push(cellsRow); 60 | }); 61 | 62 | // Tutorial for detecting outliers with MAD 63 | // * http://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers 64 | // for more, see the following: 65 | // * https://en.wikipedia.org/wiki/Median_absolute_deviation 66 | // examples in comparison with standard deviation: 67 | // * http://simplestatistics.org/docs/#mad 68 | // * http://agate-stats.readthedocs.org/en/0.3.1/index.html 69 | _.each(columnHeads, function (columnHead) { 70 | // we qualify a column for analysis if it is "mostly" numeric 71 | // we add # of empty rows to this count 72 | var numeric = columnsNumericCount[columnHead]; 73 | var empty = columnsEmptyCount[columnHead]; 74 | var ratio = (numeric + empty) / rows.length; 75 | if (ratio < 0.9) 
{ 76 | return; 77 | } 78 | var currentColumn = columnsAsArraysObj[columnHead]; 79 | var median = ss.median(currentColumn); 80 | var mad = ss.mad(currentColumn); 81 | // if mad == 0 we have a lot of identical values and should let the user know 82 | if (mad === 0) { 83 | madZeroColumns.push(columnHead); 84 | return; 85 | } 86 | _.each(rows, function (row, rowIndex) { 87 | var value = util.stripNumeric(row[columnHead]); 88 | if (util.isNumeric(value)) { 89 | var dist = Math.abs(value - median) / mad; 90 | if (dist > deviations) { 91 | // we found an outlier 92 | columnsOutliersCount[columnHead]++; 93 | // eslint-disable-next-line no-unused-vars 94 | outliersCount++; 95 | cellsToHighlight[rowIndex][columnHead] = 1; 96 | // TODO: save highlight cells 97 | } 98 | } 99 | }); 100 | }); 101 | 102 | // testState = (outliersCount > 0)? "info" : "fail"; 103 | 104 | return { 105 | testState: testState, 106 | highlightCells: cellsToHighlight, 107 | }; 108 | }) 109 | .conclusion(function (result) { 110 | var conclusionStr = ""; 111 | var columns = Object.keys(result.columnWise); 112 | columns.forEach(function (column) { 113 | // Column foo: 114 | var currCount = result.columnWise[column]; 115 | if (currCount > 0) { 116 | conclusionStr += 'column "' + column + '": '; 117 | conclusionStr += result.columnWise[column] + " cells, "; 118 | conclusionStr += util.percent( 119 | result.columnWise[column] / result.highlightCells.length 120 | ); 121 | conclusionStr += "
"; 122 | } 123 | }); 124 | return conclusionStr; 125 | }); 126 | 127 | module.exports = medianAbsoluteDeviationOutliers; 128 | -------------------------------------------------------------------------------- /packages/stats-suite/src/standardDeviationOutliers.js: -------------------------------------------------------------------------------- 1 | var DataprooferTest = require("dataproofertest-js"); 2 | var util = require("dataproofertest-js/util"); 3 | var ss = require("simple-statistics"); 4 | var _ = require("lodash"); 5 | var standardDeviationOutliers = new DataprooferTest(); 6 | 7 | /** 8 | * Outlier detection using [standard deviation](https://en.wikipedia.org/wiki/standard_deviation) 9 | * Examples in comparison with median absolute deviations: 10 | * * [simple-statistics.js](http://simplestatistics.org/docs/#samplestandarddeviation) 11 | * * [agate.py](http://agate-stats.readthedocs.org/en/0.3.1/index.html) 12 | * 13 | * @param {Array} rows - an array of objects representing rows in the spreadsheet 14 | * @param {Array} columnHeads - an array of strings for column names of the spreadsheet 15 | * @return {Object} describing the result 16 | */ 17 | standardDeviationOutliers 18 | .name("Outliers from the mean") 19 | .description( 20 | "Outliers are numbers more than three standard deviations from the mean. Note: this is a less robust test and may miss potential outliers." 21 | ) 22 | .methodology(function (rows, columnHeads) { 23 | var outliersCount = 0; 24 | var testState = "info"; 25 | var deviations = 3; 26 | var columnsAsArraysObj = {}; 27 | // we will count number of numeric + empty cells to see if a column is 28 | // predominantly numeric. 
We wont count empty cells in outliers calculation 29 | var columnsNumericCount = {}; 30 | var columnsEmptyCount = {}; 31 | var columnsOutliersCount = {}; 32 | 33 | // we will want to keep track of columns which have a standard deviation of 0, which indicates 34 | // most elements are the same (and results in bad distance calculation) 35 | var stdDevZeroColumns = []; 36 | 37 | // convert rows to columns to calculate the deviation 38 | _.each(columnHeads, function (columnHead) { 39 | columnsAsArraysObj[columnHead] = []; 40 | columnsNumericCount[columnHead] = 0; 41 | columnsEmptyCount[columnHead] = 0; 42 | columnsOutliersCount[columnHead] = 0; 43 | }); 44 | 45 | var cellsToHighlight = []; 46 | // add values from selected columns' cells row by row to the new columns objects 47 | _.each(rows, function (row, rowIndex) { 48 | var cellsRow = {}; 49 | _.each(columnHeads, function (currColumn) { 50 | cellsRow[currColumn] = 0; 51 | var value = util.stripNumeric(row[currColumn]); 52 | if (util.isNumeric(value)) { 53 | columnsAsArraysObj[currColumn].push(parseFloat(value)); 54 | columnsNumericCount[currColumn]++; 55 | } else if (util.isEmpty(row[currColumn])) { 56 | columnsEmptyCount[currColumn]++; 57 | } 58 | }); 59 | cellsToHighlight.push(cellsRow); 60 | }); 61 | 62 | // examples in comparison with MAD: 63 | // * http://simplestatistics.org/docs/#samplestandarddeviation 64 | // * http://agate-stats.readthedocs.org/en/0.3.1/index.html 65 | _.each(columnHeads, function (columnHead) { 66 | // we qualify a column for analysis if it is "mostly" numeric 67 | // we add # of empty rows to this count 68 | var numeric = columnsNumericCount[columnHead]; 69 | var empty = columnsEmptyCount[columnHead]; 70 | var ratio = (numeric + empty) / rows.length; 71 | if (ratio < 0.9) { 72 | return; 73 | } 74 | var currentColumn = columnsAsArraysObj[columnHead]; 75 | var mean = ss.mean(currentColumn); 76 | var stdDev = ss.sampleStandardDeviation(currentColumn); 77 | // if stdDev == 0 we have a lot of 
identical values and should let the user know 78 | if (stdDev === 0) { 79 | stdDevZeroColumns.push(columnHead); 80 | return; 81 | } 82 | _.each(rows, function (row, rowIndex) { 83 | var value = util.stripNumeric(row[columnHead]); 84 | if (util.isNumeric(value)) { 85 | var dist = Math.abs(value - mean) / stdDev; 86 | if (dist > deviations) { 87 | // we found an outlier 88 | columnsOutliersCount[columnHead]++; 89 | // eslint-disable-next-line no-unused-vars 90 | outliersCount++; 91 | cellsToHighlight[rowIndex][columnHead] = 1; 92 | // TODO: save highlight cells 93 | } 94 | } 95 | }); 96 | }); 97 | 98 | // testState = (outliersCount > 0)? false : true; 99 | 100 | return { 101 | testState: testState, 102 | highlightCells: cellsToHighlight, 103 | }; 104 | }) 105 | .conclusion(function (result) { 106 | var conclusionStr = ""; 107 | var columns = Object.keys(result.columnWise); 108 | columns.forEach(function (column) { 109 | // Column foo: 110 | var currCount = result.columnWise[column]; 111 | if (currCount > 0) { 112 | conclusionStr += 'column "' + column + '": '; 113 | conclusionStr += result.columnWise[column] + " cells, "; 114 | conclusionStr += util.percent( 115 | result.columnWise[column] / result.highlightCells.length 116 | ); 117 | conclusionStr += "
"; 118 | } 119 | }); 120 | return conclusionStr; 121 | }); 122 | 123 | module.exports = standardDeviationOutliers; 124 | -------------------------------------------------------------------------------- /sample-datasets/bad-encoding-countries.csv: -------------------------------------------------------------------------------- 1 | name (long),administrative name 2 | Aruba,Aruba 3 | Antigua,Antigua and Barbuda 4 | Barbuda,Antigua and Barbuda 5 | Afghanistan,Afghanistan 6 | Angola,Angola 7 | Anguilla,Anguilla 8 | Albania,Albania 9 | Aland Islands,Aland 10 | Andorra,Andorra 11 | United Arab Emirates,United Arab Emirates 12 | Argentina,Argentina 13 | Armenia,Armenia 14 | American Samoa,American Samoa 15 | Antarctica,Antarctica 16 | French Southern and Antarctic Lands,French Southern and Antarctic Lands 17 | Australia,Australia 18 | Austria,Austria 19 | Azerbaijan,Azerbaijan 20 | Brussels,Belgium 21 | Burundi,Burundi 22 | Benin,Benin 23 | Burkina Faso,Burkina Faso 24 | Flemish Region,Belgium 25 | Bangladesh,Bangladesh 26 | Bulgaria,Bulgaria 27 | Federation of Bosnia and Herzegovina,Bosnia and Herzegovina 28 | Bahrain,Bahrain 29 | Bahamas,The Bahamas 30 | Republic Srpska,Bosnia and Herzegovina 31 | Bajo Nuevo Bank (Petrel Islands),Bajo Nuevo Bank (Petrel Is.) 32 | Saint-Barth�lemy,Saint Barthelemy 33 | Belarus,Belarus 34 | Belize,Belize 35 | Bermuda,Bermuda 36 | Bolivia,Bolivia 37 | Navassa Island,United States Minor Outlying Islands 38 | Brazil,Brazil 39 | Barbados,Barbados 40 | Brunei Darussalam,Brunei 41 | Bhutan,Bhutan 42 | Bouvet Island,Norway 43 | Botswana,Botswana 44 | Walloon Region,Belgium 45 | Central African Republic,Central African Republic 46 | Canada,Canada 47 | Cocos Islands,Indian Ocean Territories 48 | Switzerland,Switzerland 49 | Chile,Chile 50 | China,China 51 | C�te d'Ivoire,Ivory Coast 52 | Clipperton Island,Clipperton Island 53 | Cameroon,Cameroon 54 | Cyprus U.N. 
Buffer Zone,Cyprus No Mans Area 55 | Democratic Republic of the Congo,Democratic Republic of the Congo 56 | Republic of Congo,Republic of Congo 57 | Cook Islands,Cook Islands 58 | Colombia,Colombia 59 | Comoros,Comoros 60 | Cape Verde,Cape Verde 61 | Costa Rica,Costa Rica 62 | Coral Sea Islands,Coral Sea Islands 63 | Cuba,Cuba 64 | Cura�ao,Cura�ao 65 | Christmas Island,Indian Ocean Territories 66 | Cayman Islands,Cayman Islands 67 | Northern Cyprus,Northern Cyprus 68 | Cyprus,Cyprus 69 | Czech Republic,Czech Republic 70 | Germany,Germany 71 | Djibouti,Djibouti 72 | Dominica,Dominica 73 | Denmark,Denmark 74 | Dominican Republic,Dominican Republic 75 | Jarvis Island,United States Minor Outlying Islands 76 | Algeria,Algeria 77 | Ecuador,Ecuador 78 | Egypt,Egypt 79 | England,United Kingdom 80 | Eritrea,Eritrea 81 | Dhekelia,Dhekelia Sovereign Base Area 82 | Spain,Spain 83 | Estonia,Estonia 84 | Ethiopia,Ethiopia 85 | Finland,Finland 86 | Fiji,Fiji 87 | Falkland Islands,Falkland Islands 88 | Baker Island,United States Minor Outlying Islands 89 | Faeroe Islands,Faroe Islands 90 | Federated States of Micronesia,Federated States of Micronesia 91 | France,France 92 | Gabon,Gabon 93 | Gaza,Palestine 94 | Adjara,Georgia 95 | Georgia,Georgia 96 | Guernsey,Guernsey 97 | Ghana,Ghana 98 | Gibraltar,Gibraltar 99 | Guinea,Guinea 100 | Guadeloupe,France 101 | The Gambia,Gambia 102 | Guinea-Bissau,Guinea Bissau 103 | Equatorial Guinea,Equatorial Guinea 104 | Greece,Greece 105 | Grenada,Grenada 106 | Greenland,Greenland 107 | Guatemala,Guatemala 108 | French Guiana,France 109 | Guam,Guam 110 | Guyana,Guyana 111 | Hong Kong,Hong Kong S.A.R. 112 | Heard I. 
and McDonald Islands,Heard Island and McDonald Islands 113 | Honduras,Honduras 114 | Howland Island,United States Minor Outlying Islands 115 | Croatia,Croatia 116 | Haiti,Haiti 117 | Hungary,Hungary 118 | Indonesia,Indonesia 119 | Isle of Man,Isle of Man 120 | India,India 121 | British Indian Ocean Territory,British Indian Ocean Territory 122 | Iraqi Kurdistan,Iraq 123 | Ireland,Ireland 124 | Iran,Iran 125 | Iraq,Iraq 126 | Iceland,Iceland 127 | Israel,Israel 128 | Italy,Italy 129 | Jamaica,Jamaica 130 | Jersey,Jersey 131 | Jordan,Jordan 132 | Japan,Japan 133 | Johnston Atoll,United States Minor Outlying Islands 134 | Baikonur Cosmodrome,Baykonur Cosmodrome 135 | Siachen Glacier,Siachen Glacier 136 | Kazakhstan,Kazakhstan 137 | Kenya,Kenya 138 | Kyrgyzstan,Kyrgyzstan 139 | Cambodia,Cambodia 140 | Kiribati,Kiribati 141 | Saint Kitts and Nevis,Saint Kitts and Nevis 142 | Korean DMZ (south),South Korea 143 | Korean DMZ (north),North Korea 144 | Republic of Korea,South Korea 145 | Kosovo,Kosovo 146 | Kuwait,Kuwait 147 | Lao PDR,Laos 148 | Lebanon,Lebanon 149 | Liberia,Liberia 150 | Libya,Libya 151 | Saint Lucia,Saint Lucia 152 | Liechtenstein,Liechtenstein 153 | Sri Lanka,Sri Lanka 154 | Lesotho,Lesotho 155 | Lithuania,Lithuania 156 | Luxembourg,Luxembourg 157 | Latvia,Latvia 158 | Macao,Macao S.A.R 159 | Saint-Martin,Saint Martin 160 | Morocco,Morocco 161 | Monaco,Monaco 162 | Moldova,Moldova 163 | Madagascar,Madagascar 164 | Maldives,Maldives 165 | Mexico,Mexico 166 | Marshall Islands,Marshall Islands 167 | Macedonia,Macedonia 168 | Mali,Mali 169 | Malta,Malta 170 | Myanmar,Myanmar 171 | Montenegro,Montenegro 172 | Mongolia,Mongolia 173 | Northern Mariana Islands,Northern Mariana Islands 174 | Mozambique,Mozambique 175 | Midway Islands,United States Minor Outlying Islands 176 | Mauritania,Mauritania 177 | Montserrat,Montserrat 178 | Martinique,France 179 | Mauritius,Mauritius 180 | Malawi,Malawi 181 | Malaysia,Malaysia 182 | Mayotte,France 183 | Namibia,Namibia 184 | 
New Caledonia,New Caledonia 185 | Niger,Niger 186 | Norfolk Island,Norfolk Island 187 | Nigeria,Nigeria 188 | Nicaragua,Nicaragua 189 | Northern Ireland,United Kingdom 190 | Niue,Niue 191 | Jan Mayen Island,Norway 192 | Netherlands,Netherlands 193 | Caribbean Netherlands,Netherlands 194 | Norway,Norway 195 | Nepal,Nepal 196 | Nauru,Nauru 197 | Svalbard Islands,Norway 198 | New Zealand,New Zealand 199 | Oman,Oman 200 | Pakistan,Pakistan 201 | Panama,Panama 202 | Azores,Portugal 203 | Pitcairn Islands,Pitcairn Islands 204 | Peru,Peru 205 | Paracel Islands,China 206 | Spratly Islands,Spratly Islands 207 | Philippines,Philippines 208 | Palau,Palau 209 | Madeira,Portugal 210 | Bougainville,Papua New Guinea 211 | Papua New Guinea,Papua New Guinea 212 | Poland,Poland 213 | Puerto Rico,Puerto Rico 214 | Dem. Rep. Korea,North Korea 215 | Portugal,Portugal 216 | Paraguay,Paraguay 217 | French Polynesia,French Polynesia 218 | Qatar,Qatar 219 | Reunion,France 220 | Romania,Romania 221 | Russian Federation,Russia 222 | Rwanda,Rwanda 223 | Western Sahara,Western Sahara 224 | Saudi Arabia,Saudi Arabia 225 | Scarborough Reef,Scarborough Reef 226 | Scotland,United Kingdom 227 | Sudan,Sudan 228 | South Sudan,South Sudan 229 | Senegal,Senegal 230 | Serranilla Bank,Serranilla Bank 231 | Singapore,Singapore 232 | South Georgia and South Sandwich Islands,South Georgia and South Sandwich Islands 233 | Saint Helena,Saint Helena 234 | Solomon Islands,Solomon Islands 235 | Sierra Leone,Sierra Leone 236 | El Salvador,El Salvador 237 | San Marino,San Marino 238 | Somaliland,Somaliland 239 | Puntland,Somalia 240 | Somalia,Somalia 241 | Saint Pierre and Miquelon,Saint Pierre and Miquelon 242 | Serbia,Republic of Serbia 243 | Vojvodina,Republic of Serbia 244 | S�o Tom� and Principe,Sao Tome and Principe 245 | Suriname,Suriname 246 | Slovakia,Slovakia 247 | Slovenia,Slovenia 248 | Sweden,Sweden 249 | Swaziland,Swaziland 250 | Sint Maarten,Sint Maarten 251 | Seychelles,Seychelles 252 | UNDOF 
Zone,Syria 253 | Syrian Arab Republic,Syria 254 | Turks and Caicos Islands,Turks and Caicos Islands 255 | Chad,Chad 256 | Togo,Togo 257 | Thailand,Thailand 258 | Tajikistan,Tajikistan 259 | Tokelau,New Zealand 260 | Turkmenistan,Turkmenistan 261 | Timor-Leste,East Timor 262 | Tonga,Tonga 263 | Trinidad and Tobago,Trinidad and Tobago 264 | Tunisia,Tunisia 265 | Turkey,Turkey 266 | Tuvalu,Tuvalu 267 | Taiwan,Taiwan 268 | Tanzania,United Republic of Tanzania 269 | Zanzibar,United Republic of Tanzania 270 | Uganda,Uganda 271 | Ukraine,Ukraine 272 | Uruguay,Uruguay 273 | United States,United States of America 274 | US Naval Base Guantanamo Bay,US Naval Base Guantanamo Bay 275 | Uzbekistan,Uzbekistan 276 | Vatican,Vatican 277 | Saint Vincent and the Grenadines,Saint Vincent and the Grenadines 278 | Venezuela,Venezuela 279 | British Virgin Islands,British Virgin Islands 280 | United States Virgin Islands,United States Virgin Islands 281 | Vietnam,Vietnam 282 | Vanuatu,Vanuatu 283 | West Bank,Palestine 284 | Wallis and Futuna Islands,Wallis and Futuna 285 | Wales,United Kingdom 286 | Wake Atoll,United States Minor Outlying Islands 287 | Akrotiri,Akrotiri Sovereign Base Area 288 | Samoa,Samoa 289 | Yemen,Yemen 290 | South Africa,South Africa 291 | Zambia,Zambia 292 | Zimbabwe,Zimbabwe -------------------------------------------------------------------------------- /sample-datasets/ballpark-prices.csv: -------------------------------------------------------------------------------- 1 | Ballpark,Two tickets,Two hotdogs,Two beers,Parking,Total Average 2 | Boston Red Sox Game at Fenway Park,96,10.5,15.5,35,157 3 | Chicago Cubs Game at Wrigley Field,64.06,11.5,15.5,25,116.06 4 | New York Yankees Game at Yankee Stadium,56.4,6,12,35,109.4 5 | Toronto Blue Jays Game at Rogers Centre,63.2,9.96,13.58,22.64,109.38 6 | Seattle Mariners Game at Safeco Field,53.71,9,12,20,94.71 7 | New York Mets Game at Citi Field,48.19,12.5,11.5,22,94.19 8 | San Francisco Giants Game at AT&T 
Park,44.81,11,14,21,90.81 9 | Kansas City Royals Game at Kauffman Stadium,56.4,10,13,10,89.4 10 | Washington Nationals Game at Nationals Park,52.4,10.5,13,10,85.9 11 | Philadelphia Phillies Game at Citizens Bank Park,43.49,7.5,15.5,16,82.49 12 | St. Louis Cardinals Game at Busch Stadium,42.8,8.5,10,20,81.3 13 | Milwaukee Brewers Game at Miller Park,50,7,12,11,80 14 | Cleveland Indians Game at Progressive Field,50.4,6,8,12,76.4 15 | Detroit Tigers Game at Comerica Park,31.6,9,10,20,70.6 16 | Tampa Bay Rays Game at Tropicana Field,49.11,10,10,0,69.11 17 | Chicago White Sox Game at U.S. Cellular Field,27.45,8,13,20,68.45 18 | Houston Astros Game at Minute Maid Park,31.73,9.5,10,15,66.23 19 | Miami Marlins Game at Marlins Park,26.93,12,12,15,65.93 20 | San Diego Padres Game at Petco Park,39.8,8,10,8,65.8 21 | Baltimore Orioles Game at Oriole Park at Camden Yards,41.13,3,13.5,8,65.63 22 | Texas Rangers Game at Globe Life Park in Arlington,33.6,10,10,12,65.6 23 | Atlanta Braves Game at Turner Field,26.34,9.5,14.5,15,65.34 24 | Oakland Athletics Game at O.co Coliseum,23.21,10.5,10,20,63.71 25 | Pittsburgh Pirates Game at PNC Park,29.93,3.25,11,15,62.43 26 | Cincinnati Reds Game at Great American Ball Park,28.8,2,10.5,20,61.3 27 | Minnesota Twins Game at Target Field,30.4,9,15,6,60.4 28 | Colorado Rockies Game at Coors Field,24.8,9.5,12,13,59.3 29 | Arizona Diamondbacks Game at Chase Field,34.43,5.5,8,10,57.93 30 | Los Angeles Dodgers Game at Dodger Stadium,21.6,11,12.5,10,55.1 31 | Los Angeles Angels Game at Angel Stadium of Anaheim,19.6,9,9,10,47.6 -------------------------------------------------------------------------------- /sample-datasets/cigarette-sales.csv: -------------------------------------------------------------------------------- 1 | Station,Warnings,Fines,Warnings per station 0.00X,# Stations in US,Source,Letter Sent?,Recieved Reply?,Ratios 2 | SHELL,237,42,30,"14,000",http://www.shell.us/aboutshell/shell-businesses/retail.html,Yes (Site 
Form),Complete,0.18 3 | EXXONMOBIL,174,25,23,"11,100",http://www.cnn.com/2008/US/06/12/exxon.mobil/index.html?eref=yahoo,Called,Complete,0.14 4 | BP,170,50,67,"7,500",http://www.bp.com/en/global/corporate/about-bp/bp-worldwide/bp-in-america/our-us-operations/retail.html,Yes,Complete,0.29 5 | 7-ELEVEN,163,25,31,"7,974",https://nrf.com/2014/top100-table,Yes (x2),INCOMPLETE,0.15 6 | WALGREENS,145,8,10,"7,998",https://nrf.com/2014/top100-table,Yes (x2),Complete,0.06 7 | MARATHON,140,12,23,"5,300",http://www.marathonpetroleum.com/brand/Stations_and_Stores/,Yes,Complete,0.09 8 | SUNOCO,118,22,46,"4,800",https://www.svmcards.com/sunoco-gift-cards/ShowDetails.cfm?ProdID=30,Yes,Complete,0.19 9 | CITGO,99,21,15,"14,000",http://www.nytimes.com/2005/03/05/business/05citgo.html?pagewanted=print&position=,Yes (x2),INCOMPLETE,0.21 10 | CVS/PHARMACY,64,5,1,"7,621",https://nrf.com/2014/top100-table,Yes,Complete,0.08 11 | VALERO,63,12,16,"7,300",http://www.valero.com/ourbusiness/pages/companyhistory.aspx,Yes,Complete,0.19 12 | CASEY'S GENERAL STORES,48,7,38,"1,865",https://www.caseys.com/about,Yes (x2),Complete,0.15 13 | FOOD LION,38,6,55,"1,100",http://foodlion.mediaroom.com/index.php?s=20301,Yes (x2),Complete,0.16 14 | GULF,33,10,50,"2,000",https://www.gulfoil.com/AboutGulf/CompanyHistory.aspx,Yes,Complete,0.30 15 | GULF,33,10,50,"2,000",https://www.gulfoil.com/AboutGulf/CompanyHistory.aspx,Yes,Complete,0.30 -------------------------------------------------------------------------------- /sample-datasets/darknet-passport-prices.csv: -------------------------------------------------------------------------------- 1 | Country / Website,Passport Power Rank,Minimum (euro),Maximum (euro),,Minimum USD,Maximum USD,Nucleus 1,AlphaBay,Abaraxas 1,Abaraxas 2,Middle Earth,Crypto Market 1,Crypto Market 2,Hansa,http://passaork4ojyk3ju.onion/,http://vfqnd6mieccqyiit.onion/,http://passporxakpmzurx.onion/index.php,http://abbujjh5vqtq77wg.onion/,http://lsvkntdyvga64cvh.onion/?add-to-cart=48 2 | 
Denmark,4,1652,3524,,1883,4017,2950,3524,1907,,,,,,,,1652,3100, 3 | Belgium,6,1738,3190,,1981,3637,,3190,,,1738,,,,,,,, 4 | Netherlands,4,1652,3100,,1883,3534,,,,,2348,,,3061,,,1652,3100, 5 | United Kingdom,1,1572,3061,,1792,3490,,,,,2629,1572,1804,3061,,2670,2098,, 6 | France,2,1449,3061,,1652,3490,,1449,1907,,2348,,,3061,,,,, 7 | Spain,6,3061,3061,,3490,3490,,,,,,,,3061,,,,, 8 | Russia,37,1192,2643,,1359,3013,2643,,1907,1192,,,,,,,,, 9 | Lithuania,14,1233,2600,,1406,2964,1586,,1597,1240,,,,,,,1311,2600,1233 10 | Australia,9,2348,2348,,2677,2677,,,,,2348,,,,,,,, 11 | Canada,5,1311,2348,,1495,2677,,,,,2348,,,,,,1311,, 12 | Germany,2,2348,2348,,2677,2677,,,,,2348,,,,,,,, 13 | United States,1,823,1804,,938,2057,,,,,,1624,1804,,823,,,, 14 | Finland,4,1738,1738,,1981,1981,,,,,1738,,,,,,,, 15 | Norway,6,1623,1623,,1850,1850,,,,,,1623,,,,,,, 16 | Italia,3,352,352,,401,401,,,,,,352,,,,,,, 17 | Ireland,6,0,0,,0,0,,,,,,,,,,,,, 18 | Poland,10,0,0,,0,0,,,,,,,,,,,,, 19 | Portugal,6,0,0,,0,0,,,,,,,,,,,,, 20 | Switzerland,5,0,0,,0,0,,,,,,,,,,,,, 21 | Sweden,3,0,0,,0,0,,,,,,,,,,,,, -------------------------------------------------------------------------------- /sample-datasets/foreignfighters.csv: -------------------------------------------------------------------------------- 1 | name,administrative name,2014 approx foreign fighter numbers,2015 approx foreign fighter numbers 2 | Aruba,Aruba,, 3 | Antigua,Antigua and Barbuda,, 4 | Barbuda,Antigua and Barbuda,, 5 | Afghanistan,Afghanistan,, 6 | Angola,Angola,, 7 | Anguilla,Anguilla,, 8 | Albania,Albania,,90 9 | Aland Islands,Aland,, 10 | Andorra,Andorra,, 11 | United Arab Emirates,United Arab Emirates,, 12 | Argentina,Argentina,, 13 | Armenia,Armenia,, 14 | American Samoa,American Samoa,, 15 | Antarctica,Antarctica,, 16 | French Southern and Antarctic Lands,French Southern and Antarctic Lands,, 17 | Australia,Australia,250,70 18 | Austria,Austria,,229 19 | Azerbaijan,Azerbaijan,, 20 | Brussels,Belgium,250, 21 | Burundi,Burundi,, 
22 | Benin,Benin,, 23 | Burkina Faso,Burkina Faso,, 24 | Flemish Region,Belgium,250, 25 | Bangladesh,Bangladesh,, 26 | Bulgaria,Bulgaria,, 27 | Federation of Bosnia and Herzegovina,Bosnia and Herzegovina,,156 28 | Bahrain,Bahrain,, 29 | Bahamas,The Bahamas,, 30 | Republic Srpska,Bosnia and Herzegovina,, 31 | Bajo Nuevo Bank (Petrel Islands),Bajo Nuevo Bank (Petrel Is.),, 32 | Saint-Barth�lemy,Saint Barthelemy,, 33 | Belarus,Belarus,, 34 | Belize,Belize,, 35 | Bermuda,Bermuda,, 36 | Bolivia,Bolivia,, 37 | Navassa Island,United States Minor Outlying Islands,, 38 | Brazil,Brazil,, 39 | Barbados,Barbados,, 40 | Brunei Darussalam,Brunei,, 41 | Bhutan,Bhutan,, 42 | Bouvet Island,Norway,, 43 | Botswana,Botswana,, 44 | Walloon Region,Belgium,250, 45 | Central African Republic,Central African Republic,, 46 | Canada,Canada,30,150 47 | Cocos Islands,Indian Ocean Territories,, 48 | Switzerland,Switzerland,10, 49 | Chile,Chile,, 50 | China,China,, 51 | C�te d'Ivoire,Ivory Coast,, 52 | Clipperton Island,Clipperton Island,, 53 | Cameroon,Cameroon,, 54 | Cyprus U.N. 
Buffer Zone,Cyprus No Mans Area,, 55 | Democratic Republic of the Congo,Democratic Republic of the Congo,, 56 | Republic of Congo,Republic of Congo,, 57 | Cook Islands,Cook Islands,, 58 | Colombia,Colombia,, 59 | Comoros,Comoros,, 60 | Cape Verde,Cape Verde,, 61 | Costa Rica,Costa Rica,, 62 | Coral Sea Islands,Coral Sea Islands,, 63 | Cuba,Cuba,, 64 | Cura�ao,Cura�ao,, 65 | Christmas Island,Indian Ocean Territories,, 66 | Cayman Islands,Cayman Islands,, 67 | Northern Cyprus,Northern Cyprus,, 68 | Cyprus,Cyprus,, 69 | Czech Republic,Czech Republic,, 70 | Germany,Germany,270,720 71 | Djibouti,Djibouti,, 72 | Dominica,Dominica,, 73 | Denmark,Denmark,100,115 74 | Dominican Republic,Dominican Republic,, 75 | Jarvis Island,United States Minor Outlying Islands,, 76 | Algeria,Algeria,,170 77 | Ecuador,Ecuador,, 78 | Egypt,Egypt,,600 79 | England,United Kingdom,400,700 80 | Eritrea,Eritrea,, 81 | Dhekelia,Dhekelia Sovereign Base Area,, 82 | Spain,Spain,51,69 83 | Estonia,Estonia,, 84 | Ethiopia,Ethiopia,, 85 | Finland,Finland,30,70 86 | Fiji,Fiji,, 87 | Falkland Islands,Falkland Islands,, 88 | Baker Island,United States Minor Outlying Islands,, 89 | Faeroe Islands,Faroe Islands,, 90 | Federated States of Micronesia,Federated States of Micronesia,, 91 | France,France,700,1880 92 | Gabon,Gabon,, 93 | Gaza,Palestine,, 94 | Adjara,Georgia,, 95 | Georgia,Georgia,, 96 | Guernsey,Guernsey,, 97 | Ghana,Ghana,, 98 | Gibraltar,Gibraltar,, 99 | Guinea,Guinea,, 100 | Guadeloupe,France,, 101 | The Gambia,Gambia,, 102 | Guinea-Bissau,Guinea Bissau,, 103 | Equatorial Guinea,Equatorial Guinea,, 104 | Greece,Greece,, 105 | Grenada,Grenada,, 106 | Greenland,Greenland,, 107 | Guatemala,Guatemala,, 108 | French Guiana,France,, 109 | Guam,Guam,, 110 | Guyana,Guyana,, 111 | Hong Kong,Hong Kong S.A.R.,, 112 | Heard I. 
and McDonald Islands,Heard Island and McDonald Islands,, 113 | Honduras,Honduras,, 114 | Howland Island,United States Minor Outlying Islands,, 115 | Croatia,Croatia,, 116 | Haiti,Haiti,, 117 | Hungary,Hungary,, 118 | Indonesia,Indonesia,30,159 119 | Isle of Man,Isle of Man,, 120 | India,India,,7 121 | British Indian Ocean Territory,British Indian Ocean Territory,, 122 | Iraqi Kurdistan,Iraq,, 123 | Ireland,Ireland,25, 124 | Iran,Iran,, 125 | Iraq,Iraq,, 126 | Iceland,Iceland,, 127 | Israel,Israel,,50 128 | Italy,Italy,,65 129 | Jamaica,Jamaica,, 130 | Jersey,Jersey,, 131 | Jordan,Jordan,, 132 | Japan,Japan,,9 133 | Johnston Atoll,United States Minor Outlying Islands,, 134 | Baikonur Cosmodrome,Baykonur Cosmodrome,, 135 | Siachen Glacier,Siachen Glacier,, 136 | Kazakhstan,Kazakhstan,,300 137 | Kenya,Kenya,, 138 | Kyrgyzstan,Kyrgyzstan,, 139 | Cambodia,Cambodia,, 140 | Kiribati,Kiribati,, 141 | Saint Kitts and Nevis,Saint Kitts and Nevis,, 142 | Korean DMZ (south),South Korea,, 143 | Korean DMZ (north),North Korea,, 144 | Republic of Korea,South Korea,, 145 | Kosovo,Kosovo,100, 146 | Kuwait,Kuwait,, 147 | Lao PDR,Laos,, 148 | Lebanon,Lebanon,, 149 | Liberia,Liberia,, 150 | Libya,Libya,, 151 | Saint Lucia,Saint Lucia,, 152 | Liechtenstein,Liechtenstein,, 153 | Sri Lanka,Sri Lanka,, 154 | Lesotho,Lesotho,, 155 | Lithuania,Lithuania,, 156 | Luxembourg,Luxembourg,, 157 | Latvia,Latvia,, 158 | Macao,Macao S.A.R,, 159 | Saint-Martin,Saint Martin,, 160 | Morocco,Morocco,1500, 161 | Monaco,Monaco,, 162 | Moldova,Moldova,, 163 | Madagascar,Madagascar,, 164 | Maldives,Maldives,,200 165 | Mexico,Mexico,, 166 | Marshall Islands,Marshall Islands,, 167 | Macedonia,Macedonia,,130 168 | Mali,Mali,, 169 | Malta,Malta,, 170 | Myanmar,Myanmar,, 171 | Montenegro,Montenegro,, 172 | Mongolia,Mongolia,, 173 | Northern Mariana Islands,Northern Mariana Islands,, 174 | Mozambique,Mozambique,, 175 | Midway Islands,United States Minor Outlying Islands,, 176 | Mauritania,Mauritania,, 177 | 
Montserrat,Montserrat,, 178 | Martinique,France,, 179 | Mauritius,Mauritius,, 180 | Malawi,Malawi,, 181 | Malaysia,Malaysia,,150 182 | Mayotte,France,, 183 | Namibia,Namibia,, 184 | New Caledonia,New Caledonia,, 185 | Niger,Niger,, 186 | Norfolk Island,Norfolk Island,, 187 | Nigeria,Nigeria,, 188 | Nicaragua,Nicaragua,, 189 | Northern Ireland,United Kingdom,400, 190 | Niue,Niue,, 191 | Jan Mayen Island,Norway,, 192 | Netherlands,Netherlands,120,210 193 | Caribbean Netherlands,Netherlands,, 194 | Norway,Norway,45,80 195 | Nepal,Nepal,, 196 | Nauru,Nauru,, 197 | Svalbard Islands,Norway,, 198 | New Zealand,New Zealand,,10 199 | Oman,Oman,, 200 | Pakistan,Pakistan,,70 201 | Panama,Panama,, 202 | Azores,Portugal,, 203 | Pitcairn Islands,Pitcairn Islands,, 204 | Peru,Peru,, 205 | Paracel Islands,China,, 206 | Spratly Islands,Spratly Islands,, 207 | Philippines,Philippines,, 208 | Palau,Palau,, 209 | Madeira,Portugal,, 210 | Bougainville,Papua New Guinea,, 211 | Papua New Guinea,Papua New Guinea,, 212 | Poland,Poland,, 213 | Puerto Rico,Puerto Rico,, 214 | Dem. Rep. 
Korea,North Korea,, 215 | Portugal,Portugal,, 216 | Paraguay,Paraguay,, 217 | French Polynesia,French Polynesia,, 218 | Qatar,Qatar,, 219 | Reunion,France,, 220 | Romania,Romania,, 221 | Russian Federation,Russia,800,2400 222 | Rwanda,Rwanda,, 223 | Western Sahara,Western Sahara,, 224 | Saudi Arabia,Saudi Arabia,2500,2275 225 | Scarborough Reef,Scarborough Reef,, 226 | Scotland,United Kingdom,400, 227 | Sudan,Sudan,, 228 | South Sudan,South Sudan,, 229 | Senegal,Senegal,, 230 | Serranilla Bank,Serranilla Bank,, 231 | Singapore,Singapore,1,9 232 | South Georgia and South Sandwich Islands,South Georgia and South Sandwich Islands,, 233 | Saint Helena,Saint Helena,, 234 | Solomon Islands,Solomon Islands,, 235 | Sierra Leone,Sierra Leone,, 236 | El Salvador,El Salvador,, 237 | San Marino,San Marino,, 238 | Somaliland,Somaliland,, 239 | Puntland,Somalia,, 240 | Somalia,Somalia,, 241 | Saint Pierre and Miquelon,Saint Pierre and Miquelon,, 242 | Serbia,Republic of Serbia,, 243 | Vojvodina,Republic of Serbia,, 244 | S�o Tom� and Principe,Sao Tome and Principe,, 245 | Suriname,Suriname,, 246 | Slovakia,Slovakia,, 247 | Slovenia,Slovenia,, 248 | Sweden,Sweden,30,300 249 | Swaziland,Swaziland,, 250 | Sint Maarten,Sint Maarten,, 251 | Seychelles,Seychelles,, 252 | UNDOF Zone,Syria,, 253 | Syrian Arab Republic,Syria,, 254 | Turks and Caicos Islands,Turks and Caicos Islands,, 255 | Chad,Chad,, 256 | Togo,Togo,, 257 | Thailand,Thailand,, 258 | Tajikistan,Tajikistan,,386 259 | Tokelau,New Zealand,, 260 | Turkmenistan,Turkmenistan,, 261 | Timor-Leste,East Timor,, 262 | Tonga,Tonga,, 263 | Trinidad and Tobago,Trinidad and Tobago,, 264 | Tunisia,Tunisia,3000,5000 265 | Turkey,Turkey,400,1400 266 | Tuvalu,Tuvalu,, 267 | Taiwan,Taiwan,, 268 | Tanzania,United Republic of Tanzania,, 269 | Zanzibar,United Republic of Tanzania,, 270 | Uganda,Uganda,, 271 | Ukraine,Ukraine,, 272 | Uruguay,Uruguay,, 273 | United States,United States of America,70,230 274 | US Naval Base Guantanamo Bay,US 
Naval Base Guantanamo Bay,, 275 | Uzbekistan,Uzbekistan,,500 276 | Vatican,Vatican,, 277 | Saint Vincent and the Grenadines,Saint Vincent and the Grenadines,, 278 | Venezuela,Venezuela,, 279 | British Virgin Islands,British Virgin Islands,, 280 | United States Virgin Islands,United States Virgin Islands,, 281 | Vietnam,Vietnam,, 282 | Vanuatu,Vanuatu,, 283 | West Bank,Palestine,, 284 | Wallis and Futuna Islands,Wallis and Futuna,, 285 | Wales,United Kingdom,400, 286 | Wake Atoll,United States Minor Outlying Islands,, 287 | Akrotiri,Akrotiri Sovereign Base Area,, 288 | Samoa,Samoa,, 289 | Yemen,Yemen,, 290 | South Africa,South Africa,, 291 | Zambia,Zambia,, 292 | Zimbabwe,Zimbabwe,, -------------------------------------------------------------------------------- /sample-datasets/geo-test.csv: -------------------------------------------------------------------------------- 1 | name,latlon,tag 2 | enjalot,"1337.8162046,-122.2911677",bad 3 | gerald,"37.8,-122.2911677",good 4 | ejfox,"97.8162046,222.2911677",bad 5 | null island,"0,0",bad 6 | ejfox,"107.8162046,90.2911677",bad -------------------------------------------------------------------------------- /sample-datasets/isis-attack-sites.csv: -------------------------------------------------------------------------------- 1 | NAME OF SITE,DESCRIPTION,DETAILED DESCRIPTION,LOCATION,LATLONG,LAT,LONG,COMMENTS,"LINKS (NEWS, VIDEO, EXPERT)",EXPERT LINK 2 | Ashur (Qal'at Sherqat),ANCIENT CITY (UNESCO),http://whc.unesco.org/en/list/1130,N35 27 32.004 E43 15 34.992,,,,,http://www.ibtimes.co.uk/isis-blows-unesco-world-heritage-assyrian-site-ashur-near-tikrit-1503367, 3 | Erbil Citadel,CITADEL (UNESCO),http://whc.unesco.org/en/list/1437,"36.191504, 44.009107","36.191504, 44.009107",36.191504,44.009107,"APPARENT ATTEMPT AT DESTRUCTION, REPORTED",http://www.cbsnews.com/news/suicide-bomb-hits-iraq-kurdish-capital-of-erbil/, 4 | Hatra,ANCIENT CITY (UNESCO),http://whc.unesco.org/en/list/277,"35.442745, 42.393838","35.442745, 
42.393838",35.442745,42.393838,HIGH-PROFILE,http://www.wsj.com/video/isis-destroys-hatra-artifacts-new-video-claims/AE71A6B3-1B0E-4471-8250-B921B586C161.html,http://whc.unesco.org/en/news/1245/ 5 | Site of Palmyra,ANCIENT CITY (UNESCO),http://whc.unesco.org/en/list/23,"34.560000, 38.267222","34.560000, 38.267222",34.56,38.267222,"IN DANGER, HIGH-PROFILE",http://www.nytimes.com/2015/05/21/world/middleeast/syria-isis-fighters-enter-ancient-city-of-palmyra.html?_r=0, 6 | Abu Qalqal [Abu Abrus],SUFI SHRINE + TOMBS,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_06r.pdf,"36.433333, 38.083333","36.433333, 38.083333",36.433333,38.083333,,,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_06r.pdf 7 | The Armenian Holy Martyrs Church,CHURCH,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_08r.pdf,"35.338780, 40.141714","35.338780, 40.141714",35.33878,40.141714,HIGH-PROFILE,http://armenianweekly.com/2014/09/21/der-zor/,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_08r.pdf 8 | Mezela Nasridin,SHRINE,,"36.451707, 43.342042","36.451707, 43.342042",36.451707,43.342042,,https://twitter.com/matthew__barber/status/523592733999370241, 9 | Mar Gorgis (St. George/St. 
Markourkas) Monastery,CHURCH,http://www.asor-syrianheritage.org/wp-content/uploads/2015/04/ASOR_CHI_Weekly_Report_32r.pdf,"36.340606, 43.125246","36.340606, 43.125246",36.340606,43.125246,HIGH-PROFILE,http://www.vocativ.com/world/isis-2/isis-saint-georges-church-mosul/, 10 | Imam Dur,TOMB,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_25r.pdf,"34.235124, 43.881656","34.235124, 43.881656",34.235124,43.881656,,"https://www.youtube.com/watch?v=0xKJZ1dJrg8, https://conflictantiquities.wordpress.com/2014/10/30/iraq-samarra-islamic-state-destruction-shia-shrine-imam-al-daur/", 11 | Mosul Museum,MUSEUM,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_31r.pdf,"36.337863, 43.139408","36.337863, 43.139408",36.337863,43.139408,HIGH-PROFILE,"https://www.youtube.com/watch?v=S4-IjQpnTUM, http://www.nytimes.com/2015/02/27/world/middleeast/historians-pore- over-isis-video-of-smashed-statues-for-clues-to-whats-been-lost.html", 12 | "Nergal Gate, Nineveh",ANCIENT CITY,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"36.366804, 43.157387","36.366804, 43.157387",36.366804,43.157387,HIGH-PROFILE,"http://www.nytimes.com/2015/02/28/world/middleeast/destruction-of-antiquities-by-militants- is-denounced.html, https://conflictantiquities.wordpress.com/2015/02/26/iraq-mosul-museum-nergal-gate-nineveh- destruction/", 13 | Central Library of Mosul University and other university libraries,LIBRARY,,"36.376483, 43.142166","36.376483, 43.142166",36.376483,43.142166,HIGH-PROFILE,http://bigstory.ap.org/article/1ec4e2a1bb5b4dce97faa462478f7c0e/iraqi-, 14 | Shia shrines of Imam Sa’ad and Khider al-Elias,SHRINE,,"36.372157, 42.450436","36.372157, 42.450436",36.372157,42.450436,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-destroys-shrines, 15 | Hamou Qado Mosque (also known as the Abdullah Chalabi bin Muhammad bin Abdul Kadir 
Mosque),MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_31r.pdf,"Mosul, Iraq",,,,,"http://www.ibtimes.co.uk/iraq-isis-destroys-19th-century-ottoman-mosque-central-mosul-1490786, http://rudaw.net/english/middleeast/iraq/060320151", 16 | Mosque of Sheikh Muhammad al-Abariqi,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_31r.pdf,"Mosul, Iraq",,,,,http://akhbaar.org/home/2015/3/186174.html, 17 | Minaret of Qutb al-Din Muhammad,MOSQUE (part),http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_33r.pdf,"Sinjar, Iraq",,,,,http://www.ibtimes.com/islamic-state-iraq-battle-isis-bombs- yazidi-religious-site-sinjar-destroys-homes-1855556, 18 | Monasteries of Mar Behnam and Mart Sarah,MONASTARTY,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_33r.pdf,"Qaraqosh, Iraq",,,,,http://www.ibtimes.co.uk/isis-blows-famed-4th-century-mar-, 19 | Shrine of Sheikh Fathi,SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_39r.pdf,"36.346816, 43.117931","36.346816, 43.117931",36.346816,43.117931,,behnam-catholic-monastery-iraq-1492703, 20 | Mausoleum of Imam Ibn Hassan Awn al-Din,SHRINE,"https://www.google.com.tr/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=0CB4QFjAA&url=https%3A%2F%2Fconflictantiquities.wordpress.com%2F2014%2F07%2F28%2Fsyria-iraq-islamic-state-destruction-shrine-mashhad-al-imam-awn-al-din%2F&ei=UrKDVfv8PMOlygPb8T8&usg=AFQjCNFE6Ua6mLnq1NbloV1u-o53bgN47A&sig2=Xud5TT6hWnHEBwEUfTBjng&bvm=bv.96042044,d.bGQ","36.339181, 43.13099","36.339181, 43.13099",36.339181,43.13099,,https://www.youtube.com/watch?v=H4wgqyoQEFE, 21 | Qabr al-Bint (also known as Tomb of the Girl),TOMB,http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_39r.pdf,"36.341384, 43.116628","36.341384, 43.116628",36.341384,43.116628,,"http://www.niqash.org/articles/?id=3479, 
https://conflictantiquities.wordpress.com/2014/07/28/syria-iraq-islamic-state-destruction-shrine-mashhad-al-imam-awn-al-din/", 22 | Nebi Yunus Mosque Complex (Jonah's Tomb),"MOSQUE, TOMB",http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_39r.pdf,"36.348277, 43.159574","36.348277, 43.159574",36.348277,43.159574,,"http://www.theguardian.com/world/2014/jul/24/isis-militants-blow-up-jonah-tomb, https://www.youtube.com/watch?v=5SyCGqcTtFM", 23 | Nebi Seeth Mosque,"MOSQUE, SHRINE",http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_39r.pdf,"36.334846, 43.135473","36.334846, 43.135473",36.334846,43.135473,,"https://www.youtube.com/watch?v=6gnxyZMd_nc, http://english.alarabiya.net/en/News/middle-east/2014/07/26/ISIS-destroy-Prophet- Sheth-shrine-in-Mosul-.html", 24 | Al-Nebi Jarjis Shrine,"MOSQUE, SHRINE",http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_39r.pdf,"36.344568, 43.130538","36.344568, 43.130538",36.344568,43.130538,,"https://www.youtube.com/watch?v=HAbxMGSNjzc, http://www.cbsnews.com/news/islamic-militants-destroy-historic-mosque-in-iraqi- city-of-mosul/", 25 | Mosque and Shrine of Imam al-Muhsin (also known as the Madrasa al-Nuriya),"MOSQUE, SHRINE",http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_39r.pdf,"Mosul, Iraq",,,,,https://archive.org/details/ezalt_shirk, 26 | The Assyrian settlement at Nimrud,ANCIENT CITY,http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_38r.pdf,"32.391776, 44.341282","32.391776, 44.341282",32.391776,44.341282,HIGH-PROFILE,http://www.theguardian.com/world/2015/apr/11/isis-video-destruction-ancient- city-militants-iraq-nimrud; http://www.bbc.com/news/world-middle-east-32273672,http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_38r.pdf 27 | Tomb of Imam Yahya ibn 
al-Qasim,TOMB,http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_39r.pdf,"36.354632, 43.122782","36.354632, 43.122782",36.354632,43.122782,,https://www.facebook.com/groups/858813217464395/permalink/862966173715766/,http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_39r.pdf 28 | Cemetery in al-Hasakah Governorate,CEMETARY,http://www.asor-syrianheritage.org/wp-content/uploads/2015/05/ASOR_CHI_Weekly_Report_38r.pdf,"Hasakah, Syria ",,,,,https://dump.to/qobur, 29 | Christian cemetery of Tel Kaif,CEMETARY,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_33r.pdf,"Tel Kaif, Ninawa, Iraq",,,,,http://english.ankawa.com/?p=14081, 30 | Al-Rawi Tekkiye,SUFI SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_33r.pdf,"Deir ez-Zor, Syria",,,,,,"Deir ez-Zor, Deir ez-Zor Governorate, Syria" 31 | Shrine of Imam Saad Ibn Aqeel Ibn Abi Talib,"SHRINE, MOSQUE",http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Tel Afar, Iraq",,,,,http://www.aljazeera.com/news/middleeast/2014/07/islamic-state-claims-shia-mosque-, 32 | Mosque of Martyrs of Lashkar-e Mulla,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"36.388452, 42.44541","36.388452, 42.44541",36.388452,42.44541,,http://www.aljazeera.com/news/middleeast/2014/07/islamic-state-claims-shia-mosque-, 33 | Mosque of Sheikh Mohammad Taqi al-Mawla,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Tel Afar, Iraq",,,,,, 34 | Husseiniyeh Mullah Mahmoud (probably the Ar-Mahmoud /Ar Mamut Mosque in online media accounts),MOSQUE,,"Tel Afar, Iraq",,,,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-destroys-shrines, 35 | Mosque of Ali Ibn Abi 
Talib,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"36.372157, 42.450436","36.372157, 42.450436",36.372157,42.450436,,, 36 | Shrine of Khider al-Elias,SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Tel Afar, Iraq",,,,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-destroys-shrines, 37 | Mosque of Hashim Antr,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Tel Afar, Iraq",,,,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-destroys-shrines, 38 | Mosque Imam Sadiq,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Tel Afar, Iraq",,,,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-destroys-shrines, 39 | Mosque of al-Abbas,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Tel Afar, Iraq",,,,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-, 40 | Mosque Ahl al-Beit,"SHRINE, MOSQUE",http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"36.366874, 42.439831","36.366874, 42.439831",36.366874,42.439831,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-destroys-shrines, 41 | Qaddo Mosque,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Tel Afar, Iraq",,,,,http://www.aljazeera.com/news/middleeast/2014/07/islamic-state-claims-shia-mosque-, 42 | Mosque of Sheikh Jawad Al Sadiq,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Tel Afar, Iraq",,,,,http://www.aljazeera.com/news/middleeast/2014/07/islamic-state-claims-shia-mosque- destruction-20147414533266331.html, 43 | Mosque of Imam 
al-Hakim,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Tel Afar, Iraq",,,,,http://english.shafaaq.com/index.php/security/10325-isil-blow-up-3-shrines-and-shia-, 44 | Tomb and Shrine of Ahmed ar-Rifa'i,"TOMB, SHRINE",http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Muhallabiyah, Ninawa, Iraq",,,,,http://www.alwasat.ly/ar/mobile/article?articleid=25855, 45 | Tomb of Sheikh Ibrahim,"TOMB, SHRINE",http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Sheikh Ibrahim, Ninawa, Iraq",,,,,http://www.alwasat.ly/ar/mobile/article?articleid=25855, 46 | Shrine of Imam al-Abbas,SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Guba, Ninawa, Iraq",,,,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-, 47 | Mosque of Al-Ridha,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"Guba, Ninawa, Iraq",,,,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-destroys-shrines, 48 | Mosque of Al-Zahraa,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"36.410132, 43.073197","36.410132, 43.073197",36.410132,43.073197,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-, 49 | Mosque of Al-Imam Hussein,MOSQUE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"36.410132, 43.073197","36.410132, 43.073197",36.410132,43.073197,,http://www.hrw.org/news/2014/06/27/iraq-isis-kidnaps-shia-turkmen-, 50 | al-Khidr Mosque,MOSQUE,,"Mosul, Iraq",,,,,http://www.shafaaq.com/sh2/index.php/news/iraq-news/91865-2015-02-26-11-57-, 51 | Mosul Central Public Library,LIBRARY,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_30r.pdf,"36.337331, 43.141223","36.337331, 
43.141223",36.337331,43.141223,HIGH-PROFILE,http://bigstory.ap.org/article/1ec4e2a1bb5b4dce97faa462478f7c0e/iraqi-libraries-ransacked- islamic-state-group-mosul,http://www.unesco.org/new/en/iraq-office/about-this-office/single- view/news/unesco_alarmed_by_news_of_mass_destruction_of_books_in_mosul 52 | Al Tabqa Cemetery,CEMETARY,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_24r.pdf,"Al Tabqa, Raqqa, Syria",,,,,https://www.facebook.com/hbmalshaar3?fref=photo, 53 | "Sufi Maqam, al-Huquf",SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_24r.pdf,"al-Huquf, Suweida, Syria",,,,,http://apsa2011.com/index.php/en/provinces/as-suwayda/monuments.html, 54 | Safirah Sufi Shrine (Name Unknown),SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_20r.pdf,"Safirah, Aleppo, Syria",,,,,, 55 | Tomb of Abed al-Qader Bek al-Azm ibn Ahmed Mouayd Pasha,TOMB,,"Rif Dimashq, Syria",,,,,http://apsa2011.com/index.php/fr/provinces/rif- dimashq/monuments/1096-rif-dimachq-tomb-abed-2.html, 56 | Sa’id Hamad Mahmoud al-Naimi in Salahuddin,SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_25r.pdf,Central and Northeastern Iraq,,,,,https://www.youtube.com/watch?v=0xKJZ1dJrg8, 57 | Sa’id Saleh Ibrahim Al-Naimi,SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_25r.pdf,Central and Northeastern Iraq,,,,,https://www.youtube.com/watch?v=0xKJZ1dJrg8, 58 | Saida Novh,SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_25r.pdf,Central and Northeastern Iraq,,,,,https://www.youtube.com/watch?v=0xKJZ1dJrg8, 59 | Sa’id Saleh Al-Naimi,SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_25r.pdf,Central and Northeastern Iraq,,,,,https://www.youtube.com/watch?v=0xKJZ1dJrg8, 60 | Sheikh 
Yahya,SHRINE,http://www.asor-syrianheritage.org/wp-content/uploads/2015/03/ASOR_CHI_Weekly_Report_25r.pdf,Central and Northeastern Iraq,,,,,https://www.youtube.com/watch?v=0xKJZ1dJrg8, 61 | Khorsabad ,ANCIENT CITY,http://www.britishmuseum.org/explore/highlights/articles/k/khorsabad_iraq.aspx,"36.510153, 43.223476","36.510153, 43.223476",36.510153,43.223476,HIGH-PROFILE,http://www.reuters.com/article/2015/03/12/us-mideast-crisis-iraq-destruction-idUSKBN0M726Q20150312, 62 | Virgin Mary Church,CHURCH,,"36.404836, 43.118895","36.404836, 43.118895",36.404836,43.118895,,http://rudaw.net/english/middleeast/iraq/261220142, 63 | Green Church (Church of St. Ahoadamah),CHURCH,,"Tikrit, Iraq",,,,HIGH-PROFILE,"http://www.nytimes.com/2014/09/25/world/middleeast/iraq-militants-destroy-historic-church.html?_r=1, http://rt.com/news/190728-isis-destroys-church-mosque-iraq/", 64 | St. Markourkas,CHURCH,,"Mosul, Iraq",,,,,http://www.iraqinews.com/iraq-war/isis-blows-markourkas-church-mosul/, 65 | Virgin Mary Church,CHURCH,,"36.639519, 40.367941","36.639519, 40.367941",36.639519,40.367941,,http://aranews.net/2015/04/isis-militants-bomb-assyrian-church-on-easter-holiday-in-syria/, 66 | Al-Qubba Husseiniya Mosque,MOSQUE,,"Mosul, Iraq",,,,,http://www.huffingtonpost.co.uk/2014/07/07/isis-blow-up-heretical-shia-mosques-in-mosul-pictures_n_5563860.html, 67 | Jawad Husseiniya Mosque,MOSQUE,,"Tel Afar, Iraq",,,,,http://www.huffingtonpost.co.uk/2014/07/07/isis-blow-up-heretical-shia-mosques-in-mosul-pictures_n_5563860.html, 68 | Al-Arbaleen Mosque,MOSQUE,,"Tikrit, Iraq",,,,,http://www.iraqinews.com/features/urgent-isis-destroys-historical-al-arbain-mosque-tikrit/, 69 | Prophet Daniel's Tomb,TOMB,,"Mosul, Iraq",,,,HIGH-PROFILE,http://english.alarabiya.net/en/News/middle-east/2014/07/25/ISIS-destroys-tombs-of-two-prophets-in-Mosul.html, -------------------------------------------------------------------------------- /sample-datasets/killed-by-police-2014-2015.csv: 
-------------------------------------------------------------------------------- 1 | ,Year,2014,2014,2014,2014,2014,2014,2014,2014,2014,2014,2014,2014,2015,2015,2015,2015,2015 2 | State,Month,1,2,3,4,5,6,7,8,9,10,11,12,1,2,3,4,5 3 | DE,Delaware,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,1 4 | TX,Texas,6,9,8,8,8,10,11,14,11,10,5,8,17,10,5,10,0 5 | GA,Georgia,1,1,4,3,2,4,3,2,4,2,0,2,2,4,5,2,0 6 | VA,Virginia,0,0,0,0,3,3,2,0,0,4,0,0,1,1,2,3,0 7 | TN,Tennessee,1,1,2,2,1,1,3,1,5,1,4,2,0,3,1,1,0 8 | CA,California,14,8,16,17,18,15,10,19,8,7,24,13,11,11,24,20,0 9 | OK,Oklahoma,4,1,5,1,1,2,1,2,4,3,1,4,3,4,4,10,0 10 | AZ,Arizona,5,4,5,5,2,2,2,5,2,7,3,4,4,4,8,4,0 11 | IL,Illinois,1,3,2,6,3,1,6,5,1,4,1,2,2,1,1,6,0 12 | LA,Louisiana,5,1,1,1,3,1,3,2,5,2,1,1,1,1,3,3,0 13 | OH,Ohio,2,2,2,1,3,0,1,2,4,2,7,2,2,0,6,2,0 14 | MI,Michigan,4,2,2,1,2,1,3,2,0,0,3,1,0,3,1,2,0 15 | FL,Florida,4,6,11,6,11,5,6,7,6,6,9,12,2,8,7,5,0 16 | NY,New York,1,1,2,1,3,2,3,2,3,6,1,1,1,1,2,5,0 17 | OR,Oregon,0,0,1,2,1,3,2,2,0,0,3,1,1,1,3,1,0 18 | CO,Colorado,5,1,2,0,2,2,2,3,0,2,1,1,4,1,2,3,0 19 | SC,South Carolina,2,2,1,2,0,4,3,0,2,0,0,2,0,1,1,4,0 20 | WA,Washington,3,3,5,2,1,5,7,2,2,0,3,0,1,2,2,2,0 21 | NJ,New Jersey,2,1,1,0,1,2,1,1,2,1,4,2,3,0,2,2,0 22 | MS,Mississippi,0,0,0,4,3,2,0,4,2,1,1,2,1,2,2,1,0 23 | NE,Nebraska,0,0,1,0,1,2,0,2,0,0,0,0,2,1,1,1,0 24 | KS,Kansas,0,0,1,1,3,2,2,4,0,1,1,1,2,1,1,1,0 25 | MD,Maryland,1,2,4,2,3,3,2,4,0,1,0,0,2,1,3,2,0 26 | KY,Kentucky,1,1,2,2,3,1,1,0,2,2,2,2,2,0,2,1,0 27 | MO,Missouri,0,1,3,3,5,1,1,2,2,3,3,1,2,3,2,2,0 28 | AR,Arkansas,1,0,1,1,1,0,0,1,0,1,0,0,2,0,0,1,0 29 | NC,North Carolina,2,1,5,3,3,4,3,0,0,4,1,3,0,4,3,1,0 30 | IN,Indiana,2,1,1,2,1,0,0,0,1,1,0,2,1,2,2,3,0 31 | PA,Pennsylvania,2,1,1,1,1,2,1,4,0,2,0,2,1,3,0,1,0 32 | ID,Idaho,0,0,0,0,1,1,1,0,0,0,1,0,1,1,1,1,0 33 | NM,New Mexico,1,0,2,2,3,2,2,2,1,1,1,2,1,1,0,1,0 34 | NV,Nevada,0,2,2,2,1,4,3,1,1,2,3,1,1,1,1,0,0 35 | WV,West Virginia,3,0,0,0,1,1,1,1,0,0,0,0,0,0,2,0,0 36 | MA,Massachusetts,1,0,0,1,1,2,0,1,0,1,0,0,1,1,2,0,0 
37 | AL,Alabama,1,1,1,2,3,1,1,1,1,2,3,3,0,3,4,0,0 38 | WI,Wisconsin,1,1,0,1,4,0,1,0,0,0,2,0,0,1,4,0,0 39 | HI,Hawaii,0,0,0,0,0,0,1,1,1,0,0,0,1,0,2,0,0 40 | MN,Minnesota,0,1,0,0,0,0,1,3,1,1,1,2,4,0,1,0,0 41 | CT,Connecticut,0,0,0,1,0,0,0,0,0,2,0,0,0,0,1,0,0 42 | DC,District of Columbia,0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0 43 | UT,Utah,2,0,1,2,0,1,1,1,3,2,1,1,3,1,0,0,0 44 | ME,Maine,0,0,0,1,0,0,0,2,1,1,0,1,0,1,0,0,0 45 | AK,Alaska,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,0 46 | MT,Montana,0,1,2,1,0,0,0,0,0,0,0,2,2,0,0,0,0 47 | NH,New Hampshire,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0 48 | IA,Iowa,1,0,0,1,0,0,1,2,3,0,0,1,2,0,0,0,0 49 | SD,South Dakota,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0 50 | WY,Wyoming,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0 51 | VI,Virgin Islands,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0 52 | ND,North Dakota,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 53 | VT,Vermont,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 54 | RI,Rhode Island,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -------------------------------------------------------------------------------- /sample-datasets/lab-animals-by-state.csv: -------------------------------------------------------------------------------- 1 | STATES,ALL OTHER COVERED SPECIE,CATS,DOGS,GUINEA PIGS,HAMSTERS,NONHUMAN PRIMATES,PIG,RABBITS,SHEEP,OTHER FARM ANIMALS,Total 2 | AK,386,0,0,0,0,0,0,0,0,0,386 3 | AL,"1,991",147,"1,274",220,17,215,480,943,44,427,"5,758" 4 | AR,18,102,628,162,60,111,556,"1,416",0,114,"3,167" 5 | AZ,"6,432",480,"1,641",222,47,45,444,322,142,262,"10,037" 6 | CA,"23,117","2,005","4,350","12,460","4,420","3,579","5,396","33,368","1,460","12,597","102,752" 7 | CO,"1,380",362,681,"2,398","1,294",0,900,603,461,323,"8,402" 8 | CT,869,0,291,122,"1,010",308,436,256,3,0,"3,295" 9 | DC,512,4,35,584,517,121,447,91,34,11,"2,356" 10 | DE,345,0,0,38,0,0,0,"10,283",53,725,"11,444" 11 | FL,"3,635",503,"1,042",206,219,568,"2,235",328,243,102,"9,081" 12 | GA,"6,336",791,"1,758","1,513","12,453","2,909",992,"4,114",31,432,"31,329" 13 | HI,6,42,87,0,0,0,104,0,0,0,239 14 | 
IA,538,699,"1,097",727,"17,151",19,"1,392","3,095",344,202,"25,264" 15 | ID,384,63,87,12,0,0,2,48,101,80,777 16 | IL,"3,672","1,101","3,207","2,724","2,357",717,"1,445","2,839",383,666,"19,111" 17 | IN,"2,500",351,951,75,753,116,"1,022",838,158,86,"6,850" 18 | KS,"1,475",854,"2,089","1,033",66,162,322,128,59,268,"6,456" 19 | KY,518,193,380,22,89,79,246,178,23,79,"1,807" 20 | LA,61,150,545,6,40,"2,947",45,498,8,266,"4,566" 21 | MA,"18,114",67,"1,711","21,242","5,353","7,395","6,093","15,496",415,293,"76,179" 22 | MD,"6,385",98,962,"13,796","4,597","8,953","1,628","3,538",270,134,"40,361" 23 | ME,467,246,83,0,0,0,15,131,5,448,"1,395" 24 | MI,"4,323",436,"4,335","16,410",910,"3,192","1,913","6,518",612,286,"38,935" 25 | MN,"1,342","3,201","4,072","27,296",397,256,"3,582","7,561","1,076","1,362","50,145" 26 | MO,"2,031","2,018","2,507","10,496","25,727",145,"1,977","3,544",107,95,"48,647" 27 | MS,168,19,81,297,17,51,356,395,4,58,"1,446" 28 | MT,4,0,0,0,173,7,0,6,5,35,230 29 | NC,"2,708","1,112","2,163","8,987",813,"1,410","1,907","3,017",233,"5,070","27,420" 30 | ND,252,211,211,5,5,0,28,4,328,0,"1,044" 31 | NE,"1,503",76,88,8,939,100,229,176,17,61,"3,197" 32 | NH,827,0,8,0,647,6,40,15,3,0,"1,546" 33 | NJ,"1,698",526,"5,037","14,357","30,135","3,629","1,302","8,239",11,0,"64,934" 34 | NM,267,31,354,304,31,550,70,71,0,0,"1,678" 35 | NV,"2,235",174,478,14,96,0,137,11,1,303,"3,449" 36 | NY,"3,735","1,459","3,475","6,487","12,322","1,994","1,409","2,915",319,600,"34,715" 37 | OH,"2,445","1,802","5,356","31,382","1,094","2,044","3,018","11,519",222,245,"59,127" 38 | OK,"1,559",159,"1,114",628,13,117,91,526,99,682,"4,988" 39 | OR,"2,615",461,193,43,6,"3,334",348,265,247,44,"7,556" 40 | PA,"4,656","1,701","3,309","2,512","1,522","1,611","3,906","16,616",698,"1,257","37,788" 41 | PR,0,0,0,0,0,"4,422",27,10,0,0,"4,459" 42 | RI,218,37,86,4,219,38,74,440,62,13,"1,191" 43 | SC,"2,300",203,309,402,139,246,348,162,7,50,"4,166" 44 | SD,469,12,8,6,33,5,58,44,25,120,780 45 | 
TN,"3,305",340,614,165,466,255,"1,452",438,0,104,"7,139" 46 | TX,"9,399",517,"1,505","5,518","3,387","3,166","3,386","11,487",655,"3,657","42,677" 47 | UT,633,472,827,"1,271","2,468",10,237,"1,246",387,318,"7,869" 48 | VA,"1,873",79,256,490,290,79,"3,364",421,131,143,"7,126" 49 | VT,214,8,33,134,0,0,60,4,"1,610",2,"2,065" 50 | WA,"4,121",461,"1,155","3,214",238,"2,504","1,192","1,081",31,458,"14,455" 51 | WI,"3,222",364,"7,196","2,881","4,311","6,692","1,010","14,265",146,433,"40,520" 52 | WV,0,59,47,4,6,0,2,129,136,0,383 53 | WY,100,25,56,4,3,0,6,7,126,147,474 54 | TOTAL,"137,363","24,221","67,772","190,881","136,850","64,107","55,729","169,645","11,535","33,058","891,161" -------------------------------------------------------------------------------- /sample-datasets/large_tweet_data.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataproofer/Dataproofer/0435595218bbfd8f4977260f9d24f50375233e41/sample-datasets/large_tweet_data.csv -------------------------------------------------------------------------------- /sample-datasets/max-integer-test.csv: -------------------------------------------------------------------------------- 1 | name,value,tag 2 | enjalot,"2147483646",good 3 | enjalot,"2,147,483,647",bad 4 | enjalot,"2147483647",bad 5 | gerald,"2,097,151",good 6 | ejfox,"2,097,152",bad 7 | ejfox,"2097152",bad 8 | myersinc,"32767",bad 9 | myersinc,"32,767",bad 10 | myersinc,"65,535",bad 11 | myersinc,"65535",bad 12 | myersinc,"65535",good 13 | big,"9,223,372,036,854,775,807", bad 14 | big,"18,446,744,073,709,551,616", bad -------------------------------------------------------------------------------- /sample-datasets/nc-travel-bans.csv: -------------------------------------------------------------------------------- 1 | States/Cities,North Carolina,MIssissippi 2 | New York,YES,YES 3 | Washington,YES,YES 4 | Seattle,YES,YES 5 | D.C.,YES,NO 6 | Vermont,YES,YES 7 | Atlanta,YES,NO 8 | 
Boston,YES,NO 9 | San Francisco,YES,NO 10 | Minnesota,YES,NO -------------------------------------------------------------------------------- /sample-datasets/nhl-fighting.csv: -------------------------------------------------------------------------------- 1 | Season,Games,Fights,Fights PerGame,GamesWith Fights,% of GamesWith Fights,Games WithMore ThanOne Fight,# of playerswho fought 2 | 2000-01,1230,684,0.56,469,38.13%,155,329 3 | 2001-02,1230,803,0.65,519,42.20%,172,348 4 | 2002-03,1230,668,0.54,464,37.72%,139,321 5 | 2003-04,1230,789,0.64,506,41.14%,172,340 6 | 2005-06,1230,466,0.38,357,29.02%,80,276 7 | 2006-07,1230,497,0.4,384,31.22%,87,292 8 | 2007-08,1230,664,0.54,473,38.46%,143,324 9 | 2008-09,1230,734,0.6,509,41.38%,173,355 10 | 2009-10,1230,714,0.58,493,40.08%,171,341 11 | 2010-11,1230,645,0.52,458,37.24%,117,348 12 | 2011-12,1230,546,0.44,423,34.39%,98,321 13 | 2012-13,720,347,0.48,264,36.67%,66,245 14 | 2013-14,1230,469,0.38,366,29.76%,78,288 15 | 2014-15,1230,391,0.32,331,26.91%,45,276 16 | 2015-16,1196,335,0.28,279,23.33%,50,265 17 | 2015-16+,1230,345,0.28,287,23.33%,51,- 18 | ,,,,,,, 19 | ,,,,,,, 20 | ,,,,,,, 21 | ,,,,,,, 22 | ,,,,15,,, 23 | ,,,,82,,, -------------------------------------------------------------------------------- /sample-datasets/sf-police-salaries.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataproofer/Dataproofer/0435595218bbfd8f4977260f9d24f50375233e41/sample-datasets/sf-police-salaries.xls -------------------------------------------------------------------------------- /sample-datasets/sf-police-salaries.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataproofer/Dataproofer/0435595218bbfd8f4977260f9d24f50375233e41/sample-datasets/sf-police-salaries.xlsx -------------------------------------------------------------------------------- /sample-datasets/silk-road-arrests.csv: 
-------------------------------------------------------------------------------- 1 | Name,City,State,Country,geo_longitude,geo_latitude,geo_accuracy,Date of arrest,What did he/she allegedly do?,Online Alias,Picture,Hyperlink Source 2 | Daniel Wayne Fowler,Auckland,,New Zealand,174.7655514,-36.8534665,city,12/1/2014,Imported ecstasy and grew marijuana,,"23-year-old who imported ecstasy and grew marijuana with his wife, Lisa Marie Clark",http://www.stuff.co.nz/national/crime/63779228/Silk-Road-drug-buyers-in-court 3 | Lisa Marie Clark ,Auckland,,New Zealand,174.7655514,-36.8534665,city,12/2/2014,Imported ecstasy and grew marijuana,,"22-year-old who imported ecstasy and grew marijuana with her husband, Daniel Wayne Fowler ",http://www.stuff.co.nz/national/crime/63779228/Silk-Road-drug-buyers-in-court 4 | Jacob Theodore George,Baltimore,Maryland,USA,-76.6108073,39.2908608,city,1/18/2012,Sold heroin and meth,DigitalInk,,http://www.forbes.com/sites/runasandvik/2013/11/07/feds-reveal-arrest-of-another-silk-road-vendor-did-he-become-an-informant/ 5 | Mehran Jahedi,Barrington,Massachusetts,USA,-73.34918621,42.20767315,administrative,5/5/2012,Bought hallucinogenic drugs,,"Student at Bard College at Simon’s Rock who was allegedly accused of selling drugs bought from Silk Road to his classmates. 
Sentenced to perform 300 hours of community service, stay off drugs and alcohol (with random testing) and lecture underprivileged youth four times a year.",http://www.berkshireeagle.com/news/ci_26681481/former-bard-college-at-simons-rock-student-avoids 6 | Steven Lloyd Sadler,Bellevue,Washington,USA,-122.1923372,47.6144219,city,10/7/2013,"Sold cocaine, heroin and meth",NOD,,http://www.bellevuereporter.com/news/226772661.html 7 | Jenna White,Bellevue,Washington,USA,-122.1923372,47.6144219,city,10/2/2013,Sold heroin,None,,http://www.bellevuereporter.com/news/226772661.html 8 | Jonathan Norling,Bronx,New York,USA,-73.83905379,40.8527855,administrative,2/14/2014,Bought cyanide,,turned himself in at Bronx pd,http://www.nydailynews.com/news/crime/cops-raid-bronx-apartment-tenant-brings-jar-cyanide-police-station-article-1.1620807 9 | Andrew Michael Jones,Charles City,Virginia,USA,-77.0730198,37.3434765,town,12/19/2013,Silk Road moderator,Inigo,,http://mashable.com/2013/12/20/fbi-silk-road-arrests/ 10 | Jeremey Donagal,Contra Costa,California,USA,-121.9175345,37.9034806,administrative,5/28/2014,Produced and sold fake Xanax pills,XanaxKing,"35 year-old who led a drug ring which manufactured, distributed, and possessed counterfeit alprazolam (Xanax) pills (stamped as Pfizer) and other drugs",http://www.mercurynews.com/my-town/ci_25861423/contra-costa-nine-charged-massive-xanax-drug-operation 11 | Laurence Lindberg,Contra Costa,California,USA,-121.9175345,37.9034806,administrative,5/28/2014,Produced and sold fake Xanax pills,,42 year-old part of Jeremy Donagal's Xanax drug ring,http://www.mercurynews.com/my-town/ci_25861423/contra-costa-nine-charged-massive-xanax-drug-operation 12 | Alicia Mitts,Contra Costa,California,USA,-121.9175345,37.9034806,administrative,5/28/2014,Produced and sold fake Xanax pills,,30 year-old part of Jeremy Donagal's Xanax drug ring,http://www.mercurynews.com/my-town/ci_25861423/contra-costa-nine-charged-massive-xanax-drug-operation 13 | Thomas 
Elliott,Contra Costa,California,USA,-121.9175345,37.9034806,administrative,5/28/2014,Produced and sold fake Xanax pills,,39 year-old part of Jeremy Donagal's Xanax drug ring,http://www.mercurynews.com/my-town/ci_25861423/contra-costa-nine-charged-massive-xanax-drug-operation 14 | Michael Tomada,Contra Costa,California,USA,-121.9175345,37.9034806,administrative,5/28/2014,Produced and sold fake Xanax pills,,42 year-old part of Jeremy Donagal's Xanax drug ring,http://www.mercurynews.com/my-town/ci_25861423/contra-costa-nine-charged-massive-xanax-drug-operation 15 | Christopher Neely,Contra Costa,California,USA,0,0,administrative,5/28/2014,Produced and sold fake Xanax pills,,part of Jeremy Donagal's Xanax drug ring. Sitll at large.,http://www.mercurynews.com/my-town/ci_25861423/contra-costa-nine-charged-massive-xanax-drug-operation 16 | "Kenneth Koskiniemi,",Contra Costa,California,USA,-121.9175345,37.9034806,administrative,5/28/2014,Produced and sold fake Xanax pills,,37 year-old part of Jeremy Donagal's Xanax drug ring,http://www.mercurynews.com/my-town/ci_25861423/contra-costa-nine-charged-massive-xanax-drug-operation 17 | Duston Kirk,Contra Costa,California,USA,-121.9175345,37.9034806,administrative,,Produced and sold fake Xanax pills,,38 year-old part of Jeremy Donagal's Xanax drug ring,http://www.mercurynews.com/my-town/ci_25861423/contra-costa-nine-charged-massive-xanax-drug-operation 18 | Michael Gonzalez,Contra Costa,California,USA,-121.9175345,37.9034806,administrative,5/28/2014,Produced and sold fake Xanax pills,,41 year-old part of Jeremy Donagal's Xanax drug ring,http://www.mercurynews.com/my-town/ci_25861423/contra-costa-nine-charged-massive-xanax-drug-operation 19 | Gary Davis,Dublin,,Ireland,-6.2600969,53.3494299,city,12/19/2013,Silk Road moderator,Libertas,,http://mashable.com/2013/12/20/fbi-silk-road-arrests/ 20 | David Wayne & Teri Janelle Schell,Durham,California,USA,-121.8038954,39.626536,administrative,11/6/2014,Grew and sold marijuana,,"54 and 59 
year-olds who grew 482 marijuana plants in their home and shipped internationally and nationally. Teri worked as a 6th grade teacher in at Ishi Hills Middle School in Oroville, California.",http://www.krcrtv.com/news/local/butte-county-teacher-husband-charged-with-drug-trafficking/29871642 21 | Ayumu Teramoto,Fukuoka,,Japan,130.5302993,33.80173705,administrative,5/9/2014,Bought stimulant drugs,,,http://www.ibtimes.co.uk/japan-makes-first-arrest-over-alleged-drug-trafficking-using-bitcoin-1447857 22 | SweExpress,Helsingborg,Sweden,Sweden,12.7040684,56.0441984,city,10/3/2013,Sold marijuana,,Two Swedish males aged 29 and 34.,http://www.hd.se/nyheter/skane/2013/10/08/langarnas-hemliga-internetkonto/ 23 | Luke William Taylor,Horowhenua,,New Zealand,175.3775793,-40.57766895,administrative,1/1/2014,"Imported methampetamine, marijuana, LSD and ecstasy",,,http://www.stuff.co.nz/national/crime/65443284/accused-in-horowhenua-drugs-case-out-on-bail 24 | Isaac Patiari Teaukura Maki,Invercargill,,New Zealand,168.34843,-46.41222,city,,Bought 5 pills of ecstasy ($60 worth),,IT student ,http://www.stuff.co.nz/southland-times/news/7954851/Drug-importer-jailed 25 | Sheldon Kennedy,Lincoln,Nebraska,USA,-96.6674005,40.8000554,city,2/19/2014,"Sold drugs, guns and counterfeit currency ",edgarnumbers,,http://www.nbcnews.com/tech/internet/silk-road-merchant-arrested-over-sale-drugs-guns-cash-n35691 26 | Matthew Gillum ,Loomis,California,USA,-121.1848326,38.8062635,administrative,3/20/2013,Grew and sold marijuana,SourDieselMan,"Led a marijuana trafficking operation out of Sacramento, California which used the postal service to sell to customers located across the United States. 
Collaborated with Jolene Chan who processed and packaged shipments.",http://www.news10.net/story/news/2014/01/22/4757311/ 27 | Andrew Graham Hodgson,Manawatu,,New Zealand,175.6636733,-40.3581294,river,1/1/2014,"Imported methampetamine, marijuana, LSD and ecstasy",,29 year old ,http://www.stuff.co.nz/national/crime/63743171/Five-arrests-in-500k-meth-sting 28 | Luke Hanley,Mayborough,Victoria,Australia,152.7,-25.533333,,,Bought LSD,,20 year-old ,http://www.abc.net.au/news/2014-03-14/melbourne-man-avoids-jail-for-ordering-drugs-on-silk-road-websi/5321098 29 | Angel William Quinones,Orlando,Florida,USA,-81.3790462,28.5421175,city,5/13/2014,Imported ecstacy ,"UnderGroundSyndicate, BTCMaster",,http://chicago.suntimes.com/uncategorized/7/71/196718/partner-of-ex-largest-online-drug-dealer-plans-to-plead-guilty/ 30 | Matthew Jones,Orlando,Florida,USA,-81.3790462,28.5421175,city,5/30/2014,Sold painkillers,"Caligirl, Dynamite2k, Dynamite","Jones was the chief technology officer of a Texas payments company, ",http://www.orlandosentinel.com/news/breaking-news/os-orlando-silk-road-matthew-jones-20141229-story.html 31 | Benjamen Patrick Belmont,Palmerston North,,New Zealand,175.6082145,-40.3523065,,4/1/2013,"Imported LSD, cocaine and ecstasy",,19 year-old college student,http://www.stuff.co.nz/national/crime/9896976/Jail-for-drug-importing-student 32 | Preston Bridge,Perth,,Australia,115.8604796,-31.9527121,city,2/1/2013,Bought LSD ,,17 year old. 
Fell off balcony.,http://www.watoday.com.au/wa-news/teen-dies-after-scarborough-fall-20130219-2eofm.html 33 | Jason Hagen,Portland,Oregon,USA,-122.6707008,45.5250088,helipad,12/1/2013,Sold meth,"""hammertime,"" aka ""Hamer Tyme,"" aka ""Hammer Lynel Tyme, P.C.,"" aka ""Jim Rxxxxxxx,"" aka ""Jay Haben,"" aka ""Kevin Sxxxxxx"" aka ""Quinn Bxxxx"" aka ""J- , ' MAN,"" aka ""JAY,"" aka ""J-Walker,"" aka ""J"";",,http://www.columbian.com/news/2013/dec/18/vancouver-pair-indicted-global-drug-casesilk-road/ 34 | Chelsea Reder,Portland,Oregon,USA,-122.6707008,45.5250088,helipad,12/1/2013,Sold meth,None,,http://www.columbian.com/news/2013/dec/18/vancouver-pair-indicted-global-drug-casesilk-road/ 35 | Richard Webster,Portland,Oregon,USA,-122.6707008,45.5250088,helipad,12/1/2013,Sold meth,None,,http://www.columbian.com/news/2013/dec/18/vancouver-pair-indicted-global-drug-casesilk-road/ 36 | Donald R. Bechen,Portland,Oregon,USA,-122.6707008,45.5250088,helipad,12/1/2013,Sold meth,None,,http://www.columbian.com/news/2013/dec/18/vancouver-pair-indicted-global-drug-casesilk-road/ 37 | Peter Nash,Queensland,,Australia,144.4588889,-21.9182856,administrative,12/1/2013,"Sold heroin, cocaine, meth",“Batman73” and “Samesamebutdifferent”.,,http://mashable.com/2013/12/20/fbi-silk-road-arrests/ 38 | Jolene Chan,Roseville,California,USA,-121.2907535,38.7632095,city,3/20/2013,Grew and sold marijuana,,"Processed and packaged marijuana shipments for Matthew Gillum out of Sacremento, California.",http://www.news10.net/story/news/2014/01/22/4757311/ 39 | Ross Ulbricht,San Francisco,California,USA,-122.4192704,38.7792768,city,10/1/2013,Operated Silk Road,Dread Pirate Roberts,,http://www.usatoday.com/story/news/2015/02/02/silk-road-murders-for-hire/22769635/ 40 | Blake Benthall,San Francisco,California,USA,-122.4192704,37.7792768,city,11/6/2014,Ran Silk Road 2,Defcon,Benthall was active in his local church and bought a Tesla with Silk Road 2 commissions. 
,http://www.huffingtonpost.com/2014/11/06/blake-benthall-silk-road_n_6115188.html 41 | Ryan Chamberlain,San Francisco,California,USA,-122.4192704,37.7792768,city,5/31/2014,Bought materials to make a bomb on Black Market Reloaded,None,,http://www.huffingtonpost.com/2014/06/02/ryan-kelly-chamberlain-ii-caught_n_5435247.html 42 | Brian Farrell,Seattle,Washington,USA,-122.3300624,47.6038321,city,1/20/2015,Assisted Blake Benthall in running Silk Road 2,DoctorClu,,http://www.theguardian.com/technology/2015/jan/20/silk-road-brian-farrell-ross-ulbricht-blake-benthall 43 | Curtis Clark Green,Spanish Fork,Utah,USA,-111.654923,40.114955,city,1/1/2013,Silk Road employee,"Flush, Chronicpain",,http://www.forbes.com/sites/ryanmac/2013/11/08/meet-the-silk-road-employee-that-the-dread-pirate-roberts-allegedly-tried-to-murder/ 44 | Jesse Willaim Korff,Tampa Bay,Florida,USA,-82.5723193,27.6886419,bay,1/1/2014,"Negotiated the sale of abrin, a toxin similar to ricin",,,http://www.nbcnews.com/news/us-news/florida-teen-pleads-guilty-selling-deadly-toxins-dark-web-n179271 45 | Michael Duch,Warwick,New York,USA,-74.3598755,41.256483,administrative,10/1/2013,Bought heroin,Deezletime (unconfirmed) ,,http://www.usatoday.com/story/news/2015/01/28/silk-road-heroin-dealer-testifies/22490109/ 46 | Andrew Trevor Harrison,Wellington,,New Zealand,174.7772239,-41.2887639,city,4/16/2014,Bought psychedelics ,,,http://www.stuff.co.nz/dominion-post/news/wellington/9945908/Psychedelic-tabs-ordered-online 47 | Olivia Bolles,Wilmington,Delaware,USA,,,city,11/19/2013,Sold prescription pills,MDPro,,http://www.forbes.com/sites/kashmirhill/2013/11/26/how-a-delaware-doctor-was-linked-to-silk-road-drug-sales/ 48 | Cornelis Slomp,Woerden,,Netherlands,4.876687199,252.08701595,administrative,8/28/2013,"Sold MDMA, cocaine",SuperTrips,,http://www.thefix.com/content/prolific-silk-road-drug-dealer-faces-15-years-behind-bars 49 | -------------------------------------------------------------------------------- 
/sample-datasets/state_table.csv: -------------------------------------------------------------------------------- 1 | id,state,state_abbrev,country,type,sort,status,occupied,notes,fips,state_ap,standard_federal_region,census_region,census_region_name,census_division,census_division_name,circuit_court 2 | 1,Alabama,AL,USA,state,10,current,occupied,,1,Ala.,IV,3,South,6,East South Central,11 3 | 2,Alaska,AK,USA,state,10,current,occupied,,2,Alaska,X,4,West,9,Pacific,9 4 | 3,Arizona,AZ,USA,state,10,current,occupied,,4,Ariz.,IX,4,West,8,Mountain,9 5 | 4,Arkansas,AR,USA,state,10,current,occupied,,5,Ark.,VI,3,South,7,West South Central,8 6 | 5,California,CA,USA,state,10,current,occupied,,6,Calif.,IX,4,West,9,Pacific,9 7 | 6,Colorado,CO,USA,state,10,current,occupied,,8,Colo.,VIII,4,West,8,Mountain,10 8 | 7,Connecticut,CT,USA,state,10,current,occupied,,9,Conn.,I,1,Northeast,1,New England,2 9 | 8,Delaware,DE,USA,state,10,current,occupied,,10,Del.,III,3,South,5,South Atlantic,3 10 | 9,Florida,FL,USA,state,10,current,occupied,,12,Fla.,IV,3,South,5,South Atlantic,11 11 | 10,Georgia,GA,USA,state,10,current,occupied,,13,Ga.,IV,3,South,5,South Atlantic,11 12 | 11,Hawaii,HI,USA,state,10,current,occupied,,15,Hawaii,IX,4,West,9,Pacific,9 13 | 12,Idaho,ID,USA,state,10,current,occupied,,16,Idaho,X,4,West,8,Mountain,9 14 | 13,Illinois,IL,USA,state,10,current,occupied,,17,Ill.,V,2,Midwest,3,East North Central,7 15 | 14,Indiana,IN,USA,state,10,current,occupied,,18,Ind.,V,2,Midwest,3,East North Central,7 16 | 15,Iowa,IA,USA,state,10,current,occupied,,19,Iowa,VII,2,Midwest,4,West North Central,8 17 | 16,Kansas,KS,USA,state,10,current,occupied,,20,Kan.,VII,2,Midwest,4,West North Central,10 18 | 17,Kentucky,KY,USA,state,10,current,occupied,,21,Ky.,IV,3,South,6,East South Central,6 19 | 18,Louisiana,LA,USA,state,10,current,occupied,,22,La.,VI,3,South,7,West South Central,5 20 | 19,Maine,ME,USA,state,10,current,occupied,,23,Maine,I,1,Northeast,1,New England,1 21 | 
20,Maryland,MD,USA,state,10,current,occupied,,24,Md.,III,3,South,5,South Atlantic,4 22 | 21,Massachusetts,MA,USA,state,10,current,occupied,,25,Mass.,I,1,Northeast,1,New England,1 23 | 22,Michigan,MI,USA,state,10,current,occupied,,26,Mich.,V,2,Midwest,3,East North Central,6 24 | 23,Minnesota,MN,USA,state,10,current,occupied,,27,Minn.,V,2,Midwest,4,West North Central,8 25 | 24,Mississippi,MS,USA,state,10,current,occupied,,28,Miss.,IV,3,South,6,East South Central,5 26 | 25,Missouri,MO,USA,state,10,current,occupied,,29,Mo.,VII,2,Midwest,4,West North Central,8 27 | 26,Montana,MT,USA,state,10,current,occupied,,30,Mont.,VIII,4,West,8,Mountain,9 28 | 27,Nebraska,NE,USA,state,10,current,occupied,,31,Nebr.,VII,2,Midwest,4,West North Central,8 29 | 28,Nevada,NV,USA,state,10,current,occupied,,32,Nev.,IX,4,West,8,Mountain,9 30 | 29,New Hampshire,NH,USA,state,10,current,occupied,,33,N.H.,I,1,Northeast,1,New England,1 31 | 30,New Jersey,NJ,USA,state,10,current,occupied,,34,N.J.,II,1,Northeast,2,Mid-Atlantic,3 32 | 31,New Mexico,NM,USA,state,10,current,occupied,,35,N.M.,VI,4,West,8,Mountain,10 33 | 33,North Carolina,NC,USA,state,10,current,occupied,,37,N.C.,IV,3,South,5,South Atlantic,4 34 | 34,North Dakota,ND,USA,state,10,current,occupied,,38,N.D.,VIII,2,Midwest,4,West North Central,8 35 | 35,Ohio,OH,USA,state,10,current,occupied,,39,Ohio,V,2,Midwest,3,East North Central,6 36 | 36,Oklahoma,OK,USA,state,10,current,occupied,,40,Okla.,VI,3,South,7,West South Central,10 37 | 37,Oregon,OR,USA,state,10,current,occupied,,41,Ore.,X,4,West,9,Pacific,9 38 | 38,Pennsylvania,PA,USA,state,10,current,occupied,,42,Pa.,III,1,Northeast,2,Mid-Atlantic,3 39 | 39,Rhode Island,RI,USA,state,10,current,occupied,,44,R.I.,I,1,Northeast,1,New England,1 40 | 40,South Carolina,SC,USA,state,10,current,occupied,,45,S.C.,IV,3,South,5,South Atlantic,4 41 | 41,South Dakota,SD,USA,state,10,current,occupied,,46,S.D.,VIII,2,Midwest,4,West North Central,8 42 | 
42,Tennessee,TN,USA,state,10,current,occupied,,47,Tenn.,IV,3,South,6,East South Central,6 43 | 43,Texas,TX,USA,state,10,current,occupied,,48,Texas,VI,3,South,7,West South Central,5 44 | 44,Utah,UT,USA,state,10,current,occupied,,49,Utah,VIII,4,West,8,Mountain,10 45 | 45,Vermont,VT,USA,state,10,current,occupied,,50,Vt.,I,1,Northeast,1,New England,2 46 | 46,Virginia,VA,USA,state,10,current,occupied,,51,Va.,III,3,South,5,South Atlantic,4 47 | 47,Washington,WA,USA,state,10,current,occupied,,53,Wash.,X,4,West,9,Pacific,9 48 | 48,West Virginia,WV,USA,state,10,current,occupied,,54,W.Va.,III,3,South,5,South Atlantic,4 49 | 49,Wisconsin,WI,USA,state,10,current,occupied,,55,Wis.,V,2,Midwest,3,East North Central,7 50 | 50,Wyoming,WY,USA,state,10,current,occupied,,56,Wyo.,VIII,4,West,8,Mountain,10 51 | 51,Washington DC,DC,USA,capitol,10,current,occupied,,11,,III,3,South,5,South Atlantic,D.C. -------------------------------------------------------------------------------- /sample-datasets/un-presidents.csv: -------------------------------------------------------------------------------- 1 | SESSION,YEARS,NAME,GENDER,COUNTRY 2 | First,1946,Mr. Paul-Henri Spaak,M,Belgium 3 | "First 4 | special, Second",1947,Mr. Oswaldo Aranha,M,Brazil 5 | Third,1948,Mr. H. V. Evatt,M,Australia 6 | "Second 7 | special",1948,Mr. José Arce,M,Argentina 8 | Fourth,1949,Mr. Carlos P. Rómulo,M,Philippines 9 | Fifth,1950,Mr. Nasrollah Entezam,M,Iran 10 | Sixth,1951,Mr. Luis Padilla Nervo,M,Mexico 11 | Seventh,1952,Mr. Lester B. Pearson,M,Canada 12 | Eighth,1953,Mrs. Vijaya Lakshmi Pandit,F,India 13 | Ninth,1954,Mr. Eelco N. van Kleffens,M,Netherlands 14 | 10th,1955,Mr. José Maza,M,Chile 15 | "Second emergency special, First 16 | emergency special",1956,Mr. Rudecindo Ortega,M,Chile 17 | 11th,1956,Prince Wan Waithayakon,M,Thailand 18 | "12th, Third emergency special","1957 , 1958",Sir Leslie Munro,M,"New 19 | Zealand" 20 | 13th,1958,Mr. 
Charles Malik,M,Lebanon 21 | "14th, Fourth 22 | emergency special, 23rd 23 | special","1959, 1960",Mr. Víctor Andrés Belaúnde,M,Peru 24 | "Third 25 | special, 15th","1960, 1961",Mr. Frederick Henry Boland,M,Ireland 26 | 16th,1961,Mr. Mongi Slim,M,Tunisia 27 | "17th, Fourth 28 | special","1962 , 1963",Sir Muhammad Zafrulla Khan,M,Pakistan 29 | 18th,1963,Mr. Carlos Sosa Rodríguez,M,Venezuela 30 | 19th,1964,Mr. Alex Quaison-Sackey,M,Ghana 31 | 20th,1965,Mr. Amintore Fanfani,M,Italy 32 | "Fifth 33 | special, 21st","1966, 1967",Mr. Abdul Rahman Pazhwak,M,Afghanistan 34 | 22nd,1967,Mr. Corneliu Manescu,M,Romania 35 | 23rd,1968,Mr. Emilio Arenales Catalán,M,Guatemala 36 | 24th,1969,Miss Angie E. Brooks,F,Liberia 37 | 25th,1970,Mr. Edvard Hambro,M,Norway 38 | 26th,1971,Mr. Adam Malik,M,Indonesia 39 | 27th,1972,Mr. Stanislaw Trepczynski,M,Poland 40 | "Sixth 41 | special, 28th","1973, 1974",Mr. Leopoldo Benítes,M,Ecuador 42 | "Seventh 43 | special, 29th","1974, 1975",Mr. Abdelaziz Bouteflika,M,Algeria 44 | 30th,1975,Mr. Gaston Thorn,M,Luxembourg 45 | 31st,1976,Mr. H. S. Amerasinghe,M,"Sri 46 | Lanka" 47 | "10th 48 | special, Ninth 49 | special, Eighth 50 | special, 32nd","1977, 1978",Mr. Lazar Mojsov,M,Yugoslavia 51 | 33rd,1978,Mr. Indalecio Liévano,M,Colombia 52 | "11th 53 | special, Seventh emergency special, Sixth 54 | emergency special, 34th","1979, 1980",Mr. Salim A. Salim,M,"United 55 | Republic of Tanzania" 56 | "Eighth 57 | emergency special, 35th",1981,Mr. Rüdiger von Wechmar,M,"Federal 58 | Republic of Germany" 59 | 37th,1982,Mr. Imre Hollai,M,Hungary 60 | "Seventh emergency special (resumed), Ninth 61 | emergency special, 12th 62 | special, 36th",1982,Mr. Ismat T. Kittani,M,Iraq 63 | 38th,1983,Mr. Jorge E. Illueca,M,Panama 64 | 39th,1984,Mr. Paul J. F. Lusaka,M,Zambia 65 | "13th 66 | special, 40th","1985, 1986","Mr. Jaime de Piniés, Mr. Jaime de Piniés",M,Spain 67 | "14th 68 | special, 41st",1986,Mr. Humayun Rasheed Choudhury,M,Bangladesh 69 | 42nd,1987,Mr. 
Peter Florin,M,"German 70 | Democratic 71 | Republic" 72 | 43rd,1988,Mr. Dante M. Caputo,M,Argentina 73 | "15th 74 | special",1988,Mr. Peter Florin,M,"German 75 | Democratic 76 | Republic" 77 | "18th 78 | special, 17th 79 | special, 80 | 16th 81 | special, 82 | 44th","1989, 1990",Mr. Joseph Nanven Garba,M,Nigeria 83 | 45th,1990,Mr. Guido de Marco,M,Malta 84 | 46th,1991,Mr. Samir S. Shihabi,M,"Saudi 85 | Arabia" 86 | 47th,1992,Mr. Stoyan Ganev,M,Bulgaria 87 | 48th,1993,Mr. Samuel R. Insanally,M,Guyana 88 | 49th,1994,Mr. Amara Essy,M,"Côte 89 | d’Ivoire" 90 | 50th,1995,Prof. Diogo Freitas do Amaral,,Portugal 91 | "10th 92 | emergency 93 | special (resumed), 19th 94 | special, 51st","1996, 1997",Mr. Razali Ismail,M,Malaysia 95 | "20th 96 | special, 10th 97 | emergency 98 | special (resumed), 52nd","1997, 1998",Mr. Hennadiy Udovenko,M,Ukraine 99 | "21st 100 | special, 53rd, 10th emergency 101 | special (resumed)","1998, 1999",Mr. Didier Opertti,M,Uruguay 102 | "54th, 22nd special, 24th 103 | special","1999, 2000",Mr. Theo-Ben Gurirab,M,Namibia 104 | "26th 105 | special, 25th 106 | special, 10th 107 | emergency 108 | special (resumed), 55th","2000, 2001",Mr. Harri Holkeri,M,Finland 109 | "10th 110 | emergency 111 | special (resumed twice), 56th","2001, 2002",Mr. Han Seung-soo,M,"Republic 112 | of Korea" 113 | 57th,2002,Mr. Jan Kavan,M,"Czech 114 | Republic" 115 | "10th 116 | emergency 117 | special (resumed), 58th","2003, 2004",Mr. Julian Robert Hunte,M,"Saint 118 | Lucia" 119 | 59th,2004,Mr. Jean Ping,M,"Gabonese 120 | Republic" 121 | 60th,2005,Mr.Jan Eliasson,M,Sweden 122 | "61st, 10th emergency special (resumed twice)",2006,Ms. Sheikha Haya Rashed Al Khalifa,F,Bahrain 123 | 62nd,2007,Mr. Srgjan Kerim,M,Former Yugoslav Republic of Macedonia 124 | 63rd,2008,Mr. Miguel d’Escoto Brockmann,M,Nicaragua 125 | 64th,2009,Dr. Ali Abdussalam Treki,M,Libya (formerly Libyan Arab Jamahiriya) 126 | 65th,2010,Mr. Joseph Deiss,M,Switzerland 127 | 66th,2011,Mr. 
Nassir Abdulaziz Al-Nasser,M,Qatar 128 | 67th,2012,Mr. Vuk Jeremić,M,Serbia 129 | 68th,2013,Mr. John W. Ashe,M,Antigua and Barbuda 130 | 69th,2014,Mr. Sam Kutesa,M,Uganda 131 | 70th,2015,Mr. Mogens Lykketoft,M,Denmark 132 | ,,,, 133 | http://www.un.org/pga/70/president/presidents-of-the-general-assembly/,,,, -------------------------------------------------------------------------------- /sample-datasets/vegas-hotel-prices.csv: -------------------------------------------------------------------------------- 1 | Hotel,Stars, Price for this weekend (April 30 - May 03) , (May 07 - May 10) for 2 adults ,Percentage of price change next weekend,Average price for further weekend,Percentage of price change Avg. further weekend, (May 14 - May 17) for 2 adults , (May 21 - May 24) for 2 adults , (May 28 - May 31) for 2 adults , (Jun 04 - Jun 07) for 2 adults , (Jun 11 - Jun 14) for 2 adults , (Jun 18 - Jun 21) for 2 adults 2 | Nobu Hotel,5," $ 2,919.00 ", $ 703.00 ,76%, $ 858.86 ,71%," $ 1,047.00 ", $ 901.00 , $ 925.00 , $ 700.00 , $ 734.00 ," $ 1,002.00 " 3 | Encore Resort Las Vegas,5," $ 2,514.00 ", $ 678.00 ,73%, $ 909.71 ,64%," $ 1,165.00 ", $ 870.00 ," $ 1,397.00 ", $ 747.00 , $ 747.00 , $ 764.00 4 | Wynn Las Vegas,5," $ 2,293.00 ", $ 771.00 ,66%, $ 869.86 ,62%," $ 1,203.00 ", $ 802.00 ," $ 1,258.00 ", $ 647.00 , $ 647.00 , $ 761.00 5 | Mandarin Oriental Las Vegas,5," $ 2,187.00 ", $ 879.00 ,60%," $ 1,033.57 ",53%," $ 1,340.00 "," $ 1,206.00 "," $ 1,270.00 ", $ 781.00 , $ 716.00 ," $ 1,043.00 " 6 | The Cosmopolitan of Las Vegas,5," $ 2,027.00 ", $ 885.00 ,56%," $ 1,003.71 ",50%," $ 1,747.00 "," $ 1,075.00 "," $ 1,142.00 ", $ 797.00 , $ 323.00 ," $ 1,057.00 " 7 | Trump International Hotel Las Vegas,5," $ 1,993.00 ", $ 490.00 ,75%, $ 515.29 ,74%, $ 648.00 , $ 569.00 , $ 503.00 , $ 395.00 , $ 392.00 , $ 610.00 8 | Bill's Gamblin' Hall & Saloon,3," $ 1,358.00 ", $ 661.00 ,51%, $ 764.29 ,44%, $ 872.00 , $ 804.00 , $ 977.00 , $ 562.00 , $ 698.00 , $ 776.00 9 | Jockey Club,3," 
$ 1,353.00 ", $ 461.00 ,66%, $ 658.67 ,51%, $ 978.00 , $ 733.00 ,, $ 461.00 , $ 461.00 , $ 858.00 10 | Hampton Inn Tropicana,3," $ 1,341.00 ", $ 283.00 ,79%, $ 332.71 ,75%, $ 452.00 , $ 410.00 , $ 266.00 , $ 263.00 , $ 261.00 , $ 394.00 11 | Bellagio Las Vegas,5," $ 1,337.00 ", $ 978.00 ,27%, $ 882.00 ,34%," $ 1,097.00 ", $ 873.00 ," $ 1,139.00 ", $ 620.00 , $ 646.00 , $ 821.00 12 | Jockey Resort Suites,3," $ 1,317.00 ", $ 485.00 ,63%, $ 705.67 ,46%, $ 924.00 , $ 675.00 ,, $ 529.00 , $ 784.00 , $ 837.00 13 | Luxury Suites International at The Signature,4," $ 1,306.00 ", $ 419.00 ,68%, $ 579.14 ,56%, $ 776.00 , $ 585.00 , $ 807.00 , $ 396.00 , $ 452.00 , $ 619.00 14 | Elara A Hilton Grand Vacations Hotel - Center Strip,4.5," $ 1,296.00 ", $ 516.00 ,60%, $ 656.29 ,49%, $ 676.00 , $ 694.00 , $ 694.00 , $ 620.00 , $ 531.00 , $ 863.00 15 | Cancun Resort,3," $ 1,266.00 ", $ 500.00 ,61%, $ 451.86 ,64%, $ 362.00 , $ 629.00 , $ 370.00 , $ 370.00 , $ 416.00 , $ 516.00 16 | Cliffs at Peace Canyon,3," $ 1,227.00 ", $ 563.00 ,54%, $ 475.14 ,61%, $ 563.00 , $ 512.00 , $ 411.00 , $ 411.00 , $ 411.00 , $ 455.00 17 | Residence Inn Las Vegas Hughes Center,3," $ 1,217.00 ", $ 524.00 ,57%, $ 445.33 ,63%, $ 581.00 , $ 534.00 , $ 365.00 , $ 342.00 , $ 326.00 , 18 | Desert Paradise Resort,4," $ 1,208.00 ", $ 393.00 ,67%, $ 410.71 ,66%, $ 376.00 , $ 557.00 , $ 308.00 , $ 367.00 , $ 326.00 , $ 548.00 19 | Gold Coast Hotel and Casino,3," $ 1,169.00 ", $ 233.00 ,80%, $ 325.75 ,72%,,,, $ 268.00 , $ 281.00 , $ 521.00 20 | Signature at MGM Grand,4," $ 1,162.00 ", $ 658.00 ,43%, $ 624.43 ,46%," $ 1,051.00 ", $ 624.00 , $ 607.00 , $ 388.00 , $ 458.00 , $ 585.00 21 | ARIA Resort & Casino,5," $ 1,160.00 ", $ 813.00 ,30%, $ 741.43 ,36%, $ 887.00 , $ 865.00 , $ 837.00 , $ 517.00 , $ 549.00 , $ 722.00 22 | DoubleTree by Hilton Las Vegas Airport,3," $ 1,111.00 ", $ 366.00 ,67%, $ 424.33 ,62%, $ 602.00 , $ 669.00 , $ 303.00 , $ 303.00 , $ 303.00 , 23 | Marriott's Grand Chateau,4," $ 1,105.00 ", $ 534.00 
,52%, $ 608.14 ,45%, $ 635.00 , $ 748.00 , $ 531.00 , $ 531.00 , $ 531.00 , $ 747.00 24 | Embassy Suites Las Vegas,3," $ 1,100.00 ", $ 395.00 ,64%, $ 435.86 ,60%, $ 530.00 , $ 426.00 , $ 365.00 , $ 374.00 , $ 365.00 , $ 596.00 25 | JW Marriott Las Vegas Resort Spa & Golf,4," $ 1,077.00 ", $ 483.00 ,55%, $ 579.75 ,46%, $ 664.00 , $ 702.00 ,, $ 470.00 ,, 26 | Staybridge Suites Las Vegas,3," $ 1,055.00 ", $ 568.00 ,46%, $ 566.00 ,46%,, $ 523.00 , $ 544.00 , $ 523.00 , $ 563.00 , $ 675.00 27 | Polo Towers Suites Las Vegas,3," $ 1,049.00 ", $ 360.00 ,66%, $ 401.71 ,62%, $ 357.00 , $ 533.00 , $ 340.00 , $ 340.00 , $ 340.00 , $ 542.00 28 | Red Rock Casino Resort & Spa,4," $ 1,039.00 ", $ 329.00 ,68%, $ 679.17 ,35%,, $ 561.00 , $ 555.00 , $ 358.00 , $ 542.00 ," $ 1,730.00 " 29 | Palazzo Resort Hotel Las Vegas,5," $ 1,036.00 ", $ 525.00 ,49%, $ 743.00 ,28%, $ 891.00 , $ 887.00 , $ 881.00 , $ 611.00 , $ 584.00 , $ 822.00 30 | Hilton Grand Vacations Suites on the Las Vegas Strip,4," $ 1,027.00 ", $ 333.00 ,68%, $ 492.14 ,52%, $ 517.00 , $ 549.00 , $ 387.00 , $ 313.00 , $ 344.00 ," $ 1,002.00 " 31 | Golden Nugget,4," $ 1,025.00 ", $ 222.00 ,78%, $ 354.86 ,65%, $ 291.00 , $ 533.00 , $ 326.00 , $ 271.00 , $ 268.00 , $ 573.00 32 | Venetian Resort Hotel Las Vegas,5," $ 1,022.00 ", $ 591.00 ,42%, $ 798.71 ,22%, $ 999.00 ," $ 1,010.00 ", $ 886.00 , $ 649.00 , $ 623.00 , $ 833.00 33 | Hard Rock Hotel and Casino,4, $ 994.00 , $ 393.00 ,60%, $ 492.86 ,50%, $ 774.00 , $ 510.00 , $ 520.00 , $ 336.00 , $ 300.00 , $ 617.00 34 | Rumor Hotel,3, $ 982.00 , $ 276.00 ,72%, $ 321.25 ,67%,,,, $ 262.00 , $ 262.00 , $ 485.00 35 | Holiday Inn Club Vacations,3, $ 982.00 , $ 464.00 ,53%, $ 514.00 ,48%,, $ 544.00 , $ 514.00 , $ 373.00 , $ 373.00 , $ 816.00 36 | Vdara Hotel & Spa,5, $ 966.00 , $ 622.00 ,36%, $ 653.71 ,32%, $ 893.00 , $ 722.00 , $ 842.00 , $ 401.00 , $ 497.00 , $ 599.00 37 | Courtyard by Marriott Las Vegas South,3, $ 960.00 , $ 489.00 ,49%, $ 430.71 ,55%, $ 427.00 , $ 400.00 , $ 373.00 , 
$ 346.00 , $ 412.00 , $ 568.00 38 | "The Westin Casuarina Las Vegas Hotel, Casino & Spa",4, $ 957.00 , $ 404.00 ,58%, $ 423.29 ,56%, $ 610.00 , $ 409.00 , $ 368.00 , $ 252.00 , $ 260.00 , $ 660.00 39 | Caesars Palace Classic Hotel,4, $ 956.00 , $ 547.00 ,43%, $ 650.57 ,32%, $ 918.00 , $ 682.00 , $ 742.00 , $ 463.00 , $ 444.00 , $ 758.00 40 | Holiday Inn Express Las Vegas South,2, $ 943.00 , $ 428.00 ,55%, $ 411.29 ,56%, $ 437.00 , $ 418.00 , $ 379.00 , $ 361.00 , $ 361.00 , $ 495.00 41 | Element by Westin Las Vegas Summerlin,3, $ 937.00 , $ 489.00 ,48%, $ 499.14 ,47%, $ 534.00 , $ 541.00 , $ 461.00 , $ 400.00 , $ 400.00 , $ 669.00 42 | Platinum Hotel and Spa,4, $ 930.00 , $ 398.00 ,57%, $ 567.86 ,39%, $ 680.00 , $ 686.00 , $ 529.00 , $ 405.00 , $ 424.00 , $ 853.00 43 | Fairfield Inn & Suites Las Vegas South,3, $ 926.00 , $ 456.00 ,51%, $ 442.14 ,52%, $ 450.00 , $ 501.00 , $ 467.00 , $ 355.00 , $ 377.00 , $ 489.00 44 | Alexis Park Resort,3, $ 926.00 , $ 288.00 ,69%, $ 278.71 ,70%, $ 248.00 , $ 323.00 , $ 236.00 , $ 171.00 , $ 171.00 , $ 514.00 45 | MGM Grand Hotel and Casino,4, $ 924.00 , $ 674.00 ,27%, $ 596.29 ,35%, $ 760.00 , $ 669.00 , $ 621.00 , $ 363.00 , $ 460.00 , $ 627.00 46 | Four Queens Hotel and Casino,3, $ 915.00 , $ 259.00 ,72%, $ 269.29 ,71%, $ 289.00 , $ 368.00 , $ 220.00 , $ 194.00 , $ 188.00 , $ 367.00 47 | Residence Inn Las Vegas South,3, $ 893.00 , $ 512.00 ,43%, $ 585.50 ,34%, $ 842.00 , $ 635.00 ,, $ 501.00 , $ 444.00 , $ 579.00 48 | Westgate Flamingo Bay at Las Vegas,3, $ 861.00 , $ 531.00 ,38%, $ 434.50 ,50%, $ 531.00 ,, $ 275.00 , $ 275.00 , $ 357.00 , $ 638.00 49 | Las Vegas Marriott,4, $ 859.00 , $ 592.00 ,31%, $ 494.00 ,42%, $ 635.00 , $ 613.00 , $ 389.00 , $ 386.00 , $ 349.00 , 50 | Royal Vacation Suites Hotel Las Vegas,3, $ 835.00 , $ 269.00 ,68%, $ 265.17 ,68%, $ 272.00 ,, $ 275.00 , $ 147.00 , $ 136.00 , $ 492.00 51 | Paris Las Vegas Hotel,4, $ 825.00 , $ 498.00 ,40%, $ 563.71 ,32%, $ 687.00 , $ 713.00 , $ 624.00 , $ 389.00 , $ 381.00 
, $ 654.00 52 | Residence Inn Las Vegas Convention Center,3, $ 825.00 , $ 576.00 ,30%, $ 453.33 ,45%, $ 651.00 , $ 524.00 , $ 349.00 , $ 318.00 , $ 302.00 , 53 | Tahiti Village,4, $ 817.00 , $ 418.00 ,49%, $ 512.86 ,37%, $ 536.00 , $ 676.00 , $ 426.00 , $ 427.00 , $ 446.00 , $ 661.00 54 | Mandalay Bay Resort & Casino,4, $ 815.00 ,,100%, $ 689.00 ,15%," $ 1,294.00 ", $ 701.00 ,, $ 413.00 , $ 426.00 , $ 611.00 55 | Tropicana Las Vegas,4, $ 813.00 , $ 408.00 ,50%, $ 416.14 ,49%, $ 592.00 , $ 485.00 , $ 317.00 , $ 320.00 , $ 290.00 , $ 501.00 56 | Best Western Plus Casino Royale,3, $ 798.00 , $ 412.00 ,48%, $ 489.43 ,39%, $ 596.00 , $ 554.00 , $ 487.00 , $ 361.00 , $ 395.00 , $ 621.00 57 | Four Seasons Hotel Las Vegas,5, $ 788.00 ," $ 1,000.00 ",-27%," $ 1,083.00 ",-37%,, $ 973.00 ," $ 2,013.00 ", $ 744.00 , $ 652.00 ," $ 1,116.00 " 58 | Candlewood Suites Las Vegas,3, $ 782.00 , $ 286.00 ,63%, $ 292.86 ,63%, $ 310.00 , $ 314.00 , $ 273.00 , $ 251.00 , $ 240.00 , $ 376.00 59 | Renaissance Las Vegas Hotel,4, $ 769.00 , $ 344.00 ,55%, $ 424.29 ,45%, $ 422.00 , $ 381.00 , $ 456.00 , $ 422.00 , $ 321.00 , $ 624.00 60 | The Mirage Hotel & Casino,4, $ 766.00 , $ 566.00 ,26%, $ 589.57 ,23%, $ 844.00 , $ 666.00 , $ 600.00 , $ 407.00 , $ 472.00 , $ 572.00 61 | New York - New York Hotel and Casino,4, $ 757.00 , $ 492.00 ,35%, $ 515.00 ,32%, $ 694.00 , $ 605.00 , $ 578.00 , $ 326.00 , $ 344.00 , $ 566.00 62 | Holiday Inn Express Hotel and Suites Las Vegas 215 Beltway,3, $ 755.00 , $ 384.00 ,49%, $ 408.43 ,46%, $ 432.00 , $ 395.00 , $ 386.00 , $ 383.00 , $ 401.00 , $ 478.00 63 | Planet Hollywood Resort & Casino,4, $ 754.00 , $ 441.00 ,42%, $ 471.57 ,37%, $ 573.00 , $ 594.00 , $ 507.00 , $ 390.00 , $ 286.00 , $ 510.00 64 | SpringHill Suites Las Vegas Convention Center,3, $ 747.00 , $ 430.00 ,42%, $ 425.57 ,43%, $ 477.00 , $ 501.00 , $ 403.00 , $ 382.00 , $ 353.00 , $ 433.00 65 | Hyatt Place Las Vegas,3, $ 747.00 , $ 452.00 ,39%, $ 412.43 ,45%, $ 411.00 , $ 411.00 , $ 445.00 , $ 
341.00 , $ 352.00 , $ 475.00 66 | Extended Stay America Hotel Valley View Las Vegas,2, $ 745.00 , $ 431.00 ,42%, $ 405.20 ,46%, $ 431.00 , $ 398.00 ,, $ 378.00 , $ 388.00 , 67 | Courtyard Las Vegas Convention Center,3, $ 736.00 , $ 510.00 ,31%, $ 414.17 ,44%, $ 396.00 , $ 465.00 , $ 467.00 , $ 368.00 , $ 279.00 , 68 | Extended Stay America - Las Vegas - East Flamingo,3, $ 700.00 , $ 326.00 ,53%, $ 341.00 ,51%, $ 342.00 , $ 368.00 , $ 328.00 ,,, 69 | Monte Carlo Resort & Casino,4, $ 696.00 , $ 433.00 ,38%, $ 483.14 ,31%, $ 650.00 , $ 604.00 , $ 524.00 , $ 313.00 , $ 347.00 , $ 511.00 70 | Terribles Hotel Las Vegas,3, $ 687.00 , $ 230.00 ,67%, $ 325.86 ,53%, $ 423.00 , $ 386.00 , $ 332.00 , $ 209.00 , $ 198.00 , $ 503.00 71 | Las Vegas Airport Travelodge,2, $ 678.00 , $ 351.00 ,48%, $ 338.57 ,50%, $ 351.00 , $ 341.00 , $ 302.00 , $ 292.00 , $ 292.00 , $ 441.00 72 | Fairfield Inn Las Vegas Airport,3, $ 669.00 , $ 494.00 ,26%, $ 366.50 ,45%, $ 338.00 , $ 467.00 , $ 334.00 , $ 287.00 , $ 279.00 , 73 | A Fisher's Inn Motel,2, $ 666.00 , $ 353.00 ,47%, $ 327.00 ,51%, $ 353.00 , $ 386.00 , $ 386.00 , $ 218.00 , $ 218.00 , $ 375.00 74 | SLS Las Vegas Hotel & Casino,3, $ 664.00 , $ 404.00 ,39%, $ 409.29 ,38%, $ 754.00 , $ 552.00 , $ 224.00 , $ 249.00 , $ 249.00 , $ 433.00 75 | Travelers Inn Motel,2, $ 661.00 , $ 366.00 ,45%, $ 370.86 ,44%, $ 366.00 , $ 366.00 , $ 366.00 , $ 370.00 , $ 370.00 , $ 392.00 76 | Desert Hills Motel,1, $ 657.00 , $ 198.00 ,70%, $ 233.50 ,64%, $ 198.00 ,, $ 198.00 , $ 198.00 , $ 198.00 , $ 411.00 77 | Extended Stay America - Las Vegas - Midtown,2, $ 652.00 , $ 305.00 ,53%, $ 276.50 ,58%, $ 305.00 , $ 353.00 , $ 232.00 , $ 232.00 , $ 232.00 , 78 | Westgate Las Vegas Resort and Casino,3.5, $ 652.00 , $ 130.00 ,80%, $ 236.00 ,64%, $ 245.00 , $ 290.00 , $ 246.00 , $ 142.00 , $ 184.00 , $ 415.00 79 | Club De Soleil Hotel Las Vegas,4, $ 643.00 , $ 314.00 ,51%, $ 346.14 ,46%, $ 333.00 , $ 392.00 , $ 332.00 , $ 270.00 , $ 314.00 , $ 468.00 80 | Days Inn Las 
Vegas At Wild Wild West Gambling Hall,3, $ 642.00 , $ 294.00 ,54%, $ 217.33 ,66%, $ 313.00 , $ 174.00 , $ 175.00 , $ 174.00 , $ 174.00 , 81 | Palms Resort Las Vegas,4, $ 637.00 , $ 260.00 ,59%, $ 373.71 ,41%, $ 452.00 , $ 411.00 , $ 335.00 , $ 355.00 , $ 309.00 , $ 494.00 82 | Hilton Grand Vacations Suites Las Vegas,4, $ 635.00 , $ 344.00 ,46%, $ 396.57 ,38%, $ 601.00 , $ 517.00 , $ 263.00 , $ 251.00 , $ 292.00 , $ 508.00 83 | Suncoast Hotel and Casino,3.5, $ 635.00 , $ 320.00 ,50%, $ 329.83 ,48%, $ 536.00 ,, $ 233.00 , $ 268.00 , $ 272.00 , $ 350.00 84 | El Mirador Motel Las Vegas,2, $ 627.00 ,,100%, $ 200.67 ,68%,,, $ 200.00 , $ 200.00 , $ 202.00 , 85 | Boulder Station Hotel and Casino,3, $ 623.00 , $ 188.00 ,70%, $ 227.57 ,63%, $ 233.00 , $ 297.00 , $ 158.00 , $ 187.00 , $ 162.00 , $ 368.00 86 | Palace Station Hotel and Casino,3, $ 623.00 , $ 171.00 ,73%, $ 228.43 ,63%, $ 227.00 , $ 287.00 , $ 214.00 , $ 151.00 , $ 166.00 , $ 383.00 87 | Tuscany Suites & Casino,3, $ 621.00 , $ 269.00 ,57%, $ 352.71 ,43%, $ 482.00 , $ 376.00 , $ 292.00 , $ 269.00 , $ 279.00 , $ 502.00 88 | Tahiti All-Suite Resort,4, $ 614.00 , $ 320.00 ,48%, $ 346.29 ,44%, $ 376.00 , $ 388.00 , $ 305.00 , $ 274.00 , $ 274.00 , $ 487.00 89 | La Quinta Inn & Suites Las Vegas Airport North Convention Center,2, $ 613.00 , $ 271.00 ,56%, $ 322.86 ,47%, $ 418.00 , $ 361.00 , $ 240.00 , $ 227.00 , $ 222.00 , $ 521.00 90 | Treasure Island - TI Hotel & Casino,4, $ 609.00 , $ 362.00 ,41%, $ 453.29 ,26%, $ 592.00 , $ 496.00 , $ 553.00 , $ 310.00 , $ 289.00 , $ 571.00 91 | Luxor Las Vegas,3, $ 587.00 , $ 477.00 ,19%, $ 471.71 ,20%, $ 559.00 , $ 539.00 , $ 727.00 , $ 222.00 , $ 266.00 , $ 512.00 92 | Motel 6 Las Vegas - Tropicana,2, $ 580.00 , $ 244.00 ,58%, $ 268.86 ,54%, $ 271.00 , $ 278.00 , $ 222.00 , $ 222.00 , $ 222.00 , $ 423.00 93 | The LINQ Hotel and Casino,3, $ 575.00 , $ 305.00 ,47%, $ 375.57 ,35%, $ 390.00 , $ 447.00 , $ 429.00 , $ 287.00 , $ 294.00 , $ 477.00 94 | Super 8 Las Vegas Blvd,2, $ 
571.00 , $ 217.00 ,62%, $ 222.43 ,61%, $ 217.00 , $ 206.00 , $ 178.00 , $ 161.00 , $ 161.00 , $ 417.00 95 | Baymont Inn & Suites Las Vegas South Strip,3, $ 567.00 , $ 285.00 ,50%, $ 316.00 ,44%, $ 307.00 , $ 412.00 , $ 254.00 , $ 241.00 , $ 320.00 , $ 393.00 96 | Howard Johnson Tropicana,2, $ 553.00 , $ 242.00 ,56%, $ 259.71 ,53%, $ 261.00 , $ 319.00 , $ 196.00 , $ 189.00 , $ 197.00 , $ 414.00 97 | "Downtown Grand, an Ascend Hotel Collection Member",4, $ 546.00 , $ 259.00 ,53%, $ 316.71 ,42%, $ 530.00 , $ 478.00 , $ 281.00 , $ 193.00 , $ 193.00 , $ 283.00 98 | "Stratosphere Hotel, Casino and Tower",3, $ 544.00 , $ 208.00 ,62%, $ 281.71 ,48%, $ 322.00 , $ 356.00 , $ 269.00 , $ 166.00 , $ 181.00 , $ 470.00 99 | The Orleans Hotel & Casino,3, $ 541.00 , $ 338.00 ,38%, $ 291.60 ,46%, $ 309.00 ,,, $ 227.00 , $ 227.00 , $ 357.00 100 | Crossland Economy Studios - Las Vegas - Boulder Highway,2, $ 538.00 , $ 308.00 ,43%, $ 262.00 ,51%, $ 308.00 , $ 326.00 , $ 210.00 , $ 210.00 , $ 210.00 , 101 | The D Las Vegas,3, $ 535.00 , $ 153.00 ,71%, $ 211.71 ,60%, $ 233.00 , $ 275.00 , $ 181.00 , $ 133.00 , $ 139.00 , $ 368.00 102 | Bonanza Lodge,1, $ 535.00 , $ 239.00 ,55%, $ 285.00 ,47%, $ 294.00 , $ 239.00 , $ 198.00 , $ 195.00 , $ 195.00 , $ 635.00 103 | Fremont Hotel and Casino,3, $ 529.00 ,,100%, $ 303.75 ,43%,,, $ 191.00 , $ 207.00 , $ 176.00 , $ 641.00 104 | Plaza Hotel & Casino,3, $ 527.00 , $ 193.00 ,63%, $ 234.29 ,56%, $ 227.00 , $ 397.00 , $ 181.00 , $ 181.00 , $ 181.00 , $ 280.00 105 | Excalibur Hotel & Casino,3, $ 524.00 , $ 408.00 ,22%, $ 391.43 ,25%, $ 499.00 , $ 491.00 , $ 401.00 , $ 215.00 , $ 239.00 , $ 487.00 106 | El Cortez Hotel & Casino,3, $ 518.00 , $ 138.00 ,73%, $ 165.29 ,68%, $ 177.00 , $ 252.00 , $ 110.00 , $ 110.00 , $ 110.00 , $ 260.00 107 | Sam's Town Hotel and Gambling Hall,3, $ 511.00 , $ 296.00 ,42%, $ 273.14 ,47%, $ 290.00 , $ 309.00 , $ 215.00 , $ 225.00 , $ 223.00 , $ 354.00 108 | Blair House Suites,2, $ 510.00 , $ 250.00 ,51%, $ 262.43 ,49%, $ 
359.00 , $ 264.00 , $ 205.00 , $ 168.00 , $ 168.00 , $ 423.00 109 | Gold Spike Hotel and Casino,3, $ 497.00 , $ 197.00 ,60%, $ 264.17 ,47%, $ 297.00 ,, $ 324.00 , $ 181.00 , $ 156.00 , $ 430.00 110 | Golden Gate Hotel & Casino,3, $ 486.00 , $ 128.00 ,74%, $ 194.43 ,60%, $ 178.00 , $ 223.00 , $ 131.00 , $ 123.00 , $ 138.00 , $ 440.00 111 | Flamingo Las Vegas Hotel & Casino,3, $ 480.00 , $ 280.00 ,42%, $ 380.14 ,21%, $ 426.00 , $ 498.00 , $ 437.00 , $ 257.00 , $ 290.00 , $ 473.00 112 | Arizona Charlie's Boulder Casino Hotel,2, $ 476.00 , $ 181.00 ,62%, $ 190.43 ,60%, $ 223.00 , $ 225.00 , $ 148.00 , $ 128.00 , $ 167.00 , $ 261.00 113 | Harrah's Las Vegas Casino & Hotel,3, $ 453.00 , $ 293.00 ,35%, $ 367.71 ,19%, $ 452.00 , $ 415.00 , $ 448.00 , $ 291.00 , $ 283.00 , $ 392.00 114 | Circus Circus Hotel & Casino Las Vegas,3, $ 452.00 , $ 235.00 ,48%, $ 259.43 ,43%, $ 270.00 , $ 316.00 , $ 241.00 , $ 171.00 , $ 171.00 , $ 412.00 115 | BEST WESTERN Main Street Inn,2, $ 428.00 , $ 283.00 ,34%, $ 260.71 ,39%, $ 283.00 , $ 232.00 , $ 191.00 , $ 191.00 , $ 191.00 , $ 454.00 116 | Hooters Casino Hotel,3, $ 394.00 , $ 239.00 ,39%, $ 272.14 ,31%, $ 331.00 , $ 339.00 , $ 281.00 , $ 161.00 , $ 181.00 , $ 373.00 117 | Rio All Suite Hotel Las Vegas,4, $ 387.00 , $ 241.00 ,38%, $ 464.71 ,-20%, $ 370.00 , $ 343.00 , $ 725.00 , $ 330.00 , $ 424.00 , $ 820.00 118 | Hampton Inn Las Vegas - Summerlin,3, $ 351.00 , $ 331.00 ,6%, $ 374.29 ,-7%, $ 351.00 , $ 449.00 , $ 351.00 , $ 338.00 , $ 351.00 , $ 449.00 119 | Las Vegas Hostel,1, $ 280.00 , $ 135.00 ,52%, $ 162.43 ,42%, $ 127.00 , $ 128.00 , $ 128.00 , $ 136.00 , $ 136.00 , $ 347.00 120 | Hostel Cat Las Vegas,1, $ 272.00 , $ 166.00 ,39%, $ 127.33 ,53%, $ 166.00 , $ 126.00 , $ 98.00 , $ 104.00 , $ 104.00 , 121 | Eastside Cannery Casino & Hotel,4, $ 213.00 , $ 209.00 ,2%, $ 213.00 ,0%, $ 314.00 , $ 213.00 , $ 202.00 , $ 152.00 , $ 188.00 , -------------------------------------------------------------------------------- /src/README.md: 
-------------------------------------------------------------------------------- 1 | ``` 2 | npm install dataproofer 3 | ``` 4 | Read the documentation 5 | ``` 6 | dataproofer --help 7 | > Usage: dataproofer 8 | 9 | A proofreader for your data 10 | 11 | Options: 12 | 13 | -h, --help output usage information 14 | -V, --version output the version number 15 | -o, --out file to output results. default stdout 16 | -c, --core run tests from the core suite 17 | -i, --info run tests from the info suite 18 | -s, --stats run tests from the statistical suite 19 | -g, --geo run tests from the geographic suite 20 | -t, --tests comma-separated list to use 21 | -j, --json output JSON of test results 22 | -J, --json-pretty output an indented JSON of test results 23 | -S, --summary output overall test results, excluding pass/fail results 24 | -v, --verbose include descriptions about each column 25 | -x, --exclude exclude tests that passed 26 | 27 | Examples: 28 | 29 | $ dataproofer my_data.csv 30 | ``` 31 | Run a test 32 | ``` 33 | node index.js data.csv 34 | ``` 35 | Save the results 36 | ``` 37 | node index.js --json data.csv --out data.json 38 | ``` 39 | To learn how to run specific test suites or tests and how to output longer or shorter summaries, use the `--help` flag. 40 | 41 | Found a bug? [Let us know](https://github.com/dataproofer/Dataproofer/issues/new).
42 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /** 3 | * CLI Interface to Dataproofer 4 | */ 5 | 6 | const pkg = require("./package.json"); 7 | const Processor = require("./processing"); 8 | const Rendering = require("./rendering"); 9 | const Processing = new Processor(); 10 | 11 | const chalk = require("chalk"); 12 | const rw = require("rw"); 13 | const program = require("commander"); 14 | 15 | // this module is being run from the command line 16 | if (require.main === module) { 17 | const SUITES = [ 18 | require("dataproofer-info-suite"), 19 | require("dataproofer-core-suite"), 20 | require("dataproofer-stats-suite"), 21 | require("dataproofer-geo-suite"), 22 | ]; 23 | 24 | var list = function (val) { 25 | return val.split(","); 26 | }; 27 | 28 | var toLower = function (str) { 29 | return str.toLowerCase(); 30 | }; 31 | 32 | program 33 | .version(pkg.version) 34 | .description("A proofreader for your data") 35 | .usage("") 36 | .option("-o, --out ", "file to output results. 
default stdout") 37 | .option("-c, --core", "run tests from the core suite") 38 | .option("-i, --info", "run tests from the info suite") 39 | .option("-s, --stats", "run tests from the statistical suite") 40 | .option("-g, --geo", "run tests from the geographic suite") 41 | .option("-t, --tests ", "comma-separated list to use", list) 42 | .option("-j, --json", "output JSON of test results") 43 | .option("-J, --json-pretty", "output an indented JSON of test results") 44 | .option( 45 | "-S, --summary", 46 | "output overall test results, excluding pass/fail results" 47 | ) 48 | .option("-v, --verbose", "include descriptions about each column") 49 | .option( 50 | "-e, --exit", 51 | "exit with a console error if any tests fail (useful for CI)" 52 | ) 53 | .option("-x, --exclude", "exclude tests that passed") 54 | .option( 55 | "-m, --sampleMin ", 56 | "minimum number of rows to sample and test", 57 | parseInt 58 | ) 59 | .option( 60 | "-M, --sampleMax ", 61 | "maximum number of rows to sample and test", 62 | parseInt 63 | ) 64 | .addOption( 65 | new program.Option( 66 | "-r, --sampleRatio ", 67 | "ratio of rows to sample from total rows", 68 | parseFloat 69 | ).default(0.25, "25% of the total rows") 70 | ); 71 | 72 | program.on("--help", function () { 73 | console.info(" Examples:"); 74 | console.info(""); 75 | console.info(" $ dataproofer my_data.csv"); 76 | console.info(""); 77 | }); 78 | 79 | program.parse(process.argv); 80 | 81 | var make_red = function make_red(txt) { 82 | return chalk.bold(txt); //display the help text in red on the console 83 | }; 84 | 85 | if (!process.argv.slice(2).length) { 86 | program.outputHelp(make_red); 87 | return; 88 | } 89 | 90 | for (var suite of SUITES) { 91 | if (suite.name.indexOf("core") > -1 && program.opts().core === true) { 92 | suite.tests.forEach(function (test) { 93 | test.active = true; 94 | }); 95 | } else if ( 96 | suite.name.indexOf("info") > -1 && 97 | program.opts().info === true 98 | ) { 99 | 
suite.tests.forEach(function (test) { 100 | test.active = true; 101 | }); 102 | } else if ( 103 | suite.name.indexOf("stats") > -1 && 104 | program.opts().stats === true 105 | ) { 106 | suite.tests.forEach(function (test) { 107 | test.active = true; 108 | }); 109 | } else if (suite.name.indexOf("geo") > -1 && program.opts().geo === true) { 110 | suite.tests.forEach(function (test) { 111 | test.active = true; 112 | }); 113 | } else if (program.opts().tests) { 114 | suite.tests.forEach(function (test) { 115 | var inputTests = program.opts().tests.map(toLower); 116 | var currTest = test.name().toLowerCase(); 117 | if (inputTests.indexOf(currTest) > -1) test.active = true; 118 | }); 119 | } else if ( 120 | program.opts().core !== true && 121 | program.opts().info !== true && 122 | program.opts().stats !== true && 123 | program.opts().geo !== true && 124 | program.opts().tests !== true 125 | ) { 126 | suite.tests.forEach(function (test) { 127 | test.active = true; 128 | }); 129 | } 130 | } 131 | 132 | var filepath = program.args[0]; 133 | //READ FILE 134 | var allowFileExtensions = ["csv", "tsv", "psv", "xlsx", "xls"]; 135 | 136 | var currFileName = filepath.split("/").pop(), 137 | currExt = currFileName.split(".").pop(), 138 | sampleOpts = { 139 | sampleRatio: program.opts().sampleRatio, 140 | sampleMin: program.opts().sampleMin, 141 | sampleMax: program.opts().sampleMax, 142 | }; 143 | 144 | if (allowFileExtensions.indexOf(currExt) > -1) { 145 | var loadConfig = { 146 | ext: currExt, 147 | filepath: filepath, 148 | filename: currFileName, 149 | sampleOpts: sampleOpts, 150 | }; 151 | var loaded = Processing.load(loadConfig); 152 | var processorConfig = { 153 | suites: SUITES, 154 | renderer: Rendering, 155 | loaded: loaded, 156 | json: program.opts().json || program.opts().jsonPretty, 157 | }; 158 | Processing.run(processorConfig).then(function (processor) { 159 | const { results } = processor; 160 | var suiteNames = Object.keys(results); 161 | 162 | 
suiteNames.forEach(function (suiteName) { 163 | var testNames = Object.keys(results[suiteName]); 164 | totalTests += testNames.length; 165 | testNames.forEach(function (testName) { 166 | var test = results[suiteName][testName]; 167 | if (program.opts().exclude && test.testState === "passed") { 168 | delete results[suiteName][testName]; 169 | } 170 | }); 171 | }); 172 | 173 | var totalTests = 0; 174 | var totalPassed = 0; 175 | var totalFailed = 0; 176 | var resultStr = "\n"; 177 | suiteNames.forEach(function (suiteName) { 178 | var testNames = Object.keys(results[suiteName]); 179 | totalTests += testNames.length; 180 | testNames.forEach(function (testName) { 181 | var test = results[suiteName][testName]; 182 | resultStr += testName + ": "; 183 | switch (test.testState) { 184 | case "passed": 185 | totalPassed += 1; 186 | resultStr += chalk.green(test.testState) + "\n"; 187 | break; 188 | case "warn": 189 | resultStr += chalk.yellow(test.testState) + "\n"; 190 | break; 191 | case "failed": 192 | resultStr += chalk.red(test.testState) + "\n"; 193 | totalFailed += 1; 194 | break; 195 | case "info": 196 | totalTests -= 1; 197 | resultStr += chalk.blue(test.testState) + "\n"; 198 | break; 199 | } 200 | if (program.opts().verbose === true && test.testState !== "passed") { 201 | resultStr += 202 | chalk.dim(test.conclusion.replace(/
/g, "\n")) + "\n"; 203 | } 204 | }); 205 | }); 206 | 207 | var summaryPct = totalPassed / totalTests; 208 | var summaryColor = () => { 209 | if (summaryPct < 0.7) { 210 | // below 70% is failing 211 | return "red"; 212 | } else if (summaryPct >= 0.7 && summaryPct <= 0.9) { 213 | // between 70% and 90% is average 214 | return "yellow"; 215 | } else if (summaryPct > 0.9) { 216 | // above 90% is excellent 217 | return "green"; 218 | } 219 | }; 220 | var testStr = totalPassed > 1 ? "tests" : "test"; 221 | var summaryStr = chalk`\n{${summaryColor()} {bold ${Math.round( 222 | summaryPct * 100 223 | )}%}\n${totalPassed} ${testStr} passed out of ${totalTests}}\n`; 224 | 225 | if ( 226 | program.opts().watch === true || 227 | program.opts().suites === true || 228 | program.opts().tests === true 229 | ) { 230 | process.stderr.write( 231 | chalk.red("Error: This feature is not currently implemented") 232 | ); 233 | return; 234 | } 235 | 236 | let exit = program.opts().exit && totalFailed > 0; 237 | var done = function () { 238 | if (!program.opts().json || program.opts().jsonPretty) { 239 | process.stdout.write(summaryStr); 240 | process.stdout.write("\n### PROOFED ###\n\n"); 241 | } 242 | if (exit) process.exit(1); 243 | return; 244 | }; 245 | 246 | var outPath = program.opts().out ? program.opts().out : "/dev/stdout"; 247 | 248 | if (program.opts().out) resultStr = resultStr.replace(/\[\d+m/g, ""); 249 | if (program.opts().json === true) { 250 | rw.writeFile(outPath, JSON.stringify(results), done); 251 | return; 252 | } 253 | if (program.opts().jsonPretty === true) { 254 | rw.writeFile(outPath, JSON.stringify(results, null, 2), done); 255 | return; 256 | } 257 | if (program.opts().summary !== true) { 258 | rw.writeFile(outPath, resultStr, done); 259 | } else { 260 | done(); 261 | } 262 | }); 263 | } else { 264 | process.stderr.write( 265 | chalk.red( 266 | "Error: Must use a supported filetype. 
Currently supported filetypes: " + 267 | allowFileExtensions.join(", "), 268 | "utf8" 269 | ) 270 | ); 271 | } 272 | } 273 | 274 | module.exports = { 275 | Processing: Processing, 276 | Rendering: Rendering, 277 | version: pkg.version, 278 | }; 279 | -------------------------------------------------------------------------------- /src/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataproofer", 3 | "version": "2.1.0", 4 | "description": "A proofreader for your datasets", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "yarn node index.js" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/dataproofer/dataproofer.git" 12 | }, 13 | "bin": "./index.js", 14 | "keywords": [ 15 | "data", 16 | "csv", 17 | "excel" 18 | ], 19 | "author": "Dataproofer ", 20 | "license": "GPL-3.0", 21 | "bugs": { 22 | "url": "https://github.com/dataproofer/dataproofer/issues" 23 | }, 24 | "homepage": "https://github.com/dataproofer/dataproofer#readme", 25 | "dependencies": { 26 | "chalk": "^4.1.0", 27 | "commander": "^7.2.0", 28 | "d3": "^4.2.7", 29 | "dataproofer-core-suite": "2.1.0", 30 | "dataproofer-geo-suite": "2.1.0", 31 | "dataproofer-info-suite": "2.1.0", 32 | "dataproofer-stats-suite": "2.1.0", 33 | "dataproofertest-js": "2.1.0", 34 | "indian-ocean": "^4.0.2", 35 | "lodash": "^4.17.20", 36 | "optimist": "^0.6.1", 37 | "rw": "^1.3.3", 38 | "xlsx": "^0.16.9" 39 | }, 40 | "devDependencies": { 41 | "@babel/core": "^7.13.15", 42 | "@babel/eslint-parser": "^7.12.16", 43 | "@babel/eslint-plugin": "^7.12.13", 44 | "eslint": "^7.20.0", 45 | "eslint-config-prettier": "^7.2.0", 46 | "eslint-plugin-prettier": "^3.3.1", 47 | "prettier": "^2.2.1" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/processing.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const _ = 
require("lodash"); 3 | const d3 = require("d3"); 4 | const xlsx = require("xlsx"); 5 | const io = require("indian-ocean"); 6 | const DataprooferTest = require("dataproofertest-js"); 7 | const util = require("dataproofertest-js/util"); 8 | 9 | var Processor = function () { 10 | return this; 11 | }; 12 | 13 | Processor.prototype = { 14 | sampleRows: function (rows, sampleOpts, currFilepath) { 15 | var self = this, 16 | sampleMin = sampleOpts.sampleMin, 17 | sampleMax = sampleOpts.sampleMax, 18 | sampleRatio = sampleOpts.sampleRatio, 19 | totalRows = rows.length; 20 | 21 | var sampleSize = Math.round(sampleRatio * totalRows); 22 | 23 | if (sampleSize < 1000 && totalRows < 1000) { 24 | // test all the rows if there's less than a thousand in total 25 | sampleSize = totalRows; 26 | } else if (sampleMin) { 27 | sampleSize = sampleMin; 28 | } else if (sampleMax) { 29 | sampleSize = sampleMax; 30 | } 31 | 32 | var currRemainingRows, sampledRows; 33 | if (self.remainingRows && self.filepath === currFilepath) { 34 | currRemainingRows = self.remainingRows; 35 | } else { 36 | self.filepath = currFilepath; 37 | currRemainingRows = self.remainingRows = rows; 38 | } 39 | sampledRows = currRemainingRows.slice(0, sampleSize); 40 | self.sampleProgress = sampledRows.length / currRemainingRows.length; 41 | self.remainingRows = currRemainingRows.slice( 42 | sampleSize, 43 | currRemainingRows.length 44 | ); 45 | self.sampledRows = sampledRows; 46 | self.totalRows = totalRows; 47 | 48 | return self; 49 | }, 50 | 51 | load: function (config) { 52 | var self = this, 53 | filepath = path.resolve(config.filepath), 54 | ext = config.ext, 55 | // user can optionally pass in rows and columnHeads already parsed 56 | rows = config.rows, 57 | columnHeads = config.columnHeads, 58 | // user can change sample sizes in the CLI 59 | sampleMin = config.sampleOpts.sampleMin, 60 | sampleMax = config.sampleOpts.sampleMax, 61 | sampleRatio = config.sampleOpts.sampleRatio; 62 | var sampleOpts = { 63 | 
sampleRatio: sampleRatio, 64 | sampleMin: sampleMin, 65 | sampleMax: sampleMax, 66 | }; 67 | 68 | if (ext) { 69 | // Parse the csv with d3 70 | var nonExcelExtensions = ["csv", "tsv", "psv"]; 71 | var excelExtensions = ["xlsx", "xls"]; 72 | if (nonExcelExtensions.indexOf(ext) > -1) { 73 | rows = io.readDataSync(filepath); 74 | } else if (excelExtensions.indexOf(ext) > -1) { 75 | var sheets = xlsx.readFile(filepath).Sheets; 76 | var firstSheetName = Object.keys(sheets)[0]; 77 | var excelCsvRows = xlsx.utils.sheet_to_csv(sheets[firstSheetName]); 78 | rows = d3.csvParse(excelCsvRows); 79 | } else { 80 | rows = []; 81 | } 82 | } 83 | if (!columnHeads || !columnHeads.length) { 84 | columnHeads = Object.keys(rows[0]); 85 | } 86 | 87 | // TODO: use webworkers or something so we don't need an upper limit 88 | // for now, use sampling 89 | var sampleConfig = self.sampleRows(rows, sampleOpts, filepath); 90 | var { sampledRows, totalRows, sampleProgress } = sampleConfig; 91 | return { 92 | rows: sampledRows, 93 | totalRows: totalRows, 94 | sampleProgress: sampleProgress, 95 | columnHeads: columnHeads, 96 | config: config, 97 | }; 98 | }, 99 | 100 | run: function (config) { 101 | var suites = config.suites; 102 | var Renderer = config.renderer; 103 | var input = config.input; 104 | 105 | var loaded = config.loaded; 106 | 107 | var columnHeads = loaded.columnHeads; 108 | var rows = loaded.rows; 109 | var sampleProgress = loaded.sampleProgress; 110 | var totalRows = loaded.totalRows; 111 | // Initialize the renderer 112 | var renderer = new Renderer({ 113 | filename: loaded.filename, 114 | suites: suites, 115 | columnHeads: columnHeads, 116 | rows: rows, 117 | sampleProgress: sampleProgress, 118 | totalRows: totalRows, 119 | json: config.json, 120 | }); 121 | 122 | var badColumnHeadsTest = new DataprooferTest() 123 | .name("Missing or duplicate column headers") 124 | .description("Check for errors in the header of the spreadsheet") 125 | .methodology(function (rows, columnHeads) 
{ 126 | var badHeaderCount = 0; 127 | var badColumnHeads = []; 128 | var testState = "passed"; 129 | 130 | columnHeads.forEach(function (columnHead, counts) { 131 | if (counts[columnHead] || util.isEmpty(columnHead)) { 132 | var subColumnHead = "Column " + counts; 133 | badColumnHeads.push(subColumnHead); 134 | badHeaderCount += 1; 135 | } else { 136 | counts[columnHead] = 0; 137 | } 138 | return counts; 139 | }, {}); 140 | 141 | if (badHeaderCount > 0) testState = "failed"; 142 | 143 | var result = { 144 | testState: testState, 145 | badColumnHeads: badColumnHeads, 146 | }; 147 | return result; 148 | }); 149 | badColumnHeadsTest.active = true; 150 | 151 | var result = badColumnHeadsTest.proof(rows, columnHeads); 152 | renderer.addResult("dataproofer-info-suite", badColumnHeadsTest, result); 153 | 154 | var cleanedColumnHeads = _.without( 155 | columnHeads, 156 | result.badColumnHeads.join(", ") 157 | ); 158 | var cleanedRows = rows; 159 | 160 | var testsNestArr = suites.map(function (suite) { 161 | var suiteTestsArr = suite.tests.map(function (test) { 162 | test.suiteName = suite.name; 163 | return test; 164 | }); 165 | return suiteTestsArr; 166 | }); 167 | // do a shallow flatten to get an array of tests 168 | var testsFlatArr = _.flatten(testsNestArr); 169 | var testPromisesArr = testsFlatArr 170 | .filter(function (test) { 171 | // run tests flagged test.active === true 172 | return test.active === true; 173 | }) 174 | .map(function (test) { 175 | var testPromise = new Promise(function (resolve) { 176 | var result = test.proof(cleanedRows, cleanedColumnHeads, input); 177 | resolve(result); 178 | }); 179 | testPromise.then(function (result) { 180 | // aggregate the number of highlighted cells for each column 181 | result.columnWise = {}; 182 | if (result && result.highlightCells) { 183 | cleanedColumnHeads.forEach(function (column) { 184 | result.columnWise[column] = result.highlightCells.reduce( 185 | function (count, row) { 186 | // if there is a value in this 
cell, increment count, otherwise leave it alone 187 | return row[column] ? count + 1 : count; 188 | }, 189 | 0 190 | ); 191 | }); 192 | } 193 | // call the test's conclusion function, if any 194 | test.conclusion(result); 195 | // incrementally report as tests run 196 | renderer.addResult(test.suiteName, test, result); 197 | return renderer; 198 | }); 199 | testPromise.catch(function (reason) { 200 | renderer.addError(test.suiteName, test, reason); 201 | }); 202 | return testPromise; 203 | }); 204 | var testsPromise = Promise.all(testPromisesArr).then( 205 | function (values) { 206 | renderer.done(); 207 | return renderer; 208 | }, 209 | function (reason) { 210 | console.error("Tests failed! Reason: ", reason); // Error! 211 | } 212 | ); 213 | return testsPromise; 214 | }, 215 | }; 216 | 217 | module.exports = Processor; 218 | -------------------------------------------------------------------------------- /src/rendering.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Render the results for all tests in a given set of suites for a dataset 3 | * @class 4 | * @param {Object} configuration including filename and suites 5 | * @return {undefined} 6 | */ 7 | function Renderer(config) { 8 | Object.assign(this, config); 9 | var results = (this.results = {}); 10 | config.suites.forEach(function (suite) { 11 | // console.log("suite name", suite); 12 | results[suite.name] = {}; 13 | }); 14 | } 15 | 16 | /** 17 | * A horrible run-time error has occured, we should let the user know and abort everything. 18 | * @param {Object} error object. should contain a `message` property 19 | * @return {undefined} 20 | */ 21 | Renderer.prototype.error = function (error) { 22 | console.error(error); 23 | }; 24 | 25 | /** 26 | * The renderer can render results as they come so we can show progress to the user as tests complete. 
27 | * @param {String} the name of the suite 28 | * @param {String} the name of the test 29 | * @param {Object} the result object. 30 | */ 31 | Renderer.prototype.addResult = function (suite, test, result) { 32 | this.results[suite][test.name()] = result; 33 | this.results[suite][test.name()].conclusion = test.conclusion(); 34 | }; 35 | 36 | /** 37 | * Notify that an error occurred while running a specific test 38 | * @param {String} the name of the suite 39 | * @param {String} the name of the test 40 | * @param {Object} the error object. should contain a `message` property 41 | */ 42 | Renderer.prototype.addError = function (suite, test, error) { 43 | console.warn("Test error:\n", suite, "\n", test.name()); 44 | console.error(error.stack || error); 45 | }; 46 | 47 | /** 48 | * Indicate that we are finished rendering 49 | * @return {undefined} 50 | */ 51 | Renderer.prototype.done = function () { 52 | // finish up 53 | if (!this.json) { 54 | console.info("\ntotal rows", this.totalRows); 55 | console.info("rows sampled", this.rows.length); 56 | } 57 | }; 58 | 59 | module.exports = Renderer; 60 | --------------------------------------------------------------------------------