├── .gitignore ├── 404.html ├── Gemfile ├── README.md ├── _config.yml ├── _includes ├── footer.html ├── head.html ├── header.html └── page-tools.html ├── _layouts ├── apps.html ├── default.html ├── docs.html ├── page.html ├── pattern.html └── post.html ├── _sass ├── _grid-layout.scss ├── base │ ├── _base.scss │ ├── _buttons.scss │ ├── _forms.scss │ ├── _grid-settings.scss │ ├── _lists.scss │ ├── _tables.scss │ ├── _typography.scss │ └── _variables.scss ├── bourbon │ ├── _bourbon-deprecated-upcoming.scss │ ├── _bourbon.scss │ ├── addons │ │ ├── _clearfix.scss │ │ ├── _directional-values.scss │ │ ├── _ellipsis.scss │ │ ├── _font-family.scss │ │ ├── _hide-text.scss │ │ ├── _html5-input-types.scss │ │ ├── _position.scss │ │ ├── _prefixer.scss │ │ ├── _retina-image.scss │ │ ├── _size.scss │ │ ├── _timing-functions.scss │ │ ├── _triangle.scss │ │ └── _word-wrap.scss │ ├── css3 │ │ ├── _animation.scss │ │ ├── _appearance.scss │ │ ├── _backface-visibility.scss │ │ ├── _background-image.scss │ │ ├── _background.scss │ │ ├── _border-image.scss │ │ ├── _border-radius.scss │ │ ├── _box-sizing.scss │ │ ├── _calc.scss │ │ ├── _columns.scss │ │ ├── _filter.scss │ │ ├── _flex-box.scss │ │ ├── _font-face.scss │ │ ├── _font-feature-settings.scss │ │ ├── _hidpi-media-query.scss │ │ ├── _hyphens.scss │ │ ├── _image-rendering.scss │ │ ├── _keyframes.scss │ │ ├── _linear-gradient.scss │ │ ├── _perspective.scss │ │ ├── _placeholder.scss │ │ ├── _radial-gradient.scss │ │ ├── _selection.scss │ │ ├── _text-decoration.scss │ │ ├── _transform.scss │ │ ├── _transition.scss │ │ └── _user-select.scss │ ├── functions │ │ ├── _assign.scss │ │ ├── _color-lightness.scss │ │ ├── _contains.scss │ │ ├── _is-length.scss │ │ ├── _is-size.scss │ │ ├── _modular-scale.scss │ │ ├── _px-to-em.scss │ │ ├── _px-to-rem.scss │ │ ├── _strip-units.scss │ │ ├── _tint-shade.scss │ │ ├── _transition-property-name.scss │ │ └── _unpack.scss │ ├── helpers │ │ ├── _convert-units.scss │ │ ├── _font-source-declaration.scss 
│ │ ├── _gradient-positions-parser.scss │ │ ├── _is-num.scss │ │ ├── _linear-angle-parser.scss │ │ ├── _linear-gradient-parser.scss │ │ ├── _linear-positions-parser.scss │ │ ├── _linear-side-corner-parser.scss │ │ ├── _radial-arg-parser.scss │ │ ├── _radial-gradient-parser.scss │ │ ├── _radial-positions-parser.scss │ │ ├── _render-gradients.scss │ │ ├── _shape-size-stripper.scss │ │ └── _str-to-num.scss │ └── settings │ │ ├── _asset-pipeline.scss │ │ ├── _prefixer.scss │ │ └── _px-to-em.scss ├── icons.scss ├── mmenu │ ├── addons │ │ ├── jquery.mmenu.buttonbars.scss │ │ ├── jquery.mmenu.counters.scss │ │ ├── jquery.mmenu.dragopen.scss │ │ ├── jquery.mmenu.footer.scss │ │ ├── jquery.mmenu.header.scss │ │ ├── jquery.mmenu.labels.scss │ │ ├── jquery.mmenu.offcanvas.scss │ │ ├── jquery.mmenu.searchfield.scss │ │ └── jquery.mmenu.toggles.scss │ ├── extensions │ │ ├── jquery.mmenu.effects.scss │ │ ├── jquery.mmenu.fullscreen.scss │ │ ├── jquery.mmenu.iconbar.scss │ │ ├── jquery.mmenu.positioning.scss │ │ ├── jquery.mmenu.themes.scss │ │ └── jquery.mmenu.widescreen.scss │ ├── inc │ │ ├── _colors.scss │ │ ├── _mixins.scss │ │ ├── _sizing.scss │ │ └── _variables.scss │ ├── jquery.mmenu.all.scss │ ├── jquery.mmenu.oncanvas.scss │ └── jquery.mmenu.scss ├── neat │ ├── _neat-helpers.scss │ ├── _neat.scss │ ├── functions │ │ ├── _new-breakpoint.scss │ │ └── _private.scss │ ├── grid │ │ ├── _box-sizing.scss │ │ ├── _direction-context.scss │ │ ├── _display-context.scss │ │ ├── _fill-parent.scss │ │ ├── _media.scss │ │ ├── _omega.scss │ │ ├── _outer-container.scss │ │ ├── _pad.scss │ │ ├── _private.scss │ │ ├── _row.scss │ │ ├── _shift.scss │ │ ├── _span-columns.scss │ │ ├── _to-deprecate.scss │ │ └── _visual-grid.scss │ └── settings │ │ ├── _disable-warnings.scss │ │ ├── _grid.scss │ │ └── _visual-grid.scss ├── normalize.scss ├── persons.scss └── syntax.scss ├── contribute ├── adding.html ├── editing.html ├── index.html └── markdown-examples.md ├── core-datasets ├── 
core-data-curators.md ├── core-datasets-roadmap.md ├── guide │ └── index.md ├── images │ ├── create-repository-init-readme.jpg │ ├── create-repository-name.jpg │ ├── export.jpg │ ├── issues.jpg │ ├── remote-v-links.jpg │ └── repo-create.jpg ├── index.md └── working-with-git.md ├── css ├── fonts │ ├── icomoon.eot │ ├── icomoon.svg │ ├── icomoon.ttf │ └── icomoon.woff ├── main.scss └── widescreen.scss ├── data ├── archive │ ├── data-wrangling-intro.md │ ├── howtogetdata.md │ ├── mapping.md │ └── scraping.md ├── csv.md ├── glossary.md ├── images │ ├── gssImportFormula.jpg │ ├── gssImportFormulaFull.jpg │ ├── gssImportedHTMLTable.jpg │ └── wikipediaTable.jpg ├── index.md ├── patterns │ ├── archiving-twitter.md │ ├── choropleth-maps-from-spreadsheets.md │ ├── cleaning-data-scraped-from-the-web.md │ ├── cleaning-data-with-refine.md │ ├── cleaning-data-with-spreadsheets.md │ ├── cleaning-spending-data-open-refine.md │ ├── datahub.md │ ├── extracting-data-from-pdf-with-tabula.md │ ├── filtering-data-with-spreadsheets.md │ ├── foi.md │ ├── formulae-with-spreadsheets.md │ ├── geo-googledocs.md │ ├── geocoding.md │ ├── getting-data-from-world-bank.md │ ├── how-to-find-data.md │ ├── index.md │ ├── interactive_bubble_charts.md │ ├── intro-to-apis.md │ ├── introduction-to-html.md │ ├── liberating-access-databases.md │ ├── liberating-html-tables.md │ ├── line-charts-from-spreadsheets.md │ ├── mirroring.md │ ├── publishing-work-online.md │ ├── scatterplots-from-spreadsheets.md │ ├── scraper-extension-for-chrome.md │ ├── scraping-beyond-the-basics.md │ ├── scraping-multiple-pages-with-refine-and-scraper.md │ ├── sorting-data-with-spreadsheets.md │ └── sql-output-csv-from-postgresql.md ├── sql-for-data-manipulation.md └── tutorial │ └── index.md ├── favicon.ico ├── feed.xml ├── glossary └── index.md ├── img ├── data-wrench.png ├── home-banner-bg.jpg ├── home-banner.jpg ├── home-banner_2x.jpg ├── open-content.svg ├── open-knowledge-large.png ├── open-knowledge.png └── supporters │ 
├── 4ip-footer.png │ ├── hewlett-footer.png │ ├── kf-footer.png │ ├── on-footer.png │ └── osf-footer.png ├── index.html ├── js ├── apps.js ├── home.js ├── main.js ├── plugins.js ├── resources.js └── vendor │ ├── isotope.pkgd.min.js │ ├── jquery-1.11.2.min.js │ ├── jquery.adaptive-backgrounds.min.js │ └── modernizr-2.8.3.min.js ├── start └── index.md ├── tile-wide.png └── tile.png /.gitignore: -------------------------------------------------------------------------------- 1 | _site/* 2 | .sass-cache/ -------------------------------------------------------------------------------- /404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 |Sorry, but the page you were trying to view does not exist.
58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # A sample Gemfile 2 | source "https://rubygems.org" 3 | 4 | gem 'github-pages' 5 | gem 'jekyll-redirect-from' 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Github repo for [Data Patterns][dp] - a collection of tips and tricks for data 2 | work. 3 | 4 | [dp]: http://okfnlabs.org/datapatterns/ 5 | 6 | 7 | 8 | [](https://gitter.im/okfn/chat?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 9 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | title: Open Knowledge Labs Handbook 2 | short_title: Labs Handbook 3 | description: "A collection of guides and advice to participate in Open Knowledge Labs" 4 | baseurl: "/handbook" 5 | issues_url: http://github.com/okfn/handbook/issues 6 | url: "http://datapatterns.org/" 7 | github_username: okfn 8 | github_repo: handbook 9 | paypal: admin@okfn.org 10 | contact: "https://discuss.okfn.org/c/open-knowledge-labs" 11 | markdown: kramdown 12 | highlighter: rouge 13 | gems: 14 | - jekyll-redirect-from 15 | devs: 16 | - 17 | name: Dan Fowler 18 | github: danfowler 19 | 20 | name: Gustavo Silva 21 | github: gsilvapt 22 | 23 | googleanalytics: UA-8271754-43 24 | permalink: pretty 25 | openknowledgeribbon: true 26 | 27 | defaults: 28 | - 29 | scope: 30 | path: "" # an empty string here means all files in the project 31 | values: 32 | layout: "page" 33 | edit: true 34 | - 35 | scope: 36 | path: "data/patterns" 37 | values: 38 | section: "patterns" 39 | 40 | -------------------------------------------------------------------------------- /_includes/footer.html: 
-------------------------------------------------------------------------------- 1 | 56 | -------------------------------------------------------------------------------- /_includes/head.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |Thank you for your interest in helping to build The Open Data Handbook. We warmly welcome comments, corrections and additions, as well as suggestions for additional sections and areas to examine. For general discussion about the Handbook, please get in touch. To jump in with improvements and additions, read on.
15 | 16 |In order to contribute, you need a little insight into how things work under the hood. We’re not going to go into too much detail here, but there are three components you need some understanding of.
18 | 19 | 24 | 25 |GitHub is a web-based repository hosting service, which amongst other things offers revision control and source code management via a web-based graphical interface.
31 |Any changes you wish to make, whether they be edits to an existing page, or creating a new one, will most likely be done via the Github website (it is also possible to download and edit the files on your local machine; instructions for this method will be added in the future). All the files for this site can be browsed and edited on the Github website. You will need to sign up for a (free) Github account. For full instructions, see Editing a page.
34 | 35 |Jekyll is a static site generator, which allows us to host websites based on our GitHub repositories. Jekyll takes the content, renders Markdown, and produces a complete, static website ready to be viewed on the web.
41 |All you really need to know about Jekyll is the method it uses to include metadata (i.e. the page title). Each page needs to start with a section it calls Front Matter, containing the page title. An example is provided in the Adding a page section.
44 | 45 |Markdown is a markup language with plain text formatting, designed so that it can be converted to HTML. Markdown can be used to create rich text using a plain text editor.
51 |Markdown is your key to formatting the text you provide for this site. By learning a few intuitive rules you’ll be able to ensure your text is formatted with headings, lists, quotes etc., without writing any HTML. For examples, head to the Markdown section.
54 |My HTML page with an embedded map
54 | 55 | 56 | 57 | ``` 58 | 59 | You should see a map with a marker! 60 | 61 | **TODO**: screenshot image 62 | 63 | This example HTML and javascript makes use of the open source javascript mapping library called 'OpenLayers' and the open licensed maps from [OpenStreetMap.org](https://www.openstreetmap.org/). 64 | 65 | In the javascript code we see how to initialise a map object which will appear within a div on your HTML page. A LonLat object is created to represent the centre point of the map. Try playing with the latitude, longitude values. A call to `transform` sorts out the projections, and we use this same location to place a marker. 66 | 67 | 68 | 69 | Geo-locations Latitude and Longitude 70 | ------------------------------------ 71 | 72 | **TODO** 73 | -------------------------------------------------------------------------------- /data/images/gssImportFormula.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/handbook/37d6a29e05261e222d88bb2eb98f8fc8d304106d/data/images/gssImportFormula.jpg -------------------------------------------------------------------------------- /data/images/gssImportFormulaFull.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/handbook/37d6a29e05261e222d88bb2eb98f8fc8d304106d/data/images/gssImportFormulaFull.jpg -------------------------------------------------------------------------------- /data/images/gssImportedHTMLTable.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/handbook/37d6a29e05261e222d88bb2eb98f8fc8d304106d/data/images/gssImportedHTMLTable.jpg -------------------------------------------------------------------------------- /data/images/wikipediaTable.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/okfn/handbook/37d6a29e05261e222d88bb2eb98f8fc8d304106d/data/images/wikipediaTable.jpg -------------------------------------------------------------------------------- /data/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Welcome to the Data Wrangling Handbook! 3 | --- 4 | 5 | *Data wrangling for fun and profit* 6 | 7 | This handbook is not a finished document but a collection of opinions and evolving best practices. The purpose is not to present all available options and technologies but to pick one and follow it through. 8 | 9 | The Handbook is also a collaborative effort: if you have a recipe, a tool or a howto and would like to share them, please contribute a patch or make a suggestion. 10 | 11 | The Handbook consists of two main parts: 12 | 13 | * Guides and Tutorials: Guides and tutorials that walk you through the main aspects of data wrangling 14 | * [Patterns](patterns/): A set of "patterns" or recipes for doing specific tasks from scraping an HTML table to geocoding in a spreadsheet 15 | 16 | ## Guides and Tutorials 17 | 18 | * [Walk-through of some Data Wrangling basic tasks]({{site.baseurl}}/data/tutorial/) 19 | * [SQL for Data Manipulation](sql-for-data-manipulation/) 20 | * [Introduction CSV - the Lingua Franca of Data](csv/) 21 | * [Glossary of Terms](glossary/) 22 | 23 | ## Contributing 24 | 25 | * Edit directly on the [Handbook Github Repository](https://github.com/okfn/handbook) 26 | * Submit an issue to our [Issue Tracker](https://github.com/okfn/handbook/issues) 27 | 28 | -------------------------------------------------------------------------------- /data/patterns/archiving-twitter.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Archiving Twitter 3 | --- 4 | 5 | Twitter data is only available via the search API for up to 7 days. Data for a given account only goes back a few thousand tweets. 
Thus archiving tweets can be a useful activity. This entry details a few options and in the process shows some neat tips and tricks for pulling down data. 6 | 7 | Using twarc 8 | ----------- 9 | 10 | [twarc](https://github.com/edsu/twarc) is a powerful command line tool and Python library for archiving Twitter JSON data. You will need to obtain a free API key from Twitter in order to start archiving tweets. 11 | 12 | 13 | Using Javascript and the DataHub 14 | -------------------------------- 15 | 16 | See https://github.com/OKFN-BR/BusaoSP/blob/master/getdata.js 17 | -------------------------------------------------------------------------------- /data/patterns/choropleth-maps-from-spreadsheets.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Creating a Choropleth map 3 | --- 4 | 5 | **This tutorial uses Google spreadsheets to create a choropleth map. There is sample data for this tutorial [`here`](http://dump.tentacleriot.eu/wb-gdp-health-life.csv).** 6 | 7 | 1. Filter for a single year (e.g. 2009), insert a new sheet and copy the filtered data into it. 8 | 2. As with all previous charts also here the columns need to be in a special position. 9 | 3. Move your data column (the one you want to use to display) right next to the country names. 10 | 11 |  12 | 13 | 1. Now mark the two columns and select “Chart...” from “insert”. 14 | 2. Under “Charts” select “Map” and then “geo chart - regions”. 15 | 3. You’ll see a preview. Play with the settings in customize to change the map, the colour-scale etc. 16 | 4. A note on colours: the red-green scale that is selected by default is not the best scale. So select a different one showing contrasts nicely. 
17 | 18 | -------------------------------------------------------------------------------- /data/patterns/extracting-data-from-pdf-with-tabula.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Extracting Data from PDFs using Tabula 3 | --- 4 | 5 | PDFs can be all forms and shapes - if you’re facing a nicely formatted 6 | PDF that is not scanned give Tabula a shot to extract the information. 7 | How? Read the short walkthrough below: 8 | 9 | You’ll need: 10 | 11 | - [Tabula](http://jazzido.github.io/tabula/) 12 | - a PDF: e.g. [http://www.unhabitat.org/pmss/getElectronicVersion.aspx?nr=3387&alt=1](http://www.unhabitat.org/pmss/getElectronicVersion.aspx?nr=3387&alt=1) 13 | 14 | Walkthrough: Extracting data from PDF tables 15 | --------------------------------------------- 16 | 17 | 1. Download the PDF at: [http://www.unhabitat.org/pmss/getElectronicVersion.aspx?nr=3387&alt=1](http://www.unhabitat.org/pmss/getElectronicVersion.aspx?nr=3387&alt=1) 18 | 2. Start Tabula (most likely by double clicking on the tabula icon) 19 | 3. Point your browser to [http://127.0.0.1:8080](http://127.0.0.1:8080) 20 | 4. Choose the file you want to upload and click Submit 21 |  22 | 5. Wait until the PDF is fully loaded 23 | 6. Scroll down to page 167 - we’ll extract that table. 24 | 7. Click and pull a selection box over the table 25 |  26 | 8. A window will pop up to show how Tabula would extract the data. 27 |  28 | 9. Now download the Data as CSV 29 |  30 | 10. Fantastic, you liberated the table from the PDF. Quick and easy, wasn’t it? 31 | 32 | 33 | -------------------------------------------------------------------------------- /data/patterns/filtering-data-with-spreadsheets.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Filtering Data 3 | --- 4 | 5 | **This tutorial uses Google spreadsheets to filter data. 
Other spreadsheet programs work in a similar way - play around and see how they differ.** 6 | 7 | There is sample data for this tutorial [here](http://dump.tentacleriot.eu/wb-gdp-health-life.csv). 
8 | 9 | Filtering data in a spreadsheet allows you to shut out the values you don't want to see. For example, in the sample data, some “Country Names” are actually not countries. You'll find things like “World”, “North America” and “Arab World”. Let's filter them out. 10 | 11 | Walkthrough: Filtering Data 12 | --------------------------- 13 | 14 | 1. Select the whole table. 15 | 2. Select “Filter” from the “Data” menu. 16 | 3. You now should see triangles next to the column names in the first row. 17 | 4. Click on the triangle next to country name. 18 | 5. You should see a long list of country names in the box. 19 |  20 | 6. Find those that are not a country and click on them (the green check mark will disappear). 21 | 7. Now you have successfully filtered your dataset. 22 | 8. Go ahead and play with it - the data will not be deleted, it’s just not displayed. 23 | 24 | 25 | -------------------------------------------------------------------------------- /data/patterns/geo-googledocs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Geocoding Data in a Google Docs Spreadsheet 3 | --- 4 | 5 | A very common need is to geocode data in a Google Spreadsheet (for example, in creating TimeMaps with the [Timeliner](http://timeliner.okfnlabs.org/) project). There are several options here: 6 | 7 | (1) By hand – use a Geocoding service (see the [course](http://schoolofdata.org/handbook/courses/geocoding/) on geocoding) and then copy and paste by hand. (2) Use the ImportXML (or ImportCSV) formulae to grab data from a geocoding service – great but with limitations on the number of rows you can code at one time (\~50). (3) Use a Google App Script – the most powerful but requires installation of an App Script in your spreadsheet. In this tutorial I’m going to cover the latter two automated options and specifically focus on option 2. 
8 | 9 | Using Formulas All of the following is illustrated live in this [google spreadsheet](https://docs.google.com/a/okfn.org/spreadsheet/ccc?key=0AqR8dXc6Ji4JdHBhY25yQkpHWF9NcEt1d3hrU0JWcUE#gid=0). 10 | 11 | We start with a formula like the following: 12 | ```excel 13 | =ImportXML("http://open.mapquestapi.com/nominatim/v1/search?format=xml&q=London", "//place[1]/@lat") 14 | ``` 15 | This formula uses the ImportXML function to look up XML data from the [Mapquest Nominatim geocoding service](http://open.mapquestapi.com/nominatim/) (see the previous tutorial for more about geocoding services). The first argument to ImportXML is the URL to fetch (in this case the results from querying the geocoding service) and the second part is an XPath expression to select data from that returned XML. In this case, the XPath looks up the first place object in the results: place[1] and then gets the lat (latitude) attribute. To understand this more clearly, here’s the XML returned by that XML query: 16 |  17 | 18 | In reality we want both latitude and longitude, so let’s change it to: 19 | ```excel 20 | =ImportXML("http://open.mapquestapi.com/nominatim/v1/search?format=xml&q=London", "//place[1]/@lat | //place[1]/@lon") 21 | ``` 22 | 23 | This uses the XPath union (“or”) operator `|`, and the result will now be an array of results that Google Docs will put in 2 cells (one below another). You can see this in Column C of the example spreadsheet. 24 | 25 | What happens if we wanted the data in just one cell, with the two values separated by commas, for example? We could use the JOIN function: 26 | ```excel 27 | =JOIN(",", ImportXML("http://open.mapquestapi.com/nominatim/v1/search?format=xml&q=London", "//place[1]/@lat | //place[1]/@lon")) 28 | ``` 29 | Lastly, we’d like to geocode based on a place name in another cell in the spreadsheet. 
To do this we just need to add the place name to our API request to MapQuest’s Nominatim service using the CONCATENATE function (this example assumes the value is in cell A2): 30 | ```excel 31 | =ImportXML(CONCATENATE("http://open.mapquestapi.com/nominatim/v1/search?format=xml&q=", A2), "//place[1]/@lat") 32 | =JOIN(",", ImportXML(CONCATENATE("http://open.mapquestapi.com/nominatim/v1/search?format=xml&q=",A2), "//place[1]/@lat | //place[1]/@lon")) 33 | ``` 34 | 35 | App Script 36 | ---------- 37 | 38 | If you want an even more powerful approach you can use a Google App Script. In particular, Development Seed’s [MapBox](http://developmentseed.org/) team have prepared a great ready-made Google AppScript that will do geocoding for you. 39 | 40 | Find the [script plus instructions](https://github.com/mapbox/geo-googledocs) online. 41 | 42 | 43 | -------------------------------------------------------------------------------- /data/patterns/geocoding.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Geocoding / Georeferencing Data 3 | --- 4 | 5 | Geo-Googledocs 6 | -------------- 7 | 8 | * Google docs app script allowing you to do the following with your google docs spreadsheet: 9 | 10 | * Export to GeoJSON 11 | * Geocode arbitrary addresses 12 | 13 | * Code: [https://github.com/mapbox/geo-googledocs](https://github.com/mapbox/geo-googledocs) 14 | * Docs: [http://developmentseed.org/blog/2011/10/12/mapping-google-doc-spreadsheet/](http://developmentseed.org/blog/2011/10/12/mapping-google-doc-spreadsheet/) 15 | * Author: MapBox (DevelopmentSeed) 16 | 17 | -------------------------------------------------------------------------------- /data/patterns/how-to-find-data.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: How to find data 3 | --- 4 | 5 | If you are looking for further inspiration on how to find data beyond the ways highlighted in the 
[course](http://schoolofdata.org/handbook/courses/finding-data/) on finding data, read on! 
6 | 7 | Edited directories 8 | ------------------ 9 | 10 | One of the [largest directories of open data repositories](http://oad.simmons.edu/oadwiki/Data_repositories) is provided by the [Open Access Directory](http://oad.simmons.edu/oadwiki/About_OAD). Its collection is mostly focused on scientific or research data and is curated by topic area. Topics covered in the directory include archaeology, astronomy, biology, chemistry, computer science, energy, environmental sciences, earth sciences, linguistics, marine sciences, medicine, physics and social sciences. 11 | 12 | [CKAN](http://ckan.net) is a directory that largely works through wiki-like edits. One of the benefits of CKAN is that it has well developed client libraries that enable you to programmatically access information about each of the datasets within its directory. For example, it is easy to ask it to tell you which datasets have been released into the public domain. 13 | 14 | [Quora](http://www.quora.com) has actually become a great source of information about where to find data on specific topic areas. It has several questions related to this topic which are being continually updated. Some examples include: 15 | 16 | - [What are some free, public data sets?](http://www.quora.com/Data/What-are-some-free-public-data-sets) 17 | - [Where can I get large datasets open to the public?](http://www.quora.com/Data/Where-can-I-get-large-datasets-open-to-the-public) 18 | 19 | 20 | -------------------------------------------------------------------------------- /data/patterns/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Patterns 3 | --- 4 | 5 | This section contains small snippets that will help you in the process of data wrangling. They might be small useful tips or full-blown tutorials on tools or topics. 6 | 7 | **Note on the term 'pattern'** 8 | 9 | The term pattern has developed a very specific meaning in software engineering. 
While we use the term in this sense, the tricks presented are not defined as a pattern using any of the formal templates that have developed for software design patterns. 10 | 11 |` this defines a paragraph. Go ahead and edit it! 15 | 5. You can click on `Publish page` to see what your page will look like (approximately). 16 | 6. On the top you'll always have the possibility to go back and edit. 17 | 7. Now let's add some charts we made. 18 | 8. Go back to one of the charts in the spreadsheet. 19 | 9. Click on the chart. See the small triangle top right of the chart: this is the options menu. 20 | 10. Go and select `Publish chart…`. There will be a popup with a lot of code in a grey box: 21 |  22 | 11. Copy this code and paste it into the pastehtml (somewhere between `
` and ``). Now if you go and look at your page, the chart should be there. 23 |  24 | 12. Once you are finished, click on publish and you'll get a URL to your webpage. Use this to share your results with your friends. 25 | 26 | Of course if you already have a blog or something similar you can share the results there. 27 | 28 | 29 | -------------------------------------------------------------------------------- /data/patterns/scatterplots-from-spreadsheets.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Walkthrough: Scatterplot" 3 | --- 4 | 5 | **This tutorial uses Google spreadsheets to create a scatterplot. There is sample data for this tutorial 6 | [here](http://dump.tentacleriot.eu/wb-gdp-health-life.csv).** 7 | 8 | So let’s create a scatterplot. 9 | 10 | 1. Start with World Bank Data. 11 | 2. Copy it to a new sheet and put the columns “healthcare expenditure total per person” and “life expectancy” next to each other. 12 |  13 | 3. Click `insert` `charts...` and select “scatter plot” from charts. 14 | 4. Select the first one, since this is what we want to do. 15 | 5. And there you go: simply adapt the scatterplot so it looks nice. 16 | Don’t forget to label axes. Try to make the dots smaller if there is significant overlap. 17 | 18 | -------------------------------------------------------------------------------- /data/patterns/sorting-data-with-spreadsheets.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Sorting Data with Spreadsheets 3 | --- 4 | 5 | **This tutorial uses Google spreadsheets to sort data. Other spreadsheet programs work in a similar way - play around and see how they differ.** 6 | 7 | There is sample data for this tutorial [here](http://dump.tentacleriot.eu/wb-gdp-health-life.csv). 8 | 9 | Walkthrough: Sorting a dataset. 10 | ------------------------------- 11 | 12 | 1. Select the whole sheet you want to sort. 
Do this by clicking on the right upper grey field, between the row and column names. 13 |  14 | 2. Select “Sort Range...” from the “Data” menu – this will open an additional Selection 15 | 3. Check the “Data has header row” checkbox 16 |  17 | 4. Select the column you want to sort by in the dropdown menu 18 | 5. Try to sort by GDP – Which country has the lowest? 19 | 6. Try again with different values, can you sort ascending and descending? 20 | 21 | **Tip:** Be careful! A common mistake is to forget to select *all* the data. If you sort without selecting all the data, the rows will no longer match up. 22 | 23 | 24 | -------------------------------------------------------------------------------- /data/patterns/sql-output-csv-from-postgresql.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Outputting CSV from Postgres 3 | --- 4 | 5 | ``` 6 | \f ',' 7 | \a 8 | \t 9 | \o /path/to/my.csv 10 | SELECT field1,field2 FROM some_table; 11 | \o 12 | 13 | ``` 14 | -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okfn/handbook/37d6a29e05261e222d88bb2eb98f8fc8d304106d/favicon.ico -------------------------------------------------------------------------------- /feed.xml: -------------------------------------------------------------------------------- 1 | --- 2 | layout: null 3 | --- 4 | 5 |Get a quick intro to Labs and its projects and find out how you can get involved and gain skills.
20 | Read more 21 |Find out how to find, clean and analyse data with the handbook.
27 | Read more 28 |Find out about the core datasets projects and how to start contributing Data Packages.
35 | Read more 36 |