├── VERSION ├── .gitignore ├── test ├── dates.txt ├── test.rb └── operations.json ├── Rakefile ├── LICENSE ├── google-refine.gemspec ├── README.textile └── lib └── google-refine.rb /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store -------------------------------------------------------------------------------- /test/dates.txt: -------------------------------------------------------------------------------- 1 | Date 2 | 7 December 2001 3 | July 1 2002 4 | 10/20/10 -------------------------------------------------------------------------------- /test/test.rb: -------------------------------------------------------------------------------- 1 | load '../lib/refine.rb' 2 | 3 | prj = Refine.new('date cleanup', 'dates.txt') 4 | prj.apply_operations('operations.json') 5 | puts prj.export_rows('csv') 6 | prj.delete_project -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'rake' 3 | 4 | begin 5 | require 'jeweler' 6 | Jeweler::Tasks.new do |gem| 7 | gem.name = "google-refine" 8 | gem.summary = %Q{Client library for interacting with Google Refine instances} 9 | gem.description = %Q{Client library for interacting with Google Refine instances} 10 | gem.email = "max@maxogden.com" 11 | gem.homepage = "http://github.com/maxogden/refine-ruby" 12 | gem.authors = ["Max Ogden"] 13 | gem.add_dependency "json", ">= 1.4.6" 14 | gem.add_dependency "httpclient", ">= 2.1.6.1" 15 | end 16 | Jeweler::GemcutterTasks.new 17 | rescue LoadError 18 | puts "Jeweler (or a dependency) not available. 
Install it with: gem install jeweler" 19 | end -------------------------------------------------------------------------------- /test/operations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "op": "core/text-transform", 4 | "description": "Text transform on cells in column Date using expression grel:value.toDate()", 5 | "engineConfig": { 6 | "facets": [], 7 | "mode": "row-based" 8 | }, 9 | "columnName": "Date", 10 | "expression": "grel:value.toDate()", 11 | "onError": "set-to-blank", 12 | "repeat": false, 13 | "repeatCount": 10 14 | }, 15 | { 16 | "op": "core/text-transform", 17 | "description": "Text transform on cells in column Date using expression grel:value.datePart(\"year\")", 18 | "engineConfig": { 19 | "facets": [], 20 | "mode": "row-based" 21 | }, 22 | "columnName": "Date", 23 | "expression": "grel:value.datePart(\"year\")", 24 | "onError": "set-to-blank", 25 | "repeat": false, 26 | "repeatCount": 10 27 | } 28 | ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 David Huynh & Max Ogden 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /google-refine.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = %q{google-refine} 8 | s.version = "0.1.1" 9 | s.authors = ["Max Ogden", "Michael Bianco"] 10 | s.summary = %q{Client library for interacting with Google Refine instances} 11 | s.description = %q{Client library for interacting with Google Refine instances. 
Easily work with CSVs from the command line} 12 | s.email = ['max@maxogden.com', 'info@cliffsidedev.com'] 13 | s.extra_rdoc_files = [ 14 | "LICENSE", 15 | "README.textile" 16 | ] 17 | s.files = [ 18 | "LICENSE", 19 | "README.textile", 20 | "Rakefile", 21 | "VERSION", 22 | "google-refine.gemspec", 23 | "lib/google-refine.rb", 24 | "test/dates.txt", 25 | "test/operations.json", 26 | "test/test.rb" 27 | ] 28 | s.homepage = "http://github.com/maxogden/refine-ruby" 29 | s.require_paths = ["lib"] 30 | s.test_files = [ 31 | "test/test.rb" 32 | ] 33 | 34 | s.add_dependency('json', ">= 1.4.6") 35 | s.add_dependency('httpclient', ">= 2.1.6.1") 36 | end 37 | 38 | -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 |

google-refine

is a Ruby Gem client library for "Google Refine":http://code.google.com/p/google-refine/ 2 | 3 | If you want to port this to another language, check out the "Refine API":https://github.com/maxogden/refine-python/wiki/Refine-API documentation. 4 | 5 | h2. Install 6 | 7 | @gem install google-refine@ 8 | 9 | h2. Example 10 | 11 | Given that you have the following raw data: 12 | 13 |
14 |   
15 |     Date
16 |     7 December 2001
17 |     July 1 2002
18 |     10/20/10
19 |   
20 | 
21 | 22 | Google Refine lets you clean up the data and export your operation history as a JSON instruction set. Here is an example that extracts the year from the above dates: 23 | 24 |
25 | 
26 |   [
27 |     {
28 |       "op": "core/text-transform",
29 |       "description": "Text transform on cells in column Date using expression grel:value.toDate()",
30 |       "engineConfig": {
31 |         "facets": [],
32 |         "mode": "row-based"
33 |       },
34 |       "columnName": "Date",
35 |       "expression": "grel:value.toDate()",
36 |       "onError": "set-to-blank",
37 |       "repeat": false,
38 |       "repeatCount": 10
39 |     },
40 |     {
41 |       "op": "core/text-transform",
42 |       "description": "Text transform on cells in column Date using expression grel:value.datePart(\"year\")",
43 |       "engineConfig": {
44 |         "facets": [],
45 |         "mode": "row-based"
46 |       },
47 |       "columnName": "Date",
48 |       "expression": "grel:value.datePart(\"year\")",
49 |       "onError": "set-to-blank",
50 |       "repeat": false,
51 |       "repeatCount": 10
52 |     }
53 |   ]
54 | 
55 | 
56 | 57 | You can use this gem to apply the operation set to the raw data from ruby. You will need to have Google Refine running on your local computer, or specify an external address (see source): 58 | 59 |
60 |   
61 |     prj = Refine.new("project_name" => 'date cleanup', "file_name" => 'dates.txt')
62 |     prj.apply_operations('operations.json')
63 |     puts prj.export_rows("format" => 'csv')
64 |     prj.delete_project
65 |   
66 | 
67 | 68 | Which outputs: 69 | 70 |
71 |   
72 |     Date
73 |     2001
74 |     2002
75 |     2010
76 |   
77 | 
# README.textile (closing section, preserved from the flattened dump):
#
#   h2. Copyright
#
#   Copyright (c) 2011 David Huynh and Max Ogden. See LICENSE for details.
#
# /lib/google-refine.rb:

require 'httpclient'
require 'cgi'
require 'json'

# Client for the HTTP command API of a Google Refine server.
#
# An instance is bound to one project: either a project created by uploading a
# file, or an existing project looked up by id. Commands without a dedicated
# method are reachable through +method_missing+, which turns
# +get_column_info+ into a request for the +get-column-info+ command.
#
# All option hashes use String keys ("server", "project_name", ...).
class Refine
  # Address used when no "server" option is given.
  DEFAULT_SERVER = "http://127.0.0.1:3333".freeze

  attr_reader :project_name
  attr_reader :project_id

  # Fetches the metadata of every project known to the server.
  #
  # @param server [String] base URL of the Refine server
  # @return [Object] the parsed JSON body of the response
  def self.get_all_project_metadata(server = DEFAULT_SERVER)
    uri = "#{server}/command/core/get-all-project-metadata"
    # HTTPClient.new's first positional argument is a proxy URL, not a base
    # URL, so the server address must not be passed to it.
    response = HTTPClient.new.get(uri)
    JSON.parse(response.body)
  end

  # Creates a new project from a file, or attaches to an existing project.
  #
  # @param opts [Hash] String-keyed options:
  #   "server"            base URL of the Refine server (default DEFAULT_SERVER)
  #   "throws_exceptions" when false, API error responses are returned instead
  #                       of raised (default true)
  #   "project_name" and "file_name"  create a project by uploading the file
  #   "project_id"        attach to an existing project (used when the two
  #                       options above are not both present and non-empty)
  # @raise [ArgumentError] when neither a file/name pair nor a project id is given
  # @raise [RuntimeError] when the project cannot be created
  def initialize(opts = {})
    @server = opts["server"] || DEFAULT_SERVER
    # `opts[...] || true` could never yield false; fetch keeps an explicit false.
    @throws_exceptions = opts.fetch("throws_exceptions", true)

    if present?(opts["file_name"]) && present?(opts["project_name"])
      escaped_name = CGI.escape(opts["project_name"])
      @project_id = create_project(escaped_name, opts["file_name"])
      @project_name = escaped_name
    else
      @project_id = opts["project_id"]
      if @project_id.nil?
        raise ArgumentError,
              'expected "project_name" and "file_name", or "project_id"'
      end

      # Dispatched through method_missing to the get-project-metadata command.
      metadata = get_project_metadata
      @project_name = CGI.escape(metadata["name"])
    end
  end

  # Uploads a file to the server as a new project.
  #
  # The project id is read from the "project" query parameter of the
  # Location header in the server's response.
  #
  # @param project_name [String] name sent as the 'project-name' field
  # @param file_name [String] path of the file to upload
  # @return [String] id of the created project
  # @raise [RuntimeError] when the response carries no project id
  def create_project(project_name, file_name)
    uri = @server + "/command/core/create-project-from-upload"
    # Declared outside the block so the error message below can use it; a bare
    # `response` here would otherwise fall through to method_missing.
    response = nil
    project_id = nil

    File.open(file_name) do |file|
      body = {
        'project-file' => file,
        'project-name' => project_name
      }
      response = client.post(uri, body)
      location = response.header['Location'].first
      # A Location without a query string yields an empty parameter set
      # instead of a nil dereference.
      query = location.to_s.split('?', 2)[1]
      project_id = CGI.parse(query.to_s)['project'].first
    end

    raise "Error creating project: HTTP #{response.status}" unless project_id
    project_id
  end

  # Applies an operation history (the JSON exported by Refine) to the project.
  #
  # @param file_name_or_string [String] path of a JSON file, or the JSON itself
  # @return [Object] the parsed server response (see #call)
  def apply_operations(file_name_or_string)
    operations = if File.exist?(file_name_or_string)
                   File.read(file_name_or_string)
                 else
                   file_name_or_string
                 end

    # Send the operations that were read, not the path that was passed in.
    call('apply-operations', 'operations' => operations)
  end

  # Exports the project's rows.
  #
  # @param opts [Hash] String-keyed options: "format" (default 'tsv'),
  #   "facets" (default []), "options" (default '')
  # @return [String] the body of the export response
  def export_rows(opts = {})
    format = opts["format"] || 'tsv'
    uri = @server + "/command/core/export-rows/#{@project_name}.#{format}"

    engine = {
      "facets" => opts["facets"] || [],
      "mode" => "row-based"
    }

    body = {
      'engine' => engine.to_json,
      'options' => opts["options"] || '',
      'project' => @project_id,
      'format' => format
    }

    @response = client.post(uri, body)
    @response.content
  end

  # Deletes the project on the server.
  #
  # @return [Object, false] the "code" value of the JSON response, or false
  #   when the response body is not a JSON object
  def delete_project
    uri = @server + "/command/core/delete-project"
    @response = client.post(uri, 'project' => @project_id)

    begin
      parsed = JSON.parse(@response.content)
      parsed.is_a?(Hash) ? parsed['code'] : false
    rescue JSON::ParserError, TypeError
      # Unparseable or missing body: report failure, as before, without
      # hiding unrelated errors behind a blanket rescue.
      false
    end
  end

  # this pattern is pulled from mailchimp/mailchimp-gem

  # Sends one command to the server and returns the parsed response.
  #
  # Commands whose name starts with 'get-' are sent with GET, all others with
  # POST. The project id is always included and can be overridden by +params+.
  #
  # @param method [String, Symbol] command name, e.g. 'apply-operations'
  # @param params [Hash] request parameters
  # @return [Object] the parsed JSON response
  # @raise [RuntimeError] when exceptions are enabled and the response is a
  #   Hash whose "code" is "error"
  def call(method, params = {})
    command = method.to_s
    uri = "#{@server}/command/core/#{command}"
    params = { "project" => @project_id }.merge(params)

    response = if command.start_with?('get-')
                 client.get(uri, params)
               else
                 client.post(uri, params)
               end

    parsed = begin
      JSON.parse(response.body)
    rescue JSON::ParserError
      # Retry with the body wrapped in an array, for parsers that reject a
      # bare scalar document.
      JSON.parse('[' + response.body + ']').first
    end

    if @throws_exceptions && parsed.is_a?(Hash) && parsed["code"] == "error"
      raise "API Error: #{parsed}"
    end

    parsed
  end

  # Forwards unknown methods to the server as commands.
  # translate: get_column_info --> get-column-info
  def method_missing(method, *args)
    return super unless respond_to_missing?(method)

    call(method.to_s.tr('_', '-'), *args)
  end

  # Mirrors method_missing so respond_to? agrees with what can be called.
  #
  # Names starting with "to_" are excluded: Ruby probes for implicit
  # conversions such as to_ary and to_str (for instance in `puts` and
  # Array#flatten), and each probe would otherwise become an HTTP request.
  # NOTE(review): assumes no Refine command name starts with "to-" — confirm.
  def respond_to_missing?(method, include_private = false)
    !method.to_s.start_with?('to_') || super
  end

  protected

  # Lazily created HTTP client shared by all requests of this instance.
  def client
    # No positional argument: HTTPClient.new would treat it as a proxy URL.
    @client ||= HTTPClient.new
  end

  private

  # True when the value is neither nil nor empty.
  def present?(value)
    !value.nil? && !value.empty?
  end
end