├── VERSION ├── .gitignore ├── test ├── dates.txt ├── test.rb └── operations.json ├── Rakefile ├── LICENSE ├── google-refine.gemspec ├── README.textile └── lib └── google-refine.rb /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store -------------------------------------------------------------------------------- /test/dates.txt: -------------------------------------------------------------------------------- 1 | Date 2 | 7 December 2001 3 | July 1 2002 4 | 10/20/10 -------------------------------------------------------------------------------- /test/test.rb: -------------------------------------------------------------------------------- 1 | load '../lib/refine.rb' 2 | 3 | prj = Refine.new('date cleanup', 'dates.txt') 4 | prj.apply_operations('operations.json') 5 | puts prj.export_rows('csv') 6 | prj.delete_project -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'rake' 3 | 4 | begin 5 | require 'jeweler' 6 | Jeweler::Tasks.new do |gem| 7 | gem.name = "google-refine" 8 | gem.summary = %Q{Client library for interacting with Google Refine instances} 9 | gem.description = %Q{Client library for interacting with Google Refine instances} 10 | gem.email = "max@maxogden.com" 11 | gem.homepage = "http://github.com/maxogden/refine-ruby" 12 | gem.authors = ["Max Ogden"] 13 | gem.add_dependency "json", ">= 1.4.6" 14 | gem.add_dependency "httpclient", ">= 2.1.6.1" 15 | end 16 | Jeweler::GemcutterTasks.new 17 | rescue LoadError 18 | puts "Jeweler (or a dependency) not available. 
Install it with: gem install jeweler" 19 | end -------------------------------------------------------------------------------- /test/operations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "op": "core/text-transform", 4 | "description": "Text transform on cells in column Date using expression grel:value.toDate()", 5 | "engineConfig": { 6 | "facets": [], 7 | "mode": "row-based" 8 | }, 9 | "columnName": "Date", 10 | "expression": "grel:value.toDate()", 11 | "onError": "set-to-blank", 12 | "repeat": false, 13 | "repeatCount": 10 14 | }, 15 | { 16 | "op": "core/text-transform", 17 | "description": "Text transform on cells in column Date using expression grel:value.datePart(\"year\")", 18 | "engineConfig": { 19 | "facets": [], 20 | "mode": "row-based" 21 | }, 22 | "columnName": "Date", 23 | "expression": "grel:value.datePart(\"year\")", 24 | "onError": "set-to-blank", 25 | "repeat": false, 26 | "repeatCount": 10 27 | } 28 | ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 David Huynh & Max Ogden 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /google-refine.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = %q{google-refine} 8 | s.version = "0.1.1" 9 | s.authors = ["Max Ogden", "Michael Bianco"] 10 | s.summary = %q{Client library for interacting with Google Refine instances} 11 | s.description = %q{Client library for interacting with Google Refine instances. 
Easily work with CSVs from the command line} 12 | s.email = ['max@maxogden.com', 'info@cliffsidedev.com'] 13 | s.extra_rdoc_files = [ 14 | "LICENSE", 15 | "README.textile" 16 | ] 17 | s.files = [ 18 | "LICENSE", 19 | "README.textile", 20 | "Rakefile", 21 | "VERSION", 22 | "google-refine.gemspec", 23 | "lib/google-refine.rb", 24 | "test/dates.txt", 25 | "test/operations.json", 26 | "test/test.rb" 27 | ] 28 | s.homepage = "http://github.com/maxogden/refine-ruby" 29 | s.require_paths = ["lib"] 30 | s.test_files = [ 31 | "test/test.rb" 32 | ] 33 | 34 | s.add_dependency('json', ">= 1.4.6") 35 | s.add_dependency('httpclient', ">= 2.1.6.1") 36 | end 37 | 38 | -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 |
14 |
15 | Date
16 | 7 December 2001
17 | July 1 2002
18 | 10/20/10
19 |
20 |
21 |
22 | Google Refine lets you clean up the data and export your operation history as a JSON instruction set. Here is an example that extracts the year from the above dates:
23 |
24 |
25 |
26 | [
27 | {
28 | "op": "core/text-transform",
29 | "description": "Text transform on cells in column Date using expression grel:value.toDate()",
30 | "engineConfig": {
31 | "facets": [],
32 | "mode": "row-based"
33 | },
34 | "columnName": "Date",
35 | "expression": "grel:value.toDate()",
36 | "onError": "set-to-blank",
37 | "repeat": false,
38 | "repeatCount": 10
39 | },
40 | {
41 | "op": "core/text-transform",
42 | "description": "Text transform on cells in column Date using expression grel:value.datePart(\"year\")",
43 | "engineConfig": {
44 | "facets": [],
45 | "mode": "row-based"
46 | },
47 | "columnName": "Date",
48 | "expression": "grel:value.datePart(\"year\")",
49 | "onError": "set-to-blank",
50 | "repeat": false,
51 | "repeatCount": 10
52 | }
53 | ]
54 |
55 |
56 |
57 | You can use this gem to apply the operation set to the raw data from ruby. You will need to have Google Refine running on your local computer, or specify an external address (see source):
58 |
59 |
60 |
61 | prj = Refine.new("project_name" => 'date cleanup', "file_name" => 'dates.txt')
62 | prj.apply_operations('operations.json')
63 | puts prj.export_rows("format" => 'csv')
64 | prj.delete_project
65 |
66 |
67 |
68 | Which outputs:
69 |
70 |
71 |
72 | Date
73 | 2001
74 | 2002
75 | 2010
76 |
77 |
78 |
79 | h2. Copyright
80 |
81 | Copyright (c) 2011 David Huynh and Max Ogden. See LICENSE for details.
82 |
83 |
--------------------------------------------------------------------------------
/lib/google-refine.rb:
--------------------------------------------------------------------------------
1 | require 'httpclient'
2 | require 'cgi'
3 | require 'json'
4 |
5 | class Refine
6 | attr_reader :project_name
7 | attr_reader :project_id
8 |
9 | def self.get_all_project_metadata(server="http://127.0.0.1:3333")
10 | uri = "#{server}/command/core/get-all-project-metadata"
11 | response = HTTPClient.new(server).get(uri)
12 | JSON.parse(response.body)
13 | end
14 |
15 | def initialize(opts = {})
16 | @server = opts["server"] || "http://127.0.0.1:3333"
17 | @throws_exceptions = opts["throws_exceptions"] || true
18 |
19 | if opts["file_name"] && !opts["file_name"].empty? && opts["project_name"] && !opts["project_name"].empty?
20 | project_name = CGI.escape(opts["project_name"])
21 | @project_id = create_project(project_name, opts["file_name"])
22 | @project_name = project_name if @project_id
23 | else
24 | @project_id = opts["project_id"]
25 |
26 | metadata = self.get_project_metadata
27 | @project_name = CGI.escape(metadata["name"])
28 | end
29 | end
30 |
31 | def create_project(project_name, file_name)
32 | uri = @server + "/command/core/create-project-from-upload"
33 | project_id = false
34 | File.open(file_name) do |file|
35 | body = {
36 | 'project-file' => file,
37 | 'project-name' => project_name
38 | }
39 | response = client.post(uri, body)
40 | url = response.header['Location']
41 | unless url == []
42 | project_id = CGI.parse(url[0].split('?')[1])['project'][0]
43 | end
44 | end
45 | raise "Error creating project: #{response}" unless project_id
46 | project_id
47 | end
48 |
49 | def apply_operations(file_name_or_string)
50 | if File.exists?(file_name_or_string)
51 | operations = File.read(file_name_or_string)
52 | else
53 | operations = file_name_or_string
54 | end
55 |
56 | call('apply-operations', 'operations' => file_name_or_string)
57 | end
58 |
59 | def export_rows(opts={})
60 | format = opts["format"] || 'tsv'
61 | uri = @server + "/command/core/export-rows/#{@project_name}.#{format}"
62 |
63 | body = {
64 | 'engine' => {
65 | "facets" => opts["facets"] || [],
66 | "mode" => "row-based"
67 | }.to_json,
68 | 'options' => opts["options"] || '',
69 | 'project' => @project_id,
70 | 'format' => format
71 | }
72 |
73 | @response = client.post(uri, body)
74 | @response.content
75 | end
76 |
77 | def delete_project
78 | uri = @server + "/command/core/delete-project"
79 | body = {
80 | 'project' => @project_id
81 | }
82 | @response = client.post(uri, body)
83 | JSON.parse(@response.content)['code'] rescue false
84 | end
85 |
86 | # this pattern is pulled from mailchimp/mailchimp-gem
87 |
88 | def call(method, params = {})
89 | uri = "#{@server}/command/core/#{method}"
90 | params = { "project" => @project_id }.merge(params)
91 |
92 | response = if method.start_with?('get-')
93 | client.get(uri, params)
94 | else
95 | client.post(uri, params)
96 | end
97 |
98 | begin
99 | response = JSON.parse(response.body)
100 | rescue
101 | response = JSON.parse('[' + response.body + ']').first
102 | end
103 |
104 | if @throws_exceptions && response.is_a?(Hash) && response["code"] && response["code"] == "error"
105 | raise "API Error: #{response}"
106 | end
107 |
108 | response
109 | end
110 |
111 | def method_missing(method, *args)
112 | # translate: get_column_info --> get-column-info
113 | call(method.to_s.gsub('_', '-'), *args)
114 | end
115 |
116 | protected
117 | def client
118 | @client ||= HTTPClient.new(@server)
119 | end
120 | end
--------------------------------------------------------------------------------