├── .gitignore
├── LICENSE
├── README.md
└── access_checker.rb
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | output/
3 | *.csv
4 | *.txt
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013 University Library, University of North Carolina.
2 | Written by Kristina Spurgin.
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Access checker
2 | A simple JRuby script to check for full-text access to e-resource titles. Plain old URL/link checking won't alert you if one of your ebook links points to a valid HTML page reading "NO ACCESS." This script will.
3 |
4 | I wrote an article about the Access Checker: [Getting What We Paid for: a Script to Verify Full Access to E-Resources](http://journal.code4lib.org/articles/9684)
5 |
6 | NOTE: this Access Checker is unaware of the title > volume > issue > article hierarchy of e-journals, and doesn't have a way to input or make sense of holdings date ranges. It was designed to check access in collections of discrete items, each of which has a distinct URL---mainly ebook, streaming media, etc collections.
7 |
8 | **For list of collections/platforms/products supported, see [Access Checker wiki](https://github.com/UNC-Libraries/Access-Checker/wiki)**
9 |
10 | # Requirements
11 | - You must have [JRuby](http://jruby.org/) installed. This script has been tested on JRuby 1.7.3. Installing JRuby is super-easy; point-and-click .exe installers are available for Windows on the [JRuby homepage](http://jruby.org/).
12 |
13 | - Once JRuby is installed, you will need to install the JRuby Gems Celerity and Highline.
14 |
15 | To install these Gems, open the command line shell and type the following commands:
16 | - jruby -S gem install celerity
17 | - jruby -S gem install highline
18 |
19 | # Set up before first-time use
20 | ## Prepare your script directory
21 | Choose or create a directory/folder on your computer in which to place the access_checker.rb script. This directory can be called whatever you want, but here I'll call it the "rubyscripts" directory.
22 |
23 | **For the rest of the instructions, we'll assume the path of the rubyscripts folder is:** C:\Users\you\rubyscripts
24 |
25 | ## Download the script and put it in the rubyscripts directory
26 | * Go to https://github.com/UNC-Libraries/Access-Checker
27 | * Download ZIP file containing the files (bottom of right column)
28 | * Unzip the ZIP file on your computer
29 | * Put a copy of the access_checker.rb file from the unzipped directory into your rubyscripts directory: C:\Users\you\rubyscripts\access_checker.rb
30 |
31 | # How to use
32 | ## Prepare your input file
33 | The script expects a .csv file containing URLs for which to check access. The first row must be a header row. The column containing the URL **MUST** be the last/right-most column. You may include any number of columns (RecordID#, Title, Publication Date, etc.) to the left of the URL column.
34 | Make sure there is only **one** URL per row. To use a tab-delimited file as input, see **Optional arguments** below.
35 |
36 |
37 | All URLs/titles in one input file must be in/on the same package/platform.
38 |
39 | If your URLs are prefixed with proxy strings, and you are running the script from a location where proxying isn't needed for access, deleting the proxy strings from the URLs first will speed up the script. Use Excel Replace All to do this.
40 |
41 | **Put the input file in the rubyscripts directory. Example location: C:\Users\you\rubyscripts\inputfile.csv**
42 |
43 | ## Run the script
44 | * Open your command line shell (this will be Windows PowerShell for most Windows users)
45 | * In shell, move to the rubyscripts directory. Given the example locations listed above, you will type the following and then hit Enter:
46 | ```cd C:\Users\you\rubyscripts```
47 |
48 | In your command line shell, type (substitute in the name of your actual input file and the desired name for your actual output file):
49 |
50 | ```jruby -S access_checker.rb inputfile.csv outputfile.csv```
51 |
52 | You may run into trouble if the filenames or directory names you need to point the Access Checker to contain spaces. In this case, it may work if you enclose the input and output file names/paths with double quotes:
53 |
54 | ```jruby -S access_checker.rb "C:\Users\Your Name\access checker\inputfile.csv" "C:\Users\Your Name\access checker\outputfile.csv"```
55 |
56 | ### Optional arguments
57 | Include optional arguments like so:
58 | * jruby -S access_checker.rb [arguments] [input] [output]
59 | * for example: jruby -S access_checker.rb -t -b inputfile.txt outputfile.csv
60 |
61 | Options:
62 | * -t (or --tab_delimited):
63 |
64 | the input file is read as a tab-delimited file rather than a csv. If newlines or tabs are contained in the data fields themselves, this could cause errors.
65 |
66 | * -b (or --write_utf8_bom)
67 |
68 | when writing to a new (non-existing) output file, manually add a UTF-8 BOM (primary use case: allowing Excel to directly open the csv with proper encoding). Has no effect if appending to an existing output file.
69 |
70 | When asked to input "Package?" enter the short code (for example, `ebr` or `spr`) shown in the list printed above the input prompt.
71 |
72 | ## Output
73 | Script will output a .csv file containing all data from the input file, with a new "access" column appended.
74 |
75 | ## If the script chokes/dies (or you need to otherwise stop it) while running...
76 | You don't have to start over from the beginning. Remove all rows already checked (i.e. included in the output file) from the input file and restart the script, using the same output file location.
77 |
78 | The header row will be inserted into the output file again, so watch for that in the final results.
79 |
80 | # How it works
81 | First, this script does not access, download, or touch *ANY* actual full-text content hosted by our providers.
82 |
83 | It simply visits the landing/description/info page for each ostensibly full-text resource---the page a user clicking the link in a catalog record would be brought to, at the same URL that our ILS link checker would ping.
84 |
85 | Depending on the platform/package, it checks for text indicating full or restricted access a) displayed on that page; OR b) buried in the page source code.
86 |
--------------------------------------------------------------------------------
/access_checker.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | # Tested in JRuby 1.7.3
4 | # Written by Kristina Spurgin
5 |
6 | # Usage:
7 | # jruby -S access_checker.rb [arguments] [inputfilelocation] [outputfilelocation]
8 |
9 | # Input file:
10 | # .csv file with:
11 | # - one header row
12 | # - any number of columns to left of final column
13 | # - one URL in final column
14 | # - accepts tab-delimited files through use of arguments
15 |
16 | # Output file:
17 | # .csv file with all the data from the input file, plus a new column containing
18 | # access checker result
19 |
20 | # Optional arguments:
21 | # e.g. jruby -S access_checker.rb -t -b inputfile.txt outputfile.csv
22 | #
23 | # -t (or --tab_delimited):
24 | # The input file is read as a tab-delimited file rather than a csv. If
25 | # newlines or tabs are contained in the data fields themselves, this could
26 | # cause errors. Should work with UTF-8 or UTF-16LE ("Unicode") input files;
27 | # may not work with other encodings.
28 | #
29 | # -b (or --write_utf8_bom)
30 | # When writing to a new (non-existing) output file, manually add a UTF-8 BOM
31 | # (primary use case: allowing Excel to directly open the csv with proper
32 | # encoding). Has no effect if appending to an existing output file.
33 | #
34 |
35 | require 'celerity'
36 | require 'csv'
37 | require 'highline/import'
38 | require 'open-uri'
39 |
40 | puts "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
41 | puts "What platform/package are you access checking?"
42 | puts "Type one of the following:"
43 | puts " asp : Alexander Street Press links"
44 | puts " alman : Al Manhal"
45 | puts " apb : Apabi ebooks"
46 | puts " brep : Brepols (brepolsonline.net)"
47 | puts " cup : Cambridge University Press"
48 | puts " ciao : Columbia International Affairs Online"
49 | puts " cod : Criterion on Demand"
50 | puts " dgry : De Gruyter ebook platform"
51 | puts " dgtla : Digitalia ebooks"
52 | puts " dram : DRAM"
53 | puts " dupsc : Duke University Press (via Silverchair)"
54 | puts " eai : Early American Imprints (Readex)"
55 | puts " ebr : Ebrary links"
56 | puts " ebs : EBSCOhost ebook collection"
57 | puts " end : Endeca - Check for undeleted records"
58 | puts " fmgfod : FMG Films on Demand"
59 | puts " ieee : IEEE"
60 | puts " igi : IGI Global"
61 | puts " kan : Kanopy Streaming Video"
62 | puts " knv : Knovel"
63 | puts " lion : LIterature ONline (Proquest)"
64 | puts " nccorv : NCCO - Check for related volumes"
65 | puts " obo : Oxford Bibliographies Online"
66 | puts " oho : Oxford Handbooks Online"
67 | puts " psynet : Psychotherapy.net videos"
68 | puts " sabov : Sabin Americana - Check for Other Volumes"
69 | puts " skno : SAGE Knowledge links"
70 | puts " srmo : SAGE Research Methods Online links"
71 | puts " scid : ScienceDirect ebooks (Elsevier)"
72 | puts " siam : SIAM: Society for Industrial and Applied Mathmatics"
73 | puts " ss : SerialsSolutions links"
74 | puts " spr : SpringerLink links"
75 | puts " uncfa : UNC Finding Aids"
76 | puts " upso : University Press (inc. Oxford) Scholarship Online links"
77 | puts " waf : Wright American Fiction"
78 | puts " wol : Wiley Online Library"
79 | puts "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
80 |
81 | package = ask("Package? ")
82 | # Only a SpringerLink ("spr") run gets the follow-up question; otherwise nil.
83 | ebk_pkg_prompt = "Do you also want to retrieve subject module/ebook package for each title? y/n "
84 | get_ebk_pkg = package == "spr" ? ask(ebk_pkg_prompt) : nil
85 |
86 | puts "\nPreparing to check access...\n"
87 |
88 | if ARGV.include?('-t') || ARGV.include?('--tab_delimited')
89 | input_is_tab_delimited = true
90 | ARGV.delete('-t')
91 | ARGV.delete('--tab_delimited')
92 | else
93 | input_is_tab_delimited = false
94 | end
95 |
96 | if ARGV.include?('-b') || ARGV.include?('--write_utf8_bom')
97 | write_utf8_bom = true
98 | ARGV.delete('-b')
99 | ARGV.delete('--write_utf8_bom')
100 | else
101 | write_utf8_bom = false
102 | end
103 |
104 | input = ARGV[0]
105 | output = ARGV[1]
106 |
107 |
108 | if input_is_tab_delimited
109 | begin
110 | # attempt to read the file using default quote_char
111 | csv_data = CSV.read(input,
112 | :headers => true,
113 | :col_sep => "\t")
114 | rescue CSV::MalformedCSVError
115 | begin
116 | # CSV wants unescaped quote_char only around entire fields. So, try
117 | # giving it an unprintable char.
118 | csv_data = CSV.read(input,
119 | :headers => true,
120 | :col_sep => "\t",
121 | :quote_char => "\x00")
122 | rescue CSV::MalformedCSVError
123 | # try to read the file as Unicode; will convert to utf-8
124 | csv_data = CSV.read(input,
125 | :headers => true,
126 | :col_sep => "\t",
127 | :quote_char => "\x00",
128 | :encoding => "BOM|UTF-16LE:UTF-8")
129 | end
130 | end
131 | else
132 | csv_data = CSV.read(input, :headers => true)
133 | end
134 | headers = csv_data.headers
135 |
136 |
137 | if write_utf8_bom and not File.exist?(output)
138 | File.open(output, 'w') do |file|
139 | file.write "\uFEFF"
140 | end
141 | end
142 |
143 |
144 | # counter starts at zero; total is the number of rows read from the input.
145 | counter = 0
146 | total = csv_data.count
147 |
148 | # Output header row = input headers plus the result column, plus the ebook
149 | # package column when the user answered "y" to that prompt.
150 | headers.push("access")
151 | headers.push("ebook package") if get_ebk_pkg == "y"
152 |
153 | # Open in append mode ("a") so existing file content (such as a BOM written
154 | # just before this, or rows from an earlier run) is kept, then add the
155 | # header row.
156 | CSV.open(output, "a") { |out_csv| out_csv << headers }
157 |
158 |
159 | if package == "kan"
160 | agent_spoof = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
161 | b = Celerity::Browser.new(:browser => :firefox, :user_agent => agent_spoof)
162 | elsif package == "asp" || package == "lion"
163 | # On 12/6/17, ASP was redirecting from http to https when using celerity
164 | # and for unknown reason causing SSL/certificate errors. Disabling
165 | # secure_ssl, which I don't know that we really care about, for ASP.
166 | # ASP should finish making changes to their site in Jan 2018, so some
167 | # time after that see if this exception can be removed. (At the time
168 | # this was happening visiting an http URL in firefox was not redirecting
169 | # and visiting an https URL was not resulting in certificate problems.)
170 | b = Celerity::Browser.new(:browser => :firefox, :secure_ssl => false)
171 | else
172 | b = Celerity::Browser.new(:browser => :firefox)
173 | #b = Celerity::Browser.new(:browser => :firefox, :log_level => :all)
174 | end
175 |
176 |
177 | # Oxford Handbooks Online ("oho") and Oxford Bibliographies Online ("obo")
178 | # are handled by the University Press Scholarship Online ("upso") logic,
179 | # so both codes are folded into "upso" from here on.
180 | package = "upso" if %w[oho obo].include?(package)
181 |
182 |
183 | b.css = false
184 | b.javascript_enabled = false
185 |
186 | # APPDEV-11425: Sage platforms require javascript to load
187 | if package == "srmo" || package == "skno"
188 | b.javascript_enabled = true
189 | end
190 |
191 |
192 | csv_data.each do |r|
193 | row_array = r.to_csv.parse_csv
194 | url = row_array.pop
195 | rest_of_data = row_array
196 |
197 | if package == "ss"
198 | # this creates a new url based on the library code (e.g. VB3LK7EB4T)
199 | # and criteria (e.g. JC_005405622) to get around the angular.js
200 | # it may not work on all serialsolutions URLS. Sample, working urls:
201 | # url = 'http://VB3LK7EB4T.search.serialssolutions.com/?V=1.0&L=VB3LK7EB4T&S=JCs&C=JC_005405622&T=marc'
202 | # url = 'http://VB3LK7EB4T.search.serialssolutions.com/?V=1.0&L=VB3LK7EB4T&S=JCs&C=TC_026248270&T=marc'
203 | match = url.match('://([^.]*).*&C=([^&]*)')
204 | if match and match.size == 3
205 | lib, criteria = match[1..2]
206 | url2 = "http://%s.search.serialssolutions.com/ejp/api/1/libraries/%s/search/types/title_code/%s" % [lib, lib, criteria]
207 | page = open(url2).read
208 | else
209 | page = "This script is not configured to accept this URL structure."
210 | end
211 | else
212 | #
213 | # For every package but SerSol, do this:
214 | #
215 | b.goto(url)
216 | page = b.html
217 | end
218 |
219 | if package == "apb"
220 | sleeptime = 1
221 | if page.match(/type="onlineread"/)
222 | access = "Access probably ok"
223 | else
224 | access = "Check access manually"
225 | end
226 |
227 | elsif package == "alman"
228 | sleeptime = 1
229 | if page.include?("\"AvailabilityMode\":4")
230 | access = "Preview mode"
231 | elsif page.include?("\"AvailabilityMode\":2")
232 | access = "Full access"
233 | elsif page.include?("id=\"searchBox")
234 | access = "No access. Item not found"
235 | else
236 | access = "Check access manually"
237 | end
238 |
239 | elsif package == "asp"
240 | sleeptime = 1
241 | if page.include?("Page Not Found")
242 | access = "Page not found"
243 | elsif page.include?("This is a sample. For full access:")
244 | access = "Sample"
245 | elsif page.include?("Trial login | Alexander Street")
246 | access = "Trial"
247 | elsif page.include?("Your institution does not have access to this particular content.")
248 | access = "Institution does not have access"
249 | elsif page.match(/Book not found./)
266 | access = "Page not found"
267 | elsif page.match(/title="Full Access"/)
268 | access = "Full Access"
269 | else
270 | access = "Check access manually"
271 | end
272 |
273 | elsif package == "ciao"
274 | sleeptime = 1
275 | if page.match(/