├── .gitattributes
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
    ├── pull_request_template.md
    └── workflows
    │   └── wgit.yaml
├── .gitignore
├── .rubocop.yml
├── .ruby-version
├── .toys.rb
├── .yardopts
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Gemfile
├── Gemfile.lock
├── LICENSE.txt
├── README.md
├── bin
    └── wgit
├── ci.symlink
├── docker
    ├── Dockerfile
    └── mongo-init.js
├── lib
    ├── wgit.rb
    └── wgit
    │   ├── assertable.rb
    │   ├── base.rb
    │   ├── core_ext.rb
    │   ├── crawler.rb
    │   ├── database
    │       ├── adapters
    │       │   ├── in_memory.rb
    │       │   └── mongo_db.rb
    │       ├── database.rb
    │       └── database_adapter.rb
    │   ├── document.rb
    │   ├── document_extractors.rb
    │   ├── dsl.rb
    │   ├── html_to_text.rb
    │   ├── indexer.rb
    │   ├── logger.rb
    │   ├── model.rb
    │   ├── response.rb
    │   ├── robots_parser.rb
    │   ├── url.rb
    │   ├── utils.rb
    │   └── version.rb
├── load.rb
├── test
    ├── helpers
    │   ├── database_helper.rb
    │   ├── database_test_data.rb
    │   ├── in_memory_helper.rb
    │   ├── mongo_db_helper.rb
    │   └── test_helper.rb
    ├── mock
    │   ├── fixtures.rb
    │   ├── fixtures
    │   │   ├── altitudejunkies.com.html
    │   │   ├── anchor_display.html
    │   │   ├── blank.html
    │   │   ├── disallow-all.com
    │   │   │   ├── about.html
    │   │   │   ├── index.html
    │   │   │   └── robots.txt
    │   │   ├── div_display.html
    │   │   ├── external-link-portal.com.html
    │   │   ├── getting_started.html
    │   │   ├── link-to-robots-txt.com.html
    │   │   ├── motherfuckingwebsite.com.html
    │   │   ├── nearest_fragment.html
    │   │   ├── not_found.html
    │   │   ├── odd-extension.com.html
    │   │   ├── php.html
    │   │   ├── quotes.toscrape.com
    │   │   │   └── tag
    │   │   │   │   ├── humor.html
    │   │   │   │   └── humor
    │   │   │   │       └── page
    │   │   │   │           └── 2.html
    │   │   ├── robots.txt.com
    │   │   │   ├── about.html
    │   │   │   ├── contact.html
    │   │   │   ├── index.html
    │   │   │   ├── login.html
    │   │   │   ├── pwreset.html
    │   │   │   └── robots.txt
    │   │   ├── span_display.html
    │   │   ├── static.xx.fbcdn.net.html
    │   │   ├── test-site.com
    │   │   │   ├── about.html
    │   │   │   ├── application.js.html
    │   │   │   ├── contact.html
    │   │   │   ├── index.html
    │   │   │   ├── public
    │   │   │   │   └── records.html
    │   │   │   ├── search.html
    │   │   │   └── theme.css.html
    │   │   ├── test_doc.html
    │   │   ├── txti.es
    │   │   │   ├── about.html
    │   │   │   ├── barry
    │   │   │   │   └── json.html
    │   │   │   ├── how.html
    │   │   │   ├── images.html
    │   │   │   ├── images
    │   │   │   │   └── images.html
    │   │   │   ├── index.html
    │   │   │   └── terms.html
    │   │   ├── wikileaks.org.html
    │   │   ├── www.adventureconsultants.com.html
    │   │   ├── www.belfastpilates.co.uk
    │   │   │   ├── about-us.html
    │   │   │   ├── about-us
    │   │   │   │   ├── our-facilities.html
    │   │   │   │   ├── testimonials.html
    │   │   │   │   └── the-team.html
    │   │   │   ├── author
    │   │   │   │   └── adminbpp.html
    │   │   │   ├── category
    │   │   │   │   └── uncategorized.html
    │   │   │   ├── contact-us.html
    │   │   │   ├── gift-vouchers-now-available-to-purchase.html
    │   │   │   ├── index.html
    │   │   │   ├── latest-news.html
    │   │   │   ├── official-launch-party.html
    │   │   │   ├── physiotheraphy.html
    │   │   │   ├── pilates.html
    │   │   │   ├── pilates
    │   │   │   │   ├── pilates-classes.html
    │   │   │   │   ├── pilates-classes
    │   │   │   │   │   └── pilates-classes-timetable.html
    │   │   │   │   ├── pilates-faqs.html
    │   │   │   │   └── what-is-pilates.html
    │   │   │   ├── privacy-policy.html
    │   │   │   └── youre-invited.html
    │   │   ├── www.facebook.com.html
    │   │   └── www.mountainmadness.com.html
    │   ├── save_page.rb
    │   ├── save_site.rb
    │   └── webmock.rb
    ├── test_assertable.rb
    ├── test_base.rb
    ├── test_core_ext.rb
    ├── test_crawler.rb
    ├── test_database_adapter.rb
    ├── test_document.rb
    ├── test_document_extractors.rb
    ├── test_dsl.rb
    ├── test_gem.rb
    ├── test_html_to_text.rb
    ├── test_in_memory.rb
    ├── test_indexer.rb
    ├── test_load.rb
    ├── test_logger.rb
    ├── test_model.rb
    ├── test_mongo_db.rb
    ├── test_readme.rb
    ├── test_response.rb
    ├── test_robots_parser.rb
    ├── test_url.rb
    ├── test_utils.rb
    └── test_version.rb
└── wgit.gemspec


/.gitattributes:
--------------------------------------------------------------------------------
1 | test/mock/fixtures/**/*.html linguist-vendored
2 | docker/* linguist-vendored
3 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: michaeltelford
 7 | 
 8 | ---
 9 | 
10 | ## Description
11 | 
12 | A clear and concise description of what the bug is.
13 | 
14 | ## Reproduce
15 | 
16 | Steps to reproduce the behavior:
17 | 
18 | 1. Do X.
19 | 2. Do Y.
20 | 3. ...
21 | 
22 | ## Expected Behavior
23 | 
24 | A clear and concise description of what you expected to happen.
25 | 
26 | ## Possible Solutions
27 | 
28 | 1. ...
29 | 
30 | ## Tests
31 | 
32 | What tests would prove this bug is fixed?
33 | 
34 | 1. ...
35 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: michaeltelford
 7 | 
 8 | ---
 9 | 
10 | ## Description
11 | 
12 | A clear and concise description of what the feature/problem is. E.g. "I'm always frustrated when ..." or "I'd like to be able to ..." etc.
13 | 
14 | ## Solution
15 | 
16 | A clear and concise description of what you want to happen.
17 | 
18 | ## Alternatives
19 | 
20 | A clear and concise description of any alternative solutions or features you've considered. Is there other software doing something well that should be replicated etc.?
21 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | ## Description
 2 | 
 3 | What do your changes implement/fix?
 4 | 
 5 | ## Checklist
 6 | 
 7 | - Are there tests?
 8 | - Have you reviewed and approved the changes yourself?
 9 | 
10 | Ensure you can answer yes to the above before opening a PR.
11 | 


--------------------------------------------------------------------------------
/.github/workflows/wgit.yaml:
--------------------------------------------------------------------------------
 1 | name: wgit
 2 | on:
 3 |   pull_request:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |       - ci
 8 | jobs:
 9 |   wgit-ci:
10 |     runs-on: ubuntu-22.04
11 |     services:
12 |       mongodb:
13 |         image: michaeltelford/mongo-wgit
14 |         ports:
15 |           - 27017:27017
16 |     env:
17 |       WGIT_CONNECTION_STRING: "mongodb://rubyapp:abcdef@localhost/crawler"
18 |     steps:
19 |       - uses: actions/checkout@v4
20 |       - uses: ruby/setup-ruby@v1
21 |         with:
22 |           # ruby-version: '3.3' # Not needed with a .ruby-version file
23 |           bundler-cache: true # runs 'bundle install' and caches installed gems automatically
24 |       - name: ci
25 |         run: bundle exec toys ci
26 |       - name: docs
27 |         run: bundle exec toys generate_rubydocs
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .bundle
 2 | .byebug_history
 3 | .doc
 4 | .DS_Store
 5 | .env
 6 | .wgit.rb
 7 | .yardoc
 8 | **/pkg
 9 | **/spike.rb
10 | doc
11 | wgit-*.gem
12 | .vscode/
13 | 


--------------------------------------------------------------------------------
/.rubocop.yml:
--------------------------------------------------------------------------------
 1 | AllCops:
 2 |   Include:
 3 |     - 'lib/**/*.rb'
 4 |     - 'bin/**'
 5 |     - 'test/**/*.rb'
 6 |     - '*.rb'
 7 |     - .toys.rb
 8 | Security/Eval:
 9 |   Exclude:
10 |     - 'bin/wgit'
11 | Metrics/ParameterLists:
12 |   Exclude:
13 |     - 'lib/wgit/dsl.rb'
14 |     - 'lib/wgit/database/database_adapter.rb'
15 | Style/FrozenStringLiteralComment:
16 |   Enabled: false
17 | Style/ClassAndModuleChildren:
18 |   Enabled: false
19 | Layout/HashAlignment:
20 |   Enabled: false
21 | Layout/FirstArrayElementIndentation:
22 |   Enabled: false
23 | Layout/FirstHashElementIndentation:
24 |   Enabled: false
25 | Metrics/ModuleLength:
26 |   Enabled: false
27 | Metrics/ClassLength:
28 |   Enabled: false
29 | Style/Documentation:
30 |   Enabled: false
31 | Metrics/MethodLength:
32 |   Max: 30
33 | Metrics/PerceivedComplexity:
34 |   Max: 12
35 | Metrics/CyclomaticComplexity:
36 |   Max: 12
37 | Layout/LineLength:
38 |   Max: 85
39 |   Exclude:
40 |     - 'test/**/*.rb'
41 | Style/Alias:
42 |   EnforcedStyle: prefer_alias_method
43 | Style/StringLiterals:
44 |   EnforcedStyle: double_quotes
45 | 


--------------------------------------------------------------------------------
/.ruby-version:
--------------------------------------------------------------------------------
1 | 3.3.0
2 | 


--------------------------------------------------------------------------------
/.toys.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | # The new Rakefile, place any tasks/tools below (listed alphabetically).
  4 | # To load .env vars into the ENV from within a tool definition, use:
  5 | # require 'dotenv/load'
  6 | 
  7 | require "json"
  8 | require "byebug" # Useful for tool development.
  9 | 
 10 | # tool :build - expands the gem_build template (run by the ci tool's "Build gem" step)
 11 | expand :gem_build
 12 | 
 13 | tool :ci do
 14 |   desc "Runs the CI steps needed for a green build"
 15 | 
 16 |   include :exec, exit_on_nonzero_status: false
 17 |   include :terminal
 18 | 
 19 |   def run
 20 |     run_step "Build gem", "build"
 21 |     run_step "Check documentation", ["generate_docs", "--no-output"]
 22 |     run_step "Run tests", "test"
 23 |   end
 24 | 
 25 |   def run_step(name, tool)
 26 |     if exec_tool(tool).success?
 27 |       puts "** #{name} passed", :green, :bold
 28 |       puts
 29 |     else
 30 |       puts "** #{name} failed, exiting!", :red, :bold
 31 |       exit 1
 32 |     end
 33 |   end
 34 | end
 35 | 
 36 | # tool :clean - expands the clean template over the generated paths listed below
 37 | expand :clean, paths: ["pkg", "doc", "tmp", ".doc", ".yardoc"]
 38 | 
 39 | tool :compile do
 40 |   desc "Compile all project Ruby files with warnings"
 41 | 
 42 |   include :exec, exit_on_nonzero_status: true
 43 |   include :terminal
 44 | 
 45 |   def run
 46 |     Dir["**/*.rb", "**/*.gemspec", "bin/wgit"].each do |file|
 47 |       puts "\nCompiling #{file}...", :cyan
 48 |       exec "ruby -cw #{file}"
 49 |     end
 50 |   end
 51 | end
 52 | 
 53 | tool :console do
 54 |   desc "Run the (latest) wgit console script"
 55 | 
 56 |   include :exec, exit_on_nonzero_status: true
 57 | 
 58 |   def run
 59 |     exec "./bin/wgit"
 60 |   end
 61 | end
 62 | 
 63 | # namespace :db
 64 | tool :db do
 65 |   tool :build do
 66 |     desc "Build the mongo DB image from ./docker/Dockerfile"
 67 | 
 68 |     include :exec, exit_on_nonzero_status: true
 69 | 
 70 |     def run
 71 |       exec "docker build --no-cache -t michaeltelford/mongo-wgit ./docker"
 72 |     end
 73 |   end
 74 | 
 75 |   tool :start do
 76 |     desc "Start a local mongo DB docker daemon"
 77 | 
 78 |     include :terminal
 79 |     include :exec, exit_on_nonzero_status: true
 80 | 
 81 |     def run
 82 |       exec "docker run --name mongo-wgit -p 27017:27017 --rm -d michaeltelford/mongo-wgit"
 83 |       puts "Successfully started container 'mongo-wgit'", :green
 84 |     end
 85 |   end
 86 | 
 87 |   tool :stop do
 88 |     desc "Stop the local mongo DB docker container"
 89 | 
 90 |     include :terminal
 91 |     include :exec, exit_on_nonzero_status: true
 92 | 
 93 |     def run
 94 |       exec "docker stop mongo-wgit"
 95 |       puts "Successfully stopped container 'mongo-wgit'", :green
 96 |     end
 97 |   end
 98 | 
 99 |   tool :push do
100 |     desc "Push the local mongo DB image to Docker Hub"
101 | 
102 |     include :exec, exit_on_nonzero_status: true
103 | 
104 |     def run
105 |       exec "docker login" unless docker_authenticated?
106 |       exec "docker push michaeltelford/mongo-wgit"
107 |     end
108 | 
109 |     def docker_authenticated?
110 |       docker_config = "#{Dir.home}/.docker/config.json"
111 |       return false unless File.exist?(docker_config)
112 | 
113 |       config = JSON.parse(File.read(docker_config))
114 |       auths = config["auths"]
115 |       return false unless auths && !auths.empty?
116 | 
117 |       true
118 |     end
119 |   end
120 | end
121 | 
122 | # tool :generate_docs - yardoc template that fails on warnings or undocumented objects
123 | expand :yardoc do |t|
124 |   t.name = :generate_docs
125 |   t.generate_output_flag = true # the ci tool passes --no-output to this tool
126 |   t.fail_on_warning = true
127 |   t.fail_on_undocumented_objects = true
128 | end
129 | 
130 | tool :generate_rubydocs do
131 |   desc "Update wgit's docs on rubydoc.info"
132 | 
133 |   include :terminal
134 |   include :exec, exit_on_nonzero_status: true
135 | 
136 |   def run
137 |     exec "curl 'https://www.rubydoc.info/checkout' \
138 |       -H 'User-Agent: curl' \
139 |       -H 'Accept: */*' \
140 |       -H 'Accept-Language: en-GB,en;q=0.5' --compressed \
141 |       -H 'Content-Type: application/x-www-form-urlencoded' \
142 |       -H 'X-Requested-With: XMLHttpRequest' \
143 |       -H 'Origin: https://www.rubydoc.info' \
144 |       -H 'Connection: keep-alive' \
145 |       -H 'Referer: https://www.rubydoc.info/find/github?q=wgit' \
146 |       --data 'scheme=git&url=git%3A%2F%2Fgithub.com%2Fmichaeltelford%2Fwgit&commit='"
147 |     puts "\nUpdated rubydoc.info successfully", :green
148 |   end
149 | end
150 | 
151 | # tool :install - gem_build template configured with install_gem enabled
152 | expand :gem_build do |t|
153 |   t.name = :install
154 |   t.install_gem = true
155 | end
156 | 
157 | tool :lint, delegate_to: :rubocop # lint delegates to the rubocop tool
158 | 
159 | tool :release do
160 |   desc "The SAFE release task which double checks things!"
161 |   long_desc "Tag and push commits to Github, then build and push the gem to Rubygems."
162 | 
163 |   include :exec, exit_on_nonzero_status: true
164 |   include :terminal
165 | 
166 |   def run
167 |     raise "Error requiring wgit" unless require_relative "lib/wgit"
168 | 
169 |     puts "Releasing #{Wgit.version_str}, using the 'origin' Git remote...", :cyan
170 |     confirmed = confirm "Have you applied the wiki's 'Gem Publishing Checklist'?"
171 |     unless confirmed
172 |       puts "Aborting!", :red
173 |       exit(0)
174 |     end
175 | 
176 |     exec_tool "release_gem"
177 |     puts "Release complete", :green
178 |   end
179 | end
180 | 
181 | # tool :release_gem - gem_build template set to tag and push (run by the release tool)
182 | expand :gem_build do |t|
183 |   t.name = :release_gem
184 |   t.install_gem = false
185 |   t.push_gem = true
186 |   t.tag = true
187 |   t.push_tag = true
188 | end
189 | 
190 | tool :rubocop do
191 |   desc "Run the rubocop linter, use -a to auto correct"
192 |   flag :autocorrect,    "-a", "--autocorrect"
193 |   flag :autocorrectall, "-A", "--autocorrect-all"
194 |   remaining_args :dirs_or_files
195 | 
196 |   include :exec, exit_on_nonzero_status: true
197 | 
198 |   def run
199 |     command_str = "bundle exec rubocop"
200 |     command_str += " -a" if autocorrect
201 |     command_str += " -A" if autocorrectall
202 |     command_str += " #{dirs_or_files.join(' ')}" if dirs_or_files.any?
203 | 
204 |     exec(command_str)
205 |   end
206 | end
207 | 
208 | tool :setup do
209 |   desc "Sets up the cloned repo for development"
210 | 
211 |   include :exec, exit_on_nonzero_status: true
212 |   include :terminal
213 | 
214 |   def run
215 |     exec_cmd "gem install wgit"
216 |     exec_cmd "touch .env"
217 |     exec_cmd "touch .wgit.rb"
218 | 
219 |     puts "Setup complete", :green
220 |   end
221 | 
222 |   def exec_cmd(command)
223 |     puts "> #{command}", :cyan
224 |     exec command
225 |   end
226 | end
227 | 
228 | # namespace :test
229 | tool :test do
230 |   desc "Run all tests"
231 | 
232 |   include :exec, exit_on_nonzero_status: true
233 | 
234 |   def run
235 |     exec_tool "test all"
236 |   end
237 | 
238 |   # tool :all
239 |   expand :minitest do |t|
240 |     t.name = :all
241 |     t.libs = ["lib"]
242 |     t.files = ["test/test_*.rb"]
243 |   end
244 | 
245 |   tool :file do
246 |     desc "Runs entire test_*.rb file or single test at --line"
247 |     required_arg :file
248 |     flag :line, "-l", "--line=VALUE"
249 | 
250 |     include :exec, exit_on_nonzero_status: true
251 | 
252 |     def run
253 |       exec "bundle exec mtest #{test_cmd}"
254 |     end
255 | 
256 |     def test_cmd
257 |       cmd = options[:file]
258 |       raise "Colon not allowed, use --line" if cmd.include?(":")
259 | 
260 |       cmd = "test/test_#{cmd}" unless cmd.start_with?("test/test_")
261 |       cmd += ".rb" unless cmd.end_with?(".rb")
262 |       cmd += ":#{line}" if line
263 | 
264 |       cmd
265 |     end
266 |   end
267 | 
268 |   tool :infinite_crawl_loop do
269 |     desc "Manually crawl_r URLs to check for an infinite loop condition"
270 | 
271 |     include :terminal
272 | 
273 |     require "wgit"
274 |     require "wgit/core_ext"
275 | 
276 |     def run
277 |       puts "If the crawl is hanging for more than 2 mins, there is an infinite loop",
278 |            :yellow
279 | 
280 |       crawler = Wgit::Crawler.new
281 |       urls = %w[
282 |         https://jaloulangeree.com/
283 |         https://www.belfastpilates.co.uk/
284 |         https://anaeko.com/
285 |       ].to_urls
286 | 
287 |       urls.each_with_index do |url, i|
288 |         crawler.crawl_site(url)
289 |         puts "Successfully crawled site (#{i + 1}/#{urls.size}): #{url}"
290 |       end
291 | 
292 |       puts "Successfully crawled all sites, no infinite loop detected", :green
293 |     end
294 |   end
295 | 
296 |   tool :save_page do
297 |     desc "Download/update a web page test fixture to test/mock/fixtures"
298 |     required_arg :url
299 | 
300 |     include :exec, exit_on_nonzero_status: true
301 |     include :terminal
302 | 
303 |     def run
304 |       load "test/mock/save_page.rb"
305 |       save_page(options[:url])
306 |       puts "Don't forget to mock the page in test/mock/fixtures.rb", :green
307 |     end
308 |   end
309 | 
310 |   tool :save_site do
311 |     desc "Download/update a web site test fixture to test/mock/fixtures"
312 |     required_arg :url
313 |     flag :follow, "-f", "--follow=VALUE"
314 | 
315 |     include :exec, exit_on_nonzero_status: true
316 |     include :terminal
317 | 
318 |     def run
319 |       load "test/mock/save_site.rb"
320 |       xpath = follow || :default
321 |       save_site(options[:url], follow: xpath)
322 |       puts "Don't forget to mock the site in test/mock/fixtures.rb", :green
323 |     end
324 |   end
325 | 
326 |   # tool :smoke
327 |   expand :minitest do |t|
328 |     t.name = :smoke
329 |     t.libs = ["lib"]
330 |     t.files = [
331 |       "test/test_utils.rb",
332 |       "test/test_url.rb",
333 |       "test/test_document.rb",
334 |       "test/test_document_extractors.rb",
335 |       "test/test_response.rb",
336 |       "test/test_crawler.rb"
337 |     ]
338 |   end
339 | end
340 | 
341 | tool :yardoc do
342 |   desc "Generates the YARD docs, use --serve to browse"
343 |   flag :serve, "-s", "--serve"
344 | 
345 |   include :exec, exit_on_nonzero_status: false
346 |   include :terminal
347 | 
348 |   def run
349 |     serve ? serve_docs : exec_tool("generate_docs")
350 |   end
351 | 
352 |   def serve_docs
353 |     url = "http://localhost:8808"
354 | 
355 |     if exec("which pbcopy", out: :null).success?
356 |       exec "echo '#{url}' | pbcopy"
357 |       puts "Copied '#{url}' to clipboard", :green
358 |     elsif exec("which xclip", out: :null).success?
359 |       exec "echo '#{url}' | xclip -sel clip"
360 |       puts "Copied '#{url}' to clipboard", :green
361 |     else
362 |       puts "Install pbcopy or xclip to automatically copy url to clipboard"
363 |     end
364 | 
365 |     exec "bundle exec yard server -r"
366 |   end
367 | end
368 | 


--------------------------------------------------------------------------------
/.yardopts:
--------------------------------------------------------------------------------
1 | --readme README.md
2 | --title 'Wgit Gem Documentation'
3 | --charset utf-8
4 | --markup markdown
5 | --output .doc
6 | --protected
7 | - *.md LICENSE.txt
8 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |  advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |  address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |  professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team on GitHub. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | ## Consult
 4 | 
 5 | Before you make a contribution, reach out on GitHub about what changes need to be made. Otherwise, your time might be wasted. Once you're clear on what needs to be done, follow the technical steps below.
 6 | 
 7 | ## Technical Steps
 8 | 
 9 | - Fork the repository
10 | - Create a branch
11 | - Write some tests (which fail)
12 | - Write some code
13 | - Re-run the tests (which now hopefully pass)
14 | - Push your branch to your `origin` remote
15 | - Open a GitHub Pull Request (with the target branch as wgit's (upstream) `master`)
16 | - Apply any requested changes
17 | - Wait for your PR to be merged
18 | 
19 | ## Thanks
20 | 
21 | Thanks in advance for your contribution.
22 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | source 'https://rubygems.org'
4 | # Any Ruby 3.x version is accepted.
5 | ruby '>= 3', '< 4'
6 | 
7 | # Specify your gem's dependencies in the gemspec.
8 | gemspec
9 | 


--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
  1 | PATH
  2 |   remote: .
  3 |   specs:
  4 |     wgit (0.12.0)
  5 |       addressable (~> 2.8)
  6 |       base64 (~> 0.2)
  7 |       ferrum (~> 0.14)
  8 |       mongo (~> 2.19)
  9 |       nokogiri (~> 1.15)
 10 |       typhoeus (~> 1.4)
 11 | 
 12 | GEM
 13 |   remote: https://rubygems.org/
 14 |   specs:
 15 |     addressable (2.8.6)
 16 |       public_suffix (>= 2.0.2, < 6.0)
 17 |     ast (2.4.2)
 18 |     base64 (0.2.0)
 19 |     bigdecimal (3.1.6)
 20 |     bson (4.15.0)
 21 |     byebug (11.1.3)
 22 |     coderay (1.1.3)
 23 |     concurrent-ruby (1.2.3)
 24 |     crack (1.0.0)
 25 |       bigdecimal
 26 |       rexml
 27 |     dotenv (2.8.1)
 28 |     ethon (0.16.0)
 29 |       ffi (>= 1.15.0)
 30 |     ferrum (0.15)
 31 |       addressable (~> 2.5)
 32 |       concurrent-ruby (~> 1.1)
 33 |       webrick (~> 1.7)
 34 |       websocket-driver (~> 0.7)
 35 |     ffi (1.16.3)
 36 |     hashdiff (1.1.0)
 37 |     json (2.7.1)
 38 |     language_server-protocol (3.17.0.3)
 39 |     maxitest (5.4.0)
 40 |       minitest (>= 5.14.0, < 5.21.0)
 41 |     method_source (1.0.0)
 42 |     minitest (5.20.0)
 43 |     mongo (2.19.3)
 44 |       bson (>= 4.14.1, < 5.0.0)
 45 |     nokogiri (1.16.2)
 46 |       racc (~> 1.4)
 47 |     parallel (1.24.0)
 48 |     parser (3.3.0.5)
 49 |       ast (~> 2.4.1)
 50 |       racc
 51 |     prism (0.24.0)
 52 |     pry (0.14.2)
 53 |       coderay (~> 1.1)
 54 |       method_source (~> 1.0)
 55 |     public_suffix (5.0.4)
 56 |     racc (1.7.3)
 57 |     rainbow (3.1.1)
 58 |     regexp_parser (2.9.0)
 59 |     rexml (3.2.6)
 60 |     rubocop (1.61.0)
 61 |       json (~> 2.3)
 62 |       language_server-protocol (>= 3.17.0)
 63 |       parallel (~> 1.10)
 64 |       parser (>= 3.3.0.2)
 65 |       rainbow (>= 2.2.2, < 4.0)
 66 |       regexp_parser (>= 1.8, < 3.0)
 67 |       rexml (>= 3.2.5, < 4.0)
 68 |       rubocop-ast (>= 1.30.0, < 2.0)
 69 |       ruby-progressbar (~> 1.7)
 70 |       unicode-display_width (>= 2.4.0, < 3.0)
 71 |     rubocop-ast (1.31.0)
 72 |       parser (>= 3.3.0.4)
 73 |       prism (>= 0.24.0)
 74 |     ruby-progressbar (1.13.0)
 75 |     toys (0.15.5)
 76 |       toys-core (= 0.15.5)
 77 |     toys-core (0.15.5)
 78 |     typhoeus (1.4.1)
 79 |       ethon (>= 0.9.0)
 80 |     unicode-display_width (2.5.0)
 81 |     webmock (3.23.0)
 82 |       addressable (>= 2.8.0)
 83 |       crack (>= 0.3.2)
 84 |       hashdiff (>= 0.4.0, < 2.0.0)
 85 |     webrick (1.8.1)
 86 |     websocket-driver (0.7.6)
 87 |       websocket-extensions (>= 0.1.0)
 88 |     websocket-extensions (0.1.5)
 89 |     yard (0.9.35)
 90 | 
 91 | PLATFORMS
 92 |   ruby
 93 | 
 94 | DEPENDENCIES
 95 |   byebug (~> 11.1)
 96 |   dotenv (~> 2.8)
 97 |   maxitest (~> 5.4)
 98 |   pry (~> 0.14)
 99 |   rubocop (~> 1.57)
100 |   toys (~> 0.15)
101 |   webmock (~> 3.19)
102 |   wgit!
103 |   yard (~> 0.9)
104 | 
105 | RUBY VERSION
106 |    ruby 3.3.0p0
107 | 
108 | BUNDLED WITH
109 |    2.5.3
110 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 - 2020 Michael Telford
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/bin/wgit:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "wgit"
 4 | 
 5 | # Shorten the filepath for nicer output to the user.
 6 | def format_path(dir, path)
 7 |   return path.gsub(dir, ".") if dir == Dir.pwd
 8 |   return path.gsub(dir, "~") if dir == Dir.home
 9 | 
10 |   path
11 | end
12 | 
13 | # Load .env file (if it exists somewhere).
14 | def load_env
15 |   begin
16 |     require "dotenv"
17 |   rescue LoadError
18 |     puts "Skipping .env load because 'dotenv' isn't installed"
19 |     puts
20 |     return false
21 |   end
22 | 
23 |   puts "Searching for .env file in local and home directories..."
24 |   success = false
25 | 
26 |   [Dir.pwd, Dir.home].each do |dir|
27 |     path = "#{dir}/.env"
28 |     next unless File.exist?(path)
29 | 
30 |     puts "Loading #{format_path(dir, path)}"
31 |     puts "Call `load_env` after changes to re-load the environment variables"
32 | 
33 |     Dotenv.load(path)
34 |     success = true
35 | 
36 |     break
37 |   end
38 | 
39 |   puts
40 | 
41 |   success
42 | end
43 | 
44 | # Eval .wgit.rb file (if it exists somewhere).
45 | def eval_wgit
46 |   puts "Searching for .wgit.rb file in local and home directories..."
47 |   success = false
48 | 
49 |   [Dir.pwd, Dir.home].each do |dir|
50 |     path = "#{dir}/.wgit.rb"
51 |     next unless File.exist?(path)
52 | 
53 |     puts "Eval'ing #{format_path(dir, path)}"
54 |     puts "Call `eval_wgit` after changes to re-eval the file"
55 | 
56 |     eval(File.read(path))
57 |     success = true
58 | 
59 |     break
60 |   end
61 | 
62 |   puts
63 | 
64 |   success
65 | end
66 | 
67 | # Choose and return which REPL class to use.
68 | # Use Pry if installed or fall back to IRB.
69 | def repl_class
70 |   begin
71 |     require "pry"
72 |     klass = Pry
73 |   rescue LoadError
74 |     require "irb"
75 |     klass = IRB
76 | 
77 |     puts "Using 'irb' REPL because 'pry' isn't installed"
78 |     puts
79 |   end
80 | 
81 |   klass
82 | end
83 | 
84 | ### START OF EXECUTABLE ###
85 | # Prepare the session: load any .env file, then eval any .wgit.rb script.
86 | load_env
87 | eval_wgit
88 | klass = repl_class
89 | # Print the version string underlined with dashes, followed by a blank line.
90 | puts Wgit.version_str
91 | puts "#{'-' * Wgit.version_str.size}\n\n"
92 | # Start the chosen REPL; the closing message prints once `start` returns.
93 | klass.start
94 | 
95 | puts "Interactive wgit session complete"
96 | 


--------------------------------------------------------------------------------
/ci.symlink:
--------------------------------------------------------------------------------
1 | ./.github/workflows/wgit.yaml


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM mongo:latest
2 | # NOTE(review): hard-coded credentials - presumably for local development only.
3 | ENV MONGO_INITDB_ROOT_USERNAME=rubyapp
4 | ENV MONGO_INITDB_ROOT_PASSWORD=abcdef
5 | ENV MONGO_INITDB_DATABASE=admin
6 | # The init script is copied into the image's docker-entrypoint-initdb.d directory.
7 | COPY mongo-init.js /docker-entrypoint-initdb.d/
8 | 


--------------------------------------------------------------------------------
/docker/mongo-init.js:
--------------------------------------------------------------------------------
 1 | db.auth("rubyapp", "abcdef");
 2 | // Switch to the "crawler" database; everything below is created inside it.
 3 | db = db.getSiblingDB("crawler");
 4 | // Create the "rubyapp" user, granting it the "root" role of the admin db.
 5 | db.createUser({
 6 |   user: "rubyapp",
 7 |   pwd: "abcdef",
 8 |   roles: [
 9 |     {
10 |       role: "root",
11 |       db: "admin",
12 |     },
13 |   ],
14 | });
15 | // Create the two collections explicitly.
16 | db.createCollection("urls");
17 | db.createCollection("documents");
18 | // Unique url indexes, then a weighted text index over the document fields.
19 | db.urls.createIndex({ "url" : 1 }, { "unique" : true, "name": "unique_url" });
20 | db.documents.createIndex({ "url.url" : 1 }, { "unique" : true, "name": "unique_url" });
21 | db.documents.createIndex({
22 |   title: "text",
23 |   description: "text",
24 |   keywords: "text",
25 |   text: "text"
26 | },
27 | {
28 |   weights: {
29 |     title: 2,
30 |     description: 2,
31 |     keywords: 2,
32 |     text: 1
33 |   },
34 |   name: "text_search"
35 | });
36 | 


--------------------------------------------------------------------------------
/lib/wgit.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "wgit/version"
 4 | require_relative "wgit/logger"
 5 | require_relative "wgit/assertable"
 6 | require_relative "wgit/utils"
 7 | require_relative "wgit/url"
 8 | require_relative "wgit/html_to_text"
 9 | require_relative "wgit/document"
10 | require_relative "wgit/document_extractors"
11 | require_relative "wgit/crawler"
12 | require_relative "wgit/model"
13 | require_relative "wgit/database/database"
14 | require_relative "wgit/database/database_adapter"
15 | require_relative "wgit/database/adapters/mongo_db"
16 | require_relative "wgit/database/adapters/in_memory"
17 | require_relative "wgit/robots_parser"
18 | require_relative "wgit/indexer"
19 | require_relative "wgit/dsl"
20 | require_relative "wgit/base"
21 | # require_relative 'wgit/core_ext' - Must be explicitly required.
22 | 


--------------------------------------------------------------------------------
/lib/wgit/assertable.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | module Wgit
  4 |   # Module containing assertion methods including type checking and duck typing.
  5 |   module Assertable
  6 |     # Default type fail message.
  7 |     DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s"
  8 | 
  9 |     # Wrong method message.
 10 |     NON_ENUMERABLE_MSG = "Expected an Enumerable responding to #each, not: %s"
 11 | 
 12 |     # Enumerable with more than one type across it's elements.
 13 |     MIXED_ENUMERABLE_MSG = "Expected an Enumerable with elements of a single \
 14 | common type"
 15 | 
 16 |     # Default duck fail message.
 17 |     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
 18 | 
 19 |     # Default required keys message.
 20 |     DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not \
 21 | present: %s"
 22 | 
 23 |     # Tests if the obj is_a? given type; raises an Exception if not.
 24 |     #
 25 |     # @param obj [Object] The Object to test.
 26 |     # @param type_or_types [Type, Array<Type>] The type/types that obj must
 27 |     #     belong to or an exception is thrown.
 28 |     # @param msg [String] The raised StandardError message, if provided.
 29 |     # @raise [StandardError] If the assertion fails.
 30 |     # @return [Object] The given obj on successful assertion.
 31 |     def assert_types(obj, type_or_types, msg = nil)
 32 |       msg ||= format(DEFAULT_TYPE_FAIL_MSG, type_or_types, obj.class)
 33 |       match = if type_or_types.respond_to?(:any?)
 34 |                 type_or_types.any? { |type| obj.is_a?(type) }
 35 |               else
 36 |                 obj.is_a?(type_or_types)
 37 |               end
 38 |       raise msg unless match
 39 | 
 40 |       obj
 41 |     end
 42 | 
 43 |     # Each object within arr must match one of the types listed in
 44 |     # type_or_types; or an exception is raised using msg, if provided.
 45 |     #
 46 |     # @param arr [Enumerable#each] Enumerable of objects to type check.
 47 |     # @param type_or_types [Type, Array<Type>] The allowed type(s).
 48 |     # @param msg [String] The raised StandardError message, if provided.
 49 |     # @raise [StandardError] If the assertion fails.
 50 |     # @return [Object] The given arr on successful assertion.
 51 |     def assert_arr_types(arr, type_or_types, msg = nil)
 52 |       raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)
 53 | 
 54 |       arr.each { |obj| assert_types(obj, type_or_types, msg) }
 55 |     end
 56 | 
 57 |     # All objects within arr must match one of the types listed in
 58 |     # type_or_types; or an exception is raised using msg, if provided.
 59 |     # Ancestors of the same type are allowed and considered common.
 60 |     #
 61 |     # @param arr [Enumerable#each] Enumerable of objects to type check.
 62 |     # @param type_or_types [Type, Array<Type>] The allowed type(s).
 63 |     # @param msg [String] The raised StandardError message, if provided.
 64 |     # @raise [StandardError] If the assertion fails.
 65 |     # @return [Object] The given arr on successful assertion.
 66 |     def assert_common_arr_types(arr, type_or_types, msg = nil)
 67 |       raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)
 68 | 
 69 |       type = arr.first.class
 70 |       type_match = arr.all? { |obj| type.ancestors.include?(obj.class) }
 71 |       raise MIXED_ENUMERABLE_MSG unless type_match
 72 | 
 73 |       assert_arr_types(arr, type_or_types, msg)
 74 |     end
 75 | 
 76 |     # The obj_or_objs must respond_to? all of the given methods or an
 77 |     # Exception is raised using msg, if provided.
 78 |     #
 79 |     # @param obj_or_objs [Object, Enumerable#each] The object(s) to duck check.
 80 |     # @param methods [Array<Symbol>] The methods to :respond_to?.
 81 |     # @param msg [String] The raised StandardError message, if provided.
 82 |     # @raise [StandardError] If the assertion fails.
 83 |     # @return [Object] The given obj_or_objs on successful assertion.
 84 |     def assert_respond_to(obj_or_objs, methods, msg = nil)
 85 |       methods = *methods
 86 | 
 87 |       if obj_or_objs.respond_to?(:each)
 88 |         obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
 89 |       else
 90 |         _assert_respond_to(obj_or_objs, methods, msg)
 91 |       end
 92 | 
 93 |       obj_or_objs
 94 |     end
 95 | 
 96 |     # The hash must include? the keys or a KeyError is raised.
 97 |     #
 98 |     # @param hash [Hash] The hash which should include the required keys.
 99 |     # @param keys [Array<String, Symbol>] The keys whose presence to assert.
100 |     # @param msg [String] The raised KeyError message, if provided.
101 |     # @raise [KeyError] If the assertion fails.
102 |     # @return [Hash] The given hash on successful assertion.
103 |     def assert_required_keys(hash, keys, msg = nil)
104 |       msg ||= format(DEFAULT_REQUIRED_KEYS_MSG, keys.join(", "))
105 |       all_present = keys.all? { |key| hash.keys.include? key }
106 |       raise KeyError, msg unless all_present
107 | 
108 |       hash
109 |     end
110 | 
111 |     private
112 | 
113 |     # obj must respond_to? all methods or an exception is raised.
114 |     def _assert_respond_to(obj, methods, msg = nil)
115 |       raise "methods must respond_to? :all?" unless methods.respond_to?(:all?)
116 | 
117 |       msg ||= format(DEFAULT_DUCK_FAIL_MSG, "#{obj.class} (#{obj})", methods)
118 |       match = methods.all? { |method| obj.respond_to?(method) }
119 |       raise msg unless match
120 | 
121 |       obj
122 |     end
123 | 
124 |     alias_method :assert_type,            :assert_types
125 |     alias_method :assert_arr_type,        :assert_arr_types
126 |     alias_method :assert_common_arr_type, :assert_common_arr_types
127 |   end
128 | end
129 | 


--------------------------------------------------------------------------------
/lib/wgit/base.rb:
--------------------------------------------------------------------------------
 1 | module Wgit
 2 |   # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
 3 |   # All subclasses must define a `#parse(doc, &block)` method.
 4 |   class Base
 5 |     extend Wgit::DSL
 6 | 
 7 |     # Runs once before the crawl/index is run. Override as needed.
 8 |     def setup; end
 9 | 
10 |     # Runs once after the crawl/index is complete. Override as needed.
11 |     def teardown; end
12 | 
13 |     # Runs the crawl/index passing each crawled `Wgit::Document` and the given
14 |     # block to the subclass's `#parse` method.
15 |     def self.run(&block)
16 |       crawl_method = @method || :crawl
17 |       obj = new
18 | 
19 |       unless obj.respond_to?(:parse)
20 |         raise "#{obj.class} must respond_to? #parse(doc, &block)"
21 |       end
22 | 
23 |       obj.setup
24 |       send(crawl_method) { |doc| obj.parse(doc, &block) }
25 |       obj.teardown
26 | 
27 |       obj
28 |     end
29 | 
30 |     # Sets the crawl/index method to call when `Base.run` is called.
31 |     # The mode method must match one defined in the `Wgit::Crawler` or
32 |     # `Wgit::Indexer` class.
33 |     #
34 |     # @param method [Symbol] The crawl/index method to call.
35 |     def self.mode(method)
36 |       @method = method
37 |     end
38 |   end
39 | end
40 | 


--------------------------------------------------------------------------------
/lib/wgit/core_ext.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | # Script which extends Ruby's core functionality when parsed.
 4 | # Needs to be required separately to 'wgit' using `require 'wgit/core_ext'`.
 5 | 
 6 | require_relative "url"
 7 | 
 8 | # Extend the standard String functionality.
 9 | class String
10 |   # Converts a String into a Wgit::Url object.
11 |   #
12 |   # @return [Wgit::Url] The converted URL.
13 |   def to_url
14 |     Wgit::Url.parse(self)
15 |   end
16 | end
17 | 
18 | # Extend the standard Enumerable functionality.
19 | module Enumerable
20 |   # Converts each String instance into a Wgit::Url object and returns the new
21 |   # Array.
22 |   #
23 |   # @return [Array<Wgit::Url>] The converted URL's.
24 |   def to_urls
25 |     map { |element| process_url_element(element) }
26 |   end
27 | 
28 |   # Converts each String instance into a Wgit::Url object and returns self
29 |   # having modified the receiver.
30 |   #
31 |   # @return [Array<Wgit::Url>] Self containing the converted URL's.
32 |   def to_urls!
33 |     map! { |element| process_url_element(element) }
34 |   end
35 | end
36 | 
37 | private
38 | 
39 | # Converts the element to a Wgit::Url if the element is a String.
40 | def process_url_element(element)
41 |   element.is_a?(String) ? element.to_url : element
42 | end
43 | 


--------------------------------------------------------------------------------
/lib/wgit/database/adapters/in_memory.rb:
--------------------------------------------------------------------------------
  1 | require_relative "../../utils"
  2 | require_relative "../../url"
  3 | require_relative "../../document"
  4 | require_relative "../../model"
  5 | require_relative "../database_adapter"
  6 | 
  7 | module Wgit::Database
  8 |   # Database implementer class for in-memory (RAM) storage. This DB is mainly used
  9 |   # for testing and experimenting with. This DB is thread safe.
 10 |   class InMemory < DatabaseAdapter
 11 |     # Initializes a thread safe InMemory Database instance.
 12 |     #
 13 |     # @param connection_string [String] Not used but needed to adhere to the
 14 |     #   DatabaseAdapter interface.
 15 |     def initialize(connection_string = nil)
 16 |       # Inits @urls and @docs vars.
 17 |       initialize_store
 18 | 
 19 |       super
 20 |     end
 21 | 
 22 |     # Overrides String#inspect to display collection sizes.
 23 |     #
 24 |     # @return [String] A short textual representation of this object.
 25 |     def inspect
 26 |       "#<Wgit::Database::InMemory num_urls=#{@urls.size} \
 27 | num_docs=#{@docs.size} size=#{size}>"
 28 |     end
 29 | 
 30 |     # The Wgit::Url's collection stored as an in-memory Concurrent::Array.
 31 |     def urls(&block)
 32 |       map_urls(@urls, &block)
 33 |     end
 34 | 
 35 |     # The Wgit::Document's collection stored as an in-memory Concurrent::Array.
 36 |     def docs(&block)
 37 |       map_documents(@docs, &block)
 38 |     end
 39 | 
 40 |     # The raw url Hashes, not mapped into their corresponding Wgit objects.
 41 |     def url_hashes
 42 |       @urls
 43 |     end
 44 | 
 45 |     # The raw doc Hashes, not mapped into their corresponding Wgit objects.
 46 |     def doc_hashes
 47 |       @docs
 48 |     end
 49 | 
 50 |     # Returns the current size of the in-memory database.
 51 |     # An empty database will return a size of 4 because there are 4 bytes in
 52 |     # two empty arrays (urls and docs collections).
 53 |     #
 54 |     # @return [Integer] The current size of the in-memory DB.
 55 |     def size
 56 |       @urls.to_s.size + @docs.to_s.size
 57 |     end
 58 | 
 59 |     # Searches the database's Document#text for the given query. The returned
 60 |     # Documents are sorted for relevance, starting with the most relevant. Each
 61 |     # Document's #score value will be set accordingly.
 62 |     #
 63 |     # @param query [Regexp, #to_s] The regex or text value to search each
 64 |     #   document's @text for.
 65 |     # @param case_sensitive [Boolean] Whether character case must match.
 66 |     # @param whole_sentence [Boolean] Whether multiple words should be searched
 67 |     #   for separately.
 68 |     # @param limit [Integer] The max number of results to return.
 69 |     # @param skip [Integer] The number of results to skip.
 70 |     # @yield [doc] Given each search result (Wgit::Document) returned from the
 71 |     #   DB.
 72 |     # @return [Array<Wgit::Document>] The search results obtained from the DB.
 73 |     def search(
 74 |       query, case_sensitive: false, whole_sentence: true,
 75 |       limit: 10, skip: 0, &block
 76 |     )
 77 |       regex = Wgit::Utils.build_search_regex(
 78 |         query, case_sensitive:, whole_sentence:)
 79 | 
 80 |       # Search the Wgit::Document's, not the raw Hashes.
 81 |       results = docs.select do |doc|
 82 |         score = 0
 83 |         doc.search(regex, case_sensitive:, whole_sentence:) do |results_hash|
 84 |           score = results_hash.values.sum
 85 |         end
 86 |         next false if score.zero?
 87 | 
 88 |         doc.instance_variable_set :@score, score
 89 |         true
 90 |       end
 91 | 
 92 |       return [] if results.empty?
 93 | 
 94 |       results = results.sort_by { |doc| -doc.score }
 95 | 
 96 |       results = results[skip..]
 97 |       return [] unless results
 98 | 
 99 |       results = results[0...limit] if limit.positive?
100 |       results.each(&block) if block_given?
101 | 
102 |       results
103 |     end
104 | 
105 |     # Deletes everything in the urls and documents collections.
106 |     #
107 |     # @return [Integer] The number of deleted records.
108 |     def empty
109 |       previous_size = @urls.size + @docs.size
110 |       initialize_store
111 | 
112 |       previous_size
113 |     end
114 | 
115 |     # Returns Url records that haven't yet been crawled.
116 |     #
117 |     # @param limit [Integer] The max number of Url's to return. 0 returns all.
118 |     # @param skip [Integer] Skip n amount of Url's.
119 |     # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
120 |     # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
121 |     def uncrawled_urls(limit: 0, skip: 0, &block)
122 |       uncrawled = @urls.reject { |url| url["crawled"] }
123 |       uncrawled = uncrawled[skip..]
124 |       return [] unless uncrawled
125 | 
126 |       uncrawled = uncrawled[0...limit] if limit.positive?
127 |       map_urls(uncrawled, &block)
128 |     end
129 | 
130 |     # Inserts or updates the object in the in-memory database.
131 |     #
132 |     # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
133 |     # @return [Boolean] True if inserted, false if updated.
134 |     def upsert(obj)
135 |       collection, index, model = get_model_info(obj)
136 | 
137 |       if index
138 |         collection[index] = model
139 |         false
140 |       else
141 |         collection << model
142 |         true
143 |       end
144 |     end
145 | 
146 |     # Bulk upserts the objects in the in-memory database collection.
147 |     # You cannot mix collection objs types, all must be Urls or Documents.
148 |     #
149 |     # @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
150 |     #   inserted/updated.
151 |     # @return [Integer] The total number of newly inserted objects.
152 |     def bulk_upsert(objs)
153 |       assert_common_arr_types(objs, [Wgit::Url, Wgit::Document])
154 | 
155 |       objs.reduce(0) do |inserted, obj|
156 |         inserted += 1 if upsert(obj)
157 |         inserted
158 |       end
159 |     end
160 | 
161 |     private
162 | 
163 |     # Creates a new Concurrent::Array for each collection.
164 |     def initialize_store
165 |       @urls = Concurrent::Array.new
166 |       @docs = Concurrent::Array.new
167 |     end
168 | 
169 |     # Get the database's model info (collection type, index, model) for
170 |     # obj.
171 |     #
172 |     # Use like:
173 |     # ```
174 |     # collection, index, model = get_model_info(obj)
175 |     # ```
176 |     #
177 |     # Raises an error if obj isn't a Wgit::Url or Wgit::Document.
178 |     #
179 |     # @param obj [Wgit::Url, Wgit::Document] The obj to get semantics for.
180 |     # @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document.
181 |     # @return [Array<Symbol, Hash>] The collection type, the obj's index (if in
182 |     #   the collection, nil otherwise) and the Wgit::Model of obj.
183 |     def get_model_info(obj)
184 |       obj = obj.dup
185 | 
186 |       case obj
187 |       when Wgit::Url
188 |         key        = obj.to_s
189 |         collection = @urls
190 |         index      = @urls.index { |url| url["url"] == key }
191 |         model      = build_model(obj)
192 |       when Wgit::Document
193 |         key        = obj.url.to_s
194 |         collection = @docs
195 |         index      = @docs.index { |doc| doc["url"]&.[]("url") == key }
196 |         model      = build_model(obj)
197 |       else
198 |         raise "obj must be a Wgit::Url or Wgit::Document, not: #{obj.class}"
199 |       end
200 | 
201 |       [collection, index, model]
202 |     end
203 |   end
204 | end
205 | 


--------------------------------------------------------------------------------
/lib/wgit/database/database.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "adapters/mongo_db"
 4 | 
 5 | module Wgit
 6 |   # Module providing a Database connection and CRUD operations for the Url and
 7 |   # Document collections that form the Wgit persistence layer.
 8 |   module Database
 9 |     # The default Database adapter class used by Wgit.
10 |     DEFAULT_ADAPTER_CLASS = Wgit::Database::MongoDB
11 | 
12 |     # The Database adapter class to be used by Wgit. Set this based on the
13 |     # Database you want to use. The adapter doesn't exist yet? Write your own.
14 |     @adapter_class = DEFAULT_ADAPTER_CLASS
15 | 
16 |     class << self
17 |       # The Database adapter class to use with Wgit. The adapter you supply
18 |       # should be a subclass of Wgit::Database::DatabaseAdapter and should
19 |       # implement the methods within it, in order to work with Wgit.
20 |       attr_accessor :adapter_class
21 |     end
22 | 
23 |     # Initializes a DatabaseAdapter instance. Is an alias for:
24 |     # `Wgit::Database.adapter_class.new(connection_string)`
25 |     #
26 |     # @param connection_string [String] The connection string needed to connect
27 |     #   to the database.
28 |     # @raise [StandardError] If a connection string isn't provided, either as a
29 |     #   parameter or via the environment.
30 |     def self.new(connection_string = nil)
31 |       Wgit::Database.adapter_class.new(connection_string)
32 |     end
33 |   end
34 | end
35 | 


--------------------------------------------------------------------------------
/lib/wgit/database/database_adapter.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require_relative "../assertable"
  4 | require_relative "../url"
  5 | require_relative "../document"
  6 | require_relative "../model"
  7 | 
  8 | module Wgit::Database
  9 |   # The parent DatabaseAdapter class that should be inherited from when
 10 |   # creating an underlying Database adapter implementation class e.g.
 11 |   # Wgit::Database::MongoDB.
 12 |   #
 13 |   # Listed in this class are the methods that an implementer class must
 14 |   # implement to work with Wgit. Failure to do so will result in a
 15 |   # NotImplementedError being raised.
 16 |   #
 17 |   # While not required, implementing the method `#search_fields=(fields)` in an
 18 |   # adapter class will allow `Wgit::Model.set_search_fields` to call
 19 |   # it. This allows the search fields to be set in one method call, from within
 20 |   # the Wgit::Model class. See this method's docs for more info.
 21 |   #
 22 |   # Also listed in this class are common helper methods available to all
 23 |   # Database implementer subclasses.
 24 |   class DatabaseAdapter
 25 |     include Wgit::Assertable
 26 | 
 27 |     # The NotImplementedError message that gets raised if an implementor class
 28 |     # doesn't implement a method required by Wgit.
 29 |     NOT_IMPL_ERR = "The DatabaseAdapter class you're using hasn't \
 30 |   implemented this method"
 31 | 
 32 |     ###################### START OF INTERFACE METHODS ######################
 33 | 
 34 |     # Initializes a DatabaseAdapter instance.
 35 |     #
 36 |     # The implementor class should establish a DB connection here using the
 37 |     # given connection_string, falling back to `ENV['WGIT_CONNECTION_STRING']`.
 38 |     # Don't forget to call `super`.
 39 |     #
 40 |     # @param connection_string [String] The connection string needed to connect
 41 |     #   to the database.
 42 |     # @raise [StandardError] If a connection string isn't provided, either as a
 43 |     #   parameter or via the environment.
 44 |     def initialize(connection_string = nil); end
 45 | 
 46 |     # Returns the current size of the database.
 47 |     #
 48 |     # @return [Integer] The current size of the DB.
 49 |     def size
 50 |       raise NotImplementedError, NOT_IMPL_ERR
 51 |     end
 52 | 
 53 |     # Searches the database's Documents for the given query. The
 54 |     # `Wgit::Model.search_fields` should be searched for matches
 55 |     # against the given query. Documents should be sorted starting with the
 56 |     # most relevant. Each returned Document should have it's `score` field set
 57 |     # for relevance.
 58 |     #
 59 |     # @param query [String] The text query to search with.
 60 |     # @param case_sensitive [Boolean] Whether character case must match.
 61 |     # @param whole_sentence [Boolean] Whether multiple words should be searched
 62 |     #   for separately.
 63 |     # @param limit [Integer] The max number of results to return.
 64 |     # @param skip [Integer] The number of results to skip.
 65 |     # @yield [doc] Given each search result (Wgit::Document) returned from the
 66 |     #   DB.
 67 |     # @return [Array<Wgit::Document>] The search results obtained from the DB.
 68 |     def search(
 69 |       query, case_sensitive: false, whole_sentence: true, limit: 10, skip: 0
 70 |     )
 71 |       raise NotImplementedError, NOT_IMPL_ERR
 72 |     end
 73 | 
 74 |     # Deletes everything in the urls and documents collections.
 75 |     #
 76 |     # @return [Integer] The number of deleted records.
 77 |     def empty
 78 |       raise NotImplementedError, NOT_IMPL_ERR
 79 |     end
 80 | 
 81 |     # Returns Url records that haven't yet been crawled.
 82 |     #
 83 |     # @param limit [Integer] The max number of Url's to return. 0 returns all.
 84 |     # @param skip [Integer] Skip n amount of Url's.
 85 |     # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
 86 |     # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
 87 |     def uncrawled_urls(limit: 0, skip: 0)
 88 |       raise NotImplementedError, NOT_IMPL_ERR
 89 |     end
 90 | 
 91 |     # Inserts or updates the object in the database.
 92 |     #
 93 |     # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
 94 |     # @return [Boolean] True if inserted, false if updated.
 95 |     def upsert(obj)
 96 |       raise NotImplementedError, NOT_IMPL_ERR
 97 |     end
 98 | 
 99 |     # Bulk upserts the objects in the database collection.
100 |     # You cannot mix collection objs types, all must be Urls or Documents.
101 |     #
102 |     # @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
103 |     #   inserted/updated.
104 |     # @return [Integer] The total number of newly inserted objects.
105 |     def bulk_upsert(objs)
106 |       raise NotImplementedError, NOT_IMPL_ERR
107 |     end
108 | 
109 |     ###################### END OF INTERFACE METHODS ######################
110 | 
111 |     private
112 | 
113 |     # Returns the correct Wgit::Database:Model for the given obj type.
114 |     #
115 |     # @param obj [Wgit::Url, Wgit::Document] The obj to obtain a model for.
116 |     # @return [Hash] The obj model.
117 |     def build_model(obj)
118 |       assert_type(obj, [Wgit::Url, Wgit::Document])
119 | 
120 |       if obj.is_a?(Wgit::Url)
121 |         Wgit::Model.url(obj)
122 |       else
123 |         Wgit::Model.document(obj)
124 |       end
125 |     end
126 | 
127 |     # Map each DB hash object into a Wgit::Document. Each Document is yielded
128 |     # if a block is given before returning the mapped Array of Documents.
129 |     def map_documents(doc_hashes)
130 |       doc_hashes.map do |doc|
131 |         doc = Wgit::Document.new(doc)
132 |         yield(doc) if block_given?
133 |         doc
134 |       end
135 |     end
136 | 
137 |     # Map each DB hash object into a Wgit::Url. Each Url is yielded
138 |     # if a block is given before returning the mapped Array of Urls.
139 |     def map_urls(url_hashes)
140 |       url_hashes.map do |url|
141 |         url = Wgit::Url.new(url)
142 |         yield(url) if block_given?
143 |         url
144 |       end
145 |     end
146 |   end
147 | end
148 | 


--------------------------------------------------------------------------------
/lib/wgit/document_extractors.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | ### Default Document Extractors ###
 4 | 
 5 | # Base: the //base/@href value, parsed with Wgit::Url.parse? when present.
 6 | Wgit::Document.define_extractor(
 7 |   :base,
 8 |   "//base/@href",
 9 |   singleton: true,
10 |   text_content_only: true
11 | ) do |base|
12 |   Wgit::Url.parse?(base) if base
13 | end
14 | 
15 | # Title: the //title value.
16 | Wgit::Document.define_extractor(
17 |   :title,
18 |   "//title",
19 |   singleton: true,
20 |   text_content_only: true
21 | )
22 | 
23 | # Description: the content of the meta tag named "description".
24 | Wgit::Document.define_extractor(
25 |   :description,
26 |   '//meta[@name="description"]/@content',
27 |   singleton: true,
28 |   text_content_only: true
29 | )
30 | 
31 | # Author: the content of the meta tag named "author".
32 | Wgit::Document.define_extractor(
33 |   :author,
34 |   '//meta[@name="author"]/@content',
35 |   singleton: true,
36 |   text_content_only: true
37 | )
38 | 
39 | # Keywords: "keywords" meta content, comma split and sanitized for :document types.
40 | Wgit::Document.define_extractor(
41 |   :keywords,
42 |   '//meta[@name="keywords"]/@content',
43 |   singleton: true,
44 |   text_content_only: true
45 | ) do |keywords, _source, type|
46 |   if keywords && type == :document
47 |     keywords = keywords.split(",")
48 |     keywords = Wgit::Utils.sanitize(keywords)
49 |   end
50 | 
51 |   keywords
52 | end
53 | 
54 | # Links: each //a/@href value passed through Wgit::Url.parse?, nils dropped.
55 | Wgit::Document.define_extractor(
56 |   :links,
57 |   "//a/@href",
58 |   singleton: false,
59 |   text_content_only: true
60 | ) do |links|
61 |   links
62 |     .map { |link| Wgit::Url.parse?(link) }
63 |     .compact # Remove unparsable links.
64 | end
65 | 
66 | # Text: for :document types, extracted from doc.parser by Wgit::HTMLToText.
67 | Wgit::Document.define_extractor(
68 |   :text,
69 |   nil # doc.parser contains all HTML so omit the xpath search.
70 | ) do |text, doc, type|
71 |   if type == :document
72 |     html_to_text = Wgit::HTMLToText.new(doc.parser)
73 |     text = html_to_text.extract
74 |   end
75 | 
76 |   text
77 | end
78 | 


--------------------------------------------------------------------------------
/lib/wgit/html_to_text.rb:
--------------------------------------------------------------------------------
  1 | require_relative "utils"
  2 | require_relative "assertable"
  3 | require "nokogiri"
  4 | 
  5 | module Wgit
  6 |   # Class used to extract the visible page text from a parsed HTML document.
  7 |   # This is in turn used to set the output of a Wgit::Document#text method.
  8 |   class HTMLToText
  9 |     include Assertable
 10 | 
 11 |     # Set of text elements used to extract the visible text.
 12 |     # The element's display (:inline or :block) is used to delimit sentences e.g.
 13 |     # <div>foo</div><div>bar</div> will be extracted as ['foo', 'bar'] whereas
 14 |     # <span>foo</span><span>bar</span> will be extracted as ['foobar'].
 15 |     @text_elements = {
 16 |       a:          :inline,
 17 |       abbr:       :inline,
 18 |       address:    :block,
 19 |       article:    :block,
 20 |       aside:      :block,
 21 |       b:          :inline,
 22 |       bdi:        :inline,
 23 |       bdo:        :inline,
 24 |       blockquote: :block,
 25 |       br:         :block,
 26 |       button:     :block, # Normally inline but Wgit treats as block.
 27 |       caption:    :block,
 28 |       cite:       :inline,
 29 |       code:       :inline,
 30 |       data:       :inline,
 31 |       dd:         :block,
 32 |       del:        :inline,
 33 |       details:    :block,
 34 |       dfn:        :inline,
 35 |       div:        :block,
 36 |       dl:         :block,
 37 |       dt:         :block,
 38 |       em:         :inline,
 39 |       figcaption: :block,
 40 |       figure:     :block,
 41 |       footer:     :block,
 42 |       h1:         :block,
 43 |       h2:         :block,
 44 |       h3:         :block,
 45 |       h4:         :block,
 46 |       h5:         :block,
 47 |       h6:         :block,
 48 |       header:     :block,
 49 |       hr:         :block,
 50 |       i:          :inline,
 51 |       input:      :inline,
 52 |       ins:        :block,
 53 |       kbd:        :inline,
 54 |       label:      :inline,
 55 |       legend:     :block,
 56 |       li:         :block,
 57 |       main:       :block,
 58 |       mark:       :inline,
 59 |       meter:      :block,
 60 |       ol:         :block,
 61 |       option:     :block,
 62 |       output:     :block,
 63 |       p:          :block,
 64 |       pre:        :block,
 65 |       q:          :inline,
 66 |       rb:         :inline,
 67 |       rt:         :inline,
 68 |       ruby:       :inline,
 69 |       s:          :inline,
 70 |       samp:       :inline,
 71 |       section:    :block,
 72 |       small:      :inline,
 73 |       span:       :inline,
 74 |       strong:     :inline,
 75 |       sub:        :inline,
 76 |       summary:    :block,
 77 |       sup:        :inline,
 78 |       td:         :block,
 79 |       textarea:   :block,
 80 |       th:         :block,
 81 |       time:       :inline,
 82 |       u:          :inline,
 83 |       ul:         :block,
 84 |       var:        :inline,
 85 |       wbr:        :inline
 86 |     }
 87 | 
 88 |     class << self
 89 |       # Set of HTML elements that make up the visible text on a page. These
 90 |       # elements are used to initialize the Wgit::Document#text. See the
 91 |       # README.md for how to add to this Hash dynamically.
 92 |       attr_reader :text_elements
 93 |     end
 94 | 
 95 |     # The Nokogiri::HTML4::Document parser object given to #initialize.
 96 |     attr_reader :parser
 97 | 
 98 |     # Creates a new HTML to text extractor instance.
 99 |     #
100 |     # @param parser [Nokogiri::HTML4::Document] The nokogiri parser object.
101 |     # @raise [StandardError] If the given parser is of an invalid type.
102 |     def initialize(parser)
103 |       assert_type(parser, Nokogiri::HTML4::Document)
104 | 
105 |       @parser = parser
106 |     end
107 | 
108 |     # Extracts and returns the text sentences from the @parser HTML.
109 |     #
110 |     # @return [Array<String>] The stripped, non empty lines of #extract_str.
111 |     def extract_arr
112 |       return [] if @parser.to_s.empty?
113 | 
114 |       text_str = extract_str
115 | 
116 |       # Split the text_str into an Array of text sentences.
117 |       text_str
118 |         .split("\n")
119 |         .map(&:strip)
120 |         .reject(&:empty?)
121 |     end
122 | 
123 |     # Extracts and returns a text string from the @parser HTML.
124 |     #
125 |     # @return [String] A string of text with \n delimiting sentences.
126 |     def extract_str
127 |       text_str = ""
128 | 
129 |       iterate_child_nodes(@parser) do |node, display|
130 |         # Handle any special cases e.g. skip nodes we don't care about...
131 |         # <pre> nodes should have their contents displayed exactly as is.
132 |         if node_name(node) == :pre
133 |           text_str << "\n"
134 |           text_str << node.text
135 |           next
136 |         end
137 | 
138 |         # Skip any child node of <pre> since they're handled as a special case above.
139 |         next if child_of?(:pre, node)
140 | 
141 |         if node.text?
142 |           # Skip any text element that is purely whitespace.
143 |           next unless valid_text_content?(node.text)
144 |         else
145 |           # Skip a concrete node if it has other concrete child nodes as these
146 |           # will be iterated onto later.
147 |           #
148 |           # Process if node has no children or one child which is a valid text node.
149 |           next unless node.children.empty? || parent_of_text_node_only?(node)
150 |         end
151 | 
152 |         # Apply display rules deciding if a new line is needed before node.text.
153 |         add_new_line = false
154 |         prev = prev_sibling_or_parent(node)
155 | 
156 |         if node.text?
157 |           add_new_line = true unless prev && inline?(prev)
158 |         else
159 |           add_new_line = true if display == :block
160 |           add_new_line = true if prev && block?(prev)
161 |         end
162 | 
163 |         text_str << "\n" if add_new_line
164 |         text_str << format_text(node.text)
165 |       end
166 | 
167 |       text_str
168 |         .strip
169 |         .squeeze("\n") # Collapse consecutive new lines into one.
170 |         .squeeze(" ")  # Collapse consecutive spaces into one.
171 |     end
172 | 
173 |     private
174 | 
175 |     def node_name(node) # The node's name as a lowercase Symbol (or nil).
176 |       node.name&.downcase&.to_sym
177 |     end
178 | 
179 |     def display(node) # :inline, :block or nil if not in text_elements.
180 |       name = node_name(node)
181 |       Wgit::HTMLToText.text_elements[name]
182 |     end
183 | 
184 |     def inline?(node)
185 |       display(node) == :inline
186 |     end
187 | 
188 |     def block?(node)
189 |       display(node) == :block
190 |     end
191 | 
192 |     # Returns the previous sibling of node or nil. A text sibling which duplicates
193 |     # its parent's text, or is blank and contains a new line, is skipped over.
194 |     def prev_sibling(node)
195 |       prev = node.previous
196 | 
197 |       return nil unless prev
198 |       return prev unless prev.text?
199 |       return prev if valid_text_node?(prev) && !contains_new_line?(prev.text)
200 |       return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty?
201 | 
202 |       prev.previous
203 |     end
204 | 
205 |     # Returns node's previous sibling (as chosen by #prev_sibling) or, when there
206 |     # isn't one, node's parent (which may itself be nil).
207 |     def prev_sibling_or_parent(node)
208 |       prev = prev_sibling(node)
209 |       return prev if prev
210 | 
211 |       node.parent
212 |     end
213 | 
214 |     def child_of?(ancestor_name, node) # True if any ancestor has that name.
215 |       node.ancestors.any? { |ancestor| node_name(ancestor) == ancestor_name }
216 |     end
217 | 
218 |     # Returns true if any of the child nodes contain a non empty :text node.
219 |     def parent_of_text_node?(node)
220 |       node.children.any? { |child| child.text? && valid_text_content?(child.text) }
221 |     end
222 | 
223 |     def parent_of_text_node_only?(node) # Only child is a non empty text node.
224 |       node.children.size == 1 && parent_of_text_node?(node)
225 |     end
226 | 
227 |     # Returns true if text is not empty having been passed through #format_text.
228 |     def valid_text_content?(text)
229 |       !format_text(text).empty?
230 |     end
231 | 
232 |     # Returns true if node is a text node whose text differs from its parent's.
233 |     # Duplicate text nodes (with the same text as their parent) are omitted.
234 |     def valid_text_node?(node)
235 |       node.text? && node.text != node.parent.text
236 |     end
237 | 
238 |     def contains_new_line?(text) # A real new line or a literal backslash-n.
239 |       ["\n", '\\n'].any? { |new_line| text.include?(new_line) }
240 |     end
241 | 
242 |     # Remove special characters including any new lines; as semantic HTML will
243 |     # typically use <br> and/or block elements to denote a line break.
244 |     def format_text(text)
245 |       text
246 |         .encode("UTF-8", undef: :replace, invalid: :replace)
247 |         .gsub("\n",       "")
248 |         .gsub('\\n',      "")
249 |         .gsub("\r",       "")
250 |         .gsub('\\r',      "")
251 |         .gsub("\f",       "")
252 |         .gsub('\\f',      "")
253 |         .gsub("\t",       "")
254 |         .gsub('\\t',      "")
255 |         .gsub("&zwnj;",   "")
256 |         .gsub("&nbsp;",   " ")
257 |         .gsub("&#160;",   " ")
258 |         .gsub("&thinsp;", " ")
259 |         .gsub("&ensp;",   " ")
260 |         .gsub("&emsp;",   " ")
261 |         .gsub('\u00a0',   " ") # NOTE(review): single-quoted, so this matches the literal text, not the U+00A0 char - confirm intent.
262 |     end
263 | 
264 |     # Iterate over node and its child nodes, yielding each to &block.
265 |     # Only HTMLToText.text_elements or valid :text nodes will be yielded.
266 |     # Duplicate text nodes (with the same text as their parent) are omitted.
267 |     def iterate_child_nodes(node, &block)
268 |       display = display(node)
269 |       text_node = valid_text_node?(node)
270 | 
271 |       yield(node, display) if display || text_node
272 |       node.children.each { |child| iterate_child_nodes(child, &block) }
273 |     end
274 | 
275 |     alias_method :extract, :extract_arr
276 |   end
277 | end
278 | 


--------------------------------------------------------------------------------
/lib/wgit/logger.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | # FYI: The default logger is set at the bottom of this file.
 4 | 
 5 | require "logger"
 6 | 
 7 | module Wgit
 8 |   # The Logger instance used by Wgit. Set your own custom logger after
 9 |   # requiring this file as needed.
10 |   @logger = nil
11 | 
12 |   # Returns the current Logger instance.
13 |   #
14 |   # @return [Logger] The current Logger instance.
15 |   def self.logger
16 |     @logger
17 |   end
18 | 
19 |   # Sets the current Logger instance.
20 |   #
21 |   # @param logger [Logger] The Logger instance to use.
22 |   # @return [Logger] The current Logger instance having been set.
23 |   def self.logger=(logger)
24 |     @logger = logger
25 |   end
26 | 
27 |   # Builds a new default Logger which writes "[progname] msg" lines to $stdout.
28 |   #
29 |   # @return [Logger] A new Logger with progname "wgit" and level :info.
30 |   def self.default_logger
31 |     logger = Logger.new($stdout, progname: "wgit", level: :info)
32 |     logger.formatter = proc do |_severity, _datetime, progname, msg|
33 |       "[#{progname}] #{msg}\n"
34 |     end
35 |     logger
36 |   end
37 | 
38 |   # Sets the Logger used by Wgit to a new default Logger instance.
39 |   #
40 |   # @return [Logger] The default Logger instance.
41 |   def self.use_default_logger
42 |     @logger = default_logger
43 |   end
44 | end
45 | 
46 | Wgit.use_default_logger # Install the default logger when this file loads.
47 | 


--------------------------------------------------------------------------------
/lib/wgit/model.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require_relative "./utils"
  4 | 
  5 | module Wgit
  6 |   # Module used to build the Database collection objects, forming a data model.
  7 |   # The models produced are Hash like and therefore DB agnostic. Each model
  8 |   # will contain a unique field used for searching and avoiding duplicates,
  9 |   # this is typically a `url` field. Also contained in the model are the
 10 |   # search fields used in Database and Document #search calls.
 11 |   module Model
 12 |     # The default search fields used in Database and Document #search calls.
 13 |     # The number of matches for each field is multiplied by the field weight,
 14 |     # the total is the search score, used to sort the search results.
 15 |     # Call `Wgit::Model.set_default_search_fields` to revert to default.
 16 |     DEFAULT_SEARCH_FIELDS = {
 17 |       title: 2,
 18 |       description: 2,
 19 |       keywords: 2,
 20 |       text: 1
 21 |     }.freeze
 22 | 
 23 |     # The search fields used in Database and Document #search calls.
 24 |     # The number of matches for each field is multiplied by the field weight,
 25 |     # the total is the search score, used to sort the search results.
 26 |     # Call `Wgit::Model.set_default_search_fields` to revert to default.
 27 |     @search_fields = DEFAULT_SEARCH_FIELDS
 28 | 
 29 |     # Whether or not to include the Document#html in the #document model.
 30 |     @include_doc_html = false
 31 | 
 32 |     # Whether or not to include the Document#score in the #document model.
 33 |     @include_doc_score = false
 34 | 
 35 |     class << self
 36 |       # The search fields used in Database and Document #search calls.
 37 |       # A custom setter method is also provided for changing these fields.
 38 |       attr_reader :search_fields
 39 | 
 40 |       # Whether or not to include the Document#html in the #document model.
 41 |       attr_accessor :include_doc_html
 42 | 
 43 |       # Whether or not to include the Document#score in the #document model.
 44 |       attr_accessor :include_doc_score
 45 |     end
 46 | 
 47 |     # Sets the search fields used in Database and Document #search calls.
 48 |     #
 49 |     # You can pass the fields as an Array of Symbols which gives each field a
 50 |     # weight of 1 meaning all fields are considered of equal value. Or you can
 51 |     # pass a Hash of Symbol => Int and specify the weights yourself, allowing
 52 |     # you to customise the search rankings.
 53 |     #
 54 |     # Use like:
 55 |     # ```
 56 |     # Wgit::Model.set_search_fields [:title, :text], db
 57 |     # => { title: 1, text: 1 }
 58 |     # Wgit::Model.set_search_fields({ title: 2, text: 1 }, db)
 59 |     # => { title: 2, text: 1 }
 60 |     # ```
 61 |     #
 62 |     # If the given db (database) param responds to #search_fields= then it will
 63 |     # be called and given the fields to set. This should perform whatever the
 64 |     # database adapter needs in order to search using the given fields e.g.
 65 |     # creating a search index. Calling the DB enables the search_fields to be
 66 |     # set globally within Wgit by one method call, this one.
 67 |     #
 68 |     # @param fields [Array<Symbol>, Hash<Symbol, Integer>] The field names or
 69 |     #   the field names with their corresponding search weights.
 70 |     # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
 71 |     #   db responds to #search_fields=, it will be called and given the fields.
 72 |     # @raise [StandardError] If fields is of an incorrect type.
 73 |     # @return [Hash<Symbol, Integer>] The fields and their weights.
 74 |     def self.set_search_fields(fields, db = nil)
 75 |       # We need a Hash of fields => weights (Symbols => Integers).
 76 |       case fields
 77 |       when Array # of Strings/Symbols.
 78 |         fields = fields.map { |field| [field.to_sym, 1] }
 79 |       when Hash  # of Strings/Symbols and Integers.
 80 |         fields = fields.map { |field, weight| [field.to_sym, weight.to_i] }
 81 |       else
 82 |         raise "fields must be an Array or Hash, not a #{fields.class}"
 83 |       end
 84 | 
 85 |       @search_fields = fields.to_h
 86 |       db.search_fields = @search_fields if db.respond_to?(:search_fields=)
 87 | 
 88 |       @search_fields
 89 |     end
 90 | 
 91 |     # Reverts the search fields used in #search calls to DEFAULT_SEARCH_FIELDS.
 92 |     #
 93 |     # If the given db (database) param responds to #search_fields= then it will
 94 |     # be called and given the fields to set. This should perform whatever the
 95 |     # database adapter needs in order to search using the given fields e.g.
 96 |     # creating a search index. Calling the DB enables the search_fields to be
 97 |     # set globally within Wgit by one method call, this one.
 98 |     #
 99 |     # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
100 |     #   db responds to #search_fields=, it will be called and given the fields.
101 |     # @return [Hash<Symbol, Integer>] The fields and their weights.
102 |     def self.set_default_search_fields(db = nil)
103 |       set_search_fields(DEFAULT_SEARCH_FIELDS, db)
104 |     end
105 | 
106 |     # The data model for a Wgit::Url collection object and for an embedded
107 |     # 'url' inside a Wgit::Document collection object.
108 |     #
109 |     # The unique field for this model is `model['url']`.
110 |     #
111 |     # @param url [Wgit::Url] The Url data object.
112 |     # @return [Hash] The URL model ready for DB insertion.
113 |     def self.url(url)
114 |       raise "url must respond_to? :to_h" unless url.respond_to?(:to_h)
115 | 
116 |       model = url.to_h
117 |       select_bson_types(model)
118 |     end
119 | 
120 |     # The data model for a Wgit::Document collection object.
121 |     #
122 |     # The unique field for this model is `model['url']['url']`.
123 |     #
124 |     # @param doc [Wgit::Document] The Document data object.
125 |     # @return [Hash] The Document model ready for DB insertion.
126 |     def self.document(doc)
127 |       raise "doc must respond_to? :to_h" unless doc.respond_to?(:to_h)
128 | 
129 |       model = doc.to_h(
130 |         include_html: @include_doc_html, include_score: @include_doc_score
131 |       )
132 |       model["url"] = url(doc.url) # Expand Url String into full object.
133 | 
134 |       select_bson_types(model)
135 |     end
136 | 
137 |     # Common fields when inserting a record into the DB.
138 |     #
139 |     # @return [Hash] Insertion fields common to all models.
140 |     def self.common_insert_data
141 |       {
142 |         date_added:    Wgit::Utils.time_stamp,
143 |         date_modified: Wgit::Utils.time_stamp
144 |       }
145 |     end
146 | 
147 |     # Common fields when updating a record in the DB.
148 |     #
149 |     # @return [Hash] Update fields common to all models.
150 |     def self.common_update_data
151 |       {
152 |         date_modified: Wgit::Utils.time_stamp
153 |       }
154 |     end
155 | 
156 |     # Returns the model having removed non bson types (for use with MongoDB).
157 |     #
158 |     # @param model_hash [Hash] The model Hash to sanitize.
159 |     # @return [Hash] Only the pairs whose value responds to #bson_type.
160 |     def self.select_bson_types(model_hash)
161 |       model_hash.select { |_k, v| v.respond_to?(:bson_type) }
162 |     end
163 |   end
164 | end
165 | 


--------------------------------------------------------------------------------
/lib/wgit/response.rb:
--------------------------------------------------------------------------------
  1 | module Wgit
  2 |   # Response class modeling a generic HTTP GET response.
  3 |   class Response
  4 |     # The underlying HTTP adapter/library response object.
  5 |     attr_accessor :adapter_response
  6 | 
  7 |     # The HTML response body.
  8 |     attr_reader   :body
  9 | 
 10 |     # The HTTP response headers.
 11 |     attr_reader   :headers
 12 | 
 13 |     # The servers IP address.
 14 |     attr_accessor :ip_address
 15 | 
 16 |     # The redirections of the response.
 17 |     attr_reader   :redirections
 18 | 
 19 |     # The HTTP response status code.
 20 |     attr_reader   :status
 21 | 
 22 |     # The total crawl/network time for the response.
 23 |     attr_reader   :total_time
 24 | 
 25 |     # The HTTP request URL.
 26 |     attr_accessor :url
 27 | 
 28 |     # Defaults some values and returns a "blank" Wgit::Response object.
 29 |     def initialize
 30 |       @body         = ""
 31 |       @headers      = {}
 32 |       @redirections = {}
 33 |       @total_time   = 0.0
 34 |     end
 35 | 
 36 |     # Overrides Object#inspect to shorten the printed output of a Response.
 37 |     #
 38 |     # @return [String] A short textual representation of this Response.
 39 |     def inspect
 40 |       "#<Wgit::Response url=\"#{@url}\" status=#{status}>"
 41 |     end
 42 | 
 43 |     # Adds time to @total_time (incrementally).
 44 |     #
 45 |     # @param time [Float, nil] The time to add to @total_time; nil adds 0.0.
 46 |     # @return [Float] @total_time's new value.
 47 |     def add_total_time(time)
 48 |       @total_time += time || 0.0
 49 |     end
 50 | 
 51 |     # Sets the HTML response body.
 52 |     #
 53 |     # @param str [String, nil] The new HTML body; nil is stored as "".
 54 |     # @return [String] @body's new value.
 55 |     def body=(str)
 56 |       @body = str || ""
 57 |     end
 58 | 
 59 |     # Returns the HTML response body or nil (if it's empty).
 60 |     #
 61 |     # @return [String, NilClass] The HTML body or nil if empty.
 62 |     def body_or_nil
 63 |       @body.empty? ? nil : @body
 64 |     end
 65 | 
 66 |     # Returns whether or not a server response is absent.
 67 |     #
 68 |     # @return [Boolean] True if #success? is false, false otherwise.
 69 |     def failure?
 70 |       !success?
 71 |     end
 72 | 
 73 |     # Sets the headers Hash to the given value. The header keys are downcased and
 74 |     # have "-" replaced with "_" before being turned into Symbols.
 75 |     #
 76 |     # @param headers [Hash, nil] The new response headers; nil resets to {}.
 77 |     # @return [Hash] @headers's new value.
 78 |     def headers=(headers)
 79 |       unless headers
 80 |         @headers = {}
 81 |         return
 82 |       end
 83 | 
 84 |       @headers = headers.transform_keys { |k| k.downcase.gsub("-", "_").to_sym }
 85 |     end
 86 | 
 87 |     # Returns whether or not the response is 404 Not Found.
 88 |     #
 89 |     # @return [Boolean] True if 404 Not Found, false otherwise.
 90 |     def not_found?
 91 |       @status == 404
 92 |     end
 93 | 
 94 |     # Returns whether or not the response is 200 OK.
 95 |     #
 96 |     # @return [Boolean] True if 200 OK, false otherwise.
 97 |     def ok?
 98 |       @status == 200
 99 |     end
100 | 
101 |     # Returns whether or not the response is a 3xx Redirect.
102 |     #
103 |     # @return [Boolean] True if 3xx Redirect, false otherwise.
104 |     def redirect?
105 |       return false unless @status
106 | 
107 |       @status.between?(300, 399)
108 |     end
109 | 
110 |     # Returns the number of redirects this response has had.
111 |     #
112 |     # @return [Integer] The number of response redirects.
113 |     def redirect_count
114 |       @redirections.size
115 |     end
116 | 
117 |     # Returns the size of the response body.
118 |     #
119 |     # @return [Integer] The body's String#size (characters, not bytes).
120 |     def size
121 |       @body.size
122 |     end
123 | 
124 |     # Sets the HTTP response status.
125 |     #
126 |     # @param int [Integer] The new response status; values < 1 are stored as nil.
127 |     # @return [Integer] @status' new value.
128 |     def status=(int)
129 |       @status = int.positive? ? int : nil
130 |     end
131 | 
132 |     # Returns whether or not a server response is present.
133 |     #
134 |     # @return [Boolean] True if the status is > 0, false otherwise.
135 |     def success?
136 |       return false unless @status
137 | 
138 |       @status.positive?
139 |     end
140 | 
141 |     # Returns whether or not Wgit is banned from indexing this site.
142 |     #
143 |     # @return [Boolean] True if the :x_robots_tag header equals "noindex" (any
144 |     #   case, surrounding whitespace ignored), false otherwise.
145 |     def no_index?
146 |       headers.fetch(:x_robots_tag, "").downcase.strip == "noindex"
147 |     end
148 | 
149 |     alias_method :code,           :status
150 |     alias_method :content,        :body
151 |     alias_method :crawl_duration, :total_time
152 |     alias_method :to_s,           :body
153 |     alias_method :redirects,      :redirections
154 |     alias_method :length,         :size
155 |   end
156 | end
157 | 


--------------------------------------------------------------------------------
/lib/wgit/robots_parser.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | module Wgit
  4 |   # The RobotsParser class handles parsing and processing of a web servers
  5 |   # robots.txt file.
  6 |   class RobotsParser
  7 |     include Wgit::Assertable
  8 | 
  9 |     # Key representing the start of a comment.
 10 |     KEY_COMMENT    = "#"
 11 |     # Key value separator used in robots.txt files.
 12 |     KEY_SEPARATOR  = ":"
 13 |     # Key representing a user agent.
 14 |     KEY_USER_AGENT = "User-agent"
 15 |     # Key representing an allow URL rule.
 16 |     KEY_ALLOW      = "Allow"
 17 |     # Key representing a disallow URL rule.
 18 |     KEY_DISALLOW   = "Disallow"
 19 | 
 20 |     # Value representing the Wgit user agent.
 21 |     USER_AGENT_WGIT = :wgit
 22 |     # Value representing any user agent including Wgit.
 23 |     USER_AGENT_ANY  = :*
 24 | 
 25 |     # Value representing any and all paths.
 26 |     PATHS_ALL = %w[/ *].freeze
 27 | 
 28 |     # Hash containing the user-agent allow/disallow URL rules. Looks like:
 29 |     #   allow_paths:    ["/"]
 30 |     #   disallow_paths: ["/accounts", ...]
 31 |     attr_reader :rules
 32 | 
 33 |     # Initializes and returns a Wgit::RobotsParser instance having parsed the
 34 |     # robot.txt contents.
 35 |     #
 36 |     # @param contents [String, #to_s] The contents of the robots.txt file to be
 37 |     #   parsed.
 38 |     def initialize(contents)
 39 |       @rules = {
 40 |         allow_paths: Set.new,
 41 |         disallow_paths: Set.new
 42 |       }
 43 | 
 44 |       assert_respond_to(contents, :to_s)
 45 |       parse(contents.to_s)
 46 |     end
 47 | 
 48 |     # Overrides String#inspect to shorten the printed output of a Parser.
 49 |     #
 50 |     # @return [String] A short textual representation of this Parser.
 51 |     def inspect
 52 |       "#<Wgit::RobotsParser has_rules=#{rules?} no_index=#{no_index?}>"
 53 |     end
 54 | 
 55 |     # Returns the allow paths/rules for this parser's robots.txt contents.
 56 |     #
 57 |     # @return [Array<String>] The allow paths/rules to follow.
 58 |     def allow_paths
 59 |       @rules[:allow_paths].to_a
 60 |     end
 61 | 
 62 |     # Returns the disallow paths/rules for this parser's robots.txt contents.
 63 |     #
 64 |     # @return [Array<String>] The disallow paths/rules to follow.
 65 |     def disallow_paths
 66 |       @rules[:disallow_paths].to_a
 67 |     end
 68 | 
 69 |     # Returns whether or not there are rules applying to Wgit.
 70 |     #
 71 |     # @return [Boolean] True if there are rules for Wgit to follow, false
 72 |     #   otherwise.
 73 |     def rules?
 74 |       allow_rules? || disallow_rules?
 75 |     end
 76 | 
 77 |     # Returns whether or not there are allow rules applying to Wgit.
 78 |     #
 79 |     # @return [Boolean] True if there are allow rules for Wgit to follow,
 80 |     #   false otherwise.
 81 |     def allow_rules?
 82 |       @rules[:allow_paths].any?
 83 |     end
 84 | 
 85 |     # Returns whether or not there are disallow rules applying to Wgit.
 86 |     #
 87 |     # @return [Boolean] True if there are disallow rules for Wgit to follow,
 88 |     #   false otherwise.
 89 |     def disallow_rules?
 90 |       @rules[:disallow_paths].any?
 91 |     end
 92 | 
 93 |     # Returns whether or not Wgit is banned from indexing this site.
 94 |     #
 95 |     # @return [Boolean] True if Wgit should not index this site, false
 96 |     #   otherwise.
 97 |     def no_index?
 98 |       @rules[:disallow_paths].any? { |path| PATHS_ALL.include?(path) }
 99 |     end
100 | 
101 |     private
102 | 
103 |     # Parses the file contents and sets @rules.
104 |     def parse(contents)
105 |       user_agents = []
106 |       new_block = false
107 | 
108 |       contents.split("\n").each do |line|
109 |         line.strip!
110 |         next if line.empty? || line.start_with?(KEY_COMMENT)
111 | 
112 |         # A user agent block is denoted by N User-agent's followed by N
113 |         # Allow/Disallow's. After which a new block is formed from scratch.
114 |         if start_with_any_case?(line, KEY_USER_AGENT)
115 |           if new_block
116 |             user_agents = []
117 |             new_block = false
118 |           end
119 |           user_agents << remove_key(line, KEY_USER_AGENT).downcase.to_sym
120 |         else
121 |           new_block = true
122 |         end
123 | 
124 |         if start_with_any_case?(line, KEY_ALLOW)
125 |           append_allow_rule(user_agents, line)
126 |         elsif start_with_any_case?(line, KEY_DISALLOW)
127 |           append_disallow_rule(user_agents, line)
128 |         elsif !start_with_any_case?(line, KEY_USER_AGENT)
129 |           Wgit.logger.debug("Skipping unsupported robots.txt line: #{line}")
130 |         end
131 |       end
132 |     end
133 | 
134 |     # Implements start_with? but case insensitive.
135 |     def start_with_any_case?(str, prefix)
136 |       str.downcase.start_with?(prefix.downcase)
137 |     end
138 | 
139 |     # Returns line with key removed (if present). Otherwise line is returned
140 |     # as given.
141 |     def remove_key(line, key)
142 |       return line unless start_with_any_case?(line, key)
143 |       return line unless line.count(KEY_SEPARATOR) == 1
144 | 
145 |       segs = line.split(KEY_SEPARATOR)
146 |       return "" if segs.size == 1
147 | 
148 |       segs.last.strip
149 |     end
150 | 
151 |     # Don't append * or /, as this means all paths, which is the same as no
152 |     # allow_paths when passed to Wgit::Crawler.
153 |     def append_allow_rule(user_agents, line)
154 |       return unless wgit_user_agent?(user_agents)
155 | 
156 |       path = remove_key(line, KEY_ALLOW)
157 |       path = parse_special_syntax(path)
158 |       return if PATHS_ALL.include?(path)
159 | 
160 |       @rules[:allow_paths] << path
161 |     end
162 | 
163 |     def append_disallow_rule(user_agents, line)
164 |       return unless wgit_user_agent?(user_agents)
165 | 
166 |       path = remove_key(line, KEY_DISALLOW)
167 |       path = parse_special_syntax(path)
168 |       @rules[:disallow_paths] << path
169 |     end
170 | 
171 |     def wgit_user_agent?(user_agents)
172 |       user_agents.any? do |agent|
173 |         [USER_AGENT_ANY, USER_AGENT_WGIT].include?(agent.downcase)
174 |       end
175 |     end
176 | 
177 |     def parse_special_syntax(path)
178 |       # Remove $ e.g. "/blah$" becomes "/blah"
179 |       path = path.gsub("$", "")
180 | 
181 |       # Remove any inline comments e.g. "/blah # comment" becomes "/blah"
182 |       path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}")
183 | 
184 |       # Replace an empty path with * e.g. "Allow: " becomes "Allow: *"
185 |       path = "*" if path.empty?
186 | 
187 |       path
188 |     end
189 | 
190 |     alias_method :paths, :rules
191 |     alias_method :banned?, :no_index?
192 |   end
193 | end
194 | 


--------------------------------------------------------------------------------
/lib/wgit/version.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | # Wgit is a WWW indexer/scraper which crawls URLs and retrieves their page
 4 | # contents for later use.
 5 | #
 6 | # @author Michael Telford
 7 | module Wgit
 8 |   # The current gem version of Wgit.
 9 |   VERSION = "0.12.0"
10 | 
11 |   # Returns the current gem version of Wgit as a String.
12 |   def self.version
13 |     VERSION
14 |   end
15 | 
16 |   # Returns the current gem version in a presentation String e.g. "wgit v0.12.0".
17 |   def self.version_str
18 |     "wgit v#{VERSION}"
19 |   end
20 | end
21 | 


--------------------------------------------------------------------------------
/load.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | # Development script which loads (all changes to) the code when called.
 4 | # Note this script doesn't establish a connection to the database.
 5 | 
 6 | load "lib/wgit/version.rb"
 7 | load "lib/wgit/logger.rb"
 8 | load "lib/wgit/assertable.rb"
 9 | load "lib/wgit/utils.rb"
10 | load "lib/wgit/url.rb"
11 | load "lib/wgit/html_to_text.rb"
12 | load "lib/wgit/document.rb"
13 | load "lib/wgit/document_extractors.rb"
14 | load "lib/wgit/crawler.rb"
15 | load "lib/wgit/model.rb"
16 | load "lib/wgit/database/database.rb"
17 | load "lib/wgit/database/database_adapter.rb"
18 | load "lib/wgit/database/adapters/mongo_db.rb"
19 | load "lib/wgit/database/adapters/in_memory.rb"
20 | load "lib/wgit/robots_parser.rb"
21 | load "lib/wgit/indexer.rb"
22 | load "lib/wgit/dsl.rb"
23 | load "lib/wgit/base.rb"
24 | load "lib/wgit/core_ext.rb"
25 | 
26 | include Wgit # Remove the name space around code (for development purposes).
27 | include DSL
28 | include Assertable
29 | 


--------------------------------------------------------------------------------
/test/helpers/database_helper.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require_relative "database_test_data"
  4 | 
  5 | # Helper module used to manipulate any database adapter. This module should
  6 | # be included in other DB helper modules. To do so, you must implement the
  7 | # following underlying methods:
  8 | #
  9 | # db                    # Returns a connected database adapter instance
 10 | # empty_db              # Empties the url and document collections
 11 | # seed_urls(url_hashes) # Seeds the given url hashes
 12 | # seed_docs(doc_hashes) # Seeds the given document hashes
 13 | # url?(url_hash)        # Returns true if the given url hash exists
 14 | # doc?(url_hash)        # Returns true if the given document hash exists
 15 | #
 16 | # The above method implementations should be done using the raw client for
 17 | # your DB adapter, not the Wgit adapter class that you're testing; this way
 18 | # the helpers won't fail before your DB tests fail.
 19 | module DatabaseHelper
 20 |   def self.included(_base)
 21 |     @@urls = []
 22 |     @@docs = []
 23 |   end
 24 | 
 25 |   # Seed what's in the block, comprising of url and doc method calls
 26 |   # (from this module). An integer can be used to specify how many default
 27 |   # objects should be seeded, defaults to 1; or provide your own Wgit:Url and
 28 |   # Wgit:Document instances (which are passed through Wgit::Model). Hashes are
 29 |   # also supported and will be merged with Wgit::Model.common_insert_data.
 30 |   #
 31 |   # Returns the number of seeded/inserted documents in the DB.
 32 |   #
 33 |   # Code example:
 34 |   #   seed do
 35 |   #     url(Wgit::Url | Hash)
 36 |   #     doc(Wgit::Document | Hash)
 37 |   #     urls 3  # Seeds 3 of the default dev url records.
 38 |   #     doc     # Seeds 1 of the default dev doc records.
 39 |   #   end
 40 |   def seed(&block)
 41 |     raise "Must provide a block" unless block_given?
 42 | 
 43 |     @@urls.clear
 44 |     @@docs.clear
 45 | 
 46 |     # &block populates the @@urls and @@docs arrays.
 47 |     instance_eval(&block)
 48 | 
 49 |     seed_urls(@@urls) unless @@urls.empty?
 50 |     seed_docs(@@docs) unless @@docs.empty?
 51 | 
 52 |     @@urls.count + @@docs.count
 53 |   end
 54 | 
 55 |   private
 56 | 
 57 |   # DSL method used within the block passed to DatabaseHelper#seed.
 58 |   # Seeds one or more Wgit::Urls into the DB.
 59 |   def url(url_or_int = 1)
 60 |     case url_or_int
 61 |     when String
 62 |       parsed_url = Wgit::Url.parse(url_or_int)
 63 |       append_url(parsed_url)
 64 |     when Array
 65 |       url_or_int.each { |url| append_url(url) }
 66 |     when Integer
 67 |       url_or_int.times { @@urls << DatabaseTestData.url }
 68 |     else
 69 |       raise "Invalid data type: #{url_or_int.class}"
 70 |     end
 71 |   end
 72 | 
 73 |   # DSL method used within the block passed to DatabaseHelper#seed.
 74 |   # Seeds one or more Wgit::Documents into the DB.
 75 |   def doc(doc_or_int = 1)
 76 |     case doc_or_int
 77 |     when Wgit::Document
 78 |       append_doc(doc_or_int)
 79 |     when Array
 80 |       doc_or_int.each { |doc| append_doc(doc) }
 81 |     when Integer
 82 |       doc_or_int.times { @@docs << DatabaseTestData.doc }
 83 |     else
 84 |       raise "Invalid data type: #{url_or_int.class}"
 85 |     end
 86 |   end
 87 | 
 88 |   # Appends a Url to @@urls.
 89 |   def append_url(url)
 90 |     model_hash = case url
 91 |                  when Wgit::Url
 92 |                    Wgit::Model.url(url)
 93 |                  when Hash
 94 |                    url
 95 |                  else
 96 |                    raise "Invalid data type: #{url.class}"
 97 |                  end
 98 | 
 99 |     @@urls << model_hash.merge(Wgit::Model.common_insert_data)
100 |   end
101 | 
102 |   # Appends a Document to @@docs.
103 |   def append_doc(doc)
104 |     model_hash = case doc
105 |                  when Wgit::Document
106 |                    Wgit::Model.document(doc)
107 |                  when Hash
108 |                    doc
109 |                  else
110 |                    raise "Invalid data type: #{doc.class}"
111 |                  end
112 | 
113 |     @@docs << model_hash.merge(Wgit::Model.common_insert_data)
114 |   end
115 | 
116 |   alias_method :urls, :url
117 |   alias_method :docs, :doc
118 | end
119 | 


--------------------------------------------------------------------------------
/test/helpers/in_memory_helper.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "database_test_data"
 4 | require_relative "database_helper"
 5 | require "mongo"
 6 | 
 7 | # Helper class used to manipulate the InMemory database.
 8 | module InMemoryHelper
 9 |   include DatabaseHelper
10 | 
11 |   # Returns the connected InMemory instance.
12 |   def db
13 |     @db ||= Wgit::Database::InMemory.new
14 |   end
15 | 
16 |   # Deletes everything in the urls and documents collections.
17 |   def empty_db
18 |     # Normally you shouldn't call the adapter class but this just sets new
19 |     # concurrent arrays to the instance vars, so can't really go wrong.
20 |     db.send(:initialize_store)
21 |   end
22 | 
23 |   # Seed an Array of url Hashes into the database.
24 |   def seed_urls(url_hashes)
25 |     url_hashes.each { |url_h| db.url_hashes << url_h }
26 |   end
27 | 
28 |   # Seed an Array of document Hashes into the database.
29 |   def seed_docs(doc_hashes)
30 |     doc_hashes.each { |doc_h| db.doc_hashes << doc_h }
31 |   end
32 | 
33 |   # Returns if the url_hash/record exists in the database.
34 |   def url?(url_hash)
35 |     db.url_hashes.any? { |url| url == url_hash }
36 |   end
37 | 
38 |   # Returns if the doc_hash/record exists in the database.
39 |   def doc?(doc_hash)
40 |     db.doc_hashes.any? { |doc| doc == doc_hash }
41 |   end
42 | end
43 | 


--------------------------------------------------------------------------------
/test/helpers/mongo_db_helper.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "database_test_data"
 4 | require_relative "database_helper"
 5 | require "mongo"
 6 | 
 7 | # Helper class used to manipulate the MongoDB database.
 8 | module MongoDBHelper
 9 |   include DatabaseHelper
10 | 
11 |   # Returns the connected MongoDB instance.
12 |   def db
13 |     @db ||= Wgit::Database::MongoDB.new
14 |   end
15 | 
16 |   # Deletes everything in the urls and documents collections.
17 |   def empty_db
18 |     db.client[:urls].delete_many({})
19 |     db.client[:documents].delete_many({})
20 |   end
21 | 
22 |   # Seed an Array of url Hashes into the database.
23 |   def seed_urls(url_hashes)
24 |     db.client[:urls].insert_many(url_hashes)
25 |   rescue StandardError => e
26 |     err_msg = e.respond_to?(:result) ? e.result["writeErrors"] : e.message
27 |     raise "Write to DB failed - remember that both urls and docs won't \
28 | accept duplicate urls. Exception details: #{err_msg}"
29 |   end
30 | 
31 |   # Seed an Array of document Hashes into the database.
32 |   def seed_docs(doc_hashes)
33 |     db.client[:documents].insert_many(doc_hashes)
34 |   rescue StandardError => e
35 |     err_msg = e.respond_to?(:result) ? e.result["writeErrors"] : e.message
36 |     raise "Write to DB failed - remember that both urls and docs won't \
37 | accept duplicate urls. Exception details: #{err_msg}"
38 |   end
39 | 
40 |   # Returns if the url_hash/record exists in the DB.
41 |   #
42 |   # Different from Wgit::Database::MongoDB#url? because it asserts the full
43 |   # url_hash, not just the presence of the unique 'url' field.
44 |   def url?(url_hash)
45 |     db.client[:urls].find(url_hash).any?
46 |   end
47 | 
48 |   # Returns if the doc_hash/record exists in the DB.
49 |   #
50 |   # Different from Wgit::Database::MongoDB#doc? because it asserts the full
51 |   # doc_hash, not just the presence of the unique 'url' field.
52 |   def doc?(doc_hash)
53 |     db.client[:documents].find(doc_hash).any?
54 |   end
55 | end
56 | 


--------------------------------------------------------------------------------
/test/helpers/test_helper.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | $VERBOSE = nil # Suppress ruby warnings during the test run.
 4 | 
 5 | # Require third party gems.
 6 | require "maxitest/autorun"
 7 | require "maxitest/threads" # Fail on orphaned test threads.
 8 | require "maxitest/timeout"
 9 | require "logger"
10 | require "dotenv"
11 | require "byebug" # Call 'byebug' anywhere in the code to debug.
12 | 
13 | # Require any test helpers.
14 | require_relative "../mock/fixtures" # Mock HTTP responses.
15 | require_relative "database_test_data"
16 | require_relative "database_helper"
17 | require_relative "mongo_db_helper"
18 | require_relative "in_memory_helper"
19 | 
20 | # Require all code being tested, once, in one place.
21 | require_relative "../../lib/wgit"
22 | require_relative "../../lib/wgit/core_ext"
23 | 
24 | Maxitest.timeout  = 60           # Fail test after N seconds.
25 | Wgit.logger.level = Logger::WARN # Remove STDOUT noise from test run.
26 | 
27 | # Test helper class for unit tests. Should be inherited from by all test cases.
class TestHelper < Minitest::Test
  class << self
    # Hook which fires every time this class is inherited from.
    def inherited(child)
      Dotenv.load # Set the DB connection string from the ENV.
      super       # Hand over to the parent class's inherited hook.
    end
  end

  # Helper methods callable from any unit test are defined below.

  # Yields to the given block, flunking (failing) the test with the error's
  # message if the block raises a StandardError.
  def refute_exception
    begin
      yield
    rescue StandardError => error
      flunk(error.message)
    end
  end
end
44 | 
45 | # Override type #inspect methods for nicer test failure messages.
class Wgit::Url
  # Returns the url's String value wrapped in double quotes.
  def inspect
    %("#{self}")
  end
end
51 | 


--------------------------------------------------------------------------------
/test/mock/fixtures.rb:
--------------------------------------------------------------------------------
# See `toys -s save` for tasks on saving a web fixture to disk;
# then mock its HTTP response below so it's available to crawl in the tests
# using Wgit. Note that you can mock a response without a saved fixture.

require_relative "webmock" # DSL for mocking HTTP responses.

# Custom mock responses, outside of serving a saved fixture from disk.
stub_page "https://search.yahoo.com"
stub_page "https://www.google.co.uk"
stub_page "http://www.bing.com"
stub_redirect "http://twitter.com", "https://twitter.com"
stub_page "https://twitter.com"
stub_redirect "https://cms.org", "https://example.com/de/page1"
stub_redirect "https://example.com/de/page1", "/de/folder/page2#blah-on-page2"
stub_page "https://example.com/de/folder/page2#blah-on-page2"
stub_redirect "http://redirect.com/1", "http://redirect.com/2" # First redirect.
stub_redirect "http://redirect.com/2", "http://redirect.com/3" # Second redirect.
stub_redirect "http://redirect.com/3", "http://redirect.com/4" # Third redirect.
stub_redirect "http://redirect.com/4", "http://redirect.com/5" # Fourth redirect.
stub_redirect "http://redirect.com/5", "http://redirect.com/6" # Fifth redirect.
stub_redirect "http://redirect.com/6", "http://redirect.com/7" # Sixth redirect.
stub_page "http://redirect.com/7", fixture: "blank"
stub_page "https://www.xn--ber-goa.com/about"
stub_redirect "http://test-site.com/sneaky", "https://motherfuckingwebsite.com/"
stub_page "http://test-site.com/public/records?q=username", fixture: "test-site.com/public/records"
stub_page "http://test-site.com/public/records#top", fixture: "test-site.com/public/records"
stub_redirect "http://test-site.com/ftp", "http://ftp.test-site.com"
stub_not_found "http://ftp.test-site.com"
stub_redirect "http://test-site.com/smtp", "http://smtp.test-site.com"
stub_page "http://smtp.test-site.com"
stub_redirect "http://myserver.com", "http://www.myserver.com"
stub_redirect "http://www.myserver.com", "http://test-site.com"
stub_timeout "http://doesnt_exist/"
stub_timeout "http://test-site.com/doesntexist"
stub_page "http://odd-extension.com/other.html5", body: "<p>Hello world</p>"
stub_page "http://fonts.googleapis.com"
stub_page "https://blank-site-1.com", fixture: "blank"
stub_page "https://blank-site-2.com", fixture: "blank"
stub_page "https://blank-site-3.com", fixture: "blank"
stub_page "http://blank-site-4.com",  fixture: "blank"
stub_page "https://blank-site-5.com", fixture: "blank"
stub_redirect "http://blank-site-2.com", "https://blank-site-2.com"
stub_redirect "http://blank-site-2.com/robots.txt", "https://blank-site-2.com/robots.txt"

# Mock a website whose content gets updated (between indexes): the first
# listed response carries the original body and the second the updated body.
stub_request(:get, "http://www.content-updates.com")
  .to_return({ body: "Original content" }, { body: "Updated content" })

# Match all *.jpg and *.jpeg URLs for belfastpilates.co.uk.
# NOTE(review): the dots in this pattern are unescaped, so each one matches
# any character rather than a literal "." - confirm that's acceptable.
stub_request(:get, Regexp.new("http://www.belfastpilates.co.uk/(.*).(?:jpg|jpeg)"))

# Mock robots.txt requests.
# This page is served with a "noindex" X-Robots-Tag response header.
stub_request(:get, "http://robots.txt.com/account")
  .to_return(status: 200, headers: { 'X-Robots-Tag': "noindex" }, body: "<p>Robots account</p>")

# Sites listed here are passed to stub_robots_txt_not_found.
stub_robots_txt_not_found [
  "http://txti.es",
  "http://quotes.toscrape.com",
  "http://test-site.com",
  "https://motherfuckingwebsite.com",
  "http://link-to-robots-txt.com",
  "https://external-link-portal.com",
  "https://blank-site-1.com",
  "https://blank-site-2.com",
  "https://blank-site-3.com",
  "http://blank-site-4.com",
  "https://blank-site-5.com",
  "http://redirect.com",
  "http://www.content-updates.com"
]

# Mock responses based on individual files saved to disk. The URL should match
# the file name (minus the scheme prefix and .html extension suffix).
pages = [
  "https://motherfuckingwebsite.com/",
  "https://wikileaks.org/What-is-Wikileaks.html",
  "https://www.facebook.com",
  "https://static.xx.fbcdn.net/rsrc.php/v3/y1/l/0,cross/NvZ4mNTW3Fd.css",
  "http://altitudejunkies.com",
  "http://www.mountainmadness.com",
  "http://www.adventureconsultants.com",
  "http://odd-extension.com",
  "http://link-to-robots-txt.com",
  "https://external-link-portal.com/"
]

# Mock sites based on a collection of files saved in a directory.
# NOTE: URLs listed below MUST NOT have a path, only a scheme and host.
sites = [
  "http://txti.es/",
  "http://www.belfastpilates.co.uk/",
  "http://test-site.com",
  "http://quotes.toscrape.com/",
  "http://robots.txt.com",
  "http://disallow-all.com"
]

# Register the stubs for every page and site listed above.
stub_fixtures pages, sites
99 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/altitudejunkies.com.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  2 | <html xmlns="http://www.w3.org/1999/xhtml">
  3 | <head><meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  4 | 
  5 | <link type="text/css" rel="stylesheet" href="css/global.css" />
  6 | <title>Altitude Junkies | High Altitude Mountaineering Expeditions</title>
  7 | </head>
  8 | <body>
  9 | <div id="center">
 10 | <div id="header">
 11 | <h1><a href="index.html"><span></span></a></h1>
 12 | </div><!-- Close Div #header -->
 13 | <ul id="nav">
 14 | 
 15 | 	<li class="first"><a href="index.html">JUNKIES</a></li>
 16 | 		<li><a href="everesttibet.html">EVEREST</a></li>
 17 | 		<li><a href="amadablam.html">AMA DABLAM</a></li>
 18 | 	<li><a href="cholatse.html">CHOLATSE</a></li>
 19 | <li><a href="kyajori.html">KYAJO RI</a></li>
 20 | <li><a href="merapeak.html">MERA PEAK</a></li>
 21 | <li><a href="cordillerablanca.html">CORDILLERA BLANCA</a></li>
 22 | <li><a href="treks.html">TREKS</a></li>
 23 | <li><a href="leaders.html">ABOUT US</a></li>
 24 | <li><a href="news.html">NEWS</a></li>
 25 | <li><a href="gunks.html">THE GUNKS</a></ul>
 26 | 
 27 | 
 28 | <div id="content" class="cholatse2">
 29 | 
 30 | <div id="right">
 31 | <div class="wrap">
 32 | <div class="wrap2">
 33 | <dl>
 34 | 
 35 | <dt>Upcoming Expeditions</dt>
 36 | <dt><a class="open" href="everesttibet.html">Everest, Tibet</a></dt>
 37 | <dd>April 20 - May 31, 2020 (42 Days)</dd>
 38 | 
 39 | <dt><a class="open" href="cordillerablanca.html">Cordillera Blanca, Peru</a></dt>
 40 | <dd>June 13 - June 26, 2020 (14 Days)</dd>
 41 | 
 42 | <dt><a class="open" href="merapeak.html">Mera Peak, Nepal</a></dt>
 43 | <dd>October 3 - October 14, 2020 (12 Days)</dd>
 44 | 
 45 | <dt><a class="open" href="cholatse.html">Cholatse, Nepal</a></dt>
 46 | <dd>October 17 - November 6, 2020 (21 Days)</dd>
 47 | 
 48 | <dt><a class="open" href="amadablam.html">Ama Dablam, Nepal</a></dt>
 49 | <dd>November 14 - December 4, 2020 (21 Days)</dd>
 50 | 
 51 | 
 52 | </dl>
 53 | </div>
 54 | </div>
 55 | </div>
 56 | 
 57 | <p>Altitude Junkies is established as one of the premier outfitters offering professionally managed mountaineering expeditions.</p>
 58 | 
 59 | <p>Our expeditions are non-guided like traditional commercial expeditions with certified IFMGA guides and a 4:1 climber to guide ratio. On our expeditions we have a single expedition leader who manages the expedition logistics and staff. In the Cordillera Blanca of Peru where we practice roped glacier travel we may have two expedition leaders.</p>
 60 | 
 61 | <p>All our expedition leaders are professional climbers and have reached the summits of the mountains that they lead the expeditions to, some numerous times. They climb year round worldwide and have the experience needed to make the expedition successful and safe. All of our leaders have extensive experience in the effects and medical treatment of high altitude ailments.</p>
 62 | 
 63 | <p>We primarily focus on the less crowded 8,000-meter and more challenging technical 6,000-meter peaks. We have organized many expeditions to 8,000-meter peaks including Everest, Lhotse, Dhaulagiri, Makalu and Manaslu in Nepal; Gasherbrum I, Gasherbrum II and Broad Peak in Pakistan; and Everest, Cho Oyu and Shishapangma in Tibet.<p>
 64 | 
 65 | <p>During the summer months we switch from climbing in the Himalayas to the lower peaks of Cordillera Blanca in Peru. We offer an expeditions to some of the Cordillera Blancas most popular peaks as well as bespoke expeditions to any Cordillera Blanca peak.</p>
 66 | 
 67 | <p>We do not advertise the lowest price for our respective expeditions to form a team of twenty or more climbers for profitability. We prefer to keep our expeditions to a maximum team size of eight climbers plus leaders as we feel a smaller expedition allows for a more personal experience on the mountain. The quality of our food and services at our respective base camps is considered one of the premier operations in the Himalayas and better than most of the high cost expeditions. Not having several western guides means that our expeditions are more affordable than most for qualified climbers. We only run professional expeditions and our focus is on safety and quality rather than the quantity of climbers joining our expeditions.</p>
 68 | 
 69 | <p>To maximize our chances of summit success we use Nepalese Sherpa on our Nepal Himalayan expeditions. Our Sherpa team have climbed with us over many years and are under the directorship of our UIAGM certified guide Sirdar, Pasang Ongcho Sherpa. Base and advanced base camps are well staffed with our Sherpa cooks and kitchen assistants. They have all been trained by professional western chefs to produce a varied menu and are knowledgeable of food safety and safe hygiene practices. Our base camps are stocked with fresh local and imported foods for a varied and nutrient conscious diet.</p>
 70 | 
 71 | <p>The equipment used at base camp and high altitude camps, is of the highest quality and replaced on a regular basis for safety. We only use the finest high altitude tents available, made by Black Diamond and Mountain Hardwear, to withstand the extreme weather conditions that are encountered in the high mountains.</p>
 72 | 
 73 | <p>In base camp we provide showers, heated and carpeted dining tents, and solar panels for lighting and the charging of electronic devices. All of our expeditions have a comprehensive medical chest, medical oxygen, portable altitude chamber and personnel who are familiar with their usage. Each leader, Sherpa and climber has their own personal avalanche beacon and two-way radio on the mountain and we have base station radios at the respective base camps.</p>
 74 | 
 75 | <p>All of our Nepal expeditions use private helicopter transport whenever possible over fixed wing flights. We believe helicopters are a safer option, especially when the weather is marginal for the Lukla flights. Helicopters also allow us to reach remote areas which are off limits to fixed wing aircraft. This is beneficial to make our expedition durations shorter.</p>
 76 | 
 77 | <p>To maximize our summit success on our expeditions we use a professional meteorological service for up to the minute weather forecasts.</p>
 78 | 
 79 | <p>Himalayan peaks are a serious undertaking and climbers need to be aware there are certain risks that are out of the control of Altitude Junkies. We prefer to describe our expeditions as professionally managed rather than guided. A true guided expedition is only where the guides have full UIAGM certification (alpine, rock and ski certified), which is the only internationally recognized qualification for mountain guides and there is a 2:1 or smaller guide to climber ratio. If you need to be guided, look for guides with full UIAGM certification.</p>
 80 | 
 81 | 
 82 | 
 83 | 
 84 | 
 85 | <p>As the name suggests, Altitude Junkies organizes expeditions for like-minded climbers who are addicted to climbing the world&#39;s high mountains. Come and get high, climb with the Altitude Junkies.</p>
 86 | 
 87 | <p style="font-size: 10px">Photo credit: Phil "Disco" Huddy - Cholatse
 88 | </p>
 89 | <br />
 90 | <a class="contact" href="mailto:info@altitudejunkies.com">Contact us: info@altitudejunkies.com</a></div>
 91 | 
 92 | <!-- Close Div #content -->
 93 | 
 94 | 
 95 | 
 96 | <div id="logos">
 97 | <ul>
 98 | 
 99 | <li><a href="http://www.mountainhouse.com"><img alt="Mountain house logo" src="logos/mountain_house_logo2.gif" /></a></li>
100 | <li><a href="http://www.summitoxygen.com"><img alt="Summit logo" src="logos/summit_logo2.gif" /></a></li>
101 | <li><a href="http://www.brunton.com"><img alt="Brunton logo" src="logos/brunton_logo2.gif" /></a></li>
102 | <li><a href="http://www.suunto.com"><img alt="Suunto logo" src="logos/suunto_logo2.gif" /></a></li>
103 | <li><a href="http://www.mountainhardwear.com"><img alt="Mountain hard wear logo" src="logos/mountain_hard_wear_logo2.gif" /></a></li>
104 | <li><a href="http://www.blackdiamondequipment.com"><img alt="Black diamond logo" src="logos/black_diamond_logo2.gif" /></a></li>
105 | <li><a href="http://www.salomon.com"><img alt="Salomon logo" src="logos/salomon_logo2.gif" /></a></li>
106 | <li><a href="http://www.pelican.com"><img alt="Pelican logo" src="logos/pelican_logo2.gif" /></a></li>
107 | <li><a href="http://www.julbousa.com"><img alt="Julbo logo" src="logos/julbo_logo2.gif" /></a></li>
108 | <li class="last"><a href="http://www.cascadedesigns.com/msr"><img alt="Msr logo" src="logos/msr_logo2.gif" /></a></li>
109 | </ul></div><!-- Close Div #logos-->
110 | 
111 | <div id="footer">
112 | <ul>
113 | 	<li class="first"><a href="index.html">JUNKIES</a></li><li>|</li>
114 | 		<li><a href="everesttibet.html">EVEREST</a></li><li>|</li>
115 | <li><a href="amadablam.html">AMA DABLAM</a></li><li>|</li>
116 | 	<li><a href="cholatse.html">CHOLATSE</a></li><li>|</li>
117 | 		<li><a href="kyajori.html">KYAJO RI</a></li><li>|</li>
118 | 				<li><a href="merapeak.html">MERA PEAK</a></li><li>|</li>
119 | 	<li><a href="cordillerablanca.html">CORDILLERA BLANCA</a></li><li>|</li>
120 | 			<li><a href="treks.html">TREKS</a></li><li>|</li>
121 | 	<li><a href="leaders.html">ABOUT US</a></li><li>|</li>
122 | 	<li><a href="news.html">NEWS</a></li><li>|</li>
123 | 	<li><a href="gunks.html">THE GUNKS</a></li>
124 | </ul>
125 | 
126 | <p>Copyright &copy; 2006-2019 Altitude Junkies. All Rights Reserved</p>
127 | </div>
128 | <!-- Close Div #footer--></div>
129 | <!-- Close Div #center --></body>
130 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/anchor_display.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <body>
 3 |         <!-- ['About', 'Foo Location Bar', 'Contact Contact2Contact3'] -->
 4 |         <div><a href="/about">About</div>
 5 |         <div>Foo <a href="/location">Location</a> Bar</div>
 6 |         <div>
 7 |             <a href="/contact">Contact</a> <a href="/contact">Contact2</a>
 8 |             <a href="/contact">Contact3</a>
 9 |         </div>
10 |     </body>
11 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/blank.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Blank Page</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Blank Page</h1>
 8 |   <p>This page is intentionally blank with very little HTML.</p>
 9 | </body>
10 | </html>
11 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/disallow-all.com/about.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Disallow All</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Disallow All Test Site</h1>
 8 |   <p>About page, which shouldn't be indexed.</p>
 9 |   <a href="/">Home</a>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/disallow-all.com/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Disallow All</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Disallow All Test Site</h1>
 8 |   <p>This website disallows wgit from indexing any of its content, including this page. This is done via the robots.txt page.</p>
 9 |   <a href="/about">About</a>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/disallow-all.com/robots.txt:
--------------------------------------------------------------------------------
1 | User-agent: googlebot
2 | Crawl-delay: 4
3 | Allow: *
4 | 
5 | User-agent: wgit
6 | Disallow: /
7 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/div_display.html:
--------------------------------------------------------------------------------
1 | <html>
2 |     <body>
3 |         <!-- ['foo', 'bar'] -->
4 |         <script>print("hello world");</script>
5 |         <div>foo</div><div>bar</div>
6 |     </body>
7 | </html>
8 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/external-link-portal.com.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>External Link Portal</title>
 5 | </head>
 6 | <body>
 7 |   <h1>External Link Portal</h1>
 8 |   <p>This site contains external links to other site fixtures.</p>
 9 |   <h2>External Links</h2>
10 |   <a href="https://blank-site-1.com">Blank Site 1</a>
11 |   <a href="http://blank-site-2.com">Blank Site 2</a>
12 |   <a href="https://blank-site-3.com">Blank Site 3</a>
13 |   <a href="http://blank-site-4.com">Blank Site 4</a>
14 |   <a href="https://blank-site-5.com">Blank Site 5</a>
15 | </body>
16 | </html>
17 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/getting_started.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html
 3 |   <body>
 4 |     <p>Running the following Wgit code will programmatically configure your database:</p>
 5 | <div class="highlight highlight-source-ruby notranslate position-relative overflow-auto" data-snippet-clipboard-copy-content="db = Wgit::Database.new '&lt;connection_string&gt;'
 6 | 
 7 | db.create_collections
 8 | db.create_unique_indexes
 9 | db.text_index = Wgit::Database::DEFAULT_TEXT_INDEX"><pre><span class="pl-s1">db</span> <span class="pl-c1">=</span> <span class="pl-v">Wgit</span>::<span class="pl-v">Database</span><span class="pl-kos">.</span><span class="pl-en">new</span> <span class="pl-s">'&lt;connection_string&gt;'</span>
10 | 
11 | <span class="pl-s1">db</span><span class="pl-kos">.</span><span class="pl-en">create_collections</span>
12 | <span class="pl-s1">db</span><span class="pl-kos">.</span><span class="pl-en">create_unique_indexes</span>
13 | <span class="pl-s1">db</span><span class="pl-kos">.</span><span class="pl-en">text_index</span> <span class="pl-c1">=</span> <span class="pl-v">Wgit</span>::<span class="pl-v">Database</span>::<span class="pl-c1">DEFAULT_TEXT_INDEX</span></pre></div>
14 | <p>Or take a look at the <a href="https://github.com/michaeltelford/wgit/blob/master/docker/mongo-init.js#L16">mongo_init.js</a> file for the equivalent Javascript commands.</p>
15 | <p><strong>Note</strong>: The <em>text search index</em> lists all document fields to be searched by MongoDB when calling <code>Wgit::Database#search</code>. Therefore, you should append this list with any other fields that you want searched. For example, if you <a href="#Extending-The-API">extend the API</a> then you might want to search your new fields in the database by adding them to the index above. This can be done programmatically with:</p>
16 |   </body>
17 | </html>
18 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/link-to-robots-txt.com.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Link to Robots.txt</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Link to Robots.txt Test Site</h1>
 8 |   <p>This site contains an external link to the Robots.txt test site.</p>
 9 |   <a href="http://robots.txt.com">Robots.txt Test Site</a>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/motherfuckingwebsite.com.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |     <meta charset="utf-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1">
 6 |     
 7 |     <!-- FOR THE CURIOUS: This site was made by @thebarrytone. Don't tell my mom. -->
 8 |     
 9 |     <title>Motherfucking Website</title>
10 | </head>
11 | 
12 | <body>
13 |     <header>
14 |         <h1>This is a motherfucking website.</h1>
15 |         <aside>And it's fucking perfect.</aside>
16 |     </header>
17 |         
18 |         <h2>Seriously, what the fuck else do you want?</h2>
19 |         
20 |         <p>You probably build websites and think your shit is special. You think your 13 megabyte parallax-ative home page is going to get you some fucking Awwward banner you can glue to the top corner of your site. You think your 40-pound jQuery file and 83 polyfills give IE7 a boner because it finally has box-shadow. Wrong, motherfucker. Let me describe your perfect-ass website:</p>
21 |         
22 |         <ul>
23 |             <li>Shit's lightweight and loads fast</li>
24 |             <li>Fits on all your shitty screens</li>
25 |             <li>Looks the same in all your shitty browsers</li>
26 |             <li>The motherfucker's accessible to every asshole that visits your site</li>
27 |             <li>Shit's legible and gets your fucking point across (if you had one instead of just 5mb pics of hipsters drinking coffee)</li>
28 |         </ul>
29 |         
30 |         <h3>Well guess what, motherfucker:</h3>
31 |         
32 |         <p>You. Are. Over-designing. Look at this shit. It's a motherfucking website. Why the fuck do you need to animate a fucking trendy-ass banner flag when I hover over that useless piece of shit? You spent hours on it and added 80 kilobytes to your fucking site, and some motherfucker jabbing at it on their iPad with fat sausage fingers will never see that shit. Not to mention blind people will never see that shit, but they don't see any of your shitty shit.</p>
33 |         
34 |         <p>You never knew it, but this is your perfect website. Here's why.</p>
35 |         
36 |         <h2>It's fucking lightweight</h2>
37 |         
38 |         <p>This entire page weighs less than the gradient-meshed facebook logo on your fucking Wordpress site. Did you seriously load 100kb of jQuery UI just so you could animate the fucking background color of a div? You loaded all 7 fontfaces of a shitty webfont just so you could say "Hi." at 100px height at the beginning of your site? You piece of shit.</p>
39 |         
40 |         <h2>It's responsive</h2>
41 |         
42 |         <p>You dumbass. You thought you needed media queries to be responsive, but no. Responsive means that it responds to whatever motherfucking screensize it's viewed on. This site doesn't care if you're on an iMac or a motherfucking Tamagotchi.</p>
43 |         
44 |         <h2>It fucking works</h2>
45 |         
46 |         <p>Look at this shit. You can read it ... that is, if you can read, motherfucker. It makes sense. It has motherfucking hierarchy. It's using HTML5 tags so you and your bitch-ass browser know what the fuck's in this fucking site. That's semantics, motherfucker.</p>
47 |         
48 |         <p>It has content on the fucking screen. Your site has three bylines and link to your dribbble account, but you spread it over 7 full screens and make me click some bobbing button to show me how cool the jQuery ScrollTo plugin is.</p>
49 |         
50 |         <p>Cross-browser compatibility? Load this motherfucker in IE6. I fucking dare you.</p>
51 |         
52 |         <h2>This is a website. Look at it.  You've never seen one before.</h2>
53 |         
54 |         <p>Like the man who's never grown out his beard has no idea what his true natural state is, you have no fucking idea what a website is. All you have ever seen are shitty skeuomorphic bastardizations of what should be text communicating a fucking message. This is a real, naked website. Look at it. It's fucking beautiful.</p>
55 |         
56 |         <h3>Yes, this is fucking satire, you fuck</h3>
57 |         
58 |         <p>I'm not actually saying your shitty site should look like this. What I'm saying is that all the problems we have with websites are <strong>ones we create ourselves</strong>. Websites aren't broken by default, they are functional, high-performing, and accessible. You break them. You son-of-a-bitch.</p>
59 |         
60 |         <blockquote cite="https://www.vitsoe.com/us/about/good-design">"Good design is as little design as possible."<br>
61 |             - some German motherfucker
62 |         </blockquote>
63 |     
64 |     <hr>
65 |     
66 |     <h2>Epilogue</h2>
67 |     <p>From the philosophies expressed (poorly) above, <a href="http://txti.es">txti</a> was created. You should try it today to make your own motherfucking websites.</p>
68 |     
69 |     <!-- yes, I know...wanna fight about it? -->
70 |     <script>
71 |       (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
72 |       (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
73 |       m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
74 |       })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
75 |     
76 |       ga('create', 'UA-45956659-1', 'motherfuckingwebsite.com');
77 |       ga('send', 'pageview');
78 |     </script>
79 |     
80 | </body>
81 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/nearest_fragment.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |     <head>
 4 |     </head>
 5 |     <body>
 6 |         <p>Hello1</p> <!-- nil -->
 7 |         <div>
 8 |             <a href="#fragment1">Anchor1</a>
 9 |             <p>Hello2</p> <!-- #fragment1 -->
10 |         </div>
11 |         <p>Hello3</p> <!-- nil -->
12 |         <div>
13 |             <p>Hello4</p> <!-- nil -->
14 |         </div>
15 |         <a href="#fragment2">Anchor2</a>
16 |         <a href="#fragment3">Anchor3</a>
17 |         <p>Hello5</p> <!-- #fragment3 -->
18 |         <div>
19 |             <a href="#fragment4">Anchor4</a>
20 |             <p>Hello6</p> <!-- #fragment4 -->
21 |         </div>
22 |         <div>
23 |             <div>
24 |                 <p>Hello7</p> <!-- #fragment4 -->
25 |             </div>
26 |         </div>
27 |         <a href="#fragment5">Anchor5</a>
28 |         <div>
29 |             <div>
30 |                 <div>
31 |                     <!-- viewBox camelCase is used to test case insensitivity -->
32 |                     <a href="#fragment6" viewBox="0 0 16 16">Anchor6</a>
33 |                 </div>
34 |                 <p>Hello8</p> <!-- #fragment6 -->
35 |             </div>
36 |         </div>
37 |         <a href="#fragment7">Hello9</a>
38 |         <h2>
39 |             <a href="#fragment8">Anchor8</a>
40 |         </h2>
41 |         <div>
42 |             <p>Hello10</p> <!-- #fragment8 -->
43 |         </div>
44 |     </body>
45 | </html>
46 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/not_found.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Not Found</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Not Found</h1>
 8 |   <p>The page you're looking for cannot be found.</p>
 9 | </body>
10 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/odd-extension.com.html:
--------------------------------------------------------------------------------
1 | <html>
2 |   <head></head>
3 |   <body>
4 |     <a href="other.html5">Some other page with an odd extension</a>
5 |   </body>
6 | </html>
7 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/php.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html><!-- http://www.php.com/index.php -->
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="Content-type" content="text/html; charset=utf-8">
 5 |   <title>A PHP Webpage</title>
 6 |   <meta name="author" content="Michael Telford">
 7 |   <meta name="keywords" content="PHP, php, web-dev">
 8 |   <link rel="stylesheet" href="styles.css">
 9 |   <script type="text/javascript" src="http://www.php.com/client.js"></script>
10 | </head>
11 | <body>
12 |   <h1>Welcome to a PHP Webpage</h1>
13 |   <p>All internal page links below should contain a .php extension.</p>
14 |   <a href="about.php">About</a>
15 |   <img src="emoji.png" alt="Sunglasses" height="40" width="40">
16 |   <a href="?foo=bar">Foo bar on this page (index.php)</a>
17 |   <a href="http://www.web-dev.com">External Site - Web Dev Dot Com</a>
18 | </body>
19 | </html>
20 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/quotes.toscrape.com/tag/humor.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 | 	<meta charset="UTF-8">
  5 | 	<title>Quotes to Scrape</title>
  6 |     <link rel="stylesheet" href="/static/bootstrap.min.css">
  7 |     <link rel="stylesheet" href="/static/main.css">
  8 | </head>
  9 | <body>
 10 |     <div class="container">
 11 |         <div class="row header-box">
 12 |             <div class="col-md-8">
 13 |                 <h1>
 14 |                     <a href="/" style="text-decoration: none">Quotes to Scrape</a>
 15 |                 </h1>
 16 |             </div>
 17 |             <div class="col-md-4">
 18 |                 <p>
 19 |                 
 20 |                     <a href="/login">Login</a>
 21 |                 
 22 |                 </p>
 23 |             </div>
 24 |         </div>
 25 |     
 26 | 
 27 | <h3>Viewing tag: <a href="/tag/humor/page/1/">humor</a></h3>
 28 | 
 29 | <div class="row">
 30 |     <div class="col-md-8">
 31 | 
 32 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
 33 |         <span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>
 34 |         <span>by <small class="author" itemprop="author">Jane Austen</small>
 35 |         <a href="/author/Jane-Austen">(about)</a>
 36 |         </span>
 37 |         <div class="tags">
 38 |             Tags:
 39 |             <meta class="keywords" itemprop="keywords" content="aliteracy,books,classic,humor" /    > 
 40 |             
 41 |             <a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>
 42 |             
 43 |             <a class="tag" href="/tag/books/page/1/">books</a>
 44 |             
 45 |             <a class="tag" href="/tag/classic/page/1/">classic</a>
 46 |             
 47 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
 48 |             
 49 |         </div>
 50 |     </div>
 51 | 
 52 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
 53 |         <span class="text" itemprop="text">“A day without sunshine is like, you know, night.”</span>
 54 |         <span>by <small class="author" itemprop="author">Steve Martin</small>
 55 |         <a href="/author/Steve-Martin">(about)</a>
 56 |         </span>
 57 |         <div class="tags">
 58 |             Tags:
 59 |             <meta class="keywords" itemprop="keywords" content="humor,obvious,simile" /    > 
 60 |             
 61 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
 62 |             
 63 |             <a class="tag" href="/tag/obvious/page/1/">obvious</a>
 64 |             
 65 |             <a class="tag" href="/tag/simile/page/1/">simile</a>
 66 |             
 67 |         </div>
 68 |     </div>
 69 | 
 70 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
 71 |         <span class="text" itemprop="text">“Anyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.”</span>
 72 |         <span>by <small class="author" itemprop="author">Garrison Keillor</small>
 73 |         <a href="/author/Garrison-Keillor">(about)</a>
 74 |         </span>
 75 |         <div class="tags">
 76 |             Tags:
 77 |             <meta class="keywords" itemprop="keywords" content="humor,religion" /    > 
 78 |             
 79 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
 80 |             
 81 |             <a class="tag" href="/tag/religion/page/1/">religion</a>
 82 |             
 83 |         </div>
 84 |     </div>
 85 | 
 86 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
 87 |         <span class="text" itemprop="text">“Beauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.”</span>
 88 |         <span>by <small class="author" itemprop="author">Jim Henson</small>
 89 |         <a href="/author/Jim-Henson">(about)</a>
 90 |         </span>
 91 |         <div class="tags">
 92 |             Tags:
 93 |             <meta class="keywords" itemprop="keywords" content="humor" /    > 
 94 |             
 95 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
 96 |             
 97 |         </div>
 98 |     </div>
 99 | 
100 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
101 |         <span class="text" itemprop="text">“All you need is love. But a little chocolate now and then doesn&#39;t hurt.”</span>
102 |         <span>by <small class="author" itemprop="author">Charles M. Schulz</small>
103 |         <a href="/author/Charles-M-Schulz">(about)</a>
104 |         </span>
105 |         <div class="tags">
106 |             Tags:
107 |             <meta class="keywords" itemprop="keywords" content="chocolate,food,humor" /    > 
108 |             
109 |             <a class="tag" href="/tag/chocolate/page/1/">chocolate</a>
110 |             
111 |             <a class="tag" href="/tag/food/page/1/">food</a>
112 |             
113 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
114 |             
115 |         </div>
116 |     </div>
117 | 
118 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
119 |         <span class="text" itemprop="text">“Remember, we&#39;re madly in love, so it&#39;s all right to kiss me anytime you feel like it.”</span>
120 |         <span>by <small class="author" itemprop="author">Suzanne Collins</small>
121 |         <a href="/author/Suzanne-Collins">(about)</a>
122 |         </span>
123 |         <div class="tags">
124 |             Tags:
125 |             <meta class="keywords" itemprop="keywords" content="humor" /    > 
126 |             
127 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
128 |             
129 |         </div>
130 |     </div>
131 | 
132 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
133 |         <span class="text" itemprop="text">“Some people never go crazy. What truly horrible lives they must lead.”</span>
134 |         <span>by <small class="author" itemprop="author">Charles Bukowski</small>
135 |         <a href="/author/Charles-Bukowski">(about)</a>
136 |         </span>
137 |         <div class="tags">
138 |             Tags:
139 |             <meta class="keywords" itemprop="keywords" content="humor" /    > 
140 |             
141 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
142 |             
143 |         </div>
144 |     </div>
145 | 
146 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
147 |         <span class="text" itemprop="text">“The trouble with having an open mind, of course, is that people will insist on coming along and trying to put things in it.”</span>
148 |         <span>by <small class="author" itemprop="author">Terry Pratchett</small>
149 |         <a href="/author/Terry-Pratchett">(about)</a>
150 |         </span>
151 |         <div class="tags">
152 |             Tags:
153 |             <meta class="keywords" itemprop="keywords" content="humor,open-mind,thinking" /    > 
154 |             
155 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
156 |             
157 |             <a class="tag" href="/tag/open-mind/page/1/">open-mind</a>
158 |             
159 |             <a class="tag" href="/tag/thinking/page/1/">thinking</a>
160 |             
161 |         </div>
162 |     </div>
163 | 
164 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
165 |         <span class="text" itemprop="text">“Think left and think right and think low and think high. Oh, the thinks you can think up if only you try!”</span>
166 |         <span>by <small class="author" itemprop="author">Dr. Seuss</small>
167 |         <a href="/author/Dr-Seuss">(about)</a>
168 |         </span>
169 |         <div class="tags">
170 |             Tags:
171 |             <meta class="keywords" itemprop="keywords" content="humor,philosophy" /    > 
172 |             
173 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
174 |             
175 |             <a class="tag" href="/tag/philosophy/page/1/">philosophy</a>
176 |             
177 |         </div>
178 |     </div>
179 | 
180 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
181 |         <span class="text" itemprop="text">“The reason I talk to myself is because I’m the only one whose answers I accept.”</span>
182 |         <span>by <small class="author" itemprop="author">George Carlin</small>
183 |         <a href="/author/George-Carlin">(about)</a>
184 |         </span>
185 |         <div class="tags">
186 |             Tags:
187 |             <meta class="keywords" itemprop="keywords" content="humor,insanity,lies,lying,self-indulgence,truth" /    > 
188 |             
189 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
190 |             
191 |             <a class="tag" href="/tag/insanity/page/1/">insanity</a>
192 |             
193 |             <a class="tag" href="/tag/lies/page/1/">lies</a>
194 |             
195 |             <a class="tag" href="/tag/lying/page/1/">lying</a>
196 |             
197 |             <a class="tag" href="/tag/self-indulgence/page/1/">self-indulgence</a>
198 |             
199 |             <a class="tag" href="/tag/truth/page/1/">truth</a>
200 |             
201 |         </div>
202 |     </div>
203 | 
204 |     <nav>
205 |         <ul class="pager">
206 |             
207 |             
208 |             <li class="next">
209 |                 <a href="/tag/humor/page/2/">Next <span aria-hidden="true">&rarr;</span></a>
210 |             </li>
211 |             
212 |         </ul>
213 |     </nav>
214 |     </div>
215 |     <div class="col-md-4 tags-box">
216 |         
217 |             <h2>Top Ten tags</h2>
218 |             
219 |             <span class="tag-item">
220 |             <a class="tag" style="font-size: 28px" href="/tag/love/">love</a>
221 |             </span>
222 |             
223 |             <span class="tag-item">
224 |             <a class="tag" style="font-size: 26px" href="/tag/inspirational/">inspirational</a>
225 |             </span>
226 |             
227 |             <span class="tag-item">
228 |             <a class="tag" style="font-size: 26px" href="/tag/life/">life</a>
229 |             </span>
230 |             
231 |             <span class="tag-item">
232 |             <a class="tag" style="font-size: 24px" href="/tag/humor/">humor</a>
233 |             </span>
234 |             
235 |             <span class="tag-item">
236 |             <a class="tag" style="font-size: 22px" href="/tag/books/">books</a>
237 |             </span>
238 |             
239 |             <span class="tag-item">
240 |             <a class="tag" style="font-size: 14px" href="/tag/reading/">reading</a>
241 |             </span>
242 |             
243 |             <span class="tag-item">
244 |             <a class="tag" style="font-size: 10px" href="/tag/friendship/">friendship</a>
245 |             </span>
246 |             
247 |             <span class="tag-item">
248 |             <a class="tag" style="font-size: 8px" href="/tag/friends/">friends</a>
249 |             </span>
250 |             
251 |             <span class="tag-item">
252 |             <a class="tag" style="font-size: 8px" href="/tag/truth/">truth</a>
253 |             </span>
254 |             
255 |             <span class="tag-item">
256 |             <a class="tag" style="font-size: 6px" href="/tag/simile/">simile</a>
257 |             </span>
258 |             
259 |         
260 |     </div>
261 | </div>
262 | 
263 |     </div>
264 |     <footer class="footer">
265 |         <div class="container">
266 |             <p class="text-muted">
267 |                 Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
268 |             </p>
269 |             <p class="copyright">
270 |                 Made with <span class='sh-red'>❤</span> by <a href="https://scrapinghub.com">Scrapinghub</a>
271 |             </p>
272 |         </div>
273 |     </footer>
274 | </body>
275 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/quotes.toscrape.com/tag/humor/page/2.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 | 	<meta charset="UTF-8">
  5 | 	<title>Quotes to Scrape</title>
  6 |     <link rel="stylesheet" href="/static/bootstrap.min.css">
  7 |     <link rel="stylesheet" href="/static/main.css">
  8 | </head>
  9 | <body>
 10 |     <div class="container">
 11 |         <div class="row header-box">
 12 |             <div class="col-md-8">
 13 |                 <h1>
 14 |                     <a href="/" style="text-decoration: none">Quotes to Scrape</a>
 15 |                 </h1>
 16 |             </div>
 17 |             <div class="col-md-4">
 18 |                 <p>
 19 |                 
 20 |                     <a href="/login">Login</a>
 21 |                 
 22 |                 </p>
 23 |             </div>
 24 |         </div>
 25 |     
 26 | 
 27 | <h3>Viewing tag: <a href="/tag/humor/page/1/">humor</a></h3>
 28 | 
 29 | <div class="row">
 30 |     <div class="col-md-8">
 31 | 
 32 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
 33 |         <span class="text" itemprop="text">“I am free of all prejudice. I hate everyone equally. ”</span>
 34 |         <span>by <small class="author" itemprop="author">W.C. Fields</small>
 35 |         <a href="/author/W-C-Fields">(about)</a>
 36 |         </span>
 37 |         <div class="tags">
 38 |             Tags:
 39 |             <meta class="keywords" itemprop="keywords" content="humor,sinister" /    > 
 40 |             
 41 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
 42 |             
 43 |             <a class="tag" href="/tag/sinister/page/1/">sinister</a>
 44 |             
 45 |         </div>
 46 |     </div>
 47 | 
 48 |     <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
 49 |         <span class="text" itemprop="text">“A lady&#39;s imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.”</span>
 50 |         <span>by <small class="author" itemprop="author">Jane Austen</small>
 51 |         <a href="/author/Jane-Austen">(about)</a>
 52 |         </span>
 53 |         <div class="tags">
 54 |             Tags:
 55 |             <meta class="keywords" itemprop="keywords" content="humor,love,romantic,women" /    > 
 56 |             
 57 |             <a class="tag" href="/tag/humor/page/1/">humor</a>
 58 |             
 59 |             <a class="tag" href="/tag/love/page/1/">love</a>
 60 |             
 61 |             <a class="tag" href="/tag/romantic/page/1/">romantic</a>
 62 |             
 63 |             <a class="tag" href="/tag/women/page/1/">women</a>
 64 |             
 65 |         </div>
 66 |     </div>
 67 | 
 68 |     <nav>
 69 |         <ul class="pager">
 70 |             
 71 |             <li class="previous">
 72 |                 <a href="/tag/humor/page/1/"><span aria-hidden="true">&larr;</span> Previous</a>
 73 |             </li>
 74 |             
 75 |             
 76 |         </ul>
 77 |     </nav>
 78 |     </div>
 79 |     <div class="col-md-4 tags-box">
 80 |         
 81 |             <h2>Top Ten tags</h2>
 82 |             
 83 |             <span class="tag-item">
 84 |             <a class="tag" style="font-size: 28px" href="/tag/love/">love</a>
 85 |             </span>
 86 |             
 87 |             <span class="tag-item">
 88 |             <a class="tag" style="font-size: 26px" href="/tag/inspirational/">inspirational</a>
 89 |             </span>
 90 |             
 91 |             <span class="tag-item">
 92 |             <a class="tag" style="font-size: 26px" href="/tag/life/">life</a>
 93 |             </span>
 94 |             
 95 |             <span class="tag-item">
 96 |             <a class="tag" style="font-size: 24px" href="/tag/humor/">humor</a>
 97 |             </span>
 98 |             
 99 |             <span class="tag-item">
100 |             <a class="tag" style="font-size: 22px" href="/tag/books/">books</a>
101 |             </span>
102 |             
103 |             <span class="tag-item">
104 |             <a class="tag" style="font-size: 14px" href="/tag/reading/">reading</a>
105 |             </span>
106 |             
107 |             <span class="tag-item">
108 |             <a class="tag" style="font-size: 10px" href="/tag/friendship/">friendship</a>
109 |             </span>
110 |             
111 |             <span class="tag-item">
112 |             <a class="tag" style="font-size: 8px" href="/tag/friends/">friends</a>
113 |             </span>
114 |             
115 |             <span class="tag-item">
116 |             <a class="tag" style="font-size: 8px" href="/tag/truth/">truth</a>
117 |             </span>
118 |             
119 |             <span class="tag-item">
120 |             <a class="tag" style="font-size: 6px" href="/tag/simile/">simile</a>
121 |             </span>
122 |             
123 |         
124 |     </div>
125 | </div>
126 | 
127 |     </div>
128 |     <footer class="footer">
129 |         <div class="container">
130 |             <p class="text-muted">
131 |                 Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
132 |             </p>
133 |             <p class="copyright">
134 |                 Made with <span class='sh-red'>❤</span> by <a href="https://scrapinghub.com">Scrapinghub</a>
135 |             </p>
136 |         </div>
137 |     </footer>
138 | </body>
139 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/robots.txt.com/about.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Robots.txt</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Robots.txt Test Site</h1>
 8 |   <p>About page.</p>
 9 |   <a href="/">Home</a>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/robots.txt.com/contact.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Robots.txt</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Robots.txt Test Site</h1>
 8 |   <p>Contact page.</p>
 9 |   <a href="/">Home</a>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/robots.txt.com/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Robots.txt</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Robots.txt Test Site</h1>
 8 |   <a href="login">Login</a>
 9 |   <a href="pwreset">Reset Password</a>
10 |   <a href="account">Account</a>
11 |   <a href="about">About</a>
12 |   <a href="contact">Contact</a>
13 | </body>
14 | </html>
15 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/robots.txt.com/login.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Robots.txt</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Robots.txt Test Site</h1>
 8 |   <p>Login page.</p>
 9 |   <a href="/">Home</a>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/robots.txt.com/pwreset.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Robots.txt</title>
 5 |   <meta name="robots" content="noindex">
 6 | </head>
 7 | <body>
 8 |   <h1>Robots.txt Test Site</h1>
 9 |   <p>Password reset page.</p>
10 |   <a href="/">Home</a>
11 | </body>
12 | </html>
13 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/robots.txt.com/robots.txt:
--------------------------------------------------------------------------------
 1 | User-agent: Slurp
 2 | Crawl-delay: 4
 3 | 
 4 | User-agent: wgit
 5 | Allow: /
 6 | Disallow: /login
 7 | 
 8 | User-agent: yacybot
 9 | Disallow: *
10 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/span_display.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <body>
 3 |         <!-- [
 4 |             'Running the following Wgit code will programmatically configure your database:',
 5 |             "db = Wgit::Database.new '<connection_string>'"
 6 |         ] -->
 7 |         <script>print("hello world");</script>
 8 |         <p>Running the following Wgit code will programmatically configure your database:</p>
 9 |         <span class="pl-s1">db</span> <span class="pl-c1">=</span> <span class="pl-v">Wgit</span>::<span class="pl-v">Database</span><span class="pl-kos">.</span><span class="pl-en">new</span> <span class="pl-s">'&lt;connection_string&gt;'</span>
10 |     </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/test-site.com/about.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <base href="public">
 5 |   <title>About</title>
 6 | </head>
 7 | <body>
 8 |   <h1>About</h1>
 9 |   <a href="records">Records</a>
10 | </body>
11 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/test-site.com/application.js.html:
--------------------------------------------------------------------------------
1 | alert("blah");
2 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/test-site.com/contact.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Contact</title>
 5 | </head>
 6 | <body>
 7 |   <h1>Contact</h1>
 8 |   <a href="search">Search</a>
 9 |   <a href="sneaky">Sneaky</a>
10 |   <a href="doesntexist">Invalid URL</a>
11 | </body>
12 | </html>
13 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/test-site.com/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Home</title>
 5 |   <script src="application.js"></script>
 6 | </head>
 7 | <body>
 8 |   <h1>Home</h1>
 9 |   <a href="contact">Contact</a>
10 |   <a href="search">Search</a>
11 |   <a href="about">About</a>
12 |   <a href="https://test-site.com/about">Absolute About</a>
13 |   <a href="http://test-site.co.uk">Test Site UK</a>
14 |   <a href="http://ftp.test-site.com">FTP Test Site</a>
15 |   <a href="ftp">FTP Test Site Redirect</a>
16 | </body>
17 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/test-site.com/public/records.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Records</title>
 5 | </head>
 6 | <body>
 7 |   <h1 id="top">Records</h1>
 8 |   <a href="search">Search</a>
 9 |   <a href="?q=username">Search</a>
10 |   <a href="#top">Top of Page</a>
11 | </body>
12 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/test-site.com/search.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>Search</title>
 5 |   <link rel="stylesheet" type="text/css" href="theme.css">
 6 | </head>
 7 | <body>
 8 |   <h1>Search</h1>
 9 |   <a href="/">Home</a>
10 |   <a href="contact">Contact</a>
11 | </body>
12 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/test-site.com/theme.css.html:
--------------------------------------------------------------------------------
 1 | body {
 2 |   background-color: lightblue;
 3 | }
 4 | 
 5 | h1 {
 6 |   color: white;
 7 |   text-align: center;
 8 | }
 9 | 
10 | p {
11 |   font-family: verdana;
12 |   font-size: 20px;
13 | }
14 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/test_doc.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html><!-- http://www.mytestsite.com/home -->
 2 | <html>
 3 | 
 4 | <head>
 5 |   <meta http-equiv="Content-type" content="text/html; charset=utf-8">
 6 |   <title>My Test Webpage</title>
 7 |   <meta name="description" content="Webpage for testing the wgit gem">
 8 |   <meta name="author" content="Michael Telford">
 9 |   <meta name="keywords" content="Minitest, Ruby, Test Document">
10 |   <link rel="stylesheet" href="/styles.css">
11 |   <script type="text/javascript" src="http://www.mytestsite.com/client.js"></script>
12 |   <script type="text/javascript" src="http://www.external-scripts.com/code.js"></script>
13 |   <script type="text/javascript">var msg = "Hello from html head";</script>
14 | </head>
15 | 
16 | <body id="main-body" onload="">
17 |   <script type="text/javascript">var msg = "Hello from html body";</script>
18 |   <h1>Howdy!</h1><br>
19 |   <div><a href="#welcome">Welcome</a></div>
20 |   <div><a href="?foo=bar">Foo Bar</a></div>
21 |   <div><a href="http://www.google.co.uk">Google</a></div>
22 |   <div><a href="//fonts.googleapis.com">Scheme-relative URL</a></div>
23 |   <div><a href="http://www.mytestsite.com/security.html">Security</a></div>
24 |   <h2 id="welcome">Welcome to my site, I hope you like what you see and enjoy browsing the various randomness.</h2>
25 |   <div><a href="/about.html">About</a></div>
26 |   <div><a href="about.html/">About 2</a><!-- This duplicate URL is deliberate --></div>
27 |   <div><a href="/">Index</a></div>
28 |   <br>
29 |   <br>
30 |   <img src="https://www.w3schools.com/html/pic_trulli.jpg" alt="Image alt text" height="20" width="20">
31 |   <p>This page is primarily for testing the Ruby code used in Wgit with the Minitest framework.</p>
32 |   <div>
33 |     Here is a table:
34 |     <table>
35 |       <tr>
36 |         <th>Country</th>
37 |         <th>Capital</th>
38 |       </tr>
39 |       <tr>
40 |         <td>England</td>
41 |         <td>London</td>
42 |       </tr>
43 |       <tr>
44 |         <td>Ireland</td>
45 |         <td>Dublin</td>
46 |       </tr>
47 |     </table>
48 |   </div>
49 |   <br />
50 |   <div id="minitest">
51 |     Minitest rocks!! It's simplicity and power matches the Ruby language in which it's developed.
52 |   </div>
53 |   <br>
54 |   <div id="login-form">
55 |     <form action="#">
56 |       <div>
57 |         <label for="username">Username:</label><br />
58 |         <input name="username" type="text" />
59 |       </div>
60 |       <div>
61 |         <label for="password">Password:</label><br />
62 |         <input name="password" type="password" />
63 |       </div>
64 |       <div>
65 |         <button type="submit">Login</button>
66 |         <button type="reset">Clear Form</button>
67 |       </div>
68 |     </form>
69 |   </div>
70 |   <br />
71 |   <a href="http://www.yahoo.com">Yahoo</a><br>
72 |   <a href="/contact.html">Contact</a><br>
73 |   <a href="http://www.bing.com/">Bing</a><br>
74 |   <a href="http://www.mytestsite.com">Index 2</a><br><!-- Duplicate of / -->
75 |   <a href="http://www.mytestsite.com/">Index 3</a><br><!-- Duplicate of / -->
76 |   <a href="http://www.mytestsite.com/tests.html">Tests</a><br>
77 |   <a href="https://search.yahoo.com/search?q=hello&page=2">Yahoo Search</a><br>
78 |   <a href="/blog#about-us">Blog</a><br>
79 |   <a href="https://example.com/blog#about-us">Example.com Blog</a><br>
80 |   <a href="/contents/">Contents</a><br>
81 |   <a href="http://ftp.mytestsite.com">Same Domain FTP Server</a><br>
82 |   <a href="http://ftp.mytestsite.com/">Same Domain FTP Server 2</a><br><!-- Duplicate of ftp.mytestsite.com -->
83 |   <a href="http://ftp.mytestsite.com/files">Same Domain FTP Server Files</a><br>
84 | </body>
85 | 
86 | </html>
87 | 


--------------------------------------------------------------------------------
/test/mock/fixtures/txti.es/about.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 | 	<meta charset="utf-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <meta name="Description" content="Learn what txti is and how you can help keep it free.">
 7 |     <link href="/css/styles.min.css" type="text/css" rel="stylesheet">
 8 | 	<title>About txti</title>
 9 | </head>
10 | 
11 | <body>
12 | 	<h1>Meet <a href="/">txti</a></h1>
13 | 
14 | 	<p>Txti is fast web pages for everybody. Most of the world still does not have internet, but many websites from countries like the United States are big and complicated. This makes it hard for people with slow internet to use these sites. It is even harder for those people to put their own thoughts on the internet. With txti, anyone can use any device to share their story.</p>
15 |     
16 |     <p>Txti was created by <a href="http://twitter.com/thebarrytone">Barry T. Smith</a> because he believes that high speed internet is a responsibility, not a service people buy. He got a lot of help from his friend <a href="http://twitter.com/neatnikfun">Adam Newbold</a>.</p>
17 |     
18 |     <h2>How you can help <a href="/">txti</a></h2>
19 |     
20 |     <p>We want to keep txti free to use forever. There are a few easy ways you can help. The best way you can help is by using txti regularly and by telling your friends about txti. Next, you can follow txti (<a href="http://twitter.com/txties">@txties</a>) and txti's creator, Barry (<a href="http://twitter.com/thebarrytone">@thebarrytone</a>), on twitter.</p>
21 |         
22 |     <p>If you really believe that txti is important, please consider <a href="https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU">donating money with PayPal</a>.</p>
23 |     <p>But if you're really awesome, you could tip some dogecoin to this address: <code>DAon8fhTHbme13vc5phqk9JmHWesZfxYjX</code></p>
24 |     
25 |     <p><strong>Thank you for using txti!</strong></p>
26 |     
27 |     <p class="footer"><a href="/">txti home</a></p>
28 | 
29 | </body>
30 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/txti.es/how.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 | 	<meta charset="utf-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <meta name="Description" content="Learn how to use txti to make the fastest, simplest web page on the internet.">
 7 |     <link href="/css/styles.min.css" type="text/css" rel="stylesheet">
 8 | 	<title>How to use txti</title>
 9 | </head>
10 | 
11 | <body>
12 | 	<h1>How to make a web page with <a href="/">txti</a></h1>
13 |     
14 |     <p>The only thing you have to give txti is the "Content." Txti takes care of the rest, but you can take advantage of these options:</p>
15 |     <dl>
16 |         <dt>Custom URL</dt>
17 |         <dd>This is the part that goes after "txti.es/" like "txti.es/barry". Currently only accepts lowercase letters (a-z), numbers (0-9), and dashes (-). If you put anything else in there, txti will change it.</dd>
18 |         <dt>Custom Edit Code</dt>
19 |         <dd>Txti will give you a random edit code, but you can change it. You have to have this to make changes to the txti, and it can never EVER be retrieved if lost.</dd>
20 |         <dt>Title</dt>
21 |         <dd>Give your txti a relevant title so it shows up better when shared on social media, search results, and in browser windows.</dd>
22 |         <dt>Author</dt>
23 |         <dd>Let people know who made this! This shows up in some searches and social media posts, and your Twitter handle works really nicely if you post the link on Twitter.</dd>
24 |         <dt>Description</dt>
25 |         <dd>This is a short (200 character) summary of the page. This shows up in Twitter cards and search results. Txti will automatically use the first 200 characters of your txti as a description if you don't provide one.</dd>
26 |     </dl>
27 | 
28 |     <h2>Make your txti easy to read and understand</h2>
29 |     <p>You can make your txti robust and full-featured with links, images, lists, headings, and more. Txti uses a popular set of rules called <a href="http://en.wikipedia.org/wiki/Markdown">Markdown</a>.</p>
30 |     <ul>
31 |         <li>#Heading 1 = biggest heading<br>
32 |             ##Heading 2 = second biggest heading<br>
33 |             ###Heading 3 = third biggest, and so on
34 |         </li>
35 |         <li>Return once starts a new line. Return twice (leaving an empty line) starts a new paragraph.</li>
36 |         <li>*italics* = <em>italics</em></li>
37 |         <li>**bold** = <strong>bold</strong></li>
38 |         <li>Links: [link to txti](http://txti.es) = <a href="http://txti.es">link to txti</a> (note: be sure to include the "http://" part of the link)</li>
39 |         <li>Images*: ![Monkey selfie](http://i.imgur.com/FXSBf8c.jpg)</li>
40 |         <li>Bulleted lists:<br>
41 |             - Bulleted item a<br>
42 |             - Bulleted item b<br>
43 |             - Bulleted item c
44 |         </li>
45 |         <li>Numbered lists:<br>
46 |             1. Numbered item 1<br>
47 |             2. Numbered item 2<br>
48 |             3. Numbered item 3
49 |         </li>
50 |     </ul>
51 |     <p>*Images must be uploaded somewhere else (we recommend <a href="http://imgur.com">imgur.com</a>). Because images can really slow down pages and we're all about "fast web pages for everybody," images are not displayed by default. The reader has the option of displaying the images in the page or just viewing links to them. See an example at <a href="/images">txti.es/images</a>.</p>
52 |     
53 |     <h2>Advanced stuff</h2>
54 |     <p>Txti uses Twitter Cards, so when you share a txti link on Twitter, it will show a bigger summary of the link (see above to read about the custom options you have).</p>
55 |     <p>Txti has a minimal API so you can use the content of your txti in other applications. Just add /json, /xml, or /html to the end of your txti's URL (such as <a href="/barry/json">txti.es/barry/json</a>).</p>
56 | 
57 |     <h2>Important notes</h2>
58 |     <ul>
59 |         <li>Txti will delete any pages that are more than 6 months old but have been viewed less than two times ever. If you make a txti and forget about it, it might not be there 6 months later.</li>
60 |         <li>Legal concerns or pages you think violate txti's <a href="/terms">terms of service</a> can be sent to <a href="mailto:legal@txti.es">legal@txti.es.</a></li>
61 |     </ul>
62 |     <p class="footer"><a href="/">txti home</a></p>
63 | 
64 | </body>
65 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/txti.es/images.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 | 	<meta charset="utf-8">
 5 | 	<meta name="viewport" content="width=device-width">
 6 | 	<meta name="author" content="@txties">
 7 | 	<meta name="description" content="Images in txti All images will be centered and start on a new line (so text doesn't flow around them. They will be sized so the width fits in the content area. Notice how the description is displayed ...">
 8 | 	<meta property="og:url" content="http://txti.es/images">
 9 | 	<meta property="og:title" content="Made via txti.es:">
10 | 	<meta property="og:site_name" content="txti">
11 | 	<meta property="og:description" content="Images in txti All images will be centered and start on a new line (so text doesn't flow around them. They will be sized so the width fits in the content area. Notice how the description is displayed ...">
12 | 	<meta name="twitter:card" content="summary">
13 | 	<meta name="twitter:title" content="Made via txti.es:">
14 | 	<meta name="twitter:description" content="Images in txti All images will be centered and start on a new line (so text doesn't flow around them. They will be sized so the width fits in the content area. Notice how the description is displayed ...">
15 | 	<meta name="twitter:creator" content="@txties">
16 | 	<title>Made via txti.es:</title>
17 | 	<style type="text/css">body {font-size: 1.1em; line-height: 1.5em; max-width: 45em; margin: auto; padding: 0 2%;} img {max-width: 100%; display: block; margin: .75em auto;}</style>
18 | </head>
19 | <body>
20 | <div style="background: pink; text-align: center; padding: .25em;">This txti has images. Read without them, or <a href="images/images">click here to load the images</a>.</div><h1>Images in txti</h1> <p>All images will be centered and start on a new line (so text doesn't flow around them. They will be sized so the width fits in the content area. Notice how the description is displayed so the user doesn't need to load images to get the idea. It creates a link so the user can click to view (or they can click the pink banner that appears above at first).</p> <p> <a href="http://i.imgur.com/xPGf9bZ.jpg">[Image: This is an image of txti on a flip phone]</a> </p> <p>Doing it this way means that anyone can open any page on txti and know that it will load super fast! Then they can choose to view images if they have the bandwidth.</p>
21 | <p style="text-align: right"><a href="/">txti</a></p>
22 | </body>
23 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/txti.es/images/images.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 | 	<meta charset="utf-8">
 5 | 	<meta name="viewport" content="width=device-width">
 6 | 	<meta name="author" content="@txties">
 7 | 	<meta name="description" content="Images in txti All images will be centered and start on a new line (so text doesn't flow around them. They will be sized so the width fits in the content area. Notice how the description is displayed ...">
 8 | 	<meta property="og:url" content="http://txti.es/images">
 9 | 	<meta property="og:title" content="Made via txti.es:">
10 | 	<meta property="og:site_name" content="txti">
11 | 	<meta property="og:description" content="Images in txti All images will be centered and start on a new line (so text doesn't flow around them. They will be sized so the width fits in the content area. Notice how the description is displayed ...">
12 | 	<meta name="twitter:card" content="summary">
13 | 	<meta name="twitter:title" content="Made via txti.es:">
14 | 	<meta name="twitter:description" content="Images in txti All images will be centered and start on a new line (so text doesn't flow around them. They will be sized so the width fits in the content area. Notice how the description is displayed ...">
15 | 	<meta name="twitter:creator" content="@txties">
16 | 	<title>Made via txti.es:</title>
17 | 	<style type="text/css">body {font-size: 1.1em; line-height: 1.5em; max-width: 45em; margin: auto; padding: 0 2%;} img {max-width: 100%; display: block; margin: .75em auto;}</style>
18 | </head>
19 | <body>
20 | <h1>Images in txti</h1>
21 | <p>All images will be centered and start on a new line (so text doesn't flow around them. They will be sized so the width fits in the content area. Notice how the description is displayed so the user doesn't need to load images to get the idea. It creates a link so the user can click to view (or they can click the pink banner that appears above at first).</p>
22 | <p><img alt="This is an image of txti on a flip phone" src="http://i.imgur.com/xPGf9bZ.jpg" /></p>
23 | <p>Doing it this way means that anyone can open any page on txti and know that it will load super fast! Then they can choose to view images if they have the bandwidth.</p>
24 | <p style="text-align: right"><a href="/">txti</a></p>
25 | </body>
26 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/txti.es/index.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 |     <meta charset="utf-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <meta name="Description" content="Txti is a free service that lets you create the fastest, simplest, most shareable web pages on the internet using any phone, tablet, or computer you have.">
 7 |     <meta name="author" content="Barry T. Smith">
 8 |     
 9 |     <meta property="og:url" content="http://txti.es">
10 |     <meta property="og:title" content="txti - Fast web pages for everybody">
11 |     <meta property="og:image" content="http://txti.es/favicon-196x196.png">
12 |     <meta property="og:site_name" content="txti">
13 |     <meta property="og:description" content="Txti is a free service that lets you create the fastest, simplest, most shareable web pages on the internet using any phone, tablet, or computer you have.">
14 |     
15 |     <meta name="twitter:card" content="summary">
16 |     <meta name="twitter:site" content="@txties">
17 |     <meta name="twitter:creator" content="@thebarrytone">
18 |     <meta name="twitter:title" content="txti - Fast web pages for everybody">
19 |     <meta name="twitter:description" content="Txti is a free service that lets you create the fastest, simplest, most shareable web pages on the internet using any phone, tablet, or computer you have.">
20 |     
21 |     <title>txti - Fast web pages for everybody</title>
22 |     
23 |     <link rel="shortcut icon" href="/favicon.ico">
24 |     <link rel="apple-touch-icon" sizes="57x57" href="/apple-touch-icon-57x57.png">
25 |     <link rel="apple-touch-icon" sizes="114x114" href="/apple-touch-icon-114x114.png">
26 |     <link rel="apple-touch-icon" sizes="72x72" href="/apple-touch-icon-72x72.png">
27 |     <link rel="apple-touch-icon" sizes="144x144" href="/apple-touch-icon-144x144.png">
28 |     <link rel="apple-touch-icon" sizes="60x60" href="/apple-touch-icon-60x60.png">
29 |     <link rel="apple-touch-icon" sizes="120x120" href="/apple-touch-icon-120x120.png">
30 |     <link rel="apple-touch-icon" sizes="76x76" href="/apple-touch-icon-76x76.png">
31 |     <link rel="apple-touch-icon" sizes="152x152" href="/apple-touch-icon-152x152.png">
32 |     <link rel="icon" type="image/png" href="/favicon-196x196.png" sizes="196x196">
33 |     <link rel="icon" type="image/png" href="/favicon-160x160.png" sizes="160x160">
34 |     <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
35 |     <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
36 |     <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
37 |     <meta name="msapplication-TileColor" content="#9f00a7">
38 |     <meta name="msapplication-TileImage" content="/mstile-144x144.png">
39 |     <meta name="msapplication-config" content="/browserconfig.xml">
40 |     
41 |     <link href="/css/styles.min.css" rel="stylesheet" type="text/css">
42 | </head>
43 | <body>
44 |     
45 | <h1><a href="http://txti.es">txti</a></h1>
46 | <p>Fast web pages for everybody.</p>
47 | <a href="http://txti.es/about">What is txti?</a> | <a href="http://txti.es/how">How to use txti</a>
48 | <h2>Create a txti</h2>
49 | <form id="create-a-txti" method="post" action="http://txti.es">
50 | 	<label for="content-input" >Content (required)</label>
51 | 	<textarea class="text-input" id="content-input" name="content"></textarea>
52 | 	<input type="hidden" name="form_level" value="1">
53 | 	<p>By continuing, you agree to the <a href="http://txti.es/terms">terms of service.</a></p>
54 | 	<label for="username" class="nope">If you are human, leave this field blank (required):</label><input type="text" id="username" class="nope" name="username">
55 | 	<input type="submit" id="submit" name="submit" value="Save and done">
56 | 	 or <input type="submit" id="increase_form_level" name="increase_form_level" value="Show more options">
57 | </form>
58 |     
59 | </body>
60 | </html>


--------------------------------------------------------------------------------
/test/mock/fixtures/txti.es/terms.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 | 	<meta charset="utf-8">
 5 | 	<meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <meta name="Description" content="Legal agreement for using txti.">
 7 |     <link href="/css/styles.min.css" type="text/css" rel="stylesheet">
 8 | 	<title>txti - Terms of Service</title>
 9 | </head>
10 | 
11 | <body>
12 |     <h1><a href="/">txti</a> Terms of Service</h1>
13 |     
14 |     <h2>IMPORTANT - READ BEFORE USING.</h2>
15 | 
16 |     <p>Do not use this Application until You have carefully read the following terms and conditions.  By choosing or clicking &ldquo;I Agree&rdquo; (or similar), You acknowledge and accept the terms and conditions of this agreement (&ldquo;Agreement&rdquo;).  If You do not agree, You are granted no rights in or to the Application and shall not use the Application.</p>
17 |     
18 |     <p>This Application is owned and provided by TXTI, LLC. (&ldquo;TXTI,&rdquo; &ldquo;We,&rdquo; or &ldquo;Us&rdquo;).  Questions concerning this Application or its operation should be directed to https://twitter.com/thebarrytone.  Use of the Application is offered to You conditioned on Your acceptance without modification of this Agreement. You agree to familiarize Yourself with and abide by the Agreement if and when You use the Application.</p>
19 | 
20 |     <h2>DEFINITIONS</h2>
21 |     <ul>
22 |         <li>"You" or "Your" means the person who is being licensed to use the Application.</li>
23 |         <li>"Application" means all computer programs and documentation related to the TXTI application which focuses on creating text-based websites.</li>
24 |     </ul>
25 | 
26 |     <h2>LICENSE GRANT</h2>
27 |     <p>TXTI hereby grants to You a non-exclusive, non-transferable limited license to use the Application.</p>
28 | 
29 |     <h2>POLICY</h2>
30 |     <ul>
31 |         <li>You are responsible for everything you post/upload.  If We, in Our sole judgment, deem that You have violated this policy or Agreement, We reserve the right to terminate Your access to the Application and remove Your material.  TXTI shall have no obligation or liability with regard to any deleted material.</li>
32 |         <li>This site is not for sexually explicit material including, but not limited to, child exploitation. You grant us permission to disclose your private information to law enforcement, if We, in Our sole judgment, deem that You have violated this policy in any way or if we receive a subpoena or questions from law enforcement officers investigating child exploitation or any other violation of State, Federal or Local laws.</li>
33 |         <li>This site is not for obscene material.  Material that is designed to predominantly appeal to prurient interests, or which goes substantially beyond customary limits of society, is strictly prohibited.</li>
34 |         <li>Respect copyright. Only post material that You made or that You are authorized to use. This means do not use content that someone else owns the copyright to (e.g., lyrics, text, etc.) without necessary authorizations.</li>
35 |         <li>We encourage free speech and defend everyone's right to express unpopular points of view. But We do not permit hate speech (speech which attacks or demeans a group based on race or ethnic origin, religion, disability, gender, age, veteran status, and sexual orientation/gender identity).</li>
36 |         <li>Predatory behavior, stalking, threats, harassment, intimidation, invading privacy, revealing other people&rsquo;s personal information, and inciting others to commit violent acts shall be considered a violation of this Policy.</li>
37 |         <li>TXTI will delete any pages that are more than 6 months old but have been viewed less than two times ever. If you make a TXTI and forget about it, it might not be there 6 months later.</li>
38 |     </ul>
39 | 
40 |     <h2>TERMINATION</h2>
41 |     <p>TXTI retains the right to, at any time, and in its sole discretion, terminate this AGREEMENT.  Upon the termination of this AGREEMENT, all rights granted to You under this AGREEMENT shall immediately terminate and You shall discontinue all use of the Application.</p>
42 | 
43 |     <h2>SECURITY</h2>
44 |     <p>Any user names and/or passwords used for this Application are for individual use only. You will be responsible for the security of Your user name and/or password (if any).  TXTI is under no obligation to preserve or make accessible any data uploaded or posted through TXTI.</p>
45 | 
46 |     <h2>NO REPRESENTATIONS OR WARRANTIES</h2>
47 |     <p>TXTI makes no representations or warranties that this Application is free of defects, viruses or other harmful components. TXTI shall not be responsible for any damages or loss that may result from the hacking or infiltration of this Application or associated computer systems and data servers. YOU HAVE THE SOLE RESPONSIBILITY FOR ADEQUATE PROTECTION AND BACKUP OF DATA AND/OR EQUIPMENT USED IN CONNECTION WITH THIS APPLICATION AND YOU AGREE TO HOLD TXTI HARMLESS FROM, AND YOU COVENANT NOT TO SUE TXTI FOR, ANY CLAIMS BASED ON THE USE OF THIS APPLICATION, INCLUDING CLAIMS FOR LOST DATA, WORK DELAYS OR LOST PROFITS RESULTING FROM USE OF MATERIALS OR CONTENT FROM THIS APPLICATION. THE APPLICATION MAY CONTAIN TECHNICAL INACCURACIES, OUTDATED INFORMATION AND TYPOGRAPHICAL ERRORS. ALL MATERIALS, INFORMATION, APPLICATION, AND SERVICES INCLUDED IN OR AVAILABLE THROUGH THIS APPLICATION ARE PROVIDED &ldquo;AS IS&rdquo; AND &ldquo;AS AVAILABLE.&rdquo; TXTI DISCLAIMS ALL WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. TXTI MAKES NO WARRANTIES OR REPRESENTATIONS CONCERNING THE SUITABILITY, RELIABILITY, AVAILABILITY, TIMELINESS, OR ACCURACY OF THE INFORMATION, PRODUCTS OR SERVICES CONTAINED IN OR OBTAINED THROUGH THE APPLICATION FOR ANY PURPOSE. SOME JURISDICTIONS DO NOT PERMIT THE EXCLUSION OF CERTAIN WARRANTIES; THESE EXCLUSIONS MAY NOT APPLY TO YOU.  NO AGENT OF TXTI IS AUTHORIZED TO ALTER OR EXCEED THE WARRANTY OBLIGATIONS OF TXTI AS SET FORTH HEREIN.  ANY IMPLIED WARRANTIES THAT CANNOT BE EXCLUDED ARE LIMITED TO THE SHORTEST PERIOD PERMITTED BY THE APPLICABLE LAW.</p>
48 | 
49 |     <h2>DISCLAIMER OF LIABILITY</h2>
50 |     <p>UNDER NO CIRCUMSTANCES SHALL TXTI OR ITS SUBSIDIARIES, AFFILIATES, LICENSORS, SERVICE PROVIDERS, CONTENT PROVIDERS, EMPLOYEES, AGENTS, OFFICERS, AND DIRECTORS BE LIABLE FOR ANY DIRECT, INDIRECT, PUNITIVE, INCIDENTAL, SPECIAL, CONSEQUENTIAL, OR ANY OTHER DAMAGES WHATSOEVER THAT MAY RESULT FROM THE USE OF OR THE INABILITY TO USE THIS APPLICATION, INCLUDING WITHOUT LIMITATION, DAMAGES ARISING FROM MISTAKES, OMISSIONS, INTERRUPTIONS, DETERIORATION OR CORRUPTION OF FILES, DELETION OR CORRUPTION OF EMAIL, ERRORS, LOSS OF DATA, LOSS OF PROFITS, DEFECTS, VIRUSES, AND/OR DELAYS. THIS LIMITATION APPLIES WHETHER THE ALLEGED LIABILITY IS BASED ON CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY OR OTHERWISE, EVEN IF TXTI HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. YOU ACKNOWLEDGE AND AGREE THAT THE CONSIDERATION WHICH TXTI IS CHARGING HEREUNDER DOES NOT INCLUDE ANY CONSIDERATION FOR ASSUMPTION BY TXTI OF THE RISK OF LICENSEE'S CONSEQUENTIAL OR INCIDENTAL DAMAGES WHICH MAY ARISE IN CONNECTION WITH LICENSEE'S USE OF THE APPLICATION AND DOCUMENTATION.  SOME JURISDICTIONS DO NOT PERMIT THE EXCLUSION OR LIMITATION OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES; TXTI&rsquo;S LIABILITY IN SUCH JURISDICTIONS SHALL BE LIMITED TO THE FULLEST EXTENT PERMITTED BY LAW.  The essential purpose of this provision is to limit the potential liability of TXTI arising out of this AGREEMENT. The parties acknowledge that the limitations set forth in this Section are integral to the amount of consideration levied in connection with the license of the Application and that, were TXTI to assume any further liability other than as set forth herein, such consideration would of necessity be set substantially higher.  If You are dissatisfied with any portion of the Application, Your sole remedy is to cease using it.</p>
51 | 
52 |     <h2>INDEMNITY</h2>
53 |     <p>You agree to defend, indemnify, and hold harmless TXTI and its employees, agents, directors, officers and shareholders, from and against all liabilities, claims, damages, and expenses (including without limitation reasonable attorneys&rsquo; fees and costs) arising out of Your use of this Application, Your breach of this AGREEMENT, or Your infringement of the intellectual property rights of third parties.</p> 
54 | 
55 |     <h2>GENERAL</h2>
56 |     <p>This AGREEMENT shall be given effect to the fullest extent permissible by law. In case any one or more of the provisions contained in this agreement shall for any reason be held to be invalid, illegal or unenforceable in any respect, such invalidity, illegality, or unenforceability shall not affect any other provisions hereof, and this agreement shall be construed to give maximum legal effect to the intent expressed herein.</p>
57 | 
58 |     <ul>
59 |         <li>This agreement is governed by, and construed in accordance with the laws of the State of Ohio, without giving effect to any principles of conflicts of law. You hereby consent to the exclusive jurisdiction and venue of the courts of the State of Ohio or, if appropriate, the United States District Court for the Southern District of Ohio for the resolution of all disputes arising out of or relating to the use of this Application and the associated services. The United Nations Convention on Contracts for the International Sale of Goods does not apply to this software or the software license pertaining to this agreement.</li>
60 |         <li>TXTI may assign this AGREEMENT, in whole or in part, at any time.</li>
61 |         <li>This AGREEMENT constitutes the entire agreement between TXTI and You with respect to this Application, and these agreements supersede all prior or contemporaneous communications, proposals, and agreements, whether electronic, oral, or written, between TXTI and You with respect to the Application. As such, these terms of use represent the entire understanding relating to the use of this Application and prevail over any prior or contemporaneous, conflicting or additional communications.</li>
62 |         <li>TXTI's performance of this AGREEMENT is subject to existing laws and legal process. Nothing contained in this AGREEMENT is in derogation of TXTI's right to comply with governmental, court and law enforcement requests relating to Your use of the Application, or information collected by TXTI in connection with such use.</li>
63 |         <li>Any unauthorized access, modification or change of any information, or any interference with the availability of or access to this Application is strictly prohibited. TXTI reserves all legal rights and remedies available to it and this disclaimer shall in no way be deemed a limitation or waiver of any other rights TXTI may have.</li>
64 |         <li>TXTI may change the terms of this AGREEMENT from time to time. You agree to check the TXTI's website, where the latest copy of the AGREEMENT will be posted, for any material changes. You expressly agree that the continued use of any software provided by the TXTI after the effective date of any change will constitute your consent to any such revised AGREEMENT. If at any time You do not accept any such revision, You must cease the use of the Application.</li>
65 |         <li>The provisions which, by their nature, should survive termination of this AGREEMENT shall do so.</li>
66 |     </ul>
67 | 
68 | <h2>RESERVATION OF RIGHTS</h2>
69 |     <p>All rights not expressly granted herein are reserved exclusively and entirely to TXTI.</p>
70 | 
71 | 
72 | <p>END OF AGREEMENT</p>
73 | <p>TXTI is a registered trademark of TXTI, LLC. All rights reserved.</p>
74 |     <p class="footer"><a href="/">txti home</a></p>
75 | </body>
76 | </html>


--------------------------------------------------------------------------------
/test/mock/save_page.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | # frozen_string_literal: true
 3 | 
 4 | # Script to save a single web page's HTML to disk. For example,
 5 | # http://blah.com/admin/about will be saved as:
 6 | # <path_to_script>/fixtures/blah.com.html
 7 | # Call this script like: `ruby save_page.rb "http://blah.com"` or use toys task.
 8 | 
 9 | require_relative "../../lib/wgit"
10 | require "fileutils"
11 | 
12 | def save_page(url)
13 |   url     = Wgit::Url.parse(url)
14 |   path    = "#{File.expand_path(__dir__)}/fixtures"
15 |   crawler = Wgit::Crawler.new
16 | 
17 |   FileUtils.mkdir_p(path)
18 |   Dir.chdir(path)
19 | 
20 |   # Save the HTML file for the page.
21 |   crawler.crawl_url(url) do |doc|
22 |     if doc.empty?
23 |       puts "Invalid URL: #{doc.url}"
24 |       next
25 |     end
26 | 
27 |     file_path = url.to_host
28 |     file_path += ".html" unless file_path.end_with? ".html"
29 |     puts "Saving document #{file_path}"
30 |     File.open(file_path, "w") { |f| f.write(doc.html) }
31 |   end
32 | end
33 | 
34 | if $PROGRAM_NAME == __FILE__
35 |   raise "ARGV[0] must be a URL" unless ARGV[0]
36 | 
37 |   url = ARGV[0]
38 |   save_page(url)
39 | end
40 | 


--------------------------------------------------------------------------------
/test/mock/save_site.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | # frozen_string_literal: true
 3 | 
 4 | # A script which saves a website's HTML to disk. For example,
 5 | # http://blah.com/admin/about will be saved as:
 6 | # <path_to_script>/fixtures/blah.com/admin/about.html
 7 | # Call this script like: `ruby save_site.rb http://blah.com` or use toys task.
 8 | 
 9 | require_relative "../../lib/wgit"
10 | require "fileutils"
11 | 
12 | def save_site(base_url, follow: :default)
13 |   base_url = Wgit::Url.parse(base_url)
14 |   path     = "#{File.expand_path(__dir__)}/fixtures/#{base_url.to_host}"
15 |   crawler  = Wgit::Crawler.new
16 | 
17 |   FileUtils.mkdir_p(path)
18 |   Dir.chdir(path)
19 | 
20 |   # Save the site to disk.
21 |   crawler.crawl_site(base_url, follow: follow) do |doc|
22 |     url = doc.url
23 | 
24 |     if doc.empty?
25 |       puts "Invalid URL: #{url}"
26 |       next
27 |     end
28 | 
29 |     # Save the index.html file to disk.
30 |     if !base_url.omit_slashes.to_path && url.omit_slashes == base_url.omit_slashes
31 |       puts "Saving document #{base_url.to_host}/index.html"
32 |       File.open("index.html", "w") { |f| f.write(doc.html) }
33 |       next
34 |     end
35 | 
36 |     # Work out the file structure on disk.
37 |     segs = url.omit_base.split("/").reject(&:empty?)
38 |     dir = ""
39 |     if segs.length == 1
40 |       file_name = segs[0]
41 |     else
42 |       file_name = segs.pop
43 |       segs.each { |seg| dir += "#{seg}/" }
44 |       dir.chop! # Remove trailing slash.
45 |     end
46 | 
47 |     # Create the directory if necessary.
48 |     if dir != ""
49 |       FileUtils.mkdir_p(dir)
50 |       dir += "/"
51 |     end
52 | 
53 |     file_path = dir + file_name
54 |     file_path += ".html" unless file_path.end_with? ".html"
55 | 
56 |     # Save the HTML file for the page.
57 |     puts "Saving document #{base_url.to_host}/#{file_path}"
58 |     File.open(file_path, "w") { |f| f.write(doc.html) }
59 |   end
60 | end
61 | 
62 | if $PROGRAM_NAME == __FILE__
63 |   raise "ARGV[0] must be a URL" unless ARGV[0]
64 | 
65 |   base_url = ARGV[0]
66 |   xpath    = ARGV[1] || :default
67 |   save_site(base_url, follow: xpath)
68 | end
69 | 


--------------------------------------------------------------------------------
/test/mock/webmock.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require "webmock"
  4 | require "uri"
  5 | 
  6 | include WebMock::API # Brings stub_request into scope for the helpers below.
  7 | 
  8 | WebMock.enable!
  9 | WebMock.disable_net_connect!(allow: %w[127.0.0.1 vlang.io duckduckgo.com]) # Only these hosts may be reached.
 10 | 
 11 | # Any custom Typhoeus mocking (missing from Webmock) goes below.
 12 | class Typhoeus::Response
 13 |   def total_time
 14 |     total_time = options[:total_time]
 15 |     return total_time if total_time.positive?
 16 | 
 17 |     rand(0.2...0.7)
 18 |   end
 19 | 
 20 |   def primary_ip
 21 |     "192.241.176.#{rand(10..99)}"
 22 |   end
 23 | end
 24 | 
 25 | def fixtures_dir # Fixture directory, relative to the current working dir.
 26 |   "test/mock/fixtures"
 27 | end
 28 | 
 29 | # Return the contents of a HTML fixture file.
 30 | def fixture(file)
 31 |   file = "#{file}.html" if %w[.html robots.txt].none? { |ext| file.end_with?(ext) }
 32 |   file_path = file.start_with?(fixtures_dir) ? file : "#{fixtures_dir}/#{file}"
 33 |   File.read(file_path)
 34 | end
 35 | 
 36 | # Return the default HTML fixture data, read from the test_doc fixture.
 37 | def default_html
 38 |   fixture("test_doc")
 39 | end
 40 | 
 41 | # Stub a single webpage. Stubs both:
 42 | # http://blah.com/hi and http://blah.com/hi/ (with trailing slash).
 43 | def stub_page(url, status: 200, body: default_html, fixture: nil)
 44 |   body = fixture(fixture) if fixture
 45 |   stub_request(:get, url).to_return(status: status, body: body)
 46 | 
 47 |   # Webmock only mocks a trailing slash if there's no path so we do it.
 48 |   path = URI(url).path
 49 |   return if path.empty? || path == "/"
 50 | 
 51 |   alt_url = url.end_with?("/") ? url.chop : "#{url}/"
 52 |   stub_request(:get, alt_url).to_return(status: status, body: body)
 53 | end
 54 | 
 55 | # Stub a single page as a 404, with the not_found fixture as its body.
 56 | def stub_not_found(url)
 57 |   stub_page(url, status: 404, fixture: "not_found")
 58 | end
 59 | 
 60 | # Stub a 404 not found for /robots.txt.
 61 | def stub_robots_txt_not_found(urls)
 62 |   urls.each do |url|
 63 |     suffix = url.end_with?("/robots.txt") ? "" : "/robots.txt"
 64 |     stub_not_found(url + suffix)
 65 |   end
 66 | end
 67 | 
 68 | # Stub a single page 301 redirect; to is sent as the Location header.
 69 | def stub_redirect(from, to)
 70 |   stub_request(:get, from).to_return(status: 301, headers: { 'Location': to })
 71 | end
 72 | 
 73 | # Stub a GET of the url to time out (a network timeout/unknown host error).
 74 | def stub_timeout(url)
 75 |   stub_request(:get, url).to_timeout
 76 | end
 77 | 
 78 | # Stub an entire website recursively according to what's saved on the file
 79 | # system. Assumes the fixture data exists on disk.
 80 | def stub_dir(url, path, dir)
 81 |   url.chop!  if url.end_with?("/")  # Remove trailing slash.
 82 |   path.chop! if path.end_with?("/") #   "
 83 |   dir.chop!  if dir.end_with?("/")  #   "
 84 | 
 85 |   url  += "/#{dir}" unless URI(url).host == dir
 86 |   path += "/#{dir}"
 87 | 
 88 |   objects = Dir["#{path}/{*,.*}"]
 89 |             .reject { |f| f.end_with?(".") || f.end_with?("..") }
 90 |   files   = objects
 91 |             .select { |obj| File.file?(obj) }
 92 |             .reject { |f| f.end_with?("index.html") }
 93 |             .map { |f| f.end_with?(".html") ? f[0..-6] : f } # Remove extension.
 94 |   dirs    = objects
 95 |             .select { |obj| File.directory?(obj) }
 96 | 
 97 |   files.each { |f| stub_page("#{url}/#{f.split('/').last}", fixture: f) }
 98 |   dirs.each  { |d| stub_dir(url, path, d.split("/").last) }
 99 | end
100 | 
101 | # Stub all single webpages and full websites from the fixtures directory.
102 | def stub_fixtures(pages, sites)
103 |   pages.each do |url|
104 |     path = URI(url).host
105 |     stub_page(url, fixture: path)
106 |   end
107 | 
108 |   sites.each do |url|
109 |     dir        = URI(url).host
110 |     index_file = "#{dir}/index.html"
111 |     index_path = "#{fixtures_dir}/#{index_file}"
112 | 
113 |     stub_page(url, fixture: index_file) if File.exist?(index_path)
114 |     stub_dir(url, fixtures_dir, dir)
115 |   end
116 | end
117 | 


--------------------------------------------------------------------------------
/test/test_assertable.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require_relative "helpers/test_helper"
  4 | 
  5 | # Test class for the Assertable module functions.
  6 | class TestAssertable < TestHelper
  7 |   include Wgit::Assertable
  8 | 
  9 |   # Run non DB tests in parallel for speed.
 10 |   parallelize_me!
 11 | 
 12 |   # Runs before every test.
 13 |   def setup; end
 14 | 
 15 |   def test_assert_types__pass
 16 |     assert_equal "Hello World!", assert_types("Hello World!", String)
 17 |     assert_equal [1, 2, 3], assert_types([1, 2, 3], [Array, String])
 18 |     assert_equal "/about", assert_types("/about".to_url, String)
 19 |   end
 20 | 
 21 |   def test_assert_types__fail
 22 |     e = assert_raises(StandardError) { assert_types "Hello World!", Integer }
 23 |     assert_equal "Expected: Integer, Actual: String", e.message
 24 | 
 25 |     e = assert_raises StandardError do
 26 |       assert_types [1, 2, 3], [TrueClass, Integer], "An Array is expected"
 27 |     end
 28 |     assert_equal "An Array is expected", e.message
 29 |   end
 30 | 
 31 |   def test_assert_arr_types__pass
 32 |     assert_equal [1, true, "Boom!"], assert_arr_types([1, true, "Boom!"], [Integer, TrueClass, String])
 33 |     assert_equal [1, true, "/about"], assert_arr_types([1, true, "/about".to_url], [Integer, TrueClass, String])
 34 |   end
 35 | 
 36 |   def test_assert_arr_types__fail
 37 |     e = assert_raises StandardError do
 38 |       assert_arr_types [1, true, "Boom!"], [Integer, String]
 39 |     end
 40 |     s = "Expected: [Integer, String], Actual: TrueClass"
 41 | 
 42 |     assert_equal s, e.message
 43 |   end
 44 | 
 45 |   def test_assert_arr_types__non_enumerable
 46 |     e = assert_raises StandardError do
 47 |       assert_arr_type "non enumerable", Integer
 48 |     end
 49 |     s = "Expected an Enumerable responding to #each, not: String"
 50 | 
 51 |     assert_equal s, e.message
 52 |   end
 53 | 
 54 |   def test_assert_common_arr_types__pass
 55 |     url = "/about".to_url
 56 |     assert_equal [1, 2, 3], assert_common_arr_types([1, 2, 3], [Integer, String])
 57 |     assert_equal [url, "/about"], assert_common_arr_type([url, "/about"], String)
 58 |   end
 59 | 
 60 |   def test_assert_common_arr_types__fail
 61 |     e = assert_raises StandardError do
 62 |       assert_common_arr_types [1, "Boom!"], [Integer, String]
 63 |     end
 64 |     s = "Expected an Enumerable with elements of a single common type"
 65 | 
 66 |     assert_equal s, e.message
 67 |   end
 68 | 
 69 |   def test_assert_common_arr_types__non_enumerable
 70 |     e = assert_raises StandardError do
 71 |       assert_common_arr_type "non enumerable", Integer
 72 |     end
 73 |     s = "Expected an Enumerable responding to #each, not: String"
 74 | 
 75 |     assert_equal s, e.message
 76 |   end
 77 | 
 78 |   def test_assert_respond_to__pass
 79 |     objs = ["Hello World!", [1, 2, 3]]
 80 | 
 81 |     assert_equal objs, assert_respond_to(objs, %i[equal? include?])
 82 |   end
 83 | 
 84 |   def test_assert_respond_to__fail
 85 |     objs = ["Hello World!", [1, 2, 3]]
 86 | 
 87 |     e = assert_raises StandardError do
 88 |       assert_equal objs, assert_respond_to(objs, %i[equal? each])
 89 |     end
 90 |     assert_equal(
 91 |       "String (Hello World!) doesn't respond_to? [:equal?, :each]",
 92 |       e.message
 93 |     )
 94 |   end
 95 | 
 96 |   def test_assert_respond_to__single_method
 97 |     objs = ["Hello World!", [1, 2, 3]]
 98 | 
 99 |     assert_equal objs, assert_respond_to(objs, :length)
100 |   end
101 | 
102 |   def assert_required_keys__pass
103 |     hash = { 'NAME': "Mick", 'AGE': 30 }
104 | 
105 |     assert_equal hash, assert_required_keys(hash, %w[NAME AGE])
106 |   end
107 | 
108 |   def assert_required_keys__fail
109 |     hash = { 'NAME': "Mick", 'AGE': 30 }
110 | 
111 |     e = assert_raises(KeyError { assert_required_keys(hash, %w[NAME ADDRESS]) })
112 |     assert_equal(
113 |       "Some or all of the required keys are not present: NAME, ADDRESS",
114 |       e.message
115 |     )
116 |   end
117 | end
118 | 


--------------------------------------------------------------------------------
/test/test_base.rb:
--------------------------------------------------------------------------------
 1 | require_relative "helpers/test_helper"
 2 | 
 3 | # The test class is at the bottom of this file.
 4 | 
 5 | class QuotesCrawler < Wgit::Base
 6 |   mode   :crawl_site
 7 |   start  "http://quotes.toscrape.com/tag/humor/"
 8 |   follow "//li[@class='next']/a/@href"
 9 | 
10 |   # We use the 2 suffix to avoid conflicting with tests elsewhere.
11 |   extract :quotes2,  "//div[@class='quote']/span[@class='text']", singleton: false
12 |   extract :authors2, "//div[@class='quote']/span/small",          singleton: false
13 | 
14 |   def parse(doc)
15 |     doc.quotes2.zip(doc.authors2).each do |arr|
16 |       yield({
17 |         quote:  arr.first,
18 |         author: arr.last
19 |       })
20 |     end
21 |   end
22 | end
23 | 
24 | class NoParseCrawler < Wgit::Base # Deliberately defines no #parse method.
25 |   mode   :crawl
26 |   start  "http://quotes.toscrape.com/tag/humor/"
27 |   follow "//li[@class='next']/a/@href"
28 | end
29 | 
30 | class DefaultModeCrawler < Wgit::Base # No mode is declared for this crawler.
31 |   start "http://quotes.toscrape.com/tag/humor/"
32 | 
33 |   def parse(doc)
34 |     yield doc.url # The test only inspects the URL of the parsed document.
35 |   end
36 | end
37 | 
38 | class SetupTeardownCrawler < Wgit::Base # Counts calls to its own methods.
39 |   attr_reader :count # Incremented by each of #setup, #parse and #teardown.
40 | 
41 |   start "http://quotes.toscrape.com/tag/humor/"
42 | 
43 |   def initialize
44 |     @count = 0
45 |   end
46 | 
47 |   def setup
48 |     @count += 1
49 |   end
50 | 
51 |   def parse(_doc) # The document itself is ignored.
52 |     @count += 1
53 |   end
54 | 
55 |   def teardown
56 |     @count += 1
57 |   end
58 | end
59 | 
60 | # Test class for the Base class logic.
61 | class TestBase < TestHelper
62 |   # Runs before every test.
63 |   def setup; end
64 | 
65 |   def test_quotes_crawler
66 |     quotes = []
67 |     QuotesCrawler.run { |quote| quotes << quote }
68 | 
69 |     assert_equal 12, quotes.size
70 |     assert({
71 |       quote: "“A lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.”",
72 |       author: "Jane Austen"
73 |     }, quotes.last)
74 | 
75 |     # Clean up the extractors for other tests.
76 |     Wgit::Document.remove_extractor :quotes2
77 |     Wgit::Document.remove_extractor :authors2
78 |   end
79 | 
80 |   def test_no_parse_crawler
81 |     ex = assert_raises(StandardError) { NoParseCrawler.run }
82 |     assert_equal "NoParseCrawler must respond_to? #parse(doc, &block)", ex.message
83 |   end
84 | 
85 |   def test_default_mode_crawler
86 |     DefaultModeCrawler.run do |url|
87 |       assert_equal "http://quotes.toscrape.com/tag/humor/", url
88 |     end
89 |   end
90 | 
91 |   def test_setup_teardown_crawler
92 |     crawler = SetupTeardownCrawler.run
93 |     assert_equal 3, crawler.count
94 |   end
95 | end
96 | 


--------------------------------------------------------------------------------
/test/test_core_ext.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "helpers/test_helper"
 4 | 
 5 | # Test class for the Ruby core extension methods.
 6 | class TestCoreExt < TestHelper
 7 |   # Run non DB tests in parallel for speed.
 8 |   parallelize_me!
 9 | 
10 |   # Runs before every test.
11 |   def setup; end
12 | 
13 |   def test_string_to_url
14 |     s = "http://www.google.co.uk"
15 |     url = s.to_url
16 |     assert_instance_of Wgit::Url, url
17 |     assert_equal s, url
18 |     assert_equal url.object_id, url.to_url.object_id
19 |   end
20 | 
21 |   def test_array_to_urls
22 |     url_strs = [
23 |       "http://altitudejunkies.com",
24 |       "http://www.mountainmadness.com",
25 |       "http://www.adventureconsultants.com"
26 |     ]
27 |     urls = url_strs.to_urls
28 | 
29 |     assert(url_strs.all? { |url| url.instance_of? String })
30 |     assert(urls.all? { |url| url.instance_of? Wgit::Url })
31 | 
32 |     url_strs = [
33 |       "http://altitudejunkies.com",
34 |       true,
35 |       "http://www.adventureconsultants.com"
36 |     ]
37 |     urls = url_strs.to_urls
38 | 
39 |     assert url_strs.first.instance_of? String
40 |     refute(urls.all? { |url| url.instance_of? Wgit::Url })
41 |     assert urls.first.instance_of? Wgit::Url
42 |     assert urls[1].instance_of? TrueClass
43 |     assert urls.last.instance_of? Wgit::Url
44 |   end
45 | 
46 |   def test_array_to_urls!
47 |     urls = [
48 |       "http://altitudejunkies.com",
49 |       "http://www.mountainmadness.com",
50 |       "http://www.adventureconsultants.com"
51 |     ].to_urls!
52 | 
53 |     assert(urls.all? { |url| url.instance_of? Wgit::Url })
54 | 
55 |     urls = [
56 |       "http://altitudejunkies.com",
57 |       true,
58 |       "http://www.adventureconsultants.com"
59 |     ].to_urls!
60 | 
61 |     refute(urls.all? { |url| url.instance_of? Wgit::Url })
62 |     assert urls.first.instance_of? Wgit::Url
63 |     assert urls[1].instance_of? TrueClass
64 |     assert urls.last.instance_of? Wgit::Url
65 |   end
66 | end
67 | 


--------------------------------------------------------------------------------
/test/test_database_adapter.rb:
--------------------------------------------------------------------------------
 1 | require_relative "helpers/test_helper"
 2 | 
 3 | # Bare class which includes the Wgit::DSL, for the tests below to call.
 4 | class TestClass
 5 |   include Wgit::DSL
 6 | end
 7 | 
 8 | # Test class for the Wgit Database.adapter_class accessor methods.
 9 | # This class should also test any Wgit code that calls:
10 | # `Wgit::Database.adapter_class.new`; which ensures changing adapters works.
11 | class TestDatabaseAdapter < TestHelper
12 |   # Runs before every test.
13 |   def setup; end
14 | 
15 |   # Runs after every test.
16 |   def teardown
17 |     # Reset the database adapter back to the default.
18 |     Wgit::Database.adapter_class = Wgit::Database::DEFAULT_ADAPTER_CLASS
19 |   end
20 | 
21 |   def test_adapter_class__default
22 |     assert_equal Wgit::Database::DEFAULT_ADAPTER_CLASS, Wgit::Database.adapter_class
23 |   end
24 | 
25 |   def test_adapter_class__accessor
26 |     Wgit::Database.adapter_class = Wgit::Database::InMemory
27 | 
28 |     assert_equal Wgit::Database::InMemory, Wgit::Database.adapter_class
29 |   end
30 | 
31 |   def test_adapter_class__indexer
32 |     Wgit::Database.adapter_class = Wgit::Database::InMemory
33 |     indexer = Wgit::Indexer.new
34 | 
35 |     assert_equal Wgit::Database::InMemory, indexer.db.class
36 |   end
37 | 
38 |   def test_adapter_class__dsl
39 |     Wgit::Database.adapter_class = Wgit::Database::InMemory
40 |     test_class = TestClass.new
41 | 
42 |     assert_equal Wgit::Database::InMemory, test_class.send(:get_db).class
43 |   end
44 | 
45 |   def test_database_new_alias
46 |     Wgit::Database.adapter_class = Wgit::Database::InMemory
47 |     db = Wgit::Database.new
48 | 
49 |     assert_instance_of Wgit::Database::InMemory, db
50 |   end
51 | end
52 | 


--------------------------------------------------------------------------------
/test/test_gem.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "helpers/test_helper"
 4 | 
 5 | # Test class for requiring the wgit gem.
 6 | class TestGem < TestHelper
 7 |   # Runs before every test.
 8 |   def setup; end
 9 | 
10 |   # Test that requiring wgit (which loads the API) doesn't raise an error.
11 |   def test_require
12 |     refute_exception { require("wgit") }
13 |   end
14 | end
15 | 


--------------------------------------------------------------------------------
/test/test_html_to_text.rb:
--------------------------------------------------------------------------------
  1 | require_relative "helpers/test_helper"
  2 | 
  3 | # Test class for the Wgit::HTMLToText text extraction logic.
  4 | class TestHTMLToText < TestHelper
  5 |   # Run non DB tests in parallel for speed.
  6 |   parallelize_me!
  7 | 
  8 |   # Runs before every test; builds the data used by test_extract_text_str.
  9 |   def setup
 10 |     @use_cases = [
 11 |       # inline parent
 12 |       "<inline_parent><inline>*</inline></inline_parent>",
 13 |       "<inline_parent><inline>*</block></inline_parent>",
 14 |       "<inline_parent><block>*</inline></inline_parent>",
 15 |       "<inline_parent><block>*</block></inline_parent>",
 16 | 
 17 |       # block parent
 18 |       "<block_parent><inline>*</inline></block_parent>",
 19 |       "<block_parent><inline>*</block></block_parent>",
 20 |       "<block_parent><block>*</inline></block_parent>",
 21 |       "<block_parent><block>*</block></block_parent>"
 22 |     ]
 23 | 
 24 |     @content_variations = [
 25 |       "",
 26 |       "foobar",
 27 |       "foo bar",
 28 |       " foo bar  ",
 29 |       " ",
 30 |       "    ",
 31 |       "\n",
 32 |       "  \n ",
 33 |       " \n foo bar \n ",
 34 |       "<br>",
 35 |       "<hr>"
 36 |     ]
 37 | 
 38 |     # Expected text for each use_case * content_variation combo, in order.
 39 |     @expected = [
 40 |       # inline parent - inline inline
 41 |       "prepost",
 42 |       "prefoobarpost",
 43 |       "prefoo barpost",
 44 |       "pre foo bar post",
 45 |       "pre post",
 46 |       "pre post",
 47 |       "prepost",
 48 |       "pre post",
 49 |       "pre foo bar post",
 50 |       "pre\npost",
 51 |       "pre\npost",
 52 | 
 53 |       # inline parent - inline block
 54 |       "pre\npost",
 55 |       "prefoobar\npost",
 56 |       "prefoo bar\npost",
 57 |       "pre foo bar \npost",
 58 |       "pre \npost",
 59 |       "pre \npost",
 60 |       "pre\npost",
 61 |       "pre \npost",
 62 |       "pre foo bar \npost",
 63 |       "pre\npost",
 64 |       "pre\npost",
 65 | 
 66 |       # inline parent - block inline
 67 |       "pre\npost",
 68 |       "pre\nfoobarpost",
 69 |       "pre\nfoo barpost",
 70 |       "pre\n foo bar post",
 71 |       "pre\n post",
 72 |       "pre\n post",
 73 |       "pre\npost",
 74 |       "pre\n \npost",
 75 |       "pre\n foo bar post",
 76 |       "pre\npost",
 77 |       "pre\npost",
 78 | 
 79 |       # inline parent - block block
 80 |       "pre\npost",
 81 |       "pre\nfoobar\npost",
 82 |       "pre\nfoo bar\npost",
 83 |       "pre\n foo bar \npost",
 84 |       "pre\n \npost",
 85 |       "pre\n \npost",
 86 |       "pre\npost",
 87 |       "pre\n \npost",
 88 |       "pre\n foo bar \npost",
 89 |       "pre\npost",
 90 |       "pre\npost",
 91 | 
 92 |       #######
 93 | 
 94 |       # block parent - inline inline
 95 |       "prepost",
 96 |       "prefoobarpost",
 97 |       "prefoo barpost",
 98 |       "pre foo bar post",
 99 |       "pre post",
100 |       "pre post",
101 |       "prepost",
102 |       "pre post",
103 |       "pre foo bar post",
104 |       "pre\npost",
105 |       "pre\npost",
106 | 
107 |       # block parent - inline block
108 |       "pre\npost",
109 |       "prefoobar\npost",
110 |       "prefoo bar\npost",
111 |       "pre foo bar \npost",
112 |       "pre \npost",
113 |       "pre \npost",
114 |       "pre\npost",
115 |       "pre \npost",
116 |       "pre foo bar \npost",
117 |       "pre\npost",
118 |       "pre\npost",
119 | 
120 |       # block parent - block inline
121 |       "pre\npost",
122 |       "pre\nfoobarpost",
123 |       "pre\nfoo barpost",
124 |       "pre\n foo bar post",
125 |       "pre\n post",
126 |       "pre\n post",
127 |       "pre\npost",
128 |       "pre\n \npost",
129 |       "pre\n foo bar post",
130 |       "pre\npost",
131 |       "pre\npost",
132 | 
133 |       # block parent - block block
134 |       "pre\npost",
135 |       "pre\nfoobar\npost",
136 |       "pre\nfoo bar\npost",
137 |       "pre\n foo bar \npost",
138 |       "pre\n \npost",
139 |       "pre\n \npost",
140 |       "pre\npost",
141 |       "pre\n \npost",
142 |       "pre\n foo bar \npost",
143 |       "pre\npost",
144 |       "pre\npost"
145 |     ]
146 |   end
147 | 
148 |   def test_extract_text_str
149 |     total_test_cases = @use_cases.size * @content_variations.size
150 |     should_fail = false
151 |     fail_count = 0
152 |     i = 0
153 | 
154 |     raise "invalid @expected array" unless total_test_cases == @expected.size
155 | 
156 |     @use_cases.each do |use_case|
157 |       @content_variations.each do |content|
158 |         nodes = gsub_use_case_content(use_case, content)
159 |         parser = Nokogiri::HTML("<html><body>#{nodes}</body></html>")
160 | 
161 |         expected = @expected[i]
162 |         actual = Wgit::HTMLToText.new(parser).extract_str
163 | 
164 |         i += 1
165 |         assert true # Add our assertion to minitest's total.
166 |         has_passed = expected == actual
167 |         next if has_passed
168 | 
169 |         Wgit::Utils.pprint("CASE_#{i}", prefix: "TEST_EXTRACT_TEXT_STR", new_line: true,
170 |           use_case: use_case, content: content, nodes: nodes, expected: expected, actual: actual)
171 | 
172 |         should_fail = true
173 |         fail_count += 1
174 |       end
175 |     end
176 | 
177 |     return unless should_fail
178 | 
179 |     Wgit::Utils.pprint("SUMMARY", prefix: "TEST_EXTRACT_TEXT_STR", new_line: true,
180 |       total_test_cases: total_test_cases, total_failing_cases: fail_count)
181 | 
182 |     flunk "test_extract_text_str failed, see logs above for info"
183 |   end
184 | 
185 |   def test_extract__anchors
186 |     url = "http://example.com".to_url
187 |     html = File.read "./test/mock/fixtures/anchor_display.html"
188 |     doc = Wgit::Document.new url, html
189 | 
190 |     assert_equal ["About", "Foo Location Bar", "Contact Contact2 Contact3"], doc.text
191 |   end
192 | 
193 |   def test_extract__spans
194 |     url = "http://example.com".to_url
195 |     html = File.read "./test/mock/fixtures/span_display.html"
196 |     doc = Wgit::Document.new url, html
197 | 
198 |     assert_equal [
199 |       "Running the following Wgit code will programmatically configure your database:",
200 |       "db = Wgit::Database.new '<connection_string>'"
201 |     ], doc.text
202 |   end
203 | 
204 |   def test_extract__divs
205 |     url = "http://example.com".to_url
206 |     html = File.read "./test/mock/fixtures/div_display.html"
207 |     doc = Wgit::Document.new url, html
208 | 
209 |     assert_equal %w[foo bar], doc.text
210 |   end
211 | 
212 |   def test_extract__getting_started_wiki
213 |     url = "http://example.com".to_url
214 |     html = File.read "./test/mock/fixtures/getting_started.html"
215 |     doc = Wgit::Document.new url, html
216 | 
217 |     assert_equal [
218 |       "Running the following Wgit code will programmatically configure your database:",
219 |       "db = Wgit::Database.new '<connection_string>'",
220 |       "db.create_collections",
221 |       "db.create_unique_indexes",
222 |       "db.text_index = Wgit::Database::DEFAULT_TEXT_INDEX",
223 |       "Or take a look at the mongo_init.js file for the equivalent Javascript commands.",
224 |       "Note: The text search index lists all document fields to be searched by MongoDB when calling Wgit::Database#search. Therefore, you should append this list with any other fields that you want searched. For example, if you extend the API then you might want to search your new fields in the database by adding them to the index above. This can be done programmatically with:"
225 |     ], doc.text
226 |   end
227 | 
228 |   def test_extract__dups_are_not_removed
229 |     doc = Wgit::Document.new "http://www.mytestsite.com/home", <<~HTML
230 |       <p>Note: The text search index lists all document fields.</p>
231 |       <hr>
232 |       <p>Note: The text search index lists all document fields.</p>
233 |     HTML
234 | 
235 |     assert_equal [
236 |       "Note: The text search index lists all document fields.",
237 |       "Note: The text search index lists all document fields."
238 |     ], doc.text
239 |   end
240 | 
241 |   private
242 | 
243 |   def gsub_use_case_content(use_case, content)
244 |     use_case
245 |       .gsub("<inline_parent>",  "<span>")
246 |       .gsub("</inline_parent>", "</span>")
247 |       .gsub("<block_parent>",   "<div>")
248 |       .gsub("</block_parent>",  "</div>")
249 |       .gsub("<inline>",         "<span>pre</span>")
250 |       .gsub("</inline>",        "<span>post</span>")
251 |       .gsub("<block>",          "<div>pre</div>")
252 |       .gsub("</block>",         "<div>post</div>")
253 |       .gsub("*",                content)
254 |   end
255 | end
256 | 


--------------------------------------------------------------------------------
/test/test_in_memory.rb:
--------------------------------------------------------------------------------
  1 | require_relative "helpers/test_helper"
  2 | 
  3 | # Test class for the Database::InMemory adapter logic.
  4 | # WARNING: The in-memory DB is cleared down prior to each test run.
  5 | class TestInMemory < TestHelper
  6 |   include InMemoryHelper
  7 | 
  8 |   # Runs before every test.
  9 |   def setup
 10 |     Wgit::Model.set_default_search_fields
 11 | 
 12 |     empty_db
 13 | 
 14 |     @url = Wgit::Url.new(DatabaseTestData.url)
 15 |     @doc = Wgit::Document.new(DatabaseTestData.doc)
 16 | 
 17 |     @urls = Array.new(3) { Wgit::Url.new(DatabaseTestData.url) }
 18 |     @docs = Array.new(3) { Wgit::Document.new(DatabaseTestData.doc) }
 19 |   end
 20 | 
 21 |   # Runs after every test.
 22 |   def teardown
 23 |     # Remove any defined extractors to avoid interfering with other tests.
 24 |     return unless Wgit::Document.remove_extractor(:code)
 25 | 
 26 |     Wgit::Document.send(:remove_method, :code)
 27 |   end
 28 | 
 29 |   def test_initialize
 30 |     db2 = Wgit::Database::InMemory.new
 31 | 
 32 |     refute_nil db2
 33 |     assert_empty db2.urls
 34 |     assert_empty db2.docs
 35 |   end
 36 | 
 37 |   def test_bulk_upsert__urls
 38 |     urls = [
 39 |       "http://example.com",   # Gets inserted.
 40 |       "http://example.com/2", # Gets inserted.
 41 |       "http://example.com",   # Dup of 1, will be updated.
 42 |       "http://example.com/3"  # Gets inserted.
 43 |     ].to_urls
 44 |     count = db.bulk_upsert(urls)
 45 | 
 46 |     assert_equal 3, count
 47 |     assert_equal([
 48 |       "http://example.com",
 49 |       "http://example.com/2",
 50 |       "http://example.com/3"
 51 |     ], db.urls)
 52 |   end
 53 | 
 54 |   def test_bulk_upsert__docs
 55 |     urls = [
 56 |       "http://example.com",   # Gets inserted.
 57 |       "http://example.com/2", # Gets inserted.
 58 |       "http://example.com",   # Dup of urls[0], will be updated.
 59 |       "http://example.com/3"  # Gets inserted.
 60 |     ].to_urls
 61 | 
 62 |     # Map each of the urls above into a document.
 63 |     docs = urls.map do |url|
 64 |       doc_hash = DatabaseTestData.doc(url: url, append_suffix: false)
 65 |       Wgit::Document.new(doc_hash)
 66 |     end
 67 | 
 68 |     count = db.bulk_upsert(docs)
 69 | 
 70 |     assert_equal 3, count
 71 |     assert_equal([
 72 |       "http://example.com",
 73 |       "http://example.com/2",
 74 |       "http://example.com/3"
 75 |     ], db.docs.map(&:url))
 76 |   end
 77 | 
 78 |   def test_docs
 79 |     # Test empty docs result.
 80 |     assert_empty db.docs
 81 | 
 82 |     seed { docs @docs }
 83 |     docs = db.docs
 84 | 
 85 |     # Test non empty docs results.
 86 |     assert(docs.all? { |doc| doc.instance_of? Wgit::Document })
 87 |     assert_equal 3, docs.length
 88 |   end
 89 | 
 90 |   def test_urls
 91 |     # Test empty urls result.
 92 |     assert_empty db.urls
 93 |     assert_empty db.uncrawled_urls
 94 | 
 95 |     # Seed url data to the DB.
 96 |     # Url 1 crawled == false, Url 2 & 3 crawled == true.
 97 |     @urls.first.crawled = false
 98 |     seed { urls @urls }
 99 | 
100 |     urls = db.urls
101 |     uncrawled_urls = db.uncrawled_urls
102 | 
103 |     # Test urls.
104 |     assert(urls.all? { |url| url.instance_of? Wgit::Url })
105 |     assert_equal 3, urls.length
106 | 
107 |     # Test uncrawled_urls.
108 |     assert(uncrawled_urls.all? { |url| url.instance_of? Wgit::Url })
109 |     assert_equal 1, uncrawled_urls.length
110 |   end
111 | 
112 |   def test_urls__with_redirects
113 |     # Seed url data to the DB.
114 |     # Url with redirects populated.
115 |     redirects_hash = { "http://example.com" => "https://example.com" }
116 |     @urls.first.redirects = redirects_hash
117 |     seed { urls @urls }
118 | 
119 |     urls = db.urls
120 | 
121 |     # Test urls.
122 |     assert(urls.all? { |url| url.instance_of? Wgit::Url })
123 |     assert_equal 3, urls.length
124 |     assert_equal redirects_hash, urls.first.redirects
125 |   end
126 | 
127 |   def test_search
128 |     # doc1 = 1.0 (match), doc2 = 0.0, doc3 = 2.0
129 |     @docs.first.text << "Foo Bar"
130 |     @docs.last.text << "Foo Bar"
131 |     @docs.last.text << "foO bAr"
132 | 
133 |     seed { docs @docs }
134 | 
135 |     # Test no results.
136 |     assert_empty db.search("doesnt_exist_123")
137 | 
138 |     # Test two results sorted by relevance.
139 |     results = db.search("foo bar")
140 | 
141 |     assert_equal 2, results.length
142 |     results.all? { |doc| doc.instance_of? Wgit::Document }
143 | 
144 |     assert_equal @docs.last.url, results.first.url
145 |     assert_equal @docs.first.url, results.last.url
146 |     assert results.first.score > results.last.score
147 |   end
148 | 
149 |   def test_search__case_sensitive
150 |     @docs.last.text << "Foo Bar"
151 |     seed { docs @docs }
152 | 
153 |     # Test case_sensitive: false and block.
154 |     count = 0
155 |     results = db.search("foo bar", case_sensitive: false) do |doc|
156 |       assert_instance_of Wgit::Document, doc
157 |       count += 1
158 |     end
159 |     assert_equal 1, count
160 |     assert_equal 1, results.length
161 |     assert(results.all? { |doc| doc.instance_of? Wgit::Document })
162 | 
163 |     # Test case_sensitive: true.
164 |     assert_empty db.search("foo bar", case_sensitive: true)
165 |   end
166 | 
167 |   def test_search__whole_sentence
168 |     @docs.last.text << "Foo Bar"
169 |     seed { docs @docs }
170 | 
171 |     # Test whole_sentence: false.
172 |     results = db.search("bar foo", whole_sentence: false)
173 |     assert_equal 1, results.length
174 |     assert results.first.instance_of?(Wgit::Document)
175 | 
176 |     # Test whole_sentence: true.
177 |     assert_empty db.search("bar foo", whole_sentence: true)
178 | 
179 |     # Test case_sensitive: true and whole_sentence: true.
180 |     results = db.search("Foo Bar", case_sensitive: true, whole_sentence: true)
181 |     assert_equal 1, results.length
182 |     assert results.first.instance_of?(Wgit::Document)
183 |   end
184 | 
185 |   def test_search__limit
186 |     # First doc has highest textScore and so on...
187 |     @docs.reverse.each_with_index do |doc, i|
188 |       i.times { doc.text << "Everest" }
189 |     end
190 |     seed { docs @docs }
191 | 
192 |     assert_equal 3, db.search("everest").length
193 | 
194 |     # Test limit.
195 |     results = db.search("everest", limit: 2)
196 |     assert_equal 2, results.length
197 | 
198 |     results.each_with_index do |doc, i|
199 |       doc.instance_of? Wgit::Document
200 |       assert_equal @docs[i], doc
201 |       assert_equal @docs[i].url.to_h, doc.url.to_h
202 |     end
203 |   end
204 | 
205 |   def test_search__skip
206 |     # First doc has highest textScore and so on...
207 |     @docs.reverse.each_with_index do |doc, i|
208 |       i.times { doc.text << "Everest" }
209 |     end
210 |     seed { docs @docs }
211 | 
212 |     # Test skip.
213 |     results = db.search("everest", skip: 1)
214 |     assert_equal 2, results.length
215 | 
216 |     results.each_with_index do |doc, i|
217 |       doc.instance_of? Wgit::Document
218 |       assert_equal @docs[i + 1], doc
219 |       assert_equal @docs[i + 1].url.to_h, doc.url.to_h
220 |     end
221 | 
222 |     # Test limit and skip.
223 |     results = db.search("everest", limit: 1, skip: 1)
224 |     assert_equal 1, results.length
225 | 
226 |     results.each do |doc|
227 |       doc.instance_of? Wgit::Document
228 |       assert_equal @docs[1], doc
229 |       assert_equal @docs[1].url.to_h, doc.url.to_h
230 |     end
231 |   end
232 | 
233 |   def test_search__special_char
234 |     @doc = Wgit::Document.new @url, <<~HTML
235 |       <p>Hello, this is to test :colon text searches</p>
236 |     HTML
237 | 
238 |     seed { doc @doc }
239 | 
240 |     # Test the result comes back.
241 |     results = db.search(":colon")
242 | 
243 |     assert_equal 1, results.length
244 |     results.all? { |doc| doc.instance_of? Wgit::Document }
245 |   end
246 | 
247 |   def test_search__default_search_fields
248 |     # => title    (2 hit  * 2 weight == 4)
249 |     # => text     (3 hits * 1 weight == 3)
250 |     # => keywords (1 hits * 2 weight == 2)
251 |     # => keywords (1 hits * 2 weight == 2)
252 |     # ------------------------------------
253 |     # => Total match score:          == 11
254 |     test_doc = Wgit::Document.new({
255 |       "url" => "http://www.mytestsite.com/home",
256 |       "title" => "abc abc",
257 |       "keywords" => ["abc 2", "abc 3"],
258 |       "text" => "abc abc abc"
259 |     })
260 |     seed { doc test_doc }
261 | 
262 |     results = db.search("abc")
263 | 
264 |     assert_equal(1, results.size)
265 |     assert_equal(11, results.first.score)
266 |   end
267 | 
268 |   def test_search__set_search_fields
269 |     Wgit::Document.define_extractor(:code, nil)
270 |     Wgit::Model.set_search_fields(%i[code foo]) # @code exists, @foo doesn't.
271 | 
272 |     test_doc = Wgit::Document.new("http://www.mytestsite.com/home")
273 |     test_doc.instance_variable_set(:@code, 'print("hello world")') # Score of 1.
274 |     seed { doc test_doc }
275 | 
276 |     results = db.search("hello")
277 | 
278 |     assert_equal(1, results.size)
279 |     assert_equal(1, results.first.score)
280 |   end
281 | 
282 |   def test_size
283 |     # An empty db has two empty arrays taking up 4 bytes.
284 |     assert_equal 4, db.size
285 |   end
286 | 
287 |   def test_empty
288 |     seed do
289 |       urls 3
290 |       docs 2
291 |     end
292 | 
293 |     assert_equal 5, db.empty
294 |     assert_equal 0, (db.urls.size + db.docs.size)
295 |   end
296 | end
297 | 


--------------------------------------------------------------------------------
/test/test_load.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "helpers/test_helper"
 4 | 
 5 | # Test class for the load script (used in dev).
 6 | class TestLoad < TestHelper
 7 |   # Runs before every test; nothing needs preparing for this class.
 8 |   def setup; end
 9 | 
10 |   def test_load
11 |     assert load("load.rb") # Kernel#load returns true once the file has loaded without raising.
12 |     Wgit.logger.level = Logger::WARN # Presumably load.rb changes the log level; confirm.
13 |   end
14 | end
15 | 


--------------------------------------------------------------------------------
/test/test_logger.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "helpers/test_helper"
 4 | 
 5 | # Test class for the Wgit Logger methods.
 6 | class TestLogger < TestHelper
 7 |   # Runs before every test.
 8 |   def setup; end
 9 | 
10 |   # Runs after every test.
11 |   def teardown
12 |     Wgit.use_default_logger
13 |     Wgit.logger.level = Logger::WARN
14 |   end
15 | 
16 |   def test_logger
17 |     assert Wgit.logger.is_a?(Logger)
18 |   end
19 | 
20 |   def test_logger=
21 |     stdout = Logger.new $stdout
22 |     assert_equal stdout, Wgit.logger = stdout
23 |   end
24 | 
25 |   def test_default_logger
26 |     assert Wgit.default_logger.is_a?(Logger)
27 |     assert_equal 1, Wgit.default_logger.level
28 |     assert_equal "wgit", Wgit.default_logger.progname
29 |   end
30 | 
31 |   def test_use_default_logger
32 |     assert Wgit.use_default_logger.is_a?(Logger)
33 |   end
34 | end
35 | 


--------------------------------------------------------------------------------
/test/test_model.rb:
--------------------------------------------------------------------------------
  1 | require_relative "helpers/test_helper"
  2 | 
  3 | # Test class for the data Model methods.
  4 | class TestModel < TestHelper
  5 |   # Run non DB tests in parallel for speed.
  6 |   parallelize_me!
  7 | 
  8 |   # Runs before every test.
  9 |   def setup
 10 |     Wgit::Model.set_default_search_fields
 11 | 
 12 |     Wgit::Model.include_doc_html  = false
 13 |     Wgit::Model.include_doc_score = false
 14 |   end
 15 | 
 16 |   def test_search_fields__default
 17 |     assert_equal Wgit::Model::DEFAULT_SEARCH_FIELDS, Wgit::Model.search_fields
 18 |   end
 19 | 
 20 |   def test_set_search_fields__fails
 21 |     ex = assert_raises(StandardError) { Wgit::Model.set_search_fields(true) }
 22 |     assert_equal "fields must be an Array or Hash, not a TrueClass", ex.message
 23 |   end
 24 | 
 25 |   def test_set_search_fields__symbols
 26 |     fields = Wgit::Model.set_search_fields(%i[title code])
 27 | 
 28 |     assert_equal({ title: 1, code: 1 }, fields)
 29 |     assert_equal({ title: 1, code: 1 }, Wgit::Model.search_fields)
 30 |   end
 31 | 
 32 |   def test_set_search_fields__hash
 33 |     fields = Wgit::Model.set_search_fields({ title: 2, code: 1 })
 34 | 
 35 |     assert_equal({ title: 2, code: 1 }, fields)
 36 |     assert_equal({ title: 2, code: 1 }, Wgit::Model.search_fields)
 37 |   end
 38 | 
 39 |   def test_set_search_fields__db
 40 |     # Create a mock DB that is called when passed to the Wgit::Model.
 41 |     mock_db = Struct.new do
 42 |       def search_fields=(fields)
 43 |         raise unless fields == { title: 2, code: 1 }
 44 |       end
 45 |     end
 46 |     db = mock_db.new
 47 | 
 48 |     refute_exception do
 49 |       fields = Wgit::Model.set_search_fields({ title: 2, code: 1 }, db)
 50 |       assert_equal({ title: 2, code: 1 }, fields)
 51 |     end
 52 |   end
 53 | 
 54 |   def test_url
 55 |     url = Wgit::Url.new(
 56 |       "http://example.com",
 57 |       crawled: true,
 58 |       date_crawled: Time.now,
 59 |       crawl_duration: 1.3
 60 |     )
 61 | 
 62 |     model = Wgit::Model.url(url)
 63 | 
 64 |     assert_equal %w[crawl_duration crawled date_crawled redirects url], model.keys.sort
 65 |     refute model.values.any?(&:nil?)
 66 |   end
 67 | 
 68 |   def test_document
 69 |     doc = Wgit::Document.new Wgit::Url.new(
 70 |       "http://example.com",
 71 |       crawled: true,
 72 |       date_crawled: Time.now,
 73 |       crawl_duration: 1.3
 74 |     )
 75 | 
 76 |     model = Wgit::Model.document(doc)
 77 | 
 78 |     assert_equal %w[author base description keywords links text title url], model.keys.sort
 79 |     assert_equal %w[crawl_duration crawled date_crawled redirects url], model["url"].keys.sort
 80 |     refute model["url"].values.any?(&:nil?)
 81 |   end
 82 | 
 83 |   def test_document__include_html
 84 |     doc = Wgit::Document.new "http://example.com".to_url, "<html>Hello</html>"
 85 | 
 86 |     Wgit::Model.include_doc_html = true
 87 |     model = Wgit::Model.document(doc)
 88 | 
 89 |     assert Wgit::Model.include_doc_html
 90 |     assert_equal "<html>Hello</html>", model["html"]
 91 |   end
 92 | 
 93 |   def test_document__include_score
 94 |     doc = Wgit::Document.new({
 95 |       "url" => "http://example.com",
 96 |       "score" => 10.5
 97 |     })
 98 | 
 99 |     Wgit::Model.include_doc_score = true
100 |     model = Wgit::Model.document(doc)
101 | 
102 |     assert Wgit::Model.include_doc_score
103 |     assert_equal 10.5, model["score"]
104 |   end
105 | end
106 | 


--------------------------------------------------------------------------------
/test/test_readme.rb:
--------------------------------------------------------------------------------
  1 | require_relative "helpers/test_helper"
  2 | 
  3 | # Test class for code snippets in the README.md.
  4 | # WARNING: Certain tests will clear down the DB prior to the test run.
  5 | class TestReadme < TestHelper
  6 |   include Wgit::DSL
  7 | 
  8 |   # Runs before every test; nothing needs preparing for this class.
  9 |   def setup; end
 10 | 
 11 |   def test_quotes__dsl
 12 |     ### PUT README CODE BELOW ###
 13 | 
 14 |     # require 'wgit'
 15 |     # require 'json'
 16 | 
 17 |     # include Wgit::DSL
 18 | 
 19 |     start  "http://quotes.toscrape.com/tag/humor/"
 20 |     follow "//li[@class='next']/a/@href"
 21 | 
 22 |     extract :quotes,  "//div[@class='quote']/span[@class='text']", singleton: false
 23 |     extract :authors, "//div[@class='quote']/span/small",          singleton: false
 24 | 
 25 |     quotes = []
 26 | 
 27 |     crawl_site do |doc|
 28 |       doc.quotes.zip(doc.authors).each do |arr|
 29 |         quotes << {
 30 |           quote:  arr.first,
 31 |           author: arr.last
 32 |         }
 33 |       end
 34 |     end
 35 | 
 36 |     # puts JSON.generate(quotes)
 37 | 
 38 |     ### PUT README CODE ABOVE ###
 39 | 
 40 |     assert_equal 12, quotes.size
 41 | 
 42 |     # Clean up the extractors for other tests. NOTE(review): skipped if the assertion above fails.
 43 |     Wgit::Document.remove_extractor :quotes
 44 |     Wgit::Document.remove_extractor :authors
 45 |   end
 46 | 
 47 |   def test_quotes__dsl_index
 48 |     ### PUT README CODE BELOW ###
 49 | 
 50 |     # require 'wgit'
 51 | 
 52 |     # include Wgit::DSL
 53 | 
 54 |     # Wgit.logger.level = Logger::WARN
 55 | 
 56 |     # ENV['WGIT_CONNECTION_STRING'] = 'mongodb://user:password@localhost/crawler'
 57 | 
 58 |     start  "http://quotes.toscrape.com/tag/humor/"
 59 |     follow "//li[@class='next']/a/@href"
 60 | 
 61 |     extract :quotes,  "//div[@class='quote']/span[@class='text']", singleton: false
 62 |     extract :authors, "//div[@class='quote']/span/small",          singleton: false
 63 | 
 64 |     index_site
 65 |     results = search "prejudice", stream: nil
 66 | 
 67 |     ### PUT README CODE ABOVE ###
 68 | 
 69 |     assert_equal 1, results.size
 70 |     assert_equal "http://quotes.toscrape.com/tag/humor/page/2/", results.first.url
 71 | 
 72 |     # Clean up the extractors for other tests. NOTE(review): skipped if an assertion above fails.
 73 |     Wgit::Document.remove_extractor :quotes
 74 |     Wgit::Document.remove_extractor :authors
 75 |   end
 76 | 
 77 |   def test_quotes__classes
 78 |     ### PUT README CODE BELOW ###
 79 | 
 80 |     # require 'wgit'
 81 |     # require 'json'
 82 | 
 83 |     crawler = Wgit::Crawler.new
 84 |     url     = Wgit::Url.new("http://quotes.toscrape.com/tag/humor/")
 85 |     quotes  = []
 86 | 
 87 |     Wgit::Document.define_extractor(:quotes,  "//div[@class='quote']/span[@class='text']", singleton: false)
 88 |     Wgit::Document.define_extractor(:authors, "//div[@class='quote']/span/small",          singleton: false)
 89 | 
 90 |     crawler.crawl_site(url, follow: "//li[@class='next']/a/@href") do |doc|
 91 |       doc.quotes.zip(doc.authors).each do |arr|
 92 |         quotes << {
 93 |           quote:  arr.first,
 94 |           author: arr.last
 95 |         }
 96 |       end
 97 |     end
 98 | 
 99 |     # puts JSON.generate(quotes)
100 | 
101 |     ### PUT README CODE ABOVE ###
102 | 
103 |     assert_equal 12, quotes.size
104 | 
105 |     # Clean up the extractors for other tests. NOTE(review): skipped if the assertion above fails.
106 |     Wgit::Document.remove_extractor :quotes
107 |     Wgit::Document.remove_extractor :authors
108 |   end
109 | end
110 | 


--------------------------------------------------------------------------------
/test/test_response.rb:
--------------------------------------------------------------------------------
  1 | require_relative "helpers/test_helper"
  2 | 
  3 | # Test class for the Response methods.
  4 | class TestResponse < TestHelper
  5 |   # Run non DB tests in parallel for speed.
  6 |   parallelize_me!
  7 | 
  8 |   # Runs before every test.
  9 |   def setup; end
 10 | 
 11 |   def test_initialize
 12 |     r = Wgit::Response.new
 13 | 
 14 |     assert_empty r.body
 15 |     assert_empty r.headers
 16 |     assert_empty r.redirections
 17 |     assert_equal 0.0, r.total_time
 18 |   end
 19 | 
 20 |   def test_add_total_time
 21 |     r = Wgit::Response.new
 22 | 
 23 |     assert_equal 0.2, r.add_total_time(0.2)
 24 |     assert_equal 0.5, r.add_total_time(0.3)
 25 |   end
 26 | 
 27 |   def test_body_equals
 28 |     r = Wgit::Response.new
 29 | 
 30 |     r.body = nil
 31 |     assert_equal "", r.body
 32 | 
 33 |     r.body = "hello world"
 34 |     assert_equal "hello world", r.body
 35 |   end
 36 | 
 37 |   def test_body_or_nil
 38 |     r = Wgit::Response.new
 39 |     assert_nil r.body_or_nil
 40 | 
 41 |     r.body = "hello world"
 42 |     assert_equal "hello world", r.body
 43 |   end
 44 | 
 45 |   def test_failure?
 46 |     r = Wgit::Response.new
 47 |     assert r.failure?
 48 | 
 49 |     r.status = 500
 50 |     refute r.failure?
 51 | 
 52 |     r.status = 0
 53 |     assert r.failure?
 54 | 
 55 |     r.status = 200
 56 |     refute r.failure?
 57 |   end
 58 | 
 59 |   def test_headers_equals
 60 |     r = Wgit::Response.new
 61 | 
 62 |     r.headers = { "Content-Type" => "text/html" }
 63 |     assert_equal({ content_type: "text/html" }, r.headers)
 64 |   end
 65 | 
 66 |   def test_not_found?
 67 |     r = Wgit::Response.new
 68 | 
 69 |     r.status = 400
 70 |     refute r.not_found?
 71 | 
 72 |     r.status = 404
 73 |     assert r.not_found?
 74 |   end
 75 | 
 76 |   def test_ok?
 77 |     r = Wgit::Response.new
 78 | 
 79 |     r.status = 204
 80 |     refute r.ok?
 81 | 
 82 |     r.status = 200
 83 |     assert r.ok?
 84 |   end
 85 | 
 86 |   def test_redirect?
 87 |     r = Wgit::Response.new
 88 |     refute r.redirect?
 89 | 
 90 |     r.status = 200
 91 |     refute r.redirect?
 92 | 
 93 |     r.status = 301
 94 |     assert r.redirect?
 95 |   end
 96 | 
 97 |   def test_redirect_count
 98 |     r = Wgit::Response.new
 99 |     r.redirections["a"] = "foo"
100 |     r.redirections["b"] = "bar"
101 | 
102 |     assert_equal 2, r.redirect_count
103 |   end
104 | 
105 |   def test_size
106 |     r = Wgit::Response.new
107 |     assert_equal 0, r.size
108 | 
109 |     r.body = "hello world"
110 |     assert_equal 11, r.size
111 |   end
112 | 
113 |   def test_status_equals
114 |     r = Wgit::Response.new
115 | 
116 |     r.status = 0
117 |     assert_nil r.status
118 | 
119 |     r.status = 200
120 |     assert_equal 200, r.status
121 |   end
122 | 
123 |   def test_success?
124 |     r = Wgit::Response.new
125 |     refute r.success?
126 | 
127 |     r.status = 200
128 |     assert r.success?
129 | 
130 |     r.status = 500
131 |     assert r.success?
132 | 
133 |     r.status = 0
134 |     refute r.success?
135 |   end
136 | 
137 |   def test_no_index?
138 |     r = Wgit::Response.new
139 |     refute r.no_index?
140 | 
141 |     r.headers = { "X-Robots-Tag" => "index" }
142 |     refute r.no_index?
143 | 
144 |     r.headers = { "X-Robots-Tag" => "noindex" }
145 |     assert r.no_index?
146 |   end
147 | end
148 | 


--------------------------------------------------------------------------------
/test/test_version.rb:
--------------------------------------------------------------------------------
 1 | require_relative "helpers/test_helper"
 2 | 
 3 | # Test class for the Wgit version.
 4 | class TestVersion < TestHelper
 5 |   # Runs before every test; nothing needs preparing for this class.
 6 |   def setup; end
 7 | 
 8 |   def test_version_const
 9 |     assert_instance_of String, Wgit::VERSION
10 |     assert_equal 2, Wgit::VERSION.count(".") # Exactly two dots, e.g. a three-part "x.y.z" string.
11 |   end
12 | 
13 |   def test_version
14 |     assert_equal Wgit::VERSION, Wgit.version
15 |   end
16 | 
17 |   def test_version_str
18 |     assert_equal "wgit v#{Wgit::VERSION}", Wgit.version_str
19 |   end
20 | end
21 | 


--------------------------------------------------------------------------------
/wgit.gemspec:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "./lib/wgit/version"
 4 | 
 5 | Gem::Specification.new do |s|
 6 |   s.name     = "wgit"
 7 |   s.version  = Wgit::VERSION
 8 |   s.date     = Time.now.strftime("%Y-%m-%d") # NOTE(review): evaluated whenever the gemspec is loaded; confirm this is intended.
 9 |   s.author   = "Michael Telford"
10 |   s.email    = "michael.telford@live.com"
11 |   s.homepage = "https://github.com/michaeltelford/wgit"
12 |   s.license  = "MIT"
13 | 
14 |   s.summary = <<~TEXT
15 |     Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically extract the data you want from the web.
16 |   TEXT
17 |   s.description = <<~TEXT
18 |     Wgit was primarily designed to crawl static HTML websites to index and search their content - providing the basis of any search engine; but Wgit is suitable for many application domains including: URL parsing, data mining and statistical analysis.
19 |   TEXT
20 | 
21 |   s.require_paths = %w[lib]
22 |   s.files = Dir[
23 |     "./lib/**/*.rb",
24 |     "bin/wgit",
25 |     "*.md",
26 |     "LICENSE.txt",
27 |     ".yardopts"
28 |   ]
29 |   s.bindir = "bin"
30 |   s.executable = "wgit"
31 |   s.post_install_message = "Added the 'wgit' executable to $PATH"
32 |   s.metadata = {
33 |     "yard.run" => "yri",
34 |     "source_code_uri" => "https://github.com/michaeltelford/wgit",
35 |     "changelog_uri" => "https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md",
36 |     "bug_tracker_uri" => "https://github.com/michaeltelford/wgit/issues",
37 |     "documentation_uri" => "https://www.rubydoc.info/gems/wgit"
38 |   }
39 | 
40 |   s.platform              = Gem::Platform::RUBY
41 |   s.required_ruby_version = ">= 3", "< 4"
42 | 
43 |   s.add_runtime_dependency "addressable", "~> 2.8"
44 |   s.add_runtime_dependency "base64", "~> 0.2"
45 |   s.add_runtime_dependency "ferrum", "~> 0.14"
46 |   s.add_runtime_dependency "mongo", "~> 2.19"
47 |   s.add_runtime_dependency "nokogiri", "~> 1.15"
48 |   s.add_runtime_dependency "typhoeus", "~> 1.4"
49 | 
50 |   s.add_development_dependency "byebug", "~> 11.1"
51 |   s.add_development_dependency "dotenv", "~> 2.8"
52 |   s.add_development_dependency "maxitest", "~> 5.4"
53 |   s.add_development_dependency "pry", "~> 0.14"
54 |   s.add_development_dependency "rubocop", "~> 1.57"
55 |   s.add_development_dependency "toys", "~> 0.15"
56 |   s.add_development_dependency "webmock", "~> 3.19"
57 |   s.add_development_dependency "yard", "~> 0.9"
58 | 
59 |   # Only allow gem pushes to rubygems.org.
60 |   unless s.respond_to?(:metadata) # NOTE(review): s.metadata is already assigned above, so this guard runs late; confirm.
61 |     raise "Only RubyGems 2.0 or newer can protect against public gem pushes"
62 |   end
63 | 
64 |   s.metadata["allowed_push_host"] = "https://rubygems.org"
65 | end
66 | 


--------------------------------------------------------------------------------