├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── pull_request_template.md └── workflows │ └── wgit.yaml ├── .gitignore ├── .rubocop.yml ├── .ruby-version ├── .toys.rb ├── .yardopts ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Gemfile ├── Gemfile.lock ├── LICENSE.txt ├── README.md ├── bin └── wgit ├── ci.symlink ├── docker ├── Dockerfile └── mongo-init.js ├── lib ├── wgit.rb └── wgit │ ├── assertable.rb │ ├── base.rb │ ├── core_ext.rb │ ├── crawler.rb │ ├── database │ ├── adapters │ │ ├── in_memory.rb │ │ └── mongo_db.rb │ ├── database.rb │ └── database_adapter.rb │ ├── document.rb │ ├── document_extractors.rb │ ├── dsl.rb │ ├── html_to_text.rb │ ├── indexer.rb │ ├── logger.rb │ ├── model.rb │ ├── response.rb │ ├── robots_parser.rb │ ├── url.rb │ ├── utils.rb │ └── version.rb ├── load.rb ├── test ├── helpers │ ├── database_helper.rb │ ├── database_test_data.rb │ ├── in_memory_helper.rb │ ├── mongo_db_helper.rb │ └── test_helper.rb ├── mock │ ├── fixtures.rb │ ├── fixtures │ │ ├── altitudejunkies.com.html │ │ ├── anchor_display.html │ │ ├── blank.html │ │ ├── disallow-all.com │ │ │ ├── about.html │ │ │ ├── index.html │ │ │ └── robots.txt │ │ ├── div_display.html │ │ ├── external-link-portal.com.html │ │ ├── getting_started.html │ │ ├── link-to-robots-txt.com.html │ │ ├── motherfuckingwebsite.com.html │ │ ├── nearest_fragment.html │ │ ├── not_found.html │ │ ├── odd-extension.com.html │ │ ├── php.html │ │ ├── quotes.toscrape.com │ │ │ └── tag │ │ │ │ ├── humor.html │ │ │ │ └── humor │ │ │ │ └── page │ │ │ │ └── 2.html │ │ ├── robots.txt.com │ │ │ ├── about.html │ │ │ ├── contact.html │ │ │ ├── index.html │ │ │ ├── login.html │ │ │ ├── pwreset.html │ │ │ └── robots.txt │ │ ├── span_display.html │ │ ├── static.xx.fbcdn.net.html │ │ ├── test-site.com │ │ │ ├── about.html │ │ │ ├── application.js.html │ │ │ ├── contact.html │ │ │ ├── index.html │ │ │ ├── public │ │ │ │ └── records.html │ │ │ ├── 
search.html │ │ │ └── theme.css.html │ │ ├── test_doc.html │ │ ├── txti.es │ │ │ ├── about.html │ │ │ ├── barry │ │ │ │ └── json.html │ │ │ ├── how.html │ │ │ ├── images.html │ │ │ ├── images │ │ │ │ └── images.html │ │ │ ├── index.html │ │ │ └── terms.html │ │ ├── wikileaks.org.html │ │ ├── www.adventureconsultants.com.html │ │ ├── www.belfastpilates.co.uk │ │ │ ├── about-us.html │ │ │ ├── about-us │ │ │ │ ├── our-facilities.html │ │ │ │ ├── testimonials.html │ │ │ │ └── the-team.html │ │ │ ├── author │ │ │ │ └── adminbpp.html │ │ │ ├── category │ │ │ │ └── uncategorized.html │ │ │ ├── contact-us.html │ │ │ ├── gift-vouchers-now-available-to-purchase.html │ │ │ ├── index.html │ │ │ ├── latest-news.html │ │ │ ├── official-launch-party.html │ │ │ ├── physiotheraphy.html │ │ │ ├── pilates.html │ │ │ ├── pilates │ │ │ │ ├── pilates-classes.html │ │ │ │ ├── pilates-classes │ │ │ │ │ └── pilates-classes-timetable.html │ │ │ │ ├── pilates-faqs.html │ │ │ │ └── what-is-pilates.html │ │ │ ├── privacy-policy.html │ │ │ └── youre-invited.html │ │ ├── www.facebook.com.html │ │ └── www.mountainmadness.com.html │ ├── save_page.rb │ ├── save_site.rb │ └── webmock.rb ├── test_assertable.rb ├── test_base.rb ├── test_core_ext.rb ├── test_crawler.rb ├── test_database_adapter.rb ├── test_document.rb ├── test_document_extractors.rb ├── test_dsl.rb ├── test_gem.rb ├── test_html_to_text.rb ├── test_in_memory.rb ├── test_indexer.rb ├── test_load.rb ├── test_logger.rb ├── test_model.rb ├── test_mongo_db.rb ├── test_readme.rb ├── test_response.rb ├── test_robots_parser.rb ├── test_url.rb ├── test_utils.rb └── test_version.rb └── wgit.gemspec /.gitattributes: -------------------------------------------------------------------------------- 1 | test/mock/fixtures/**/*.html linguist-vendored 2 | docker/* linguist-vendored 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: michaeltelford 7 | 8 | --- 9 | 10 | ## Description 11 | 12 | A clear and concise description of what the bug is. 13 | 14 | ## Reproduce 15 | 16 | Steps to reproduce the behavior: 17 | 18 | 1. Do X. 19 | 2. Do Y. 20 | 3. ... 21 | 22 | ## Expected Behavior 23 | 24 | A clear and concise description of what you expected to happen. 25 | 26 | ## Possible Solutions 27 | 28 | 1. ... 29 | 30 | ## Tests 31 | 32 | What tests would prove this bug is fixed? 33 | 34 | 1. ... 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: michaeltelford 7 | 8 | --- 9 | 10 | ## Description 11 | 12 | A clear and concise description of what the feature/problem is. E.g. "I'm always frustrated when ..." or "I'd like to be able to ..." etc. 13 | 14 | ## Solution 15 | 16 | A clear and concise description of what you want to happen. 17 | 18 | ## Alternatives 19 | 20 | A clear and concise description of any alternative solutions or features you've considered. Is there other software doing something well that should be replicated etc.? 21 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | What do your changes implement/fix? 4 | 5 | ## Checklist 6 | 7 | - Are there tests? 8 | - Have you reviewed and approved the changes yourself? 9 | 10 | Ensure you can answer yes to the above before opening a PR. 
11 | -------------------------------------------------------------------------------- /.github/workflows/wgit.yaml: -------------------------------------------------------------------------------- 1 | name: wgit 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | - ci 8 | jobs: 9 | wgit-ci: 10 | runs-on: ubuntu-22.04 11 | services: 12 | mongodb: 13 | image: michaeltelford/mongo-wgit 14 | ports: 15 | - 27017:27017 16 | env: 17 | WGIT_CONNECTION_STRING: "mongodb://rubyapp:abcdef@localhost/crawler" 18 | steps: 19 | - uses: actions/checkout@v4 20 | - uses: ruby/setup-ruby@v1 21 | with: 22 | # ruby-version: '3.3' # Not needed with a .ruby-version file 23 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 24 | - name: ci 25 | run: bundle exec toys ci 26 | - name: docs 27 | run: bundle exec toys generate_rubydocs 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .bundle 2 | .byebug_history 3 | .doc 4 | .DS_Store 5 | .env 6 | .wgit.rb 7 | .yardoc 8 | **/pkg 9 | **/spike.rb 10 | doc 11 | wgit-*.gem 12 | .vscode/ 13 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | AllCops: 2 | Include: 3 | - 'lib/**/*.rb' 4 | - 'bin/**' 5 | - 'test/**/*.rb' 6 | - '*.rb' 7 | - .toys.rb 8 | Security/Eval: 9 | Exclude: 10 | - 'bin/wgit' 11 | Metrics/ParameterLists: 12 | Exclude: 13 | - 'lib/wgit/dsl.rb' 14 | - 'lib/wgit/database/database_adapter.rb' 15 | Style/FrozenStringLiteralComment: 16 | Enabled: false 17 | Style/ClassAndModuleChildren: 18 | Enabled: false 19 | Layout/HashAlignment: 20 | Enabled: false 21 | Layout/FirstArrayElementIndentation: 22 | Enabled: false 23 | Layout/FirstHashElementIndentation: 24 | Enabled: false 25 | Metrics/ModuleLength: 26 | Enabled: false 27 | 
Metrics/ClassLength: 28 | Enabled: false 29 | Style/Documentation: 30 | Enabled: false 31 | Metrics/MethodLength: 32 | Max: 30 33 | Metrics/PerceivedComplexity: 34 | Max: 12 35 | Metrics/CyclomaticComplexity: 36 | Max: 12 37 | Layout/LineLength: 38 | Max: 85 39 | Exclude: 40 | - 'test/**/*.rb' 41 | Style/Alias: 42 | EnforcedStyle: prefer_alias_method 43 | Style/StringLiterals: 44 | EnforcedStyle: double_quotes 45 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 3.3.0 2 | -------------------------------------------------------------------------------- /.toys.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # The new Rakefile, place any tasks/tools below (listed alphabetically). 4 | # To load .env vars into the ENV from within a tool definition, use: 5 | # require 'dotenv/load' 6 | 7 | require "json" 8 | require "byebug" # Useful for tool development. 9 | 10 | # tool :build 11 | expand :gem_build 12 | 13 | tool :ci do 14 | desc "Runs the CI steps needed for a green build" 15 | 16 | include :exec, exit_on_nonzero_status: false 17 | include :terminal 18 | 19 | def run 20 | run_step "Build gem", "build" 21 | run_step "Check documentation", ["generate_docs", "--no-output"] 22 | run_step "Run tests", "test" 23 | end 24 | 25 | def run_step(name, tool) 26 | if exec_tool(tool).success? 
27 | puts "** #{name} passed", :green, :bold 28 | puts 29 | else 30 | puts "** #{name} failed, exiting!", :red, :bold 31 | exit 1 32 | end 33 | end 34 | end 35 | 36 | # tool :clean 37 | expand :clean, paths: ["pkg", "doc", "tmp", ".doc", ".yardoc"] 38 | 39 | tool :compile do 40 | desc "Compile all project Ruby files with warnings" 41 | 42 | include :exec, exit_on_nonzero_status: true 43 | include :terminal 44 | 45 | def run 46 | Dir["**/*.rb", "**/*.gemspec", "bin/wgit"].each do |file| 47 | puts "\nCompiling #{file}...", :cyan 48 | exec "ruby -cw #{file}" 49 | end 50 | end 51 | end 52 | 53 | tool :console do 54 | desc "Run the (latest) wgit console script" 55 | 56 | include :exec, exit_on_nonzero_status: true 57 | 58 | def run 59 | exec "./bin/wgit" 60 | end 61 | end 62 | 63 | # namespace :db 64 | tool :db do 65 | tool :build do 66 | desc "Build the mongo DB image from ./docker/Dockerfile" 67 | 68 | include :exec, exit_on_nonzero_status: true 69 | 70 | def run 71 | exec "docker build --no-cache -t michaeltelford/mongo-wgit ./docker" 72 | end 73 | end 74 | 75 | tool :start do 76 | desc "Start a local mongo DB docker daemon" 77 | 78 | include :terminal 79 | include :exec, exit_on_nonzero_status: true 80 | 81 | def run 82 | exec "docker run --name mongo-wgit -p 27017:27017 --rm -d michaeltelford/mongo-wgit" 83 | puts "Successfully started container 'mongo-wgit'", :green 84 | end 85 | end 86 | 87 | tool :stop do 88 | desc "Stop the local mongo DB docker container" 89 | 90 | include :terminal 91 | include :exec, exit_on_nonzero_status: true 92 | 93 | def run 94 | exec "docker stop mongo-wgit" 95 | puts "Successfully stopped container 'mongo-wgit'", :green 96 | end 97 | end 98 | 99 | tool :push do 100 | desc "Push the local mongo DB image to Docker Hub" 101 | 102 | include :exec, exit_on_nonzero_status: true 103 | 104 | def run 105 | exec "docker login" unless docker_authenticated? 
106 | exec "docker push michaeltelford/mongo-wgit" 107 | end 108 | 109 | def docker_authenticated? 110 | docker_config = "#{Dir.home}/.docker/config.json" 111 | return false unless File.exist?(docker_config) 112 | 113 | config = JSON.parse(File.read(docker_config)) 114 | auths = config["auths"] 115 | return false unless auths && !auths.empty? 116 | 117 | true 118 | end 119 | end 120 | end 121 | 122 | # tool :generate_docs 123 | expand :yardoc do |t| 124 | t.name = :generate_docs 125 | t.generate_output_flag = true 126 | t.fail_on_warning = true 127 | t.fail_on_undocumented_objects = true 128 | end 129 | 130 | tool :generate_rubydocs do 131 | desc "Update wgit's docs on rubydoc.info" 132 | 133 | include :terminal 134 | include :exec, exit_on_nonzero_status: true 135 | 136 | def run 137 | exec "curl 'https://www.rubydoc.info/checkout' \ 138 | -H 'User-Agent: curl' \ 139 | -H 'Accept: */*' \ 140 | -H 'Accept-Language: en-GB,en;q=0.5' --compressed \ 141 | -H 'Content-Type: application/x-www-form-urlencoded' \ 142 | -H 'X-Requested-With: XMLHttpRequest' \ 143 | -H 'Origin: https://www.rubydoc.info' \ 144 | -H 'Connection: keep-alive' \ 145 | -H 'Referer: https://www.rubydoc.info/find/github?q=wgit' \ 146 | --data 'scheme=git&url=git%3A%2F%2Fgithub.com%2Fmichaeltelford%2Fwgit&commit='" 147 | puts "\nUpdated rubydoc.info successfully", :green 148 | end 149 | end 150 | 151 | # tool :install 152 | expand :gem_build do |t| 153 | t.name = :install 154 | t.install_gem = true 155 | end 156 | 157 | tool :lint, delegate_to: :rubocop 158 | 159 | tool :release do 160 | desc "The SAFE release task which double checks things!" 161 | long_desc "Tag and push commits to Github, then build and push the gem to Rubygems." 
162 | 163 | include :exec, exit_on_nonzero_status: true 164 | include :terminal 165 | 166 | def run 167 | raise "Error requiring wgit" unless require_relative "lib/wgit" 168 | 169 | puts "Releasing #{Wgit.version_str}, using the 'origin' Git remote...", :cyan 170 | confirmed = confirm "Have you applied the wiki's 'Gem Publishing Checklist'?" 171 | unless confirmed 172 | puts "Aborting!", :red 173 | exit(0) 174 | end 175 | 176 | exec_tool "release_gem" 177 | puts "Release complete", :green 178 | end 179 | end 180 | 181 | # tool :release_gem 182 | expand :gem_build do |t| 183 | t.name = :release_gem 184 | t.install_gem = false 185 | t.push_gem = true 186 | t.tag = true 187 | t.push_tag = true 188 | end 189 | 190 | tool :rubocop do 191 | desc "Run the rubocop linter, use -a to auto correct" 192 | flag :autocorrect, "-a", "--autocorrect" 193 | flag :autocorrectall, "-A", "--autocorrect-all" 194 | remaining_args :dirs_or_files 195 | 196 | include :exec, exit_on_nonzero_status: true 197 | 198 | def run 199 | command_str = "bundle exec rubocop" 200 | command_str += " -a" if autocorrect 201 | command_str += " -A" if autocorrectall 202 | command_str += " #{dirs_or_files.join(' ')}" if dirs_or_files.any? 
203 | 204 | exec(command_str) 205 | end 206 | end 207 | 208 | tool :setup do 209 | desc "Sets up the cloned repo for development" 210 | 211 | include :exec, exit_on_nonzero_status: true 212 | include :terminal 213 | 214 | def run 215 | exec_cmd "gem install wgit" 216 | exec_cmd "touch .env" 217 | exec_cmd "touch .wgit.rb" 218 | 219 | puts "Setup complete", :green 220 | end 221 | 222 | def exec_cmd(command) 223 | puts "> #{command}", :cyan 224 | exec command 225 | end 226 | end 227 | 228 | # namespace :test 229 | tool :test do 230 | desc "Run all tests" 231 | 232 | include :exec, exit_on_nonzero_status: true 233 | 234 | def run 235 | exec_tool "test all" 236 | end 237 | 238 | # tool :all 239 | expand :minitest do |t| 240 | t.name = :all 241 | t.libs = ["lib"] 242 | t.files = ["test/test_*.rb"] 243 | end 244 | 245 | tool :file do 246 | desc "Runs entire test_*.rb file or single test at --line" 247 | required_arg :file 248 | flag :line, "-l", "--line=VALUE" 249 | 250 | include :exec, exit_on_nonzero_status: true 251 | 252 | def run 253 | exec "bundle exec mtest #{test_cmd}" 254 | end 255 | 256 | def test_cmd 257 | cmd = options[:file] 258 | raise "Colon not allowed, use --line" if cmd.include?(":") 259 | 260 | cmd = "test/test_#{cmd}" unless cmd.start_with?("test/test_") 261 | cmd += ".rb" unless cmd.end_with?(".rb") 262 | cmd += ":#{line}" if line 263 | 264 | cmd 265 | end 266 | end 267 | 268 | tool :infinite_crawl_loop do 269 | desc "Manually crawl_r URLs to check for an infinite loop condition" 270 | 271 | include :terminal 272 | 273 | require "wgit" 274 | require "wgit/core_ext" 275 | 276 | def run 277 | puts "If the crawl is hanging for more than 2 mins, there is an infinite loop", 278 | :yellow 279 | 280 | crawler = Wgit::Crawler.new 281 | urls = %w[ 282 | https://jaloulangeree.com/ 283 | https://www.belfastpilates.co.uk/ 284 | https://anaeko.com/ 285 | ].to_urls 286 | 287 | urls.each_with_index do |url, i| 288 | crawler.crawl_site(url) 289 | puts "Successfully 
crawled site (#{i + 1}/#{urls.size}): #{url}" 290 | end 291 | 292 | puts "Successfully crawled all sites, no infinite loop detected", :green 293 | end 294 | end 295 | 296 | tool :save_page do 297 | desc "Download/update a web page test fixture to test/mock/fixtures" 298 | required_arg :url 299 | 300 | include :exec, exit_on_nonzero_status: true 301 | include :terminal 302 | 303 | def run 304 | load "test/mock/save_page.rb" 305 | save_page(options[:url]) 306 | puts "Don't forget to mock the page in test/mock/fixtures.rb", :green 307 | end 308 | end 309 | 310 | tool :save_site do 311 | desc "Download/update a web site test fixture to test/mock/fixtures" 312 | required_arg :url 313 | flag :follow, "-f", "--follow=VALUE" 314 | 315 | include :exec, exit_on_nonzero_status: true 316 | include :terminal 317 | 318 | def run 319 | load "test/mock/save_site.rb" 320 | xpath = follow || :default 321 | save_site(options[:url], follow: xpath) 322 | puts "Don't forget to mock the site in test/mock/fixtures.rb", :green 323 | end 324 | end 325 | 326 | # tool :smoke 327 | expand :minitest do |t| 328 | t.name = :smoke 329 | t.libs = ["lib"] 330 | t.files = [ 331 | "test/test_utils.rb", 332 | "test/test_url.rb", 333 | "test/test_document.rb", 334 | "test/test_document_extractors.rb", 335 | "test/test_response.rb", 336 | "test/test_crawler.rb" 337 | ] 338 | end 339 | end 340 | 341 | tool :yardoc do 342 | desc "Generates the YARD docs, use --serve to browse" 343 | flag :serve, "-s", "--serve" 344 | 345 | include :exec, exit_on_nonzero_status: false 346 | include :terminal 347 | 348 | def run 349 | serve ? serve_docs : exec_tool("generate_docs") 350 | end 351 | 352 | def serve_docs 353 | url = "http://localhost:8808" 354 | 355 | if exec("which pbcopy", out: :null).success? 356 | exec "echo '#{url}' | pbcopy" 357 | puts "Copied '#{url}' to clipboard", :green 358 | elsif exec("which xclip", out: :null).success? 
359 | exec "echo '#{url}' | xclip -sel clip" 360 | puts "Copied '#{url}' to clipboard", :green 361 | else 362 | puts "Install pbcopy or xclip to automatically copy url to clipboard" 363 | end 364 | 365 | exec "bundle exec yard server -r" 366 | end 367 | end 368 | -------------------------------------------------------------------------------- /.yardopts: -------------------------------------------------------------------------------- 1 | --readme README.md 2 | --title 'Wgit Gem Documentation' 3 | --charset utf-8 4 | --markup markdown 5 | --output .doc 6 | --protected 7 | - *.md LICENSE.txt 8 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team on Github. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Consult 4 | 5 | Before you make a contribution, reach out on GitHub about what changes need to be made. Otherwise, your time spent might be wasted. Once you're clear on what needs to be done, follow the technical steps below. 
6 | 7 | ## Technical Steps 8 | 9 | - Fork the repository 10 | - Create a branch 11 | - Write some tests (which fail) 12 | - Write some code 13 | - Re-run the tests (which now hopefully pass) 14 | - Push your branch to your `origin` remote 15 | - Open a GitHub Pull Request (with the target branch as wgit's (upstream) `master`) 16 | - Apply any requested changes 17 | - Wait for your PR to be merged 18 | 19 | ## Thanks 20 | 21 | Thanks in advance for your contribution. 22 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 'https://rubygems.org' 4 | 5 | ruby '>= 3', '< 4' 6 | 7 | # Specify your gem's dependencies in the gemspec. 8 | gemspec 9 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 
3 | specs: 4 | wgit (0.12.0) 5 | addressable (~> 2.8) 6 | base64 (~> 0.2) 7 | ferrum (~> 0.14) 8 | mongo (~> 2.19) 9 | nokogiri (~> 1.15) 10 | typhoeus (~> 1.4) 11 | 12 | GEM 13 | remote: https://rubygems.org/ 14 | specs: 15 | addressable (2.8.6) 16 | public_suffix (>= 2.0.2, < 6.0) 17 | ast (2.4.2) 18 | base64 (0.2.0) 19 | bigdecimal (3.1.6) 20 | bson (4.15.0) 21 | byebug (11.1.3) 22 | coderay (1.1.3) 23 | concurrent-ruby (1.2.3) 24 | crack (1.0.0) 25 | bigdecimal 26 | rexml 27 | dotenv (2.8.1) 28 | ethon (0.16.0) 29 | ffi (>= 1.15.0) 30 | ferrum (0.15) 31 | addressable (~> 2.5) 32 | concurrent-ruby (~> 1.1) 33 | webrick (~> 1.7) 34 | websocket-driver (~> 0.7) 35 | ffi (1.16.3) 36 | hashdiff (1.1.0) 37 | json (2.7.1) 38 | language_server-protocol (3.17.0.3) 39 | maxitest (5.4.0) 40 | minitest (>= 5.14.0, < 5.21.0) 41 | method_source (1.0.0) 42 | minitest (5.20.0) 43 | mongo (2.19.3) 44 | bson (>= 4.14.1, < 5.0.0) 45 | nokogiri (1.16.2) 46 | racc (~> 1.4) 47 | parallel (1.24.0) 48 | parser (3.3.0.5) 49 | ast (~> 2.4.1) 50 | racc 51 | prism (0.24.0) 52 | pry (0.14.2) 53 | coderay (~> 1.1) 54 | method_source (~> 1.0) 55 | public_suffix (5.0.4) 56 | racc (1.7.3) 57 | rainbow (3.1.1) 58 | regexp_parser (2.9.0) 59 | rexml (3.2.6) 60 | rubocop (1.61.0) 61 | json (~> 2.3) 62 | language_server-protocol (>= 3.17.0) 63 | parallel (~> 1.10) 64 | parser (>= 3.3.0.2) 65 | rainbow (>= 2.2.2, < 4.0) 66 | regexp_parser (>= 1.8, < 3.0) 67 | rexml (>= 3.2.5, < 4.0) 68 | rubocop-ast (>= 1.30.0, < 2.0) 69 | ruby-progressbar (~> 1.7) 70 | unicode-display_width (>= 2.4.0, < 3.0) 71 | rubocop-ast (1.31.0) 72 | parser (>= 3.3.0.4) 73 | prism (>= 0.24.0) 74 | ruby-progressbar (1.13.0) 75 | toys (0.15.5) 76 | toys-core (= 0.15.5) 77 | toys-core (0.15.5) 78 | typhoeus (1.4.1) 79 | ethon (>= 0.9.0) 80 | unicode-display_width (2.5.0) 81 | webmock (3.23.0) 82 | addressable (>= 2.8.0) 83 | crack (>= 0.3.2) 84 | hashdiff (>= 0.4.0, < 2.0.0) 85 | webrick (1.8.1) 86 | websocket-driver (0.7.6) 87 | 
websocket-extensions (>= 0.1.0) 88 | websocket-extensions (0.1.5) 89 | yard (0.9.35) 90 | 91 | PLATFORMS 92 | ruby 93 | 94 | DEPENDENCIES 95 | byebug (~> 11.1) 96 | dotenv (~> 2.8) 97 | maxitest (~> 5.4) 98 | pry (~> 0.14) 99 | rubocop (~> 1.57) 100 | toys (~> 0.15) 101 | webmock (~> 3.19) 102 | wgit! 103 | yard (~> 0.9) 104 | 105 | RUBY VERSION 106 | ruby 3.3.0p0 107 | 108 | BUNDLED WITH 109 | 2.5.3 110 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 - 2020 Michael Telford 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bin/wgit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "wgit" 4 | 5 | # Shorten the filepath for nicer output to the user. 6 | def format_path(dir, path) 7 | return path.gsub(dir, ".") if dir == Dir.pwd 8 | return path.gsub(dir, "~") if dir == Dir.home 9 | 10 | path 11 | end 12 | 13 | # Load .env file (if it exists somewhere). 14 | def load_env 15 | begin 16 | require "dotenv" 17 | rescue LoadError 18 | puts "Skipping .env load because 'dotenv' isn't installed" 19 | puts 20 | return false 21 | end 22 | 23 | puts "Searching for .env file in local and home directories..." 24 | success = false 25 | 26 | [Dir.pwd, Dir.home].each do |dir| 27 | path = "#{dir}/.env" 28 | next unless File.exist?(path) 29 | 30 | puts "Loading #{format_path(dir, path)}" 31 | puts "Call `load_env` after changes to re-load the environment variables" 32 | 33 | Dotenv.load(path) 34 | success = true 35 | 36 | break 37 | end 38 | 39 | puts 40 | 41 | success 42 | end 43 | 44 | # Eval .wgit.rb file (if it exists somewhere). 45 | def eval_wgit 46 | puts "Searching for .wgit.rb file in local and home directories..." 47 | success = false 48 | 49 | [Dir.pwd, Dir.home].each do |dir| 50 | path = "#{dir}/.wgit.rb" 51 | next unless File.exist?(path) 52 | 53 | puts "Eval'ing #{format_path(dir, path)}" 54 | puts "Call `eval_wgit` after changes to re-eval the file" 55 | 56 | eval(File.read(path)) 57 | success = true 58 | 59 | break 60 | end 61 | 62 | puts 63 | 64 | success 65 | end 66 | 67 | # Choose and return which REPL class to use. 68 | # Use Pry if installed or fall back to IRB. 
69 | def repl_class 70 | begin 71 | require "pry" 72 | klass = Pry 73 | rescue LoadError 74 | require "irb" 75 | klass = IRB 76 | 77 | puts "Using 'irb' REPL because 'pry' isn't installed" 78 | puts 79 | end 80 | 81 | klass 82 | end 83 | 84 | ### START OF EXECUTABLE ### 85 | 86 | load_env 87 | eval_wgit 88 | klass = repl_class 89 | 90 | puts Wgit.version_str 91 | puts "#{'-' * Wgit.version_str.size}\n\n" 92 | 93 | klass.start 94 | 95 | puts "Interactive wgit session complete" 96 | -------------------------------------------------------------------------------- /ci.symlink: -------------------------------------------------------------------------------- 1 | ./.github/workflows/wgit.yaml -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mongo:latest 2 | 3 | ENV MONGO_INITDB_ROOT_USERNAME rubyapp 4 | ENV MONGO_INITDB_ROOT_PASSWORD abcdef 5 | ENV MONGO_INITDB_DATABASE admin 6 | 7 | COPY mongo-init.js /docker-entrypoint-initdb.d/ 8 | -------------------------------------------------------------------------------- /docker/mongo-init.js: -------------------------------------------------------------------------------- 1 | db.auth("rubyapp", "abcdef"); 2 | 3 | db = db.getSiblingDB("crawler"); 4 | 5 | db.createUser({ 6 | user: "rubyapp", 7 | pwd: "abcdef", 8 | roles: [ 9 | { 10 | role: "root", 11 | db: "admin", 12 | }, 13 | ], 14 | }); 15 | 16 | db.createCollection("urls"); 17 | db.createCollection("documents"); 18 | 19 | db.urls.createIndex({ "url" : 1 }, { "unique" : true, "name": "unique_url" }); 20 | db.documents.createIndex({ "url.url" : 1 }, { "unique" : true, "name": "unique_url" }); 21 | db.documents.createIndex({ 22 | title: "text", 23 | description: "text", 24 | keywords: "text", 25 | text: "text" 26 | }, 27 | { 28 | weights: { 29 | title: 2, 30 | description: 2, 31 | keywords: 2, 32 | text: 1 33 | }, 34 | name: "text_search" 
35 | }); 36 | -------------------------------------------------------------------------------- /lib/wgit.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "wgit/version" 4 | require_relative "wgit/logger" 5 | require_relative "wgit/assertable" 6 | require_relative "wgit/utils" 7 | require_relative "wgit/url" 8 | require_relative "wgit/html_to_text" 9 | require_relative "wgit/document" 10 | require_relative "wgit/document_extractors" 11 | require_relative "wgit/crawler" 12 | require_relative "wgit/model" 13 | require_relative "wgit/database/database" 14 | require_relative "wgit/database/database_adapter" 15 | require_relative "wgit/database/adapters/mongo_db" 16 | require_relative "wgit/database/adapters/in_memory" 17 | require_relative "wgit/robots_parser" 18 | require_relative "wgit/indexer" 19 | require_relative "wgit/dsl" 20 | require_relative "wgit/base" 21 | # require_relative 'wgit/core_ext' - Must be explicitly required. 22 | -------------------------------------------------------------------------------- /lib/wgit/assertable.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Wgit 4 | # Module containing assertion methods including type checking and duck typing. 5 | module Assertable 6 | # Default type fail message. 7 | DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s" 8 | 9 | # Non-Enumerable fail message. 10 | NON_ENUMERABLE_MSG = "Expected an Enumerable responding to #each, not: %s" 11 | 12 | # Enumerable with more than one type across its elements. 13 | MIXED_ENUMERABLE_MSG = "Expected an Enumerable with elements of a single \ 14 | common type" 15 | 16 | # Default duck fail message. 17 | DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s" 18 | 19 | # Default required keys message. 
20 | DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not \ 21 | present: %s" 22 | 23 | # Tests if the obj is_a? given type; raises an Exception if not. 24 | # 25 | # @param obj [Object] The Object to test. 26 | # @param type_or_types [Type, Array] The type/types that obj must 27 | # belong to or an exception is thrown. 28 | # @param msg [String] The raised StandardError message, if provided. 29 | # @raise [StandardError] If the assertion fails. 30 | # @return [Object] The given obj on successful assertion. 31 | def assert_types(obj, type_or_types, msg = nil) 32 | msg ||= format(DEFAULT_TYPE_FAIL_MSG, type_or_types, obj.class) 33 | match = if type_or_types.respond_to?(:any?) 34 | type_or_types.any? { |type| obj.is_a?(type) } 35 | else 36 | obj.is_a?(type_or_types) 37 | end 38 | raise msg unless match 39 | 40 | obj 41 | end 42 | 43 | # Each object within arr must match one of the types listed in 44 | # type_or_types; or an exception is raised using msg, if provided. 45 | # 46 | # @param arr [Enumerable#each] Enumerable of objects to type check. 47 | # @param type_or_types [Type, Array] The allowed type(s). 48 | # @param msg [String] The raised StandardError message, if provided. 49 | # @raise [StandardError] If the assertion fails. 50 | # @return [Object] The given arr on successful assertion. 51 | def assert_arr_types(arr, type_or_types, msg = nil) 52 | raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each) 53 | 54 | arr.each { |obj| assert_types(obj, type_or_types, msg) } 55 | end 56 | 57 | # All objects within arr must match one of the types listed in 58 | # type_or_types; or an exception is raised using msg, if provided. 59 | # Ancestors of the same type are allowed and considered common. 60 | # 61 | # @param arr [Enumerable#each] Enumerable of objects to type check. 62 | # @param type_or_types [Type, Array] The allowed type(s). 63 | # @param msg [String] The raised StandardError message, if provided. 
64 | # @raise [StandardError] If the assertion fails. 65 | # @return [Object] The given arr on successful assertion. 66 | def assert_common_arr_types(arr, type_or_types, msg = nil) 67 | raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each) 68 | 69 | type = arr.first.class 70 | type_match = arr.all? { |obj| type.ancestors.include?(obj.class) } 71 | raise MIXED_ENUMERABLE_MSG unless type_match 72 | 73 | assert_arr_types(arr, type_or_types, msg) 74 | end 75 | 76 | # The obj_or_objs must respond_to? all of the given methods or an 77 | # Exception is raised using msg, if provided. 78 | # 79 | # @param obj_or_objs [Object, Enumerable#each] The object(s) to duck check. 80 | # @param methods [Array] The methods to :respond_to?. 81 | # @param msg [String] The raised StandardError message, if provided. 82 | # @raise [StandardError] If the assertion fails. 83 | # @return [Object] The given obj_or_objs on successful assertion. 84 | def assert_respond_to(obj_or_objs, methods, msg = nil) 85 | methods = *methods 86 | 87 | if obj_or_objs.respond_to?(:each) 88 | obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) } 89 | else 90 | _assert_respond_to(obj_or_objs, methods, msg) 91 | end 92 | 93 | obj_or_objs 94 | end 95 | 96 | # The hash must include? the keys or a KeyError is raised. 97 | # 98 | # @param hash [Hash] The hash which should include the required keys. 99 | # @param keys [Array] The keys whose presence to assert. 100 | # @param msg [String] The raised KeyError message, if provided. 101 | # @raise [KeyError] If the assertion fails. 102 | # @return [Hash] The given hash on successful assertion. 103 | def assert_required_keys(hash, keys, msg = nil) 104 | msg ||= format(DEFAULT_REQUIRED_KEYS_MSG, keys.join(", ")) 105 | all_present = keys.all? { |key| hash.keys.include? key } 106 | raise KeyError, msg unless all_present 107 | 108 | hash 109 | end 110 | 111 | private 112 | 113 | # obj must respond_to? all methods or an exception is raised. 
114 | def _assert_respond_to(obj, methods, msg = nil) 115 | raise "methods must respond_to? :all?" unless methods.respond_to?(:all?) 116 | 117 | msg ||= format(DEFAULT_DUCK_FAIL_MSG, "#{obj.class} (#{obj})", methods) 118 | match = methods.all? { |method| obj.respond_to?(method) } 119 | raise msg unless match 120 | 121 | obj 122 | end 123 | 124 | alias_method :assert_type, :assert_types 125 | alias_method :assert_arr_type, :assert_arr_types 126 | alias_method :assert_common_arr_type, :assert_common_arr_types 127 | end 128 | end 129 | -------------------------------------------------------------------------------- /lib/wgit/base.rb: -------------------------------------------------------------------------------- 1 | module Wgit 2 | # Class to inherit from, as an alternative form of using the `Wgit::DSL`. 3 | # All subclasses must define a `#parse(doc, &block)` method. 4 | class Base 5 | extend Wgit::DSL 6 | 7 | # Runs once before the crawl/index is run. Override as needed. 8 | def setup; end 9 | 10 | # Runs once after the crawl/index is complete. Override as needed. 11 | def teardown; end 12 | 13 | # Runs the crawl/index passing each crawled `Wgit::Document` and the given 14 | # block to the subclass's `#parse` method. 15 | def self.run(&block) 16 | crawl_method = @method || :crawl 17 | obj = new 18 | 19 | unless obj.respond_to?(:parse) 20 | raise "#{obj.class} must respond_to? #parse(doc, &block)" 21 | end 22 | 23 | obj.setup 24 | send(crawl_method) { |doc| obj.parse(doc, &block) } 25 | obj.teardown 26 | 27 | obj 28 | end 29 | 30 | # Sets the crawl/index method to call when `Base.run` is called. 31 | # The mode method must match one defined in the `Wgit::Crawler` or 32 | # `Wgit::Indexer` class. 33 | # 34 | # @param method [Symbol] The crawl/index method to call. 
35 | def self.mode(method) 36 | @method = method 37 | end 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/wgit/core_ext.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Script which extends Ruby's core functionality when parsed. 4 | # Needs to be required separately to 'wgit' using `require 'wgit/core_ext'`. 5 | 6 | require_relative "url" 7 | 8 | # Extend the standard String functionality. 9 | class String 10 | # Converts a String into a Wgit::Url object. 11 | # 12 | # @return [Wgit::Url] The converted URL. 13 | def to_url 14 | Wgit::Url.parse(self) 15 | end 16 | end 17 | 18 | # Extend the standard Enumerable functionality. 19 | module Enumerable 20 | # Converts each String instance into a Wgit::Url object and returns the new 21 | # Array. 22 | # 23 | # @return [Array] The converted URL's. 24 | def to_urls 25 | map { |element| process_url_element(element) } 26 | end 27 | 28 | # Converts each String instance into a Wgit::Url object and returns self 29 | # having modified the receiver. 30 | # 31 | # @return [Array] Self containing the converted URL's. 32 | def to_urls! 33 | map! { |element| process_url_element(element) } 34 | end 35 | end 36 | 37 | private 38 | 39 | # Converts the element to a Wgit::Url if the element is a String. 40 | def process_url_element(element) 41 | element.is_a?(String) ? element.to_url : element 42 | end 43 | -------------------------------------------------------------------------------- /lib/wgit/database/adapters/in_memory.rb: -------------------------------------------------------------------------------- 1 | require_relative "../../utils" 2 | require_relative "../../url" 3 | require_relative "../../document" 4 | require_relative "../../model" 5 | require_relative "../database_adapter" 6 | 7 | module Wgit::Database 8 | # Database implementer class for in-memory (RAM) storage. 
This DB is mainly used 9 | # for testing and experimenting with. This DB is thread safe. 10 | class InMemory < DatabaseAdapter 11 | # Initializes a thread safe InMemory Database instance. 12 | # 13 | # @param connection_string [String] Not used but needed to adhere to the 14 | # DatabaseAdapter interface. 15 | def initialize(connection_string = nil) 16 | # Inits @urls and @docs vars. 17 | initialize_store 18 | 19 | super 20 | end 21 | 22 | # Overrides String#inspect to display collection sizes. 23 | # 24 | # @return [String] A short textual representation of this object. 25 | def inspect 26 | "#" 28 | end 29 | 30 | # The Wgit::Url's collection stored as an in-memory Concurrent::Array. 31 | def urls(&block) 32 | map_urls(@urls, &block) 33 | end 34 | 35 | # The Wgit::Document's collection stored as an in-memory Concurrent::Array. 36 | def docs(&block) 37 | map_documents(@docs, &block) 38 | end 39 | 40 | # The raw url Hashes, not mapped into their corresponding Wgit objects. 41 | def url_hashes 42 | @urls 43 | end 44 | 45 | # The raw doc Hashes, not mapped into their corresponding Wgit objects. 46 | def doc_hashes 47 | @docs 48 | end 49 | 50 | # Returns the current size of the in-memory database. 51 | # An empty database will return a size of 4 because there are 4 bytes in 52 | # two empty arrays (urls and docs collections). 53 | # 54 | # @return [Integer] The current size of the in-memory DB. 55 | def size 56 | @urls.to_s.size + @docs.to_s.size 57 | end 58 | 59 | # Searches the database's Document#text for the given query. The returned 60 | # Documents are sorted for relevance, starting with the most relevant. Each 61 | # Document's #score value will be set accordingly. 62 | # 63 | # @param query [Regexp, #to_s] The regex or text value to search each 64 | # document's @text for. 65 | # @param case_sensitive [Boolean] Whether character case must match. 66 | # @param whole_sentence [Boolean] Whether multiple words should be searched 67 | # for separately. 
68 | # @param limit [Integer] The max number of results to return. 69 | # @param skip [Integer] The number of results to skip. 70 | # @yield [doc] Given each search result (Wgit::Document) returned from the 71 | # DB. 72 | # @return [Array] The search results obtained from the DB. 73 | def search( 74 | query, case_sensitive: false, whole_sentence: true, 75 | limit: 10, skip: 0, &block 76 | ) 77 | regex = Wgit::Utils.build_search_regex( 78 | query, case_sensitive:, whole_sentence:) 79 | 80 | # Search the Wgit::Document's, not the raw Hashes. 81 | results = docs.select do |doc| 82 | score = 0 83 | doc.search(regex, case_sensitive:, whole_sentence:) do |results_hash| 84 | score = results_hash.values.sum 85 | end 86 | next false if score.zero? 87 | 88 | doc.instance_variable_set :@score, score 89 | true 90 | end 91 | 92 | return [] if results.empty? 93 | 94 | results = results.sort_by { |doc| -doc.score } 95 | 96 | results = results[skip..] 97 | return [] unless results 98 | 99 | results = results[0...limit] if limit.positive? 100 | results.each(&block) if block_given? 101 | 102 | results 103 | end 104 | 105 | # Deletes everything in the urls and documents collections. 106 | # 107 | # @return [Integer] The number of deleted records. 108 | def empty 109 | previous_size = @urls.size + @docs.size 110 | initialize_store 111 | 112 | previous_size 113 | end 114 | 115 | # Returns Url records that haven't yet been crawled. 116 | # 117 | # @param limit [Integer] The max number of Url's to return. 0 returns all. 118 | # @param skip [Integer] Skip n amount of Url's. 119 | # @yield [url] Given each Url object (Wgit::Url) returned from the DB. 120 | # @return [Array] The uncrawled Urls obtained from the DB. 121 | def uncrawled_urls(limit: 0, skip: 0, &block) 122 | uncrawled = @urls.reject { |url| url["crawled"] } 123 | uncrawled = uncrawled[skip..] 124 | return [] unless uncrawled 125 | 126 | uncrawled = uncrawled[0...limit] if limit.positive? 
127 | map_urls(uncrawled, &block) 128 | end 129 | 130 | # Inserts or updates the object in the in-memory database. 131 | # 132 | # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update. 133 | # @return [Boolean] True if inserted, false if updated. 134 | def upsert(obj) 135 | collection, index, model = get_model_info(obj) 136 | 137 | if index 138 | collection[index] = model 139 | false 140 | else 141 | collection << model 142 | true 143 | end 144 | end 145 | 146 | # Bulk upserts the objects in the in-memory database collection. 147 | # You cannot mix collection objs types, all must be Urls or Documents. 148 | # 149 | # @param objs [Array, Array] The objs to be 150 | # inserted/updated. 151 | # @return [Integer] The total number of newly inserted objects. 152 | def bulk_upsert(objs) 153 | assert_common_arr_types(objs, [Wgit::Url, Wgit::Document]) 154 | 155 | objs.reduce(0) do |inserted, obj| 156 | inserted += 1 if upsert(obj) 157 | inserted 158 | end 159 | end 160 | 161 | private 162 | 163 | # Creates a new Concurrent::Array for each collection. 164 | def initialize_store 165 | @urls = Concurrent::Array.new 166 | @docs = Concurrent::Array.new 167 | end 168 | 169 | # Get the database's model info (collection type, index, model) for 170 | # obj. 171 | # 172 | # Use like: 173 | # ``` 174 | # collection, index, model = get_model_info(obj) 175 | # ``` 176 | # 177 | # Raises an error if obj isn't a Wgit::Url or Wgit::Document. 178 | # 179 | # @param obj [Wgit::Url, Wgit::Document] The obj to get semantics for. 180 | # @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document. 181 | # @return [Array] The collection type, the obj's index (if in 182 | # the collection, nil otherwise) and the Wgit::Model of obj. 
183 | def get_model_info(obj) 184 | obj = obj.dup 185 | 186 | case obj 187 | when Wgit::Url 188 | key = obj.to_s 189 | collection = @urls 190 | index = @urls.index { |url| url["url"] == key } 191 | model = build_model(obj) 192 | when Wgit::Document 193 | key = obj.url.to_s 194 | collection = @docs 195 | index = @docs.index { |doc| doc["url"]&.[]("url") == key } 196 | model = build_model(obj) 197 | else 198 | raise "obj must be a Wgit::Url or Wgit::Document, not: #{obj.class}" 199 | end 200 | 201 | [collection, index, model] 202 | end 203 | end 204 | end 205 | -------------------------------------------------------------------------------- /lib/wgit/database/database.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "adapters/mongo_db" 4 | 5 | module Wgit 6 | # Module providing a Database connection and CRUD operations for the Url and 7 | # Document collections that form the Wgit persistence layer. 8 | module Database 9 | # The default Database adapter class used by Wgit. 10 | DEFAULT_ADAPTER_CLASS = Wgit::Database::MongoDB 11 | 12 | # The Database adapter class to be used by Wgit. Set this based on the 13 | # Database you want to use. The adapter doesn't exist yet? Write your own. 14 | @adapter_class = DEFAULT_ADAPTER_CLASS 15 | 16 | class << self 17 | # The Database adapter class to use with Wgit. The adapter you supply 18 | # should be a subclass of Wgit::Database::DatabaseAdapter and should 19 | # implement the methods within it, in order to work with Wgit. 20 | attr_accessor :adapter_class 21 | end 22 | 23 | # Initializes a DatabaseAdapter instance. Is an alias for: 24 | # `Wgit::Database.adapter_class.new(connection_string)` 25 | # 26 | # @param connection_string [String] The connection string needed to connect 27 | # to the database. 28 | # @raise [StandardError] If a connection string isn't provided, either as a 29 | # parameter or via the environment. 
30 | def self.new(connection_string = nil) 31 | Wgit::Database.adapter_class.new(connection_string) 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /lib/wgit/database/database_adapter.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "../assertable" 4 | require_relative "../url" 5 | require_relative "../document" 6 | require_relative "../model" 7 | 8 | module Wgit::Database 9 | # The parent DatabaseAdapter class that should be inherited from when 10 | # creating an underlying Database adapter implementation class e.g. 11 | # Wgit::Database::MongoDB. 12 | # 13 | # Listed in this class are the methods that an implementer class must 14 | # implement to work with Wgit. Failure to do so will result in a 15 | # NotImplementedError being raised. 16 | # 17 | # While not required, implementing the method `#search_fields=(fields)` in an 18 | # adapter class will allow `Wgit::Model.set_search_fields` to call 19 | # it. This allows the search fields to be set in one method call, from within 20 | # the Wgit::Model class. See this method's docs for more info. 21 | # 22 | # Also listed in this class are common helper methods available to all 23 | # Database implementer subclasses. 24 | class DatabaseAdapter 25 | include Wgit::Assertable 26 | 27 | # The NotImplementedError message that gets raised if an implementor class 28 | # doesn't implement a method required by Wgit. 29 | NOT_IMPL_ERR = "The DatabaseAdapter class you're using hasn't \ 30 | implemented this method" 31 | 32 | ###################### START OF INTERFACE METHODS ###################### 33 | 34 | # Initializes a DatabaseAdapter instance. 35 | # 36 | # The implementor class should establish a DB connection here using the 37 | # given connection_string, falling back to `ENV['WGIT_CONNECTION_STRING']`. 38 | # Don't forget to call `super`. 
39 | # 40 | # @param connection_string [String] The connection string needed to connect 41 | # to the database. 42 | # @raise [StandardError] If a connection string isn't provided, either as a 43 | # parameter or via the environment. 44 | def initialize(connection_string = nil); end 45 | 46 | # Returns the current size of the database. 47 | # 48 | # @return [Integer] The current size of the DB. 49 | def size 50 | raise NotImplementedError, NOT_IMPL_ERR 51 | end 52 | 53 | # Searches the database's Documents for the given query. The 54 | # `Wgit::Model.search_fields` should be searched for matches 55 | # against the given query. Documents should be sorted starting with the 56 | # most relevant. Each returned Document should have it's `score` field set 57 | # for relevance. 58 | # 59 | # @param query [String] The text query to search with. 60 | # @param case_sensitive [Boolean] Whether character case must match. 61 | # @param whole_sentence [Boolean] Whether multiple words should be searched 62 | # for separately. 63 | # @param limit [Integer] The max number of results to return. 64 | # @param skip [Integer] The number of results to skip. 65 | # @yield [doc] Given each search result (Wgit::Document) returned from the 66 | # DB. 67 | # @return [Array] The search results obtained from the DB. 68 | def search( 69 | query, case_sensitive: false, whole_sentence: true, limit: 10, skip: 0 70 | ) 71 | raise NotImplementedError, NOT_IMPL_ERR 72 | end 73 | 74 | # Deletes everything in the urls and documents collections. 75 | # 76 | # @return [Integer] The number of deleted records. 77 | def empty 78 | raise NotImplementedError, NOT_IMPL_ERR 79 | end 80 | 81 | # Returns Url records that haven't yet been crawled. 82 | # 83 | # @param limit [Integer] The max number of Url's to return. 0 returns all. 84 | # @param skip [Integer] Skip n amount of Url's. 85 | # @yield [url] Given each Url object (Wgit::Url) returned from the DB. 
86 | # @return [Array] The uncrawled Urls obtained from the DB. 87 | def uncrawled_urls(limit: 0, skip: 0) 88 | raise NotImplementedError, NOT_IMPL_ERR 89 | end 90 | 91 | # Inserts or updates the object in the database. 92 | # 93 | # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update. 94 | # @return [Boolean] True if inserted, false if updated. 95 | def upsert(obj) 96 | raise NotImplementedError, NOT_IMPL_ERR 97 | end 98 | 99 | # Bulk upserts the objects in the database collection. 100 | # You cannot mix collection objs types, all must be Urls or Documents. 101 | # 102 | # @param objs [Array, Array] The objs to be 103 | # inserted/updated. 104 | # @return [Integer] The total number of newly inserted objects. 105 | def bulk_upsert(objs) 106 | raise NotImplementedError, NOT_IMPL_ERR 107 | end 108 | 109 | ###################### END OF INTERFACE METHODS ###################### 110 | 111 | private 112 | 113 | # Returns the correct Wgit::Database:Model for the given obj type. 114 | # 115 | # @param obj [Wgit::Url, Wgit::Document] The obj to obtain a model for. 116 | # @return [Hash] The obj model. 117 | def build_model(obj) 118 | assert_type(obj, [Wgit::Url, Wgit::Document]) 119 | 120 | if obj.is_a?(Wgit::Url) 121 | Wgit::Model.url(obj) 122 | else 123 | Wgit::Model.document(obj) 124 | end 125 | end 126 | 127 | # Map each DB hash object into a Wgit::Document. Each Document is yielded 128 | # if a block is given before returning the mapped Array of Documents. 129 | def map_documents(doc_hashes) 130 | doc_hashes.map do |doc| 131 | doc = Wgit::Document.new(doc) 132 | yield(doc) if block_given? 133 | doc 134 | end 135 | end 136 | 137 | # Map each DB hash object into a Wgit::Url. Each Url is yielded 138 | # if a block is given before returning the mapped Array of Urls. 139 | def map_urls(url_hashes) 140 | url_hashes.map do |url| 141 | url = Wgit::Url.new(url) 142 | yield(url) if block_given? 
143 | url 144 | end 145 | end 146 | end 147 | end 148 | -------------------------------------------------------------------------------- /lib/wgit/document_extractors.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | ### Default Document Extractors ### 4 | 5 | # Base. 6 | Wgit::Document.define_extractor( 7 | :base, 8 | "//base/@href", 9 | singleton: true, 10 | text_content_only: true 11 | ) do |base| 12 | Wgit::Url.parse?(base) if base 13 | end 14 | 15 | # Title. 16 | Wgit::Document.define_extractor( 17 | :title, 18 | "//title", 19 | singleton: true, 20 | text_content_only: true 21 | ) 22 | 23 | # Description. 24 | Wgit::Document.define_extractor( 25 | :description, 26 | '//meta[@name="description"]/@content', 27 | singleton: true, 28 | text_content_only: true 29 | ) 30 | 31 | # Author. 32 | Wgit::Document.define_extractor( 33 | :author, 34 | '//meta[@name="author"]/@content', 35 | singleton: true, 36 | text_content_only: true 37 | ) 38 | 39 | # Keywords. 40 | Wgit::Document.define_extractor( 41 | :keywords, 42 | '//meta[@name="keywords"]/@content', 43 | singleton: true, 44 | text_content_only: true 45 | ) do |keywords, _source, type| 46 | if keywords && type == :document 47 | keywords = keywords.split(",") 48 | keywords = Wgit::Utils.sanitize(keywords) 49 | end 50 | 51 | keywords 52 | end 53 | 54 | # Links. 55 | Wgit::Document.define_extractor( 56 | :links, 57 | "//a/@href", 58 | singleton: false, 59 | text_content_only: true 60 | ) do |links| 61 | links 62 | .map { |link| Wgit::Url.parse?(link) } 63 | .compact # Remove unparsable links. 64 | end 65 | 66 | # Text. 67 | Wgit::Document.define_extractor( 68 | :text, 69 | nil # doc.parser contains all HTML so omit the xpath search. 
70 | ) do |text, doc, type| 71 | if type == :document 72 | html_to_text = Wgit::HTMLToText.new(doc.parser) 73 | text = html_to_text.extract 74 | end 75 | 76 | text 77 | end 78 | -------------------------------------------------------------------------------- /lib/wgit/html_to_text.rb: -------------------------------------------------------------------------------- 1 | require_relative "utils" 2 | require_relative "assertable" 3 | require "nokogiri" 4 | 5 | module Wgit 6 | # Class used to extract the visible page text from a HTML string. 7 | # This is in turn used to set the output of a Wgit::Document#text method. 8 | class HTMLToText 9 | include Assertable 10 | 11 | # Set of text elements used to extract the visible text. 12 | # The element's display (:inline or :block) is used to delimit sentences e.g. 13 | #
foo
bar
will be extracted as ['foo', 'bar'] whereas 14 | # foobar will be extracted as ['foobar']. 15 | @text_elements = { 16 | a: :inline, 17 | abbr: :inline, 18 | address: :block, 19 | article: :block, 20 | aside: :block, 21 | b: :inline, 22 | bdi: :inline, 23 | bdo: :inline, 24 | blockquote: :block, 25 | br: :block, 26 | button: :block, # Normally inline but Wgit treats as block. 27 | caption: :block, 28 | cite: :inline, 29 | code: :inline, 30 | data: :inline, 31 | dd: :block, 32 | del: :inline, 33 | details: :block, 34 | dfn: :inline, 35 | div: :block, 36 | dl: :block, 37 | dt: :block, 38 | em: :inline, 39 | figcaption: :block, 40 | figure: :block, 41 | footer: :block, 42 | h1: :block, 43 | h2: :block, 44 | h3: :block, 45 | h4: :block, 46 | h5: :block, 47 | h6: :block, 48 | header: :block, 49 | hr: :block, 50 | i: :inline, 51 | input: :inline, 52 | ins: :block, 53 | kbd: :inline, 54 | label: :inline, 55 | legend: :block, 56 | li: :block, 57 | main: :block, 58 | mark: :inline, 59 | meter: :block, 60 | ol: :block, 61 | option: :block, 62 | output: :block, 63 | p: :block, 64 | pre: :block, 65 | q: :inline, 66 | rb: :inline, 67 | rt: :inline, 68 | ruby: :inline, 69 | s: :inline, 70 | samp: :inline, 71 | section: :block, 72 | small: :inline, 73 | span: :inline, 74 | strong: :inline, 75 | sub: :inline, 76 | summary: :block, 77 | sup: :inline, 78 | td: :block, 79 | textarea: :block, 80 | th: :block, 81 | time: :inline, 82 | u: :inline, 83 | ul: :block, 84 | var: :inline, 85 | wbr: :inline 86 | } 87 | 88 | class << self 89 | # Set of HTML elements that make up the visible text on a page. These 90 | # elements are used to initialize the Wgit::Document#text. See the 91 | # README.md for how to add to this Hash dynamically. 92 | attr_reader :text_elements 93 | end 94 | 95 | # The Nokogiri::HTML document object initialized from a HTML string. 96 | attr_reader :parser 97 | 98 | # Creates a new HTML to text extractor instance. 
99 | # 100 | # @param parser [Nokogiri::HTML4::Document] The nokogiri parser object. 101 | # @raise [StandardError] If the given parser is of an invalid type. 102 | def initialize(parser) 103 | assert_type(parser, Nokogiri::HTML4::Document) 104 | 105 | @parser = parser 106 | end 107 | 108 | # Extracts and returns the text sentences from the @parser HTML. 109 | # 110 | # @return [Array] An array of unique text sentences. 111 | def extract_arr 112 | return [] if @parser.to_s.empty? 113 | 114 | text_str = extract_str 115 | 116 | # Split the text_str into an Array of text sentences. 117 | text_str 118 | .split("\n") 119 | .map(&:strip) 120 | .reject(&:empty?) 121 | end 122 | 123 | # Extracts and returns a text string from the @parser HTML. 124 | # 125 | # @return [String] A string of text with \n delimiting sentences. 126 | def extract_str 127 | text_str = "" 128 | 129 | iterate_child_nodes(@parser) do |node, display| 130 | # Handle any special cases e.g. skip nodes we don't care about... 131 | #
 nodes should have their contents displayed exactly as is.
132 |         if node_name(node) == :pre
133 |           text_str << "\n"
134 |           text_str << node.text
135 |           next
136 |         end
137 | 
138 |         # Skip any child node of 
 since they're handled as a special case above.
139 |         next if child_of?(:pre, node)
140 | 
141 |         if node.text?
142 |           # Skip any text element that is purely whitespace.
143 |           next unless valid_text_content?(node.text)
144 |         else
145 |           # Skip a concrete node if it has other concrete child nodes as these
146 |           # will be iterated onto later.
147 |           #
148 |           # Process if node has no children or one child which is a valid text node.
149 |           next unless node.children.empty? || parent_of_text_node_only?(node)
150 |         end
151 | 
152 |         # Apply display rules deciding if a new line is needed before node.text.
153 |         add_new_line = false
154 |         prev = prev_sibling_or_parent(node)
155 | 
156 |         if node.text?
157 |           add_new_line = true unless prev && inline?(prev)
158 |         else
159 |           add_new_line = true if display == :block
160 |           add_new_line = true if prev && block?(prev)
161 |         end
162 | 
163 |         text_str << "\n" if add_new_line
164 |         text_str << format_text(node.text)
165 |       end
166 | 
167 |       text_str
168 |         .strip
169 |         .squeeze("\n")
170 |         .squeeze(" ")
171 |     end
172 | 
173 |     private
174 | 
175 |     def node_name(node)
176 |       node.name&.downcase&.to_sym
177 |     end
178 | 
179 |     def display(node)
180 |       name = node_name(node)
181 |       Wgit::HTMLToText.text_elements[name]
182 |     end
183 | 
184 |     def inline?(node)
185 |       display(node) == :inline
186 |     end
187 | 
188 |     def block?(node)
189 |       display(node) == :block
190 |     end
191 | 
192 |     # Returns the previous sibling of node or nil. Only valid text elements are
193 |     # returned i.e. non duplicates with valid text content.
194 |     def prev_sibling(node)
195 |       prev = node.previous
196 | 
197 |       return nil unless prev
198 |       return prev unless prev.text?
199 |       return prev if valid_text_node?(prev) && !contains_new_line?(prev.text)
200 |       return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty?
201 | 
202 |       prev.previous
203 |     end
204 | 
205 |     # Returns node's previous sibling, parent or nil; in that order. Only valid
206 |     # text elements are returned i.e. non duplicates with valid text content.
207 |     def prev_sibling_or_parent(node)
208 |       prev = prev_sibling(node)
209 |       return prev if prev
210 | 
211 |       node.parent
212 |     end
213 | 
214 |     def child_of?(ancestor_name, node)
215 |       node.ancestors.any? { |ancestor| node_name(ancestor) == ancestor_name }
216 |     end
217 | 
218 |     # Returns true if any of the child nodes contain a non empty :text node.
219 |     def parent_of_text_node?(node)
220 |       node.children.any? { |child| child.text? && valid_text_content?(child.text) }
221 |     end
222 | 
223 |     def parent_of_text_node_only?(node)
224 |       node.children.size == 1 && parent_of_text_node?(node)
225 |     end
226 | 
227 |     # Returns true if text is not empty having removed all new lines.
228 |     def valid_text_content?(text)
229 |       !format_text(text).empty?
230 |     end
231 | 
232 |     # Returns true if node is a text node.
233 |     # Duplicate text nodes (that follow a concrete node) are omitted.
234 |     def valid_text_node?(node)
235 |       node.text? && node.text != node.parent.text
236 |     end
237 | 
238 |     def contains_new_line?(text)
239 |       ["\n", '\\n'].any? { |new_line| text.include?(new_line) }
240 |     end
241 | 
242 |     # Remove special characters including any new lines; as semantic HTML will
243 |     # typically use 
and/or block elements to denote a line break. 244 | def format_text(text) 245 | text 246 | .encode("UTF-8", undef: :replace, invalid: :replace) 247 | .gsub("\n", "") 248 | .gsub('\\n', "") 249 | .gsub("\r", "") 250 | .gsub('\\r', "") 251 | .gsub("\f", "") 252 | .gsub('\\f', "") 253 | .gsub("\t", "") 254 | .gsub('\\t', "") 255 | .gsub("‌", "") 256 | .gsub(" ", " ") 257 | .gsub(" ", " ") 258 | .gsub(" ", " ") 259 | .gsub(" ", " ") 260 | .gsub(" ", " ") 261 | .gsub('\u00a0', " ") 262 | end 263 | 264 | # Iterate over node and it's child nodes, yielding each to &block. 265 | # Only HTMLToText.text_elements or valid :text nodes will be yielded. 266 | # Duplicate text nodes (that follow a concrete node) are omitted. 267 | def iterate_child_nodes(node, &block) 268 | display = display(node) 269 | text_node = valid_text_node?(node) 270 | 271 | yield(node, display) if display || text_node 272 | node.children.each { |child| iterate_child_nodes(child, &block) } 273 | end 274 | 275 | alias_method :extract, :extract_arr 276 | end 277 | end 278 | -------------------------------------------------------------------------------- /lib/wgit/logger.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # FYI: The default logger is set at the bottom of this file. 4 | 5 | require "logger" 6 | 7 | module Wgit 8 | # The Logger instance used by Wgit. Set your own custom logger after 9 | # requiring this file as needed. 10 | @logger = nil 11 | 12 | # Returns the current Logger instance. 13 | # 14 | # @return [Logger] The current Logger instance. 15 | def self.logger 16 | @logger 17 | end 18 | 19 | # Sets the current Logger instance. 20 | # 21 | # @param logger [Logger] The Logger instance to use. 22 | # @return [Logger] The current Logger instance having being set. 23 | def self.logger=(logger) 24 | @logger = logger 25 | end 26 | 27 | # Returns the default Logger instance. 
28 | # 29 | # @return [Logger] The default Logger instance. 30 | def self.default_logger 31 | logger = Logger.new($stdout, progname: "wgit", level: :info) 32 | logger.formatter = proc do |_severity, _datetime, progname, msg| 33 | "[#{progname}] #{msg}\n" 34 | end 35 | logger 36 | end 37 | 38 | # Sets the default Logger instance to be used by Wgit. 39 | # 40 | # @return [Logger] The default Logger instance. 41 | def self.use_default_logger 42 | @logger = default_logger 43 | end 44 | end 45 | 46 | Wgit.use_default_logger 47 | -------------------------------------------------------------------------------- /lib/wgit/model.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "./utils" 4 | 5 | module Wgit 6 | # Module used to build the Database collection objects, forming a data model. 7 | # The models produced are Hash like and therefore DB agnostic. Each model 8 | # will contain a unique field used for searching and avoiding duplicates, 9 | # this is typically a `url` field. Also contained in the model are the 10 | # search fields used in Database and Document #search calls. 11 | module Model 12 | # The default search fields used in Database and Document #search calls. 13 | # The number of matches for each field is multiplied by the field weight, 14 | # the total is the search score, used to sort the search results. 15 | # Call `Wgit::Model.set_default_search_fields` to revert to default. 16 | DEFAULT_SEARCH_FIELDS = { 17 | title: 2, 18 | description: 2, 19 | keywords: 2, 20 | text: 1 21 | }.freeze 22 | 23 | # The search fields used in Database and Document #search calls. 24 | # The number of matches for each field is multiplied by the field weight, 25 | # the total is the search score, used to sort the search results. 26 | # Call `Wgit::Model.set_default_search_fields` to revert to default.
27 | @search_fields = DEFAULT_SEARCH_FIELDS 28 | 29 | # Whether or not to include the Document#html in the #document model. 30 | @include_doc_html = false 31 | 32 | # Whether or not to include the Document#score in the #document model. 33 | @include_doc_score = false 34 | 35 | class << self 36 | # The search fields used in Database and Document #search calls. 37 | # A custom setter method is also provided for changing these fields. 38 | attr_reader :search_fields 39 | 40 | # Whether or not to include the Document#html in the #document model. 41 | attr_accessor :include_doc_html 42 | 43 | # Whether or not to include the Document#score in the #document model. 44 | attr_accessor :include_doc_score 45 | end 46 | 47 | # Sets the search fields used in Database and Document #search calls. 48 | # 49 | # You can pass the fields as an Array of Symbols which gives each field a 50 | # weight of 1 meaning all fields are considered of equal value. Or you can 51 | # pass a Hash of Symbol => Int and specify the weights yourself, allowing 52 | # you to customise the search rankings. 53 | # 54 | # Use like: 55 | # ``` 56 | # Wgit::Model.set_search_fields [:title, :text], db 57 | # => { title: 1, text: 1 } 58 | # Wgit::Model.set_search_fields({ title: 2, text: 1 }, db) 59 | # => { title: 2, text: 1 } 60 | # ``` 61 | # 62 | # If the given db (database) param responds to #search_fields= then it will 63 | # be called and given the fields to set. This should perform whatever the 64 | # database adapter needs in order to search using the given fields e.g. 65 | # creating a search index. Calling the DB enables the search_fields to be 66 | # set globally within Wgit by one method call, this one. 67 | # 68 | # @param fields [Array, Hash] The field names or 69 | # the field names with their corresponding search weights. 70 | # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If 71 | # db responds to #search_fields=, it will be called and given the fields.
72 | # @raise [StandardError] If fields is of an incorrect type. 73 | # @return [Hash] The fields and their weights. 74 | def self.set_search_fields(fields, db = nil) 75 | # We need a Hash of fields => weights (Symbols => Integers). 76 | case fields 77 | when Array # of Strings/Symbols. 78 | fields = fields.map { |field| [field.to_sym, 1] } 79 | when Hash # of Strings/Symbols and Integers. 80 | fields = fields.map { |field, weight| [field.to_sym, weight.to_i] } 81 | else 82 | raise "fields must be an Array or Hash, not a #{fields.class}" 83 | end 84 | 85 | @search_fields = fields.to_h 86 | db.search_fields = @search_fields if db.respond_to?(:search_fields=) 87 | 88 | @search_fields 89 | end 90 | 91 | # Sets the search fields used in Database and Document #search calls. 92 | # 93 | # If the given db (database) param responds to #search_fields= then it will 94 | # be called and given the fields to set. This should perform whatever the 95 | # database adapter needs in order to search using the given fields e.g. 96 | # creating a search index. Calling the DB enables the search_fields to be 97 | # set globally within Wgit by one method call, this one. 98 | # 99 | # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If 100 | # db responds to #search_fields=, it will be called and given the fields. 101 | # @return [Hash] The fields and their weights. 102 | def self.set_default_search_fields(db = nil) 103 | set_search_fields(DEFAULT_SEARCH_FIELDS, db) 104 | end 105 | 106 | # The data model for a Wgit::Url collection object and for an embedded 107 | # 'url' inside a Wgit::Document collection object. 108 | # 109 | # The unique field for this model is `model['url']`. 110 | # 111 | # @param url [Wgit::Url] The Url data object. 112 | # @return [Hash] The URL model ready for DB insertion. 113 | def self.url(url) 114 | raise "url must respond_to? 
:to_h" unless url.respond_to?(:to_h) 115 | 116 | model = url.to_h 117 | select_bson_types(model) 118 | end 119 | 120 | # The data model for a Wgit::Document collection object. 121 | # 122 | # The unique field for this model is `model['url']['url']`. 123 | # 124 | # @param doc [Wgit::Document] The Document data object. 125 | # @return [Hash] The Document model ready for DB insertion. 126 | def self.document(doc) 127 | raise "doc must respond_to? :to_h" unless doc.respond_to?(:to_h) 128 | 129 | model = doc.to_h( 130 | include_html: @include_doc_html, include_score: @include_doc_score 131 | ) 132 | model["url"] = url(doc.url) # Expand Url String into full object. 133 | 134 | select_bson_types(model) 135 | end 136 | 137 | # Common fields when inserting a record into the DB. 138 | # 139 | # @return [Hash] Insertion fields common to all models. 140 | def self.common_insert_data 141 | { 142 | date_added: Wgit::Utils.time_stamp, 143 | date_modified: Wgit::Utils.time_stamp 144 | } 145 | end 146 | 147 | # Common fields when updating a record in the DB. 148 | # 149 | # @return [Hash] Update fields common to all models. 150 | def self.common_update_data 151 | { 152 | date_modified: Wgit::Utils.time_stamp 153 | } 154 | end 155 | 156 | # Returns the model having removed non bson types (for use with MongoDB). 157 | # 158 | # @param model_hash [Hash] The model Hash to sanitize. 159 | # @return [Hash] The model Hash with non bson types removed. 160 | def self.select_bson_types(model_hash) 161 | model_hash.select { |_k, v| v.respond_to?(:bson_type) } 162 | end 163 | end 164 | end 165 | -------------------------------------------------------------------------------- /lib/wgit/response.rb: -------------------------------------------------------------------------------- 1 | module Wgit 2 | # Response class modeling a generic HTTP GET response. 3 | class Response 4 | # The underlying HTTP adapter/library response object. 
5 | attr_accessor :adapter_response 6 | 7 | # The HTML response body. 8 | attr_reader :body 9 | 10 | # The HTTP response headers. 11 | attr_reader :headers 12 | 13 | # The servers IP address. 14 | attr_accessor :ip_address 15 | 16 | # The redirections of the response. 17 | attr_reader :redirections 18 | 19 | # The HTTP response status code. 20 | attr_reader :status 21 | 22 | # The total crawl/network time for the response. 23 | attr_reader :total_time 24 | 25 | # The HTTP request URL. 26 | attr_accessor :url 27 | 28 | # Defaults some values and returns a "blank" Wgit::Response object. 29 | def initialize 30 | @body = "" 31 | @headers = {} 32 | @redirections = {} 33 | @total_time = 0.0 34 | end 35 | 36 | # Overrides String#inspect to shorten the printed output of a Response. 37 | # 38 | # @return [String] A short textual representation of this Response. 39 | def inspect 40 | "#" 41 | end 42 | 43 | # Adds time to @total_time (incrementally). 44 | # 45 | # @param time [Float] The time to add to @total_time. 46 | # @return [Float] @total_time's new value. 47 | def add_total_time(time) 48 | @total_time += time || 0.0 49 | end 50 | 51 | # Sets the HTML response body. 52 | # 53 | # @param str [String] The new HTML body. 54 | # @return [String] @body's new value. 55 | def body=(str) 56 | @body = str || "" 57 | end 58 | 59 | # Returns the HTML response body or nil (if it's empty). 60 | # 61 | # @return [String, NilClass] The HTML body or nil if empty. 62 | def body_or_nil 63 | @body.empty? ? nil : @body 64 | end 65 | 66 | # Returns whether or not a server response is absent. 67 | # 68 | # @return [Boolean] True if the status is nil or < 1, false otherwise. 69 | def failure? 70 | !success? 71 | end 72 | 73 | # Sets the headers Hash to the given value. The header keys are mapped 74 | # to snake_cased Symbols for consistency. 75 | # 76 | # @param headers [Hash] The new response headers. 77 | # @return [Hash] @headers's new value. 
78 | def headers=(headers) 79 | unless headers 80 | @headers = {} 81 | return 82 | end 83 | 84 | @headers = headers.transform_keys { |k| k.downcase.gsub("-", "_").to_sym } 85 | end 86 | 87 | # Returns whether or not the response is 404 Not Found. 88 | # 89 | # @return [Boolean] True if 404 Not Found, false otherwise. 90 | def not_found? 91 | @status == 404 92 | end 93 | 94 | # Returns whether or not the response is 200 OK. 95 | # 96 | # @return [Boolean] True if 200 OK, false otherwise. 97 | def ok? 98 | @status == 200 99 | end 100 | 101 | # Returns whether or not the response is a 3xx Redirect. 102 | # 103 | # @return [Boolean] True if 3xx Redirect, false otherwise. 104 | def redirect? 105 | return false unless @status 106 | 107 | @status.between?(300, 399) 108 | end 109 | 110 | # Returns the number of redirects this response has had. 111 | # 112 | # @return [Integer] The number of response redirects. 113 | def redirect_count 114 | @redirections.size 115 | end 116 | 117 | # Returns the size of the response body. 118 | # 119 | # @return [Integer] The response body size in bytes. 120 | def size 121 | @body.size 122 | end 123 | 124 | # Sets the HTML response status. 125 | # 126 | # @param int [Integer] The new response status. 127 | # @return [Integer] @status' new value. 128 | def status=(int) 129 | @status = int.positive? ? int : nil 130 | end 131 | 132 | # Returns whether or not a server response is present. 133 | # 134 | # @return [Boolean] True if the status is > 0, false otherwise. 135 | def success? 136 | return false unless @status 137 | 138 | @status.positive? 139 | end 140 | 141 | # Returns whether or not Wgit is banned from indexing this site. 142 | # 143 | # @return [Boolean] True if Wgit should not index this site, false 144 | # otherwise. 145 | def no_index? 
146 | headers.fetch(:x_robots_tag, "").downcase.strip == "noindex" 147 | end 148 | 149 | alias_method :code, :status 150 | alias_method :content, :body 151 | alias_method :crawl_duration, :total_time 152 | alias_method :to_s, :body 153 | alias_method :redirects, :redirections 154 | alias_method :length, :size 155 | end 156 | end 157 | -------------------------------------------------------------------------------- /lib/wgit/robots_parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Wgit 4 | # The RobotsParser class handles parsing and processing of a web servers 5 | # robots.txt file. 6 | class RobotsParser 7 | include Wgit::Assertable 8 | 9 | # Key representing the start of a comment. 10 | KEY_COMMENT = "#" 11 | # Key value separator used in robots.txt files. 12 | KEY_SEPARATOR = ":" 13 | # Key representing a user agent. 14 | KEY_USER_AGENT = "User-agent" 15 | # Key representing an allow URL rule. 16 | KEY_ALLOW = "Allow" 17 | # Key representing a disallow URL rule. 18 | KEY_DISALLOW = "Disallow" 19 | 20 | # Value representing the Wgit user agent. 21 | USER_AGENT_WGIT = :wgit 22 | # Value representing any user agent including Wgit. 23 | USER_AGENT_ANY = :* 24 | 25 | # Value representing any and all paths. 26 | PATHS_ALL = %w[/ *].freeze 27 | 28 | # Hash containing the user-agent allow/disallow URL rules. Looks like: 29 | # allow_paths: ["/"] 30 | # disallow_paths: ["/accounts", ...] 31 | attr_reader :rules 32 | 33 | # Initializes and returns a Wgit::RobotsParser instance having parsed the 34 | # robot.txt contents. 35 | # 36 | # @param contents [String, #to_s] The contents of the robots.txt file to be 37 | # parsed. 
38 | def initialize(contents) 39 | @rules = { 40 | allow_paths: Set.new, 41 | disallow_paths: Set.new 42 | } 43 | 44 | assert_respond_to(contents, :to_s) 45 | parse(contents.to_s) 46 | end 47 | 48 | # Overrides String#inspect to shorten the printed output of a Parser. 49 | # 50 | # @return [String] A short textual representation of this Parser. 51 | def inspect 52 | "#" 53 | end 54 | 55 | # Returns the allow paths/rules for this parser's robots.txt contents. 56 | # 57 | # @return [Array] The allow paths/rules to follow. 58 | def allow_paths 59 | @rules[:allow_paths].to_a 60 | end 61 | 62 | # Returns the disallow paths/rules for this parser's robots.txt contents. 63 | # 64 | # @return [Array] The disallow paths/rules to follow. 65 | def disallow_paths 66 | @rules[:disallow_paths].to_a 67 | end 68 | 69 | # Returns whether or not there are rules applying to Wgit. 70 | # 71 | # @return [Boolean] True if there are rules for Wgit to follow, false 72 | # otherwise. 73 | def rules? 74 | allow_rules? || disallow_rules? 75 | end 76 | 77 | # Returns whether or not there are allow rules applying to Wgit. 78 | # 79 | # @return [Boolean] True if there are allow rules for Wgit to follow, 80 | # false otherwise. 81 | def allow_rules? 82 | @rules[:allow_paths].any? 83 | end 84 | 85 | # Returns whether or not there are disallow rules applying to Wgit. 86 | # 87 | # @return [Boolean] True if there are disallow rules for Wgit to follow, 88 | # false otherwise. 89 | def disallow_rules? 90 | @rules[:disallow_paths].any? 91 | end 92 | 93 | # Returns whether or not Wgit is banned from indexing this site. 94 | # 95 | # @return [Boolean] True if Wgit should not index this site, false 96 | # otherwise. 97 | def no_index? 98 | @rules[:disallow_paths].any? { |path| PATHS_ALL.include?(path) } 99 | end 100 | 101 | private 102 | 103 | # Parses the file contents and sets @rules. 
104 | def parse(contents) 105 | user_agents = [] 106 | new_block = false 107 | 108 | contents.split("\n").each do |line| 109 | line.strip! 110 | next if line.empty? || line.start_with?(KEY_COMMENT) 111 | 112 | # A user agent block is denoted by N User-agent's followed by N 113 | # Allow/Disallow's. After which a new block is formed from scratch. 114 | if start_with_any_case?(line, KEY_USER_AGENT) 115 | if new_block 116 | user_agents = [] 117 | new_block = false 118 | end 119 | user_agents << remove_key(line, KEY_USER_AGENT).downcase.to_sym 120 | else 121 | new_block = true 122 | end 123 | 124 | if start_with_any_case?(line, KEY_ALLOW) 125 | append_allow_rule(user_agents, line) 126 | elsif start_with_any_case?(line, KEY_DISALLOW) 127 | append_disallow_rule(user_agents, line) 128 | elsif !start_with_any_case?(line, KEY_USER_AGENT) 129 | Wgit.logger.debug("Skipping unsupported robots.txt line: #{line}") 130 | end 131 | end 132 | end 133 | 134 | # Implements start_with? but case insensitive. 135 | def start_with_any_case?(str, prefix) 136 | str.downcase.start_with?(prefix.downcase) 137 | end 138 | 139 | # Returns line with key removed (if present). Otherwise line is returned 140 | # as given. 141 | def remove_key(line, key) 142 | return line unless start_with_any_case?(line, key) 143 | return line unless line.count(KEY_SEPARATOR) == 1 144 | 145 | segs = line.split(KEY_SEPARATOR) 146 | return "" if segs.size == 1 147 | 148 | segs.last.strip 149 | end 150 | 151 | # Don't append * or /, as this means all paths, which is the same as no 152 | # allow_paths when passed to Wgit::Crawler. 
153 | def append_allow_rule(user_agents, line) 154 | return unless wgit_user_agent?(user_agents) 155 | 156 | path = remove_key(line, KEY_ALLOW) 157 | path = parse_special_syntax(path) 158 | return if PATHS_ALL.include?(path) 159 | 160 | @rules[:allow_paths] << path 161 | end 162 | 163 | def append_disallow_rule(user_agents, line) 164 | return unless wgit_user_agent?(user_agents) 165 | 166 | path = remove_key(line, KEY_DISALLOW) 167 | path = parse_special_syntax(path) 168 | @rules[:disallow_paths] << path 169 | end 170 | 171 | def wgit_user_agent?(user_agents) 172 | user_agents.any? do |agent| 173 | [USER_AGENT_ANY, USER_AGENT_WGIT].include?(agent.downcase) 174 | end 175 | end 176 | 177 | def parse_special_syntax(path) 178 | # Remove $ e.g. "/blah$" becomes "/blah" 179 | path = path.gsub("$", "") 180 | 181 | # Remove any inline comments e.g. "/blah # comment" becomes "/blah" 182 | path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}") 183 | 184 | # Replace an empty path with * e.g. "Allow: " becomes "Allow: *" 185 | path = "*" if path.empty? 186 | 187 | path 188 | end 189 | 190 | alias_method :paths, :rules 191 | alias_method :banned?, :no_index? 192 | end 193 | end 194 | -------------------------------------------------------------------------------- /lib/wgit/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page 4 | # contents for later use. 5 | # 6 | # @author Michael Telford 7 | module Wgit 8 | # The current gem version of Wgit. 9 | VERSION = "0.12.0" 10 | 11 | # Returns the current gem version of Wgit as a String. 12 | def self.version 13 | VERSION 14 | end 15 | 16 | # Returns the current gem version in a presentation String. 
17 | def self.version_str 18 | "wgit v#{VERSION}" 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /load.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Development script which loads (all changes to) the code when called. 4 | # Note this script doesn't establish a connection to the database. 5 | 6 | load "lib/wgit/version.rb" 7 | load "lib/wgit/logger.rb" 8 | load "lib/wgit/assertable.rb" 9 | load "lib/wgit/utils.rb" 10 | load "lib/wgit/url.rb" 11 | load "lib/wgit/html_to_text.rb" 12 | load "lib/wgit/document.rb" 13 | load "lib/wgit/document_extractors.rb" 14 | load "lib/wgit/crawler.rb" 15 | load "lib/wgit/model.rb" 16 | load "lib/wgit/database/database.rb" 17 | load "lib/wgit/database/database_adapter.rb" 18 | load "lib/wgit/database/adapters/mongo_db.rb" 19 | load "lib/wgit/database/adapters/in_memory.rb" 20 | load "lib/wgit/robots_parser.rb" 21 | load "lib/wgit/indexer.rb" 22 | load "lib/wgit/dsl.rb" 23 | load "lib/wgit/base.rb" 24 | load "lib/wgit/core_ext.rb" 25 | 26 | include Wgit # Remove the name space around code (for development purposes). 27 | include DSL 28 | include Assertable 29 | -------------------------------------------------------------------------------- /test/helpers/database_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "database_test_data" 4 | 5 | # Helper module used to manipulate any database adapter. This module should 6 | # be included in other DB helper modules. 
To do so, you must implement the 7 | # following underlying methods: 8 | # 9 | # db # Returns a connected database adapter instance 10 | # empty_db # Empties the url and document collections 11 | # seed_urls(url_hashes) # Seeds the given url hashes 12 | # seed_docs(doc_hashes) # Seeds the given document hashes 13 | # url?(url_hash) # Returns true if the given url hash exists 14 | # doc?(url_hash) # Returns true if the given document hash exists 15 | # 16 | # The above method implementations should be done using the raw client for 17 | # your DB adapter, not the Wgit adapter class that you're testing; this way 18 | # the helpers won't fail before your DB tests fail. 19 | module DatabaseHelper 20 | def self.included(_base) 21 | @@urls = [] 22 | @@docs = [] 23 | end 24 | 25 | # Seed what's in the block, comprising of url and doc method calls 26 | # (from this module). An integer can be used to specify how many default 27 | # objects should be seeded, defaults to 1; or provide your own Wgit:Url and 28 | # Wgit:Document instances (which are passed through Wgit::Model). Hashes are 29 | # also supported and will be merged with Wgit::Model.common_insert_data. 30 | # 31 | # Returns the number of seeded/inserted documents in the DB. 32 | # 33 | # Code example: 34 | # seed do 35 | # url(Wgit::Url | Hash) 36 | # doc(Wgit::Document | Hash) 37 | # urls 3 # Seeds 3 of the default dev url records. 38 | # doc # Seeds 1 of the default dev doc records. 39 | # end 40 | def seed(&block) 41 | raise "Must provide a block" unless block_given? 42 | 43 | @@urls.clear 44 | @@docs.clear 45 | 46 | # &block populates the @@urls and @@docs arrays. 47 | instance_eval(&block) 48 | 49 | seed_urls(@@urls) unless @@urls.empty? 50 | seed_docs(@@docs) unless @@docs.empty? 51 | 52 | @@urls.count + @@docs.count 53 | end 54 | 55 | private 56 | 57 | # DSL method used within the block passed to DatabaseHelper#seed. 58 | # Seeds one or more Wgit::Urls into the DB. 
59 | def url(url_or_int = 1) 60 | case url_or_int 61 | when String 62 | parsed_url = Wgit::Url.parse(url_or_int) 63 | append_url(parsed_url) 64 | when Array 65 | url_or_int.each { |url| append_url(url) } 66 | when Integer 67 | url_or_int.times { @@urls << DatabaseTestData.url } 68 | else 69 | raise "Invalid data type: #{url_or_int.class}" 70 | end 71 | end 72 | 73 | # DSL method used within the block passed to DatabaseHelper#seed. 74 | # Seeds one or more Wgit::Documents into the DB. 75 | def doc(doc_or_int = 1) 76 | case doc_or_int 77 | when Wgit::Document 78 | append_doc(doc_or_int) 79 | when Array 80 | doc_or_int.each { |doc| append_doc(doc) } 81 | when Integer 82 | doc_or_int.times { @@docs << DatabaseTestData.doc } 83 | else 84 | raise "Invalid data type: #{url_or_int.class}" 85 | end 86 | end 87 | 88 | # Appends a Url to @@urls. 89 | def append_url(url) 90 | model_hash = case url 91 | when Wgit::Url 92 | Wgit::Model.url(url) 93 | when Hash 94 | url 95 | else 96 | raise "Invalid data type: #{url.class}" 97 | end 98 | 99 | @@urls << model_hash.merge(Wgit::Model.common_insert_data) 100 | end 101 | 102 | # Appends a Document to @@docs. 103 | def append_doc(doc) 104 | model_hash = case doc 105 | when Wgit::Document 106 | Wgit::Model.document(doc) 107 | when Hash 108 | doc 109 | else 110 | raise "Invalid data type: #{doc.class}" 111 | end 112 | 113 | @@docs << model_hash.merge(Wgit::Model.common_insert_data) 114 | end 115 | 116 | alias_method :urls, :url 117 | alias_method :docs, :doc 118 | end 119 | -------------------------------------------------------------------------------- /test/helpers/in_memory_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "database_test_data" 4 | require_relative "database_helper" 5 | require "mongo" 6 | 7 | # Helper class used to manipulate the InMemory database. 
8 | module InMemoryHelper 9 | include DatabaseHelper 10 | 11 | # Returns the connected InMemory instance. 12 | def db 13 | @db ||= Wgit::Database::InMemory.new 14 | end 15 | 16 | # Deletes everything in the urls and documents collections. 17 | def empty_db 18 | # Normally you shouldn't call the adapter class but this just sets new 19 | # concurrent arrays to the instance vars, so can't really go wrong. 20 | db.send(:initialize_store) 21 | end 22 | 23 | # Seed an Array of url Hashes into the database. 24 | def seed_urls(url_hashes) 25 | url_hashes.each { |url_h| db.url_hashes << url_h } 26 | end 27 | 28 | # Seed an Array of document Hashes into the database. 29 | def seed_docs(doc_hashes) 30 | doc_hashes.each { |doc_h| db.doc_hashes << doc_h } 31 | end 32 | 33 | # Returns if the url_hash/record exists in the database. 34 | def url?(url_hash) 35 | db.url_hashes.any? { |url| url == url_hash } 36 | end 37 | 38 | # Returns if the doc_hash/record exists in the database. 39 | def doc?(doc_hash) 40 | db.doc_hashes.any? { |doc| doc == doc_hash } 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /test/helpers/mongo_db_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "database_test_data" 4 | require_relative "database_helper" 5 | require "mongo" 6 | 7 | # Helper class used to manipulate the MongoDB database. 8 | module MongoDBHelper 9 | include DatabaseHelper 10 | 11 | # Returns the connected MongoDB instance. 12 | def db 13 | @db ||= Wgit::Database::MongoDB.new 14 | end 15 | 16 | # Deletes everything in the urls and documents collections. 17 | def empty_db 18 | db.client[:urls].delete_many({}) 19 | db.client[:documents].delete_many({}) 20 | end 21 | 22 | # Seed an Array of url Hashes into the database. 
23 | def seed_urls(url_hashes) 24 | db.client[:urls].insert_many(url_hashes) 25 | rescue StandardError => e 26 | err_msg = e.respond_to?(:result) ? e.result["writeErrors"] : e.message 27 | raise "Write to DB failed - remember that both urls and docs won't \ 28 | accept duplicate urls. Exception details: #{err_msg}" 29 | end 30 | 31 | # Seed an Array of document Hashes into the database. 32 | def seed_docs(doc_hashes) 33 | db.client[:documents].insert_many(doc_hashes) 34 | rescue StandardError => e 35 | err_msg = e.respond_to?(:result) ? e.result["writeErrors"] : e.message 36 | raise "Write to DB failed - remember that both urls and docs won't \ 37 | accept duplicate urls. Exception details: #{err_msg}" 38 | end 39 | 40 | # Returns if the url_hash/record exists in the DB. 41 | # 42 | # Different from Wgit::Database::MongoDB#url? because it asserts the full 43 | # url_hash, not just the presence of the unique 'url' field. 44 | def url?(url_hash) 45 | db.client[:urls].find(url_hash).any? 46 | end 47 | 48 | # Returns if the doc_hash/record exists in the DB. 49 | # 50 | # Different from Wgit::Database::MongoDB#doc? because it asserts the full 51 | # doc_hash, not just the presence of the unique 'url' field. 52 | def doc?(doc_hash) 53 | db.client[:documents].find(doc_hash).any? 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /test/helpers/test_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | $VERBOSE = nil # Suppress ruby warnings during the test run. 4 | 5 | # Require third party gems. 6 | require "maxitest/autorun" 7 | require "maxitest/threads" # Fail on orphaned test threads. 8 | require "maxitest/timeout" 9 | require "logger" 10 | require "dotenv" 11 | require "byebug" # Call 'byebug' anywhere in the code to debug. 12 | 13 | # Require any test helpers. 14 | require_relative "../mock/fixtures" # Mock HTTP responses. 
15 | require_relative "database_test_data" 16 | require_relative "database_helper" 17 | require_relative "mongo_db_helper" 18 | require_relative "in_memory_helper" 19 | 20 | # Require all code being tested, once, in one place. 21 | require_relative "../../lib/wgit" 22 | require_relative "../../lib/wgit/core_ext" 23 | 24 | Maxitest.timeout = 60 # Fail test after N seconds. 25 | Wgit.logger.level = Logger::WARN # Remove STDOUT noise from test run. 26 | 27 | # Test helper class for unit tests. Should be inherited from by all test cases. 28 | class TestHelper < Minitest::Test 29 | # Fires every time this class is inherited from. 30 | def self.inherited(child) 31 | Dotenv.load # Set the DB connection string from the ENV. 32 | super # Run the tests. 33 | end 34 | 35 | # Any helper methods go below, these will be callable from unit tests. 36 | 37 | # Flunk (fail) the test if an exception is raised by the given block. 38 | def refute_exception 39 | yield 40 | rescue StandardError => e 41 | flunk e.message 42 | end 43 | end 44 | 45 | # Override type #inspect methods for nicer test failure messages. 46 | class Wgit::Url 47 | def inspect 48 | "\"#{self}\"" 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /test/mock/fixtures.rb: -------------------------------------------------------------------------------- 1 | # See `toys -s save` for tasks on saving a web fixture to disk; 2 | # then mock its HTTP response below so it's available to crawl in the tests 3 | # using Wgit. Note that you can mock a response without a saved fixture. 4 | 5 | require_relative "webmock" # DSL for mocking HTTP responses. 6 | 7 | # Custom mock responses, outside of serving a saved fixture from disk.
8 | stub_page "https://search.yahoo.com" 9 | stub_page "https://www.google.co.uk" 10 | stub_page "http://www.bing.com" 11 | stub_redirect "http://twitter.com", "https://twitter.com" 12 | stub_page "https://twitter.com" 13 | stub_redirect "https://cms.org", "https://example.com/de/page1" 14 | stub_redirect "https://example.com/de/page1", "/de/folder/page2#blah-on-page2" 15 | stub_page "https://example.com/de/folder/page2#blah-on-page2" 16 | stub_redirect "http://redirect.com/1", "http://redirect.com/2" # First redirect. 17 | stub_redirect "http://redirect.com/2", "http://redirect.com/3" # Second redirect. 18 | stub_redirect "http://redirect.com/3", "http://redirect.com/4" # Third redirect. 19 | stub_redirect "http://redirect.com/4", "http://redirect.com/5" # Fourth redirect. 20 | stub_redirect "http://redirect.com/5", "http://redirect.com/6" # Fifth redirect. 21 | stub_redirect "http://redirect.com/6", "http://redirect.com/7" # Sixth redirect. 22 | stub_page "http://redirect.com/7", fixture: "blank" 23 | stub_page "https://www.xn--ber-goa.com/about" 24 | stub_redirect "http://test-site.com/sneaky", "https://motherfuckingwebsite.com/" 25 | stub_page "http://test-site.com/public/records?q=username", fixture: "test-site.com/public/records" 26 | stub_page "http://test-site.com/public/records#top", fixture: "test-site.com/public/records" 27 | stub_redirect "http://test-site.com/ftp", "http://ftp.test-site.com" 28 | stub_not_found "http://ftp.test-site.com" 29 | stub_redirect "http://test-site.com/smtp", "http://smtp.test-site.com" 30 | stub_page "http://smtp.test-site.com" 31 | stub_redirect "http://myserver.com", "http://www.myserver.com" 32 | stub_redirect "http://www.myserver.com", "http://test-site.com" 33 | stub_timeout "http://doesnt_exist/" 34 | stub_timeout "http://test-site.com/doesntexist" 35 | stub_page "http://odd-extension.com/other.html5", body: "

Hello world

" 36 | stub_page "http://fonts.googleapis.com" 37 | stub_page "https://blank-site-1.com", fixture: "blank" 38 | stub_page "https://blank-site-2.com", fixture: "blank" 39 | stub_page "https://blank-site-3.com", fixture: "blank" 40 | stub_page "http://blank-site-4.com", fixture: "blank" 41 | stub_page "https://blank-site-5.com", fixture: "blank" 42 | stub_redirect "http://blank-site-2.com", "https://blank-site-2.com" 43 | stub_redirect "http://blank-site-2.com/robots.txt", "https://blank-site-2.com/robots.txt" 44 | 45 | # Mock a website whose's content gets updated (between indexes). 46 | stub_request(:get, "http://www.content-updates.com") 47 | .to_return({ body: "Original content" }, { body: "Updated content" }) 48 | 49 | # Match all *.jpg URL's for belfastpilates.co.uk. 50 | stub_request(:get, Regexp.new("http://www.belfastpilates.co.uk/(.*).(?:jpg|jpeg)")) 51 | 52 | # Mock robots.txt requests. 53 | stub_request(:get, "http://robots.txt.com/account") 54 | .to_return(status: 200, headers: { 'X-Robots-Tag': "noindex" }, body: "

Robots account

") 55 | 56 | stub_robots_txt_not_found [ 57 | "http://txti.es", 58 | "http://quotes.toscrape.com", 59 | "http://test-site.com", 60 | "https://motherfuckingwebsite.com", 61 | "http://link-to-robots-txt.com", 62 | "https://external-link-portal.com", 63 | "https://blank-site-1.com", 64 | "https://blank-site-2.com", 65 | "https://blank-site-3.com", 66 | "http://blank-site-4.com", 67 | "https://blank-site-5.com", 68 | "http://redirect.com", 69 | "http://www.content-updates.com" 70 | ] 71 | 72 | # Mock responses based on individual files saved to disk. The URL should match 73 | # the file name (minus the scheme prefix and .html extension suffix). 74 | pages = [ 75 | "https://motherfuckingwebsite.com/", 76 | "https://wikileaks.org/What-is-Wikileaks.html", 77 | "https://www.facebook.com", 78 | "https://static.xx.fbcdn.net/rsrc.php/v3/y1/l/0,cross/NvZ4mNTW3Fd.css", 79 | "http://altitudejunkies.com", 80 | "http://www.mountainmadness.com", 81 | "http://www.adventureconsultants.com", 82 | "http://odd-extension.com", 83 | "http://link-to-robots-txt.com", 84 | "https://external-link-portal.com/" 85 | ] 86 | 87 | # Mock sites based on a collection of files saved in a directory. 88 | # NOTE: URL's listed below MUST NOT have a path, only a scheme and host. 89 | sites = [ 90 | "http://txti.es/", 91 | "http://www.belfastpilates.co.uk/", 92 | "http://test-site.com", 93 | "http://quotes.toscrape.com/", 94 | "http://robots.txt.com", 95 | "http://disallow-all.com" 96 | ] 97 | 98 | stub_fixtures pages, sites 99 | -------------------------------------------------------------------------------- /test/mock/fixtures/altitudejunkies.com.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Altitude Junkies | High Altitude Mountaineering Expeditions 7 | 8 | 9 |
10 | 13 | 26 | 27 | 28 |
29 | 30 | 56 | 57 |

Altitude Junkies is established as one of the premier outfitters offering professionally managed mountaineering expeditions.

58 | 59 |

Our expeditions are non-guided like traditional commercial expeditions with certified IFMGA guides and a 4:1 climber to guide ratio. On our expeditions we have a single expedition leader who manages the expedition logistics and staff. In the Cordillera Blanca of Peru where we practice roped glacier travel we may have two expedition leaders.

60 | 61 |

All our expedition leaders are professional climbers and have reached the summits of the mountains that they lead the expeditions to, some numerous times. They climb year round worldwide and have the experience needed to make the expedition successful and safe. All of our leaders have extensive experience in the effects and medical treatment of high altitude ailments.

62 | 63 |

We primarily focus on the less crowded 8,000-meter and more challenging technical 6,000-meter peaks. We have organized many expeditions to 8,000-meter peaks including Everest, Lhotse, Dhaulagiri, Makalu and Manaslu in Nepal; Gasherbrum I, Gasherbrum II and Broad Peak in Pakistan; and Everest, Cho Oyu and Shishapangma in Tibet.

64 | 65 |

During the summer months we switch from climbing in the Himalayas to the lower peaks of Cordillera Blanca in Peru. We offer an expeditions to some of the Cordillera Blancas most popular peaks as well as bespoke expeditions to any Cordillera Blanca peak.

66 | 67 |

We do not advertise the lowest price for our respective expeditions to form a team of twenty or more climbers for profitability. We prefer to keep our expeditions to a maximum team size of eight climbers plus leaders as we feel a smaller expedition allows for a more personal experience on the mountain. The quality of our food and services at our respective base camps is considered one of the premier operations in the Himalayas and better than most of the high cost expeditions. Not having several western guides means that our expeditions are more affordable than most for qualified climbers. We only run professional expeditions and our focus is on safety and quality rather than the quantity of climbers joining our expeditions.

68 | 69 |

To maximize our chances of summit success we use Nepalese Sherpa on our Nepal Himalayan expeditions. Our Sherpa team have climbed with us over many years and are under the directorship of our UIAGM certified guide Sirdar, Pasang Ongcho Sherpa. Base and advanced base camps are well staffed with our Sherpa cooks and kitchen assistants. They have all been trained by professional western chefs to produce a varied menu and are knowledgeable of food safety and safe hygiene practices. Our base camps are stocked with fresh local and imported foods for a varied and nutrient conscious diet.

70 | 71 |

The equipment used at base camp and high altitude camps, is of the highest quality and replaced on a regular basis for safety. We only use the finest high altitude tents available, made by Black Diamond and Mountain Hardwear, to withstand the extreme weather conditions that are encountered in the high mountains.

72 | 73 |

In base camp we provide showers, heated and carpeted dining tents, and solar panels for lighting and the charging of electronic devices. All of our expeditions have a comprehensive medical chest, medical oxygen, portable altitude chamber and personnel who are familiar with their usage. Each leader, Sherpa and climber has their own personal avalanche beacon and two-way radio on the mountain and we have base station radios at the respective base camps.

74 | 75 |

All of our Nepal expeditions use private helicopter transport whenever possible over fixed wing flights. We believe helicopters are a safer option, especially when the weather is marginal for the Lukla flights. Helicopters also allow us to reach remote areas which are off limits to fixed wing aircraft. This is beneficial to make our expedition durations shorter.

76 | 77 |

To maximize our summit success on our expeditions we use a professional meteorological service for up to the minute weather forecasts.

78 | 79 |

Himalayan peaks are a serious undertaking and climbers need to be aware there are certain risks that are out of the control of Altitude Junkies. We prefer to describe our expeditions as professionally managed rather than guided. A true guided expedition is only where the guides have full UIAGM certification (alpine, rock and ski certified), which is the only internationally recognized qualification for mountain guides and there is a 2:1 or smaller guide to climber ratio. If you need to be guided, look for guides with full UIAGM certification.

80 | 81 | 82 | 83 | 84 | 85 |

As the name suggests, Altitude Junkies organizes expeditions for like-minded climbers who are addicted to climbing the world's high mountains. Come and get high, climb with the Altitude Junkies.

86 | 87 |

Photo credit: Phil "Disco" Huddy - Cholatse 88 |

89 |
90 | Contact us: info@altitudejunkies.com
91 | 92 | 93 | 94 | 95 | 96 |
97 |
    98 | 99 |
  • Mountain house logo
  • 100 |
  • Summit logo
  • 101 |
  • Brunton logo
  • 102 |
  • Suunto logo
  • 103 |
  • Mountain hard wear logo
  • 104 |
  • Black diamond logo
  • 105 |
  • Salomon logo
  • 106 |
  • Pelican logo
  • 107 |
  • Julbo logo
  • 108 |
  • Msr logo
  • 109 |
110 | 111 | 128 |
129 | 130 | -------------------------------------------------------------------------------- /test/mock/fixtures/anchor_display.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
7 | Contact Contact2 8 | Contact3 9 |
10 | 11 | -------------------------------------------------------------------------------- /test/mock/fixtures/blank.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Blank Page 5 | 6 | 7 |

Blank Page

8 |

This page is intentionally blank with very little HTML.

9 | 10 | 11 | -------------------------------------------------------------------------------- /test/mock/fixtures/disallow-all.com/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Disallow All 5 | 6 | 7 |

Disallow All Test Site

8 |

About page, which shouldn't be indexed.

9 | Home 10 | 11 | 12 | -------------------------------------------------------------------------------- /test/mock/fixtures/disallow-all.com/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Disallow All 5 | 6 | 7 |

Disallow All Test Site

8 |

This website disallows wgit from indexing any of its content, including this page. This is done via the robots.txt page.

9 | About 10 | 11 | 12 | -------------------------------------------------------------------------------- /test/mock/fixtures/disallow-all.com/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: googlebot 2 | Crawl-delay: 4 3 | Allow: * 4 | 5 | User-agent: wgit 6 | Disallow: / 7 | -------------------------------------------------------------------------------- /test/mock/fixtures/div_display.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
foo
bar
6 | 7 | 8 | -------------------------------------------------------------------------------- /test/mock/fixtures/external-link-portal.com.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | External Link Portal 5 | 6 | 7 |

External Link Portal

8 |

This site contains external links to other site fixtures.

9 |

External Links

10 | Blank Site 1 11 | Blank Site 2 12 | Blank Site 3 13 | Blank Site 4 14 | Blank Site 5 15 | 16 | 17 | -------------------------------------------------------------------------------- /test/mock/fixtures/getting_started.html: -------------------------------------------------------------------------------- 1 | 2 | 4 |

Running the following Wgit code will programmatically configure your database:

5 |
db = Wgit::Database.new '<connection_string>'
10 | 
11 | db.create_collections
12 | db.create_unique_indexes
13 | db.text_index = Wgit::Database::DEFAULT_TEXT_INDEX
14 |

Or take a look at the mongo_init.js file for the equivalent Javascript commands.

15 |

Note: The text search index lists all document fields to be searched by MongoDB when calling Wgit::Database#search. Therefore, you should append this list with any other fields that you want searched. For example, if you extend the API then you might want to search your new fields in the database by adding them to the index above. This can be done programmatically with:

16 | 17 | 18 | -------------------------------------------------------------------------------- /test/mock/fixtures/link-to-robots-txt.com.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Link to Robots.txt 5 | 6 | 7 |

Link to Robots.txt Test Site

8 |

This site contains an external link to the robots.txt test site.

9 | Robots.txt Test Site 10 | 11 | 12 | -------------------------------------------------------------------------------- /test/mock/fixtures/motherfuckingwebsite.com.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Motherfucking Website 10 | 11 | 12 | 13 |
14 |

This is a motherfucking website.

15 | 16 |
17 | 18 |

Seriously, what the fuck else do you want?

19 | 20 |

You probably build websites and think your shit is special. You think your 13 megabyte parallax-ative home page is going to get you some fucking Awwward banner you can glue to the top corner of your site. You think your 40-pound jQuery file and 83 polyfills give IE7 a boner because it finally has box-shadow. Wrong, motherfucker. Let me describe your perfect-ass website:

21 | 22 |
    23 |
  • Shit's lightweight and loads fast
  • 24 |
  • Fits on all your shitty screens
  • 25 |
  • Looks the same in all your shitty browsers
  • 26 |
  • The motherfucker's accessible to every asshole that visits your site
  • 27 |
  • Shit's legible and gets your fucking point across (if you had one instead of just 5mb pics of hipsters drinking coffee)
  • 28 |
29 | 30 |

Well guess what, motherfucker:

31 | 32 |

You. Are. Over-designing. Look at this shit. It's a motherfucking website. Why the fuck do you need to animate a fucking trendy-ass banner flag when I hover over that useless piece of shit? You spent hours on it and added 80 kilobytes to your fucking site, and some motherfucker jabbing at it on their iPad with fat sausage fingers will never see that shit. Not to mention blind people will never see that shit, but they don't see any of your shitty shit.

33 | 34 |

You never knew it, but this is your perfect website. Here's why.

35 | 36 |

It's fucking lightweight

37 | 38 |

This entire page weighs less than the gradient-meshed facebook logo on your fucking Wordpress site. Did you seriously load 100kb of jQuery UI just so you could animate the fucking background color of a div? You loaded all 7 fontfaces of a shitty webfont just so you could say "Hi." at 100px height at the beginning of your site? You piece of shit.

39 | 40 |

It's responsive

41 | 42 |

You dumbass. You thought you needed media queries to be responsive, but no. Responsive means that it responds to whatever motherfucking screensize it's viewed on. This site doesn't care if you're on an iMac or a motherfucking Tamagotchi.

43 | 44 |

It fucking works

45 | 46 |

Look at this shit. You can read it ... that is, if you can read, motherfucker. It makes sense. It has motherfucking hierarchy. It's using HTML5 tags so you and your bitch-ass browser know what the fuck's in this fucking site. That's semantics, motherfucker.

47 | 48 |

It has content on the fucking screen. Your site has three bylines and link to your dribbble account, but you spread it over 7 full screens and make me click some bobbing button to show me how cool the jQuery ScrollTo plugin is.

49 | 50 |

Cross-browser compatibility? Load this motherfucker in IE6. I fucking dare you.

51 | 52 |

This is a website. Look at it. You've never seen one before.

53 | 54 |

Like the man who's never grown out his beard has no idea what his true natural state is, you have no fucking idea what a website is. All you have ever seen are shitty skeuomorphic bastardizations of what should be text communicating a fucking message. This is a real, naked website. Look at it. It's fucking beautiful.

55 | 56 |

Yes, this is fucking satire, you fuck

57 | 58 |

I'm not actually saying your shitty site should look like this. What I'm saying is that all the problems we have with websites are ones we create ourselves. Websites aren't broken by default, they are functional, high-performing, and accessible. You break them. You son-of-a-bitch.

59 | 60 |
"Good design is as little design as possible."
61 | - some German motherfucker 62 |
63 | 64 |
65 | 66 |

Epilogue

67 |

From the philosophies expressed (poorly) above, txti was created. You should try it today to make your own motherfucking websites.

68 | 69 | 70 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /test/mock/fixtures/nearest_fragment.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |

Hello1

7 |
8 | Anchor1 9 |

Hello2

10 |
11 |

Hello3

12 |
13 |

Hello4

14 |
15 | Anchor2 16 | Anchor3 17 |

Hello5

18 |
19 | Anchor4 20 |

Hello6

21 |
22 |
23 |
24 |

Hello7

25 |
26 |
27 | Anchor5 28 |
29 |
30 |
31 | 32 | Anchor6 33 |
34 |

Hello8

35 |
36 |
37 | Hello9 38 |

39 | Anchor8 40 |

41 |
42 |

Hello10

43 |
44 | 45 | 46 | -------------------------------------------------------------------------------- /test/mock/fixtures/not_found.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Not Found 5 | 6 | 7 |

Not Found

8 |

The page you're looking for cannot be found.

9 | 10 | -------------------------------------------------------------------------------- /test/mock/fixtures/odd-extension.com.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Some other page with an odd extension 5 | 6 | 7 | -------------------------------------------------------------------------------- /test/mock/fixtures/php.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A PHP Webpage 6 | 7 | 8 | 9 | 10 | 11 | 12 |

Welcome to a PHP Webpage

13 |

All internal page links below should contain a .php extension.

14 | About 15 | Sunglasses 16 | Foo bar on this page (index.php) 17 | External Site - Web Dev Dot Com 18 | 19 | 20 | -------------------------------------------------------------------------------- /test/mock/fixtures/quotes.toscrape.com/tag/humor.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Quotes to Scrape 6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 |

14 | Quotes to Scrape 15 |

16 |
17 |
18 |

19 | 20 | Login 21 | 22 |

23 |
24 |
25 | 26 | 27 |

Viewing tag: humor

28 | 29 |
30 |
31 | 32 |
33 | ���The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.��� 34 | by 35 | (about) 36 | 37 |
38 | Tags: 39 | 40 | 41 | aliteracy 42 | 43 | books 44 | 45 | classic 46 | 47 | humor 48 | 49 |
50 |
51 | 52 |
53 | ���A day without sunshine is like, you know, night.��� 54 | by 55 | (about) 56 | 57 |
58 | Tags: 59 | 60 | 61 | humor 62 | 63 | obvious 64 | 65 | simile 66 | 67 |
68 |
69 | 70 |
71 | ���Anyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.��� 72 | by 73 | (about) 74 | 75 |
76 | Tags: 77 | 78 | 79 | humor 80 | 81 | religion 82 | 83 |
84 |
85 | 86 |
87 | ���Beauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.��� 88 | by 89 | (about) 90 | 91 |
92 | Tags: 93 | 94 | 95 | humor 96 | 97 |
98 |
99 | 100 |
101 | ���All you need is love. But a little chocolate now and then doesn't hurt.��� 102 | by 103 | (about) 104 | 105 |
106 | Tags: 107 | 108 | 109 | chocolate 110 | 111 | food 112 | 113 | humor 114 | 115 |
116 |
117 | 118 |
119 | ���Remember, we're madly in love, so it's all right to kiss me anytime you feel like it.��� 120 | by 121 | (about) 122 | 123 |
124 | Tags: 125 | 126 | 127 | humor 128 | 129 |
130 |
131 | 132 |
133 | ���Some people never go crazy. What truly horrible lives they must lead.��� 134 | by 135 | (about) 136 | 137 |
138 | Tags: 139 | 140 | 141 | humor 142 | 143 |
144 |
145 | 146 |
147 | ���The trouble with having an open mind, of course, is that people will insist on coming along and trying to put things in it.��� 148 | by 149 | (about) 150 | 151 |
152 | Tags: 153 | 154 | 155 | humor 156 | 157 | open-mind 158 | 159 | thinking 160 | 161 |
162 |
163 | 164 |
165 | ���Think left and think right and think low and think high. Oh, the thinks you can think up if only you try!��� 166 | by 167 | (about) 168 | 169 |
170 | Tags: 171 | 172 | 173 | humor 174 | 175 | philosophy 176 | 177 |
178 |
179 | 180 |
181 | ���The reason I talk to myself is because I���m the only one whose answers I accept.��� 182 | by 183 | (about) 184 | 185 |
186 | Tags: 187 | 188 | 189 | humor 190 | 191 | insanity 192 | 193 | lies 194 | 195 | lying 196 | 197 | self-indulgence 198 | 199 | truth 200 | 201 |
202 |
203 | 204 | 214 |
215 |
216 | 217 |

Top Ten tags

218 | 219 | 220 | love 221 | 222 | 223 | 224 | inspirational 225 | 226 | 227 | 228 | life 229 | 230 | 231 | 232 | humor 233 | 234 | 235 | 236 | books 237 | 238 | 239 | 240 | reading 241 | 242 | 243 | 244 | friendship 245 | 246 | 247 | 248 | friends 249 | 250 | 251 | 252 | truth 253 | 254 | 255 | 256 | simile 257 | 258 | 259 | 260 |
261 |
262 | 263 |
264 | 274 | 275 | -------------------------------------------------------------------------------- /test/mock/fixtures/quotes.toscrape.com/tag/humor/page/2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Quotes to Scrape 6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 |

14 | Quotes to Scrape 15 |

16 |
17 |
18 |

19 | 20 | Login 21 | 22 |

23 |
24 |
25 | 26 | 27 |

Viewing tag: humor

28 | 29 |
30 |
31 | 32 |
33 | ���I am free of all prejudice. I hate everyone equally. ��� 34 | by 35 | (about) 36 | 37 |
38 | Tags: 39 | 40 | 41 | humor 42 | 43 | sinister 44 | 45 |
46 |
47 | 48 |
49 | ���A lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.��� 50 | by 51 | (about) 52 | 53 |
54 | Tags: 55 | 56 | 57 | humor 58 | 59 | love 60 | 61 | romantic 62 | 63 | women 64 | 65 |
66 |
67 | 68 | 78 |
79 |
80 | 81 |

Top Ten tags

82 | 83 | 84 | love 85 | 86 | 87 | 88 | inspirational 89 | 90 | 91 | 92 | life 93 | 94 | 95 | 96 | humor 97 | 98 | 99 | 100 | books 101 | 102 | 103 | 104 | reading 105 | 106 | 107 | 108 | friendship 109 | 110 | 111 | 112 | friends 113 | 114 | 115 | 116 | truth 117 | 118 | 119 | 120 | simile 121 | 122 | 123 | 124 |
125 |
126 | 127 |
128 | 138 | 139 | -------------------------------------------------------------------------------- /test/mock/fixtures/robots.txt.com/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Robots.txt 5 | 6 | 7 |

Robots.txt Test Site

8 |

About page.

9 | Home 10 | 11 | 12 | -------------------------------------------------------------------------------- /test/mock/fixtures/robots.txt.com/contact.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Robots.txt 5 | 6 | 7 |

Robots.txt Test Site

8 |

Contact page.

9 | Home 10 | 11 | 12 | -------------------------------------------------------------------------------- /test/mock/fixtures/robots.txt.com/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Robots.txt 5 | 6 | 7 |

Robots.txt Test Site

8 | Login 9 | Reset Password 10 | Account 11 | About 12 | Contact 13 | 14 | 15 | -------------------------------------------------------------------------------- /test/mock/fixtures/robots.txt.com/login.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Robots.txt 5 | 6 | 7 |

Robots.txt Test Site

8 |

Login page.

9 | Home 10 | 11 | 12 | -------------------------------------------------------------------------------- /test/mock/fixtures/robots.txt.com/pwreset.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Robots.txt 5 | 6 | 7 | 8 |

Robots.txt Test Site

9 |

Password reset page.

10 | Home 11 | 12 | 13 | -------------------------------------------------------------------------------- /test/mock/fixtures/robots.txt.com/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: Slurp 2 | Crawl-delay: 4 3 | 4 | User-agent: wgit 5 | Allow: / 6 | Disallow: /login 7 | 8 | User-agent: yacybot 9 | Disallow: * 10 | -------------------------------------------------------------------------------- /test/mock/fixtures/span_display.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | 8 |

Running the following Wgit code will programmatically configure your database:

9 | db = Wgit::Database.new '<connection_string>' 10 | 11 | 12 | -------------------------------------------------------------------------------- /test/mock/fixtures/test-site.com/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | About 6 | 7 | 8 |

About

9 | Records 10 | 11 | -------------------------------------------------------------------------------- /test/mock/fixtures/test-site.com/application.js.html: -------------------------------------------------------------------------------- 1 | alert("blah"); 2 | -------------------------------------------------------------------------------- /test/mock/fixtures/test-site.com/contact.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Contact 5 | 6 | 7 |

Contact

8 | Search 9 | Sneaky 10 | Invalid URL 11 | 12 | 13 | -------------------------------------------------------------------------------- /test/mock/fixtures/test-site.com/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Home 5 | 6 | 7 | 8 |

Home

9 | Contact 10 | Search 11 | About 12 | Absolute About 13 | Test Site UK 14 | FTP Test Site 15 | FTP Test Site Redirect 16 | 17 | -------------------------------------------------------------------------------- /test/mock/fixtures/test-site.com/public/records.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Records 5 | 6 | 7 |

Records

8 | Search 9 | Search 10 | Top of Page 11 | 12 | -------------------------------------------------------------------------------- /test/mock/fixtures/test-site.com/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Search 5 | 6 | 7 | 8 |

Search

9 | Home 10 | Contact 11 | 12 | -------------------------------------------------------------------------------- /test/mock/fixtures/test-site.com/theme.css.html: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: lightblue; 3 | } 4 | 5 | h1 { 6 | color: white; 7 | text-align: center; 8 | } 9 | 10 | p { 11 | font-family: verdana; 12 | font-size: 20px; 13 | } 14 | -------------------------------------------------------------------------------- /test/mock/fixtures/test_doc.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | My Test Webpage 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |

Howdy!


19 | 20 | 21 | 22 | 23 | 24 |

Welcome to my site, I hope you like what you see and enjoy browsing the various randomness.

25 | 26 | 27 | 28 |
29 |
30 | Image alt text 31 |

This page is primarily for testing the Ruby code used in Wgit with the Minitest framework.

32 |
33 | Here is a table: 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
CountryCapital
EnglandLondon
IrelandDublin
48 |
49 |
50 |
51 | Minitest rocks!! It's simplicity and power matches the Ruby language in which it's developed. 52 |
53 |
54 |
55 |
56 |
57 |
58 | 59 |
60 |
61 |
62 | 63 |
64 |
65 | 66 | 67 |
68 |
69 |
70 |
71 | Yahoo
72 | Contact
73 | Bing
74 | Index 2
75 | Index 3
76 | Tests
77 | Yahoo Search
78 | Blog
79 | Example.com Blog
80 | Contents
81 | Same Domain FTP Server
82 | Same Domain FTP Server 2
83 | Same Domain FTP Server Files
84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /test/mock/fixtures/txti.es/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | About txti 9 | 10 | 11 | 12 |

Meet txti

13 | 14 |

Txti is fast web pages for everybody. Most of the world still does not have internet, but many websites from countries like the United States are big and complicated. This makes it hard for people with slow internet to use these sites. It is even harder for those people to put their own thoughts on the internet. With txti, anyone can use any device to share their story.

15 | 16 |

Txti was created by Barry T. Smith because he believes that high speed internet is a responsibility, not a service people buy. He got a lot of help from his friend Adam Newbold.

17 | 18 |

How you can help txti

19 | 20 |

We want to keep txti free to use forever. There are a few easy ways you can help. The best way you can help is by using txti regularly and by telling your friends about txti. Next, you can follow txti (@txties) and txti's creator, Barry (@thebarrytone), on twitter.

21 | 22 |

If you really believe that txti is important, please consider donating money with PayPal.

23 |

But if you're really awesome, you could tip some dogecoin to this address: DAon8fhTHbme13vc5phqk9JmHWesZfxYjX

24 | 25 |

Thank you for using txti!

26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /test/mock/fixtures/txti.es/how.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | How to use txti 9 | 10 | 11 | 12 |

How to make a web page with txti

13 | 14 |

The only thing you have to give txti is the "Content." Txti takes care of the rest, but you can take advantage of these options:

15 |
16 |
Custom URL
17 |
This is the part that goes after "txti.es/" like "txti.es/barry". Currently only accepts lowercase letters (a-z), numbers (0-9), and dashes (-). If you put anything else in there, txti will change it.
18 |
Custom Edit Code
19 |
Txti will give you a random edit code, but you can change it. You have to have this to make changes to the txti, and it can never EVER be retrieved if lost.
20 |
Title
21 |
Give your txti a relevant title so it shows up better when shared on social media, search results, and in browser windows.
22 |
Author
23 |
Let people know who made this! This shows up in some searches and social media posts, and your Twitter handle works really nicely if you post the link on Twitter.
24 |
Description
25 |
This is a short (200 character) summary of the page. This shows up in Twitter cards and search results. Txti will automatically use the first 200 characters of your txti as a description if you don't provide one.
26 |
27 | 28 |

Make your txti easy to read and understand

29 |

You can make your txti robust and full-featured with links, images, lists, headings, and more. Txti uses a popular set of rules called Markdown.

30 |
    31 |
  • #Heading 1 = biggest heading
    32 | ##Heading 2 = second biggest heading
    33 | ###Heading 3 = third biggest, and so on 34 |
  • 35 |
  • Return once starts a new line. Return twice (leaving an empty line) starts a new paragraph.
  • 36 |
  • *italics* = italics
  • 37 |
  • **bold** = bold
  • 38 |
  • Links: [link to txti](http://txti.es) = link to txti (note: be sure to include the "http://" part of the link)
  • 39 |
  • Images*: ![Monkey selfie](http://i.imgur.com/FXSBf8c.jpg) 40 |
  • Bulleted lists:
    41 | - Bulleted item a
    42 | - Bulleted item b
    43 | - Bulleted item c 44 |
  • 45 |
  • Numbered lists:
    46 | 1. Numbered item 1
    47 | 2. Numbered item 2
    48 | 3. Numbered item 3 49 |
  • 50 |
51 |

*Images must be uploaded somewhere else (we recommend imgur.com). Because images can really slow down pages and we're all about "fast web pages for everybody," images are not displayed by default. The reader has the option of displaying the images in the page or just viewing links to them. See an example at txti.es/images.

52 | 53 |

Advanced stuff

54 |

Txti uses Twitter Cards, so when you share a txti link on Twitter, it will show bigger summary of the link (see above to read about the custom options you have).

55 |

Txti has a minimal API so you can use the content of your txti in other applications. Just add /json, /xml, or /html to the end of your txti's URL (such as txti.es/barry/json).

56 | 57 |

Important notes

58 |
    59 |
  • Txti will delete any pages that are more than 6 months old but have been viewed less than two times ever. If you make a txti and forget about it, it might not be there 6 months later.
  • 60 |
  • Legal concerns or pages you think violate txti's terms of service can be sent to legal@txti.es.
  • 61 |
62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /test/mock/fixtures/txti.es/images.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | Made via txti.es: 17 | 18 | 19 | 20 |
This txti has images. Read without them, or click here to load the images.

Images in txti

All images will be centered and start on a new line (so text doesn't flow around them). They will be sized so the width fits in the content area. Notice how the description is displayed so the user doesn't need to load images to get the idea. It creates a link so the user can click to view (or they can click the pink banner that appears above at first).

[Image: This is an image of txti on a flip phone]

Doing it this way means that anyone can open any page on txti and know that it will load super fast! Then they can choose to view images if they have the bandwidth.

21 |

txti

22 | 23 | -------------------------------------------------------------------------------- /test/mock/fixtures/txti.es/images/images.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | Made via txti.es: 17 | 18 | 19 | 20 |

Images in txti

21 |

All images will be centered and start on a new line (so text doesn't flow around them). They will be sized so the width fits in the content area. Notice how the description is displayed so the user doesn't need to load images to get the idea. It creates a link so the user can click to view (or they can click the pink banner that appears above at first).

22 |

This is an image of txti on a flip phone

23 |

Doing it this way means that anyone can open any page on txti and know that it will load super fast! Then they can choose to view images if they have the bandwidth.

24 |

txti

25 | 26 | -------------------------------------------------------------------------------- /test/mock/fixtures/txti.es/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | txti - Fast web pages for everybody 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 |

txti

46 |

Fast web pages for everybody.

47 | What is txti? | How to use txti 48 |

Create a txti

49 |
50 | 51 | 52 | 53 |

By continuing, you agree to the terms of service.

54 | 55 | 56 | or 57 |
58 | 59 | 60 | -------------------------------------------------------------------------------- /test/mock/fixtures/txti.es/terms.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | txti - Terms of Service 9 | 10 | 11 | 12 |

txti Terms of Service

13 | 14 |

IMPORTANT - READ BEFORE USING.

15 | 16 |

Do not use this Application until You have carefully read the following terms and conditions. By choosing or clicking “I Agree” (or similar), You acknowledge and accept the terms and conditions of this agreement (“Agreement”). If You do not agree, You are granted no rights in or to the Application and shall not use the Application.

17 | 18 |

This Application is owned and provided by TXTI, LLC. (“TXTI,” “We,” or “Us”). Questions concerning this Application or its operation should be directed to https://twitter.com/thebarrytone. Use of the Application is offered to You conditioned on Your acceptance without modification of this Agreement. You agree to familiarize Yourself with and abide by the Agreement if and when You use the Application.

19 | 20 |

DEFINITIONS

21 |
    22 |
  • "You" or "Your" means the person who is being licensed to use the Application.
  • 23 |
  • "Application" means all computer programs and documentation related to the TXTI application which focuses on creating text-based websites.
  • 24 |
25 | 26 |

LICENSE GRANT

27 |

TXTI hereby grants to You a non-exclusive, non-transferable limited license to use the Application.

28 | 29 |

POLICY

30 |
    31 |
  • You are responsible for everything you post/upload. If We, in Our sole judgment, deem that You have violated this policy or Agreement, We reserve the right to terminate Your access to the Application and remove Your material. TXTI shall have no obligation or liability with regard to any deleted material.
  • 32 |
  • This site is not for sexually explicit material including, but not limited to, child exploitation. You grant us permission to disclose your private information to law enforcement, if We, in Our sole judgment, deem that You have violated this policy in any way or if we receive a subpoena or questions from law enforcement officers investigating child exploitation or any other violation of State, Federal or Local laws.
  • 33 |
  • This site is not for obscene material. Material that is designed to predominantly appeal to prurient interests, or which goes substantially beyond customary limits of society, is strictly prohibited.
  • 34 |
  • Respect copyright. Only post material that You made or that You are authorized to use. This means do not use content that someone else owns the copyright to (e.g., lyrics, text, etc.) without necessary authorizations.
  • 35 |
  • We encourage free speech and defend everyone's right to express unpopular points of view. But We do not permit hate speech (speech which attacks or demeans a group based on race or ethnic origin, religion, disability, gender, age, veteran status, and sexual orientation/gender identity).
  • 36 |
  • Predatory behavior, stalking, threats, harassment, intimidation, invading privacy, revealing other people’s personal information, and inciting others to commit violent acts shall be considered a violation of this Policy.
  • 37 |
  • TXTI will delete any pages that are more than 6 months old but have been viewed less than two times ever. If you make a TXTI and forget about it, it might not be there 6 months later.
  • 38 |
39 | 40 |

TERMINATION

41 |

TXTI retains the right to, at any time, and in its sole discretion, terminate this AGREEMENT. Upon the termination of this AGREEMENT, all rights granted to You under this AGREEMENT shall immediately terminate and You shall discontinue all use of the Application.

42 | 43 |

SECURITY

44 |

Any user names and/or passwords used for this Application are for individual use only. You will be responsible for the security of Your user name and/or password (if any). TXTI is under no obligation to preserve or make accessible any data uploaded or posted through TXTI.

45 | 46 |

NO REPRESENTATIONS OR WARRANTIES

47 |

TXTI makes no representations or warranties that this Application is free of defects, viruses or other harmful components. TXTI shall not be responsible for any damages or loss that may result from the hacking or infiltration of this Application or associated computer systems and data servers. YOU HAVE THE SOLE RESPONSIBILITY FOR ADEQUATE PROTECTION AND BACKUP OF DATA AND/OR EQUIPMENT USED IN CONNECTION WITH THIS APPLICATION AND YOU AGREE TO HOLD TXTI HARMLESS FROM, AND YOU COVENANT NOT TO SUE TXTI FOR, ANY CLAIMS BASED ON THE USE OF THIS APPLICATION, INCLUDING CLAIMS FOR LOST DATA, WORK DELAYS OR LOST PROFITS RESULTING FROM USE OF MATERIALS OR CONTENT FROM THIS APPLICATION. THE APPLICATION MAY CONTAIN TECHNICAL INACCURACIES, OUTDATED INFORMATION AND TYPOGRAPHICAL ERRORS. ALL MATERIALS, INFORMATION, APPLICATION, AND SERVICES INCLUDED IN OR AVAILABLE THROUGH THIS APPLICATION ARE PROVIDED “AS IS” AND “AS AVAILABLE.” TXTI DISCLAIMS ALL WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. TXTI MAKES NO WARRANTIES OR REPRESENTATIONS CONCERNING THE SUITABILITY, RELIABILITY, AVAILABILITY, TIMELINESS, OR ACCURACY OF THE INFORMATION, PRODUCTS OR SERVICES CONTAINED IN OR OBTAINED THROUGH THE APPLICATION FOR ANY PURPOSE. SOME JURISDICTIONS DO NOT PERMIT THE EXCLUSION OF CERTAIN WARRANTIES; THESE EXCLUSIONS MAY NOT APPLY TO YOU. NO AGENT OF TXTI IS AUTHORIZED TO ALTER OR EXCEED THE WARRANTY OBLIGATIONS OF TXTI AS SET FORTH HEREIN. ANY IMPLIED WARRANTIES THAT CANNOT BE EXCLUDED ARE LIMITED TO THE SHORTEST PERIOD PERMITTED BY THE APPLICABLE LAW.

48 | 49 |

DISCLAIMER OF LIABILITY

50 |

UNDER NO CIRCUMSTANCES SHALL TXTI OR ITS SUBSIDIARIES, AFFILIATES, LICENSORS, SERVICE PROVIDERS, CONTENT PROVIDERS, EMPLOYEES, AGENTS, OFFICERS, AND DIRECTORS BE LIABLE FOR ANY DIRECT, INDIRECT, PUNITIVE, INCIDENTAL, SPECIAL, CONSEQUENTIAL, OR ANY OTHER DAMAGES WHATSOEVER THAT MAY RESULT FROM THE USE OF OR THE INABILITY TO USE THIS APPLICATION, INCLUDING WITHOUT LIMITATION, DAMAGES ARISING FROM MISTAKES, OMISSIONS, INTERRUPTIONS, DETERIORATION OR CORRUPTION OF FILES, DELETION OR CORRUPTION OF EMAIL, ERRORS, LOSS OF DATA, LOSS OF PROFITS, DEFECTS, VIRUSES, AND/OR DELAYS. THIS LIMITATION APPLIES WHETHER THE ALLEGED LIABILITY IS BASED ON CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY OR OTHERWISE, EVEN IF TXTI HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. YOU ACKNOWLEDGE AND AGREE THAT THE CONSIDERATION WHICH TXTI IS CHARGING HEREUNDER DOES NOT INCLUDE ANY CONSIDERATION FOR ASSUMPTION BY TXTI OF THE RISK OF LICENSEE'S CONSEQUENTIAL OR INCIDENTAL DAMAGES WHICH MAY ARISE IN CONNECTION WITH LICENSEE'S USE OF THE APPLICATION AND DOCUMENTATION. SOME JURISDICTIONS DO NOT PERMIT THE EXCLUSION OR LIMITATION OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES; TXTI’S LIABILITY IN SUCH JURISDICTIONS SHALL BE LIMITED TO THE FULLEST EXTENT PERMITTED BY LAW. The essential purpose of this provision is to limit the potential liability of TXTI arising out of this AGREEMENT. The parties acknowledge that the limitations set forth in this Section are integral to the amount of consideration levied in connection with the license of the Application and that, were TXTI to assume any further liability other than as set forth herein, such consideration would of necessity be set substantially higher. If You are dissatisfied with any portion of the Application, Your sole remedy is to cease using it.

51 | 52 |

INDEMNITY

53 |

You agree to defend, indemnify, and hold harmless TXTI and its employees, agents, directors, officers and shareholders, from and against all liabilities, claims, damages, and expenses (including without limitation reasonable attorneys’ fees and costs) arising out of Your use of this Application, Your breach of this AGREEMENT, or Your infringement of the intellectual property rights of third parties.

54 | 55 |

GENERAL

56 |

This AGREEMENT shall be given effect to the fullest extent permissible by law. In case any one or more of the provisions contained in this agreement shall for any reason be held to be invalid, illegal or unenforceable in any respect, such invalidity, illegality, or unenforceability shall not affect any other provisions hereof, and this agreement shall be construed to give maximum legal effect to the intent expressed herein.

57 | 58 |
    59 |
  • This agreement is governed by, and construed in accordance with the laws of the State of Ohio, without giving effect to any principles of conflicts of law. You hereby consent to the exclusive jurisdiction and venue of the courts of the State of Ohio or, if appropriate, the United States District Court for the Southern District of Ohio for the resolution of all disputes arising out of or relating to the use of this Application and the associated services. The United Nations Convention on Contracts for the International Sale of Goods does not apply to this software or the software license pertaining to this agreement.
  • 60 |
  • TXTI may assign this AGREEMENT, in whole or in part, at any time.
  • 61 |
  • This AGREEMENT constitutes the entire agreement between TXTI and You with respect to this Application, and these agreements supersede all prior or contemporaneous communications, proposals, and agreements, whether electronic, oral, or written, between TXTI and You with respect to the Application. As such, these terms of use represent the entire understanding relating to the use of this Application and prevail over any prior or contemporaneous, conflicting or additional communications.
  • 62 |
  • TXTI’s performance of this AGREEMENT is subject to existing laws and legal process. Nothing contained in this AGREEMENT is in derogation of TXTI’s right to comply with governmental, court and law enforcement requests relating to Your use of the Application, or information collected by TXTI in connection with such use.
  • 63 |
  • Any unauthorized access, modification or change of any information, or any interference with the availability of or access to this Application is strictly prohibited. TXTI reserves all legal rights and remedies available to it and this disclaimer shall in no way be deemed a limitation or waiver of any other rights TXTI may have.
  • 64 |
  • TXTI may change the terms of this AGREEMENT from time to time. You agree to check the TXTI's website, where the latest copy of the AGREEMENT will be posted, for any material changes. You expressly agree that the continued use of any software provided by the TXTI after the effective date of any change will constitute your consent to any such revised AGREEMENT. If at any time You do not accept any such revision, You must cease the use of the Application.
  • 65 |
  • The provisions which, by their nature, should survive termination of this AGREEMENT shall do so.
  • 66 |
67 | 68 |

RESERVATION OF RIGHTS

69 |

All rights not expressly granted herein are reserved exclusively and entirely to TXTI.

70 | 71 | 72 |

END OF AGREEMENT

73 |

TXTI is a registered trademark of TXTI, LLC. All rights reserved.

74 | 75 | 76 | -------------------------------------------------------------------------------- /test/mock/save_page.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # Script to save a single web page's HTML to disk. For example, 5 | # http://blah.com/admin/about will be saved as: 6 | # /fixtures/blah.com.html 7 | # Call this script like: `ruby save_page.rb "http://blah.com"` or use toys task. 8 | 9 | require_relative "../../lib/wgit" 10 | require "fileutils" 11 | 12 | def save_page(url) 13 | url = Wgit::Url.parse(url) 14 | path = "#{File.expand_path(__dir__)}/fixtures" 15 | crawler = Wgit::Crawler.new 16 | 17 | FileUtils.mkdir_p(path) 18 | Dir.chdir(path) 19 | 20 | # Save the HTML file for the page. 21 | crawler.crawl_url(url) do |doc| 22 | if doc.empty? 23 | puts "Invalid URL: #{doc.url}" 24 | next 25 | end 26 | 27 | file_path = url.to_host 28 | file_path += ".html" unless file_path.end_with? ".html" 29 | puts "Saving document #{file_path}" 30 | File.open(file_path, "w") { |f| f.write(doc.html) } 31 | end 32 | end 33 | 34 | if $PROGRAM_NAME == __FILE__ 35 | raise "ARGV[0] must be a URL" unless ARGV[0] 36 | 37 | url = ARGV[0] 38 | save_page(url) 39 | end 40 | -------------------------------------------------------------------------------- /test/mock/save_site.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # A script which saves a website's HTML to disk. For example, 5 | # http://blah.com/admin/about will be saved as: 6 | # /fixtures/blah.com/admin/about.html 7 | # Call this script like: `ruby save_site.rb http://blah.com` or use toys task. 
8 | 9 | require_relative "../../lib/wgit" 10 | require "fileutils" 11 | 12 | def save_site(base_url, follow: :default) 13 | base_url = Wgit::Url.parse(base_url) 14 | path = "#{File.expand_path(__dir__)}/fixtures/#{base_url.to_host}" 15 | crawler = Wgit::Crawler.new 16 | 17 | FileUtils.mkdir_p(path) 18 | Dir.chdir(path) 19 | 20 | # Save the site to disk. 21 | crawler.crawl_site(base_url, follow: follow) do |doc| 22 | url = doc.url 23 | 24 | if doc.empty? 25 | puts "Invalid URL: #{url}" 26 | next 27 | end 28 | 29 | # Save the index.html file to disk. 30 | if !base_url.omit_slashes.to_path && url.omit_slashes == base_url.omit_slashes 31 | puts "Saving document #{base_url.to_host}/index.html" 32 | File.open("index.html", "w") { |f| f.write(doc.html) } 33 | next 34 | end 35 | 36 | # Work out the file structure on disk. 37 | segs = url.omit_base.split("/").reject(&:empty?) 38 | dir = "" 39 | if segs.length == 1 40 | file_name = segs[0] 41 | else 42 | file_name = segs.pop 43 | segs.each { |seg| dir += "#{seg}/" } 44 | dir.chop! # Remove trailing slash. 45 | end 46 | 47 | # Create the directory if necessary. 48 | if dir != "" 49 | FileUtils.mkdir_p(dir) 50 | dir += "/" 51 | end 52 | 53 | file_path = dir + file_name 54 | file_path += ".html" unless file_path.end_with? ".html" 55 | 56 | # Save the HTML file for the page. 57 | puts "Saving document #{base_url.to_host}/#{file_path}" 58 | File.open(file_path, "w") { |f| f.write(doc.html) } 59 | end 60 | end 61 | 62 | if $PROGRAM_NAME == __FILE__ 63 | raise "ARGV[0] must be a URL" unless ARGV[0] 64 | 65 | base_url = ARGV[0] 66 | xpath = ARGV[1] || :default 67 | save_site(base_url, follow: xpath) 68 | end 69 | -------------------------------------------------------------------------------- /test/mock/webmock.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "webmock" 4 | require "uri" 5 | 6 | include WebMock::API 7 | 8 | WebMock.enable! 
WebMock.disable_net_connect!(allow: %w[127.0.0.1 vlang.io duckduckgo.com])

# Any custom Typhoeus mocking (missing from Webmock) goes below.
class Typhoeus::Response
  # Return the :total_time recorded in the response options when it is a
  # positive number, otherwise a random Float in the range 0.2...0.7.
  #
  # The safe navigation guards against responses built without a
  # :total_time option, where calling #positive? on nil would raise.
  def total_time
    recorded = options[:total_time]
    return recorded if recorded&.positive?

    rand(0.2...0.7)
  end

  # Return an IP address String whose last octet is random (10..99).
  def primary_ip
    "192.241.176.#{rand(10..99)}"
  end
end

# Return the directory (relative path) holding the fixture files.
def fixtures_dir
  "test/mock/fixtures"
end

# Return the contents of a HTML fixture file.
#
# ".html" is appended unless the name already ends with ".html" or
# "robots.txt". The fixtures_dir is prefixed unless already present.
def fixture(file)
  file = "#{file}.html" unless file.end_with?(".html", "robots.txt")
  file_path = file.start_with?(fixtures_dir) ? file : "#{fixtures_dir}/#{file}"
  File.read(file_path)
end

# Return the default HTML fixture data.
def default_html
  fixture("test_doc")
end

# Stub a single webpage. Stubs both:
# http://blah.com/hi and http://blah.com/hi/ (with trailing slash).
#
# When a fixture name is given, its file contents replace the body.
def stub_page(url, status: 200, body: default_html, fixture: nil)
  # The parentheses matter: `fixture(...)` calls the method while the bare
  # `fixture` refers to the keyword argument of the same name.
  body = fixture(fixture) if fixture
  stub_request(:get, url).to_return(status: status, body: body)

  # Webmock only mocks a trailing slash if there's no path so we do it.
  path = URI(url).path
  return if path.empty? || path == "/"

  alt_url = url.end_with?("/") ? url.chop : "#{url}/"
  stub_request(:get, alt_url).to_return(status: status, body: body)
end

# Stub a single page 404 not found.
def stub_not_found(url)
  stub_page(url, status: 404, fixture: "not_found")
end

# Stub a 404 not found for /robots.txt.
#
# Each URL gets "/robots.txt" appended unless it already ends with it.
def stub_robots_txt_not_found(urls)
  urls.each do |url|
    suffix = url.end_with?("/robots.txt") ? "" : "/robots.txt"
    stub_not_found(url + suffix)
  end
end

# Stub a single page 301 redirect.
def stub_redirect(from, to)
  stub_request(:get, from).to_return(status: 301, headers: { 'Location': to })
end

# Stub a single page network timeout/unknown host error.
def stub_timeout(url)
  stub_request(:get, url).to_timeout
end

# Stub an entire website recursively according to what's saved on the file
# system. Assumes the fixture data exists on disk.
#
# url  - The URL the directory is served from; "/#{dir}" is appended unless
#        the URL's host already equals dir (i.e. the site's root directory).
# path - The parent directory on disk; "/#{dir}" is always appended.
# dir  - The name of the directory (under path) to stub.
#
# A trailing slash on any argument is ignored. The arguments themselves are
# never modified, so frozen strings may be passed safely.
def stub_dir(url, path, dir)
  # delete_suffix returns a new String, leaving the caller's objects intact.
  url  = url.delete_suffix("/")
  path = path.delete_suffix("/")
  dir  = dir.delete_suffix("/")

  url += "/#{dir}" unless URI(url).host == dir
  path += "/#{dir}"

  # The glob includes dot files; drop the "." and ".." directory entries so
  # the recursion below cannot loop on the current/parent directory.
  entries = Dir["#{path}/{*,.*}"]
            .reject { |entry| %w[. ..].include?(File.basename(entry)) }

  # index.html is skipped here (matched by exact file name, so a file such as
  # "reindex.html" is still stubbed). The ".html" extension is removed so the
  # stubbed URL has no extension.
  pages = entries
          .select { |entry| File.file?(entry) }
          .reject { |file| File.basename(file) == "index.html" }
          .map { |file| file.delete_suffix(".html") }
  subdirs = entries.select { |entry| File.directory?(entry) }

  pages.each { |page| stub_page("#{url}/#{File.basename(page)}", fixture: page) }
  subdirs.each { |subdir| stub_dir(url, path, File.basename(subdir)) }
end

# Stub all single webpages and full websites from the fixtures directory.
102 | def stub_fixtures(pages, sites) 103 | pages.each do |url| 104 | path = URI(url).host 105 | stub_page(url, fixture: path) 106 | end 107 | 108 | sites.each do |url| 109 | dir = URI(url).host 110 | index_file = "#{dir}/index.html" 111 | index_path = "#{fixtures_dir}/#{index_file}" 112 | 113 | stub_page(url, fixture: index_file) if File.exist?(index_path) 114 | stub_dir(url, fixtures_dir, dir) 115 | end 116 | end 117 | -------------------------------------------------------------------------------- /test/test_assertable.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "helpers/test_helper" 4 | 5 | # Test class for the Assertable module functions. 6 | class TestAssertable < TestHelper 7 | include Wgit::Assertable 8 | 9 | # Run non DB tests in parallel for speed. 10 | parallelize_me! 11 | 12 | # Runs before every test. 13 | def setup; end 14 | 15 | def test_assert_types__pass 16 | assert_equal "Hello World!", assert_types("Hello World!", String) 17 | assert_equal [1, 2, 3], assert_types([1, 2, 3], [Array, String]) 18 | assert_equal "/about", assert_types("/about".to_url, String) 19 | end 20 | 21 | def test_assert_types__fail 22 | e = assert_raises(StandardError) { assert_types "Hello World!", Integer } 23 | assert_equal "Expected: Integer, Actual: String", e.message 24 | 25 | e = assert_raises StandardError do 26 | assert_types [1, 2, 3], [TrueClass, Integer], "An Array is expected" 27 | end 28 | assert_equal "An Array is expected", e.message 29 | end 30 | 31 | def test_assert_arr_types__pass 32 | assert_equal [1, true, "Boom!"], assert_arr_types([1, true, "Boom!"], [Integer, TrueClass, String]) 33 | assert_equal [1, true, "/about"], assert_arr_types([1, true, "/about".to_url], [Integer, TrueClass, String]) 34 | end 35 | 36 | def test_assert_arr_types__fail 37 | e = assert_raises StandardError do 38 | assert_arr_types [1, true, "Boom!"], [Integer, String] 39 | end 40 | s 
= "Expected: [Integer, String], Actual: TrueClass" 41 | 42 | assert_equal s, e.message 43 | end 44 | 45 | def test_assert_arr_types__non_enumerable 46 | e = assert_raises StandardError do 47 | assert_arr_type "non enumerable", Integer 48 | end 49 | s = "Expected an Enumerable responding to #each, not: String" 50 | 51 | assert_equal s, e.message 52 | end 53 | 54 | def test_assert_common_arr_types__pass 55 | url = "/about".to_url 56 | assert_equal [1, 2, 3], assert_common_arr_types([1, 2, 3], [Integer, String]) 57 | assert_equal [url, "/about"], assert_common_arr_type([url, "/about"], String) 58 | end 59 | 60 | def test_assert_common_arr_types__fail 61 | e = assert_raises StandardError do 62 | assert_common_arr_types [1, "Boom!"], [Integer, String] 63 | end 64 | s = "Expected an Enumerable with elements of a single common type" 65 | 66 | assert_equal s, e.message 67 | end 68 | 69 | def test_assert_common_arr_types__non_enumerable 70 | e = assert_raises StandardError do 71 | assert_common_arr_type "non enumerable", Integer 72 | end 73 | s = "Expected an Enumerable responding to #each, not: String" 74 | 75 | assert_equal s, e.message 76 | end 77 | 78 | def test_assert_respond_to__pass 79 | objs = ["Hello World!", [1, 2, 3]] 80 | 81 | assert_equal objs, assert_respond_to(objs, %i[equal? include?]) 82 | end 83 | 84 | def test_assert_respond_to__fail 85 | objs = ["Hello World!", [1, 2, 3]] 86 | 87 | e = assert_raises StandardError do 88 | assert_equal objs, assert_respond_to(objs, %i[equal? each]) 89 | end 90 | assert_equal( 91 | "String (Hello World!) doesn't respond_to? 
[:equal?, :each]", 92 | e.message 93 | ) 94 | end 95 | 96 | def test_assert_respond_to__single_method 97 | objs = ["Hello World!", [1, 2, 3]] 98 | 99 | assert_equal objs, assert_respond_to(objs, :length) 100 | end 101 | 102 | def assert_required_keys__pass 103 | hash = { 'NAME': "Mick", 'AGE': 30 } 104 | 105 | assert_equal hash, assert_required_keys(hash, %w[NAME AGE]) 106 | end 107 | 108 | def assert_required_keys__fail 109 | hash = { 'NAME': "Mick", 'AGE': 30 } 110 | 111 | e = assert_raises(KeyError { assert_required_keys(hash, %w[NAME ADDRESS]) }) 112 | assert_equal( 113 | "Some or all of the required keys are not present: NAME, ADDRESS", 114 | e.message 115 | ) 116 | end 117 | end 118 | -------------------------------------------------------------------------------- /test/test_base.rb: -------------------------------------------------------------------------------- 1 | require_relative "helpers/test_helper" 2 | 3 | # The test class is at the bottom of this file. 4 | 5 | class QuotesCrawler < Wgit::Base 6 | mode :crawl_site 7 | start "http://quotes.toscrape.com/tag/humor/" 8 | follow "//li[@class='next']/a/@href" 9 | 10 | # We use the 2 suffix to avoid conflicting with tests elsewhere. 
# Test class for the Base class logic.
class TestBase < TestHelper
  # Runs before every test.
  def setup; end

  # Runs QuotesCrawler and checks the number of yielded quotes as well as the
  # content of the final one.
  def test_quotes_crawler
    quotes = []
    QuotesCrawler.run { |quote| quotes << quote }

    assert_equal 12, quotes.size
    # assert_equal is needed here: `assert(hash, other)` only checks that the
    # hash is truthy and treats the second argument as the failure message.
    assert_equal({
      quote: "“A lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.”",
      author: "Jane Austen"
    }, quotes.last)
  ensure
    # Clean up the extractors for other tests, even when an assertion fails.
    Wgit::Document.remove_extractor :quotes2
    Wgit::Document.remove_extractor :authors2
  end

  # A crawler class without a #parse method must raise with a helpful message.
  def test_no_parse_crawler
    ex = assert_raises(StandardError) { NoParseCrawler.run }
    assert_equal "NoParseCrawler must respond_to? #parse(doc, &block)", ex.message
  end

  # A crawler that declares no mode should still yield the start URL.
  def test_default_mode_crawler
    DefaultModeCrawler.run do |url|
      assert_equal "http://quotes.toscrape.com/tag/humor/", url
    end
  end

  # The crawler increments its counter once each in setup, parse and teardown.
  def test_setup_teardown_crawler
    crawler = SetupTeardownCrawler.run
    assert_equal 3, crawler.count
  end
end
# Exercises the Wgit::Database.adapter_class accessor, plus the Wgit code
# paths that build a database through it, to show that swapping the adapter
# takes effect.
class TestDatabaseAdapter < TestHelper
  # Runs before every test.
  def setup; end

  # Runs after every test; puts the default adapter back in place.
  def teardown
    Wgit::Database.adapter_class = Wgit::Database::DEFAULT_ADAPTER_CLASS
  end

  def test_adapter_class__default
    assert_equal Wgit::Database::DEFAULT_ADAPTER_CLASS, Wgit::Database.adapter_class
  end

  def test_adapter_class__accessor
    switch_to_in_memory

    assert_equal Wgit::Database::InMemory, Wgit::Database.adapter_class
  end

  def test_adapter_class__indexer
    switch_to_in_memory

    assert_equal Wgit::Database::InMemory, Wgit::Indexer.new.db.class
  end

  def test_adapter_class__dsl
    switch_to_in_memory
    dsl_user = TestClass.new

    assert_equal Wgit::Database::InMemory, dsl_user.send(:get_db).class
  end

  def test_database_new_alias
    switch_to_in_memory

    assert_instance_of Wgit::Database::InMemory, Wgit::Database.new
  end

  private

  # Points Wgit::Database at the in-memory adapter for the current test.
  def switch_to_in_memory
    Wgit::Database.adapter_class = Wgit::Database::InMemory
  end
end
7 | 8 | # Runs before every test. 9 | def setup 10 | @use_cases = [ 11 | # inline parent 12 | "*", 13 | "*", 14 | "*", 15 | "*", 16 | 17 | # block parent 18 | "*", 19 | "*", 20 | "*", 21 | "*" 22 | ] 23 | 24 | @content_variations = [ 25 | "", 26 | "foobar", 27 | "foo bar", 28 | " foo bar ", 29 | " ", 30 | " ", 31 | "\n", 32 | " \n ", 33 | " \n foo bar \n ", 34 | "
", 35 | "
" 36 | ] 37 | 38 | # For each use_case * text_variation combo above, what do we expect. 39 | @expected = [ 40 | # inline parent - inline inline 41 | "prepost", 42 | "prefoobarpost", 43 | "prefoo barpost", 44 | "pre foo bar post", 45 | "pre post", 46 | "pre post", 47 | "prepost", 48 | "pre post", 49 | "pre foo bar post", 50 | "pre\npost", 51 | "pre\npost", 52 | 53 | # inline parent - inline block 54 | "pre\npost", 55 | "prefoobar\npost", 56 | "prefoo bar\npost", 57 | "pre foo bar \npost", 58 | "pre \npost", 59 | "pre \npost", 60 | "pre\npost", 61 | "pre \npost", 62 | "pre foo bar \npost", 63 | "pre\npost", 64 | "pre\npost", 65 | 66 | # inline parent - block inline 67 | "pre\npost", 68 | "pre\nfoobarpost", 69 | "pre\nfoo barpost", 70 | "pre\n foo bar post", 71 | "pre\n post", 72 | "pre\n post", 73 | "pre\npost", 74 | "pre\n \npost", 75 | "pre\n foo bar post", 76 | "pre\npost", 77 | "pre\npost", 78 | 79 | # inline parent - block block 80 | "pre\npost", 81 | "pre\nfoobar\npost", 82 | "pre\nfoo bar\npost", 83 | "pre\n foo bar \npost", 84 | "pre\n \npost", 85 | "pre\n \npost", 86 | "pre\npost", 87 | "pre\n \npost", 88 | "pre\n foo bar \npost", 89 | "pre\npost", 90 | "pre\npost", 91 | 92 | ####### 93 | 94 | # block parent - inline inline 95 | "prepost", 96 | "prefoobarpost", 97 | "prefoo barpost", 98 | "pre foo bar post", 99 | "pre post", 100 | "pre post", 101 | "prepost", 102 | "pre post", 103 | "pre foo bar post", 104 | "pre\npost", 105 | "pre\npost", 106 | 107 | # block parent - inline block 108 | "pre\npost", 109 | "prefoobar\npost", 110 | "prefoo bar\npost", 111 | "pre foo bar \npost", 112 | "pre \npost", 113 | "pre \npost", 114 | "pre\npost", 115 | "pre \npost", 116 | "pre foo bar \npost", 117 | "pre\npost", 118 | "pre\npost", 119 | 120 | # block parent - block inline 121 | "pre\npost", 122 | "pre\nfoobarpost", 123 | "pre\nfoo barpost", 124 | "pre\n foo bar post", 125 | "pre\n post", 126 | "pre\n post", 127 | "pre\npost", 128 | "pre\n \npost", 129 | "pre\n foo bar 
post", 130 | "pre\npost", 131 | "pre\npost", 132 | 133 | # block parent - block block 134 | "pre\npost", 135 | "pre\nfoobar\npost", 136 | "pre\nfoo bar\npost", 137 | "pre\n foo bar \npost", 138 | "pre\n \npost", 139 | "pre\n \npost", 140 | "pre\npost", 141 | "pre\n \npost", 142 | "pre\n foo bar \npost", 143 | "pre\npost", 144 | "pre\npost" 145 | ] 146 | end 147 | 148 | def test_extract_text_str 149 | total_test_cases = @use_cases.size * @content_variations.size 150 | should_fail = false 151 | fail_count = 0 152 | i = 0 153 | 154 | raise "invalid @expected array" unless total_test_cases == @expected.size 155 | 156 | @use_cases.each do |use_case| 157 | @content_variations.each do |content| 158 | nodes = gsub_use_case_content(use_case, content) 159 | parser = Nokogiri::HTML("#{nodes}") 160 | 161 | expected = @expected[i] 162 | actual = Wgit::HTMLToText.new(parser).extract_str 163 | 164 | i += 1 165 | assert true # Add our assertion to minitest's total. 166 | has_passed = expected == actual 167 | next if has_passed 168 | 169 | Wgit::Utils.pprint("CASE_#{i}", prefix: "TEST_EXTRACT_TEXT_STR", new_line: true, 170 | use_case: use_case, content: content, nodes: nodes, expected: expected, actual: actual) 171 | 172 | should_fail = true 173 | fail_count += 1 174 | end 175 | end 176 | 177 | return unless should_fail 178 | 179 | Wgit::Utils.pprint("SUMMARY", prefix: "TEST_EXTRACT_TEXT_STR", new_line: true, 180 | total_test_cases: total_test_cases, total_failing_cases: fail_count) 181 | 182 | flunk "test_extract_text_str failed, see logs above for info" 183 | end 184 | 185 | def test_extract__anchors 186 | url = "http://example.com".to_url 187 | html = File.read "./test/mock/fixtures/anchor_display.html" 188 | doc = Wgit::Document.new url, html 189 | 190 | assert_equal ["About", "Foo Location Bar", "Contact Contact2 Contact3"], doc.text 191 | end 192 | 193 | def test_extract__spans 194 | url = "http://example.com".to_url 195 | html = File.read 
"./test/mock/fixtures/span_display.html" 196 | doc = Wgit::Document.new url, html 197 | 198 | assert_equal [ 199 | "Running the following Wgit code will programmatically configure your database:", 200 | "db = Wgit::Database.new ''" 201 | ], doc.text 202 | end 203 | 204 | def test_extract__divs 205 | url = "http://example.com".to_url 206 | html = File.read "./test/mock/fixtures/div_display.html" 207 | doc = Wgit::Document.new url, html 208 | 209 | assert_equal %w[foo bar], doc.text 210 | end 211 | 212 | def test_extract__getting_started_wiki 213 | url = "http://example.com".to_url 214 | html = File.read "./test/mock/fixtures/getting_started.html" 215 | doc = Wgit::Document.new url, html 216 | 217 | assert_equal [ 218 | "Running the following Wgit code will programmatically configure your database:", 219 | "db = Wgit::Database.new ''", 220 | "db.create_collections", 221 | "db.create_unique_indexes", 222 | "db.text_index = Wgit::Database::DEFAULT_TEXT_INDEX", 223 | "Or take a look at the mongo_init.js file for the equivalent Javascript commands.", 224 | "Note: The text search index lists all document fields to be searched by MongoDB when calling Wgit::Database#search. Therefore, you should append this list with any other fields that you want searched. For example, if you extend the API then you might want to search your new fields in the database by adding them to the index above. This can be done programmatically with:" 225 | ], doc.text 226 | end 227 | 228 | def test_extract__dups_are_not_removed 229 | doc = Wgit::Document.new "http://www.mytestsite.com/home", <<~HTML 230 |

Note: The text search index lists all document fields.

231 |
232 |

Note: The text search index lists all document fields.

233 | HTML 234 | 235 | assert_equal [ 236 | "Note: The text search index lists all document fields.", 237 | "Note: The text search index lists all document fields." 238 | ], doc.text 239 | end 240 | 241 | private 242 | 243 | def gsub_use_case_content(use_case, content) 244 | use_case 245 | .gsub("", "") 246 | .gsub("", "") 247 | .gsub("", "
") 248 | .gsub("", "
") 249 | .gsub("", "pre") 250 | .gsub("", "post") 251 | .gsub("", "
pre
") 252 | .gsub("
", "
post
") 253 | .gsub("*", content) 254 | end 255 | end 256 | -------------------------------------------------------------------------------- /test/test_in_memory.rb: -------------------------------------------------------------------------------- 1 | require_relative "helpers/test_helper" 2 | 3 | # Test class for the Database::InMemory adapter logic. 4 | # WARNING: The in-memory DB is cleared down prior to each test run. 5 | class TestInMemory < TestHelper 6 | include InMemoryHelper 7 | 8 | # Runs before every test. 9 | def setup 10 | Wgit::Model.set_default_search_fields 11 | 12 | empty_db 13 | 14 | @url = Wgit::Url.new(DatabaseTestData.url) 15 | @doc = Wgit::Document.new(DatabaseTestData.doc) 16 | 17 | @urls = Array.new(3) { Wgit::Url.new(DatabaseTestData.url) } 18 | @docs = Array.new(3) { Wgit::Document.new(DatabaseTestData.doc) } 19 | end 20 | 21 | # Runs after every test. 22 | def teardown 23 | # Remove any defined extractors to avoid interfering with other tests. 24 | return unless Wgit::Document.remove_extractor(:code) 25 | 26 | Wgit::Document.send(:remove_method, :code) 27 | end 28 | 29 | def test_initialize 30 | db2 = Wgit::Database::InMemory.new 31 | 32 | refute_nil db2 33 | assert_empty db2.urls 34 | assert_empty db2.docs 35 | end 36 | 37 | def test_bulk_upsert__urls 38 | urls = [ 39 | "http://example.com", # Gets inserted. 40 | "http://example.com/2", # Gets inserted. 41 | "http://example.com", # Dup of 1, will be updated. 42 | "http://example.com/3" # Gets inserted. 43 | ].to_urls 44 | count = db.bulk_upsert(urls) 45 | 46 | assert_equal 3, count 47 | assert_equal([ 48 | "http://example.com", 49 | "http://example.com/2", 50 | "http://example.com/3" 51 | ], db.urls) 52 | end 53 | 54 | def test_bulk_upsert__docs 55 | urls = [ 56 | "http://example.com", # Gets inserted. 57 | "http://example.com/2", # Gets inserted. 58 | "http://example.com", # Dup of urls[0], will be updated. 59 | "http://example.com/3" # Gets inserted. 
# Checks db.urls and db.uncrawled_urls, first against an empty database and
# then after seeding three urls of which only the first is uncrawled.
def test_urls
  # Nothing has been seeded yet, so both queries come back empty.
  assert_empty db.urls
  assert_empty db.uncrawled_urls

  # Url 1 crawled == false, Url 2 & 3 crawled == true.
  @urls.first.crawled = false
  seed { urls @urls }

  stored = db.urls
  pending = db.uncrawled_urls

  # All three seeded urls are returned as Wgit::Url instances.
  assert(stored.all? { |url| url.instance_of? Wgit::Url })
  assert_equal 3, stored.length

  # Only the single uncrawled url is returned, also as a Wgit::Url.
  assert(pending.all? { |url| url.instance_of? Wgit::Url })
  assert_equal 1, pending.length
end
# Seeds three docs where the last one matches the query twice and the first
# one once, then checks that search returns the two matches ordered by score.
def test_search
  # doc1 = 1.0 (match), doc2 = 0.0, doc3 = 2.0
  @docs.first.text << "Foo Bar"
  @docs.last.text << "Foo Bar"
  @docs.last.text << "foO bAr"

  seed { docs @docs }

  # Test no results.
  assert_empty db.search("doesnt_exist_123")

  # Test two results sorted by relevance.
  results = db.search("foo bar")

  assert_equal 2, results.length
  # Wrapped in assert so the type check can actually fail the test.
  assert(results.all? { |doc| doc.instance_of? Wgit::Document })

  assert_equal @docs.last.url, results.first.url
  assert_equal @docs.first.url, results.last.url
  assert results.first.score > results.last.score
end
# Checks that the `limit:` option caps the number of search results while
# keeping them in the same order as the seeded docs.
def test_search__limit
  # First doc has highest textScore and so on...
  @docs.reverse.each_with_index do |doc, i|
    i.times { doc.text << "Everest" }
  end
  seed { docs @docs }

  assert_equal 3, db.search("everest").length

  # Test limit.
  results = db.search("everest", limit: 2)
  assert_equal 2, results.length

  results.each_with_index do |doc, i|
    # An explicit assertion; a bare `instance_of?` call checks nothing.
    assert_instance_of Wgit::Document, doc
    assert_equal @docs[i], doc
    assert_equal @docs[i].url.to_h, doc.url.to_h
  end
end

# Checks that the `skip:` option drops the leading results, both on its own
# and combined with `limit:`.
def test_search__skip
  # First doc has highest textScore and so on...
  @docs.reverse.each_with_index do |doc, i|
    i.times { doc.text << "Everest" }
  end
  seed { docs @docs }

  # Test skip.
  results = db.search("everest", skip: 1)
  assert_equal 2, results.length

  results.each_with_index do |doc, i|
    assert_instance_of Wgit::Document, doc
    assert_equal @docs[i + 1], doc
    assert_equal @docs[i + 1].url.to_h, doc.url.to_h
  end

  # Test limit and skip.
  results = db.search("everest", limit: 1, skip: 1)
  assert_equal 1, results.length

  results.each do |doc|
    assert_instance_of Wgit::Document, doc
    assert_equal @docs[1], doc
    assert_equal @docs[1].url.to_h, doc.url.to_h
  end
end

Hello, this is to test :colon text searches

236 | HTML 237 | 238 | seed { doc @doc } 239 | 240 | # Test the result comes back. 241 | results = db.search(":colon") 242 | 243 | assert_equal 1, results.length 244 | results.all? { |doc| doc.instance_of? Wgit::Document } 245 | end 246 | 247 | def test_search__default_search_fields 248 | # => title (2 hit * 2 weight == 4) 249 | # => text (3 hits * 1 weight == 3) 250 | # => keywords (1 hits * 2 weight == 2) 251 | # => keywords (1 hits * 2 weight == 2) 252 | # ------------------------------------ 253 | # => Total match score: == 11 254 | test_doc = Wgit::Document.new({ 255 | "url" => "http://www.mytestsite.com/home", 256 | "title" => "abc abc", 257 | "keywords" => ["abc 2", "abc 3"], 258 | "text" => "abc abc abc" 259 | }) 260 | seed { doc test_doc } 261 | 262 | results = db.search("abc") 263 | 264 | assert_equal(1, results.size) 265 | assert_equal(11, results.first.score) 266 | end 267 | 268 | def test_search__set_search_fields 269 | Wgit::Document.define_extractor(:code, nil) 270 | Wgit::Model.set_search_fields(%i[code foo]) # @code exists, @foo doesn't. 271 | 272 | test_doc = Wgit::Document.new("http://www.mytestsite.com/home") 273 | test_doc.instance_variable_set(:@code, 'print("hello world")') # Score of 1. 274 | seed { doc test_doc } 275 | 276 | results = db.search("hello") 277 | 278 | assert_equal(1, results.size) 279 | assert_equal(1, results.first.score) 280 | end 281 | 282 | def test_size 283 | # An empty db has two empty arrays taking up 4 bytes. 
# Test class for the Wgit Logger methods.
class TestLogger < TestHelper
  # Runs before every test.
  def setup; end

  # Runs after every test; restores the default logger at WARN level so a
  # logger swapped in by a test does not leak into later tests.
  def teardown
    Wgit.use_default_logger
    Wgit.logger.level = Logger::WARN
  end

  def test_logger
    assert_kind_of Logger, Wgit.logger
  end

  # Assigns a logger and reads it back through the getter. Asserting on the
  # assignment expression itself would always pass, because an assignment
  # evaluates to its right-hand side whatever the setter does.
  def test_logger=
    stdout = Logger.new $stdout
    Wgit.logger = stdout

    assert_same stdout, Wgit.logger
  end

  def test_default_logger
    assert_kind_of Logger, Wgit.default_logger
    assert_equal 1, Wgit.default_logger.level
    assert_equal "wgit", Wgit.default_logger.progname
  end

  def test_use_default_logger
    assert_kind_of Logger, Wgit.use_default_logger
  end
end
# Builds the model hash for a crawled Wgit::Url and checks that it has
# exactly the expected keys and that none of its values is nil.
def test_url
  crawled_url = Wgit::Url.new(
    "http://example.com",
    crawled: true,
    date_crawled: Time.now,
    crawl_duration: 1.3
  )
  expected_keys = %w[crawl_duration crawled date_crawled redirects url]

  model = Wgit::Model.url(crawled_url)

  assert_equal expected_keys, model.keys.sort
  assert model.values.none?(&:nil?)
end
81 | end 82 | 83 | def test_document__include_html 84 | doc = Wgit::Document.new "http://example.com".to_url, "Hello" 85 | 86 | Wgit::Model.include_doc_html = true 87 | model = Wgit::Model.document(doc) 88 | 89 | assert Wgit::Model.include_doc_html 90 | assert_equal "Hello", model["html"] 91 | end 92 | 93 | def test_document__include_score 94 | doc = Wgit::Document.new({ 95 | "url" => "http://example.com", 96 | "score" => 10.5 97 | }) 98 | 99 | Wgit::Model.include_doc_score = true 100 | model = Wgit::Model.document(doc) 101 | 102 | assert Wgit::Model.include_doc_score 103 | assert_equal 10.5, model["score"] 104 | end 105 | end 106 | -------------------------------------------------------------------------------- /test/test_readme.rb: -------------------------------------------------------------------------------- 1 | require_relative "helpers/test_helper" 2 | 3 | # Test class for code snippets in the README.md. 4 | # WARNING: Certain tests will clear down the DB prior to the test run. 5 | class TestReadme < TestHelper 6 | include Wgit::DSL 7 | 8 | # Runs before every test. 9 | def setup; end 10 | 11 | def test_quotes__dsl 12 | ### PUT README CODE BELOW ### 13 | 14 | # require 'wgit' 15 | # require 'json' 16 | 17 | # include Wgit::DSL 18 | 19 | start "http://quotes.toscrape.com/tag/humor/" 20 | follow "//li[@class='next']/a/@href" 21 | 22 | extract :quotes, "//div[@class='quote']/span[@class='text']", singleton: false 23 | extract :authors, "//div[@class='quote']/span/small", singleton: false 24 | 25 | quotes = [] 26 | 27 | crawl_site do |doc| 28 | doc.quotes.zip(doc.authors).each do |arr| 29 | quotes << { 30 | quote: arr.first, 31 | author: arr.last 32 | } 33 | end 34 | end 35 | 36 | # puts JSON.generate(quotes) 37 | 38 | ### PUT README CODE ABOVE ### 39 | 40 | assert_equal 12, quotes.size 41 | 42 | # Clean up the extractors for other tests. 
43 | Wgit::Document.remove_extractor :quotes 44 | Wgit::Document.remove_extractor :authors 45 | end 46 | 47 | def test_quotes__dsl_index 48 | ### PUT README CODE BELOW ### 49 | 50 | # require 'wgit' 51 | 52 | # include Wgit::DSL 53 | 54 | # Wgit.logger.level = Logger::WARN 55 | 56 | # ENV['WGIT_CONNECTION_STRING'] = 'mongodb://user:password@localhost/crawler' 57 | 58 | start "http://quotes.toscrape.com/tag/humor/" 59 | follow "//li[@class='next']/a/@href" 60 | 61 | extract :quotes, "//div[@class='quote']/span[@class='text']", singleton: false 62 | extract :authors, "//div[@class='quote']/span/small", singleton: false 63 | 64 | index_site 65 | results = search "prejudice", stream: nil 66 | 67 | ### PUT README CODE ABOVE ### 68 | 69 | assert_equal 1, results.size 70 | assert_equal "http://quotes.toscrape.com/tag/humor/page/2/", results.first.url 71 | 72 | # Clean up the extractors for other tests. 73 | Wgit::Document.remove_extractor :quotes 74 | Wgit::Document.remove_extractor :authors 75 | end 76 | 77 | def test_quotes__classes 78 | ### PUT README CODE BELOW ### 79 | 80 | # require 'wgit' 81 | # require 'json' 82 | 83 | crawler = Wgit::Crawler.new 84 | url = Wgit::Url.new("http://quotes.toscrape.com/tag/humor/") 85 | quotes = [] 86 | 87 | Wgit::Document.define_extractor(:quotes, "//div[@class='quote']/span[@class='text']", singleton: false) 88 | Wgit::Document.define_extractor(:authors, "//div[@class='quote']/span/small", singleton: false) 89 | 90 | crawler.crawl_site(url, follow: "//li[@class='next']/a/@href") do |doc| 91 | doc.quotes.zip(doc.authors).each do |arr| 92 | quotes << { 93 | quote: arr.first, 94 | author: arr.last 95 | } 96 | end 97 | end 98 | 99 | # puts JSON.generate(quotes) 100 | 101 | ### PUT README CODE ABOVE ### 102 | 103 | assert_equal 12, quotes.size 104 | 105 | # Clean up the extractors for other tests. 
# Test class for the Response methods.
class TestResponse < TestHelper
  # Run non DB tests in parallel for speed.
  parallelize_me!

  # Runs before every test.
  def setup; end

  def test_initialize
    r = Wgit::Response.new

    assert_empty r.body
    assert_empty r.headers
    assert_empty r.redirections
    assert_equal 0.0, r.total_time
  end

  def test_add_total_time
    r = Wgit::Response.new

    assert_equal 0.2, r.add_total_time(0.2)
    assert_equal 0.5, r.add_total_time(0.3)
  end

  def test_body_equals
    r = Wgit::Response.new

    r.body = nil
    assert_equal "", r.body

    r.body = "hello world"
    assert_equal "hello world", r.body
  end

  # Covers both branches of #body_or_nil: nil while the body is empty and the
  # body itself once one is set.
  def test_body_or_nil
    r = Wgit::Response.new
    assert_nil r.body_or_nil

    r.body = "hello world"
    # Assert on body_or_nil, the method under test, rather than on #body.
    assert_equal "hello world", r.body_or_nil
  end

  def test_failure?
    r = Wgit::Response.new
    assert r.failure?

    r.status = 500
    refute r.failure?

    r.status = 0
    assert r.failure?

    r.status = 200
    refute r.failure?
  end

  def test_headers_equals
    r = Wgit::Response.new

    r.headers = { "Content-Type" => "text/html" }
    assert_equal({ content_type: "text/html" }, r.headers)
  end

  def test_not_found?
    r = Wgit::Response.new

    r.status = 400
    refute r.not_found?

    r.status = 404
    assert r.not_found?
  end

  def test_ok?
    r = Wgit::Response.new

    r.status = 204
    refute r.ok?

    r.status = 200
    assert r.ok?
  end

  def test_redirect?
    r = Wgit::Response.new
    refute r.redirect?

    r.status = 200
    refute r.redirect?

    r.status = 301
    assert r.redirect?
  end

  def test_redirect_count
    r = Wgit::Response.new
    r.redirections["a"] = "foo"
    r.redirections["b"] = "bar"

    assert_equal 2, r.redirect_count
  end

  def test_size
    r = Wgit::Response.new
    assert_equal 0, r.size

    r.body = "hello world"
    assert_equal 11, r.size
  end

  def test_status_equals
    r = Wgit::Response.new

    r.status = 0
    assert_nil r.status

    r.status = 200
    assert_equal 200, r.status
  end

  def test_success?
    r = Wgit::Response.new
    refute r.success?

    r.status = 200
    assert r.success?

    r.status = 500
    assert r.success?

    r.status = 0
    refute r.success?
  end

  def test_no_index?
    r = Wgit::Response.new
    refute r.no_index?

    r.headers = { "X-Robots-Tag" => "index" }
    refute r.no_index?

    r.headers = { "X-Robots-Tag" => "noindex" }
    assert r.no_index?
  end
end
# Gem specification for wgit: identity, packaged files, executable, metadata
# and runtime/development dependencies.
Gem::Specification.new do |s|
  # Only allow gem pushes to rubygems.org. This guard has to run before the
  # first use of `s.metadata`; placed after the assignment it could never
  # fire ahead of the error it is meant to explain.
  unless s.respond_to?(:metadata)
    raise "Only RubyGems 2.0 or newer can protect against public gem pushes"
  end

  s.name = "wgit"
  s.version = Wgit::VERSION
  s.date = Time.now.strftime("%Y-%m-%d")
  s.author = "Michael Telford"
  s.email = "michael.telford@live.com"
  s.homepage = "https://github.com/michaeltelford/wgit"
  s.license = "MIT"

  s.summary = <<~TEXT
    Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically extract the data you want from the web.
  TEXT
  s.description = <<~TEXT
    Wgit was primarily designed to crawl static HTML websites to index and search their content - providing the basis of any search engine; but Wgit is suitable for many application domains including: URL parsing, data mining and statistical analysis.
  TEXT

  s.require_paths = %w[lib]
  s.files = Dir[
    "./lib/**/*.rb",
    "bin/wgit",
    "*.md",
    "LICENSE.txt",
    ".yardopts"
  ]
  s.bindir = "bin"
  s.executable = "wgit"
  s.post_install_message = "Added the 'wgit' executable to $PATH"
  s.metadata = {
    "yard.run" => "yri",
    "source_code_uri" => "https://github.com/michaeltelford/wgit",
    "changelog_uri" => "https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md",
    "bug_tracker_uri" => "https://github.com/michaeltelford/wgit/issues",
    "documentation_uri" => "https://www.rubydoc.info/gems/wgit",
    "allowed_push_host" => "https://rubygems.org"
  }

  s.platform = Gem::Platform::RUBY
  s.required_ruby_version = ">= 3", "< 4"

  s.add_runtime_dependency "addressable", "~> 2.8"
  s.add_runtime_dependency "base64", "~> 0.2"
  s.add_runtime_dependency "ferrum", "~> 0.14"
  s.add_runtime_dependency "mongo", "~> 2.19"
  s.add_runtime_dependency "nokogiri", "~> 1.15"
  s.add_runtime_dependency "typhoeus", "~> 1.4"

  s.add_development_dependency "byebug", "~> 11.1"
  s.add_development_dependency "dotenv", "~> 2.8"
  s.add_development_dependency "maxitest", "~> 5.4"
  s.add_development_dependency "pry", "~> 0.14"
  s.add_development_dependency "rubocop", "~> 1.57"
  s.add_development_dependency "toys", "~> 0.15"
  s.add_development_dependency "webmock", "~> 3.19"
  s.add_development_dependency "yard", "~> 0.9"
end