├── .dockerignore
├── .github
│   ├── FUNDING.yml
│   └── workflows
│       └── docker-image.yml
├── .gitignore
├── Dockerfile
├── Gemfile
├── Gemfile.lock
├── README.md
├── cewl.rb
├── cewl_lib.rb
├── changelog.md
├── compose.yml
└── fab.rb
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .gitignore
3 | README.md
4 | fab.rb
5 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: digininja
2 | custom: https://digi.ninja
3 |
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 |
3 | on:
4 |   push:
5 |     branches: [ "master" ]
6 |   pull_request:
7 |     branches: [ "master" ]
8 |
9 | jobs:
10 |
11 |   build:
12 |
13 |     runs-on: ubuntu-latest
14 |
15 |     steps:
16 |     - uses: actions/checkout@v4
17 |
18 |     - name: Login to GitHub Container Registry
19 |       uses: docker/login-action@v1
20 |       with:
21 |         registry: ghcr.io
22 |         username: ${{ github.actor }}
23 |         password: ${{ secrets.GITHUB_TOKEN }}
24 |
25 |     - name: Build the Docker image
26 |       run: |
27 |         IMAGE_ID=ghcr.io/${{ github.repository_owner }}/cewl
28 |         IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]')
29 |         VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,')
30 |         [[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//')
31 |         [ "$VERSION" == "master" ] && VERSION=latest
32 |         COMMIT=$(echo "${{ github.sha }}" | cut -c 1-7)
33 |         echo IMAGE_ID=$IMAGE_ID
34 |         echo VERSION=$VERSION
35 |         echo COMMIT=$COMMIT
36 |         docker image build --tag cewl .
37 |         docker image tag cewl $IMAGE_ID:$VERSION
38 |         docker image tag cewl $IMAGE_ID:$COMMIT
39 |         docker image push $IMAGE_ID:$VERSION
40 |         docker image push $IMAGE_ID:$COMMIT
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Vim swap files
2 | .*.swp
3 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ruby:3-alpine
2 |
3 | ENV RUBYOPT "rrubygems"
4 |
5 | COPY Gemfile /usr/src/CeWL/
6 | WORKDIR /usr/src/CeWL
7 |
8 | RUN apk add gcompat
9 | RUN set -ex \
10 |     && apk add --no-cache --virtual .build-deps build-base \
11 |     && gem install bundler \
12 |     && bundle install \
13 |     && apk del .build-deps
14 |
15 | COPY . /usr/src/CeWL
16 |
17 | WORKDIR /host
18 | ENTRYPOINT ["/usr/src/CeWL/cewl.rb"]
19 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | gem 'mime'
3 | gem 'mime-types', ">=3.3.1"
4 | gem 'mini_exiftool'
5 | gem 'nokogiri'
6 | gem 'rexml'
7 | gem 'rubyzip'
8 | gem 'spider'
9 |
--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 |   remote: https://rubygems.org/
3 |   specs:
4 |     mime (0.4.4)
5 |     mime-types (3.5.2)
6 |       mime-types-data (~> 3.2015)
7 |     mime-types-data (3.2024.0604)
8 |     mini_exiftool (2.11.0)
9 |     nokogiri (1.16.5-aarch64-linux)
10 |       racc (~> 1.4)
11 |     nokogiri (1.16.5-arm-linux)
12 |       racc (~> 1.4)
13 |     nokogiri (1.16.5-arm64-darwin)
14 |       racc (~> 1.4)
15 |     nokogiri (1.16.5-x86-linux)
16 |       racc (~> 1.4)
17 |     nokogiri (1.16.5-x86_64-darwin)
18 |       racc (~> 1.4)
19 |     nokogiri (1.16.5-x86_64-linux)
20 |       racc (~> 1.4)
21 |     racc (1.8.0)
22 |     rexml (3.3.6)
23 |       strscan
24 |     rubyzip (2.3.2)
25 |     spider (0.5.4)
26 |     strscan (3.1.0)
27 |
28 | PLATFORMS
29 |   aarch64-linux
30 |   arm-linux
31 |   arm64-darwin
32 |   x86-linux
33 |   x86_64-darwin
34 |   x86_64-linux
35 |
36 | DEPENDENCIES
37 |   mime
38 |   mime-types (>= 3.3.1)
39 |   mini_exiftool
40 |   nokogiri
41 |   rexml
42 |   rubyzip
43 |   spider
44 |
45 | BUNDLED WITH
46 |    2.5.9
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CeWL - Custom Word List generator
2 |
3 | Copyright(c) 2024, Robin Wood
4 |
5 | Based on a discussion on PaulDotCom (episode 129) about creating custom word lists by spidering a target's website and collecting unique words, I decided to write CeWL, the Custom Word List generator. CeWL is a Ruby app which spiders a given URL to a specified depth, optionally following external links, and returns a list of words which can then be used with password crackers such as John the Ripper.
6 |
7 | By default, CeWL sticks to just the site you have specified and will go to a depth of 2 links; this behaviour can be changed by passing arguments. Be careful if setting a large depth and allowing it to go offsite, you could end up drifting onto a lot of other domains. All words of three characters and over are output to stdout. This length can be increased and the words can be written to a file rather than the screen so the app can be automated.
8 |
9 | CeWL also has an associated command line app, FAB (Files Already Bagged), which uses the same meta data extraction techniques to create author/creator lists from files already downloaded.
10 |
11 | For anyone running CeWL with Ruby 2.7, you might get some warnings in this style:
12 |
13 | ```
14 | .../ruby-2.7.0/gems/mime-types-3.2.2/lib/mime/types/logger.rb:30: warning: `_1' is reserved for numbered parameter; consider another name
15 | ```
16 | This is due to a new feature introduced in 2.7 which conflicts with one line of code in the logger script from the mime-types gem. There is an update for it in the [gem's repo](https://github.com/mime-types/ruby-mime-types/commit/c44673179d24e495e5fb93282a87d37f09925d25#diff-f0a644249326afd54e7a0b90c807f8a6) so hopefully that will be released soon. Till then, as far as I can tell, the warning does not affect CeWL in any way.
If, for aesthetics, you want to hide the warning, you can run the script as follows:
17 |
18 | ```
19 | ruby -W0 ./cewl.rb
20 | ```
21 |
22 | Homepage: <https://digi.ninja/projects/cewl.php>
23 |
24 | GitHub: <https://github.com/digininja/CeWL>
25 |
26 | ## Pronunciation
27 |
28 | Seeing as I was asked, CeWL is pronounced "cool".
29 |
30 | ## Installation
31 |
32 | CeWL needs the following gems to be installed:
33 |
34 | * mime
35 | * mime-types
36 | * mini_exiftool
37 | * nokogiri
38 | * rubyzip
39 | * spider
40 |
41 | The easiest way to install these gems is with Bundler:
42 |
43 | ```
44 | gem install bundler
45 | bundle install
46 | ```
47 |
48 | Alternatively, you can install them manually with:
49 |
50 | ```
51 | gem install xxx
52 | ```
53 |
54 | The `mini_exiftool` gem also requires the exiftool application to be installed.
55 |
56 | Assuming you cloned the GitHub repo, the script should be executable by default, but if not, you can make it executable with:
57 |
58 | ```
59 | chmod u+x ./cewl.rb
60 | ```
61 |
62 | The project page on my site gives some tips on solving common problems people
63 | have encountered while running CeWL - https://digi.ninja/projects/cewl.php
64 |
65 | ## Usage
66 |
67 | ```
68 | ./cewl.rb
69 |
70 | CeWL 5.5.2 (Grouping) Robin Wood (robin@digi.ninja) (https://digi.ninja/)
71 | Usage: cewl [OPTIONS] ... <url>
72 |
73 | OPTIONS:
74 | -h, --help: Show help.
75 | -k, --keep: Keep the downloaded file.
76 | -d <x>,--depth <x>: Depth to spider to, default 2.
77 | -m, --min_word_length: Minimum word length, default 3.
78 | -o, --offsite: Let the spider visit other sites.
79 | -w, --write: Write the output to the file.
80 | -u <agent>, --ua <agent>: User agent to send.
81 | -n, --no-words: Don't output the wordlist.
82 | -a, --meta: Include meta data.
83 | --meta_file file: Output file for meta data.
84 | -e, --email: Include email addresses.
85 | --email_file <file>: Output file for email addresses.
86 | --meta-temp-dir <dir>: The temporary directory used by exiftool when parsing files, default /tmp.
87 | -c, --count: Show the count for each word found.
88 | -v, --verbose: Verbose.
89 | --debug: Extra debug information.
90 |
91 | Authentication
92 | --auth_type: Digest or basic.
93 | --auth_user: Authentication username.
94 | --auth_pass: Authentication password.
95 |
96 | Proxy Support
97 | --proxy_host: Proxy host.
98 | --proxy_port: Proxy port, default 8080.
99 | --proxy_username: Username for proxy, if required.
100 | --proxy_password: Password for proxy, if required.
101 |
102 | Headers
103 | --header, -H: In format name:value - can pass multiple.
104 |
105 | <url>: The site to spider.
106 | ```
107 |
108 | ### Running CeWL in a Docker container
109 |
110 |
111 | To quickly use CeWL with Docker, you can use the official `ghcr.io/digininja/cewl` image:
112 |
113 | ```sh
114 | docker run -it --rm -v "${PWD}:/host" ghcr.io/digininja/cewl [OPTIONS] ... <url>
115 | ```
116 |
117 | You can also build it locally:
118 | ```sh
119 | docker build -t cewl .
120 | docker run -it --rm -v "${PWD}:/host" cewl [OPTIONS] ... <url>
121 | ```
122 |
123 | I am going to stress here: I am not going to be offering any support for this. The work was done by [@loris-intergalactique](https://github.com/loris-intergalactique), who has offered to field any questions on it and give support. I don't use or know Docker, so please, don't ask me for help.
124 |
125 | ## Licence
126 |
127 | This project is released under the Creative Commons Attribution-Share Alike 2.0 UK: England & Wales
128 |
129 | <http://creativecommons.org/licenses/by-sa/2.0/uk/>
130 |
131 | Alternatively, you can use GPL-3+ instead of the original licence.
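## Example runs

As a rough sketch of typical invocations built from the options listed above (the target URL and output file names here are placeholders, not from the project docs):

```sh
# Spider the target to depth 2, keep words of 5+ characters and
# write the word list to wordlist.txt
./cewl.rb -d 2 -m 5 -w wordlist.txt https://example.com

# Also collect email addresses and document meta data into separate files
./cewl.rb -e --email_file emails.txt -a --meta_file meta.txt https://example.com
```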
132 |
133 |
--------------------------------------------------------------------------------
/cewl.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | #encoding: UTF-8
3 |
4 | # == CeWL: Custom Word List Generator
5 | #
6 | # CeWL will spider a target site and generate the following lists:
7 | #
8 | # * A word list of all unique words found on the target site
9 | # * A list of all email addresses found in mailto links
10 | # * A list of usernames/author details from meta data found in any documents on the site
11 | # * Groups of words up to the specified group size
12 | #
13 | # URL: The site to spider.
14 | #
15 | # Author:: Robin Wood (robin@digi.ninja)
16 | # Copyright:: Copyright (c) Robin Wood 2018
17 | # Licence:: CC-BY-SA 2.0 or GPL-3+
18 | #
19 |
20 | VERSION = "6.2.1 (More Fixes)"
21 |
22 | puts "CeWL #{VERSION} Robin Wood (robin@digi.ninja) (https://digi.ninja/)\n"
23 |
24 | begin
25 | 	require 'getoptlong'
26 | 	require 'spider'
27 | 	require 'nokogiri'
28 | 	require 'net/http'
29 | rescue LoadError => e
30 | 	# Catch error and provide feedback on installing gem
31 | 	if e.to_s =~ /cannot load such file -- (.*)/
32 | 		missing_gem = $1
33 | 		puts "\nError: #{missing_gem} gem not installed\n"
34 | 		puts "	  Run 'bundle install' to install all the required gems\n\n"
35 | 		exit 2
36 | 	else
37 | 		puts "There was an error loading the gems:\n"
38 | 		puts e.to_s
39 | 		exit 2
40 | 	end
41 | end
42 |
43 | require_relative 'cewl_lib'
44 |
45 | # Doing this so I can override the allowed? function which normally checks
46 | # the robots.txt file
47 | class MySpider<Spider
48 | 	@@proxy_host = nil
49 | 	@@proxy_port = nil
50 | 	@@proxy_username = nil
51 | 	@@proxy_password = nil
52 |
53 | 	@@auth_type = nil
54 | 	@@auth_user = nil
55 | 	@@auth_password = nil
56 |
57 | 	@@headers = nil
58 |
59 | 	@@verbose = false
60 | 	@@debug = false
61 |
62 | 	# Store the proxy details to pass on to each spider instance
63 | 	def self.proxy(host, port = nil, username = nil, password = nil)
64 | 		@@proxy_host = host
65 | 		@@proxy_port = port
66 | 		@@proxy_username = username
67 | 		@@proxy_password = password
68 | 	end
69 |
70 | 	# Store the credentials for basic or digest authentication
71 | 	def self.auth_creds(type, user, password)
72 | 		@@auth_type = type
73 | 		@@auth_user = user
74 | 		@@auth_password = password
75 | 	end
76 |
77 | 	# Store any custom headers to send with every request
78 | 	def self.headers(headers)
79 | 		@@headers = headers
80 | 	end
81 |
82 | 	def self.verbose(val)
83 | 		@@verbose = val
84 | 	end
85 |
86 | 	def self.debug(val)
87 | 		@@debug = val
88 | 	end
89 |
90 | 	# Create an instance of my spider class rather than the standard
91 | 	# SpiderInstance. The instance ignores robots.txt, processes every
92 | 	# page it is given and has all the proxy, auth and header settings
93 | 	# stored above passed through to it before the spidering starts.
94 | 	# The robot rules object is still needed by the underlying Spider
95 | 	# class so one is created here even though it is never consulted.
96 | 	def self.start_at(a_url, &block)
97 | 		rules = RobotRules.new('CeWL')
98 |
99 | 		a_spider = MySpiderInstance.new({nil => a_url}, [], rules, [])
100 |
101 | 		a_spider.headers = @@headers
102 |
103 | 		a_spider.auth_type = @@auth_type
104 | 		a_spider.auth_user = @@auth_user
105 | 		a_spider.auth_password = @@auth_password
106 |
107 | 		a_spider.proxy_host = @@proxy_host
108 | 		a_spider.proxy_port = @@proxy_port
109 | 		a_spider.proxy_username = @@proxy_username
110 | 		a_spider.proxy_password = @@proxy_password
111 |
112 | 		a_spider.verbose = @@verbose
113 | 		a_spider.debug = @@debug
114 | 		block.call(a_spider)
115 | 		a_spider.start!
116 | 	end
117 | end
118 |
119 | # My version of the spider class which allows all files
120 | # to be processed
121 | class MySpiderInstance<SpiderInstance
122 |
123 | 	# Writers for all the settings which MySpider needs to pass
124 | 	# through to the instance which actually does the spidering,
125 | 	# the class level values can't be read from in here
126 | 	attr_writer :headers
127 |
128 | 	attr_writer :auth_type
129 | 	attr_writer :auth_user
130 | 	attr_writer :auth_password
131 |
132 | 	attr_writer :proxy_host
133 | 	attr_writer :proxy_port
134 | 	attr_writer :proxy_username
135 | 	attr_writer :proxy_password
136 |
137 | 	attr_writer :verbose
138 | 	attr_writer :debug
139 |
140 | 	# Set by the SIGINT trap so that a run can be stopped cleanly
141 | 	attr_writer :interrupt
142 |
143 | 	# The spider gem normally checks a site's robots.txt before
144 | 	# visiting a page, override the check so that every URL is
145 | 	# allowed
146 | 	def allowed?(a_url, parsed_url)
147 | 		true
148 | 	end
149 |
150 | 	# The main spidering loop. Pop a set of URLs off the stack,
151 | 	# fetch each page, fire the registered callbacks for it and
152 | 	# then queue up any new URLs the page generates. Keep going
153 | 	# till the stack of URLs to visit is empty or the user hits
154 | 	# ctrl-c
155 | 	def start! #:nodoc:
156 | 		interrupted = false
157 | 		trap("SIGINT") { interrupted = true }
158 |
159 | 		begin
160 | 			next_urls = @next_urls.pop
161 |
162 | 			next_urls.each do |prior_url, urls|
163 | 				# Pair each URL up with its parsed version, dropping
164 | 				# any which fail to parse, then filter out the ones
165 | 				# which shouldn't be visited
166 | 				urls.map do |a_url|
167 | 					[a_url, (URI.parse(a_url) rescue nil)]
168 | 				end.select do |a_url, parsed_url|
169 | 					allowable_url?(a_url, parsed_url)
170 | 				end.each do |a_url, parsed_url|
171 | 					# Call the setup hook, if one was registered,
172 | 					# before the page is requested
173 | 					@setup.call(a_url) unless @setup.nil?
174 |
175 | 					get_page(parsed_url) do |response|
176 | 						# Hand the response to all the registered
177 | 						# callbacks so that the word list, email
178 | 						# and meta data processing can happen for
179 | 						# the page, whatever its response code
180 | 						do_callbacks(a_url, response, prior_url)
181 |
182 | 						# Parse the page for new links and add
183 | 						# each one to the stack of URLs to visit,
184 | 						# keyed on the page it was found on so
185 | 						# that the depth checks in the tree work
186 | 						generate_next_urls(a_url, response).each do |a_next_url|
187 | 							@next_urls.push a_url => a_next_url
188 | 						end
189 | 						#exit if interrupted
190 | 					end
191 |
192 | 					@teardown.call(a_url) unless @teardown.nil?
193 | 					throw :ctrl_c if @interrupt
194 | 				end
195 | 			end
196 | 		end while !@next_urls.empty?
197 | 	end
198 |
199 | 	def get_page(uri, &block) #:nodoc:
200 | 		@seen << uri
201 |
202 | 		trap("SIGINT") { puts 'Hold on, stopping here ...'; @interrupt = true }
203 | 		begin
204 | 			if @proxy_host.nil?
205 | http = Net::HTTP.new(uri.host, uri.port) 206 | 207 | if uri.scheme == 'https' 208 | http.use_ssl = true 209 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE 210 | end 211 | else 212 | proxy = Net::HTTP::Proxy(@proxy_host, @proxy_port, @proxy_username, @proxy_password) 213 | begin 214 | if uri.scheme == 'https' 215 | http = proxy.start(uri.host, uri.port, :use_ssl => true, :verify_mode => OpenSSL::SSL::VERIFY_NONE) 216 | else 217 | http = proxy.start(uri.host, uri.port) 218 | end 219 | rescue => e 220 | puts "\nFailed to connect to the proxy (#{@proxy_host}:#{@proxy_port})\n\n" 221 | exit 2 222 | end 223 | end 224 | 225 | req = Net::HTTP::Get.new(uri.request_uri) 226 | @headers.each_pair do |header, value| 227 | req[header] = value 228 | end 229 | 230 | if @auth_type 231 | case @auth_type 232 | when "digest" 233 | uri.user = @auth_user 234 | uri.password = @auth_password 235 | 236 | res = http.request req 237 | 238 | if res['www-authenticate'] 239 | digest_auth = Net::HTTP::DigestAuth.new 240 | auth = digest_auth.auth_header uri, res['www-authenticate'], 'GET' 241 | 242 | req = Net::HTTP::Get.new uri.request_uri 243 | req.add_field 'Authorization', auth 244 | end 245 | 246 | when "basic" 247 | req.basic_auth @auth_user, @auth_password 248 | end 249 | end 250 | 251 | res = http.request(req) 252 | 253 | if res.redirect? 254 | puts "Redirect URL" if @debug 255 | base_url = uri.to_s[0, uri.to_s.rindex('/')] 256 | new_url = URI.parse(construct_complete_url(base_url, res['Location'])) 257 | 258 | # If auth is used then a name:pass@ gets added, this messes the tree 259 | # up so easiest to just remove it 260 | current_uri = uri.to_s.gsub(/:\/\/[^:]*:[^@]*@/, "://") 261 | @next_urls.push current_uri => new_url.to_s 262 | elsif res.code == "401" 263 | puts "Authentication required, can't continue on this branch - #{uri}" if @verbose 264 | else 265 | block.call(res) 266 | end 267 | rescue Zlib::DataError => e 268 | puts "Error in Zlib decompressing data on #{uri}, moving on regardless" 269 | rescue SocketError, Errno::EHOSTUNREACH => e 270 | puts "Couldn't hit the site #{uri}, moving on" 271 | rescue NoMethodError => e 272 | if @verbose 273 | puts "Unable to process URL" 274 | puts "Message is #{e.to_s}" 275 | puts e.backtrace 276 | end 277 | rescue => e 278 | puts "\nUnable to connect to the site (#{uri.scheme}://#{uri.host}:#{uri.port}#{uri.request_uri})" 279 | 280 | if @verbose 281 | puts "\nThe following error may help:" 282 | puts e.to_s 283 | puts e.backtrace 284 | puts "\nCaller" 285 | puts caller 286 | else 287 | puts "Run in verbose mode (-v) for more information" 288 | end 289 | 290 | puts "\n\n" 291 | end 292 | end 293 | 294 | # Overriding so that I can get it to ignore direct names - i.e. #name 295 | def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc: 296 | return nil if additional_url =~ /^#/ 297 | 298 | parsed_additional_url ||= URI.parse(additional_url) 299 | if parsed_additional_url.scheme.nil? 300 | u = base_url.is_a?(URI) ? base_url : URI.parse(base_url) 301 | if additional_url[0].chr == '/' 302 | url = "#{u.scheme}://#{u.host}:#{u.port}#{additional_url}" 303 | elsif u.path.nil? 
|| u.path == ''
304 | 			url = "#{u.scheme}://#{u.host}:#{u.port}/#{additional_url}"
305 | 		elsif u.path[0].chr == '/'
306 | 			url = "#{u.scheme}://#{u.host}:#{u.port}#{u.path}/#{additional_url}"
307 | 		else
308 | 			url = "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{additional_url}"
309 | 		end
310 | 	else
311 | 		url = additional_url
312 | 	end
313 | 	return url
314 | end
315 |
316 | # Overriding the original spider one as it doesn't find hrefs very well
317 | def generate_next_urls(a_url, resp) #:nodoc:
318 | 	if @debug
319 | 		puts "a_url = #{a_url}"
320 | 		puts "resp = #{resp}"
321 | 	end
322 | 	web_page = resp.body
323 | 	if URI.parse(a_url).path.empty?
324 | 		base_url = a_url
325 | 	else
326 | 		base_url = a_url[0, a_url.rindex('/')]
327 | 	end
328 | 	puts "base_url: #{base_url}" if @debug
329 |
330 | 	doc = Nokogiri::HTML(web_page)
331 | 	links = doc.css('a').map { |a| a['href'] }
332 |
333 | 	puts "links = #{links.inspect}" if @debug
334 | 	links.map do |link|
335 | 		begin
336 | 			if link.nil?
337 | 				nil
338 | 			else
339 | 				begin
340 | 					parsed_link = URI.parse(link)
341 | 					parsed_link.fragment == '#' ? nil : construct_complete_url(base_url, link, parsed_link)
342 | 				rescue
343 | 					nil
344 | 				end
345 | 			end
346 | 		rescue => e
347 | 			puts "\nThere was an error generating URL list"
348 | 			puts "Error: #{e.inspect}"
349 | 			puts e.backtrace
350 | 			exit 2
351 | 		end
352 | 	end.compact
353 | end
354 | end
355 |
356 | # A node for a tree
357 | class TreeNode
358 | 	attr :value
359 | 	attr :depth
360 | 	attr :key
361 | 	attr :visited, true
362 |
363 | 	def initialize(key, value, depth)
364 | 		@key = key
365 | 		@value = value
366 | 		@depth = depth
367 | 		@visited = false
368 | 	end
369 |
370 | 	def to_s
371 | 		if key.nil?
372 | 			return "key=nil value=#{@value} depth=#{@depth.to_s} visited=#{@visited.to_s}"
373 | 		else
374 | 			return "key=#{@key} value=#{@value} depth=#{@depth.to_s} visited=#{@visited.to_s}"
375 | 		end
376 | 	end
377 |
378 | 	def to_url_hash
379 | 		return({@key => @value})
380 | 	end
381 | end
382 |
383 | # A tree structure
384 | class Tree
385 | 	attr :data
386 | 	attr_writer :debug
387 | 	attr_writer :max_depth
388 | 	@children
389 |
390 | 	# Get the maximum depth the tree can grow to
391 | 	def max_depth
392 | 		@max_depth
393 | 	end
394 |
395 | 	# Set the max depth the tree can grow to
396 | 	def max_depth=(val)
397 | 		@max_depth = Integer(val)
398 | 	end
399 |
400 | 	# As this is used to work out if there are any more nodes to process it isn't a true empty
401 | 	def empty?
402 | 		if !@data.visited
403 | 			return false
404 | 		else
405 | 			@children.each { |node|
406 | 				return false if !node.data.visited
407 | 			}
408 | 		end
409 | 		return true
410 | 	end
411 |
412 | 	# The constructor
413 | 	def initialize(key=nil, value=nil, depth=0, debug=false)
414 | 		@data = TreeNode.new(key, value, depth)
415 | 		@children = []
416 | 		@max_depth = 2
417 | 	end
418 |
419 | 	# Iterator
420 | 	def each
421 | 		yield @data
422 | 		@children.each do |child_node|
423 | 			child_node.each { |e| yield e }
424 | 		end
425 | 	end
426 |
427 | 	# Remove an item from the tree
428 | 	def pop
429 | 		if !@data.visited
430 | 			@data.visited = true
431 | 			return @data.to_url_hash
432 | 		else
433 | 			@children.each { |node|
434 | 				if !node.data.visited
435 | 					node.data.visited = true
436 | 					return node.data.to_url_hash
437 | 				end
438 | 			}
439 | 		end
440 | 		return nil
441 | 	end
442 |
443 | 	# Push an item onto the tree
444 | 	def push(value)
445 | 		puts "Adding #{value} to the tree" if @debug
446 | 		key = value.keys.first
447 | 		value = value.values_at(key).first
448 |
449 | 		if key.nil?
450 | 			@data = TreeNode.new(key, value, 0)
451 | 		else
452 | 			# If the depth is 0 then don't add anything to the tree
453 | 			return if @max_depth == 0
454 | 			if key == @data.value
455 | 				child = Tree.new(key, value, @data.depth + 1, @debug)
456 | 				@children << child
457 | 			else
458 | 				@children.each { |node|
459 | 					# Ignore the max depth for mailto links.
460 | 					# This is not a good way to do this, but it will work for now
461 | 					# and we all know dirty hacks stay around forever so don't
462 | 					# expect this to be fixed for a while.
463 | 					if value =~ /^mailto:/ then
464 | 						if node.data.value == key then
465 | 							child = Tree.new(key, value, node.data.depth + 1, @debug)
466 | 							@children << child
467 | 						end
468 | 					else
469 | 						if node.data.value == key && node.data.depth<@max_depth then
470 | 							child = Tree.new(key, value, node.data.depth + 1, @debug)
471 | 							@children << child
472 | 						end
473 | 					end
474 | 				}
475 | 			end
476 | 		end
477 | 	end
478 | end
479 |
480 | opts = GetoptLong.new(
481 | 	['--help', '-h', GetoptLong::NO_ARGUMENT],
482 | 	['--keep', '-k', GetoptLong::NO_ARGUMENT],
483 | 	['--depth', '-d', GetoptLong::REQUIRED_ARGUMENT],
484 | 	['--min_word_length', "-m", GetoptLong::REQUIRED_ARGUMENT],
485 | 	['--max_word_length', "-x", GetoptLong::REQUIRED_ARGUMENT],
486 | 	['--no-words', "-n", GetoptLong::NO_ARGUMENT],
487 | 	['--groups', "-g", GetoptLong::REQUIRED_ARGUMENT],
488 | 	['--offsite', "-o", GetoptLong::NO_ARGUMENT],
489 | 	['--exclude', GetoptLong::REQUIRED_ARGUMENT],
490 | 	['--allowed', GetoptLong::REQUIRED_ARGUMENT],
491 | 	['--write', "-w", GetoptLong::REQUIRED_ARGUMENT],
492 | 	['--ua', "-u", GetoptLong::REQUIRED_ARGUMENT],
493 | 	['--meta-temp-dir', GetoptLong::REQUIRED_ARGUMENT],
494 | 	['--meta_file', GetoptLong::REQUIRED_ARGUMENT],
495 | 	['--email_file', GetoptLong::REQUIRED_ARGUMENT],
496 | 	['--lowercase', GetoptLong::NO_ARGUMENT],
497 | 	['--with-numbers', GetoptLong::NO_ARGUMENT],
498 | 	['--convert-umlauts', GetoptLong::NO_ARGUMENT],
499 | 	['--meta', "-a", GetoptLong::NO_ARGUMENT],
500 | 	['--email', "-e", GetoptLong::NO_ARGUMENT],
501 | 	['--count', '-c', GetoptLong::NO_ARGUMENT],
502 | 	['--auth_user', GetoptLong::REQUIRED_ARGUMENT],
503 | 	['--auth_pass', GetoptLong::REQUIRED_ARGUMENT],
504 | 	['--auth_type', GetoptLong::REQUIRED_ARGUMENT],
505 | 	['--header', "-H", GetoptLong::REQUIRED_ARGUMENT],
506 | 	['--proxy_host', GetoptLong::REQUIRED_ARGUMENT],
507 | 	['--proxy_port', GetoptLong::REQUIRED_ARGUMENT],
508 | 	['--proxy_username', GetoptLong::REQUIRED_ARGUMENT],
509 | 	['--proxy_password', GetoptLong::REQUIRED_ARGUMENT],
510 | 	["--verbose", "-v", GetoptLong::NO_ARGUMENT],
511 | 	["--debug", GetoptLong::NO_ARGUMENT]
512 | )
513 |
514 | # Display the usage
515 | def usage
516 | 	puts "Usage: cewl [OPTIONS] ... <url>
517 |
518 | OPTIONS:
519 | 	-h, --help: Show help.
520 | 	-k, --keep: Keep the downloaded file.
521 | 	-d <x>,--depth <x>: Depth to spider to, default 2.
522 | 	-m, --min_word_length: Minimum word length, default 3.
523 | 	-x, --max_word_length: Maximum word length, default unset.
524 | 	-o, --offsite: Let the spider visit other sites.
525 | 	--exclude: A file containing a list of paths to exclude
526 | 	--allowed: A regex pattern that paths must match to be followed
527 | 	-w, --write: Write the output to the file.
528 | 	-u <agent>, --ua <agent>: User agent to send.
529 | 	-n, --no-words: Don't output the wordlist.
530 | 	-g <x>, --groups <x>: Return groups of words as well
531 | 	--lowercase: Lowercase all parsed words
532 | 	--with-numbers: Accept words with numbers in as well as just letters
533 | 	--convert-umlauts: Convert common ISO-8859-1 (Latin-1) umlauts (ä-ae, ö-oe, ü-ue, ß-ss)
534 | 	-a, --meta: Include meta data.
535 | 	--meta_file file: Output file for meta data.
536 | 	-e, --email: Include email addresses.
537 | 	--email_file <file>: Output file for email addresses.
538 | 	--meta-temp-dir <dir>: The temporary directory used by exiftool when parsing files, default /tmp.
539 | 	-c, --count: Show the count for each word found.
540 | 	-v, --verbose: Verbose.
541 | 	--debug: Extra debug information.
542 |
543 | Authentication
544 | 	--auth_type: Digest or basic.
545 | 	--auth_user: Authentication username.
546 | 	--auth_pass: Authentication password.
547 |
548 | Proxy Support
549 | 	--proxy_host: Proxy host.
550 | 	--proxy_port: Proxy port, default 8080.
551 | 	--proxy_username: Username for proxy, if required.
552 | 	--proxy_password: Password for proxy, if required.
553 |
554 | Headers
555 | 	--header, -H: In format name:value - can pass multiple.
556 |
557 | <url>: The site to spider.
558 |
559 | "
560 | 	exit 0
561 | end
562 |
563 | debug = false
564 | verbose = false
565 | ua = nil
566 | url = nil
567 | outfile = nil
568 | email_outfile = nil
569 | meta_outfile = nil
570 | offsite = false
571 | exclude_array = []
572 | allowed_pattern = nil
573 | depth = 2
574 | min_word_length = 3
575 | max_word_length = -1
576 | email = false
577 | meta = false
578 | wordlist = true
579 | groups = -1
580 | meta_temp_dir = "/tmp/"
581 | keep = false
582 | lowercase = false
583 | words_with_numbers = false
584 | convert_umlauts = false
585 | show_count = false
586 | auth_type = nil
587 | auth_user = nil
588 | auth_pass = nil
589 |
590 | proxy_host = nil
591 | proxy_port = nil
592 | proxy_username = nil
593 | proxy_password = nil
594 |
595 | # headers will be passed in in the format "header: value"
596 | # and there can be multiple
597 | headers = []
598 |
599 | strip_css = true
600 | strip_js = true
601 |
602 | begin
603 | 	opts.each do |opt, arg|
604 | 		case opt
605 | 		when '--help'
606 | 			usage
607 | 		when "--lowercase"
608 | 			lowercase = true
609 | 		when "--with-numbers"
610 | 			words_with_numbers = true
611 | 		when "--convert-umlauts"
612 | 			convert_umlauts = true
613 | 		when "--count"
614 | 			show_count = true
615 | 		when "--meta-temp-dir"
616 | 			if !File.directory?(arg)
617 | 				puts "\nMeta temp directory is not a directory\n\n"
618 | 				exit 1
619 | 			end
620 |
621 | 			if !File.writable?(arg)
622 | 				puts "\nThe meta temp directory is not writable\n\n"
623 | 				exit 1
624 | 			end
625 |
626 | 			meta_temp_dir = arg
627 | 			meta_temp_dir += "/" if meta_temp_dir !~ /.*\/$/
628 | 		when "--keep"
629 | 			keep = true
630 | 		when "--no-words"
631 | 			wordlist = false
632 | 		when "--meta_file"
633 | 			meta_outfile = arg
634 | 		when "--meta"
635 | 			meta = true
636 | 		when "--groups"
637 | 			groups = arg.to_i
638 | 		when "--email_file"
639 | 			email_outfile = arg
640 | 		when "--email"
641 | 			email = true
642 | 		when '--max_word_length'
643 | 			max_word_length = arg.to_i
644 | 			usage if max_word_length < 1
645 | 		when '--min_word_length'
646 | 			min_word_length = arg.to_i
647 | 			usage if min_word_length < 1
648 | 		when '--depth'
649 | 			depth = arg.to_i
650 | 			usage if depth < 0
651 | 		when '--offsite'
652 | 			offsite = true
653 | 		when '--exclude'
654 | 			begin
655 | 				tmp_exclude_array = File.readlines(arg)
656 | 			rescue => e
657 | 				puts "\nUnable to open the exclude file\n\n"
658 | 				exit 1
659 | 			end
660 | 			# Have to do this to
strip the newline characters from the end 661 | # of each element in the array 662 | tmp_exclude_array.each do |line| 663 | exc = line.strip 664 | if exc != "" 665 | exclude_array << line.strip 666 | # puts "Excluding #{ line.strip}" 667 | end 668 | end 669 | when '--allowed' 670 | allowed_pattern = Regexp.new(arg) 671 | when '--ua' 672 | ua = arg 673 | when '--debug' 674 | debug = true 675 | when '--verbose' 676 | verbose = true 677 | when '--write' 678 | outfile = arg 679 | when "--header" 680 | headers << arg 681 | when "--proxy_password" 682 | proxy_password = arg 683 | when "--proxy_username" 684 | proxy_username = arg 685 | when "--proxy_host" 686 | proxy_host = arg 687 | when "--proxy_port" 688 | proxy_port = arg.to_i 689 | when "--auth_pass" 690 | auth_pass = arg 691 | when "--auth_user" 692 | auth_user = arg 693 | when "--auth_type" 694 | if arg =~ /(digest|basic)/i 695 | auth_type = $1.downcase 696 | if auth_type == "digest" 697 | begin 698 | require "net/http/digest_auth" 699 | rescue LoadError => e 700 | # Catch error and provide feedback on installing gem 701 | puts "\nError: To use digest auth you require the net-http-digest_auth gem\n" 702 | puts "\t Use: 'gem install net-http-digest_auth'\n\n" 703 | exit 2 704 | end 705 | end 706 | else 707 | puts "\nInvalid authentication type, please specify either basic or digest\n\n" 708 | exit 1 709 | end 710 | end 711 | end 712 | rescue => e 713 | # puts e 714 | usage 715 | end 716 | 717 | if auth_type && (auth_user.nil? || auth_pass.nil?) 718 | puts "\nIf using basic or digest auth you must provide a username and password\n\n" 719 | exit 1 720 | end 721 | 722 | if auth_type.nil? && (!auth_user.nil? || !auth_pass.nil?) 723 | puts "\nAuthentication details provided but no mention of basic or digest\n\n" 724 | exit 1 725 | end 726 | 727 | if ARGV.length != 1 728 | puts "\nMissing URL argument (try --help)\n\n" 729 | exit 1 730 | end 731 | 732 | url = ARGV.shift 733 | 734 | # Must have protocol 735 | url = "http://#{url}" if url !~ /^http(s)?:\/\// 736 | 737 | # Taking this back out again. Can't remember why it was put in but have found problems 738 | # with it in and none with it out so getting rid of it. 
739 | #
740 | # The spider doesn't work properly if there isn't a / on the end
741 | #if url !~ /\/$/
742 | #	url = "#{url}/"
743 | #end
744 |
745 | group_word_hash = {}
746 | word_hash = {}
747 | email_arr = []
748 | url_stack = Tree.new
749 | url_stack.debug = debug
750 | url_stack.max_depth = depth
751 | usernames = Array.new()
752 |
753 | # Do the checks here so we don't do all the processing then find we can't open the file
754 | if outfile
755 | 	begin
756 | 		outfile_file = File.new(outfile, "w")
757 | 	rescue
758 | 		puts "\nCouldn't open the output file for writing\n\n"
759 | 		exit 2
760 | 	end
761 | else
762 | 	outfile_file = $stdout
763 | end
764 |
765 | if email_outfile && email
766 | 	begin
767 | 		email_outfile_file = File.new(email_outfile, "w")
768 | 	rescue
769 | 		puts "\nCouldn't open the email output file for writing\n\n"
770 | 		exit 2
771 | 	end
772 | else
773 | 	email_outfile_file = outfile_file
774 | end
775 |
776 | if meta_outfile && meta
777 | 	begin
778 | 		meta_outfile_file = File.new(meta_outfile, "w")
779 | 	rescue
780 | 		puts "\nCouldn't open the metadata output file for writing\n\n"
781 | 		exit 2
782 | 	end
783 | else
784 | 	meta_outfile_file = outfile_file
785 | end
786 |
787 | catch :ctrl_c do
788 | 	begin
789 | 		puts "Starting at #{url}" if verbose
790 |
791 | 		MySpider.proxy(proxy_host, proxy_port, proxy_username, proxy_password) if proxy_host
792 | 		MySpider.auth_creds(auth_type, auth_user, auth_pass) if auth_type
793 | 		MySpider.headers(headers)
794 | 		MySpider.verbose(verbose)
795 | 		MySpider.debug(debug)
796 |
797 | 		MySpider.start_at(url) do |s|
798 | 			s.headers['User-Agent'] = ua if ua
799 |
800 | 			s.add_url_check do |a_url|
801 | 				puts "Checking page #{a_url}" if debug
802 | 				allow = true
803 |
804 | 				# Extensions to ignore
805 | 				if a_url =~ /(\.zip$|\.gz$|\.bz2$|\.png$|\.gif$|\.jpg$|^#)/
806 | 					puts "Ignoring internal link or graphic: #{a_url}" if verbose
807 | 					allow = false
808 | 				else
809 | 					if /^mailto:(.*)/i.match(a_url)
810 | 						if email
811 | 							email_arr << $1
812 | 							puts "Found #{$1} on page #{a_url}" if verbose
813 | 						end
814 | 						allow = false
815 | 					else
816 | 						a_url_parsed = URI.parse(a_url)
817 | 						if !offsite
818 | 							url_parsed = URI.parse(url)
819 | 							puts "Comparing #{a_url} with #{url}" if debug
820 |
821 | 							# Make sure the host, port and scheme matches (else it's offsite)
822 | 							allow = (a_url_parsed.host == url_parsed.host) && (a_url_parsed.port == url_parsed.port) && (a_url_parsed.scheme == url_parsed.scheme) ? true : false
823 |
824 | 							puts "Offsite link, not following: #{a_url}" if !allow && verbose
825 | 						else
826 | 							puts "Allowing offsite links" if debug
827 | 						end
828 |
829 | 						puts "Found: #{a_url_parsed.path}" if debug
830 |
831 | 						if exclude_array.include?(a_url_parsed.request_uri)
832 | 							puts "Excluding page: #{a_url_parsed.request_uri}" if verbose
833 | 							allow = false
834 | 						end
835 |
836 | 						if allowed_pattern && !a_url_parsed.path.match(allowed_pattern)
837 | 							puts "Excluding path: #{a_url_parsed.path} based on allowed pattern" if verbose
838 | 							allow = false
839 | 						end
840 | 					end
841 | 				end
842 | 				allow
843 | 			end
844 |
845 | 			# This was :success so only the content from a 200 was processed.
846 | 			# Updating it to :every so that the content of all pages gets processed
847 | 			# so you can grab things off 404s or text leaked on redirect and error pages.
848 |
849 | 			s.on :every do |a_url, resp, prior_url|
850 | 				if verbose
851 | 					if prior_url.nil?
852 | 						puts "Visiting: #{a_url}, got response code #{resp.code}"
853 | 					else
854 | 						puts "Visiting: #{a_url} referred from #{prior_url}, got response code #{resp.code}"
855 | 					end
856 | 				end
857 |
858 | 				# May want 0-9 in here as well in the future but for now limit it to a-z so
859 | 				# you can't sneak any nasty characters in
860 | 				if /.*\.([a-z]+)(\?.*$|$)/i.match(a_url)
861 | 					file_extension = $1
862 | 				else
863 | 					file_extension = ''
864 | 				end
865 |
866 | 				# Don't get words from these file types. Most will have been blocked by the url_check function but
867 | 				# some are let through, such as .css, so that they can be checked for email addresses
868 |
869 | 				# This is a bad way to do this but it is either white or black list extensions and
870 | 				# the list of either is quite long, may as well black list and let extra through
871 | 				# that can then be weeded out later rather than stop things that could be useful
872 |
873 | 				#if file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2|css|png|gif|jpg|#)$/
874 | 				if file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2|png|gif|jpg|#)$/
875 | 					if meta
876 | 						begin
877 | 							if keep && file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2)$/
878 | 								if /.*\/(.*)$/.match(a_url)
879 | 									output_filename = meta_temp_dir + $1
880 | 									puts "Keeping #{output_filename}" if verbose
881 | 								else
882 | 									# Shouldn't ever get here as the regex above should always be able to pull the filename out of the URL,
883 | 									# ...but just in case
884 |
885 | 									# Maybe look at doing this to make the temp name
886 | 									# require "tempfile"
887 | 									# Dir::Tmpname.make_tmpname "a", "b"
888 | 									# => "a20150707-8694-hrrxr4-b"
889 |
890 | 									output_filename = "#{meta_temp_dir}cewl_tmp"
891 | 									output_filename += ".#{file_extension}" unless file_extension.empty?
892 | 								end
893 | 							else
894 | 								output_filename = "#{meta_temp_dir}cewl_tmp"
895 | 								output_filename += ".#{file_extension}" unless file_extension.empty?
896 | 							end
897 |
898 | 							out = File.new(output_filename, "wb")
899 | 							out.print(resp.body)
900 | 							out.close
901 |
902 | 							meta_data = process_file(output_filename, verbose)
903 | 							usernames += meta_data if (meta_data != nil)
904 | 						rescue => e
905 | 							puts "\nCouldn't open the meta temp file for writing - #{e.inspect}\n\n"
906 | 							exit 2
907 | 						end
908 | 					end
909 | 				else
910 | 					html = resp.body.to_s.force_encoding("UTF-8")
911 | 					# This breaks on this site http://www.spisa.nu/recept/ as the
912 | 					# replace replaces some of the important characters. Needs a fix
913 | 					html.encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
914 | 					html.encode!('UTF-8', 'UTF-16')
915 |
916 | 					dom = Nokogiri.HTML(html)
917 | 					dom.css('script').remove if strip_js
918 | 					dom.css('style').remove if strip_css
919 | 					body = dom.to_s
920 |
921 | 					# Get meta data
922 | 					if /.*<meta.*description.*content\s*=[\s"']*(.*)/i.match(body)
923 | 						description = $1
924 | 						body += description.gsub(/[>"\/']*/, "")
925 | 					end
926 |
927 | 					if /.*<meta.*keywords.*content\s*=[\s"']*(.*)/i.match(body)
928 | 						keywords = $1
929 | 						body += keywords.gsub(/[>"\/']*/, "")
930 | 					end
931 |
932 | 					puts body if debug
933 |
934 | 					# This bit will not normally fire as all JavaScript is stripped out
935 | 					# by the Nokogiri remove a few lines before this.
936 | 					#
937 | 					# The code isn't perfect but will do a rough job of working out
938 | 					# pages from relative location links
939 | 					while /(location.href\s*=\s*["']([^"']*)['"];)/i.match(body)
940 | 						full_match = $1
941 | 						j_url = $2
942 |
943 | 						puts "Javascript redirect found #{j_url}" if verbose
944 |
945 | 						re = Regexp.escape(full_match)
946 | 						body.gsub!(/#{re}/, "")
947 |
948 | 						if j_url !~ /https?:\/\//i
949 | 							parsed = URI.parse(a_url)
950 | 							protocol = parsed.scheme
951 | 							host = parsed.host
952 |
953 | 							domain = "#{protocol}://#{host}"
954 |
955 | 							j_url = domain + j_url
956 | 							j_url += $1 if j_url[0] == "/" && parsed.path =~ /(.*)\/.*/
957 |
958 | 							puts "Relative URL found, adding domain to make #{j_url}" if verbose
959 | 						end
960 |
961 | 						x = {a_url => j_url}
962 | 						url_stack.push x
963 | 					end
964 |
965 | 					# Strip comment tags
966 | 					body.gsub!(/<!--/, "")
967 | 					body.gsub!(/-->/, "")
968 |
969 | 					# If you want to add more attribute names to include, just add them to this array
970 | 					attribute_names = [
971 | 						"alt",
972 | 						"title",
973 | 					]
974 |
975 | 					attribute_text = ''
976 |
977 | 					attribute_names.each { |attribute_name|
978 | 						body.gsub!(/#{attribute_name}="([^"]*)"/) { |attr| attribute_text += "#{$1} " }
979 | 					}
980 |
981 | 					if verbose and attribute_text
982 | 						puts "Attribute text found:"
983 | 						puts attribute_text
984 | 						puts
985 | 					end
986 |
987 | 					body += " #{attribute_text}"
988 |
989 | 					# Strip html tags
990 | 					words = body.gsub(/<\/?[^>]*>/, "")
991 |
992 | 					# Check if this is needed
993 | 					words.gsub!(/&[a-z]*;/, "")
994 |
995 | 					begin
996 | 						#if file_extension !~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2|css|png|gif|jpg|#)$/
997 | 						begin
998 | 							if email
999 | 								# Split the file down based on the email address regexp
1000 | 								#words.gsub!(/\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/i)
1001 | 								#p words
1002 |
1003 | 								# If you want to pull email addresses from the contents of files found, such as Word docs, then move
1004 | 								# this block outside the if statement
1005 | 								# I've put it in here as some docs contain email addresses that have nothing to do with the target
1006 | 								# so give false positive type results
1007 | 								words.each_line do |word|
1008 | 									while /\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/i.match(word)
1009 | 										puts "Found #{$1} on page #{a_url}" if verbose
1010 | 										email_arr << $1
1011 | 										word = word.gsub(/#{$1}/, "")
1012 | 									end
1013 | 								end
1014 | 							end
1015 | 						rescue => e
1016 | 							puts "\nThere was a problem generating the email list"
1017 | 							puts "Error: #{e.inspect}"
1018 | 							puts e.backtrace
1019 | 						end
1020 |
1021 | 						if wordlist
1022 | 							# Lowercase all parsed words
1023 | 							if lowercase then
1024 | 								words.downcase!
1025 | end 1026 | # Remove any symbols 1027 | if words_with_numbers then 1028 | words.gsub!(/[^[[:alnum:]]]/i, " ") 1029 | else 1030 | words.gsub!(/[^[[:alpha:]]]/i, " ") 1031 | end 1032 | 1033 | if convert_umlauts then 1034 | words.gsub!(/[äöüßÄÖÜ]/, "ä" => "ae", "ö" => "oe", "ü" => "ue", "ß" => "ss", "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue") 1035 | end 1036 | 1037 | # Add to the array 1038 | group_words = [] 1039 | words.split(" ").each do |word| 1040 | if word.length >= min_word_length and (max_word_length == -1 or word.length <= max_word_length) 1041 | word_hash[word] = 0 if !word_hash.has_key?(word) 1042 | word_hash[word] += 1 1043 | end 1044 | if (groups > 0) 1045 | group_words.push (word) 1046 | if (group_words.length() > groups) 1047 | group_words.shift() 1048 | end 1049 | if (group_words.length() == groups) 1050 | joined = group_words.join(" ") 1051 | group_word_hash[joined] = 0 if !group_word_hash.has_key?(joined) 1052 | group_word_hash[joined] += 1 1053 | end 1054 | end 1055 | end 1056 | end 1057 | #end 1058 | rescue => e 1059 | puts "\nThere was a problem handling word generation" 1060 | puts "Error: #{e.inspect}" 1061 | puts e.backtrace 1062 | end 1063 | end 1064 | end 1065 | s.store_next_urls_with url_stack 1066 | end 1067 | rescue Errno::ENOENT 1068 | puts "\nInvalid URL specified (#{url})\n\n" 1069 | exit 2 1070 | rescue => e 1071 | puts "\nCouldn't access the site (#{url})\n" 1072 | puts "Error: #{e.inspect}" 1073 | puts "Error: #{e.backtrace}" 1074 | exit 2 1075 | end 1076 | end 1077 | 1078 | puts "End of main loop" if debug 1079 | 1080 | if wordlist 1081 | if verbose 1082 | if outfile.nil? 1083 | puts "Words found\n" 1084 | else 1085 | puts "Writing words to file\n" 1086 | end 1087 | end 1088 | 1089 | sorted_wordlist = word_hash.sort_by do |word, count| 1090 | -count 1091 | end 1092 | 1093 | sorted_wordlist.each do |word, count| 1094 | if show_count 1095 | outfile_file.puts "#{word}, #{count.to_s}" 1096 | else 1097 | outfile_file.puts word 1098 | end 1099 | end 1100 | end 1101 | 1102 | if groups > 0 1103 | if verbose 1104 | if outfile.nil? 1105 | puts "Groups of words found\n" 1106 | else 1107 | puts "Writing groups of words to file\n" 1108 | end 1109 | end 1110 | 1111 | sorted_wordlist = group_word_hash.sort_by do |word, count| 1112 | -count 1113 | end 1114 | 1115 | sorted_wordlist.each do |word, count| 1116 | if show_count 1117 | outfile_file.puts "#{word}, #{count.to_s}" 1118 | else 1119 | outfile_file.puts word 1120 | end 1121 | end 1122 | end 1123 | 1124 | puts "End of wordlist loop" if debug 1125 | 1126 | if email 1127 | if email_arr.length == 0 1128 | puts "No email addresses found" if verbose 1129 | else 1130 | puts "Dumping email addresses to file" if verbose 1131 | 1132 | email_arr.delete_if { |x| x.chomp.empty? } 1133 | email_arr.uniq! 1134 | email_arr.sort! 1135 | 1136 | outfile_file.puts if (wordlist || verbose) && email_outfile.nil? 1137 | 1138 | if email_outfile.nil? 1139 | outfile_file.puts "Email addresses found" 1140 | outfile_file.puts "---------------------" 1141 | outfile_file.puts email_arr.join("\n") 1142 | else 1143 | email_outfile_file.puts email_arr.join("\n") 1144 | end 1145 | end 1146 | end 1147 | 1148 | puts "End of email loop" if debug 1149 | 1150 | if meta 1151 | if usernames.length == 0 1152 | puts "No meta data found" if verbose 1153 | else 1154 | puts "Dumping meta data to file" if verbose 1155 | usernames.delete_if { |x| x.chomp.empty? } 1156 | usernames.uniq! 1157 | usernames.sort! 
1158 |
1159 | 		outfile_file.puts if (email||wordlist) && meta_outfile.nil?
1160 | 		if meta_outfile.nil?
1161 | 			outfile_file.puts "Meta data found"
1162 | 			outfile_file.puts "---------------"
1163 | 			outfile_file.puts usernames.join("\n")
1164 | 		else
1165 | 			meta_outfile_file.puts usernames.join("\n")
1166 | 		end
1167 | 	end
1168 | end
1169 |
1170 | puts "End of meta loop" if debug
1171 |
1172 | meta_outfile_file.close if meta_outfile
1173 | email_outfile_file.close if email_outfile
1174 | outfile_file.close if outfile
1175 |
--------------------------------------------------------------------------------
/cewl_lib.rb:
--------------------------------------------------------------------------------
1 | # == CeWL Library: Library to outsource reusable features
2 | #
3 | # Author:: Robin Wood (robin@digi.ninja)
4 | # Copyright:: Copyright (c) Robin Wood 2016
5 | # Licence:: GPL
6 | #
7 |
8 | begin
9 | 	require 'mini_exiftool'
10 | 	require "zip"
11 | 	require "rexml/document"
12 | 	require 'mime'
13 | 	require 'mime-types'
14 | 	include REXML
15 | rescue LoadError => e
16 | 	# catch error and provide feedback on installing gem
17 | 	if e.to_s =~ /cannot load such file -- (.*)/
18 | 		missing_gem = $1
19 | 		puts "\nError: #{missing_gem} gem not installed\n"
20 | 		puts "\t use: \"gem install #{missing_gem}\" to install the required gem\n\n"
21 | 		exit
22 | 	else
23 | 		puts "There was an error loading the gems:"
24 | 		puts
25 | 		puts e.to_s
26 | 		exit
27 | 	end
28 | end
29 |
30 | # Override the MiniExiftool class so that I can modify the parse_line
31 | # method and force all encoding to ISO-8859-1. Without this the app bombs
32 | # on some machines as it is unable to parse UTF-8
33 | class MyMiniExiftool<MiniExiftool
34 | 	def parse_line(line)
35 | 		line.force_encoding('ISO-8859-1')
36 | 		super
37 | 	end
38 | end
39 |
40 | # Get the author/creator data out of PDFs by scanning the raw file
41 | # for the XMP tags which can hold it
42 | def get_pdf_data(pdf_file, verbose)
43 | 	meta_data=[]
44 | 	begin
45 | 		File.open(pdf_file, "rb").each_line { |line|
82 | 			if /<xap:creator>(.*)<\/xap:creator>/i.match(line)
83 | 				if verbose
84 | 					puts "Found pdf:creator: "+$1
85 | 				end
86 | 				meta_data<<$1.to_s.chomp unless $1.to_s==""
87 | 			end
88 | 			if /<xap:Author>(.*)<\/xap:Author>/i.match(line)
89 | 				if verbose
90 | 					puts "Found xap:Author: "+$1
91 | 				end
92 | 				meta_data<<$1.to_s.chomp unless $1.to_s==""
93 | 			end
94 | 			if /<pdf:Author>(.*)<\/pdf:Author>/i.match(line)
95 | 				if verbose
96 | 					puts "Found pdf:Author: "+$1
97 | 				end
98 | 				meta_data<<$1.to_s.chomp unless $1.to_s==""
99 | 			end
100 | 			if /<dc:creator>(.*)<\/dc:creator>/i.match(line)
101 | 				if verbose
102 | 					puts "Found dc:creator: "+$1
103 | 				end
104 | 				meta_data<<$1.to_s.chomp unless $1.to_s==""
105 | 			end
106 |
107 | 		}
108 | 		return meta_data
109 | 	rescue => e
110 | 		if verbose
111 | 			puts "There was an error processing the document - " + e.message
112 | 		end
113 | 	end
114 | 	return meta_data
115 | end
116 |
117 | # Get data from files using exiftool
118 | def get_doc_data(doc_file, verbose)
119 | 	data=[]
120 | 	begin
121 | 		interesting_fields=Array.[]("Author","LastSavedBy","Creator")
122 | 		file = MyMiniExiftool.new(doc_file)
123 |
124 | 		interesting_fields.each{ |field_name|
125 | 			if file.tags.include?(field_name)
126 | 				data<<file[field_name].to_s
127 | 			end
128 | 		}
129 | 	rescue => e
130 | 		if verbose
131 | 			puts "There was an error processing the document - " + e.message
132 | 		end
133 | 	end
134 | 	return data
135 | end
136 |
137 | # Get data from Office 2007 documents by unzipping relevant XML files then
138 | # checking for known fields
139 | def get_docx_data(docx_file, verbose)
140 | 	meta_data=[]
141 |
142 | 	interesting_fields=Array.[]("cp:coreProperties/dc:creator","cp:coreProperties/cp:lastModifiedBy")
143 | 	interesting_files=Array.[]("docProps/core.xml")
144 |
145 | 	begin
146 | 		Zip::File.open(docx_file) { |zipfile|
147 | 			interesting_files.each { |file|
148 | 				if zipfile.find_entry(file)
149 | 					xml=zipfile.read(file)
150 |
151 | 					doc=Document.new(xml)
152 | 					interesting_fields.each { |field|
153 | 						element=doc.elements[field]
154 | 						#puts element.get_text unless element==nil||element.get_text==nil
155 | 						meta_data<<element.get_text.to_s unless element==nil||element.get_text==nil
156 | 					}
157 | 				end
158 | 			}
159 | 		}
160 | 	rescue => e
161 | 		if verbose
162 | 			# not a zip file
163 | 			puts "File probably not a zip file - " + e.message
164 | 		end
165 | 	end
166 | 	return meta_data
167 | end
168 |
169 | # Take the file given, try to work out what type of file it is then pass it
170 | # to the relevant function to try to grab meta data
171 | def process_file(filename, verbose=false)
172 | 	meta_data=nil
173 |
174 | 	begin
175 | 		puts "processing file: " + filename
176 |
177 | 		if File.file?(filename) && File.exist?(filename)
178 | 			mime_types=MIME::Types.type_for(filename)
179 | 			if(mime_types.size==0)
180 | 				if(verbose)
181 | 					puts "Empty mime type"
182 | 				end
183 | 				return meta_data
184 | 			end
185 | 			if verbose
186 | 				puts "Checking "+filename
187 | 				puts " Mime type="+mime_types.join(", ")
188 | 				puts
189 | 			end
190 | 			if mime_types.include?("application/word") || mime_types.include?("application/excel") || mime_types.include?("application/powerpoint")
191 | 				if verbose
192 | 					puts " Mime type says original office document"
193 | 				end
194 | 				meta_data=get_doc_data(filename, verbose)
195 | 			else
196 | 				if mime_types.include?("application/pdf")
197 | 					if verbose
198 | 						puts " Mime type says PDF"
199 | 					end
200 | 					# Running both my own regexp and exiftool on pdfs as I've found exif misses some data
201 | 					meta_data=get_doc_data(filename, verbose)
202 | 					meta_data+=get_pdf_data(filename, verbose)
203 | 				else
204 | 					# list taken from http://en.wikipedia.org/wiki/Microsoft_Office_2007_file_extensions
205 | 					if filename =~ /(.(doc|dot|ppt|pot|xls|xlt|pps)[xm]$)|(.ppam$)|(.xlsb$)|(.xlam$)/
206 | 						if verbose
207 | 							puts " File extension says 2007 style office document"
208 | 						end
209 | 						meta_data=get_docx_data(filename, verbose)
210 | 					elsif filename =~ /.php$|.aspx$|.cfm$|.asp$|.html$|.htm$/
211 | 						if verbose
212 | 							puts " Language file, can ignore"
213 | 						end
214 | 					else
215 | 						if verbose
216 | 							puts " Unknown file type"
217 | 						end
218 | 					end
219 | 				end
220 | 			end
221 | 			if meta_data!=nil
222 | 				if verbose
223 | 					if meta_data.length > 0
224 | 						puts " Found "+meta_data.join(", ")+"\n"
225 | 					end
226 | 				end
227 | 			end
228 | 		end
229 | 	rescue => e
230 | 		puts "Problem in process_file function"
231 | 		puts "Error: " + e.message
232 | 		puts e.backtrace
233 | 	end
234 |
235 | 	return meta_data
236 | end
237 |
--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 |
3 | ## Version 5.5.2
4 |
5 | * Can now specify a range for the number of words to group together when grouping.
6 | * Specify multiple grouping characters.
7 |
8 | ## Version 5.5.1
9 | * Fixed accidental concatenation of words when stripping HTML tags.
10 |
11 | ## Version 5.5.0
12 |
13 | * Grouping words together.
14 |
15 | ## Version 5.4.9
16 |
17 | * Added Docker support.
18 |
19 | ## Version 5.4.8
20 |
21 | * Updated the parser so that it looks at the content on all pages which are returned, not just those with a 200 return code.
22 |
23 | ## Version 5.4.7
24 |
25 | * Added the `--allowed` parameter to limit crawling to URLs matching the passed RegEx. Work done by [5p1n](https://github.com/5p1n/).
26 |
27 | ## Version 5.4.6
28 |
29 | * Added the `--lowercase` parameter to convert all letters to lower case.
30 | * Added the `--convert-umlauts` parameter to convert Latin-1 umlauts (e.g. "ä" to "ae", "ö" to "oe", etc.).
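To illustrate what the umlaut conversion does, here is a rough sketch of the idea (not the exact code in cewl.rb, though the script uses the same kind of `gsub` mapping):

```
# Map common Latin-1 umlauts to their ASCII expansions
umlauts = { "ä" => "ae", "ö" => "oe", "ü" => "ue", "ß" => "ss",
            "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue" }

word = "Straße"
puts word.gsub(/[äöüßÄÖÜ]/, umlauts) # => "Strasse"
```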
31 |
32 | ## Version 5.4.3
33 |
34 | * Added the `--with-numbers` parameter to make words include letters and numbers.
35 |
36 | ## Version 5.4.2
37 |
38 | * Merged an update to change the way usage instructions are shown.
39 | * Updated instructions on installing gems.
40 | * Updated README.
41 |
42 | ## Version 5.4.1
43 |
44 | * A line to add a / to the end of the URL had been commented out. I don't remember why it was done but I'm putting it back in. See [issue 26](https://github.com/digininja/CeWL/issues/26).
45 |
46 | ## Version 5.4
47 |
48 | * Steven van der Baan added the ability to hit ctrl-c and keep the results so far.
49 |
50 | ## Version 5.3.1
51 |
52 | * Added the ability to handle non-standard port numbers.
53 | * Added lots more debugging and a new --debug parameter.
54 |
55 | ## Version 5.3
56 |
57 | * Added the command line argument --header (-H) to allow headers to be passed in.
58 | * Parameters are specified in name:value pairs and you can pass multiple.
59 |
60 | ## Version 5.2
61 |
62 | Loads of changes including:
63 |
64 | * Code refactoring by [@g0tmi1k](https://github.com/g0tmi1k)
65 | * Internationalisation - should now handle non-ASCII sites much better
66 | * Found more ways to pull words out of JavaScript content and other areas that aren't normal HTML
67 | * Lots of little bug fixes
68 |
69 | ## Version 5.1
70 |
71 | * Added the GPL-3+ licence to allow inclusion in Debian.
72 | * Added a Gemfile to make installing gems easier.
73 |
74 | ## Version 5.0
75 |
76 | * Adds proxy support from the command line and the ability to pass in credentials for both basic and digest authentication.
77 | * A few other smaller bug fixes as well.
78 |
79 | ## Version 4.3
80 |
81 | CeWL now sorts the words found by count and optionally (new --count argument) includes the word count in the output. I've left the words in the case they appear in on the pages, so "Product" is different to "product". I figure that if the list is being used for password generation then the case may be significant, so let the user strip it if they want to. There are also more improvements to the stability of the spider in this release.
82 |
83 | By default, CeWL sticks to just the site you have specified and will go to a depth of 2 links; this behaviour can be changed by passing arguments. Be careful if setting a large depth and allowing it to go offsite, you could end up drifting onto a lot of other domains. All words of three characters and over are output to stdout. This length can be increased and the words can be written to a file rather than the screen so the app can be automated.
84 |
85 | ## Version 4.2
86 |
87 | Fixes a pretty major bug that I found while fixing a smaller bug for @yorikv. The bug was related to a hack I had to put in place because of a problem I was having with the spider. While I was looking into it, I spotted this line, which is the one the spider uses to find new links in downloaded pages:
88 |
89 | ```
90 | web_page.scan(/href="(.*?)"/i).flatten.map do |link|
91 | ```
92 |
93 | This is fine if all the links look like this:
94 |
95 | ```
96 | <a href="link">link</a>
97 | ```
98 |
99 | But if the link looks like either of these:
100 |
101 | ```
102 | <a href='link'>link</a>
103 | <a href=link>link</a>
104 | ```
105 |
106 | The regex will fail so the links will be ignored.
107 |
108 | To fix this up I've had to override the function that parses the page to find all the links. Rather than use a regex, I've changed it to use Nokogiri, which is designed to parse a page looking for links rather than just running through it with a custom regex.
This brings in a new dependency but I think it is worth it for the fix to the functionality. I also found another bug where a link like this:
109 |
110 | ```
111 | <a href="#name">local</a>
112 | ```
113 |
114 | Which should be ignored, as it just links to an internal name, was actually being translated to '/#name', which may unintentionally mean referencing the index page. I've fixed this one as well after a lot of debugging to find how best to do it.
115 |
116 | A final addition is to allow a user to specify a depth of 0 which allows CeWL to spider a single page.
117 |
118 | I'm only putting this out as a point release as I'd like to rewrite the spidering to use a better spider; that will come out as the next major release.
119 |
120 | ## Version 4.0/4.1
121 |
122 | The main change in version 4.0/4.1 is the upgrade to run with Ruby 1.9.x. This has been tested on various machines and on BT5, as that is a popular platform for running it, and it appears to run fine. Another minor change is that up to version 4 all HTML tags were stripped out before the page was parsed for words, which meant that text in alt and title tags was missed. I now grab the text from those tags before stripping the HTML to give those extra few words.
123 |
124 | ## Version 3
125 |
126 | Addresses a problem spotted by Josh Wright. The Spider gem doesn't handle JavaScript redirection URLs, for example an index page containing just the following:
127 |
128 | ```
129 | <script language="JavaScript">
130 | self.location.href = 'index2.html';
131 | </script>
132 |
133 | ```
134 |
135 | Wasn't spidered because the redirect wasn't picked up. I now scan through a page looking for any lines containing location.href= and then add the given URL to the list of pages to spider.
136 |
137 | ## Version 2
138 |
139 | Version 2 of CeWL can also create two new lists, a list of email addresses
140 | found in mailto links and a list of author/creator names collected from meta
141 | data found in documents on the site. It can currently process documents in
142 | Office pre 2007, Office 2007 and PDF formats. This user data can then be used
143 | to create the list of usernames to be used in association with the password
144 | list.
145 |
146 |
--------------------------------------------------------------------------------
/compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   cewl:
3 |     build: .
4 |     image: ghcr.io/digininja/cewl:latest
5 |
--------------------------------------------------------------------------------
/fab.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | # == FAB: Files Already Bagged
4 | #
5 | # This script can be run against files already
6 | # downloaded from a target site to generate a list
7 | # of usernames and email addresses based on meta
8 | # data contained within them.
9 | #
10 | # To see a list of file types which can be processed
11 | # see cewl_lib.rb
12 | #
13 | # == Usage
14 | #
15 | # fab [OPTION] ... filename/list
16 | #
17 | # -h, --help:
18 | #	show help
19 | #
20 | # -v
21 | #	verbose
22 | #
23 | # filename/list: the file or list of files to check
24 | #
25 | # Author:: Robin Wood (robin@digi.ninja)
26 | # Copyright:: Copyright (c) Robin Wood 2021
27 | # Licence:: GPL
28 | #
29 |
30 | require 'getoptlong'
31 | require_relative "./cewl_lib.rb"
32 |
33 | opts = GetoptLong.new(
34 | 	[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
35 | 	[ "-v" , GetoptLong::NO_ARGUMENT ]
36 | )
37 |
38 | def usage
39 | 	puts"xx
40 |
41 | Usage: xx [OPTION] ...
filename/list 42 | -h, --help: show help 43 | -v: verbose 44 | 45 | filename/list: the file or list of files to check 46 | 47 | " 48 | exit 49 | end 50 | 51 | verbose=false 52 | 53 | begin 54 | opts.each do |opt, arg| 55 | case opt 56 | when '--help' 57 | usage 58 | when '-v' 59 | verbose=true 60 | end 61 | end 62 | rescue 63 | usage 64 | end 65 | 66 | if ARGV.length < 1 67 | puts "Missing filename/list (try --help)" 68 | exit 0 69 | end 70 | 71 | meta_data=[] 72 | 73 | ARGV.each { |param| 74 | data=process_file(param, verbose) 75 | if(data!=nil) 76 | meta_data+=data 77 | end 78 | } 79 | 80 | meta_data.delete_if { |x| x.chomp==""} 81 | meta_data.uniq! 82 | meta_data.sort! 83 | if meta_data.length==0 84 | puts "No data found\n" 85 | else 86 | puts meta_data.join("\n") 87 | end 88 | --------------------------------------------------------------------------------
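As a quick illustration of the FAB workflow described in the script's header comment (the directory and file names here are made-up examples, not from the repo):

```
ruby ./fab.rb -v downloads/report.pdf downloads/minutes.docx
```

Any author/creator names found in the files' meta data are printed one per line, or "No data found" if nothing turns up.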