├── .dockerignore
├── .github
│   ├── FUNDING.yml
│   └── workflows
│       └── docker-image.yml
├── .gitignore
├── Dockerfile
├── Gemfile
├── Gemfile.lock
├── README.md
├── cewl.rb
├── cewl_lib.rb
├── changelog.md
├── compose.yml
└── fab.rb
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .gitignore
3 | README.md
4 | fab.rb
5 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: digininja
2 | custom: https://digi.ninja
3 |
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 |
3 | on:
4 |   push:
5 |     branches: [ "master" ]
6 |   pull_request:
7 |     branches: [ "master" ]
8 |
9 | jobs:
10 |
11 |   build:
12 |
13 |     runs-on: ubuntu-latest
14 |
15 |     steps:
16 |     - uses: actions/checkout@v4
17 |
18 |     - name: Login to GitHub Container Registry
19 |       uses: docker/login-action@v1
20 |       with:
21 |         registry: ghcr.io
22 |         username: ${{ github.actor }}
23 |         password: ${{ secrets.GITHUB_TOKEN }}
24 |
25 |     - name: Build the Docker image
26 |       run: |
27 |         IMAGE_ID=ghcr.io/${{ github.repository_owner }}/cewl
28 |         IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]')
29 |         VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,')
30 |         [[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//')
31 |         [ "$VERSION" == "master" ] && VERSION=latest
32 |         COMMIT=$(echo "${{ github.sha }}" | cut -c 1-7)
33 |         echo IMAGE_ID=$IMAGE_ID
34 |         echo VERSION=$VERSION
35 |         echo COMMIT=$COMMIT
36 |         docker image build --tag cewl .
37 |         docker image tag cewl $IMAGE_ID:$VERSION
38 |         docker image tag cewl $IMAGE_ID:$COMMIT
39 |         docker image push $IMAGE_ID:$VERSION
40 |         docker image push $IMAGE_ID:$COMMIT
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Vim swap files
2 | .*.swp
3 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ruby:3-alpine
2 |
3 | ENV RUBYOPT "rrubygems"
4 |
5 | COPY Gemfile /usr/src/CeWL/
6 | WORKDIR /usr/src/CeWL
7 |
8 | RUN apk add gcompat
9 | RUN set -ex \
10 |     && apk add --no-cache --virtual .build-deps build-base \
11 |     && gem install bundler \
12 |     && bundle install \
13 |     && apk del .build-deps
14 |
15 | COPY . /usr/src/CeWL
16 |
17 | WORKDIR /host
18 | ENTRYPOINT ["/usr/src/CeWL/cewl.rb"]
19 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | gem 'mime'
3 | gem 'mime-types', ">=3.3.1"
4 | gem 'mini_exiftool'
5 | gem 'nokogiri'
6 | gem 'rexml'
7 | gem 'rubyzip'
8 | gem 'spider'
9 |
--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 |   remote: https://rubygems.org/
3 |   specs:
4 |     mime (0.4.4)
5 |     mime-types (3.5.2)
6 |       mime-types-data (~> 3.2015)
7 |     mime-types-data (3.2024.0604)
8 |     mini_exiftool (2.11.0)
9 |     nokogiri (1.16.5-aarch64-linux)
10 |       racc (~> 1.4)
11 |     nokogiri (1.16.5-arm-linux)
12 |       racc (~> 1.4)
13 |     nokogiri (1.16.5-arm64-darwin)
14 |       racc (~> 1.4)
15 |     nokogiri (1.16.5-x86-linux)
16 |       racc (~> 1.4)
17 |     nokogiri (1.16.5-x86_64-darwin)
18 |       racc (~> 1.4)
19 |     nokogiri (1.16.5-x86_64-linux)
20 |       racc (~> 1.4)
21 |     racc (1.8.0)
22 |     rexml (3.3.6)
23 |       strscan
24 |     rubyzip (2.3.2)
25 |     spider (0.5.4)
26 |     strscan (3.1.0)
27 |
28 | PLATFORMS
29 |   aarch64-linux
30 |   arm-linux
31 |   arm64-darwin
32 |   x86-linux
33 |   x86_64-darwin
34 |   x86_64-linux
35 |
36 | DEPENDENCIES
37 |   mime
38 |   mime-types (>= 3.3.1)
39 |   mini_exiftool
40 |   nokogiri
41 |   rexml
42 |   rubyzip
43 |   spider
44 |
45 | BUNDLED WITH
46 |    2.5.9
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CeWL - Custom Word List generator
2 |
3 | Copyright(c) 2024, Robin Wood
4 |
5 | Based on a discussion on PaulDotCom (episode 129) about creating custom word lists by spidering a target's website and collecting unique words, I decided to write CeWL, the Custom Word List generator. CeWL is a Ruby app which spiders a given URL to a specified depth, optionally following external links, and returns a list of words which can then be used with password crackers such as John the Ripper.
6 |
7 | By default, CeWL sticks to just the site you have specified and will go to a depth of 2 links; this behaviour can be changed by passing arguments. Be careful if setting a large depth and allowing it to go offsite, you could end up drifting onto a lot of other domains. All words of three characters and over are output to stdout. This length can be increased and the words can be written to a file rather than the screen so the app can be automated.
8 |
9 | CeWL also has an associated command line app, FAB (Files Already Bagged), which uses the same meta data extraction techniques to create author/creator lists from files already downloaded.
10 |
11 | For anyone running CeWL with Ruby 2.7, you might get some warnings in this style:
12 |
13 | ```
14 | .../ruby-2.7.0/gems/mime-types-3.2.2/lib/mime/types/logger.rb:30: warning: `_1' is reserved for numbered parameter; consider another name
15 | ```
16 | This is due to a new feature introduced in 2.7 which conflicts with one line of code in the logger script from the mime-types gem. There is an update for it in the [gem's repo](https://github.com/mime-types/ruby-mime-types/commit/c44673179d24e495e5fb93282a87d37f09925d25#diff-f0a644249326afd54e7a0b90c807f8a6) so hopefully that will be released soon. Till then, as far as I can tell, the warning does not affect CeWL in any way.
If, for aesthetics, you want to hide the warning, you can run the script as follows:
17 |
18 | ```
19 | ruby -W0 ./cewl.rb
20 | ```
21 |
22 | Homepage: <https://digi.ninja/projects/cewl.php>
23 |
24 | GitHub: <https://github.com/digininja/CeWL>
25 |
26 | ## Pronunciation
27 |
28 | Seeing as I was asked, CeWL is pronounced "cool".
29 |
30 | ## Installation
31 |
32 | CeWL needs the following gems to be installed:
33 |
34 | * mime
35 | * mime-types
36 | * mini_exiftool
37 | * nokogiri
38 | * rubyzip
39 | * spider
40 |
41 | The easiest way to install these gems is with Bundler:
42 |
43 | ```
44 | gem install bundler
45 | bundle install
46 | ```
47 |
48 | Alternatively, you can install them manually with:
49 |
50 | ```
51 | gem install xxx
52 | ```
53 |
54 | The `mini_exiftool` gem also requires the exiftool application to be installed.
55 |
56 | Assuming you cloned the GitHub repo, the script should be executable by default, but if not, you can make it executable with:
57 |
58 | ```
59 | chmod u+x ./cewl.rb
60 | ```
61 |
62 | The project page on my site gives some tips on solving common problems people
63 | have encountered while running CeWL - https://digi.ninja/projects/cewl.php
64 |
65 | ## Usage
66 |
67 | ```
68 | ./cewl.rb
69 |
70 | CeWL 5.5.2 (Grouping) Robin Wood (robin@digi.ninja) (https://digi.ninja/)
71 | Usage: cewl [OPTIONS] ... <url>
72 |
73 | OPTIONS:
74 | -h, --help: Show help.
75 | -k, --keep: Keep the downloaded file.
76 | -d <x>,--depth <x>: Depth to spider to, default 2.
77 | -m, --min_word_length: Minimum word length, default 3.
78 | -o, --offsite: Let the spider visit other sites.
79 | -w, --write: Write the output to the file.
80 | -u <agent>, --ua <agent>: User agent to send.
81 | -n, --no-words: Don't output the wordlist.
82 | -a, --meta: Include meta data.
83 | --meta_file file: Output file for meta data.
84 | -e, --email: Include email addresses.
85 | --email_file <file>: Output file for email addresses.
86 | --meta-temp-dir <dir>: The temporary directory used by exiftool when parsing files, default /tmp.
87 | -c, --count: Show the count for each word found.
88 | -v, --verbose: Verbose.
89 | --debug: Extra debug information.
90 |
91 | Authentication
92 | --auth_type: Digest or basic.
93 | --auth_user: Authentication username.
94 | --auth_pass: Authentication password.
95 |
96 | Proxy Support
97 | --proxy_host: Proxy host.
98 | --proxy_port: Proxy port, default 8080.
99 | --proxy_username: Username for proxy, if required.
100 | --proxy_password: Password for proxy, if required.
101 |
102 | Headers
103 | --header, -H: In format name:value - can pass multiple.
104 |
105 | <url>: The site to spider.
106 | ```
107 |
108 | ### Running CeWL in a Docker container
109 |
110 |
111 | To quickly use CeWL with Docker, you can use the official `ghcr.io/digininja/cewl` image:
112 |
113 | ```sh
114 | docker run -it --rm -v "${PWD}:/host" ghcr.io/digininja/cewl [OPTIONS] ... <url>
115 | ```
116 |
117 | You can also build it locally:
118 | ```sh
119 | docker build -t cewl .
120 | docker run -it --rm -v "${PWD}:/host" cewl [OPTIONS] ... <url>
121 | ```
122 |
123 | I am going to stress here: I am not going to be offering any support for this. The work was done by [@loris-intergalactique](https://github.com/loris-intergalactique), who has offered to field any questions on it and give support. I don't use or know Docker, so please, don't ask me for help.
124 |
125 | ## Licence
126 |
127 | This project is released under the Creative Commons Attribution-Share Alike 2.0 UK: England & Wales
128 |
129 | <http://creativecommons.org/licenses/by-sa/2.0/uk/>
130 |
131 | Alternatively, you can use GPL-3+ instead of the original licence.
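## Example runs

As a rough sketch of typical invocations built from the options listed above (the target URL and output file names here are placeholders, not from the project docs):

```sh
# Spider the target to depth 2, keep words of 5+ characters and
# write the word list to wordlist.txt
./cewl.rb -d 2 -m 5 -w wordlist.txt https://example.com

# Also collect email addresses and document meta data into separate files
./cewl.rb -e --email_file emails.txt -a --meta_file meta.txt https://example.com
```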
132 |
133 |
--------------------------------------------------------------------------------
/cewl.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | #encoding: UTF-8
3 |
4 | # == CeWL: Custom Word List Generator
5 | #
6 | # CeWL will spider a target site and generate the following lists:
7 | #
8 | # * A word list of all unique words found on the target site
9 | # * A list of all email addresses found in mailto links
10 | # * A list of usernames/author details from meta data found in any documents on the site
11 | # * Groups of words up to the specified group size
12 | #
13 | # URL: The site to spider.
14 | #
15 | # Author:: Robin Wood (robin@digi.ninja)
16 | # Copyright:: Copyright (c) Robin Wood 2018
17 | # Licence:: CC-BY-SA 2.0 or GPL-3+
18 | #
19 |
20 | VERSION = "6.2.1 (More Fixes)"
21 |
22 | puts "CeWL #{VERSION} Robin Wood (robin@digi.ninja) (https://digi.ninja/)\n"
23 |
24 | begin
25 | 	require 'getoptlong'
26 | 	require 'spider'
27 | 	require 'nokogiri'
28 | 	require 'net/http'
29 | rescue LoadError => e
30 | 	# Catch error and provide feedback on installing gem
31 | 	if e.to_s =~ /cannot load such file -- (.*)/
32 | 		missing_gem = $1
33 | 		puts "\nError: #{missing_gem} gem not installed\n"
34 | 		puts "	  Run 'bundle install' to install all the required gems\n\n"
35 | 		exit 2
36 | 	else
37 | 		puts "There was an error loading the gems:\n"
38 | 		puts e.to_s
39 | 		exit 2
40 | 	end
41 | end
42 |
43 | require_relative 'cewl_lib'
44 |
45 | # Doing this so I can override the allowed? function which normally checks
46 | # the robots.txt file
47 | class MySpider<Spider
48 | 	@@proxy_host = nil
49 | 	@@proxy_port = nil
50 | 	@@proxy_username = nil
51 | 	@@proxy_password = nil
52 |
53 | 	@@auth_type = nil
54 | 	@@auth_user = nil
55 | 	@@auth_password = nil
56 |
57 | 	@@headers = nil
58 |
59 | 	@@verbose = false
60 | 	@@debug = false
61 |
62 | 	# Store the proxy details to pass on to each spider instance
63 | 	def self.proxy(host, port = nil, username = nil, password = nil)
64 | 		@@proxy_host = host
65 | 		@@proxy_port = port
66 | 		@@proxy_username = username
67 | 		@@proxy_password = password
68 | 	end
69 |
70 | 	# Store the credentials for basic or digest authentication
71 | 	def self.auth_creds(type, user, password)
72 | 		@@auth_type = type
73 | 		@@auth_user = user
74 | 		@@auth_password = password
75 | 	end
76 |
77 | 	# Store any custom headers to send with every request
78 | 	def self.headers(headers)
79 | 		@@headers = headers
80 | 	end
81 |
82 | 	def self.verbose(val)
83 | 		@@verbose = val
84 | 	end
85 |
86 | 	def self.debug(val)
87 | 		@@debug = val
88 | 	end
89 |
90 | 	# Create an instance of my spider class rather than the standard
91 | 	# SpiderInstance. The instance ignores robots.txt, processes every
92 | 	# page it is given and has all the proxy, auth and header settings
93 | 	# stored above passed through to it before the spidering starts.
94 | 	# The robot rules object is still needed by the underlying Spider
95 | 	# class so one is created here even though it is never consulted.
96 | 	def self.start_at(a_url, &block)
97 | 		rules = RobotRules.new('CeWL')
98 |
99 | 		a_spider = MySpiderInstance.new({nil => a_url}, [], rules, [])
100 |
101 | 		a_spider.headers = @@headers
102 |
103 | 		a_spider.auth_type = @@auth_type
104 | 		a_spider.auth_user = @@auth_user
105 | 		a_spider.auth_password = @@auth_password
106 |
107 | 		a_spider.proxy_host = @@proxy_host
108 | 		a_spider.proxy_port = @@proxy_port
109 | 		a_spider.proxy_username = @@proxy_username
110 | 		a_spider.proxy_password = @@proxy_password
111 |
112 | 		a_spider.verbose = @@verbose
113 | 		a_spider.debug = @@debug
114 | 		block.call(a_spider)
115 | 		a_spider.start!
116 | 	end
117 | end
118 |
119 | # My version of the spider class which allows all files
120 | # to be processed
121 | class MySpiderInstance<SpiderInstance
122 |
123 | 	# Writers for all the settings which MySpider needs to pass
124 | 	# through to the instance which actually does the spidering,
125 | 	# the class level values can't be read from in here
126 | 	attr_writer :headers
127 |
128 | 	attr_writer :auth_type
129 | 	attr_writer :auth_user
130 | 	attr_writer :auth_password
131 |
132 | 	attr_writer :proxy_host
133 | 	attr_writer :proxy_port
134 | 	attr_writer :proxy_username
135 | 	attr_writer :proxy_password
136 |
137 | 	attr_writer :verbose
138 | 	attr_writer :debug
139 |
140 | 	# Set by the SIGINT trap so that a run can be stopped cleanly
141 | 	attr_writer :interrupt
142 |
143 | 	# The spider gem normally checks a site's robots.txt before
144 | 	# visiting a page, override the check so that every URL is
145 | 	# allowed
146 | 	def allowed?(a_url, parsed_url)
147 | 		true
148 | 	end
149 |
150 | 	# The main spidering loop. Pop a set of URLs off the stack,
151 | 	# fetch each page, fire the registered callbacks for it and
152 | 	# then queue up any new URLs the page generates. Keep going
153 | 	# till the stack of URLs to visit is empty or the user hits
154 | 	# ctrl-c
155 | 	def start! #:nodoc:
156 | 		interrupted = false
157 | 		trap("SIGINT") { interrupted = true }
158 |
159 | 		begin
160 | 			next_urls = @next_urls.pop
161 |
162 | 			next_urls.each do |prior_url, urls|
163 | 				# Pair each URL up with its parsed version, dropping
164 | 				# any which fail to parse, then filter out the ones
165 | 				# which shouldn't be visited
166 | 				urls.map do |a_url|
167 | 					[a_url, (URI.parse(a_url) rescue nil)]
168 | 				end.select do |a_url, parsed_url|
169 | 					allowable_url?(a_url, parsed_url)
170 | 				end.each do |a_url, parsed_url|
171 | 					# Call the setup hook, if one was registered,
172 | 					# before the page is requested
173 | 					@setup.call(a_url) unless @setup.nil?
174 |
175 | 					get_page(parsed_url) do |response|
176 | 						# Hand the response to all the registered
177 | 						# callbacks so that the word list, email
178 | 						# and meta data processing can happen for
179 | 						# the page, whatever its response code
180 | 						do_callbacks(a_url, response, prior_url)
181 |
182 | 						# Parse the page for new links and add
183 | 						# each one to the stack of URLs to visit,
184 | 						# keyed on the page it was found on so
185 | 						# that the depth checks in the tree work
186 | 						generate_next_urls(a_url, response).each do |a_next_url|
187 | 							@next_urls.push a_url => a_next_url
188 | 						end
189 | 						#exit if interrupted
190 | 					end
191 |
192 | 					@teardown.call(a_url) unless @teardown.nil?
193 | 					throw :ctrl_c if @interrupt
194 | 				end
195 | 			end
196 | 		end while !@next_urls.empty?
197 | 	end
198 |
199 | 	def get_page(uri, &block) #:nodoc:
200 | 		@seen << uri
201 |
202 | 		trap("SIGINT") { puts 'Hold on, stopping here ...'; @interrupt = true }
203 | 		begin
204 | 			if @proxy_host.nil?
205 | http = Net::HTTP.new(uri.host, uri.port) 206 | 207 | if uri.scheme == 'https' 208 | http.use_ssl = true 209 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE 210 | end 211 | else 212 | proxy = Net::HTTP::Proxy(@proxy_host, @proxy_port, @proxy_username, @proxy_password) 213 | begin 214 | if uri.scheme == 'https' 215 | http = proxy.start(uri.host, uri.port, :use_ssl => true, :verify_mode => OpenSSL::SSL::VERIFY_NONE) 216 | else 217 | http = proxy.start(uri.host, uri.port) 218 | end 219 | rescue => e 220 | puts "\nFailed to connect to the proxy (#{@proxy_host}:#{@proxy_port})\n\n" 221 | exit 2 222 | end 223 | end 224 | 225 | req = Net::HTTP::Get.new(uri.request_uri) 226 | @headers.each_pair do |header, value| 227 | req[header] = value 228 | end 229 | 230 | if @auth_type 231 | case @auth_type 232 | when "digest" 233 | uri.user = @auth_user 234 | uri.password = @auth_password 235 | 236 | res = http.request req 237 | 238 | if res['www-authenticate'] 239 | digest_auth = Net::HTTP::DigestAuth.new 240 | auth = digest_auth.auth_header uri, res['www-authenticate'], 'GET' 241 | 242 | req = Net::HTTP::Get.new uri.request_uri 243 | req.add_field 'Authorization', auth 244 | end 245 | 246 | when "basic" 247 | req.basic_auth @auth_user, @auth_password 248 | end 249 | end 250 | 251 | res = http.request(req) 252 | 253 | if res.redirect? 254 | puts "Redirect URL" if @debug 255 | base_url = uri.to_s[0, uri.to_s.rindex('/')] 256 | new_url = URI.parse(construct_complete_url(base_url, res['Location'])) 257 | 258 | # If auth is used then a name:pass@ gets added, this messes the tree 259 | # up so easiest to just remove it 260 | current_uri = uri.to_s.gsub(/:\/\/[^:]*:[^@]*@/, "://") 261 | @next_urls.push current_uri => new_url.to_s 262 | elsif res.code == "401" 263 | puts "Authentication required, can't continue on this branch - #{uri}" if @verbose 264 | else 265 | block.call(res) 266 | end 267 | rescue Zlib::DataError => e 268 | puts "Error in Zlib decompressing data on #{uri}, moving on regardless" 269 | rescue SocketError, Errno::EHOSTUNREACH => e 270 | puts "Couldn't hit the site #{uri}, moving on" 271 | rescue NoMethodError => e 272 | if @verbose 273 | puts "Unable to process URL" 274 | puts "Message is #{e.to_s}" 275 | puts e.backtrace 276 | end 277 | rescue => e 278 | puts "\nUnable to connect to the site (#{uri.scheme}://#{uri.host}:#{uri.port}#{uri.request_uri})" 279 | 280 | if @verbose 281 | puts "\nThe following error may help:" 282 | puts e.to_s 283 | puts e.backtrace 284 | puts "\nCaller" 285 | puts caller 286 | else 287 | puts "Run in verbose mode (-v) for more information" 288 | end 289 | 290 | puts "\n\n" 291 | end 292 | end 293 | 294 | # Overriding so that I can get it to ignore direct names - i.e. #name 295 | def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc: 296 | return nil if additional_url =~ /^#/ 297 | 298 | parsed_additional_url ||= URI.parse(additional_url) 299 | if parsed_additional_url.scheme.nil? 300 | u = base_url.is_a?(URI) ? base_url : URI.parse(base_url) 301 | if additional_url[0].chr == '/' 302 | url = "#{u.scheme}://#{u.host}:#{u.port}#{additional_url}" 303 | elsif u.path.nil? 
|| u.path == ''
304 | 			url = "#{u.scheme}://#{u.host}:#{u.port}/#{additional_url}"
305 | 		elsif u.path[0].chr == '/'
306 | 			url = "#{u.scheme}://#{u.host}:#{u.port}#{u.path}/#{additional_url}"
307 | 		else
308 | 			url = "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{additional_url}"
309 | 		end
310 | 	else
311 | 		url = additional_url
312 | 	end
313 | 	return url
314 | end
315 |
316 | # Overriding the original spider one as it doesn't find hrefs very well
317 | def generate_next_urls(a_url, resp) #:nodoc:
318 | 	if @debug
319 | 		puts "a_url = #{a_url}"
320 | 		puts "resp = #{resp}"
321 | 	end
322 | 	web_page = resp.body
323 | 	if URI.parse(a_url).path.empty?
324 | 		base_url = a_url
325 | 	else
326 | 		base_url = a_url[0, a_url.rindex('/')]
327 | 	end
328 | 	puts "base_url: #{base_url}" if @debug
329 |
330 | 	doc = Nokogiri::HTML(web_page)
331 | 	links = doc.css('a').map { |a| a['href'] }
332 |
333 | 	puts "links = #{links.inspect}" if @debug
334 | 	links.map do |link|
335 | 		begin
336 | 			if link.nil?
337 | 				nil
338 | 			else
339 | 				begin
340 | 					parsed_link = URI.parse(link)
341 | 					parsed_link.fragment == '#' ? nil : construct_complete_url(base_url, link, parsed_link)
342 | 				rescue
343 | 					nil
344 | 				end
345 | 			end
346 | 		rescue => e
347 | 			puts "\nThere was an error generating URL list"
348 | 			puts "Error: #{e.inspect}"
349 | 			puts e.backtrace
350 | 			exit 2
351 | 		end
352 | 	end.compact
353 | end
354 | end
355 |
356 | # A node for a tree
357 | class TreeNode
358 | 	attr :value
359 | 	attr :depth
360 | 	attr :key
361 | 	attr :visited, true
362 |
363 | 	def initialize(key, value, depth)
364 | 		@key = key
365 | 		@value = value
366 | 		@depth = depth
367 | 		@visited = false
368 | 	end
369 |
370 | 	def to_s
371 | 		if key.nil?
372 | 			return "key=nil value=#{@value} depth=#{@depth.to_s} visited=#{@visited.to_s}"
373 | 		else
374 | 			return "key=#{@key} value=#{@value} depth=#{@depth.to_s} visited=#{@visited.to_s}"
375 | 		end
376 | 	end
377 |
378 | 	def to_url_hash
379 | 		return({@key => @value})
380 | 	end
381 | end
382 |
383 | # A tree structure
384 | class Tree
385 | 	attr :data
386 | 	attr_writer :debug
387 | 	attr_writer :max_depth
388 | 	@children
389 |
390 | 	# Get the maximum depth the tree can grow to
391 | 	def max_depth
392 | 		@max_depth
393 | 	end
394 |
395 | 	# Set the max depth the tree can grow to
396 | 	def max_depth=(val)
397 | 		@max_depth = Integer(val)
398 | 	end
399 |
400 | 	# As this is used to work out if there are any more nodes to process it isn't a true empty
401 | 	def empty?
402 | 		if !@data.visited
403 | 			return false
404 | 		else
405 | 			@children.each { |node|
406 | 				return false if !node.data.visited
407 | 			}
408 | 		end
409 | 		return true
410 | 	end
411 |
412 | 	# The constructor
413 | 	def initialize(key=nil, value=nil, depth=0, debug=false)
414 | 		@data = TreeNode.new(key, value, depth)
415 | 		@children = []
416 | 		@max_depth = 2
417 | 	end
418 |
419 | 	# Iterator
420 | 	def each
421 | 		yield @data
422 | 		@children.each do |child_node|
423 | 			child_node.each { |e| yield e }
424 | 		end
425 | 	end
426 |
427 | 	# Remove an item from the tree
428 | 	def pop
429 | 		if !@data.visited
430 | 			@data.visited = true
431 | 			return @data.to_url_hash
432 | 		else
433 | 			@children.each { |node|
434 | 				if !node.data.visited
435 | 					node.data.visited = true
436 | 					return node.data.to_url_hash
437 | 				end
438 | 			}
439 | 		end
440 | 		return nil
441 | 	end
442 |
443 | 	# Push an item onto the tree
444 | 	def push(value)
445 | 		puts "Adding #{value} to the tree" if @debug
446 | 		key = value.keys.first
447 | 		value = value.values_at(key).first
448 |
449 | 		if key.nil?
450 | 			@data = TreeNode.new(key, value, 0)
451 | 		else
452 | 			# If the depth is 0 then don't add anything to the tree
453 | 			return if @max_depth == 0
454 | 			if key == @data.value
455 | 				child = Tree.new(key, value, @data.depth + 1, @debug)
456 | 				@children << child
457 | 			else
458 | 				@children.each { |node|
459 | 					# Ignore the max depth for mailto links.
460 | 					# This is not a good way to do this, but it will work for now
461 | 					# and we all know dirty hacks stay around forever so don't
462 | 					# expect this to be fixed for a while.
463 | 					if value =~ /^mailto:/ then
464 | 						if node.data.value == key then
465 | 							child = Tree.new(key, value, node.data.depth + 1, @debug)
466 | 							@children << child
467 | 						end
468 | 					else
469 | 						if node.data.value == key && node.data.depth<@max_depth then
470 | 							child = Tree.new(key, value, node.data.depth + 1, @debug)
471 | 							@children << child
472 | 						end
473 | 					end
474 | 				}
475 | 			end
476 | 		end
477 | 	end
478 | end
479 |
480 | opts = GetoptLong.new(
481 | 	['--help', '-h', GetoptLong::NO_ARGUMENT],
482 | 	['--keep', '-k', GetoptLong::NO_ARGUMENT],
483 | 	['--depth', '-d', GetoptLong::REQUIRED_ARGUMENT],
484 | 	['--min_word_length', "-m", GetoptLong::REQUIRED_ARGUMENT],
485 | 	['--max_word_length', "-x", GetoptLong::REQUIRED_ARGUMENT],
486 | 	['--no-words', "-n", GetoptLong::NO_ARGUMENT],
487 | 	['--groups', "-g", GetoptLong::REQUIRED_ARGUMENT],
488 | 	['--offsite', "-o", GetoptLong::NO_ARGUMENT],
489 | 	['--exclude', GetoptLong::REQUIRED_ARGUMENT],
490 | 	['--allowed', GetoptLong::REQUIRED_ARGUMENT],
491 | 	['--write', "-w", GetoptLong::REQUIRED_ARGUMENT],
492 | 	['--ua', "-u", GetoptLong::REQUIRED_ARGUMENT],
493 | 	['--meta-temp-dir', GetoptLong::REQUIRED_ARGUMENT],
494 | 	['--meta_file', GetoptLong::REQUIRED_ARGUMENT],
495 | 	['--email_file', GetoptLong::REQUIRED_ARGUMENT],
496 | 	['--lowercase', GetoptLong::NO_ARGUMENT],
497 | 	['--with-numbers', GetoptLong::NO_ARGUMENT],
498 | 	['--convert-umlauts', GetoptLong::NO_ARGUMENT],
499 | 	['--meta', "-a", GetoptLong::NO_ARGUMENT],
500 | 	['--email', "-e", GetoptLong::NO_ARGUMENT],
501 | 	['--count', '-c', GetoptLong::NO_ARGUMENT],
502 | 	['--auth_user', GetoptLong::REQUIRED_ARGUMENT],
503 | 	['--auth_pass', GetoptLong::REQUIRED_ARGUMENT],
504 | 	['--auth_type', GetoptLong::REQUIRED_ARGUMENT],
505 | 	['--header', "-H", GetoptLong::REQUIRED_ARGUMENT],
506 | 	['--proxy_host', GetoptLong::REQUIRED_ARGUMENT],
507 | 	['--proxy_port', GetoptLong::REQUIRED_ARGUMENT],
508 | 	['--proxy_username', GetoptLong::REQUIRED_ARGUMENT],
509 | 	['--proxy_password', GetoptLong::REQUIRED_ARGUMENT],
510 | 	["--verbose", "-v", GetoptLong::NO_ARGUMENT],
511 | 	["--debug", GetoptLong::NO_ARGUMENT]
512 | )
513 |
514 | # Display the usage
515 | def usage
516 | 	puts "Usage: cewl [OPTIONS] ... <url>
517 |
518 | OPTIONS:
519 | 	-h, --help: Show help.
520 | 	-k, --keep: Keep the downloaded file.
521 | 	-d <x>,--depth <x>: Depth to spider to, default 2.
522 | 	-m, --min_word_length: Minimum word length, default 3.
523 | 	-x, --max_word_length: Maximum word length, default unset.
524 | 	-o, --offsite: Let the spider visit other sites.
525 | 	--exclude: A file containing a list of paths to exclude
526 | 	--allowed: A regex pattern that paths must match to be followed
527 | 	-w, --write: Write the output to the file.
528 | 	-u <agent>, --ua <agent>: User agent to send.
529 | 	-n, --no-words: Don't output the wordlist.
530 | 	-g <x>, --groups <x>: Return groups of words as well
531 | 	--lowercase: Lowercase all parsed words
532 | 	--with-numbers: Accept words with numbers in as well as just letters
533 | 	--convert-umlauts: Convert common ISO-8859-1 (Latin-1) umlauts (ä-ae, ö-oe, ü-ue, ß-ss)
534 | 	-a, --meta: Include meta data.
535 | 	--meta_file file: Output file for meta data.
536 | 	-e, --email: Include email addresses.
537 | 	--email_file <file>: Output file for email addresses.
538 | 	--meta-temp-dir <dir>: The temporary directory used by exiftool when parsing files, default /tmp.
539 | 	-c, --count: Show the count for each word found.
540 | 	-v, --verbose: Verbose.
541 | 	--debug: Extra debug information.
542 |
543 | Authentication
544 | 	--auth_type: Digest or basic.
545 | 	--auth_user: Authentication username.
546 | 	--auth_pass: Authentication password.
547 |
548 | Proxy Support
549 | 	--proxy_host: Proxy host.
550 | 	--proxy_port: Proxy port, default 8080.
551 | 	--proxy_username: Username for proxy, if required.
552 | 	--proxy_password: Password for proxy, if required.
553 |
554 | Headers
555 | 	--header, -H: In format name:value - can pass multiple.
556 |
557 | <url>: The site to spider.
558 |
559 | "
560 | 	exit 0
561 | end
562 |
563 | debug = false
564 | verbose = false
565 | ua = nil
566 | url = nil
567 | outfile = nil
568 | email_outfile = nil
569 | meta_outfile = nil
570 | offsite = false
571 | exclude_array = []
572 | allowed_pattern = nil
573 | depth = 2
574 | min_word_length = 3
575 | max_word_length = -1
576 | email = false
577 | meta = false
578 | wordlist = true
579 | groups = -1
580 | meta_temp_dir = "/tmp/"
581 | keep = false
582 | lowercase = false
583 | words_with_numbers = false
584 | convert_umlauts = false
585 | show_count = false
586 | auth_type = nil
587 | auth_user = nil
588 | auth_pass = nil
589 |
590 | proxy_host = nil
591 | proxy_port = nil
592 | proxy_username = nil
593 | proxy_password = nil
594 |
595 | # headers will be passed in in the format "header: value"
596 | # and there can be multiple
597 | headers = []
598 |
599 | strip_css = true
600 | strip_js = true
601 |
602 | begin
603 | 	opts.each do |opt, arg|
604 | 		case opt
605 | 		when '--help'
606 | 			usage
607 | 		when "--lowercase"
608 | 			lowercase = true
609 | 		when "--with-numbers"
610 | 			words_with_numbers = true
611 | 		when "--convert-umlauts"
612 | 			convert_umlauts = true
613 | 		when "--count"
614 | 			show_count = true
615 | 		when "--meta-temp-dir"
616 | 			if !File.directory?(arg)
617 | 				puts "\nMeta temp directory is not a directory\n\n"
618 | 				exit 1
619 | 			end
620 |
621 | 			if !File.writable?(arg)
622 | 				puts "\nThe meta temp directory is not writable\n\n"
623 | 				exit 1
624 | 			end
625 |
626 | 			meta_temp_dir = arg
627 | 			meta_temp_dir += "/" if meta_temp_dir !~ /.*\/$/
628 | 		when "--keep"
629 | 			keep = true
630 | 		when "--no-words"
631 | 			wordlist = false
632 | 		when "--meta_file"
633 | 			meta_outfile = arg
634 | 		when "--meta"
635 | 			meta = true
636 | 		when "--groups"
637 | 			groups = arg.to_i
638 | 		when "--email_file"
639 | 			email_outfile = arg
640 | 		when "--email"
641 | 			email = true
642 | 		when '--max_word_length'
643 | 			max_word_length = arg.to_i
644 | 			usage if max_word_length < 1
645 | 		when '--min_word_length'
646 | 			min_word_length = arg.to_i
647 | 			usage if min_word_length < 1
648 | 		when '--depth'
649 | 			depth = arg.to_i
650 | 			usage if depth < 0
651 | 		when '--offsite'
652 | 			offsite = true
653 | 		when '--exclude'
654 | 			begin
655 | 				tmp_exclude_array = File.readlines(arg)
656 | 			rescue => e
657 | 				puts "\nUnable to open the exclude file\n\n"
658 | 				exit 1
659 | 			end
660 | 			# Have to do this to
strip the newline characters from the end 661 | # of each element in the array 662 | tmp_exclude_array.each do |line| 663 | exc = line.strip 664 | if exc != "" 665 | exclude_array << line.strip 666 | # puts "Excluding #{ line.strip}" 667 | end 668 | end 669 | when '--allowed' 670 | allowed_pattern = Regexp.new(arg) 671 | when '--ua' 672 | ua = arg 673 | when '--debug' 674 | debug = true 675 | when '--verbose' 676 | verbose = true 677 | when '--write' 678 | outfile = arg 679 | when "--header" 680 | headers << arg 681 | when "--proxy_password" 682 | proxy_password = arg 683 | when "--proxy_username" 684 | proxy_username = arg 685 | when "--proxy_host" 686 | proxy_host = arg 687 | when "--proxy_port" 688 | proxy_port = arg.to_i 689 | when "--auth_pass" 690 | auth_pass = arg 691 | when "--auth_user" 692 | auth_user = arg 693 | when "--auth_type" 694 | if arg =~ /(digest|basic)/i 695 | auth_type = $1.downcase 696 | if auth_type == "digest" 697 | begin 698 | require "net/http/digest_auth" 699 | rescue LoadError => e 700 | # Catch error and provide feedback on installing gem 701 | puts "\nError: To use digest auth you require the net-http-digest_auth gem\n" 702 | puts "\t Use: 'gem install net-http-digest_auth'\n\n" 703 | exit 2 704 | end 705 | end 706 | else 707 | puts "\nInvalid authentication type, please specify either basic or digest\n\n" 708 | exit 1 709 | end 710 | end 711 | end 712 | rescue => e 713 | # puts e 714 | usage 715 | end 716 | 717 | if auth_type && (auth_user.nil? || auth_pass.nil?) 718 | puts "\nIf using basic or digest auth you must provide a username and password\n\n" 719 | exit 1 720 | end 721 | 722 | if auth_type.nil? && (!auth_user.nil? || !auth_pass.nil?) 723 | puts "\nAuthentication details provided but no mention of basic or digest\n\n" 724 | exit 1 725 | end 726 | 727 | if ARGV.length != 1 728 | puts "\nMissing URL argument (try --help)\n\n" 729 | exit 1 730 | end 731 | 732 | url = ARGV.shift 733 | 734 | # Must have protocol 735 | url = "http://#{url}" if url !~ /^http(s)?:\/\// 736 | 737 | # Taking this back out again. Can't remember why it was put in but have found problems 738 | # with it in and none with it out so getting rid of it. 
739 | #
740 | # The spider doesn't work properly if there isn't a / on the end
741 | #if url !~ /\/$/
742 | #	url = "#{url}/"
743 | #end
744 |
745 | group_word_hash = {}
746 | word_hash = {}
747 | email_arr = []
748 | url_stack = Tree.new
749 | url_stack.debug = debug
750 | url_stack.max_depth = depth
751 | usernames = Array.new()
752 |
753 | # Do the checks here so we don't do all the processing then find we can't open the file
754 | if outfile
755 | 	begin
756 | 		outfile_file = File.new(outfile, "w")
757 | 	rescue
758 | 		puts "\nCouldn't open the output file for writing\n\n"
759 | 		exit 2
760 | 	end
761 | else
762 | 	outfile_file = $stdout
763 | end
764 |
765 | if email_outfile && email
766 | 	begin
767 | 		email_outfile_file = File.new(email_outfile, "w")
768 | 	rescue
769 | 		puts "\nCouldn't open the email output file for writing\n\n"
770 | 		exit 2
771 | 	end
772 | else
773 | 	email_outfile_file = outfile_file
774 | end
775 |
776 | if meta_outfile && meta
777 | 	begin
778 | 		meta_outfile_file = File.new(meta_outfile, "w")
779 | 	rescue
780 | 		puts "\nCouldn't open the metadata output file for writing\n\n"
781 | 		exit 2
782 | 	end
783 | else
784 | 	meta_outfile_file = outfile_file
785 | end
786 |
787 | catch :ctrl_c do
788 | 	begin
789 | 		puts "Starting at #{url}" if verbose
790 |
791 | 		MySpider.proxy(proxy_host, proxy_port, proxy_username, proxy_password) if proxy_host
792 | 		MySpider.auth_creds(auth_type, auth_user, auth_pass) if auth_type
793 | 		MySpider.headers(headers)
794 | 		MySpider.verbose(verbose)
795 | 		MySpider.debug(debug)
796 |
797 | 		MySpider.start_at(url) do |s|
798 | 			s.headers['User-Agent'] = ua if ua
799 |
800 | 			s.add_url_check do |a_url|
801 | 				puts "Checking page #{a_url}" if debug
802 | 				allow = true
803 |
804 | 				# Extensions to ignore
805 | 				if a_url =~ /(\.zip$|\.gz$|\.bz2$|\.png$|\.gif$|\.jpg$|^#)/
806 | 					puts "Ignoring internal link or graphic: #{a_url}" if verbose
807 | 					allow = false
808 | 				else
809 | 					if /^mailto:(.*)/i.match(a_url)
810 | 						if email
811 | 							email_arr << $1
812 | 							puts "Found #{$1} on page #{a_url}" if verbose
813 | 						end
814 | 						allow = false
815 | 					else
816 | 						a_url_parsed = URI.parse(a_url)
817 | 						if !offsite
818 | 							url_parsed = URI.parse(url)
819 | 							puts "Comparing #{a_url} with #{url}" if debug
820 |
821 | 							# Make sure the host, port and scheme matches (else it's offsite)
822 | 							allow = (a_url_parsed.host == url_parsed.host) && (a_url_parsed.port == url_parsed.port) && (a_url_parsed.scheme == url_parsed.scheme) ? true : false
823 |
824 | 							puts "Offsite link, not following: #{a_url}" if !allow && verbose
825 | 						else
826 | 							puts "Allowing offsite links" if debug
827 | 						end
828 |
829 | 						puts "Found: #{a_url_parsed.path}" if debug
830 |
831 | 						if exclude_array.include?(a_url_parsed.request_uri)
832 | 							puts "Excluding page: #{a_url_parsed.request_uri}" if verbose
833 | 							allow = false
834 | 						end
835 |
836 | 						if allowed_pattern && !a_url_parsed.path.match(allowed_pattern)
837 | 							puts "Excluding path: #{a_url_parsed.path} based on allowed pattern" if verbose
838 | 							allow = false
839 | 						end
840 | 					end
841 | 				end
842 | 				allow
843 | 			end
844 |
845 | 			# This was :success so only the content from a 200 was processed.
846 | 			# Updating it to :every so that the content of all pages gets processed
847 | 			# so you can grab things off 404s or text leaked on redirect and error pages.
848 |
849 | 			s.on :every do |a_url, resp, prior_url|
850 | 				if verbose
851 | 					if prior_url.nil?
852 | 						puts "Visiting: #{a_url}, got response code #{resp.code}"
853 | 					else
854 | 						puts "Visiting: #{a_url} referred from #{prior_url}, got response code #{resp.code}"
855 | 					end
856 | 				end
857 |
858 | 				# May want 0-9 in here as well in the future but for now limit it to a-z so
859 | 				# you can't sneak any nasty characters in
860 | 				if /.*\.([a-z]+)(\?.*$|$)/i.match(a_url)
861 | 					file_extension = $1
862 | 				else
863 | 					file_extension = ''
864 | 				end
865 |
866 | 				# Don't get words from these file types. Most will have been blocked by the url_check function but
867 | 				# some are let through, such as .css, so that they can be checked for email addresses
868 |
869 | 				# This is a bad way to do this but it is either white or black list extensions and
870 | 				# the list of either is quite long, may as well black list and let extra through
871 | 				# that can then be weeded out later rather than stop things that could be useful
872 |
873 | 				#if file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2|css|png|gif|jpg|#)$/
874 | 				if file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2|png|gif|jpg|#)$/
875 | 					if meta
876 | 						begin
877 | 							if keep && file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2)$/
878 | 								if /.*\/(.*)$/.match(a_url)
879 | 									output_filename = meta_temp_dir + $1
880 | 									puts "Keeping #{output_filename}" if verbose
881 | 								else
882 | 									# Shouldn't ever get here as the regex above should always be able to pull the filename out of the URL,
883 | 									# ...but just in case
884 |
885 | 									# Maybe look at doing this to make the temp name
886 | 									# require "tempfile"
887 | 									# Dir::Tmpname.make_tmpname "a", "b"
888 | 									# => "a20150707-8694-hrrxr4-b"
889 |
890 | 									output_filename = "#{meta_temp_dir}cewl_tmp"
891 | 									output_filename += ".#{file_extension}" unless file_extension.empty?
892 | 								end
893 | 							else
894 | 								output_filename = "#{meta_temp_dir}cewl_tmp"
895 | 								output_filename += ".#{file_extension}" unless file_extension.empty?
896 | 							end
897 |
898 | 							out = File.new(output_filename, "wb")
899 | 							out.print(resp.body)
900 | 							out.close
901 |
902 | 							meta_data = process_file(output_filename, verbose)
903 | 							usernames += meta_data if (meta_data != nil)
904 | 						rescue => e
905 | 							puts "\nCouldn't open the meta temp file for writing - #{e.inspect}\n\n"
906 | 							exit 2
907 | 						end
908 | 					end
909 | 				else
910 | 					html = resp.body.to_s.force_encoding("UTF-8")
911 | 					# This breaks on this site http://www.spisa.nu/recept/ as the
912 | 					# replace replaces some of the important characters. Needs a fix
913 | 					html.encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
914 | 					html.encode!('UTF-8', 'UTF-16')
915 |
916 | 					dom = Nokogiri.HTML(html)
917 | 					dom.css('script').remove if strip_js
918 | 					dom.css('style').remove if strip_css
919 | 					body = dom.to_s
920 |
921 | 					# Get meta data
922 | 					if /.*<meta.*description.*content\s*=[\s"']*(.*)/i.match(body)
923 | 						description = $1
924 | 						body += description.gsub(/[>"\/']*/, "")
925 | 					end
926 |
927 | 					if /.*<meta.*keywords.*content\s*=[\s"']*(.*)/i.match(body)
928 | 						keywords = $1
929 | 						body += keywords.gsub(/[>"\/']*/, "")
930 | 					end
931 |
932 | 					puts body if debug
933 |
934 | 					# This bit will not normally fire as all JavaScript is stripped out
935 | 					# by the Nokogiri remove a few lines before this.
936 | 					#
937 | 					# The code isn't perfect but will do a rough job of working out
938 | 					# pages from relative location links
939 | 					while /(location.href\s*=\s*["']([^"']*)['"];)/i.match(body)
940 | 						full_match = $1
941 | 						j_url = $2
942 |
943 | 						puts "Javascript redirect found #{j_url}" if verbose
944 |
945 | 						re = Regexp.escape(full_match)
946 | 						body.gsub!(/#{re}/, "")
947 |
948 | 						if j_url !~ /https?:\/\//i
949 | 							parsed = URI.parse(a_url)
950 | 							protocol = parsed.scheme
951 | 							host = parsed.host
952 |
953 | 							domain = "#{protocol}://#{host}"
954 |
955 | 							j_url = domain + j_url
956 | 							j_url += $1 if j_url[0] == "/" && parsed.path =~ /(.*)\/.*/
957 |
958 | 							puts "Relative URL found, adding domain to make #{j_url}" if verbose
959 | 						end
960 |
961 | 						x = {a_url => j_url}
962 | 						url_stack.push x
963 | 					end
964 |
965 | 					# Strip comment tags
966 | 					body.gsub!(/<!--/, "")
967 | 					body.gsub!(/-->/, "")
968 |
969 | 					# If you want to add more attribute names to include, just add them to this array
970 | 					attribute_names = [
971 | 						"alt",
972 | 						"title",
973 | 					]
974 |
975 | 					attribute_text = ''
976 |
977 | 					attribute_names.each { |attribute_name|
978 | 						body.gsub!(/#{attribute_name}="([^"]*)"/) { |attr| attribute_text += "#{$1} " }
979 | 					}
980 |
981 | 					if verbose and attribute_text
982 | 						puts "Attribute text found:"
983 | 						puts attribute_text
984 | 						puts
985 | 					end
986 |
987 | 					body += " #{attribute_text}"
988 |
989 | 					# Strip html tags
990 | 					words = body.gsub(/<\/?[^>]*>/, "")
991 |
992 | 					# Check if this is needed
993 | 					words.gsub!(/&[a-z]*;/, "")
994 |
995 | 					begin
996 | 						#if file_extension !~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2|css|png|gif|jpg|#)$/
997 | 						begin
998 | 							if email
999 | 								# Split the file down based on the email address regexp
1000 | 								#words.gsub!(/\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/i)
1001 | 								#p words
1002 |
1003 | 								# If you want to pull email addresses from the contents of files found, such as Word docs, then move
1004 | 								# this block outside the if statement
1005 | 								# I've put it in here as some docs contain email addresses that have nothing to do with the target
1006 | 								# so give false positive type results
1007 | 								words.each_line do |word|
1008 | 									while /\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/i.match(word)
1009 | 										puts "Found #{$1} on page #{a_url}" if verbose
1010 | 										email_arr << $1
1011 | 										word = word.gsub(/#{$1}/, "")
1012 | 									end
1013 | 								end
1014 | 							end
1015 | 						rescue => e
1016 | 							puts "\nThere was a problem generating the email list"
1017 | 							puts "Error: #{e.inspect}"
1018 | 							puts e.backtrace
1019 | 						end
1020 |
1021 | 						if wordlist
1022 | 							# Lowercase all parsed words
1023 | 							if lowercase then
1024 | 								words.downcase!
1025 | end 1026 | # Remove any symbols 1027 | if words_with_numbers then 1028 | words.gsub!(/[^[[:alnum:]]]/i, " ") 1029 | else 1030 | words.gsub!(/[^[[:alpha:]]]/i, " ") 1031 | end 1032 | 1033 | if convert_umlauts then 1034 | words.gsub!(/[äöüßÄÖÜ]/, "ä" => "ae", "ö" => "oe", "ü" => "ue", "ß" => "ss", "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue") 1035 | end 1036 | 1037 | # Add to the array 1038 | group_words = [] 1039 | words.split(" ").each do |word| 1040 | if word.length >= min_word_length and (max_word_length == -1 or word.length <= max_word_length) 1041 | word_hash[word] = 0 if !word_hash.has_key?(word) 1042 | word_hash[word] += 1 1043 | end 1044 | if (groups > 0) 1045 | group_words.push (word) 1046 | if (group_words.length() > groups) 1047 | group_words.shift() 1048 | end 1049 | if (group_words.length() == groups) 1050 | joined = group_words.join(" ") 1051 | group_word_hash[joined] = 0 if !group_word_hash.has_key?(joined) 1052 | group_word_hash[joined] += 1 1053 | end 1054 | end 1055 | end 1056 | end 1057 | #end 1058 | rescue => e 1059 | puts "\nThere was a problem handling word generation" 1060 | puts "Error: #{e.inspect}" 1061 | puts e.backtrace 1062 | end 1063 | end 1064 | end 1065 | s.store_next_urls_with url_stack 1066 | end 1067 | rescue Errno::ENOENT 1068 | puts "\nInvalid URL specified (#{url})\n\n" 1069 | exit 2 1070 | rescue => e 1071 | puts "\nCouldn't access the site (#{url})\n" 1072 | puts "Error: #{e.inspect}" 1073 | puts "Error: #{e.backtrace}" 1074 | exit 2 1075 | end 1076 | end 1077 | 1078 | puts "End of main loop" if debug 1079 | 1080 | if wordlist 1081 | if verbose 1082 | if outfile.nil? 1083 | puts "Words found\n" 1084 | else 1085 | puts "Writing words to file\n" 1086 | end 1087 | end 1088 | 1089 | sorted_wordlist = word_hash.sort_by do |word, count| 1090 | -count 1091 | end 1092 | 1093 | sorted_wordlist.each do |word, count| 1094 | if show_count 1095 | outfile_file.puts "#{word}, #{count.to_s}" 1096 | else 1097 | outfile_file.puts word 1098 | end 1099 | end 1100 | end 1101 | 1102 | if groups > 0 1103 | if verbose 1104 | if outfile.nil? 1105 | puts "Groups of words found\n" 1106 | else 1107 | puts "Writing groups of words to file\n" 1108 | end 1109 | end 1110 | 1111 | sorted_wordlist = group_word_hash.sort_by do |word, count| 1112 | -count 1113 | end 1114 | 1115 | sorted_wordlist.each do |word, count| 1116 | if show_count 1117 | outfile_file.puts "#{word}, #{count.to_s}" 1118 | else 1119 | outfile_file.puts word 1120 | end 1121 | end 1122 | end 1123 | 1124 | puts "End of wordlist loop" if debug 1125 | 1126 | if email 1127 | if email_arr.length == 0 1128 | puts "No email addresses found" if verbose 1129 | else 1130 | puts "Dumping email addresses to file" if verbose 1131 | 1132 | email_arr.delete_if { |x| x.chomp.empty? } 1133 | email_arr.uniq! 1134 | email_arr.sort! 1135 | 1136 | outfile_file.puts if (wordlist || verbose) && email_outfile.nil? 1137 | 1138 | if email_outfile.nil? 1139 | outfile_file.puts "Email addresses found" 1140 | outfile_file.puts "---------------------" 1141 | outfile_file.puts email_arr.join("\n") 1142 | else 1143 | email_outfile_file.puts email_arr.join("\n") 1144 | end 1145 | end 1146 | end 1147 | 1148 | puts "End of email loop" if debug 1149 | 1150 | if meta 1151 | if usernames.length == 0 1152 | puts "No meta data found" if verbose 1153 | else 1154 | puts "Dumping meta data to file" if verbose 1155 | usernames.delete_if { |x| x.chomp.empty? } 1156 | usernames.uniq! 1157 | usernames.sort! 
1158 |
1159 | 		outfile_file.puts if (email||wordlist) && meta_outfile.nil?
1160 | 		if meta_outfile.nil?
1161 | 			outfile_file.puts "Meta data found"
1162 | 			outfile_file.puts "---------------"
1163 | 			outfile_file.puts usernames.join("\n")
1164 | 		else
1165 | 			meta_outfile_file.puts usernames.join("\n")
1166 | 		end
1167 | 	end
1168 | end
1169 |
1170 | puts "End of meta loop" if debug
1171 |
1172 | meta_outfile_file.close if meta_outfile
1173 | email_outfile_file.close if email_outfile
1174 | outfile_file.close if outfile
1175 |
--------------------------------------------------------------------------------
/cewl_lib.rb:
--------------------------------------------------------------------------------
1 | # == CeWL Library: Library to outsource reusable features
2 | #
3 | # Author:: Robin Wood (robin@digi.ninja)
4 | # Copyright:: Copyright (c) Robin Wood 2016
5 | # Licence:: GPL
6 | #
7 |
8 | begin
9 | 	require 'mini_exiftool'
10 | 	require "zip"
11 | 	require "rexml/document"
12 | 	require 'mime'
13 | 	require 'mime-types'
14 | 	include REXML
15 | rescue LoadError => e
16 | 	# catch error and provide feedback on installing gem
17 | 	if e.to_s =~ /cannot load such file -- (.*)/
18 | 		missing_gem = $1
19 | 		puts "\nError: #{missing_gem} gem not installed\n"
20 | 		puts "\t use: \"gem install #{missing_gem}\" to install the required gem\n\n"
21 | 		exit
22 | 	else
23 | 		puts "There was an error loading the gems:"
24 | 		puts
25 | 		puts e.to_s
26 | 		exit
27 | 	end
28 | end
29 |
30 | # Override the MiniExiftool class so that I can modify the parse_line
31 | # method and force all encoding to ISO-8859-1. Without this the app bombs
32 | # on some machines as it is unable to parse UTF-8
33 | class MyMiniExiftool<MiniExiftool
34 | 	def parse_line(line)
35 | 		line.force_encoding('ISO-8859-1')
36 | 		super
37 | 	end
38 | end
39 |
40 | # Get the author/creator data out of PDFs by scanning the raw file
41 | # for the XMP tags which can hold it
42 | def get_pdf_data(pdf_file, verbose)
43 | 	meta_data=[]
44 | 	begin
45 | 		File.open(pdf_file, "rb").each_line { |line|
82 | 			if /<xap:creator>(.*)<\/xap:creator>/i.match(line)
83 | 				if verbose
84 | 					puts "Found pdf:creator: "+$1
85 | 				end
86 | 				meta_data<<$1.to_s.chomp unless $1.to_s==""
87 | 			end
88 | 			if /<xap:Author>(.*)<\/xap:Author>/i.match(line)
89 | 				if verbose
90 | 					puts "Found xap:Author: "+$1
91 | 				end
92 | 				meta_data<<$1.to_s.chomp unless $1.to_s==""
93 | 			end
94 | 			if /<pdf:Author>(.*)<\/pdf:Author>/i.match(line)
95 | 				if verbose
96 | 					puts "Found pdf:Author: "+$1
97 | 				end
98 | 				meta_data<<$1.to_s.chomp unless $1.to_s==""
99 | 			end
100 | 			if /<dc:creator>(.*)<\/dc:creator>/i.match(line)
101 | 				if verbose
102 | 					puts "Found dc:creator: "+$1
103 | 				end
104 | 				meta_data<<$1.to_s.chomp unless $1.to_s==""
105 | 			end
106 |
107 | 		}
108 | 		return meta_data
109 | 	rescue => e
110 | 		if verbose
111 | 			puts "There was an error processing the document - " + e.message
112 | 		end
113 | 	end
114 | 	return meta_data
115 | end
116 |
117 | # Get data from files using exiftool
118 | def get_doc_data(doc_file, verbose)
119 | 	data=[]
120 | 	begin
121 | 		interesting_fields=Array.[]("Author","LastSavedBy","Creator")
122 | 		file = MyMiniExiftool.new(doc_file)
123 |
124 | 		interesting_fields.each{ |field_name|
125 | 			if file.tags.include?(field_name)
126 | 				data<<file[field_name].to_s
127 | 			end
128 | 		}
129 | 	rescue => e
130 | 		if verbose
131 | 			puts "There was an error processing the document - " + e.message
132 | 		end
133 | 	end
134 | 	return data
135 | end
136 |
137 | # Get data from Office 2007 documents by unzipping relevant XML files then
138 | # checking for known fields
139 | def get_docx_data(docx_file, verbose)
140 | 	meta_data=[]
141 |
142 | 	interesting_fields=Array.[]("cp:coreProperties/dc:creator","cp:coreProperties/cp:lastModifiedBy")
143 | 	interesting_files=Array.[]("docProps/core.xml")
144 |
145 | 	begin
146 | 		Zip::File.open(docx_file) { |zipfile|
147 | 			interesting_files.each { |file|
148 | 				if zipfile.find_entry(file)
149 | 					xml=zipfile.read(file)
150 |
151 | 					doc=Document.new(xml)
152 | 					interesting_fields.each { |field|
153 | 						element=doc.elements[field]
154 | 						#puts element.get_text unless element==nil||element.get_text==nil
155 | 						meta_data<<element.get_text.to_s unless element==nil||element.get_text==nil
156 | 					}
157 | 				end
158 | 			}
159 | 		}
160 | 	rescue => e
161 | 		if verbose
162 | 			# not a zip file
163 | 			puts "File probably not a zip file - " + e.message
164 | 		end
165 | 	end
166 | 	return meta_data
167 | end
168 |
169 | # Take the file given, try to work out what type of file it is then pass it
170 | # to the relevant function to try to grab meta data
171 | def process_file(filename, verbose=false)
172 | 	meta_data=nil
173 |
174 | 	begin
175 | 		puts "processing file: " + filename
176 |
177 | 		if File.file?(filename) && File.exist?(filename)
178 | 			mime_types=MIME::Types.type_for(filename)
179 | 			if(mime_types.size==0)
180 | 				if(verbose)
181 | 					puts "Empty mime type"
182 | 				end
183 | 				return meta_data
184 | 			end
185 | 			if verbose
186 | 				puts "Checking "+filename
187 | 				puts " Mime type="+mime_types.join(", ")
188 | 				puts
189 | 			end
190 | 			if mime_types.include?("application/word") || mime_types.include?("application/excel") || mime_types.include?("application/powerpoint")
191 | 				if verbose
192 | 					puts " Mime type says original office document"
193 | 				end
194 | 				meta_data=get_doc_data(filename, verbose)
195 | 			else
196 | 				if mime_types.include?("application/pdf")
197 | 					if verbose
198 | 						puts " Mime type says PDF"
199 | 					end
200 | 					# Running both my own regexp and exiftool on pdfs as I've found exif misses some data
201 | 					meta_data=get_doc_data(filename, verbose)
202 | 					meta_data+=get_pdf_data(filename, verbose)
203 | 				else
204 | 					# list taken from http://en.wikipedia.org/wiki/Microsoft_Office_2007_file_extensions
205 | 					if filename =~ /(.(doc|dot|ppt|pot|xls|xlt|pps)[xm]$)|(.ppam$)|(.xlsb$)|(.xlam$)/
206 | 						if verbose
207 | 							puts " File extension says 2007 style office document"
208 | 						end
209 | 						meta_data=get_docx_data(filename, verbose)
210 | 					elsif filename =~ /.php$|.aspx$|.cfm$|.asp$|.html$|.htm$/
211 | 						if verbose
212 | 							puts " Language file, can ignore"
213 | 						end
214 | 					else
215 | 						if verbose
216 | 							puts " Unknown file type"
217 | 						end
218 | 					end
219 | 				end
220 | 			end
221 | 			if meta_data!=nil
222 | 				if verbose
223 | 					if meta_data.length > 0
224 | 						puts " Found "+meta_data.join(", ")+"\n"
225 | 					end
226 | 				end
227 | 			end
228 | 		end
229 | 	rescue => e
230 | 		puts "Problem in process_file function"
231 | 		puts "Error: " + e.message
232 | 		puts e.backtrace
233 | 	end
234 |
235 | 	return meta_data
236 | end
237 |
--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 |
3 | ## Version 5.5.2
4 |
5 | * Can now specify a range for the number of words to group together when grouping.
6 | * Specify multiple grouping characters.
7 |
8 | ## Version 5.5.1
9 | * Fixed accidental concatenation of words when stripping HTML tags.
10 |
11 | ## Version 5.5.0
12 |
13 | * Grouping words together.
14 |
15 | ## Version 5.4.9
16 |
17 | * Added Docker support.
18 |
19 | ## Version 5.4.8
20 |
21 | * Updated the parser so that it looks at the content on all pages which are returned, not just those with a 200 return code.
22 |
23 | ## Version 5.4.7
24 |
25 | * Added the `--allowed` parameter to limit crawling to URLs matching the passed RegEx. Work done by [5p1n](https://github.com/5p1n/).
26 |
27 | ## Version 5.4.6
28 |
29 | * Added the `--lowercase` parameter to convert all letters to lower case.
30 | * Added the `--convert-umlauts` parameter to convert Latin-1 umlauts (e.g. "ä" to "ae", "ö" to "oe", etc.).
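To illustrate what the umlaut conversion does, here is a rough sketch of the idea (not the exact code in cewl.rb, though the script uses the same kind of `gsub` mapping):

```
# Map common Latin-1 umlauts to their ASCII expansions
umlauts = { "ä" => "ae", "ö" => "oe", "ü" => "ue", "ß" => "ss",
            "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue" }

word = "Straße"
puts word.gsub(/[äöüßÄÖÜ]/, umlauts) # => "Strasse"
```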
31 |
32 | ## Version 5.4.3
33 |
34 | * Added the `--with-numbers` parameter to make words include letters and numbers.
35 |
36 | ## Version 5.4.2
37 |
38 | * Merged an update to change the way usage instructions are shown.
39 | * Updated instructions on installing gems.
40 | * Updated README.
41 |
42 | ## Version 5.4.1
43 |
44 | * A line to add a / to the end of the URL had been commented out. I don't remember why it was done but I'm putting it back in. See [issue 26](https://github.com/digininja/CeWL/issues/26).
45 |
46 | ## Version 5.4
47 |
48 | * Steven van der Baan added the ability to hit ctrl-c and keep the results so far.
49 |
50 | ## Version 5.3.1
51 |
52 | * Added the ability to handle non-standard port numbers.
53 | * Added lots more debugging and a new --debug parameter.
54 |
55 | ## Version 5.3
56 |
57 | * Added the command line argument --header (-H) to allow headers to be passed in.
58 | * Parameters are specified in name:value pairs and you can pass multiple.
59 |
60 | ## Version 5.2
61 |
62 | Loads of changes including:
63 |
64 | * Code refactoring by [@g0tmi1k](https://github.com/g0tmi1k)
65 | * Internationalisation - should now handle non-ASCII sites much better
66 | * Found more ways to pull words out of JavaScript content and other areas that aren't normal HTML
67 | * Lots of little bug fixes
68 |
69 | ## Version 5.1
70 |
71 | * Added the GPL-3+ licence to allow inclusion in Debian.
72 | * Added a Gemfile to make installing gems easier.
73 |
74 | ## Version 5.0
75 |
76 | * Adds proxy support from the command line and the ability to pass in credentials for both basic and digest authentication.
77 | * A few other smaller bug fixes as well.
78 |
79 | ## Version 4.3
80 |
81 | CeWL now sorts the words found by count and optionally (new --count argument) includes the word count in the output. I've left the words in the case they appear in on the pages, so "Product" is different to "product". I figure that if the list is being used for password generation then the case may be significant, so let the user strip it if they want to. There are also more improvements to the stability of the spider in this release.
82 |
83 | By default, CeWL sticks to just the site you have specified and will go to a depth of 2 links; this behaviour can be changed by passing arguments. Be careful if setting a large depth and allowing it to go offsite, you could end up drifting onto a lot of other domains. All words of three characters and over are output to stdout. This length can be increased and the words can be written to a file rather than the screen so the app can be automated.
84 |
85 | ## Version 4.2
86 |
87 | Fixes a pretty major bug that I found while fixing a smaller bug for @yorikv. The bug was related to a hack I had to put in place because of a problem I was having with the spider. While I was looking into it, I spotted this line, which is the one the spider uses to find new links in downloaded pages:
88 |
89 | ```
90 | web_page.scan(/href="(.*?)"/i).flatten.map do |link|
91 | ```
92 |
93 | This is fine if all the links look like this:
94 |
95 | ```
96 | <a href="link">link</a>
97 | ```
98 |
99 | But if the link looks like either of these:
100 |
101 | ```
102 | <a href='link'>link</a>
103 | <a href=link>link</a>
104 | ```
105 |
106 | The regex will fail so the links will be ignored.
107 |
108 | To fix this up I've had to override the function that parses the page to find all the links. Rather than use a regex, I've changed it to use Nokogiri, which is designed to parse a page looking for links rather than just running through it with a custom regex.
This brings in a new dependency but I think it is worth it for the fix to the functionality. I also found another bug where a link like this:
109 |
110 | ```
111 | <a href="#name">local</a>
112 | ```
113 |
114 | Which should be ignored, as it just links to an internal name, was actually being translated to '/#name', which may unintentionally mean referencing the index page. I've fixed this one as well after a lot of debugging to find how best to do it.
115 |
116 | A final addition is to allow a user to specify a depth of 0 which allows CeWL to spider a single page.
117 |
118 | I'm only putting this out as a point release as I'd like to rewrite the spidering to use a better spider; that will come out as the next major release.
119 |
120 | ## Version 4.0/4.1
121 |
122 | The main change in version 4.0/4.1 is the upgrade to run with Ruby 1.9.x. This has been tested on various machines and on BT5, as that is a popular platform for running it, and it appears to run fine. Another minor change is that up to version 4 all HTML tags were stripped out before the page was parsed for words, which meant that text in alt and title tags was missed. I now grab the text from those tags before stripping the HTML to give those extra few words.
123 |
124 | ## Version 3
125 |
126 | Addresses a problem spotted by Josh Wright. The Spider gem doesn't handle JavaScript redirection URLs, for example an index page containing just the following:
127 |
128 | ```
129 | <script language="JavaScript">
130 | self.location.href = 'index2.html';
131 | </script>
132 |
133 | ```
134 |
135 | Wasn't spidered because the redirect wasn't picked up. I now scan through a page looking for any lines containing location.href= and then add the given URL to the list of pages to spider.
136 |
137 | ## Version 2
138 |
139 | Version 2 of CeWL can also create two new lists, a list of email addresses
140 | found in mailto links and a list of author/creator names collected from meta
141 | data found in documents on the site. It can currently process documents in
142 | Office pre 2007, Office 2007 and PDF formats. This user data can then be used
143 | to create the list of usernames to be used in association with the password
144 | list.
145 |
146 |
--------------------------------------------------------------------------------
/compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   cewl:
3 |     build: .
4 |     image: ghcr.io/digininja/cewl:latest
5 |
--------------------------------------------------------------------------------
/fab.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | # == FAB: Files Already Bagged
4 | #
5 | # This script can be run against files already
6 | # downloaded from a target site to generate a list
7 | # of usernames and email addresses based on meta
8 | # data contained within them.
9 | #
10 | # To see a list of file types which can be processed
11 | # see cewl_lib.rb
12 | #
13 | # == Usage
14 | #
15 | # fab [OPTION] ... filename/list
16 | #
17 | # -h, --help:
18 | #	show help
19 | #
20 | # -v
21 | #	verbose
22 | #
23 | # filename/list: the file or list of files to check
24 | #
25 | # Author:: Robin Wood (robin@digi.ninja)
26 | # Copyright:: Copyright (c) Robin Wood 2021
27 | # Licence:: GPL
28 | #
29 |
30 | require 'getoptlong'
31 | require_relative "./cewl_lib.rb"
32 |
33 | opts = GetoptLong.new(
34 | 	[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
35 | 	[ "-v" , GetoptLong::NO_ARGUMENT ]
36 | )
37 |
38 | def usage
39 | 	puts"xx
40 |
41 | Usage: xx [OPTION] ...
filename/list 42 | -h, --help: show help 43 | -v: verbose 44 | 45 | filename/list: the file or list of files to check 46 | 47 | " 48 | exit 49 | end 50 | 51 | verbose=false 52 | 53 | begin 54 | opts.each do |opt, arg| 55 | case opt 56 | when '--help' 57 | usage 58 | when '-v' 59 | verbose=true 60 | end 61 | end 62 | rescue 63 | usage 64 | end 65 | 66 | if ARGV.length < 1 67 | puts "Missing filename/list (try --help)" 68 | exit 0 69 | end 70 | 71 | meta_data=[] 72 | 73 | ARGV.each { |param| 74 | data=process_file(param, verbose) 75 | if(data!=nil) 76 | meta_data+=data 77 | end 78 | } 79 | 80 | meta_data.delete_if { |x| x.chomp==""} 81 | meta_data.uniq! 82 | meta_data.sort! 83 | if meta_data.length==0 84 | puts "No data found\n" 85 | else 86 | puts meta_data.join("\n") 87 | end 88 | --------------------------------------------------------------------------------
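As a quick illustration of the FAB workflow described in the script's header comment (the directory and file names here are made-up examples, not from the repo):

```
ruby ./fab.rb -v downloads/report.pdf downloads/minutes.docx
```

Any author/creator names found in the files' meta data are printed one per line, or "No data found" if nothing turns up.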