├── .github └── workflows │ └── release.yml ├── .gitignore ├── README.md ├── go.mod ├── go.sum └── main.go /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Go Binary 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | permissions: 8 | contents: write 9 | packages: write 10 | 11 | jobs: 12 | releases-matrix: 13 | name: Release Go Binary 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | goos: [linux, windows, darwin] 18 | goarch: ["386", amd64, arm64] 19 | exclude: 20 | - goarch: "386" 21 | goos: darwin 22 | - goarch: arm64 23 | goos: windows 24 | steps: 25 | - uses: actions/checkout@v4 26 | - uses: wangyoucao577/go-release-action@v1 27 | with: 28 | github_token: ${{ secrets.GITHUB_TOKEN }} 29 | goos: ${{ matrix.goos }} 30 | goarch: ${{ matrix.goarch }} 31 | goversion: "https://go.dev/dl/go1.23.2.linux-amd64.tar.gz" 32 | binary_name: "awsdocs" 33 | extra_files: README.md 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | awsdocs 2 | docs.aws.amazon.com/ 3 | out.log 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWSDocs Archive 2 | 3 | This tool allows you to be able to retrieve all documentation for AWS providing you with a local copy you can archive, search, and diff for security research. I used this repository to create a Bedrock Knowledge Base for querying with AI. 4 | 5 | - Retrieves all sitemap.xml files 6 | - Recursively retrieves all links within them 7 | - Ignores all URLs included in the sitemaps that do not include `docs.aws.amazon.com` 8 | - Ignores all non https links 9 | - Avoids most AWS SDK documentation 10 | - Supports both outputting as warc or html file formats 11 | - Saves all files by `aws_warcs/` or `aws_html/` and `YYYY/MM/DD/docs.aws.amazon.com/ec2/index.warc` 12 | 13 | ## Usage 14 | 15 | The following command allows you to be able to retrieve all the documentation in `aws_warcs/YYYY/MM/DD`. 16 | 17 | ```bash 18 | awsdocs --rate-limit --workers 15 -logfile=awsdocs.log 19 | ``` 20 | 21 | ## Searching 22 | 23 | One thing I discovered as part of this project was [ripgrep](https://github.com/BurntSushi/ripgrep) which helped massively reduce the time to search through all the files recursively as quickly as possible. Grep took `36.78s` and ripgrep spent `0.67s` for the exact same search. So I strongly advise getting familiar with ripgrep to help speed up your search. 24 | 25 | ## Retrieve URLs From Query 26 | 27 | To search for a specific string and retrieve all AWS Documentation urls containing that string you can use a combination of ripgrep and xargs to do so. 28 | 29 | ```bash 30 | $ cd 2024/09/26/docs.aws.amazon.com 31 | $ rg "s3://amzn-s3-demo-bucket-" . -l | xargs -I {} rg "Warc-Target-Uri" {} | awk '{print $2}' | sort | uniq 32 | https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Synthetics_Canaries_WritingCanary_Nodejs.html 33 | https://docs.aws.amazon.com/athena/latest/ug/tables-location-format.html 34 | https://docs.aws.amazon.com/bedrock/latest/userguide/batch-inference-example.html 35 | ``` 36 | 37 | ## Simple Search 38 | 39 | ```bash 40 | $ rg "s3://amzn-s3-demo-bucket-" . 41 | ./athena/latest/ug/tables-location-format.warc 42 | 101: Use:

s3://amzn-s3-demo-bucket/folder/
s3://amzn-s3-demo-bucket-metadata-s3alias/folder/

Do not use any of the following items for specifying the LOCATION for your 43 | 44 | ./bedrock/latest/userguide/batch-inference-example.warc 45 | 95: "s3Uri": "s3://amzn-s3-demo-bucket-input/abc.jsonl" 46 | 101: "s3Uri": "s3://amzn-s3-demo-bucket-output/" 47 | 48 | ./AmazonCloudWatch/latest/monitoring/CloudWatch_Synthetics_Canaries_WritingCanary_Nodejs.warc 49 | 337: "ArtifactS3Location":"s3://amzn-s3-demo-bucket-123456789012-us-west-2", 50 | ``` 51 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module awsdocs 2 | 3 | go 1.22.5 4 | 5 | require ( 6 | github.com/google/uuid v1.6.0 7 | github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 2 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 3 | github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690 h1:2RLSydlHktw3Fo4nwOQwjexn1d49KJb/i+EmlT4D878= 4 | github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690/go.mod h1:LuhAhBK7l5/QEJmiz3tVGLi8n0IwqAwLX/ndr+6XSDE= 5 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/xml" 5 | "flag" 6 | "fmt" 7 | "io" 8 | "io/ioutil" 9 | "log" 10 | "math/rand" 11 | "net" 12 | "net/http" 13 | "net/url" 14 | "os" 15 | "path/filepath" 16 | "regexp" 17 | "strings" 18 | "sync" 19 | "time" 20 | 21 | "github.com/slyrz/warc" 22 | 23 | "github.com/google/uuid" 24 | ) 25 | 26 | const ( 27 | sitemapURL = "https://docs.aws.amazon.com/sitemap_index.xml" 28 | rateLimitDelay = 2 * time.Second // Delay between each request to prevent rate limiting 29 | maxBackoffAttempts = 5 // Maximum number of backoff attempts before giving up 30 | sleepDuration = 3 * time.Second // Time to sleep on rate limit detection or failure 31 | ) 32 | 33 | var ( 34 | rateLimitEnabled bool // Global flag for rate limiting 35 | userAgents = []string{ 36 | "awsdocs/v0.1 (+https://github.com/SecurityRunners/aws-docs)", 37 | } 38 | ) 39 | 40 | // List of SDKs to exclude 41 | var sdkExclusions = []string{ 42 | "AWSJavaSDK", 43 | "AWSJavaScriptSDK", 44 | "CDI-SDK", 45 | "aws-sdk-php", 46 | "chime-sdk", 47 | "database-encryption-sdk", 48 | "embedded-csdk", 49 | "encryption-sdk", 50 | "pythonsdk", 51 | "sdk-for-android", 52 | "sdk-for-cpp", 53 | "sdk-for-go", 54 | "sdk-for-ios", 55 | "sdk-for-java", 56 | "sdk-for-javascript", 57 | "sdk-for-kotlin", 58 | "sdk-for-net", 59 | "sdk-for-php", 60 | "sdk-for-php1", 61 | "sdk-for-ruby", 62 | "sdk-for-rust", 63 | "sdk-for-sapabap", 64 | "sdk-for-swift", 65 | "sdk-for-unity", 66 | "sdkfornet", 67 | "sdkfornet1", 68 | "sdkref", 69 | "xray-sdk-for-java", 70 | } 71 | 72 | var excludeRegex *regexp.Regexp 73 | 74 | func init() { 75 | // Prepare the list of SDKs for regex 76 | escapedSDKs := make([]string, len(sdkExclusions)) 77 | for i, sdk := range sdkExclusions { 78 | escapedSDKs[i] = regexp.QuoteMeta(sdk) 79 | } 80 | 81 | // Build the regex pattern 82 | pattern := fmt.Sprintf(`https://docs\.aws\.amazon\.com/(?:[a-z]{2}_[a-z]{2}|cdk|%s)/`, strings.Join(escapedSDKs, "|")) 83 | 84 | excludeRegex = regexp.MustCompile(pattern) 85 | } 86 | 87 | // SitemapIndex represents the structure of the sitemap index XML. 88 | type SitemapIndex struct { 89 | XMLName xml.Name `xml:"sitemapindex"` 90 | Sitemaps []SitemapLoc `xml:"sitemap"` 91 | } 92 | 93 | // SitemapLoc represents the location of each sitemap in a sitemap index. 94 | type SitemapLoc struct { 95 | Loc string `xml:"loc"` 96 | } 97 | 98 | // URLSet represents the structure of a URL set XML (the list of URLs in a sitemap). 99 | type URLSet struct { 100 | XMLName xml.Name `xml:"urlset"` 101 | URLs []URLLoc `xml:"url"` 102 | } 103 | 104 | // URLLoc represents each URL in a URL set. 105 | type URLLoc struct { 106 | Loc string `xml:"loc"` 107 | } 108 | 109 | func main() { 110 | // Command-line flags 111 | test := flag.Int("test", 0, "Specify the number of documents to download for testing") 112 | logFile := flag.String("logfile", "", "Specify a file to write debug logs to") 113 | maxWorkers := flag.Int("workers", 10, "Number of concurrent workers to download files") 114 | flag.BoolVar(&rateLimitEnabled, "rate-limit", false, "Enable rate limiting to avoid 403 errors") 115 | exportType := flag.String("export-type", "html", "Specify export type: warc or html") 116 | flag.Parse() 117 | 118 | // Set up logging 119 | if *logFile != "" { 120 | logFileHandle, err := os.OpenFile(*logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 121 | if err != nil { 122 | log.Fatalf("Failed to open log file: %v", err) 123 | } 124 | defer logFileHandle.Close() 125 | log.SetOutput(logFileHandle) 126 | log.Println("Debug mode enabled - logs written to file.") 127 | } else { 128 | log.SetOutput(os.Stdout) 129 | } 130 | 131 | log.Println("Starting AWS documentation scraping") 132 | 133 | urlChannel := make(chan string) 134 | var wg sync.WaitGroup 135 | 136 | // Start workers to download files concurrently 137 | for i := 0; i < *maxWorkers; i++ { 138 | wg.Add(1) 139 | go func() { 140 | defer wg.Done() 141 | for url := range urlChannel { 142 | if *exportType == "warc" { 143 | downloadAndSaveAsWARC(url) 144 | } else if *exportType == "html" { 145 | downloadAndSaveAsHTML(url) 146 | } else { 147 | log.Printf("Invalid export type: %s", *exportType) 148 | } 149 | // If rate limiting is enabled, sleep between requests 150 | if rateLimitEnabled { 151 | time.Sleep(rateLimitDelay) // Delay between requests if rate limiting is enabled 152 | } 153 | } 154 | }() 155 | } 156 | 157 | // Fetch and parse the sitemap concurrently 158 | go func() { 159 | defer close(urlChannel) // Close the channel when done 160 | err := fetchAndParseSitemap(sitemapURL, *test, urlChannel) 161 | if err != nil { 162 | log.Fatalf("Error fetching sitemap: %v", err) 163 | } 164 | }() 165 | 166 | // Wait for all downloads to finish 167 | wg.Wait() 168 | log.Println("Scraping finished") 169 | } 170 | 171 | // fetchAndParseSitemap fetches and parses a sitemap, handling both sitemap indexes and URL sets. 172 | func fetchAndParseSitemap(sitemapURL string, maxDocs int, urlChannel chan<- string) error { 173 | // Replace http with https 174 | sitemapURL = strings.Replace(sitemapURL, "http://", "https://", 1) 175 | 176 | parsedURL, err := url.Parse(sitemapURL) 177 | if err != nil { 178 | log.Printf("Error parsing sitemap URL %s: %v", sitemapURL, err) 179 | return err 180 | } 181 | 182 | // Ensure the sitemap URL is under docs.aws.amazon.com 183 | if parsedURL.Host != "docs.aws.amazon.com" { 184 | log.Printf("Skipping sitemap from other domain: %s", sitemapURL) 185 | return nil 186 | } 187 | 188 | // Exclude if matches excludeRegex 189 | if excludeRegex.MatchString(sitemapURL) { 190 | log.Printf("Skipping excluded sitemap: %s", sitemapURL) 191 | return nil 192 | } 193 | 194 | log.Printf("Fetching sitemap: %s", sitemapURL) 195 | resp, err := fetchWithRateLimitHandling(sitemapURL) 196 | if err != nil { 197 | log.Printf("Error fetching sitemap %s: %v", sitemapURL, err) 198 | return err 199 | } 200 | defer resp.Body.Close() 201 | 202 | body, err := ioutil.ReadAll(resp.Body) 203 | if err != nil { 204 | return err 205 | } 206 | 207 | // First try parsing the body as a SitemapIndex 208 | var sitemapIndex SitemapIndex 209 | err = xml.Unmarshal(body, &sitemapIndex) 210 | if err == nil && len(sitemapIndex.Sitemaps) > 0 { 211 | log.Printf("Parsed sitemap as a SitemapIndex: %s", sitemapURL) 212 | // Recursively fetch child sitemaps 213 | for _, sitemap := range sitemapIndex.Sitemaps { 214 | err := fetchAndParseSitemap(sitemap.Loc, maxDocs, urlChannel) 215 | if err != nil { 216 | log.Printf("Error fetching child sitemap: %v", err) 217 | } 218 | } 219 | return nil 220 | } 221 | 222 | // If it's not a SitemapIndex, try parsing it as a URLSet 223 | var urlSet URLSet 224 | err = xml.Unmarshal(body, &urlSet) 225 | if err == nil && len(urlSet.URLs) > 0 { 226 | log.Printf("Parsed sitemap as a URLSet: %s", sitemapURL) 227 | for i, urlEntry := range urlSet.URLs { 228 | // Replace http with https 229 | urlEntry.Loc = strings.Replace(urlEntry.Loc, "http://", "https://", 1) 230 | 231 | parsedURL, err := url.Parse(urlEntry.Loc) 232 | if err != nil { 233 | log.Printf("Error parsing URL %s: %v", urlEntry.Loc, err) 234 | continue 235 | } 236 | 237 | // Ensure the URL is under docs.aws.amazon.com 238 | if parsedURL.Host != "docs.aws.amazon.com" { 239 | log.Printf("Skipping URL from other domain: %s", urlEntry.Loc) 240 | continue 241 | } 242 | 243 | // Check if the URL matches the exclusion pattern 244 | if excludeRegex.MatchString(urlEntry.Loc) { 245 | log.Printf("Skipping excluded URL: %s", urlEntry.Loc) 246 | continue 247 | } 248 | 249 | urlChannel <- urlEntry.Loc 250 | log.Printf("Queued URL for download: %s", urlEntry.Loc) 251 | 252 | if maxDocs > 0 && i+1 >= maxDocs { 253 | break 254 | } 255 | } 256 | return nil 257 | } 258 | 259 | // If parsing fails, log an error 260 | log.Printf("Error parsing sitemap: unable to determine type for URL %s\n", sitemapURL) 261 | return fmt.Errorf("unable to parse sitemap at %s", sitemapURL) 262 | } 263 | 264 | // fetchWithRateLimitHandling fetches the document from the given URL and handles 403 rate limiting or connection errors. 265 | func fetchWithRateLimitHandling(url string) (*http.Response, error) { 266 | maxRetries := 5 267 | for retries := 0; retries < maxRetries; retries++ { 268 | // Randomly select a user agent from the list 269 | userAgent := userAgents[rand.Intn(len(userAgents))] 270 | 271 | // Create a new HTTP request 272 | req, err := http.NewRequest("GET", url, nil) 273 | if err != nil { 274 | return nil, err 275 | } 276 | 277 | // Set the user agent header 278 | req.Header.Set("User-Agent", userAgent) 279 | 280 | // Send the request 281 | client := &http.Client{} 282 | resp, err := client.Do(req) 283 | if err != nil { 284 | // Check if it's a temporary network error 285 | if nerr, ok := err.(net.Error); ok && nerr.Temporary() { 286 | log.Printf("Temporary error fetching URL: %v, retrying in %s", err, sleepDuration) 287 | time.Sleep(sleepDuration) 288 | continue 289 | } else { 290 | // Non-recoverable error, log and exit 291 | log.Printf("Error fetching URL: %v", err) 292 | return nil, err 293 | } 294 | } 295 | 296 | // If successful response, return it 297 | if resp.StatusCode == http.StatusOK { 298 | return resp, nil 299 | } 300 | 301 | // Handle rate limiting (403 Forbidden) 302 | if resp.StatusCode == http.StatusForbidden { 303 | log.Printf("Received 403 Forbidden (rate limit), pausing for %s before retrying...", sleepDuration) 304 | time.Sleep(sleepDuration) 305 | resp.Body.Close() 306 | continue 307 | } 308 | 309 | // Handle other unexpected status codes 310 | log.Printf("Unexpected status code: %d for URL %s", resp.StatusCode, url) 311 | resp.Body.Close() 312 | return nil, fmt.Errorf("unexpected status code: %d for URL %s", resp.StatusCode, url) 313 | } 314 | return nil, fmt.Errorf("max retries exceeded for URL %s", url) 315 | } 316 | 317 | func downloadAndSaveAsWARC(url string) { 318 | // Get the current date 319 | now := time.Now() 320 | datePath := filepath.Join( 321 | // Adding year, month, and day to the directory path 322 | "aws_warcs", 323 | now.Format("2006"), // Year 324 | now.Format("01"), // Month 325 | now.Format("02"), // Day 326 | ) 327 | 328 | // Remove the protocol part (https://) and construct the URL-based directory structure 329 | trimmedURL := strings.TrimPrefix(url, "https://") 330 | dirPath := filepath.Join(datePath, filepath.Dir(trimmedURL)) 331 | 332 | // Create the directory structure based on the date and URL 333 | err := os.MkdirAll(dirPath, 0755) 334 | if err != nil { 335 | log.Printf("Error creating directory: %v", err) 336 | return 337 | } 338 | 339 | // Use the last part of the URL as the file name (with .html removed) 340 | fileName := filepath.Base(strings.TrimSuffix(trimmedURL, ".html")) 341 | warcFilePath := filepath.Join(dirPath, fileName+".warc") 342 | 343 | // Create the WARC file 344 | warcFile, err := os.Create(warcFilePath) 345 | if err != nil { 346 | log.Printf("Error creating WARC file: %v", err) 347 | return 348 | } 349 | defer warcFile.Close() 350 | 351 | // Initialize WARC writer 352 | warcWriter := warc.NewWriter(warcFile) 353 | 354 | // Fetch document 355 | log.Printf("Downloading document: %s\n", url) 356 | resp, err := fetchWithRateLimitHandling(url) 357 | if err != nil { 358 | log.Printf("Error downloading document: %v", err) 359 | return 360 | } 361 | defer resp.Body.Close() 362 | 363 | // Create WARC request record 364 | reqRecord := warc.NewRecord() 365 | reqRecord.Header.Set("WARC-Type", "request") 366 | reqRecord.Header.Set("Content-Type", "application/http;msgtype=request") 367 | reqRecord.Header.Set("WARC-Target-URI", url) 368 | reqRecord.Header.Set("WARC-Date", time.Now().UTC().Format(time.RFC3339)) 369 | reqRecord.Header.Set("WARC-Record-ID", "") 370 | reqRecord.Content = strings.NewReader("") 371 | 372 | // Write request to WARC 373 | _, err = warcWriter.WriteRecord(reqRecord) 374 | if err != nil { 375 | log.Printf("Error writing WARC request record: %v", err) 376 | return 377 | } 378 | 379 | // Create WARC response record 380 | respBody := new(strings.Builder) 381 | _, err = io.Copy(respBody, resp.Body) // Safely read the response body 382 | if err != nil { 383 | log.Printf("Error reading response body: %v", err) 384 | return 385 | } 386 | 387 | respRecord := warc.NewRecord() 388 | respRecord.Header.Set("WARC-Type", "response") 389 | respRecord.Header.Set("Content-Type", "application/http;msgtype=response") 390 | respRecord.Header.Set("WARC-Target-URI", url) 391 | respRecord.Header.Set("WARC-Date", time.Now().UTC().Format(time.RFC3339)) 392 | respRecord.Header.Set("WARC-Record-ID", "") 393 | respRecord.Content = strings.NewReader(respBody.String()) // Use the body content 394 | 395 | // Write response to WARC 396 | _, err = warcWriter.WriteRecord(respRecord) 397 | if err != nil { 398 | log.Printf("Error writing WARC response record: %v", err) 399 | return 400 | } 401 | 402 | log.Printf("Successfully saved WARC file: %s\n", warcFilePath) 403 | } 404 | 405 | func downloadAndSaveAsHTML(url string) { 406 | // Get the current date 407 | now := time.Now() 408 | datePath := filepath.Join( 409 | // Adding year, month, and day to the directory path 410 | "aws_html", 411 | now.Format("2006"), // Year 412 | now.Format("01"), // Month 413 | now.Format("02"), // Day 414 | ) 415 | 416 | // Remove the protocol part (https://) and construct the URL-based directory structure 417 | trimmedURL := strings.TrimPrefix(url, "https://") 418 | 419 | // Determine the directory path and file name 420 | var dirPath, htmlFilePath string 421 | if strings.HasSuffix(trimmedURL, "/") { 422 | dirPath = filepath.Join(datePath, trimmedURL) 423 | htmlFilePath = filepath.Join(dirPath, "index.html") 424 | } else { 425 | dirPath = filepath.Join(datePath, filepath.Dir(trimmedURL)) 426 | fileName := filepath.Base(trimmedURL) 427 | htmlFilePath = filepath.Join(dirPath, fileName) 428 | } 429 | 430 | // Create the directory structure 431 | err := os.MkdirAll(dirPath, 0755) 432 | if err != nil { 433 | log.Printf("Error creating directory: %v", err) 434 | return 435 | } 436 | 437 | // Fetch document 438 | log.Printf("Downloading document: %s\n", url) 439 | resp, err := fetchWithRateLimitHandling(url) 440 | if err != nil { 441 | log.Printf("Error downloading document: %v", err) 442 | return 443 | } 444 | defer resp.Body.Close() 445 | 446 | // Read the response body 447 | bodyBytes, err := io.ReadAll(resp.Body) 448 | if err != nil { 449 | log.Printf("Error reading response body: %v", err) 450 | return 451 | } 452 | 453 | // Write to file 454 | err = os.WriteFile(htmlFilePath, bodyBytes, 0644) 455 | if err != nil { 456 | log.Printf("Error writing HTML file: %v", err) 457 | return 458 | } 459 | 460 | log.Printf("Successfully saved HTML file: %s\n", htmlFilePath) 461 | } 462 | --------------------------------------------------------------------------------