├── .github └── workflows │ └── release.yml ├── .gitignore ├── README.md ├── go.mod ├── go.sum └── main.go /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Go Binary 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | permissions: 8 | contents: write 9 | packages: write 10 | 11 | jobs: 12 | releases-matrix: 13 | name: Release Go Binary 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | goos: [linux, windows, darwin] 18 | goarch: ["386", amd64, arm64] 19 | exclude: 20 | - goarch: "386" 21 | goos: darwin 22 | - goarch: arm64 23 | goos: windows 24 | steps: 25 | - uses: actions/checkout@v4 26 | - uses: wangyoucao577/go-release-action@v1 27 | with: 28 | github_token: ${{ secrets.GITHUB_TOKEN }} 29 | goos: ${{ matrix.goos }} 30 | goarch: ${{ matrix.goarch }} 31 | goversion: "https://go.dev/dl/go1.23.2.linux-amd64.tar.gz" 32 | binary_name: "awsdocs" 33 | extra_files: README.md 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | awsdocs 2 | docs.aws.amazon.com/ 3 | out.log 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWSDocs Archive 2 | 3 | This tool allows you to be able to retrieve all documentation for AWS providing you with a local copy you can archive, search, and diff for security research. I used this repository to create a Bedrock Knowledge Base for querying with AI. 
4 | 5 | - Retrieves all sitemap.xml files 6 | - Recursively retrieves all links within them 7 | - Ignores all URLs included in the sitemaps that do not include `docs.aws.amazon.com` 8 | - Ignores all non https links 9 | - Avoids most AWS SDK documentation 10 | - Supports both outputting as warc or html file formats 11 | - Saves all files by `aws_warcs/` or `aws_html/` and `YYYY/MM/DD/docs.aws.amazon.com/ec2/index.warc` 12 | 13 | ## Usage 14 | 15 | The following command allows you to be able to retrieve all the documentation in `aws_warcs/YYYY/MM/DD`. 16 | 17 | ```bash 18 | awsdocs --rate-limit --workers 15 -logfile=awsdocs.log 19 | ``` 20 | 21 | ## Searching 22 | 23 | One thing I discovered as part of this project was [ripgrep](https://github.com/BurntSushi/ripgrep) which helped massively reduce the time to search through all the files recursively as quickly as possible. Grep took `36.78s` and ripgrep spent `0.67s` for the exact same search. So I strongly advise getting familiar with ripgrep to help speed up your search. 24 | 25 | ## Retrieve URLs From Query 26 | 27 | To search for a specific string and retrieve all AWS Documentation urls containing that string you can use a combination of ripgrep and xargs to do so. 28 | 29 | ```bash 30 | $ cd 2024/09/26/docs.aws.amazon.com 31 | $ rg "s3://amzn-s3-demo-bucket-" . -l | xargs -I {} rg "Warc-Target-Uri" {} | awk '{print $2}' | sort | uniq 32 | https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Synthetics_Canaries_WritingCanary_Nodejs.html 33 | https://docs.aws.amazon.com/athena/latest/ug/tables-location-format.html 34 | https://docs.aws.amazon.com/bedrock/latest/userguide/batch-inference-example.html 35 | ``` 36 | 37 | ## Simple Search 38 | 39 | ```bash 40 | $ rg "s3://amzn-s3-demo-bucket-" . 41 | ./athena/latest/ug/tables-location-format.warc 42 | 101: Use:
s3://amzn-s3-demo-bucket/folder
/
s3://amzn-s3-demo-bucket-metadata
-s3alias/folder
/
Do not use any of the following items for specifying the LOCATION
for your
43 |
44 | ./bedrock/latest/userguide/batch-inference-example.warc
45 | 95: "s3Uri": "s3://amzn-s3-demo-bucket-input/abc.jsonl"
46 | 101: "s3Uri": "s3://amzn-s3-demo-bucket-output/"
47 |
48 | ./AmazonCloudWatch/latest/monitoring/CloudWatch_Synthetics_Canaries_WritingCanary_Nodejs.warc
49 | 337: "ArtifactS3Location":"s3://amzn-s3-demo-bucket-123456789012-us-west-2",
50 | ```
51 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module awsdocs
2 |
3 | go 1.22.5
4 |
5 | require (
6 | github.com/google/uuid v1.6.0
7 | github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690
8 | )
9 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
2 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
3 | github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690 h1:2RLSydlHktw3Fo4nwOQwjexn1d49KJb/i+EmlT4D878=
4 | github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690/go.mod h1:LuhAhBK7l5/QEJmiz3tVGLi8n0IwqAwLX/ndr+6XSDE=
5 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/xml"
5 | "flag"
6 | "fmt"
7 | "io"
8 | "io/ioutil"
9 | "log"
10 | "math/rand"
11 | "net"
12 | "net/http"
13 | "net/url"
14 | "os"
15 | "path/filepath"
16 | "regexp"
17 | "strings"
18 | "sync"
19 | "time"
20 |
21 | "github.com/slyrz/warc"
22 |
23 | "github.com/google/uuid"
24 | )
25 |
// Scraper tuning knobs. sitemapURL is the root sitemap index covering all
// AWS documentation; the durations govern pacing between requests and the
// pause taken when rate limiting is detected.
const (
	sitemapURL     = "https://docs.aws.amazon.com/sitemap_index.xml"
	rateLimitDelay = 2 * time.Second // Delay between each request to prevent rate limiting
	// NOTE(review): maxBackoffAttempts is not referenced by the visible code —
	// fetchWithRateLimitHandling uses a local maxRetries instead; consider
	// consolidating the two.
	maxBackoffAttempts = 5               // Maximum number of backoff attempts before giving up
	sleepDuration      = 3 * time.Second // Time to sleep on rate limit detection or failure
)
32 |
var (
	// rateLimitEnabled is set from the -rate-limit CLI flag; when true the
	// download workers sleep rateLimitDelay between requests.
	rateLimitEnabled bool // Global flag for rate limiting
	// userAgents is the pool of User-Agent strings; one is picked at random
	// per request in fetchWithRateLimitHandling.
	userAgents = []string{
		"awsdocs/v0.1 (+https://github.com/SecurityRunners/aws-docs)",
	}
)
39 |
// List of SDKs to exclude. Each entry is the first path segment under
// docs.aws.amazon.com/ for one SDK guide; init() regex-escapes these and
// folds them into excludeRegex so SDK pages are never queued for download.
var sdkExclusions = []string{
	"AWSJavaSDK",
	"AWSJavaScriptSDK",
	"CDI-SDK",
	"aws-sdk-php",
	"chime-sdk",
	"database-encryption-sdk",
	"embedded-csdk",
	"encryption-sdk",
	"pythonsdk",
	"sdk-for-android",
	"sdk-for-cpp",
	"sdk-for-go",
	"sdk-for-ios",
	"sdk-for-java",
	"sdk-for-javascript",
	"sdk-for-kotlin",
	"sdk-for-net",
	"sdk-for-php",
	"sdk-for-php1",
	"sdk-for-ruby",
	"sdk-for-rust",
	"sdk-for-sapabap",
	"sdk-for-swift",
	"sdk-for-unity",
	"sdkfornet",
	"sdkfornet1",
	"sdkref",
	"xray-sdk-for-java",
}

// excludeRegex matches documentation URLs that should be skipped (localized
// locale paths, the CDK guide, and the SDK guides above). Compiled once in
// init().
var excludeRegex *regexp.Regexp
73 |
74 | func init() {
75 | // Prepare the list of SDKs for regex
76 | escapedSDKs := make([]string, len(sdkExclusions))
77 | for i, sdk := range sdkExclusions {
78 | escapedSDKs[i] = regexp.QuoteMeta(sdk)
79 | }
80 |
81 | // Build the regex pattern
82 | pattern := fmt.Sprintf(`https://docs\.aws\.amazon\.com/(?:[a-z]{2}_[a-z]{2}|cdk|%s)/`, strings.Join(escapedSDKs, "|"))
83 |
84 | excludeRegex = regexp.MustCompile(pattern)
85 | }
86 |
// SitemapIndex represents the structure of the sitemap index XML: the root
// <sitemapindex> element whose children point at further sitemaps rather
// than at documentation pages directly.
type SitemapIndex struct {
	XMLName  xml.Name     `xml:"sitemapindex"`
	Sitemaps []SitemapLoc `xml:"sitemap"`
}

// SitemapLoc represents the location of each sitemap in a sitemap index.
type SitemapLoc struct {
	Loc string `xml:"loc"` // Absolute URL of the child sitemap.
}

// URLSet represents the structure of a URL set XML (the list of URLs in a
// sitemap); this is the leaf form that lists actual documentation pages.
type URLSet struct {
	XMLName xml.Name `xml:"urlset"`
	URLs    []URLLoc `xml:"url"`
}

// URLLoc represents each URL in a URL set.
type URLLoc struct {
	Loc string `xml:"loc"` // Absolute URL of a documentation page.
}
108 |
// main wires together the scraper: it parses CLI flags, configures logging,
// starts a pool of download workers, and feeds them URLs discovered by
// recursively walking the AWS sitemap index.
func main() {
	// Command-line flags
	test := flag.Int("test", 0, "Specify the number of documents to download for testing")
	logFile := flag.String("logfile", "", "Specify a file to write debug logs to")
	maxWorkers := flag.Int("workers", 10, "Number of concurrent workers to download files")
	flag.BoolVar(&rateLimitEnabled, "rate-limit", false, "Enable rate limiting to avoid 403 errors")
	exportType := flag.String("export-type", "html", "Specify export type: warc or html")
	flag.Parse()

	// Set up logging: append to the requested file, or fall back to stdout.
	if *logFile != "" {
		logFileHandle, err := os.OpenFile(*logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
		if err != nil {
			log.Fatalf("Failed to open log file: %v", err)
		}
		defer logFileHandle.Close()
		log.SetOutput(logFileHandle)
		log.Println("Debug mode enabled - logs written to file.")
	} else {
		log.SetOutput(os.Stdout)
	}

	log.Println("Starting AWS documentation scraping")

	// Unbuffered channel: the sitemap walker blocks until a worker is free,
	// which naturally throttles URL discovery to download speed.
	urlChannel := make(chan string)
	var wg sync.WaitGroup

	// Start workers to download files concurrently
	for i := 0; i < *maxWorkers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Workers drain the channel until the producer closes it.
			for url := range urlChannel {
				if *exportType == "warc" {
					downloadAndSaveAsWARC(url)
				} else if *exportType == "html" {
					downloadAndSaveAsHTML(url)
				} else {
					log.Printf("Invalid export type: %s", *exportType)
				}
				// If rate limiting is enabled, sleep between requests
				if rateLimitEnabled {
					time.Sleep(rateLimitDelay) // Delay between requests if rate limiting is enabled
				}
			}
		}()
	}

	// Fetch and parse the sitemap concurrently; closing the channel on
	// completion is what lets the worker loops (and then wg.Wait) finish.
	go func() {
		defer close(urlChannel) // Close the channel when done
		err := fetchAndParseSitemap(sitemapURL, *test, urlChannel)
		if err != nil {
			// NOTE(review): log.Fatalf calls os.Exit, so the deferred close
			// above never runs — harmless because the process terminates,
			// but worth confirming this hard-exit is intended.
			log.Fatalf("Error fetching sitemap: %v", err)
		}
	}()

	// Wait for all downloads to finish
	wg.Wait()
	log.Println("Scraping finished")
}
170 |
171 | // fetchAndParseSitemap fetches and parses a sitemap, handling both sitemap indexes and URL sets.
172 | func fetchAndParseSitemap(sitemapURL string, maxDocs int, urlChannel chan<- string) error {
173 | // Replace http with https
174 | sitemapURL = strings.Replace(sitemapURL, "http://", "https://", 1)
175 |
176 | parsedURL, err := url.Parse(sitemapURL)
177 | if err != nil {
178 | log.Printf("Error parsing sitemap URL %s: %v", sitemapURL, err)
179 | return err
180 | }
181 |
182 | // Ensure the sitemap URL is under docs.aws.amazon.com
183 | if parsedURL.Host != "docs.aws.amazon.com" {
184 | log.Printf("Skipping sitemap from other domain: %s", sitemapURL)
185 | return nil
186 | }
187 |
188 | // Exclude if matches excludeRegex
189 | if excludeRegex.MatchString(sitemapURL) {
190 | log.Printf("Skipping excluded sitemap: %s", sitemapURL)
191 | return nil
192 | }
193 |
194 | log.Printf("Fetching sitemap: %s", sitemapURL)
195 | resp, err := fetchWithRateLimitHandling(sitemapURL)
196 | if err != nil {
197 | log.Printf("Error fetching sitemap %s: %v", sitemapURL, err)
198 | return err
199 | }
200 | defer resp.Body.Close()
201 |
202 | body, err := ioutil.ReadAll(resp.Body)
203 | if err != nil {
204 | return err
205 | }
206 |
207 | // First try parsing the body as a SitemapIndex
208 | var sitemapIndex SitemapIndex
209 | err = xml.Unmarshal(body, &sitemapIndex)
210 | if err == nil && len(sitemapIndex.Sitemaps) > 0 {
211 | log.Printf("Parsed sitemap as a SitemapIndex: %s", sitemapURL)
212 | // Recursively fetch child sitemaps
213 | for _, sitemap := range sitemapIndex.Sitemaps {
214 | err := fetchAndParseSitemap(sitemap.Loc, maxDocs, urlChannel)
215 | if err != nil {
216 | log.Printf("Error fetching child sitemap: %v", err)
217 | }
218 | }
219 | return nil
220 | }
221 |
222 | // If it's not a SitemapIndex, try parsing it as a URLSet
223 | var urlSet URLSet
224 | err = xml.Unmarshal(body, &urlSet)
225 | if err == nil && len(urlSet.URLs) > 0 {
226 | log.Printf("Parsed sitemap as a URLSet: %s", sitemapURL)
227 | for i, urlEntry := range urlSet.URLs {
228 | // Replace http with https
229 | urlEntry.Loc = strings.Replace(urlEntry.Loc, "http://", "https://", 1)
230 |
231 | parsedURL, err := url.Parse(urlEntry.Loc)
232 | if err != nil {
233 | log.Printf("Error parsing URL %s: %v", urlEntry.Loc, err)
234 | continue
235 | }
236 |
237 | // Ensure the URL is under docs.aws.amazon.com
238 | if parsedURL.Host != "docs.aws.amazon.com" {
239 | log.Printf("Skipping URL from other domain: %s", urlEntry.Loc)
240 | continue
241 | }
242 |
243 | // Check if the URL matches the exclusion pattern
244 | if excludeRegex.MatchString(urlEntry.Loc) {
245 | log.Printf("Skipping excluded URL: %s", urlEntry.Loc)
246 | continue
247 | }
248 |
249 | urlChannel <- urlEntry.Loc
250 | log.Printf("Queued URL for download: %s", urlEntry.Loc)
251 |
252 | if maxDocs > 0 && i+1 >= maxDocs {
253 | break
254 | }
255 | }
256 | return nil
257 | }
258 |
259 | // If parsing fails, log an error
260 | log.Printf("Error parsing sitemap: unable to determine type for URL %s\n", sitemapURL)
261 | return fmt.Errorf("unable to parse sitemap at %s", sitemapURL)
262 | }
263 |
264 | // fetchWithRateLimitHandling fetches the document from the given URL and handles 403 rate limiting or connection errors.
265 | func fetchWithRateLimitHandling(url string) (*http.Response, error) {
266 | maxRetries := 5
267 | for retries := 0; retries < maxRetries; retries++ {
268 | // Randomly select a user agent from the list
269 | userAgent := userAgents[rand.Intn(len(userAgents))]
270 |
271 | // Create a new HTTP request
272 | req, err := http.NewRequest("GET", url, nil)
273 | if err != nil {
274 | return nil, err
275 | }
276 |
277 | // Set the user agent header
278 | req.Header.Set("User-Agent", userAgent)
279 |
280 | // Send the request
281 | client := &http.Client{}
282 | resp, err := client.Do(req)
283 | if err != nil {
284 | // Check if it's a temporary network error
285 | if nerr, ok := err.(net.Error); ok && nerr.Temporary() {
286 | log.Printf("Temporary error fetching URL: %v, retrying in %s", err, sleepDuration)
287 | time.Sleep(sleepDuration)
288 | continue
289 | } else {
290 | // Non-recoverable error, log and exit
291 | log.Printf("Error fetching URL: %v", err)
292 | return nil, err
293 | }
294 | }
295 |
296 | // If successful response, return it
297 | if resp.StatusCode == http.StatusOK {
298 | return resp, nil
299 | }
300 |
301 | // Handle rate limiting (403 Forbidden)
302 | if resp.StatusCode == http.StatusForbidden {
303 | log.Printf("Received 403 Forbidden (rate limit), pausing for %s before retrying...", sleepDuration)
304 | time.Sleep(sleepDuration)
305 | resp.Body.Close()
306 | continue
307 | }
308 |
309 | // Handle other unexpected status codes
310 | log.Printf("Unexpected status code: %d for URL %s", resp.StatusCode, url)
311 | resp.Body.Close()
312 | return nil, fmt.Errorf("unexpected status code: %d for URL %s", resp.StatusCode, url)
313 | }
314 | return nil, fmt.Errorf("max retries exceeded for URL %s", url)
315 | }
316 |
317 | func downloadAndSaveAsWARC(url string) {
318 | // Get the current date
319 | now := time.Now()
320 | datePath := filepath.Join(
321 | // Adding year, month, and day to the directory path
322 | "aws_warcs",
323 | now.Format("2006"), // Year
324 | now.Format("01"), // Month
325 | now.Format("02"), // Day
326 | )
327 |
328 | // Remove the protocol part (https://) and construct the URL-based directory structure
329 | trimmedURL := strings.TrimPrefix(url, "https://")
330 | dirPath := filepath.Join(datePath, filepath.Dir(trimmedURL))
331 |
332 | // Create the directory structure based on the date and URL
333 | err := os.MkdirAll(dirPath, 0755)
334 | if err != nil {
335 | log.Printf("Error creating directory: %v", err)
336 | return
337 | }
338 |
339 | // Use the last part of the URL as the file name (with .html removed)
340 | fileName := filepath.Base(strings.TrimSuffix(trimmedURL, ".html"))
341 | warcFilePath := filepath.Join(dirPath, fileName+".warc")
342 |
343 | // Create the WARC file
344 | warcFile, err := os.Create(warcFilePath)
345 | if err != nil {
346 | log.Printf("Error creating WARC file: %v", err)
347 | return
348 | }
349 | defer warcFile.Close()
350 |
351 | // Initialize WARC writer
352 | warcWriter := warc.NewWriter(warcFile)
353 |
354 | // Fetch document
355 | log.Printf("Downloading document: %s\n", url)
356 | resp, err := fetchWithRateLimitHandling(url)
357 | if err != nil {
358 | log.Printf("Error downloading document: %v", err)
359 | return
360 | }
361 | defer resp.Body.Close()
362 |
363 | // Create WARC request record
364 | reqRecord := warc.NewRecord()
365 | reqRecord.Header.Set("WARC-Type", "request")
366 | reqRecord.Header.Set("Content-Type", "application/http;msgtype=request")
367 | reqRecord.Header.Set("WARC-Target-URI", url)
368 | reqRecord.Header.Set("WARC-Date", time.Now().UTC().Format(time.RFC3339))
369 | reqRecord.Header.Set("WARC-Record-ID", "