├── .gitignore
├── requirements.txt
├── setup-venv.sh
├── PROMPT.md
└── medusa.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /venv/
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
3 | 


--------------------------------------------------------------------------------
/setup-venv.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Create a virtual environment in ./venv and install the project requirements.

# Stop at the first failing command so that a failed venv creation or
# activation never falls through to pip installing into the system Python.
set -e

python -mvenv ./venv/
. ./venv/bin/activate
pip install -r requirements.txt
7 | 


--------------------------------------------------------------------------------
/PROMPT.md:
--------------------------------------------------------------------------------
 1 | Pythonでスクリプトを作りたいので手伝ってください。仕様は以下の通りです。
 2 | 
 3 | - 指定のURLから到達できるHTMLを静的HTMLとして保存する、Linux系のコマンドラインツール。名前はmedusa。
 4 | 
 5 | ## 使い方
 6 | python medusa.py entry_url [path] [--output=dir] [--rewrite=path]
 7 | 
 8 | - entry_url: スクレープを始める最初のURL。アクセスできなければエラー
 9 | - path: スクレープの対象となるpath。このpathの子要素へのアクセスはスクレープの対象とし、外側へのリンクはノータッチで。省略されたらentry_urlのpathを使う
10 | - --output=dir: スクレープした結果を書き出すディレクトリ。省略されたらURLをstdoutに出力
11 | - --rewrite=path: スクレープによってURLが変わったものの一覧を書き出すファイルのパス。省略されたら書き出さない
12 | 
13 | ## 動作
14 | - プログラムは、まず引数を受け取って指定のURLをスクレープ対象のキューに入れる。
15 | - 対象キューからURLを取り出し、コンテンツをダウンロードする。
16 | - ダウンロードしたファイルがHTMLだった場合
17 |   - その中に列挙されているhyperlink, image、css/javascript, そのほかのURLをリスト
18 |   - もしリンク先のURLが同じホストで同じポート、そしてpathのサブコンテンツだったばあい、そのURLもスクレープの対象キューに追加。コンテンツHTMLのURLを置き換える。
19 |   - URLがxxx.htmlという静的なpathではなかった場合、保存するHTMLのパスは以下の通りに決める
20 |     - もしディレクトリだった場合、`index`をつける
21 |     - URLにquerystringが含まれていた場合、?を`_` に、&を`-` に置き換えてパスに追加
22 |     - 最後に、.htmlをつける
23 |     - 例
24 |       - http://example.com/ は index.html
25 |       - http://example.com/xxx は xxx.html
26 |       - http://example.com/?yyy=zzz は index_yyy-zzz.html
27 |       - http://example.com/xxx?yyy=zzz は xxx_yyy-zzz.html
28 | - outputに相対的な位置を保持しながら保存。outputが指定されていない場合はURLと新しいURLをstdoutに出力
29 | - 対象キューが空になるまで繰り返す
30 | - 書き換えたURLのリストをrewrite_file へ書き出す
31 | 
32 | では少しずつ実装していきましょう。まずは、URLを取得する関数を作りましょう。
33 | 


--------------------------------------------------------------------------------
/medusa.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import sys
  4 | import requests
  5 | from urllib.parse import urljoin, urlparse
  6 | from bs4 import BeautifulSoup
  7 | from queue import Queue
  8 | import re
  9 | 
 10 | def parse_arguments():
 11 |     parser = argparse.ArgumentParser(description="Medusa - Static HTML Scraper")
 12 |     parser.add_argument("entry_url", help="Starting URL for scraping")
 13 |     parser.add_argument("path", nargs="?", help="Path to scrape (default: path of entry_url)")
 14 |     parser.add_argument("--output", help="Output directory for scraped content")
 15 |     parser.add_argument("--rewrite", help="File to write URL rewrites")
 16 |     return parser.parse_args()
 17 | 
 18 | def get_file_path(url, is_dir=False):
 19 |     parsed = urlparse(url)
 20 |     path = parsed.path
 21 |     if not path or path.endswith('/'):
 22 |         path += 'index'
 23 |     if parsed.query:
 24 |         query = parsed.query.replace('?', '_').replace('&', '-')
 25 |         path += f"_{query}"
 26 |     if is_dir and not path.endswith('/index'):
 27 |         path += '/index'
 28 |     return f"{path}.html"
 29 | 
 30 | def download_url(url):
 31 |     try:
 32 |         response = requests.get(url)
 33 |         response.raise_for_status()
 34 |         return response.text, response.headers.get('content-type', '')
 35 |     except requests.RequestException as e:
 36 |         print(f"Error downloading {url}: {e}", file=sys.stderr)
 37 |         return None, None
 38 | 
 39 | def extract_urls(html, base_url):
 40 |     soup = BeautifulSoup(html, 'html.parser')
 41 |     urls = set()
 42 |     for tag in soup.find_all(['a', 'img', 'link', 'script']):
 43 |         if tag.name == 'a' and tag.has_attr('href'):
 44 |             urls.add(urljoin(base_url, tag['href']))
 45 |         elif tag.name == 'img' and tag.has_attr('src'):
 46 |             urls.add(urljoin(base_url, tag['src']))
 47 |         elif tag.name == 'link' and tag.has_attr('href'):
 48 |             urls.add(urljoin(base_url, tag['href']))
 49 |         elif tag.name == 'script' and tag.has_attr('src'):
 50 |             urls.add(urljoin(base_url, tag['src']))
 51 |     return urls
 52 | 
 53 | def is_valid_url(url, base_url, path):
 54 |     parsed_url = urlparse(url)
 55 |     parsed_base = urlparse(base_url)
 56 |     return (parsed_url.netloc == parsed_base.netloc and
 57 |             parsed_url.scheme == parsed_base.scheme and
 58 |             parsed_url.path.startswith(path))
 59 | 
 60 | def rewrite_urls(html, url_map):
 61 |     for old_url, new_url in url_map.items():
 62 |         html = html.replace(old_url, new_url)
 63 |     return html
 64 | 
 65 | def main():
 66 |     args = parse_arguments()
 67 |     entry_url = args.entry_url
 68 |     base_path = args.path or urlparse(entry_url).path
 69 |     output_dir = args.output
 70 |     rewrite_file = args.rewrite
 71 | 
 72 |     url_queue = Queue()
 73 |     url_queue.put(entry_url)
 74 |     processed_urls = set()
 75 |     url_map = {}
 76 | 
 77 |     while not url_queue.empty():
 78 |         current_url = url_queue.get()
 79 |         if current_url in processed_urls:
 80 |             continue
 81 | 
 82 |         html_content, content_type = download_url(current_url)
 83 |         if not html_content:
 84 |             continue
 85 | 
 86 |         processed_urls.add(current_url)
 87 | 
 88 |         if 'text/html' in content_type:
 89 |             urls = extract_urls(html_content, current_url)
 90 |             for url in urls:
 91 |                 if is_valid_url(url, entry_url, base_path) and url not in processed_urls:
 92 |                     url_queue.put(url)
 93 | 
 94 |             file_path = get_file_path(current_url, urlparse(current_url).path.endswith('/'))
 95 |             new_url = os.path.join(base_path, file_path)
 96 |             url_map[current_url] = new_url
 97 | 
 98 |             html_content = rewrite_urls(html_content, url_map)
 99 | 
100 |             if output_dir:
101 |                 full_path = os.path.join(output_dir, file_path.lstrip('/'))
102 |                 os.makedirs(os.path.dirname(full_path), exist_ok=True)
103 |                 with open(full_path, 'w', encoding='utf-8') as f:
104 |                     f.write(html_content)
105 |             else:
106 |                 print(f"{current_url} -> {new_url}")
107 | 
108 |     if rewrite_file:
109 |         with open(rewrite_file, 'w', encoding='utf-8') as f:
110 |             for old_url, new_url in url_map.items():
111 |                 f.write(f"{old_url} -> {new_url}\n")
112 | 
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------