├── .devcontainer
    └── devcontainer.json
├── .gitignore
├── README.md
├── github2file-tkinter-GUI.py
├── github2file.py
├── requirements.txt
└── ts-js-rust2file.py


/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "Python 3",
 3 |     "image": "mcr.microsoft.com/devcontainers/python:3.12-bookworm",
 4 |     "customizations": {
 5 |       "vscode": {
 6 |         "settings": {
 7 |           "python.defaultInterpreterPath": "/usr/local/bin/python",
 8 |           "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
 9 |           "python.formatting.blackPath": "/usr/local/py-utils/bin/black",
10 |           "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
11 |           "python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
12 |           "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8",
13 |           "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
14 |           "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
15 |           "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
16 |           "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint"
17 |         },
18 |         "extensions": [
19 |           "ms-python.python",
20 |           "ms-python.vscode-pylance"
21 |         ]
22 |       }
23 |     },
24 |     "postCreateCommand": "sudo apt-get update && sudo apt-get install -y python3-tk && pip3 install -r requirements.txt"
25 |   }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | venv/
3 | pharmaink.git_python.txt
4 | pharmaink.git_symfony.txt
5 | 
6 | # idx
7 | .idx
8 | 
9 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # GitHub Repository to File Converter
 2 | 
 3 | This Python script allows you to download and process files from a GitHub repository, making it easier to share code with chatbots that have large context capabilities but don't automatically download code from GitHub.
 4 | 
 5 | ## Features
 6 | 
 7 | - Download and process files from a GitHub repository
 8 | - Support for both public and private repositories
 9 | - Filter files based on programming language (Python, Markdown, Go, JavaScript)
10 | - Exclude certain directories, file types, and test files
11 | - Remove comments and docstrings from Python source code (optional)
12 | - Specify a branch or tag to download from (default: "main")
13 | - New GUI feature implemented in `github2file-tkinter-GUI.py`
14 | - New `--claude` option for formatting output for Claude
15 | - New script `ts-js-rust2file.py` for handling TypeScript, JavaScript, Svelte, and Rust files
16 | 
17 | ## Install
18 | 
19 | - conda create -n g2f python=3.10
20 | - conda activate g2f
21 | - pip install -r requirements.txt 
22 | 
23 | ## Usage
24 | 
25 | To download and process files from a public GitHub repository, run the following command:
26 | 
27 | ```
28 | python github2file.py https://github.com/username/repository
29 | ```
30 | 
31 | For a private repository, use the following format:
32 | 
33 | ```
34 | python github2file.py https://<USERNAME>:<GITHUB_ACCESS_TOKEN>@github.com/username/repository
35 | ```
36 | 
37 | Replace `<USERNAME>` with your GitHub username and `<GITHUB_ACCESS_TOKEN>` with your GitHub personal access token.
38 | 
39 | ### Optional Arguments
40 | 
41 | - `--lang`: Specify the programming language of the repository. Choices: "md", "go", "javascript" or "python" (default: "python").
42 | - `--keep-comments`: Keep comments and docstrings in the source code (only applicable for Python).
43 | - `--branch_or_tag`: Specify the branch or tag of the repository to download (default: "main").
44 | - `--claude`: Format the output for Claude with document tags
45 | 
46 | ### Example
47 | 
48 | To download and process files from the Hugging Face Transformers repository, run:
49 | 
50 | ```
51 | python github2file.py https://github.com/huggingface/transformers
52 | ```
53 | 
54 | This will create a file named `transformers_python.txt` in the `repos/` directory, containing the combined Python source code from the repository.
55 | 
56 | To download and process files from a private repository, run:
57 | 
58 | ```
59 | python github2file.py https://<USERNAME>:<GITHUB_ACCESS_TOKEN>@github.com/username/private-repo
60 | ```
61 | 
62 | ## Output
63 | 
64 | The script will create a file named `repository_language.txt` (e.g., `transformers_python.txt`) in the `repos/` directory, containing the combined source code from the specified repository. You can then share this file with chatbots like Claude for further analysis or discussion.
65 | 
66 | ## GUI Usage
67 | 
68 | To use the GUI feature, run the following command:
69 | 
70 | ```
71 | python github2file-tkinter-GUI.py
72 | ```
73 | 
74 | This will open a graphical user interface where you can enter the GitHub repository URL and download the combined source code.
75 | 
76 | ## New Script for TypeScript, JavaScript, Svelte, and Rust
77 | 
78 | To handle TypeScript, JavaScript, Svelte, and Rust files, use the `ts-js-rust2file.py` script. Run the following command:
79 | 
80 | ```
81 | python ts-js-rust2file.py <github_repo_url>
82 | ```
83 | 
84 | This will create a file named `<repo_name>_code.txt` containing the combined source code from the specified repository.
85 | 
86 | ## Requirements
87 | 
88 | - Python 3.x
89 | - `requests` library
90 | 
91 | ## License
92 | 
93 | This project is open-source and available under the [MIT License](LICENSE).
94 | 


--------------------------------------------------------------------------------
/github2file-tkinter-GUI.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import requests
  4 | import zipfile
  5 | import io
  6 | import ast
  7 | import tkinter as tk
  8 | from tkinter import filedialog, messagebox, font, ttk
  9 | 
 10 | def is_python_file(file_path):
 11 |     """Check if the file is a Python file."""
 12 |     return file_path.endswith(".py")
 13 | 
 14 | def is_likely_useful_file(file_path):
 15 |     """Determine if the file is likely to be useful by excluding certain directories and specific file types."""
 16 |     excluded_dirs = ["docs", "examples", "tests", "test", "__pycache__", "scripts", "utils", "benchmarks"]
 17 |     utility_or_config_files = ["hubconf.py", "setup.py"]
 18 |     github_workflow_or_docs = ["stale.py", "gen-card-", "write_model_card"]
 19 | 
 20 |     if any(part.startswith('.') for part in file_path.split('/')):
 21 |         return False
 22 | 
 23 |     if 'test' in file_path.lower():
 24 |         return False
 25 | 
 26 |     for excluded_dir in excluded_dirs:
 27 |         if f"/{excluded_dir}/" in file_path or file_path.startswith(excluded_dir + "/"):
 28 |             return False
 29 | 
 30 |     for file_name in utility_or_config_files:
 31 |         if file_name in file_path:
 32 |             return False
 33 | 
 34 |     for doc_file in github_workflow_or_docs:
 35 |         if doc_file in file_path:
 36 |             return False
 37 | 
 38 |     return True
 39 | 
 40 | def is_test_file(file_content):
 41 |     """Determine if the file content suggests it is a test file."""
 42 |     test_indicators = ["import unittest", "import pytest", "from unittest", "from pytest"]
 43 |     return any(indicator in file_content for indicator in test_indicators)
 44 | 
 45 | def has_sufficient_content(file_content, min_line_count=10):
 46 |     """Check if the file has a minimum number of substantive lines."""
 47 |     lines = [line for line in file_content.split('\n') if line.strip() and not line.strip().startswith('#')]
 48 |     return len(lines) >= min_line_count
 49 | 
 50 | def remove_comments_and_docstrings(source):
 51 |     """Remove comments and docstrings from the Python source code."""
 52 |     tree = ast.parse(source)
 53 |     for node in ast.walk(tree):
 54 |         if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)) and ast.get_docstring(node):
 55 |             node.body = node.body[1:]  # Remove docstring
 56 |         elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Str):
 57 |             node.value.s = ""  # Remove comments
 58 | 
 59 |     return ast.unparse(tree)
 60 | 
 61 | def download_repo(repo_url, output_file):
 62 |     """Download and process files from a GitHub repository."""
 63 |     response = requests.get(repo_url + "/archive/master.zip")
 64 |     zip_file = zipfile.ZipFile(io.BytesIO(response.content))
 65 | 
 66 |     with open(output_file, "w", encoding="utf-8") as outfile:
 67 |         for file_path in zip_file.namelist():
 68 |             # Skip directories, non-Python files, less likely useful files, hidden directories, and test files
 69 |             if file_path.endswith("/") or not is_python_file(file_path) or not is_likely_useful_file(file_path):
 70 |                 continue
 71 | 
 72 |             file_content = zip_file.read(file_path).decode("utf-8")
 73 | 
 74 |             # Skip test files based on content and files with insufficient substantive content
 75 |             if is_test_file(file_content) or not has_sufficient_content(file_content):
 76 |                 continue
 77 | 
 78 |             try:
 79 |                 file_content = remove_comments_and_docstrings(file_content)
 80 |             except SyntaxError:
 81 |                 # Skip files with syntax errors
 82 |                 continue
 83 | 
 84 |             outfile.write(f"# File: {file_path}\n")
 85 |             outfile.write(file_content)
 86 |             outfile.write("\n\n")
 87 | 
 88 | def main():
 89 |     root = tk.Tk()
 90 |     root.title("GitHub Repo Downloader")
 91 |     root.geometry("500x140")  # Make the window 10% shorter
 92 |     root.configure(bg="#1c1c1c")  # Set the background color to a dark shade
 93 | 
 94 |     # Custom font
 95 |     custom_font = font.Font(family="Consolas", size=12)
 96 | 
 97 |     # Custom button style
 98 |     style = ttk.Style()
 99 |     style.theme_use("clam")
100 |     style.configure("TButton", padding=6, relief="flat", background="#00d0ff", foreground="#1c1c1c", font=custom_font)
101 |     style.map("TButton", background=[("active", "#00a0c0")])
102 | 
103 |     def browse_repo():
104 |         repo_url = repo_entry.get()
105 |         if repo_url:
106 |             repo_name = repo_url.split("/")[-1]
107 |             output_file = f"{repo_name}_python.txt"
108 |             download_repo(repo_url, output_file)
109 |             messagebox.showinfo("Success", f"Combined Python source code saved to {output_file}", parent=root)
110 |         else:
111 |             messagebox.showerror("Error", "Please enter a valid GitHub repository URL.", parent=root)
112 | 
113 |     def browse_file():
114 |         file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text Files", "*.txt")], parent=root)
115 |         if file_path:
116 |             repo_url = repo_entry.get()
117 |             if repo_url:
118 |                 download_repo(repo_url, file_path)
119 |                 messagebox.showinfo("Success", f"Combined Python source code saved to {file_path}", parent=root)
120 |             else:
121 |                 messagebox.showerror("Error", "Please enter a valid GitHub repository URL.", parent=root)
122 | 
123 |     repo_label = tk.Label(root, text="GitHub Repository URL:", font=custom_font, fg="#00d0ff", bg="#1c1c1c")  # Light blue text on dark background
124 |     repo_label.pack(pady=10)
125 | 
126 |     repo_entry = tk.Entry(root, width=40, font=custom_font, bg="#333333", fg="#ffffff")  # Light text on dark background
127 |     repo_entry.pack()
128 | 
129 |     button_frame = tk.Frame(root, bg="#1c1c1c")  # Dark background for the button frame
130 |     button_frame.pack(pady=10)
131 | 
132 |     download_button = ttk.Button(button_frame, text="Download", command=browse_repo)
133 |     download_button.pack(side=tk.LEFT, padx=10)
134 | 
135 |     save_button = ttk.Button(button_frame, text="Save As...", command=browse_file)
136 |     save_button.pack(side=tk.LEFT)
137 | 
138 |     root.mainloop()
139 | 
140 | if __name__ == "__main__":
141 |     main()
142 | 


--------------------------------------------------------------------------------
/github2file.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import requests
  4 | import zipfile
  5 | import io
  6 | import ast
  7 | import argparse
  8 | from typing import List
  9 | 
 10 | def get_language_extensions(language: str) -> List[str]:
 11 |     """Return a list of file extensions for the specified programming language."""
 12 |     language_extensions = {
 13 |         "python": [".py", ".pyw"],  # Add .ipynb extension for Python notebooks
 14 |         #TODO convert python notebooks to python files or some format that allow conversion between notebook and python file.
 15 |         "go": [".go"],
 16 |         "javascript": [".js", ".jsx", ".ts", ".tsx"],
 17 |         "java": [".java"],
 18 |         "md": [".md"],  # Add .md extension for Markdown files
 19 |     }
 20 |     return language_extensions[language.lower()]
 21 | 
 22 | def is_file_type(file_path: str, language: str) -> bool:
 23 |     """Check if the file has a valid extension for the specified language."""
 24 |     extensions = get_language_extensions(language)
 25 |     return any(file_path.endswith(ext) for ext in extensions)
 26 | 
 27 | def is_likely_useful_file(file_path, lang):
 28 |     """Determine if the file is likely useful by applying various filters."""
 29 |     excluded_dirs = ["examples", "tests", "test", "scripts", "utils", "benchmarks"]
 30 |     utility_or_config_files = []
 31 |     workflow_or_docs = [".github", ".gitlab-ci.yml", ".gitignore", "LICENSE", "README"]
 32 | 
 33 |     if lang == "python":
 34 |         excluded_dirs.append("__pycache__")
 35 |         utility_or_config_files.extend(["hubconf.py", "setup.py"])
 36 |         workflow_or_docs.extend(["stale.py", "gen-card-", "write_model_card"])
 37 |     elif lang == "go":
 38 |         excluded_dirs.append("vendor")
 39 |         utility_or_config_files.extend(["go.mod", "go.sum", "Makefile"])
 40 | 
 41 |     if any(part.startswith('.') for part in file_path.split('/')):
 42 |         return False
 43 |     if 'test' in file_path.lower():
 44 |         return False
 45 |     for excluded_dir in excluded_dirs:
 46 |         if f"/{excluded_dir}/" in file_path or file_path.startswith(excluded_dir + "/"):
 47 |             return False
 48 |     for file_name in utility_or_config_files:
 49 |         if file_name in file_path:
 50 |             return False
 51 |     for doc_file in workflow_or_docs:
 52 |         if doc_file in file_path:
 53 |             return False
 54 |     return True
 55 | 
 56 | def is_test_file(file_content, lang):
 57 |     """Determine if the file content suggests it is a test file."""
 58 |     test_indicators = {
 59 |         "python": ["import unittest", "import pytest", "from unittest", "from pytest"],
 60 |         "go": ["import testing", "func Test"]
 61 |     }
 62 |     indicators = test_indicators.get(lang, [])
 63 |     return any(indicator in file_content for indicator in indicators)
 64 | 
 65 | def has_sufficient_content(file_content, min_line_count=10):
 66 |     """Check if the file has a minimum number of substantive lines."""
 67 |     lines = [line for line in file_content.split('\n') if line.strip() and not line.strip().startswith(('#', '//'))]
 68 |     return len(lines) >= min_line_count
 69 | 
 70 | def remove_comments_and_docstrings(source):
 71 |     """Remove comments and docstrings from the Python source code."""
 72 |     tree = ast.parse(source)
 73 |     for node in ast.walk(tree):
 74 |         if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)) and ast.get_docstring(node):
 75 |             node.body = node.body[1:]  # Remove docstring
 76 |         elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant):
 77 |             node.value.value = ""  # Remove comments
 78 |     return ast.unparse(tree)
 79 | 
 80 | def construct_download_url(repo_url, branch_or_tag):
 81 |     """Construct the appropriate download URL for GitHub or GitLab based on the provided URL."""
 82 |     if "github.com" in repo_url:
 83 |         return f"{repo_url}/archive/refs/heads/{branch_or_tag}.zip"
 84 |     elif "gitlab.com" in repo_url:
 85 |         repo_name = repo_url.rstrip('/').split('/')[-1].replace('.git', '')
 86 |         return f"{repo_url.rstrip('.git')}/-/archive/{branch_or_tag}/{repo_name}-{branch_or_tag}.zip"
 87 |     else:
 88 |         raise ValueError("Unsupported repository URL. Only GitHub and GitLab URLs are supported.")
 89 | 
 90 | def download_repo(repo_url, output_file, lang, keep_comments=False, branch_or_tag="main", token=None, claude=False):
 91 |     """Download and process files from a GitHub or GitLab repository."""
 92 |     download_url = construct_download_url(repo_url, branch_or_tag)
 93 |     headers = {}
 94 | 
 95 |     if token:
 96 |         if "gitlab.com" in repo_url:
 97 |             headers['PRIVATE-TOKEN'] = token
 98 |         elif "github.com" in repo_url:
 99 |             headers['Authorization'] = f'token {token}'
100 | 
101 |     print(download_url)
102 |     response = requests.get(download_url, headers=headers)
103 | 
104 |     try:
105 |         zip_file = zipfile.ZipFile(io.BytesIO(response.content))
106 |     except zipfile.BadZipFile:
107 |         print(f"Error: The downloaded file is not a valid ZIP archive.")
108 |         sys.exit(1)
109 | 
110 |     repo_name = repo_url.split('/')[-1]
111 |     output_file = os.path.join(output_folder, f"{repo_name}_{lang}.txt")
112 |     if claude:
113 |         output_file = os.path.join(output_folder, f"{repo_name}_{lang}-claude.txt")
114 | 
115 |     with open(output_file, "w", encoding="utf-8") as outfile:
116 |         # Include the README file
117 |         readme_file_path, readme_content = find_readme_content(zip_file)
118 | 
119 |         if claude and isinstance(claude, bool):
120 |             outfile.write("Here are some documents for you to reference for your task:\n\n")
121 |             outfile.write("<documents>\n")
122 | 
123 |             outfile.write("<document index=\"0\">\n")
124 |             outfile.write(f"<source>{readme_file_path}</source>\n")
125 |             outfile.write(f"<document_content>\n{readme_content}\n</document_content>\n")
126 |             outfile.write("</document>\n\n")
127 |         else:
128 |             outfile.write(f"{'// ' if lang == 'go' else '# '}File: {readme_file_path}\n")
129 |             outfile.write(readme_content)
130 |             outfile.write("\n\n")
131 | 
132 |         index = 1
133 |         for file_path in zip_file.namelist():
134 |             # Skip directories, non-language files, less likely useful files, hidden directories, and test files
135 |             if file_path.endswith("/") or not is_file_type(file_path, lang) or not is_likely_useful_file(file_path, lang):
136 |                 continue
137 | 
138 |             try:
139 |                 file_content = zip_file.read(file_path).decode("utf-8", errors="replace")
140 |             except UnicodeDecodeError:
141 |                 print(f"Warning: Skipping file {file_path} due to decoding error.")
142 |                 continue
143 | 
144 |             # Skip test files based on content and files with insufficient substantive content
145 |             if is_test_file(file_content, lang) or not has_sufficient_content(file_content):
146 |                 continue
147 |             if lang == "python" and not keep_comments:
148 |                 file_content = remove_comments_and_docstrings(file_content)
149 | 
150 |             if claude and isinstance(claude, bool):
151 |                 outfile.write(f"<document index=\"{index}\">\n")
152 |                 outfile.write(f"<source>{file_path}</source>\n")
153 |                 outfile.write(f"<document_content>\n{file_content}\n</document_content>\n")
154 |                 outfile.write("</document>\n\n")
155 |                 index += 1
156 |             else:
157 |                 outfile.write(f"{'// ' if lang == 'go' else '# '}File: {file_path}\n")
158 |                 outfile.write(file_content)
159 |                 outfile.write("\n\n")
160 | 
161 |         if claude and isinstance(claude, bool):
162 |             outfile.write("</documents>")
163 | 
def find_readme_content(zip_file):
    """Locate the repository README inside the ZIP archive.

    ``README.md`` is preferred over a plain ``README``. When several files
    match, the one closest to the archive root wins, so a README in a nested
    folder never shadows the top-level one. An empty README is treated as
    missing.

    Args:
        zip_file: An open ``zipfile.ZipFile``.

    Returns:
        A ``(path, content)`` tuple. When no usable README exists, the path is
        an empty string and the content is a placeholder message.
    """
    archive_names = zip_file.namelist()

    for readme_name in ("README.md", "README"):
        candidates = [
            name for name in archive_names
            if name == readme_name or name.endswith("/" + readme_name)
        ]
        if not candidates:
            continue
        # Fewest path separators means closest to the archive root.
        readme_file_path = min(candidates, key=lambda name: name.count("/"))
        readme_content = zip_file.read(readme_file_path).decode("utf-8", errors="replace")
        if readme_content:
            return readme_file_path, readme_content

    return "", "No README file found in the repository."
193 | 
def print_usage():
    """Print a short command-line usage summary to standard output."""
    print("Usage: python github2file.py <repo_url> [--lang <language>] [--keep-comments] [--branch_or_tag <branch_or_tag>] [--token <token>] [--claude]")
    print("Options:")
    print("  <repo_url>               The URL of the GitHub repository")
    print("  --lang <language>        The programming language of the repository (choices: go, python, md). Default: python")
    print("  --keep-comments          Keep comments and docstrings in the source code (only applicable for Python)")
    # The argument parser and download_repo both default to "main".
    print("  --branch_or_tag <branch_or_tag>  The branch or tag of the repository to download. Default: main")
    print("  --token <token>          Personal access token for private repositories")
    print("  --claude                 Format the output for Claude with document tags")
202 | 
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, download the repository
    # and report where the combined file was written.

    parser = argparse.ArgumentParser(description='Download and process files from a GitHub or GitLab repository.')
    parser.add_argument('repo_url', type=str, help='The URL of the GitHub or GitLab repository')
    # Offer every language that get_language_extensions knows about.
    parser.add_argument('--lang', type=str, choices=['go', 'java', 'javascript', 'md', 'python'], default='python', help='The programming language of the repository')
    parser.add_argument('--keep-comments', action='store_true', help='Keep comments and docstrings in the source code (only applicable for Python)')
    parser.add_argument('--branch_or_tag', type=str, help='The branch or tag of the repository to download', default="main")
    parser.add_argument('--token', type=str, help='Personal access token for private repositories', default=None)
    parser.add_argument('--claude', action='store_true', help='Format the output for Claude with document tags')

    args = parser.parse_args()

    # Normalise the URL so the repository name (its last path segment) is never
    # empty and carries no ".git" suffix.
    repo_url = args.repo_url.strip().rstrip('/')
    if repo_url.endswith('.git'):
        repo_url = repo_url[:-len('.git')]

    output_folder = "repos"
    os.makedirs(output_folder, exist_ok=True)
    repo_name = repo_url.split('/')[-1]
    name_suffix = "-claude" if args.claude else ""
    # Same naming scheme download_repo uses, so the message below names the real file.
    output_file = os.path.join(output_folder, f"{repo_name}_{args.lang}{name_suffix}.txt")

    download_repo(repo_url=repo_url, output_file=output_file, lang=args.lang, keep_comments=args.keep_comments, branch_or_tag=args.branch_or_tag, token=args.token, claude=args.claude)

    print(f"Combined {args.lang.capitalize()} source code saved to {output_file}")
222 | 
223 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | certifi==2024.2.2
2 | charset-normalizer==3.3.2
3 | idna==3.6
4 | requests==2.31.0
5 | tk==0.1.0
6 | urllib3==2.2.1
7 | 


--------------------------------------------------------------------------------
/ts-js-rust2file.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import requests
 4 | import zipfile
 5 | import io
 6 | import ast
 7 | 
 8 | def is_desired_file(file_path):
 9 |     """Check if the file is a Python, JavaScript, TypeScript, Svelte, or Rust file."""
10 |     return file_path.endswith(".py") or file_path.endswith(".js") or file_path.endswith(".ts") or file_path.endswith(".svelte") or file_path.endswith(".rs")
11 | 
def is_likely_useful_file(file_path):
    """Heuristically decide whether an archive path is worth including.

    A path is rejected when any segment is hidden (starts with a dot), when
    the lower-cased path contains "test", when it lies inside one of the
    skipped directories, or when it contains one of the blocked name fragments.
    """
    skipped_dirs = ("docs", "examples", "tests", "test", "__pycache__", "scripts", "utils", "benchmarks", "node_modules", ".venv")
    # Utility/config files and workflow/doc helpers; matched as substrings of the path.
    blocked_fragments = ("hubconf.py", "setup.py", "package-lock.json", "stale.py", "gen-card-", "write_model_card")

    for segment in file_path.split('/'):
        if segment.startswith('.'):
            return False

    if 'test' in file_path.lower():
        return False

    for dir_name in skipped_dirs:
        if file_path.startswith(f"{dir_name}/") or f"/{dir_name}/" in file_path:
            return False

    return not any(fragment in file_path for fragment in blocked_fragments)
31 | 
def has_sufficient_content(file_content, min_line_count=10):
    """Return True when the file has at least ``min_line_count`` substantive lines.

    Blank lines and lines that start with ``#`` or ``//`` are not counted.
    ``//`` is included because this script collects JavaScript, TypeScript,
    Svelte and Rust files, whose line comments use that marker.

    Args:
        file_content: Text of the file.
        min_line_count: Minimum number of substantive lines required.
    """
    comment_markers = ('#', '//')
    lines = [
        line for line in file_content.split('\n')
        if line.strip() and not line.strip().startswith(comment_markers)
    ]
    return len(lines) >= min_line_count
36 | 
def remove_comments_and_docstrings(source):
    """Return ``source`` re-rendered without comments, docstrings, or bare string statements.

    The code is parsed with ``ast`` and written back with ``ast.unparse``, which
    drops ``#`` comments on its own. Every statement that is just a string
    literal (docstrings included, at module level too) is removed from the tree.

    Args:
        source: Python source code as text.

    Returns:
        The regenerated source code.

    Raises:
        SyntaxError: propagated from ``ast.parse`` when ``source`` is not valid Python.
    """
    def is_bare_string(statement):
        # ast.Constant replaces ast.Str, which is deprecated and removed in Python 3.14.
        return (
            isinstance(statement, ast.Expr)
            and isinstance(statement.value, ast.Constant)
            and isinstance(statement.value.value, str)
        )

    tree = ast.parse(source)
    for node in ast.walk(tree):
        for field in ("body", "orelse", "finalbody"):
            statements = getattr(node, field, None)
            # Lambda/IfExp also have a ``body`` attribute, but it is an expression, not a list.
            if not isinstance(statements, list):
                continue
            kept = [statement for statement in statements if not is_bare_string(statement)]
            # A block emptied by the removal (e.g. a function holding only a docstring)
            # needs ``pass`` to stay valid; an empty module or empty ``else`` is fine.
            if statements and not kept and field != "orelse" and not isinstance(node, ast.Module):
                kept = [ast.Pass()]
            setattr(node, field, kept)
    return ast.unparse(tree)
46 | 
47 | 
48 | 
def download_repo(repo_url, output_file):
    """Download a GitHub repository archive and merge its source files into one text file.

    Files accepted by ``is_desired_file`` and ``is_likely_useful_file`` that
    have enough substantive lines are written to ``output_file`` under a
    ``# File: <path>`` header. Python files are stripped of comments and
    docstrings first; files of other languages are written unchanged.

    Args:
        repo_url: Repository URL. A ``.../tree/<branch>[/<sub/dir>]`` URL selects
            that branch and, if given, restricts the output to that sub-directory.
            Branch names containing "/" are not supported in this form.
            Without ``/tree/`` the ``master`` branch is downloaded.
        output_file: Path of the text file to create or overwrite.

    Raises:
        requests.HTTPError: if the archive request returns an error status.
        zipfile.BadZipFile: if the response body is not a ZIP archive.
    """
    repo_url = repo_url.strip().rstrip("/")
    branch, subdirectory = "master", ""
    if '/tree/' in repo_url:
        # Split "<repo>/tree/<branch>/<sub/dir>" into its parts so the regular
        # archive endpoint of the repository can be used.
        repo_url, _, ref_and_path = repo_url.partition('/tree/')
        branch, _, subdirectory = ref_and_path.partition('/')

    response = requests.get(f"{repo_url}/archive/{branch}.zip", timeout=60)
    # Fail with the HTTP error itself rather than a confusing BadZipFile on an error page.
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        with open(output_file, "w", encoding="utf-8") as outfile:
            for file_path in zip_file.namelist():
                # Skip directories, unwanted file types, and paths rejected by the usefulness filter.
                if file_path.endswith("/") or not is_desired_file(file_path) or not is_likely_useful_file(file_path):
                    continue

                if subdirectory:
                    # Archive entries look like "<root folder>/<path>"; compare below the root folder.
                    _, _, relative_path = file_path.partition("/")
                    if not relative_path.startswith(subdirectory + "/"):
                        continue

                # errors="replace" keeps one badly encoded file from aborting the whole run.
                file_content = zip_file.read(file_path).decode("utf-8", errors="replace")

                # Skip files with insufficient substantive content.
                if not has_sufficient_content(file_content):
                    continue

                # The AST-based stripper only understands Python; running it on
                # JS/TS/Svelte/Rust sources would reject them as syntax errors.
                if file_path.endswith(".py"):
                    try:
                        file_content = remove_comments_and_docstrings(file_content)
                    except SyntaxError:
                        # Skip Python files that do not parse.
                        continue

                outfile.write(f"# File: {file_path}\n")
                outfile.write(file_content)
                outfile.write("\n\n")
78 | 
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the repository URL.
    if len(sys.argv) != 2:
        print("Usage: python script.py <github_repo_url>")
        sys.exit(1)

    # Drop trailing slashes so the last path segment, used for the output
    # file name, is never empty.
    repo_url = sys.argv[1].rstrip("/")
    repo_name = repo_url.split("/")[-1]
    output_file = f"{repo_name}_code.txt"

    download_repo(repo_url, output_file)
    print(f"Combined source code saved to {output_file}")
90 | 


--------------------------------------------------------------------------------