├── README.md
└── clone.py

/README.md:
--------------------------------------------------------------------------------
# ~~Clone code from anonymous.4open.science~~

**Does NOT work anymore** :pensive:

Only supports Python 3.

## Prerequisites
```
pip install beautifulsoup4
pip install lxml
```

## Quick Start
```
git clone https://github.com/ShoufaChen/clone-anonymous4open
cd clone-anonymous4open
python clone.py --clone-dir /path/to/save --target anonymous-url
```
Example:
```
python clone.py --clone-dir ../examples --target https://anonymous.4open.science/r/840c8c57-3c32-451e-bf12-0e20be300389/
```
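## How it works

The script does a breadth-first crawl of the anonymized repository pages: it requests a folder page, treats every `<div class="tree">` as a sub-folder and every `<div class="blob">` as a file, and writes each file's rendered `<code>` text to disk. The snippet below is a minimal sketch of that parsing step; the `list_entries` helper is illustrative and not part of `clone.py`, and it assumes the site still serves the `tree`/`blob` markup the tool was written against (which is exactly what may have changed since the tool broke):

```python
# Illustrative sketch of the parsing pattern clone.py relies on.
# Assumption: anonymous.4open.science still renders folders as <div class="tree">
# and files as <div class="blob">; if the markup changed, this returns empty lists.
import urllib.request
from bs4 import BeautifulSoup

ROOT = 'https://anonymous.4open.science'

def list_entries(path):
    req = urllib.request.Request(ROOT + path, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib.request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    folders = [div.a.get('href') for div in soup.find_all('div', attrs={'class': 'tree'})]
    files = [div.a.get('href') for div in soup.find_all('div', attrs={'class': 'blob'})]
    return folders, files
```

`clone.py` repeats this step for every folder it discovers, keeping a work list of folder URLs until the whole tree has been visited.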
## TODO
- [ ] support files that do not have a `<code>` element.
- [ ] support *.md files

## Acknowledgement

Thanks to the excellent [Luyuan](https://github.com/BeBeBerr) for helpful instructions. :poultry_leg::poultry_leg::poultry_leg:
--------------------------------------------------------------------------------
/clone.py:
--------------------------------------------------------------------------------
import argparse
import os
import urllib.error
import urllib.request

from bs4 import BeautifulSoup


def parse_args():
    parser = argparse.ArgumentParser(description='Clone from https://anonymous.4open.science')
    parser.add_argument('--clone-dir', type=str, default='master',
                        help='directory to save the cloned files into')
    parser.add_argument('--target', type=str,
                        help='anonymous link you want to clone')
    return parser.parse_args()


def create_dir(name):
    if not os.path.exists(name):
        os.mkdir(name)


def pull_html(url):
    """Fetch a page and return it as a parsed BeautifulSoup tree."""
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print(e)
        print(url)
        raise
    content = response.decode('utf-8')
    return BeautifulSoup(content, 'lxml')


def pull_trees(url):
    """Return the <div class="tree"> elements (sub-folders) of a folder page."""
    folder_soup = pull_html(url)
    return folder_soup.find_all('div', attrs={'class': 'tree'})


def pull_blobs(url):
    """Return the <div class="blob"> elements (files) of a folder page."""
    blobs_soup = pull_html(url)
    return blobs_soup.find_all('div', attrs={'class': 'blob'})


def clone_file(url, download, root_url='https://anonymous.4open.science'):
    """Download every file listed on the folder page at `url` into `download`."""
    blobs = pull_blobs(root_url + url)
    for blob in blobs:
        href = blob.a.get('href')
        split_href = href.split('/')
        file_name = '/'.join([download] + split_href[3:])

        print('Clone... ', file_name)
        # print('Clone... ', file_name, href)  # debug

        # Markdown files and LICENSE are not supported yet.
        if split_href[-1].split('.')[-1] == 'md' or split_href[-1] == 'LICENSE':
            continue

        blob_soup = pull_html(root_url + urllib.request.pathname2url(href))
        source_code = blob_soup.find('code')
        if not source_code:
            print('Skip file {}'.format(file_name))
            continue
        with open(file_name, 'w') as f:
            f.write(source_code.get_text())


def clone_dirs(url, folders_url_list, download, root_url='https://anonymous.4open.science'):
    """Create local directories for the sub-folders of `url` and queue them for cloning."""
    trees = pull_trees(root_url + url)

    for t in trees:
        href = t.a.get('href')
        split_href = href.split('/')
        folder_name = '/'.join([download] + split_href[3:-1])
        print('Clone... ', folder_name)
        # print('Clone... ', folder_name, href)  # debug
        create_dir(folder_name)

        folders_url_list.append(href)

    folders_url_list.remove(url)
    return folders_url_list


if __name__ == '__main__':
    args = parse_args()
    assert args.target, '\nPlease specify your target URL,\ne.g.: ' \
        + 'python clone.py --target https://anonymous.4open.science/r/840c8c57-3c32-451e-bf12-0e20be300389/'

    root_url = 'https://anonymous.4open.science'
    target_url = args.target.replace(root_url, '')

    create_dir(args.clone_dir)

    # Breadth-first crawl: clone the files of a folder, then queue its sub-folders.
    folders_url_list = [target_url]
    clone_file(target_url, args.clone_dir)
    folders_url_list = clone_dirs(target_url, folders_url_list, args.clone_dir)

    while folders_url_list:
        url = folders_url_list[0]
        clone_file(url, args.clone_dir)
        folders_url_list = clone_dirs(url, folders_url_list, args.clone_dir)

    print('===' * 20)
    print('Successfully cloned to: {}'.format(args.clone_dir))
    print('===' * 20)
--------------------------------------------------------------------------------