├── README.md
└── clone.py

/README.md:
--------------------------------------------------------------------------------
# ~~Clone code from anonymous.4open.science~~

**Does NOT work anymore** :pensive:

Only supports Python 3.

## Prerequisites
```
pip install beautifulsoup4
pip install lxml
```

## Quick Start
```
git clone https://github.com/ShoufaChen/clone-anonymous4open
cd clone-anonymous4open
python clone.py --clone-dir /path/to/save --target anonymous-url
```
Example:
```
python clone.py --clone-dir ../examples --target https://anonymous.4open.science/r/840c8c57-3c32-451e-bf12-0e20be300389/
```
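## How it works

The script does a breadth-first crawl of the anonymized repository pages: it requests a folder page, treats every `<div class="tree">` as a sub-folder and every `<div class="blob">` as a file, and writes each file's rendered `<code>` text to disk. The snippet below is a minimal sketch of that parsing step; the `list_entries` helper is illustrative and not part of `clone.py`, and it assumes the site still serves the `tree`/`blob` markup the tool was written against (which is exactly what may have changed since the tool broke):

```python
# Illustrative sketch of the parsing pattern clone.py relies on.
# Assumption: anonymous.4open.science still renders folders as <div class="tree">
# and files as <div class="blob">; if the markup changed, this returns empty lists.
import urllib.request
from bs4 import BeautifulSoup

ROOT = 'https://anonymous.4open.science'

def list_entries(path):
    req = urllib.request.Request(ROOT + path, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib.request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    folders = [div.a.get('href') for div in soup.find_all('div', attrs={'class': 'tree'})]
    files = [div.a.get('href') for div in soup.find_all('div', attrs={'class': 'blob'})]
    return folders, files
```

`clone.py` repeats this step for every folder it discovers, keeping a work list of folder URLs until the whole tree has been visited.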
## TODO
- [ ] support files that do not have a `<code>` element.
- [ ] support *.md files

## Acknowledgement

Thanks to the excellent [Luyuan](https://github.com/BeBeBerr) for helpful instructions. :poultry_leg::poultry_leg::poultry_leg:
--------------------------------------------------------------------------------
/clone.py:
--------------------------------------------------------------------------------
import argparse
import os
import urllib.error
import urllib.request

from bs4 import BeautifulSoup


def parse_args():
    parser = argparse.ArgumentParser(description='Clone from https://anonymous.4open.science')
    parser.add_argument('--clone-dir', type=str, default='master',
                        help='directory to save the cloned files into')
    parser.add_argument('--target', type=str,
                        help='anonymous link you want to clone')
    return parser.parse_args()


def create_dir(name):
    if not os.path.exists(name):
        os.mkdir(name)


def pull_html(url):
    """Fetch a page and return it as a parsed BeautifulSoup tree."""
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print(e)
        print(url)
        raise
    content = response.decode('utf-8')
    return BeautifulSoup(content, 'lxml')


def pull_trees(url):
    """Return the <div class="tree"> elements (sub-folders) of a folder page."""
    folder_soup = pull_html(url)
    return folder_soup.find_all('div', attrs={'class': 'tree'})


def pull_blobs(url):
    """Return the <div class="blob"> elements (files) of a folder page."""
    blobs_soup = pull_html(url)
    return blobs_soup.find_all('div', attrs={'class': 'blob'})


def clone_file(url, download, root_url='https://anonymous.4open.science'):
    """Download every file listed on the folder page at `url` into `download`."""
    blobs = pull_blobs(root_url + url)
    for blob in blobs:
        href = blob.a.get('href')
        split_href = href.split('/')
        file_name = '/'.join([download] + split_href[3:])

        print('Clone... ', file_name)
        # print('Clone... ', file_name, href)  # debug

        # Markdown files and LICENSE are not supported yet.
        if split_href[-1].split('.')[-1] == 'md' or split_href[-1] == 'LICENSE':
            continue

        blob_soup = pull_html(root_url + urllib.request.pathname2url(href))
        source_code = blob_soup.find('code')
        if not source_code:
            print('Skip file {}'.format(file_name))
            continue
        with open(file_name, 'w') as f:
            f.write(source_code.get_text())


def clone_dirs(url, folders_url_list, download, root_url='https://anonymous.4open.science'):
    """Create local directories for the sub-folders of `url` and queue them for cloning."""
    trees = pull_trees(root_url + url)

    for t in trees:
        href = t.a.get('href')
        split_href = href.split('/')
        folder_name = '/'.join([download] + split_href[3:-1])
        print('Clone... ', folder_name)
        # print('Clone... ', folder_name, href)  # debug
        create_dir(folder_name)

        folders_url_list.append(href)

    folders_url_list.remove(url)
    return folders_url_list


if __name__ == '__main__':
    args = parse_args()
    assert args.target, '\nPlease specify your target URL,\ne.g.: ' \
        + 'python clone.py --target https://anonymous.4open.science/r/840c8c57-3c32-451e-bf12-0e20be300389/'

    root_url = 'https://anonymous.4open.science'
    target_url = args.target.replace(root_url, '')

    create_dir(args.clone_dir)

    # Breadth-first crawl: clone the files of a folder, then queue its sub-folders.
    folders_url_list = [target_url]
    clone_file(target_url, args.clone_dir)
    folders_url_list = clone_dirs(target_url, folders_url_list, args.clone_dir)

    while folders_url_list:
        url = folders_url_list[0]
        clone_file(url, args.clone_dir)
        folders_url_list = clone_dirs(url, folders_url_list, args.clone_dir)

    print('===' * 20)
    print('Successfully cloned to: {}'.format(args.clone_dir))
    print('===' * 20)
--------------------------------------------------------------------------------