├── .DS_Store
├── .project
├── .pydevproject
├── ICWSM2018_LearningQ_preprint.pdf
├── LearningQ_Logo.png
├── README.md
├── README.txt
└── code
    ├── .DS_Store
    ├── __init__.py
    ├── __init__.pyc
    ├── chromedriver
    ├── functions.py
    ├── functions.pyc
    ├── khan
    │   ├── __init__.py
    │   ├── article_crawler.py
    │   └── video_crawler.py
    ├── teded
    │   ├── __init__.py
    │   ├── crawler.py
    │   ├── get_all_transcripts.py
    │   └── get_transcript.py
    └── test.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/.DS_Store
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
	<name>LearningQ</name>
	<comment></comment>
	<projects>
	</projects>
	<buildSpec>
		<buildCommand>
			<name>org.python.pydev.PyDevBuilder</name>
			<arguments>
			</arguments>
		</buildCommand>
	</buildSpec>
	<natures>
		<nature>org.python.pydev.pythonNature</nature>
	</natures>
</projectDescription>
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python interpreter</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
</pydev_project>
--------------------------------------------------------------------------------
/ICWSM2018_LearningQ_preprint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/ICWSM2018_LearningQ_preprint.pdf
--------------------------------------------------------------------------------
/LearningQ_Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/LearningQ_Logo.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Dataset and source code for [LearningQ: A Large-scale Dataset for Educational Question Generation](./ICWSM2018_LearningQ_preprint.pdf) (ICWSM 2018).

<div align="center"><img src="LearningQ_Logo.png" alt="LearningQ logo"/></div>

## Please download the full dataset via [DATAVERSE](https://dataverse.mpi-sws.org/dataverse/icwsm18) or [Google Drive](https://drive.google.com/file/d/1D9Qp6BB3prKYSKSM4B__Eo04cw-71JFt/view?usp=sharing).

## 1. How the Dataset was Collected
LearningQ is a dataset for educational question generation. Specifically, it contains:

* 7K instructor-designed questions collected from TED-Ed;
* 223K learner-generated questions collected from Khan Academy.

The source documents (i.e., lecture videos and articles) from which the questions are generated are also included. We release the crawling code as part of LearningQ.

## 2. Files in the Dataset
We open-source not only i) the filtered data, which can be directly used to train educational question generators, but also ii) the originally-collected data from both TED-Ed and Khan Academy. The data files are described below.

```
+ LearningQ
+---- README.txt
+---- code [crawling code for TED-Ed and Khan Academy]
+---- data [the originally-collected data and the filtered data]
+----+---- khan
+----+----+---- crawled_data [the originally-collected data from Khan Academy]
+----+----+----+---- topictree.json [the full hierarchical listing of Khan Academy's topic tree]
+----+----+----+---- topics [the information about each topic node in the full topic tree; each file is named by the topic's slug in Khan Academy and stored in the JSON format]
+----+----+----+---- topic_videos [the list of all videos for each topic node; each file is named by the topic's slug in Khan Academy and stored in the JSON format]
+----+----+----+---- all_video_links [the links of all lecture videos in Khan Academy; the file is stored in the JSON format]
+----+----+----+---- transcripts [the transcripts of all lecture videos]
+----+----+----+---- video_discussions [the originally-collected questions generated by learners for each lecture video; each file is named by the video's YouTube ID and stored in the JSON format]
+----+----+----+---- all_article_links [the links of all articles in Khan Academy; the file is stored in the JSON format]
+----+----+----+---- articles [the content of each article; each file is named by the article's ID in Khan Academy and stored in the JSON format]
+----+----+----+---- article_discussions [the originally-collected questions generated by learners for each article; each file is named by the article ID and stored in the JSON format]
+----+----+---- khan_labeled_data [the manually-labelled questions (whether a question is useful for learning or not) used to build the question classifier; each line in a file is one data sample, i.e., a manually-assigned label (1 for useful and 0 for non-useful) and the corresponding question]
+----+----+---- predicted_article_questions [the list of useful learning questions on articles; the file is stored in the JSON format]
+----+----+---- predicted_video_questions [the list of useful learning questions on lecture videos; the file is stored in the JSON format]
+----+---- teded
+----+----+---- crawled_data [the originally-collected data from TED-Ed]
+----+----+----+---- transcripts [the transcripts for lecture videos; each file is named by the video's YouTube ID]
+----+----+----+---- videos [the instructor-generated questions for each lecture video; each file is named by the video's title in TED-Ed and stored in the JSON format]
+----+---- experiments [the filtered data (i.e., predicted useful learning questions) which can be directly used as input for question generators; each file is named {para/src/tgt}_{train/dev/test}, which denotes its data type, i.e., source document (para), source sentences (src) or target questions (tgt), and its usage, i.e., whether it is used for training (train), validation (dev) or testing (test)]
```
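
For orientation, here is a minimal sketch of one way the parallel experiment files might be loaded. It assumes one sample per line, with the `src_*` and `tgt_*` files of a split aligned line by line; the folder path below is hypothetical and depends on where you extract the dataset.

```
import io

def load_split(folder, split):
    # Read aligned (source sentence, target question) pairs for one split.
    with io.open("%s/src_%s" % (folder, split), encoding="utf-8") as src, \
         io.open("%s/tgt_%s" % (folder, split), encoding="utf-8") as tgt:
        return list(zip((l.strip() for l in src), (l.strip() for l in tgt)))

# Hypothetical location of the extracted experiments folder.
train_pairs = load_split("data/experiments", "train")
print(len(train_pairs))
```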

## 3. Implementation of the Question Generators
We implemented our question classifier as well as the question generators based on the following code repositories:
* [Sentence Convolution Code in Torch](https://github.com/harvardnlp/sent-conv-torch)
* [Question Generation via Overgenerating Transformations and Ranking](http://www.cs.cmu.edu/~ark/mheilman/questions/)
* [Neural Question Generation](https://github.com/xinyadu/nqg)

## 4. Baseline Results
|              | Methods           | Bleu 1 | Bleu 2 | Bleu 3 | Bleu 4 | Meteor | Rouge_L |
|:------------:|:-----------------:|:------:|:------:|:------:|:------:|:------:|:-------:|
| Khan Academy | H&S               | 0.28   | 0.17   | 0.13   | 0.10   | 3.24   | 6.61    |
|              | Seq2Seq           | 19.84  | 7.68   | 4.02   | 2.29   | 6.44   | 23.11   |
|              | Attention Seq2Seq | 24.70  | 11.68  | 6.36   | 3.63   | 8.73   | 27.36   |
| TED-Ed       | H&S               | 0.38   | 0.22   | 0.17   | 0.15   | 3.00   | 6.52    |
|              | Seq2Seq           | 12.96  | 3.95   | 1.82   | 0.73   | 4.34   | 16.09   |
|              | Attention Seq2Seq | 15.83  | 5.63   | 2.63   | 1.15   | 5.32   | 17.69   |

The best Bleu 4 score achieved by the strongest of these methods (Attention Seq2Seq) exceeds 12 on SQuAD but stays below 4 on LearningQ, which indicates substantial room for improvement on educational question generation.

## 5. Contact
For any questions about the dataset, please contact Guanliang Chen via angus.glchen@gmail.com

## 6. Citation
If you are using LearningQ in your work, please cite:
```
@paper{ICWSM18LearningQ,
  author     = {Guanliang Chen and Jie Yang and Claudia Hauff and Geert-Jan Houben},
  title      = {LearningQ: A Large-scale Dataset for Educational Question Generation},
  conference = {International AAAI Conference on Web and Social Media},
  year       = {2018}
}
```
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
Dataset and source code for LearningQ: A Large-scale Dataset for Educational Question Generation (ICWSM 2018).

1. How the Dataset was Collected
LearningQ is a dataset for educational question generation. Specifically, it consists of i) 7K instructor-designed questions collected from TED-Ed and ii) 223K learner-generated questions collected from Khan Academy. The source documents (i.e., lecture videos and articles) from which the questions are generated are also included. We release the crawling code as part of LearningQ.

2. Files in the Dataset
We open-source not only i) the filtered data, which can be directly used to train educational question generators, but also ii) the originally-collected data from both TED-Ed and Khan Academy. The data files are listed below (nesting depth is indicated by the number of "+----" markers before each file/folder).

```
+ LearningQ
+---- README.txt
+---- code [crawling code for TED-Ed and Khan Academy]
+---- data [the originally-collected data and the filtered data]
+----+---- khan
+----+----+---- crawled_data [the originally-collected data from Khan Academy]
+----+----+----+---- topictree.json [the full hierarchical listing of Khan Academy's topic tree]
+----+----+----+---- topics [the information about each topic node in the full topic tree; each file is named by the topic's slug in Khan Academy and stored in the JSON format]
+----+----+----+---- topic_videos [the list of all videos for each topic node; each file is named by the topic's slug in Khan Academy and stored in the JSON format]
+----+----+----+---- all_video_links [the links of all lecture videos in Khan Academy; the file is stored in the JSON format]
+----+----+----+---- transcripts [the transcripts of all lecture videos]
+----+----+----+---- video_discussions [the originally-collected questions generated by learners for each lecture video; each file is named by the video's YouTube ID and stored in the JSON format]
+----+----+----+---- all_article_links [the links of all articles in Khan Academy; the file is stored in the JSON format]
+----+----+----+---- articles [the content of each article; each file is named by the article's ID in Khan Academy and stored in the JSON format]
+----+----+----+---- article_discussions [the originally-collected questions generated by learners for each article; each file is named by the article ID and stored in the JSON format]
+----+----+---- khan_labeled_data [the manually-labelled questions (whether a question is useful for learning or not) used to build the question classifier; each line in a file is one data sample, i.e., a manually-assigned label (1 for useful and 0 for non-useful) and the corresponding question]
+----+----+---- predicted_article_questions [the list of useful learning questions on articles; the file is stored in the JSON format]
+----+----+---- predicted_video_questions [the list of useful learning questions on lecture videos; the file is stored in the JSON format]
+----+---- teded
+----+----+---- crawled_data [the originally-collected data from TED-Ed]
+----+----+----+---- transcripts [the transcripts for lecture videos; each file is named by the video's YouTube ID]
+----+----+----+---- videos [the instructor-generated questions for each lecture video; each file is named by the video's title in TED-Ed and stored in the JSON format]
+----+---- experiments [the filtered data (i.e., predicted useful learning questions) which can be directly used as input for question generators; each file is named {para/src/tgt}_{train/dev/test}, which denotes its data type, i.e., source document (para), source sentences (src) or target questions (tgt), and its usage, i.e., whether it is used for training (train), validation (dev) or testing (test)]
```

3. Implementation of the Question Generators
We implemented our question classifier as well as the question generators based on the following code repositories:
i) Sentence Convolution Code in Torch: https://github.com/harvardnlp/sent-conv-torch
ii) H&S comparison method: http://www.cs.cmu.edu/~ark/mheilman/questions/
iii) Attention Seq2Seq: https://github.com/xinyadu/nqg
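
Note: the khan_labeled_data files described in Section 2 pair a 0/1 label with a question on each line. Below is a minimal parsing sketch; it assumes the label comes first and is separated from the question by a tab, so adjust the split if the actual separator differs.

```
def read_labeled_questions(filepath):
    # Yield (label, question) pairs from a khan_labeled_data file.
    samples = []
    with open(filepath) as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            label, question = line.split("\t", 1)  # assumed tab-separated
            samples.append((int(label), question))
    return samples
```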

4. Contact
For any questions about the dataset, please contact Guanliang Chen via angus.glchen@gmail.com or guanliang.chen@tudelft.nl

5. Citation
If you use LearningQ in a publication, please cite the following paper:

```
@paper{ICWSM18Guanliang,
  author     = {Guanliang Chen and Jie Yang and Claudia Hauff and Geert-Jan Houben},
  title      = {LearningQ: A Large-scale Dataset for Educational Question Generation},
  conference = {International AAAI Conference on Web and Social Media},
  year       = {2018}
}
```
--------------------------------------------------------------------------------
/code/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/code/.DS_Store
--------------------------------------------------------------------------------
/code/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/code/__init__.py
--------------------------------------------------------------------------------
/code/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/code/__init__.pyc
--------------------------------------------------------------------------------
/code/chromedriver:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/code/chromedriver
--------------------------------------------------------------------------------
/code/functions.py:
--------------------------------------------------------------------------------
'''
Created on 1 Dec 2017
@author: Guanliang Chen
'''

import json, nltk
from langdetect import detect


def save_file(object, path):
    # Serialize an object to JSON and write it to `path`.
    file = open(path, "w")
    file.write(json.dumps(object))
    file.close()


def check_language(text):
    # Return True iff `text` is detected as English; detection errors count as non-English.
    mark = False
    try:
        if detect(text) == "en":
            mark = True
    except:
        pass
    return mark


def tokenize_text(text):
    # Lowercase, replace line breaks with spaces, strip, and re-join NLTK word tokens.
    text = text.lower().replace('\n', ' ').strip()
    return " ".join(nltk.word_tokenize(text))


def gather_subtopics(object, topic_hierarchy):
    # Recursively walk a topic-tree node and record each subtopic slug under its main topic.
    if isinstance(object, dict) and "children" in object.keys():
        main_topic = object["relative_url"].split("/")[1]
        topic = object["relative_url"].split("/")[-1]
        topic_hierarchy[main_topic].append(topic)
        for sub_object in object["children"]:
            gather_subtopics(sub_object, topic_hierarchy)
    elif isinstance(object, list):
        pass


def gather_topic_hierarchy(path):
    # Build a {domain slug: [subtopic slugs]} map from the downloaded topictree.json.
    topictree_object = json.loads(open(path + "topictree.json").read())
    topic_hierarchy = {}
    for element in topictree_object["children"]:
        topic = element["domain_slug"]
        topic_hierarchy[topic] = []
        for sub_element in element["children"]:
            gather_subtopics(sub_element, topic_hierarchy)
    return topic_hierarchy
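
# Example usage (an illustrative sketch; it assumes topictree.json has already
# been downloaded into `path` by code/khan/video_crawler.py):
#   hierarchy = gather_topic_hierarchy("../data/khan/crawled_data/")
#   for domain in hierarchy:
#       print("%s has %d subtopics" % (domain, len(hierarchy[domain])))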
--------------------------------------------------------------------------------
/code/functions.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/code/functions.pyc
--------------------------------------------------------------------------------
/code/khan/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/code/khan/__init__.py
--------------------------------------------------------------------------------
/code/khan/article_crawler.py:
--------------------------------------------------------------------------------
'''
Created on Dec 1, 2017
@author: Guanliang Chen
'''

import sys
reload(sys)
sys.setdefaultencoding("utf-8")


from code.functions import save_file, gather_topic_hierarchy
import urllib2, json, os, time
from selenium import webdriver


def collect_article_discussion(path):
    article_map = json.loads(open(path + "all_article_links", "r").read())
    print("There is a total of %d articles." % len(article_map))

    # Collected articles
    collected_articles = set()
    if not os.path.isdir(path + "article_discussions/"):
        os.mkdir(path + "article_discussions/")
    collected_files = os.listdir(path + "article_discussions/")
    for collected_file in collected_files:
        if collected_file not in [".DS_Store"]:
            collected_articles.add(collected_file)
    print("%d article discussions have been collected." % len(collected_articles))

    # Collect article discussions
    driver = webdriver.Chrome(executable_path='../chromedriver')
    driver.maximize_window()

    for article_id in article_map.keys():
        if article_id not in collected_articles and os.path.exists(path + "articles/" + article_id):

            driver.get(article_map[article_id]["ka_url"])
            time.sleep(3)

            # Keep clicking "Show more comments" until the button disappears.
            more_mark = True
            while more_mark:
                try:
                    element = driver.find_element_by_xpath("//input[@value='Show more comments']")
                    driver.execute_script("arguments[0].scrollIntoView();", element)

                    driver.find_element_by_xpath("//input[@value='Show more comments']").click()
                    time.sleep(3)
                except:
                    # print("More discussion button error...\t%s" % e)
                    more_mark = False

            try:
                discussion_array = []
                questions = driver.find_elements_by_xpath('//div[@class="thread "]/div[@class="question discussion-item"]/div[@class="discussion-content"]')
                for question in questions:
                    discussion_array.append({"question": question.text, "answers": []})

                if len(questions) != 0:
                    save_file(discussion_array, path + "article_discussions/" + article_id)
                    collected_articles.add(article_id)
            except Exception as e:
                # print("Storing error...\t%s" % e)
                print(e)

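
# Article bodies are fetched through Khan Academy's public v1 API
# (http://www.khanacademy.org/api/v1/articles/<article_id>), the same API
# family as the /api/v1/topic/<slug> and /api/v1/topictree endpoints used
# in video_crawler.py.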
def download_articles(path):
    article_map = json.loads(open(path + "all_article_links", "r").read())
    print("There is a total of %d articles." % len(article_map))

    collected_articles = set()
    if not os.path.isdir(path + "articles/"):
        os.mkdir(path + "articles/")
    article_files = os.listdir(path + "articles/")
    for article_file in article_files:
        if article_file != ".DS_Store":
            collected_articles.add(article_file)
    print("%d articles have been collected." % len(collected_articles))

    for article_id in article_map.keys():
        if article_id not in collected_articles:
            try:
                article_api = "http://www.khanacademy.org/api/v1/articles/" + article_id
                response = urllib2.urlopen(article_api)
                article = json.loads(response.read())
                save_file(article, path + "articles/" + article_id)
            except:
                # print("http://www.khanacademy.org/api/v1/articles/" + article_id)
                pass


def gather_article_ids(path):
    topic_hierarchy = gather_topic_hierarchy(path)
    component_topic_relation = {}
    for topic in topic_hierarchy.keys():
        for component in topic_hierarchy[topic]:
            component_topic_relation[component] = topic

    article_map = {}
    topic_files = os.listdir(path + "topics/")
    for topic_file in topic_files:
        if topic_file != ".DS_Store":
            jsonObject = json.loads(open(path + "topics/" + topic_file, "r").read())
            for tuple in jsonObject["children"]:
                if tuple["kind"] == "Article":
                    article_map[tuple["internal_id"]] = {"ka_url": tuple["url"], "topic_category": component_topic_relation[topic_file]}

    print("There is a total of %d articles." % len(article_map))
    save_file(article_map, path + "all_article_links")


######################################################################
def main():
    data_path = '../../data/khan/khan_crawled_data/'

    # Step 1: gather article list
    gather_article_ids(data_path)

    # Step 2: download articles
    download_articles(data_path)

    # Step 3: gather discussions
    collect_article_discussion(data_path)


if __name__ == "__main__":
    main()
    print("Done.")
--------------------------------------------------------------------------------
/code/khan/video_crawler.py:
--------------------------------------------------------------------------------
'''
Created on Dec 1, 2017
@author: Guanliang Chen
'''

import sys
reload(sys)
sys.setdefaultencoding("utf-8")

from code.functions import save_file, gather_topic_hierarchy
import urllib2, json, os, time
from selenium import webdriver


def collect_video_discussion(path):
    video_link_map = json.loads(open(path + "all_video_links", "r").read())
    print("There is a total of %d video links." % len(video_link_map))

    # Considered topics
    considered_topic_array = ["science", "humanities", "computing", "partner-content", "economics-finance-domain", "test-prep", "college-careers-more", "math"]

    # Collected videos
    collected_videos = set()
    if not os.path.isdir(path + "updated_video_discussions/"):
        os.mkdir(path + "updated_video_discussions/")
    collected_files = os.listdir(path + "updated_video_discussions/")
    for collected_file in collected_files:
        if collected_file not in [".DS_Store"]:
            collected_videos.add(collected_file)

    # Collect video discussions
    driver = webdriver.Chrome(executable_path='../chromedriver')
    driver.maximize_window()
    for youtube_id in video_link_map.keys():
        if youtube_id not in collected_videos and os.path.exists(path + "transcripts/" + youtube_id):
            topic_category = video_link_map[youtube_id]["topic_category"]

            if topic_category not in considered_topic_array:
                continue

            driver.get(video_link_map[youtube_id]["ka_url"])
            time.sleep(3)

            # Keep expanding the thread until no "Show more comments" button is left.
            more_discussion_mark = True
            while more_discussion_mark:
                try:
                    element = driver.find_element_by_xpath("//input[@value='Show more comments']")
                    driver.execute_script("arguments[0].scrollIntoView();", element)

                    driver.find_element_by_xpath("//input[@value='Show more comments']").click()
                    time.sleep(2)
                except Exception as e:
                    # print("More discussion button error...\t%s" % e)
                    more_discussion_mark = False

            try:
                discussion_array = []
                questions = driver.find_elements_by_xpath('//div[@class="thread "]/div[@class="question discussion-item"]/div[@class="discussion-content"]')
                for question in questions:
                    discussion_array.append({"question": question.text, "answers": []})

                if len(discussion_array) != 0:
                    save_file(discussion_array, path + "updated_video_discussions/" + youtube_id)
            except Exception as e:
                # print("Storing error...\t%s" % e)
                print(e)


def download_transcript_from_khan(path):
    video_link_map = json.loads(open(path + "all_video_links", "r").read())
    print("There is a total of %d video links." % len(video_link_map))

    # Collected videos
    collected_videos = set()
    if not os.path.isdir(path + "transcripts/"):
        os.mkdir(path + "transcripts/")
    collected_files = os.listdir(path + "transcripts/")
    for collected_file in collected_files:
        if collected_file not in [".DS_Store"]:
            collected_videos.add(collected_file)
    print("%d video transcripts have been collected." % len(collected_videos))

    # Collect video transcripts
    driver = webdriver.Chrome(executable_path='../chromedriver')
    driver.maximize_window()

    for youtube_id in video_link_map.keys():
        if youtube_id not in collected_videos:
            driver.get(video_link_map[youtube_id]["ka_url"])
            time.sleep(2)
            try:
                # Open the "Transcript" tab and copy its text.
                driver.find_element_by_xpath("//a[contains(text(),'Transcript')]").click()
                transcript = driver.find_element_by_xpath("//ul[@itemprop='transcript']").text
                if transcript != "":
                    outFile = open(path + "transcripts/" + youtube_id, "w")
                    outFile.write(transcript)
                    outFile.close()
                    collected_videos.add(youtube_id)
            except Exception as e:
                print(e)
                time.sleep(1)


def collect_video_links(path):
    # Check collected components
    collected_topics = set()
    if not os.path.isdir(path + "topic_videos/"):
        os.mkdir(path + "topic_videos/")
    files = os.listdir(path + "topic_videos/")
    for file in files:
        collected_topics.add(file)
    print("%d topics have been processed." % len(collected_topics))

    # Course components (topics) => Filter out non-EN components
    course_components = set()
    topics = os.listdir(path + "topics/")
    for topic in topics:
        if topic != ".DS_Store":
            object = json.loads(open(path + "topics/" + topic, "r").read())
            if object["source_language"] == "en":
                course_components.add(topic)
    print("There are %d course components." % len(course_components))

    # Download video list for each topic
    for topic in course_components:
        try:
            if topic not in collected_topics:
                time.sleep(1)
                video_api = "http://www.khanacademy.org/api/v1/topic/" + topic + "/videos"
                response = urllib2.urlopen(video_api)
                videos = json.loads(response.read())
                if len(videos) != 0:
                    save_file(videos, path + "topic_videos/" + topic)
                    collected_topics.add(topic)
        except:
            time.sleep(10)

    # Extract video links
    topic_hierarchy = gather_topic_hierarchy(path)
    component_topic_relation = {}
    for topic in topic_hierarchy.keys():
        for component in topic_hierarchy[topic]:
            component_topic_relation[component] = topic

    video_link_map = {}
    video_files = os.listdir(path + "topic_videos/")
    for video_file in video_files:
        if video_file != ".DS_Store":
            object = json.loads(open(path + "topic_videos/" + video_file, "r").read())
            for tuple in object:
                video_link_map[tuple["youtube_id"]] = {"ka_url": tuple["ka_url"], "component_name": video_file, "topic_category": component_topic_relation[video_file]}
    print("There is a total of %d videos." % len(video_link_map))
    save_file(video_link_map, path + "all_video_links")


def iterate_topictree_nodes(object, level, array):
    # Depth-first traversal collecting the relative URL of every topic node.
    if isinstance(object, dict) and "children" in object.keys():
        array.append(object["relative_url"])
        for sub_object in object["children"]:
            iterate_topictree_nodes(sub_object, level+1, array)
    elif isinstance(object, list):
        pass


def get_topic_links(path, url):
    # ==> Download topictrees
    response = urllib2.urlopen(url)
    topictree = response.read()
    save_file(topictree, path + 'topictree.json')

    # ==> Iterate over the whole topictree
    level = 0
    course_components = []
    iterate_topictree_nodes(json.loads(topictree), level, course_components)
    # save_file(course_components, path + "course_components")

    # ==> Collect links for each topic
    if not os.path.isdir(path + 'topics'):
        os.mkdir(path + 'topics')
    processed_topics = set()
    files = os.listdir(path + "topics")
    for file in files:
        processed_topics.add(file)
    print("%d topics have been processed." % len(processed_topics))

    for component in course_components:
        try:
            topic = component.split("/")[-1]
            if topic != '' and topic not in processed_topics:
                time.sleep(1)
                exercise_api = "http://www.khanacademy.org/api/v1/topic/" + topic
                response = urllib2.urlopen(exercise_api)
                response = json.loads(response.read())
                save_file(response, path + "topics/" + topic)
                processed_topics.add(topic)
        except Exception as e:
            print(e)
            time.sleep(10)


######################################################################
def main():
    data_path = '../../data/khan/khan_crawled_data/'

    # Step 1: retrieve and save topictree file
    url = 'http://www.khanacademy.org/api/v1/topictree'
    get_topic_links(data_path, url)

    # Step 2: collect video links
    collect_video_links(data_path)

    # Step 3: download video transcripts
    download_transcript_from_khan(data_path)

    # Step 4: gather discussion on video
    collect_video_discussion(data_path)


if __name__ == "__main__":
    main()
    print("Done.")
--------------------------------------------------------------------------------
/code/teded/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AngusGLChen/LearningQ/82ee78bf5c77b8d9293f0894cf5e4448914b8bfc/code/teded/__init__.py
--------------------------------------------------------------------------------
/code/teded/crawler.py:
--------------------------------------------------------------------------------
'''
Created on Nov 26, 2017
@author: Guanliang Chen
'''

import sys
reload(sys)
sys.setdefaultencoding("utf-8")

# urlparse is needed by merge_gather_data() below.
import json, os, time, random, shutil, urlparse
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains


def merge_gather_data(path):

    data_dump = []

    # Gather category relation
    category_file = open(path + "category_video_relation", "r")
    category_video_map = json.loads(category_file.read())
    video_set = set()
    video_category_map = {}

    title_link_map = {}
    title_set = set()

    for category in category_video_map.keys():
        videos = category_video_map[category]
        for video in videos:
            if video["url"] not in video_set:
                video_set.add(video["url"])
                video_category_map[video["url"]] = []
            video_category_map[video["url"]].append(category)

            url = video["url"]
            title = video["video_title_length"]
            if title not in title_set:
                title_set.add(title)
                title_link_map[title] = []

            title_link_map[title].append(category)

    # Gather collected videos
    video_files = os.listdir(path + "ted_videos/")
    for video_file in video_files:
        video_object = json.loads(open(path + "ted_videos/" + video_file, "r").read())
        video_youtube_link = video_object["video_youtube_link"]

        parsed = urlparse.urlparse(video_youtube_link)
        video_youtube_id = str(urlparse.parse_qs(parsed.query)['v'][0])

        if os.path.exists(path + "transcripts/" + video_youtube_id):
            transcript_file = open(path + "transcripts/" + video_youtube_id)
            transcript = transcript_file.read()
            lines = transcript.split("\n")

            # Skip the "Title: ..." header that get_transcript.py writes,
            # i.e., everything up to and including the first blank line.
            index = None
            for i in range(len(lines)):
                if lines[i] == "":
                    index = i + 1
                    break

            processed_transcript = ""
            for i in range(index, len(lines)):
                processed_transcript += (lines[i] + " ")
            # processed_transcript = processed_transcript.lower().strip()

            video_object["transcript"] = processed_transcript
            if video_object["video_title_length"] in title_link_map.keys():
                video_object["categories"] = title_link_map[video_object["video_title_length"]]
            else:
                video_object["categories"] = []

            if "quizzes" in video_object.keys():
                data_dump.append(video_object)

    out_file = open(path + "data_dump", "w")
    out_file.write(json.dumps(data_dump))
    out_file.close()


def collect_category_relation(path):
    driver = webdriver.Chrome(executable_path='../chromedriver')
    driver.maximize_window()

    home_link = "https://ed.ted.com/lessons"
    category_tuples = []

    driver.get(home_link)

    # Click "log in"
    click_action(driver, "//a[@href='/session']")

    # Log in (fill in your TED-Ed credentials here)
    driver.find_element_by_xpath("//input[@id='user_email']").send_keys('')
    driver.find_element_by_xpath("//input[@id='user_password']").send_keys('')
    click_action(driver, "//input[@name='commit']")

    # Gather subject categories
    xml_categories = driver.find_elements_by_xpath("//li[@class='parent ']/a")
    for xml_category in xml_categories:
        category = xml_category.text
        category_link = xml_category.get_attribute('href')
        category_tuples.append([category, category_link])

    category_videos_map = {}

    for category_tuple in category_tuples:
        category = category_tuple[0]
        category_link = category_tuple[1]

        category_videos_map[category] = []

        # Click a subject category and gather the number of total pages
        driver.get(category_link)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        try:
            driver.find_element_by_xpath("//li[@class='last next']/a").click()
            total_pages = driver.find_element_by_xpath("//li[@class='page active']").text
            total_pages = int(total_pages)
        except:
            total_pages = 1

        for i in range(1, total_pages + 1):
            video_group_link = category_link + "&page=" + str(i)
            driver.get(video_group_link)

            videos = driver.find_elements_by_xpath("//div[@class='video-text']/a")

            for video in videos:
                url = video.get_attribute('href')
                video_title_length = video.text
                category_videos_map[category].append({"video_title_length": video_title_length, "url": url})

    out_file = open(path + "category_video_relation", "w")
    out_file.write(json.dumps(category_videos_map))
    out_file.close()


def click_action(driver, xpath):
    # Click an element via JavaScript (more robust than .click() for off-screen elements).
    element = driver.find_element_by_xpath(xpath)
    driver.execute_script("arguments[0].click();", element)
    time.sleep(2)


def collect_data(path):
    driver = webdriver.Chrome(executable_path='../chromedriver')
    driver.maximize_window()

    home_link = "https://ed.ted.com/lessons"
    driver.get(home_link)

    # Click "log in"
    click_action(driver, "//a[@href='/session']")

    # Log in (fill in your TED-Ed credentials here)
    driver.find_element_by_xpath("//input[@id='user_email']").send_keys('')
    driver.find_element_by_xpath("//input[@id='user_password']").send_keys('')
    click_action(driver, "//input[@name='commit']")

    # Gather collected video list
    collected_videos = set()
    if not os.path.isdir(path + "videos/"):
        os.mkdir(path + 'videos/')
    video_files = os.listdir(path + "videos/")
    for video_file in video_files:
        if video_file != ".DS_Store":
            collected_videos.add(json.loads(open(path + "videos/" + video_file, "r").read())["video_title_length"])
    print("There are %d collected videos." % len(collected_videos))

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    driver.find_element_by_xpath("//li[@class='last next']/a").click()
    total_pages = driver.find_element_by_xpath("//li[@class='page active']").text
    total_pages = int(total_pages)

    for i in range(1, total_pages + 1):
        print("Processing page %d" % i)
        video_group_link = home_link + "?page=" + str(i)
        driver.get(video_group_link)

        videos = driver.find_elements_by_xpath("//div[@class='video-text']/a")
        video_tuples = []
        for video in videos:
            video_link = video.get_attribute('href')
            video_title_length = video.text
            video_tuples.append([video_link, video_title_length])

        for tuple in video_tuples:
            video_link = tuple[0]
            video_title_length = tuple[1]

            try:

                if video_title_length in collected_videos:
                    continue

                video_json_object = {"video_link": video_link, "video_title_length": video_title_length}

                driver.get(video_link)
                video_description = driver.find_element_by_xpath("//div[@class='lessonDescription']").text
                video_json_object["video_description"] = video_description

                # Click the "Think" button
                try:
                    # For some videos, there are no quiz questions
                    time.sleep(0.5)
                    driver.find_element_by_xpath("//a[@id='think-link']")
                    click_action(driver, "//a[@id='think-link']")
                except:
                    # No quiz: just record the YouTube link and move on.
                    video_iframe = driver.find_element_by_xpath("//iframe[@id='playerContainer']")
                    driver.switch_to.frame(video_iframe)
                    video_youtube_link = driver.find_element_by_xpath("//a[@class='ytp-watermark yt-uix-sessionlink']").get_attribute('href')
                    video_json_object["video_youtube_link"] = video_youtube_link

                    out_file = open(path + "videos/" + video_title_length, "w")
                    out_file.write(json.dumps(video_json_object))
                    out_file.close()

                    continue

                # Locate the quizzes
                num_quiz_divs = 0
                while num_quiz_divs == 0:
                    quiz_divs = driver.find_elements_by_xpath("//div[@data-position]")
                    time.sleep(0.5)
                    num_quiz_divs = len(quiz_divs)

                quizzes = []

                for j in range(num_quiz_divs):
                    # Loop over quizzes
                    driver.get(video_link + "/review_open#question-" + str(j+1))
                    time.sleep(0.5)
                    driver.get(video_link + "/review_open#question-" + str(j+1))
                    time.sleep(0.5)

                    open_question_mark = None
                    try:
                        driver.find_element_by_xpath("//div[@data-position=" + str(j) + "]//div[@class='panel-response']")
                        open_question_mark = True
                    except:
                        open_question_mark = False

                    # Quizzes
                    quiz_description = None
                    quiz_options = []

                    if open_question_mark:
                        # For open-ended questions

                        # Mouse hover
                        element = driver.find_element_by_xpath("//div[@data-position=" + str(j) + "]//div[@class='panel-response']")
                        hover = ActionChains(driver).move_to_element(element)
                        hover.perform()

                        quiz_description = driver.find_element_by_xpath("//div[@data-position=" + str(j) + "]//div[@class='panel-response']/div/h5").text
                        quizzes.append({"quiz_description": quiz_description, "question_type": "open-ended"})

                    else:
                        # For multiple-choice questions

                        # Mouse hover
                        element = driver.find_element_by_xpath("(//div[@class='question scroll uiScroll text-ultralight'])[1]")
                        hover = ActionChains(driver).move_to_element(element)
                        hover.perform()

                        # Collect textual information
                        quiz_text = driver.find_element_by_xpath("(//div[@class='question scroll uiScroll text-ultralight'])[1]").text
                        lines = quiz_text.split("\n")
                        quiz_description = lines[0]

                        for k in range(1, len(lines), 2):
                            letter_id = lines[k]
                            numerical_id = k/2
                            option = lines[k+1]
                            quiz_options.append({"letter_id": letter_id, "option_text": option, "numerical_id": numerical_id})

                        # Collect answer & hint
                        hint_mark = False
                        answer_mark = False

                        correct_answer_id = None
                        hint = None

                        num_options = len(quiz_options)
                        for k in range(num_options):
                            # Loop over options

                            # Go back to the quiz question
                            driver.get(video_link + "/review_open#question-" + str(j+1))
                            time.sleep(0.5)
                            driver.get(video_link + "/review_open#question-" + str(j+1))
                            time.sleep(0.5)

                            # Mouse hover
                            element = driver.find_element_by_xpath("//div[@class='clearfix a answer'][1]")
                            hover = ActionChains(driver).move_to_element(element)
                            hover.perform()
                            time.sleep(0.5)

                            # Select answer
                            driver.find_element_by_xpath("(//div[@class='clearfix a answer'])[1]").click()
                            # Click "Save my answer"
                            driver.find_element_by_xpath("(//button[@class='check'])[" + str(k+1) + "]").click()
                            time.sleep(0.5)

                            msg_mark = False
                            while not msg_mark:
                                try:
                                    msg_text = driver.find_element_by_xpath("//div[@class='g']").text
                                    msg_mark = True
                                except:
                                    time.sleep(0.5)

                            if msg_text == "Correct!":
                                correct_answer_id = k
                                answer_mark = True
                                # print("correct answer %d" % correct_answer_id)

                            if not hint_mark and "That wasn" in msg_text:
                                hint = driver.find_element_by_xpath("//button[@class='btnWhite vid']").get_attribute('data-seconds')
                                hint_mark = True
                                # print("hint is %s" % hint)

                            if hint_mark and answer_mark:
                                break

                        quizzes.append({"quiz_description": quiz_description, "question_type": "multiple-choices", "quiz_options": quiz_options, "hint": hint, "answer": correct_answer_id})

                video_json_object["quizzes"] = quizzes

                # Locate the video
                video_iframe = driver.find_element_by_xpath("//iframe[@id='playerContainer']")
                driver.switch_to.frame(video_iframe)
                video_youtube_link = driver.find_element_by_xpath("//a[@class='ytp-watermark yt-uix-sessionlink']").get_attribute('href')

                video_json_object["video_youtube_link"] = video_youtube_link

                out_file = open(path + "videos/" + video_title_length, "w")
                out_file.write(json.dumps(video_json_object))
                out_file.close()

                collected_videos.add(video_title_length)

            except:
                # print("Failed for %s" % video_title_length)
                pass


def main():
    data_path = '../../data/teded/teded_crawled_data/'

    # Step 1: collect video-category information
    collect_category_relation(data_path)

    # Step 2: collect questions
    collect_data(data_path)


if __name__ == "__main__":
    main()
    print("Done.")
--------------------------------------------------------------------------------
/code/teded/get_all_transcripts.py:
--------------------------------------------------------------------------------
'''
Created on Nov 30, 2017
@author: Guanliang Chen
'''

import os, json, urlparse

def main():
    # This script processes the TED-Ed crawl, so it reads from the teded
    # directory produced by crawler.py.
    path = '../../data/teded/teded_crawled_data/'

    collected_transcripts = set()
    if not os.path.isdir(path + "transcripts/"):
        os.mkdir(path + "transcripts/")
    transcript_files = os.listdir(path + "transcripts/")
    for transcript_file in transcript_files:
        collected_transcripts.add(transcript_file)

    video_files = os.listdir(path + "videos/")
    for video_file in video_files:
        video_jsonObject = json.loads(open(path + "videos/" + video_file, "r").read())
        video_youtube_link = video_jsonObject["video_youtube_link"]

        parsed = urlparse.urlparse(video_youtube_link)
        video_youtube_id = str(urlparse.parse_qs(parsed.query)['v'][0])

        if video_youtube_id not in collected_transcripts:
            command = "python ./get_transcript.py " + video_youtube_link + " --file " + path + "transcripts/" + video_youtube_id
            os.system(command)
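            # The command above uses get_transcript.py's CLI (defined below via
            # argparse): a positional YouTube URL plus --file for the output
            # path. Other available flags are --overwrite, --open,
            # --reducenewlines and --printfilepath.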

if __name__ == "__main__":
    main()
    print("Done.")
--------------------------------------------------------------------------------
/code/teded/get_transcript.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib2
import re
import argparse
import sys
import os
import urlparse
import subprocess


# Set up argument parsing.
parser = argparse.ArgumentParser(description='Retrieve plain text transcripts from YouTube videos.')
parser.add_argument('youtube_url', metavar='url', help='The URL to the YouTube video you want to'
                    ' retrieve a transcript for.')
parser.add_argument('--file', help='The path to the file you want to save the transcript to. '
                    'To use the video\'s title as the filename, specify a directory.'
                    ' Example: ~/Desktop/transcript.txt or ~/Desktop', action='store', dest='file')
parser.add_argument('--overwrite', help='Overwrites existing file at save location, if present.', action='store_true')
parser.add_argument('--open', help='Open the created transcript.', action='store_true')
parser.add_argument('--reducenewlines', help='Remove all newlines except those immediately following a period.',
                    action='store_true')
parser.add_argument('--printfilepath', help='Prints the output file path on console.', action='store_true')

args = parser.parse_args()

# Detect the current platform for platform-specific code later on.
if sys.platform.startswith('darwin'):
    platform = 'mac'
elif os.name == 'nt':
    platform = 'windows'
elif os.name == 'posix':
    platform = 'linux'
else:
    platform = 'unknown'


class VidProperties:
    """Get and store video attributes (ID & Title)."""
    def __init__(self):
        self.id = None
        self.title = None
        self.transcript = None
        self.filename = None
        try:
            self.id = parse_url(args.youtube_url)
        except ValueError:
            print 'ERROR: You do not appear to have entered a valid YouTube address.'
            sys.exit(1)
        self.title = get_title(self.id)
        self.filename = create_filename(self.title)


def parse_url(vid_url):
    """
    Take video URL, perform basic sanity check, then filter out video ID.
    @param vid_url: URL of the video to get transcript from.
    @type vid_url: str
    """
    if 'watch?v' in vid_url:
        vid_code = re.findall(ur'^[^=]+=([^&]+)', vid_url)
    elif 'youtu.be/' in vid_url:
        vid_code = re.findall(ur'youtu\.be/([^&]+)', vid_url)
    else:
        raise ValueError()
    return vid_code[0]


def get_title(vid_id):
    """
    Get title of video from ID.
    @param vid_id: YouTube ID for the video.
    @type vid_id: str
    """
    video_info = urllib2.urlopen('http://youtube.com/get_video_info?video_id=' + vid_id)
    video_info = video_info.read()
    if urlparse.parse_qs(video_info)['status'][0] == 'fail':
        print "WARNING: Couldn't get video title. This probably means you specified an invalid URL."
        return None
    else:
        return urlparse.parse_qs(video_info)['title'][0]


def get_transcript():
    """Retrieve XML transcript from video ID. Works for human-created transcripts only."""
    not_found_error = 'ERROR: No transcript found. This can mean one of several things:\n- There is no ' \
                      'human-created transcript for this video.\n- The video URL was entered incorrectly.\n' \
                      '- The video has "burned-on" captions, where the captions are part of the video track. ' \
                      'There is no way to extract burned-in captions.'
    try:
        transcript = urllib2.urlopen('http://video.google.com/timedtext?lang=en&v=' + vidinfo.id)
        transcript_xml = transcript.read()
    except urllib2.HTTPError as error:
        if '404' in str(error):
            print not_found_error
            sys.exit(1)
        else:
            raise error

    if '<transcript>' not in transcript_xml:
        print not_found_error
        sys.exit(1)
    return transcript_xml


def remove_extra_linebreaks(string):
    """
    Remove extraneous linebreaks from text.
    If line ends with a period, insert a linebreak.
    @param string: The transcript to remove breaks from.
    @type string: str
    @return: Formatted text.
    """
    string_by_line = string.split('\n')
    new_string = str()
    for line in string_by_line:
        if line.endswith('.'):
            new_string += line + '\n'
        else:
            new_string += line + ' '
    return new_string


def format_transcript(transcript):
    """
    Receives the full XML transcript as plain text.
    @param transcript: Transcript as XML file.
    @type transcript: str
    """
    # Remove XML tags.
    transcript = re.sub("</text>", "\n", transcript)
    transcript = re.sub("<[^>]+>", "", transcript)

    # Remove encoded HTML tags.
    transcript = re.sub("&lt;.*?&gt;", "", transcript)

    # Replace ASCII character codes with the actual character.
    rep = {"&#39;": "'", "&gt;": ">", "&quot;": '"', "&lt;": "<"}

    # Slick single-pass regex replacement.
    rep = dict((re.escape(k), v) for k, v in rep.iteritems())
    pattern = re.compile("|".join(rep.keys()))
    transcript = pattern.sub(lambda m: rep[re.escape(m.group(0))], transcript)

    # Remove all newlines except those immediately following a period to improve readability.
    if args.reducenewlines:
        transcript = remove_extra_linebreaks(transcript)

    # If text is more than 75% capitalized, we make it all lowercase for easier reading.
    num_upper_chars = len((re.findall("[A-Z]", transcript)))
    num_chars = len((re.findall("[a-zA-Z]", transcript)))
    percent_upper = (float(num_upper_chars) / float(num_chars)) * 100
    if percent_upper >= 75:
        transcript = transcript.lower()

    return transcript


def create_filename(title):
    """
    Create filename-safe version of video title.
    @param title: Title of the video.
    @type title: str
    """
    # Remove characters that will cause problems in filenames.
    rep = {"/": "-", ":": " -", "\\": '-', "<": "-", ">": "-", "|": "-", "?": "", "*": ""}

    rep = dict((re.escape(k), v) for k, v in rep.iteritems())
    pattern = re.compile("|".join(rep.keys()))

    return pattern.sub(lambda m: rep[re.escape(m.group(0))], title)


# EXECUTION STARTS HERE.

# Collect the video ID, transcript and title.
vidinfo = VidProperties()
raw_transcript = get_transcript()
vidinfo.transcript = format_transcript(raw_transcript)

# Validate output path.
outfile = os.path.expanduser(args.file)

# If user has not specified a filename, use the video title.
if os.path.isdir(outfile):
    outfile = os.path.join(outfile, vidinfo.filename + '.txt')

# Check if output file already exists.
if not args.overwrite:
    if os.path.isfile(outfile):
        print 'ERROR: A file already exists in the same place with the same name.\n' \
              'Please specify a different name or location.'
        sys.exit(1)

# Write transcript to file.
try:
    with open(outfile, 'w') as output_file:
        output_file.write('Title: ' + vidinfo.title + '\n\n')
        output_file.write(vidinfo.transcript)
except IOError as errtext:
    if 'No such file or directory' in str(errtext):
        print "ERROR: The destination folder you've specified does not exist. Please check the path and try again."
        sys.exit(1)
    else:
        raise errtext

# Print filename to console.
if args.printfilepath:
    print outfile

# Open created file.
if args.open:
    if platform == 'mac':
        subprocess.call(['open', outfile])
    elif platform == 'windows':
        os.startfile(outfile)
    elif platform == 'linux':
        subprocess.call(('xdg-open', outfile))
    else:
        print 'WARNING: Cannot detect your operating system. Unable to open the transcript file automatically.'
--------------------------------------------------------------------------------
/code/test.py:
--------------------------------------------------------------------------------
'''
Created on 14 Apr 2018

@author: Angus
'''

import sys
reload(sys)
sys.setdefaultencoding("utf-8")

import os

def list_files(startpath):
    # Print an indented tree of the directories under `startpath`.
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = '+' + '-' * 4 * (level)
        print('{}{}'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        # for f in files:
        #     print('{}{}'.format(subindent, f))

list_files("/Users/Angus/Projects/LearningQ")
--------------------------------------------------------------------------------