├── Dialect Stemming
├── WLF
├── FNAW
├── bin
│ └── stemming
│ │ └── Stemmer.class
├── .classpath
├── .project
├── .settings
│ └── org.eclipse.jdt.core.prefs
├── output.txt
└── src
│ └── stemming
│ └── Stemmer.java
├── Collecting Twitter Data
├── R
│ └── R workshopKSU.R
└── Python
│ ├── UserTweets.ipynb
│ └── SearchWord.ipynb
├── README.md
└── Cleaning Code
└── Main (1).java
/Dialect Stemming/WLF:
--------------------------------------------------------------------------------
1 | This file is used to store the user's tweets
2 |
--------------------------------------------------------------------------------
/Dialect Stemming/FNAW:
--------------------------------------------------------------------------------
1 | ببشت تجوري دلاغات ابجورات دريول فريزر بصمة بصمه بلكونة بلكونه بخت بوفيه تليفون تلفزيون خوش روج ساندويش سندويش شاليه
--------------------------------------------------------------------------------
/Dialect Stemming/bin/stemming/Stemmer.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ArwaAlrazooq/Arabic-twitter-analysis/HEAD/Dialect Stemming/bin/stemming/Stemmer.class
--------------------------------------------------------------------------------
/Dialect Stemming/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/Dialect Stemming/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 | 	<name>Dialect Stemming</name>
4 | 	<comment></comment>
5 | 	<projects>
6 | 	</projects>
7 | 	<buildSpec>
8 | 		<buildCommand>
9 | 			<name>org.eclipse.jdt.core.javabuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 	</buildSpec>
14 | 	<natures>
15 | 		<nature>org.eclipse.jdt.core.javanature</nature>
16 | 	</natures>
17 | </projectDescription>
18 |
--------------------------------------------------------------------------------
/Dialect Stemming/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.7
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.source=1.7
12 |
--------------------------------------------------------------------------------
/Collecting Twitter Data/R/R workshopKSU.R:
--------------------------------------------------------------------------------
# install.packages(c("ROAuth", "RCurl", "stringi", "streamR", "rJava", "xlsx"))
library(ROAuth)   # OAuth handshake with the Twitter API
library(RCurl)
library(stringi)  # stri_trans_nfkc for Unicode normalisation of the keyword
library(streamR)  # filterStream opens a connection to Twitter's Streaming API
library(rJava)
library(xlsx)     # Excel export of the parsed tweets

## PART 1: Declare Twitter API credentials & create handshake
requestURL = "https://api.twitter.com/oauth/request_token"
accessURL  = "https://api.twitter.com/oauth/access_token"
# NOTE: the original value began with a stray space (" https://..."), which
# is an invalid URL and breaks the OAuth authorization step.
authURL    = "https://api.twitter.com/oauth/authorize"
consumerKey    = ""  # From dev.twitter.com
consumerSecret = ""  # From dev.twitter.com

my_oauth = OAuthFactory$new(consumerKey = consumerKey,
                            consumerSecret = consumerSecret,
                            requestURL = requestURL,
                            accessURL = accessURL,
                            authURL = authURL)

# Persist the my_oauth credentials to an .Rdata file so later sessions can
# reuse them without repeating the handshake, then load them back.
save(my_oauth, file = "my_oauth.Rdata")
load("my_oauth.Rdata")

my_oauth$handshake(curl = getCurlHandle(), browseUrl = FALSE)

## PART 2: Collect tweets from the Twitter Streaming API.
# The tracked keyword is an Arabic word given as Unicode escapes; to get the
# unicode escapes of a word use http://r12a.github.io/apps/conversion/
keyword = stri_trans_nfkc('\u0627\u0644\u0647\u0644\u0627\u0644')

filterStream(oauth = my_oauth,          # use my_oauth as the OAuth credentials
             timeout = 120,             # timeout in seconds
             track = keyword,
             file.name = "tweets.json") # save raw tweets in a JSON file

# Parse the JSON file and store the result in a data frame.
tweetsdf = parseTweets("tweets.json", simplify = TRUE)
View(tweetsdf)

# Save as xlsx
write.xlsx(tweetsdf, "tweetsTest1.xlsx", sheetName = "tweets", row.names = FALSE)

# Save as csv
write.csv(tweetsdf, "tweetsTest.csv")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Arabic Twitter Analysis
2 |
3 | # المقدمة
4 |
5 |
6 | مع تطور الهواتف والأجهزة المحمولة وتوفر اتصال الانترنت عند غالبية الناس تزايد عدد المستخدمين للإنترنت مما ولد كمية كبيرة من البيانات. من أكثر المواقع التي يقوم مستخدمي الإنترنت بزيارتها هي مواقع التواصل الاجتماعي مثل توتير وفيس بوك و انستجرام وغيرها الكثير والتي أصبحت جزء لا يتجزأ من حياة الأغلبية. لم يقتصر استخدام هذه الشبكات الاجتماعية على الأفراد وإنما نجد الكثير من الحسابات الرسمية والموثقة للجهات الحكومية والشركات والمستشفيات وهلم جرا. يختلف استخدام هذه الشبكات بحسب المستخدم الذي يقول باستخدامها فعلى مستوى الأفراد يقوم الكثير منهم بالتعبير عن مشاعره ومشاركة يومياته ومتابعة أخبار أصدقائه. ومن جهة أخرى نجد أن المؤسسات قد تستخدمها لنشر أخبارها والتواصل مع عملائها وتلبية طلباتهم والإجابة عن استفساراتهم كما أن بعض الشركات التجارية خصيصًا تستخدم هذه الشبكات لتسويق منتجاتها ومعرفة أراء عملائها ومستوى رضاهم عن الخدمات التي تقدمها خصوصًا أن هذه المنصات هي شبكات اجتماعية تؤثر فيها آراء بعض المستخدمين على المستخدمين الآخرين. تقوم هذه الشبكات بجمع آراء الأشخاص عن منتجاتهم وتحليلها لمعرفة المشاكل وإصلاحها ومعرفة المميزات ودعمها. يعبر الأشخاص غالبًا عن آرائهم ومشاعرهم باستخدام النص المكتوب وبعض الوجوه التعبيرية لإيصال الفكرة بشكل تام ويسهل على البشر فهم النص المكتوب وما يرمي إليه الكاتب وما هي مشاعره الموجهة من خلال هذا النص. ولكن عندما تكون البيانات التي تم جمعها وتحتاج إلى تحليل كبيرة يصعب أداء هذه المهمة بواسطة البشر حيث تعتبر مهمة مكلفة للوقت والجهد بمقابل أن عمليات التسويق وقضايا الرأي العام والتي تحتاج إلى تحليل آراء الأشخاص غالبًا هنالك حاجة لعملها بصورة سريعة.
7 |
8 |
9 | # ماذا ستقدم هذه المدونة ؟
10 |
11 | في هذه الصفحه سنتطرق ونتعرف على كيفيه تحليل البيانات التي تم الحصول عليها من منصة التواصل الاجتماعي تويتر
12 | والتي يمكن استخدامها لدراسه مجالات عديده كدراسه تصرفات معينه لمجموعه من الأشخاص او علم تحليل المشاعر: وهي عملية تحليلية باستخدام معالجة اللغة الطبيعية وغيرها
13 |
14 | # الخطوات
15 |
16 | لتطبيق هذه العملية سيقوم الباحث بالمرور بثلاث خطوات رئيسة وهي:
17 |
18 |
19 |
20 | 
21 |
22 | اتجه الى صفحه Wiki للتعرف على جميع الخطوات
23 |
24 | حيث يوجد فيها معلومات مفصلة عن
25 |
26 |
27 | 1. [Collecting Data](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Collecting-Data)
28 |
29 | تبين الطرق والادوات المطلوبه لسحب البيانات من تويتر
30 |
31 | 2. [Data Preprocessing](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Twitter-Data-Preprocessing)
32 |
33 | تختص بعمليه تنظيف البيانات وتجهيزها لعمليه التحليل
34 |
35 | 3. [Data Analysis](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Data-Analysis)
36 |
37 | تشرح عمليه التحليل باستخدام Weka
38 |
39 | 4. [Arabic Word Preprocessing](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Arabic-Word-Preprocessing)
40 |
41 | 5. [Arabic Sentiment Lexicons](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Arabic-Sentiment-Lexicons)
42 |
43 | 6. [Case Studies](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Case-Studies)
44 |
45 |
46 |
47 |
48 |
49 |
50 | # هذه الصفحه تمت كتابتها بواسطة
51 | أروى الرزوق
@Cla22ire
52 | روان المعثم
@rawan_almatham
53 | خلود الغامدي
@KhuloodAlGhamdi
54 | ايمان النخيلان
@EmanAlnkhilan
55 | باشراف الدكتوره: منى الرزقان
@Munas02
56 |
--------------------------------------------------------------------------------
/Cleaning Code/Main (1).java:
--------------------------------------------------------------------------------
1 | package test;
2 |
3 | import java.io.IOException;
4 | import java.io.UnsupportedEncodingException;
5 | import java.nio.file.Path;
6 | import java.nio.file.Paths;
7 | import java.util.Arrays;
8 | import java.util.Formatter;
9 | import java.util.FormatterClosedException;
10 | import java.util.List;
11 | import java.util.NoSuchElementException;
12 | import java.util.Scanner;
13 | import java.util.regex.Matcher;
14 | import java.util.regex.Pattern;
15 |
16 |
public class Main {

	// Public field kept for interface compatibility; not written by the
	// current code (leftover from an earlier design).
	public static String text;

	// Compiled once: the pattern is constant and matching happens once per
	// input line, so recompiling this large regex on every call is wasted
	// work. The pattern text is unchanged from the original.
	private static final Pattern EMOJI_PATTERN = Pattern.compile(
			"(?:[\\u2700-\\u27bf]|" +
			"(?:[\\ud83c\\udde6-\\ud83c\\uddff]){2}|" +
			"[\\ud800\\udc00-\\uDBFF\\uDFFF]|[\\u2600-\\u26FF])[\\ufe0e\\ufe0f]?(?:[\\u0300-\\u036f\\ufe20-\\ufe23\\u20d0-\\u20f0]|[\\ud83c\\udffb-\\ud83c\\udfff])?" +
			"(?:\\u200d(?:[^\\ud800-\\udfff]|" +
			"(?:[\\ud83c\\udde6-\\ud83c\\uddff]){2}|" +
			"[\\ud800\\udc00-\\uDBFF\\uDFFF]|[\\u2600-\\u26FF])[\\ufe0e\\ufe0f]?(?:[\\u0300-\\u036f\\ufe20-\\ufe23\\u20d0-\\u20f0]|[\\ud83c\\udffb-\\ud83c\\udfff])?)*|" +
			"[\\u0023-\\u0039]\\ufe0f?\\u20e3|\\u3299|\\u3297|\\u303d|\\u3030|\\u24c2|[\\ud83c\\udd70-\\ud83c\\udd71]|[\\ud83c\\udd7e-\\ud83c\\udd7f]|\\ud83c\\udd8e|[\\ud83c\\udd91-\\ud83c\\udd9a]|[\\ud83c\\udde6-\\ud83c\\uddff]|[\\ud83c\\ude01-\\ud83c\\ude02]|\\ud83c\\ude1a|\\ud83c\\ude2f|[\\ud83c\\ude32-\\ud83c\\ude3a]|[\\ud83c\\ude50-\\ud83c\\ude51]|\\u203c|\\u2049|[\\u25aa-\\u25ab]|\\u25b6|\\u25c0|[\\u25fb-\\u25fe]|\\u00a9|\\u00ae|\\u2122|\\u2139|\\ud83c\\udc04|[\\u2600-\\u26FF]|\\u2b05|\\u2b06|\\u2b07|\\u2b1b|\\u2b1c|\\u2b50|\\u2b55|\\u231a|\\u231b|\\u2328|\\u23cf|[\\u23e9-\\u23f3]|[\\u23f8-\\u23fa]|\\ud83c\\udccf|\\u2934|\\u2935|[\\u2190-\\u21ff]",
			Pattern.UNICODE_CASE |
			Pattern.CANON_EQ |
			Pattern.CASE_INSENSITIVE);

	/**
	 * Reads tweets from input.txt (UTF-8), strips URLs, @-mentions,
	 * hashtags, punctuation, the stand-alone "RT" retweet marker and
	 * emoji/symbols, and writes the cleaned lines to out.txt (UTF-8).
	 *
	 * @param args unused
	 * @throws IOException if input.txt cannot be opened
	 */
	public static void main(String[] args) throws IOException {
		Path outPath = Paths.get("out.txt").toAbsolutePath();
		// try-with-resources closes both streams even if an exception is
		// thrown mid-loop; the original leaked them on the error paths.
		try (Formatter output = new Formatter(outPath.toString(), "utf-8");
				Scanner input = new Scanner(Paths.get("input.txt"), "utf-8")) {

			while (input.hasNextLine()) {
				String line = input.nextLine();

				line = line.replaceAll("((www\\.[^\\s]+)|(https?://[^\\s]+))", " "); // remove URLs
				line = line.replaceAll("@[^\\s]+", " ");    // remove user names (@mentions)
				line = line.replaceAll("#[^\\s]+", "");     // remove hashtags
				line = line.replaceAll("\\p{Punct}+", " "); // remove punctuation
				// \b guards added: the original removed the substring "RT"
				// anywhere, mangling words such as "START"; only the
				// stand-alone retweet marker should be dropped.
				line = line.replaceAll("\\bRT\\b", " ");

				line = Main.removeEmojiAndSymbol(line);

				output.format("%s\n", line);
			}
		}
		catch (NoSuchElementException excp) {
			System.err.println(excp.getMessage());
		} catch (FormatterClosedException excp) {
			System.err.println(excp.getMessage());
		} catch (IllegalStateException excp) {
			System.err.println(excp.getMessage());
		}
	}

	/**
	 * Replaces emoji and pictographic symbols in the given text with a
	 * single space each. (Alternatively the emoji4j library could be used.)
	 *
	 * @param content text possibly containing emoji
	 * @return the text with emoji/symbols replaced by spaces
	 */
	public static String removeEmojiAndSymbol(String content) {
		String utf8tweet = "";
		try {
			// UTF-8 round-trip kept from the original design.
			byte[] utf8Bytes = content.getBytes("UTF-8");
			utf8tweet = new String(utf8Bytes, "UTF-8");
		} catch (UnsupportedEncodingException e) {
			// UTF-8 support is mandated by the JLS; effectively unreachable.
			e.printStackTrace();
		}
		Matcher unicodeOutlierMatcher = EMOJI_PATTERN.matcher(utf8tweet);
		return unicodeOutlierMatcher.replaceAll(" ");
	}
}
105 |
--------------------------------------------------------------------------------
/Dialect Stemming/output.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 | سجل مع عجز ثل
4 | سجل مع عجز ثل
5 | شرع لعب ReignFal عمل
6 |
7 | شرع زال طير لعب 2D نوع غاز ومغر
8 |
9 | فد قم…
10 | شرع لعب ReignFal عمل
11 |
12 | شرع زال طير لعب 2D نوع غاز ومغر
13 |
14 | فد قم…
15 | ستعرض لعب ReignFal طير
16 | ستعرض لعب ReignFal طير
17 | ستعرض لعب ReignFal طير
18 | ستعرض لعب ReignFal طير
19 | ستعرض لعب ReignFal طير
20 | ستعرض لعب ReignFal طير
21 | ستعرض لعب ReignFal طير
22 | ستعرض لعب ReignFal طير
23 | ستعرض لعب ReignFal طير
24 | لقء تبع نجلإت سيد تسدا وسك، رئس جلس دار سكر ينكس
25 |
26 | عطنا بستر عبت بدأ عجب…
27 | It was a great honr meting and having him try our game An amzing day to remb
28 | شرع لعب ReignFal عمل
29 |
30 | شرع زال طير لعب 2D نوع غاز ومغر
31 |
32 | فد قم…
33 | شرع لعب ReignFal عمل
34 |
35 | شرع زال طير لعب 2D نوع غاز ومغر
36 |
37 | فد قم…
38 | It was a great honr meting and having him try our game An amzing day to remb
39 | ستعرض لعب ReignFal طير
40 | ستعرض لعب ReignFal طير
41 | ستعرض لعب ReignFal طير
42 | لقء تبع نجلإت سيد تسدا وسك، رئس جلس دار سكر ينكس
43 |
44 | عطنا بستر عبت بدأ عجب…
45 |
46 | Layl You are the best
47 | ارب ارزق
48 | فتشب كيد
49 | شيء ضرب خيل
50 | شاء ألف مبر وعقبل فوز نهئ لكم
51 | شاء ألف مبر وعقبل فوز نهئ لكم
52 | بار فيك
53 | مبر لك…
54 |
55 | مبر لستدي نقش ض…
56 | ومبر…
57 | مبر لك…
58 | مبر لك…
59 | مبر لك…
60 | مبر لك…
61 | مبر لك…
62 | مبر لك…
63 | مبر لك…
64 | مبر لك…
65 |
66 | مبر لستدي نقش ض…
67 |
68 | ومبر…
69 |
70 | مبر لستدي نقش ض…
71 |
72 | جمع فئزن
73 | ومبر…
74 |
75 | مبر لستدي نقش ض…
76 |
77 | جمع فئزن
78 | ومبر…
79 |
80 | مبر لستدي نقش ض…
81 |
82 | جمع فئزن
83 | ومبر…
84 | يس
85 | يس
86 |
87 | مبر
88 | مبر
89 | ستعرض لعب ReignFal طير
90 | ستعرض لعب ReignFal طير
91 | ستعرض لعب ReignFal طير
92 | بار فيك
93 | وا برو برو
94 | وا برو برو
95 | بار فيك مود
96 | مبر نتمى لكم زد قدم
97 | مبر نتمى لكم زد قدم
98 | بار فيك
99 | ذن
100 | Why You have an atrcive idea it just ned some polishng
101 | Yeah neds a lots of work But I think this stupid feling is becaus of the presu I fel right now o…
102 | برو وفق
103 | برو وفق
104 | بار فيك عبر
105 | Congratz
106 | Congratz
107 | Thanks
108 | مبر ستهل
109 | مبر ستهل
110 | بار فك، شكرا
111 |
112 | كلم حتج رجم
113 |
114 |
115 |
116 |
117 | ت شر شهر صبر
118 | احبل شكرا
119 | شاء
120 |
121 | ان سمع
122 | BTW
123 | شفت رسم KnowMe اخت، لمس جذب غرب
124 |
125 | فتح كمشن له علمي
126 | You goin arw
127 | lol you can just come and have fun you know
128 |
129 | I doubt it I dont know how to draw
130 | You goin arw
131 | You goin arw
132 | lol you can just come and have fun you know
133 | lol you can just come and have fun you know
134 | I doubt it I dont know how to draw
135 | عظم جركم، له رحم وغفر وسكن فسح جن
136 | Square Enix Sory we dint finsh your game Noctis
137 | Noctis Man this some bulshit Fuck yal I got my boy Hard on…
138 | شاء ، وفق مبر أرى
139 | شاء ، وفق مبر أرى
140 | نفس شيء
141 | نفس شيء
142 | نفس شيء
143 | نفس شيء
144 | بار فيك رم
145 | بار فيك
146 | حط ربط شي، برنمج هوء بشر
147 | خلص
148 | بث لا زال ستمر قر سعدي خلص
149 | مبر لستدي نقش ض…
150 | روى وصلن، وصلك؟
151 | رسل لهم رى، أكد هم، كيد سقط سه
152 | وفق عز
153 | عين رى، تسر ذن
154 | تفق
155 | تفق
156 | لحظ وين ؟
157 | قهر
158 | ودي عرف وش قل، يلا بحث عنه
159 |
160 | جمع فئزن
161 | ومبر…
162 | ثير وصل يمل ؟؟
163 | اخر شيء وصل طلب بين وتر، فصل لقء لا لسه
164 | Wil do
165 | كيس شفت تو حقت
166 | وي ارب
167 | ذن
168 | كلم رئس سكر إنكس طرن يبد إعجب وتفجأ ستى عال وتنع منص ألعب طر حسب…
169 | دير وسك تسدا دهش عجب جد نوع لعب شاف
170 | وفقكم، شكر جهد
171 | ذن نك…
172 | مبر لستدي نقش ض…
173 |
174 | جمع فئزن
175 | ومبر…
176 | دير وسك تسدا دهش عجب جد نوع لعب شاف
177 | وفقكم، شكر جهد
178 | ذن نك…
179 | وفقكم، شكر جهد
180 | ذن نك…
181 | وفقكم، شكر جهد
182 | ذن نك…
183 | ستعرض لعب ReignFal طير
184 | ستعرض لعب ReignFal طير
185 | ستعرض لعب ReignFal طير
186 | ستعرض لعب ReignFal طير
187 | عف، ساس حصر شكل علم لعب ؟ وضع علم كبر مكن بس…
188 | عف، ساس حصر شكل علم لعب ؟ وضع علم كبر مكن بس…
189 | عف، ساس حصر شكل علم لعب ؟ وضع علم كبر مكن بس…
190 | مبر لستدي نقش ض…
191 | مبر أرى تخل قد ايش فرحن لكم ان خت وفق رب
192 |
193 | جمع فئزن
194 | مبر فئزن سبق نجلإت بمجل ألعبفدي لمث بدعن سعدي عرب أ تسدا وس…
195 | ومبر…
196 | بار فيك
197 | وفقكم، شكر جهد
198 | ذن نك…
199 | بار فيك
200 | سعد اربى، شكر لك
201 | شءل برو ستهلن
202 | yes
203 | بار فيك
204 | بار فيك
205 | بار فيك بين
206 | بار فيك بين
207 | هذي نفع لما صير مال خلق اس مشن
208 | ستعرض لعب ReignFal طير
209 | لقء تبع نجلإت سيد تسدا وسك، رئس جلس دار سكر ينكس
210 |
211 | عطنا بستر عبت بدأ عجب…
212 | مبر أرى تخل قد ايش فرحن لكم ان خت وفق رب
213 | مبر وفق
214 | شاء
215 |
216 |
217 | ألف برو وبرك لأسمء
218 |
219 | بدعت
220 |
221 |
222 | يلا حذي هرت
223 | ور شغل جمد
224 | بار أرى ستهلو
225 | حمد ه، خبر سعد جدا
226 | ختر لعبت فئزن
227 |
228 | بار فيك
229 | بار فيك
230 |
231 | بار فيك
232 | بار فيك مود
233 | بار فيك
234 | ذن
235 | بار فيك هبه
236 | بار فيك عبر
237 | Thanks
238 | بار فك، شكرا
239 |
240 | I doubt it I dont know how to draw
241 | بار فيك رم
242 | بار فيك بين
243 | ستعرض لعب ReignFal طير
244 | بار فيك
245 | سعد اربى، شكر لك
246 | آمن رب علمن
247 | مبر أرى تخل قد ايش فرحن لكم ان خت وفق رب
248 | عظم اجر له رحم وغفر ودخل فسح جن
249 | بار فيك
250 | سعد اربى، شكر لك
251 | حط ربط شي، برنمج هوء بشر
252 | خلص
253 | بث لا زال ستمر قر سعدي خلص
254 |
255 | خلص حضر وطلع بسع كمل قلن؟ gt
256 | عين وفق ارب ف ثن
257 | ارب سهل
258 | لحظ وين ؟
259 | قهر
260 | ودي عرف وش قل، يلا بحث عنه
261 | Layl You are the best
262 |
263 | ومبر…
264 | جا اخر
265 | يخلف شاء
266 | مين ارب، جزا خير
267 | لقئ قدم
268 | سجل
269 |
270 | طرلعب
271 | لقئ قدم
272 | سجل
273 |
274 | طرلعب
275 | 45 ≧ω≦
276 | Me want
277 | رقك طلت ؟
278 | يب مدي
279 | يب مدي
280 | عال خاص
281 | بتع سلحه؟
282 | يمد طلب شيء نفس حق سيف نكتس ؟
283 |
284 | بكم سيف نكتس ؟
285 | Ok
286 | مبر وفق
287 | عال خاص
288 | بتع سلحه؟
289 | يمد طلب شيء نفس حق سيف نكتس ؟
290 |
291 | بكم سيف نكتس ؟
292 | Ok
293 |
294 | نفس حال ضبط
295 | يضح اني بغى حدث
296 | حدث صرع مع سبع فات
297 |
298 | سبع بغى وتب صاح علم
299 |
300 |
301 | ولت وتب
302 | عور
303 | ترجى نت فتح قل
304 |
305 |
306 | سبعن كذا بغل طلع نيح وطب
307 |
308 | وحد شتك نت
309 |
310 | ضح بغى رد شغل
311 |
312 | لا
313 | مو فتح
314 |
315 | خلي جال طلع
316 |
317 | بغى كثر طلع
318 | شاء
319 |
320 |
321 | ألف برو وبرك لأسمء
322 |
323 | بدعت
324 |
325 |
326 | يلا حذي هرت
327 | ور شغل جمد
328 | بندل قه حش حل مكن سعد صمد
329 |
330 | وفق
331 | ا ودي حضر ، قعد فكر أسجل بعدن عتمدا جاز شرع
332 | ا ودي حضر ، قعد فكر أسجل بعدن عتمدا جاز شرع
333 | ا ودي حضر ، قعد فكر أسجل بعدن عتمدا جاز شرع
334 | كنسل وتب زمن عتمد درس تك كتبه، اقل خير بتطل فتح امل
335 | مو نت، خل جال فتح يون در ستدي سشل له نخلص
336 | صدق بغل نسه، وبم…
337 | بار فيك بين
338 | يب بس
339 | wan go Im not sure I can atend
340 | لقئ قدم
341 | سجل
342 |
343 | طرلعب
344 | wan go Im not sure I can atend
345 | wan go Im not sure I can atend
346 | helo Just wanted to ask do you take comisn D
347 | شءل برو ستهلن
348 | بار فيك هبه
349 | لستَ وحد دام دوم برفقت
350 | طلع طيب حين
351 | بار أرى ستهلو
352 | لا لا زين حقه ساع
353 |
354 | ، حق قل نوم
355 | أقل حقئ أنا لزم جرب وزبل ثن
356 |
357 | سلم قلب ، بشف بكر ا…
358 | You can do it I beliv in you Arwa
359 | صح شغل ، إني جلس أحل أركز علرض ماب فيد ، أندر ستدي طلع ركز كنت أش…
360 | بار فيك
361 | لا ال VR زاد
362 | حل ساع
363 | فرض
364 |
365 | يلا سلمت ورح نام
366 | نخلي اخر شيء ين نس شوي
367 |
368 | ااخ لزم كذا جلس سنع شغل ونشرب قه
369 | ك شوش كيد
370 | جد ذكر علن ب٢٠١٥
371 | خل خلص رم جزء
372 | لتنرفز ترى مكن تحس نفس ختم ظرف قول لك
373 | كيد مو شيف نفس غلطن وقت غلط اقش ١٠ دقئ ستظرف راس
374 | عظم أجر أحسن عزئ غفر لهم رحم سكن فسح جن
375 | مزح وتقل رشه، ولا علقت رسمي يع غير عقل شوي اشك سلب كان مزح بجد…
376 | ودي وري ما تحط لا ومفق جف
377 | ودي وري ما تحط لا ومفق جف
378 | ه درن وش الي ضحك، ىوى قبل رتن ولا ثل ه
379 | قول هاه نحسب يام الي سلم حق رور كرم ه
380 | لا شفت لعبن
381 | صور سلم ه
382 | حطت ساس لما شفت
383 | ه درن وش الي ضحك، ىوى قبل رتن ولا ثل ه
384 | قول هاه نحسب يام الي سلم حق رور كرم ه
385 | لا شفت لعبن
386 | صور سلم ه
387 | حطت ساس لما شفت
388 | and she rocks
389 | I hope you find what you sek
390 | Thanks Ill chek her works
391 | سمح لي كرك شكرا جزلا
392 | رسم شاء رئع
393 | and she rocks
394 | I hope you find what you sek
395 | Thanks Ill chek her works
396 | سمح لي كرك شكرا جزلا
397 | رسم شاء رئع
398 | كنسل طلع لج تغل
399 | تغل شي، بعد SteamVR قرر يشتغل ولزم حدث وبقعد قربا نت رهب
400 | I had zero confide dont know why
401 | حب
402 | عمل شكل عد
403 | نب نسمع عبر انم بزرن طيب
404 | ترى مو فئه وحد وطن فئه شب
405 | EscapeVR
406 |
407 | وجد عمل يون ساع ٣ شاء الي حاب يجي نسلف جرب في لعب فح
408 | I must prea presntaio and write a paer The dealin for them is tomrw
409 | لقء تبع نجلإت سيد تسدا وسك، رئس جلس دار سكر ينكس
410 |
411 | عطنا بستر عبت و…
412 | بتمنى شيء الي فتح كل شوف بدع طرن شاء عمل وحد ت…
413 | خلص لقء ومعا صدع حين لزم كمل تغل بر سلم
414 |
415 | ارب جز
416 | شيء حلو كمنربي ان جبن مؤد اص
417 | كمنربي رسمن بستقل comisn ؟
418 | EscapeVR
419 | زل جرب ولي عملي يستنق، غير تح
420 |
421 | سبب قرب Nongamers وقعا ختلف ومن…
422 |
--------------------------------------------------------------------------------
/Collecting Twitter Data/Python/UserTweets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": ""
4 | },
5 | "nbformat": 3,
6 | "nbformat_minor": 0,
7 | "worksheets": [
8 | {
9 | "cells": [
10 | {
11 | "cell_type": "code",
12 | "collapsed": false,
13 | "input": [
14 | "import twitter\n",
15 | "\n",
16 | "def oauth_login():\n",
17 | " #This code is taken from https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition \n",
18 | " # XXX: Go to http://twitter.com/apps/new to create an app and get values\n",
19 | " # for these credentials that you'll need to provide in place of these\n",
20 | " # empty string values that are defined as placeholders.\n",
21 | " # See https://dev.twitter.com/docs/auth/oauth for more information \n",
22 | " # on Twitter's OAuth implementation.\n",
23 | " \n",
24 | " CONSUMER_KEY = ''\n",
25 | " CONSUMER_SECRET = ''\n",
26 | " OAUTH_TOKEN = ''\n",
27 | " OAUTH_TOKEN_SECRET = ''\n",
28 | " \n",
29 | " auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,\n",
30 | " CONSUMER_KEY, CONSUMER_SECRET)\n",
31 | " \n",
32 | " twitter_api = twitter.Twitter(auth=auth)\n",
33 | " return twitter_api\n",
34 | "\n",
35 | "def twitter_search(twitter_api, q, max_results, **kw):\n",
36 | "\n",
37 | " # See https://dev.twitter.com/docs/api/1.1/get/search/tweets and \n",
38 | " # https://dev.twitter.com/docs/using-search for details on advanced \n",
39 | " # search criteria that may be useful for keyword arguments\n",
40 | " \n",
41 | " # See https://dev.twitter.com/docs/api/1.1/get/search/tweets \n",
42 | " search_results = twitter_api.search.tweets(q=q, count=10000, **kw)\n",
43 | " \n",
44 | " statuses = search_results['statuses']\n",
45 | " print len(statuses)\n",
46 | " \n",
47 | " # Iterate through batches of results by following the cursor until we\n",
48 | " # reach the desired number of results, keeping in mind that OAuth users\n",
49 | " # can \"only\" make 180 search queries per 15-minute interval. See\n",
50 | " # https://dev.twitter.com/docs/rate-limiting/1.1/limits\n",
51 | " # for details. A reasonable number of results is ~1000, although\n",
52 | " # that number of results may not exist for all queries.\n",
53 | " \n",
54 | " # Enforce a reasonable limit\n",
55 | " max_results = 10000 #min(100000, max_results)\n",
56 | " \n",
57 | " for _ in range(10000): # 10*100 = 1000\n",
58 | " try:\n",
59 | " next_results = search_results['search_metadata']['next_results']\n",
60 | " except KeyError, e: # No more results when next_results doesn't exist\n",
61 | " # print len(statuses)\n",
62 | " break\n",
63 | " \n",
64 | " # Create a dictionary from next_results, which has the following form:\n",
65 | " # ?max_id=313519052523986943&q=NCAA&include_entities=1\n",
66 | " kwargs = dict([ kv.split('=') \n",
67 | " for kv in next_results[1:].split(\"&\") ])\n",
68 | " \n",
69 | " print len(statuses)\n",
70 | " search_results = twitter_api.search.tweets(q=q, count=10000, **kw)\n",
71 | " statuses += search_results['statuses']\n",
72 | " \n",
73 | " if len(statuses) > max_results: \n",
74 | " break\n",
75 | " print len(statuses)\n",
76 | " \n",
77 | " return statuses"
78 | ],
79 | "language": "python",
80 | "metadata": {},
81 | "outputs": [],
82 | "prompt_number": 2
83 | },
84 | {
85 | "cell_type": "code",
86 | "collapsed": false,
87 | "input": [
88 | "import io, json\n",
89 | "\n",
90 | "def save_json( data):\n",
91 | " with io.open('WAS.json', \n",
92 | " 'w', encoding='utf-8') as f:\n",
93 | " f.write(unicode(json.dumps(data, ensure_ascii=False)))\n",
94 | "\n",
95 | "def load_json(filename):\n",
96 | " with io.open('WAS.json', \n",
97 | " encoding='utf-8') as f:\n",
98 | " return f.read()\n",
99 | "\n",
100 | "# Sample usage\n",
101 | "\n",
102 | "#q = '%23%D8%A7%D9%8A%D8%AC%D8%A7%D8%A8%D9%8A%D8%A9'\n",
103 | "#twitter_api = oauth_login()\n",
104 | "#results = twitter_search(twitter_api, q, max_results=10000)\n",
105 | "\n",
106 | "#save_json(q, results)\n",
107 | "#results = load_json(q)"
108 | ],
109 | "language": "python",
110 | "metadata": {},
111 | "outputs": [],
112 | "prompt_number": 3
113 | },
114 | {
115 | "cell_type": "code",
116 | "collapsed": false,
117 | "input": [
118 | "import sys\n",
119 | "import time\n",
120 | "from urllib2 import URLError\n",
121 | "from httplib import BadStatusLine\n",
122 | "import json\n",
123 | "import twitter\n",
124 | "\n",
125 | "def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw): \n",
126 | " \n",
127 | " # A nested helper function that handles common HTTPErrors. Return an updated\n",
128 | " # value for wait_period if the problem is a 500 level error. Block until the\n",
129 | " # rate limit is reset if it's a rate limiting issue (429 error). Returns None\n",
130 | " # for 401 and 404 errors, which requires special handling by the caller.\n",
131 | " def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):\n",
132 | " \n",
133 | " if wait_period > 3600: # Seconds\n",
134 | " print >> sys.stderr, 'Too many retries. Quitting.'\n",
135 | " raise e\n",
136 | " \n",
137 | " # See https://dev.twitter.com/docs/error-codes-responses for common codes\n",
138 | " \n",
139 | " if e.e.code == 401:\n",
140 | " print >> sys.stderr, 'Encountered 401 Error (Not Authorized)'\n",
141 | " return None\n",
142 | " elif e.e.code == 404:\n",
143 | " print >> sys.stderr, 'Encountered 404 Error (Not Found)'\n",
144 | " return None\n",
145 | " elif e.e.code == 429: \n",
146 | " print >> sys.stderr, 'Encountered 429 Error (Rate Limit Exceeded)'\n",
147 | " if sleep_when_rate_limited:\n",
148 | " print >> sys.stderr, \"Retrying in 15 minutes...ZzZ...\"\n",
149 | " sys.stderr.flush()\n",
150 | " time.sleep(60*15 + 5)\n",
151 | " print >> sys.stderr, '...ZzZ...Awake now and trying again.'\n",
152 | " return 2\n",
153 | " else:\n",
154 | " raise e # Caller must handle the rate limiting issue\n",
155 | " elif e.e.code in (500, 502, 503, 504):\n",
156 | " print >> sys.stderr, 'Encountered %i Error. Retrying in %i seconds' % \\\n",
157 | " (e.e.code, wait_period)\n",
158 | " time.sleep(wait_period)\n",
159 | " wait_period *= 1.5\n",
160 | " return wait_period\n",
161 | " else:\n",
162 | " raise e\n",
163 | "\n",
164 | " # End of nested helper function\n",
165 | " \n",
166 | " wait_period = 2 \n",
167 | " error_count = 0 \n",
168 | "\n",
169 | " while True:\n",
170 | " try:\n",
171 | " return twitter_api_func(*args, **kw)\n",
172 | " except twitter.api.TwitterHTTPError, e:\n",
173 | " error_count = 0 \n",
174 | " wait_period = handle_twitter_http_error(e, wait_period)\n",
175 | " if wait_period is None:\n",
176 | " return\n",
177 | " except URLError, e:\n",
178 | " error_count += 1\n",
179 | " time.sleep(wait_period)\n",
180 | " wait_period *= 1.5\n",
181 | " print >> sys.stderr, \"URLError encountered. Continuing.\"\n",
182 | " if error_count > max_errors:\n",
183 | " print >> sys.stderr, \"Too many consecutive errors...bailing out.\"\n",
184 | " raise\n",
185 | " except BadStatusLine, e:\n",
186 | " error_count += 1\n",
187 | " time.sleep(wait_period)\n",
188 | " wait_period *= 1.5\n",
189 | " print >> sys.stderr, \"BadStatusLine encountered. Continuing.\"\n",
190 | " if error_count > max_errors:\n",
191 | " print >> sys.stderr, \"Too many consecutive errors...bailing out.\"\n",
192 | " raise\n",
193 | "\n",
194 | "def harvest_user_timeline(twitter_api, screen_name=None, user_id=None, max_results=3000):\n",
195 | " \n",
196 | " assert (screen_name != None) != (user_id != None), \\\n",
197 | " \"Must have screen_name or user_id, but not both\" \n",
198 | " \n",
199 | " kw = { # Keyword args for the Twitter API call\n",
200 | " 'count': 2000,\n",
201 | " 'trim_user': 'true',\n",
202 | " 'include_rts' : 'true',\n",
203 | " 'since_id' : 1\n",
204 | " }\n",
205 | " \n",
206 | " if screen_name:\n",
207 | " kw['screen_name'] = screen_name\n",
208 | " else:\n",
209 | " kw['user_id'] = user_id\n",
210 | " \n",
211 | " max_pages = 16\n",
212 | " results = []\n",
213 | " \n",
214 | " tweets = make_twitter_request(twitter_api.statuses.user_timeline, **kw)\n",
215 | " \n",
216 | " if tweets is None: # 401 (Not Authorized) - Need to bail out on loop entry\n",
217 | " tweets = []\n",
218 | " \n",
219 | " results += tweets\n",
220 | " \n",
221 | " print >> sys.stderr, 'Fetched %i tweets' % len(tweets)\n",
222 | " \n",
223 | " page_num = 1\n",
224 | " \n",
225 | " # Many Twitter accounts have fewer than 200 tweets so you don't want to enter\n",
226 | " # the loop and waste a precious request if max_results = 200.\n",
227 | " \n",
228 | " # Note: Analogous optimizations could be applied inside the loop to try and \n",
229 | " # save requests. e.g. Don't make a third request if you have 287 tweets out of \n",
230 | " # a possible 400 tweets after your second request. Twitter does do some \n",
231 | " # post-filtering on censored and deleted tweets out of batches of 'count', though,\n",
232 | " # so you can't strictly check for the number of results being 200. You might get\n",
233 | " # back 198, for example, and still have many more tweets to go. If you have the\n",
234 | " # total number of tweets for an account (by GET /users/lookup/), then you could \n",
235 | " # simply use this value as a guide.\n",
236 | " \n",
237 | " if max_results == kw['count']:\n",
238 | " page_num = max_pages # Prevent loop entry\n",
239 | " \n",
240 | " while page_num < max_pages and len(tweets) > 0 and len(results) < max_results:\n",
241 | " \n",
242 | " # Necessary for traversing the timeline in Twitter's v1.1 API:\n",
243 | " # get the next query's max-id parameter to pass in.\n",
244 | " # See https://dev.twitter.com/docs/working-with-timelines.\n",
245 | " kw['max_id'] = min([ tweet['id'] for tweet in tweets]) - 1 \n",
246 | " \n",
247 | " tweets = make_twitter_request(twitter_api.statuses.user_timeline, **kw)\n",
248 | " results += tweets\n",
249 | "\n",
250 | " print >> sys.stderr, 'Fetched %i tweets' % (len(tweets),)\n",
251 | " \n",
252 | " page_num += 1\n",
253 | " \n",
254 | " print >> sys.stderr, 'Done fetching tweets'\n",
255 | "\n",
256 | " return results[:max_results]\n",
257 | " \n",
258 | "# Sample usage\n",
259 | "\n",
260 | "twitter_api = oauth_login()\n",
261 | "#tweets = harvest_user_timeline(twitter_api, user_id=\"@spagov\", max_results=3000)\n",
262 | "tweets = harvest_user_timeline(twitter_api,screen_name=\"spagov\", max_results=3000)\n",
263 | "save_json(tweets)\n",
264 | "# Save to MongoDB with save_to_mongo or a local file with save_json..."
265 | ],
266 | "language": "python",
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "output_type": "stream",
271 | "stream": "stderr",
272 | "text": [
273 | "Fetched 200 tweets\n",
274 | "Fetched 200 tweets"
275 | ]
276 | },
277 | {
278 | "output_type": "stream",
279 | "stream": "stderr",
280 | "text": [
281 | "\n",
282 | "Fetched 200 tweets"
283 | ]
284 | },
285 | {
286 | "output_type": "stream",
287 | "stream": "stderr",
288 | "text": [
289 | "\n",
290 | "Fetched 200 tweets"
291 | ]
292 | },
293 | {
294 | "output_type": "stream",
295 | "stream": "stderr",
296 | "text": [
297 | "\n",
298 | "Fetched 200 tweets"
299 | ]
300 | },
301 | {
302 | "output_type": "stream",
303 | "stream": "stderr",
304 | "text": [
305 | "\n",
306 | "Fetched 200 tweets"
307 | ]
308 | },
309 | {
310 | "output_type": "stream",
311 | "stream": "stderr",
312 | "text": [
313 | "\n",
314 | "Fetched 200 tweets"
315 | ]
316 | },
317 | {
318 | "output_type": "stream",
319 | "stream": "stderr",
320 | "text": [
321 | "\n",
322 | "Fetched 200 tweets"
323 | ]
324 | },
325 | {
326 | "output_type": "stream",
327 | "stream": "stderr",
328 | "text": [
329 | "\n",
330 | "Fetched 200 tweets"
331 | ]
332 | },
333 | {
334 | "output_type": "stream",
335 | "stream": "stderr",
336 | "text": [
337 | "\n",
338 | "Fetched 200 tweets"
339 | ]
340 | },
341 | {
342 | "output_type": "stream",
343 | "stream": "stderr",
344 | "text": [
345 | "\n",
346 | "Fetched 200 tweets"
347 | ]
348 | },
349 | {
350 | "output_type": "stream",
351 | "stream": "stderr",
352 | "text": [
353 | "\n",
354 | "Fetched 200 tweets"
355 | ]
356 | },
357 | {
358 | "output_type": "stream",
359 | "stream": "stderr",
360 | "text": [
361 | "\n",
362 | "Fetched 200 tweets"
363 | ]
364 | },
365 | {
366 | "output_type": "stream",
367 | "stream": "stderr",
368 | "text": [
369 | "\n",
370 | "Fetched 200 tweets"
371 | ]
372 | },
373 | {
374 | "output_type": "stream",
375 | "stream": "stderr",
376 | "text": [
377 | "\n",
378 | "Fetched 200 tweets"
379 | ]
380 | },
381 | {
382 | "output_type": "stream",
383 | "stream": "stderr",
384 | "text": [
385 | "\n",
386 | "Done fetching tweets\n"
387 | ]
388 | }
389 | ],
390 | "prompt_number": 4
391 | },
392 | {
393 | "cell_type": "code",
394 | "collapsed": false,
395 | "input": [],
396 | "language": "python",
397 | "metadata": {},
398 | "outputs": [],
399 | "prompt_number": 3
400 | },
401 | {
402 | "cell_type": "code",
403 | "collapsed": false,
404 | "input": [],
405 | "language": "python",
406 | "metadata": {},
407 | "outputs": []
408 | }
409 | ],
410 | "metadata": {}
411 | }
412 | ]
413 | }
--------------------------------------------------------------------------------
/Dialect Stemming/src/stemming/Stemmer.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Eman Alnkhilan- java code is based on rule based algorithm for Gulf dialect
3 |  * Reads whitespace-separated Arabic words from the file "WLF" (UTF-8), strips common
4 |  * prefixes/suffixes and vowel letters, and writes the resulting stems to "output.txt". */
5 |
6 |
7 | package stemming;
8 |
9 | import java.io.BufferedWriter;
10 | import java.io.File;
11 | import java.io.FileOutputStream;
12 | import java.io.FileWriter;
13 | import java.io.IOException;
14 | import java.io.OutputStreamWriter;
15 | import java.nio.file.Path;
16 | import java.nio.file.Paths;
17 | import java.util.Arrays;
18 | import java.util.Formatter;
19 | import java.util.FormatterClosedException;
20 | import java.util.LinkedHashSet;
21 | import java.util.List;
22 | import java.util.NoSuchElementException;
23 | import java.util.Scanner;
24 | import java.util.Set;
25 |
26 | public class Stemmer {
27 |
28 | public static void main(String[] args) {
29 | // TODO Auto-generated method stub
30 |
31 | //String fin = "files/WLF.txt";
32 |
33 |
34 | Scanner input;
35 | Formatter output;
36 | //String fout = "files/out.txt";
37 | // File newFile = new File("files", "out.txt");
38 | // BufferedWriter output = null;
39 | try{
40 | // newFile.createNewFile();
41 |
42 | //writer.write("Hello world!");
43 | Path p=Paths.get("output.txt").toAbsolutePath();
44 | // FileWriter out =new FileWriter(p.toString());
45 | // out.write("Hello world!");
46 | output = new Formatter(p.toString(), "utf-8"); // UTF-8 writer for the stemmed output file
47 | input = new Scanner(Paths.get("WLF"),"utf-8"); // UTF-8 reader for the input word list "WLF"
48 |
49 | //naw = non-Arabic (loan)words; matching stems are reported to stdout and excluded from further stemming
50 | List naw=Arrays.asList("بشت", "تجوري", "دلاغ" ,"ابجور" ,"أبجور", "دريول", "فريزر" , "بصم" ,"بلكون ", "بخت", "تليفون" , "تلفزيون" ,"خوش" , "روج" , "ساندويش", "سندويش" );// non-Arabic loanwords, listed without prefix & suffix
51 | List stopwordsList = Arrays.asList("ام","أم","بدون" ,"عشرة", "على", "مقابل" ,"حاليا" ,"التى" ,"حوالى" ,"الماضي", // stop-word list — NOTE(review): its use is lost in the garbled loop header below (original line 83ff.)
52 | "انا", "ب", "عند" ,"اذا", "هذا", "امس" ,"ايام", "انها",
53 | "انها", "مافيه", "لان" ,"عندما" ,"احد" ,"هاذا", "السابق" ,"خلال","جميع",
54 | "شي" ,"مافي", "اثر" ,"عليه", "برس" ,"بن", "التي" ,"الذين" ,"الوقت",
55 | "ع", "انا" ,"ا" ,"عاما", "بشكل", "الا" ,"الان" ,"اول",
56 | "له" ,"بكرا", "لانهم", "عن", "حتى", "كم", "اما", "امام" ,"ضمن",
57 | "لها" ,"واذا", "انها" ,"عام", "اعلنت", "وغير" ,"اكد", " الذي" ,"انه",
58 | "لو" ,"ي","ا", "عليها", "باسم" ,"به","اكثر", "الاول", "المقبل",
59 | "اللي" ,"لمن" ,"لماذا", "سنوات", "صباح", "ان" ,"ثلاثة" ,"ذلك", "و",
60 | "ليش" ,"لكل", "مافي", "سنة" ,"شخصا" , "اف" ,"ايضا" ,"بين" ,"الى",
61 | "مثل" ,"لهاذا" , "عشر" ,"تم" ,"اطار" ,"او" ,"الذاتي" ,"دون", "كان","كن",
62 | "مره" ,"راح" ,"عدم", "اعادة" ,"اجل" ,"حيث", "الذى",
63 | "يا" ,"انك" ,"عدة" ,"بعد" ,"اخرى" ,"بها" ,"الثاني" ,"حين", "قوة",
64 | "يقول" ,"ان" , "عدد" ,"ف", "اربعة" ,"اي", "الاخيرة" ,"حول" ,"كما",
65 | "يقولون" ,"بل", "فان" ,"كان" ,"هذه" ,"كانت" ,"في", "من", "لها",
66 | "مع", "واحد" ,"قبل" ,"لدى" ,"وان" ,"واوضح" ,"كل" ,"هو", "يكون",
67 | "مساء" ,"واضاف" ,"قال", "نحو" ,"واكد", "مايو" ,"له", "هي" ,"وقالت",
68 | "هذا" ,"واضافت" ,"وكانت", "وقف" ,"فيها" ,"يمكن" ,"مليار" ,"لوكالة",
69 | "منذ" ,"هناك" ,"فيه", "ومن" , "منها" ,"مليون" ,"لكن" ,"وفي" ,
70 | "وقد" , "وقال" ,"كلم", "وهو" ,"وهي", "يوم" ,"نهاية" ,"لقاء" ,"ومع","مع",
71 | "نفسه" , "وكان" ,"زيارة" , "ثم", "ايار" ,"الاولى" ,"اليوم" ,"هل","اين","متى","أين","تحت","فوق","بين","بينما","فبينما",
72 | "لهذا" ,"بكره" ,"بعض" ,"بان" ,"صفر" ,"الثانية","الف","ما","يا","يومياً","كل","إذا","علي","عليك","عليهم","ماذا","قط",
73 | "عليكم","فجرا","فجر","عشان","علشان","فقط","بإذن","بس","عصر","عصرا","ظهر","ظهرا","الجمعه","الخميس","الاربعاء","الأربعاء","الثلاثاء","الأثنين","الاثنين","الاحد","الأحد","السبت");
74 | List vowels= Arrays.asList ('ا','ي','و'); // Arabic long-vowel letters (alif, yaa, waw)
75 | List nostemming=Arrays.asList("اللهٍ","اللهَ","اللهُ","الله"); // words that must never be stemmed (forms of "Allah")
76 | int lettercount=0; // NOTE(review): unused in the visible code — possibly used in the garbled region below
77 | String word=null;
78 | while (input.hasNextLine()) {
79 | String line = input.nextLine();
80 | if(!line.isEmpty()){
81 | List tokens = Arrays.asList(line.split("\\s")); // split line on single whitespace chars
82 | // if word is <=3 then it is stemmed
83 | for(int i=0; i3 && !nostemming.contains(word)){ // NOTE(review): garbled in this dump ('<' stripped); original lines 83-98 — the loop condition over tokens, stop-word filtering and the assignment to `word` — are missing here
84 | // to remove prefix (longest match first: ولل, لل, وال, بال, ال, وش, اش)
85 | if(word.startsWith("ولل")){
86 | String pre="ولل";
87 | word=word.substring ( pre.length ( ) );
88 | }
89 | else if(word.startsWith("لل")){
90 | String pre="لل";
91 | word=word.substring ( pre.length ( ) );
92 | }
93 | else if(word.startsWith("وال")){
94 | String pre="وال";
95 | word=word.substring ( pre.length ( ) );
96 | }
97 | else if(word.startsWith("بال")){
98 | String pre="بال";
99 | word=word.substring ( pre.length ( ) );
100 | }
101 | else if(word.startsWith("ال")){
102 | String pre="ال";
103 | word=word.substring ( pre.length ( ) );
104 | }
105 | else if( word.startsWith("وش")){
106 | String pre="وش";
107 | word=word.substring ( pre.length ( ) );
108 | }
109 | else if(word.startsWith("اش")){
110 | String pre="اش";
111 | word=word.substring ( pre.length ( ) );
112 | }
113 |
114 |
115 | if(word.length()<=3){ // length <= 3 after prefix removal: treat as root and emit
116 | // System.out.println(word+" "+"هو الجذر");
117 | output.format( word+" "); }
118 | // to remove suffix (longest suffixes tried first)
119 | else if(word.length()>3 ){
120 | if(word.endsWith( "الكم")){
121 | String suf="الكم";
122 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
123 | }
124 | else if(word.endsWith( "ونهم")){
125 | String suf="ونهم";
126 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
127 | }
128 | else if(word.endsWith( "تني")){
129 | String suf="تني";
130 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
131 | }
132 | else if(word.endsWith( "ونه")){
133 | String suf="ونه";
134 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
135 | }
136 | else if(word.endsWith( "الك")){
137 | String suf="الك";
138 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
139 | }
140 | else if(word.endsWith( "تهم")){
141 | String suf="تهم";
142 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
143 | }
144 | else if(word.endsWith( "تهن")){
145 | String suf="تهن";
146 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
147 | }
148 | else if(word.endsWith("ات")){
149 | String suf="ات";
150 | //word=word.substring ( suf.length ( ) );
151 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
152 | }
153 | else if(word.endsWith("ون")){
154 | String suf="ون";
155 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
156 | }
157 |
158 | else if(word.endsWith("كم")){
159 | String suf="كم";
160 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
161 | }
162 | else if(word.endsWith("وك")){
163 | String suf="وك";
164 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
165 | }
166 |
167 | else if(word.endsWith( "ته")){
168 | String suf="ته";
169 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
170 | }
171 | else if(word.endsWith( "هم")){
172 | String suf="هم";
173 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
174 | }
175 |
176 | else if(word.endsWith( "تي")){
177 | String suf="تي";
178 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
179 | }
180 | else if(word.endsWith( "ني")){
181 | String suf="ني";
182 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
183 | }
184 | else if(word.endsWith( "وا")){
185 | String suf="وا";
186 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
187 | }
188 | else if(word.endsWith( "ها")){
189 | String suf="ها";
190 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
191 | }
192 | else if(word.endsWith( "نا")){
193 | String suf="نا";
194 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
195 | }
196 | else if(word.endsWith("ك")){
197 | String suf="ك";
198 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
199 | }
200 | else if(word.endsWith( "ه")){
201 | String suf="ه";
202 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
203 | }
204 | else if(word.endsWith( "ة")){
205 | String suf="ة";
206 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
207 | }
208 |
209 | else if(word.endsWith( "ي")){
210 | String suf="ي";
211 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
212 | }
213 | else if(word.endsWith( "و")){
214 | String suf="و";
215 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
216 | }
217 | else if(word.endsWith( "ت")){
218 | String suf="ت";
219 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
220 | }
221 |
222 | else if(word.endsWith( "اً")){
223 | String suf="اً";
224 | word=word.substring ( 0, word.length ( ) - suf.length ( ) );
225 | }
226 | if (word.length()<=3){ // short enough after suffix removal: root found
227 |
228 | if(naw.contains(word)){// to check the word is Arabic or not
229 | System.out.println(word+" "+"ليست من أصل عربي");
230 | }
231 | else{
232 | // /System.out.println(word+" "+"هو الجذر");
233 | if(vowels.contains(word.charAt(word.length()-1))){ // drop a trailing long vowel before emitting
234 | word = new StringBuilder(word).deleteCharAt(word.length()-1).toString();}
235 | output.format( word+" ");
236 | }
237 | }
238 | if(word.length()>3 ){ // still long: strip a second round of (verbal) prefixes
239 | if(naw.contains(word)){// to check the word is Arabic or not
240 | System.out.println(word+" "+"ليست من أصل عربي");
241 | }
242 | if(!naw.contains(word)) {
243 | if(word.startsWith("با")){
244 | String pre="با";
245 | word=word.substring ( pre.length ( ) );
246 | }
247 | else if(word.startsWith("وا")){
248 | String pre="وا";
249 | word=word.substring ( pre.length ( ) );
250 | }
251 |
252 | else if(word.startsWith("مت")){
253 | String pre="مت";
254 | word=word.substring ( pre.length ( ) );
255 | }
256 | else if(word.startsWith("ان")){
257 | String pre="ان";
258 | word=word.substring ( pre.length ( ) );
259 | }
260 |
261 | else if(word.startsWith("ما")){
262 | String pre="ما";
263 | word=word.substring ( pre.length ( ) );
264 | }
265 | else if(word.startsWith("من")){
266 | String pre="من";
267 | word=word.substring ( pre.length ( ) );
268 | }
269 | else if(word.startsWith("يت")){
270 | String pre="يت";
271 | word=word.substring ( pre.length ( ) );
272 | }
273 | else if(word.startsWith("ا")){
274 | String pre="ا";
275 | word=word.substring ( pre.length ( ) );
276 | }
277 |
278 | else if(word.startsWith("ت")){
279 | String pre="ت";
280 | word=word.substring ( pre.length ( ) );
281 | }
282 | else if(word.startsWith("ي")){
283 | String pre="ي";
284 | word=word.substring ( pre.length ( ) );
285 | }
286 | else if(word.startsWith("م")){
287 | String pre="م";
288 | word=word.substring ( pre.length ( ) );
289 | }
290 |
291 | if (word.length()<=3){
292 | // System.out.println(word+" "+"هو الجذر");
293 | output.format( word+" ");
294 | }
295 | if(word.length()>3 ){ // handle infix patterns of shape X_ت (e.g. م_ت)
296 |
297 | char[] ch=word.toCharArray();
298 |
299 | if(ch[0]=='م'&& ch[2]=='ت'){ // drop 1st and 3rd letters; the second deleteCharAt(1) targets the original 3rd char after the first removal shifts indices
300 |
301 | word = new StringBuilder(word).deleteCharAt(0).toString();
302 | word = new StringBuilder(word).deleteCharAt(1).toString();
303 | }
304 | else if(ch[0]=='ا'&& ch[2]=='ت'){
305 | word = new StringBuilder(word).deleteCharAt(0).toString();
306 | word = new StringBuilder(word).deleteCharAt(1).toString(); }
307 | else if(ch[0]=='ن'&& ch[2]=='ت'){
308 | word = new StringBuilder(word).deleteCharAt(0).toString();
309 | word = new StringBuilder(word).deleteCharAt(1).toString(); }
310 | else if(ch[0]=='ت'&& ch[2]=='ت'){
311 | word = new StringBuilder(word).deleteCharAt(0).toString();
312 | word = new StringBuilder(word).deleteCharAt(1).toString();
313 | }
314 |
315 | if (word.length()<=3){
316 | // System.out.println(word+" "+"هو الجذر");
317 | output.format( word+" ");
318 | }
319 | // System.out.println(word);
320 | if(word.length()>3){
321 |
322 | /*
323 | to remove vowels from a word: if we have a vowel with non-vowel neighbours then
324 | delete it.
325 | If we have two consecutive vowel letters then we have to delete one of them according to the following
326 | order (ا then و and then ي)
327 |
328 | */
329 | for(int j=1; j set=new LinkedHashSet(); // NOTE(review): garbled in this dump ('<' stripped); original lines 345-390 — the vowel-removal loop body and the Set<Character> declaration — are fused/lost here
330 | for(char c:word.toCharArray()) // deduplicate letters, preserving first-occurrence order (LinkedHashSet)
331 | {
332 | set.add(Character.valueOf(c));
333 | }
334 | StringBuilder sb = new StringBuilder(); // rebuild the word from the unique letters in order
335 | for (Character character : set) {
336 | sb.append(character);
337 | }
338 | word=sb.toString();
339 | //System.out.println(word);
340 | output.format(word+" ");
341 |
342 |
343 | }
344 | }
345 | }
346 | }
347 | }
348 | }
349 |
350 | }
351 |
352 | }
353 | }
354 | output.format( "\n"); // newline after each stemmed input line
355 | }
356 | input.close();
357 | output.close();
358 | }
359 | catch (IOException ioException){
360 | System.err.println(ioException.getMessage());
361 | System.exit(-1);
362 | } catch(NoSuchElementException excp) {
363 | System.err.println(excp.getMessage());
364 | } catch (FormatterClosedException excp){
365 | System.err.println(excp.getMessage());
366 | } catch (IllegalStateException excp){
367 | System.err.println(excp.getMessage());
368 | }
369 |
370 | }
371 |
372 | }
434 |
--------------------------------------------------------------------------------
/Collecting Twitter Data/Python/SearchWord.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": ""
4 | },
5 | "nbformat": 3,
6 | "nbformat_minor": 0,
7 | "worksheets": [
8 | {
9 | "cells": [
10 | {
11 | "cell_type": "code",
12 | "collapsed": false,
13 | "input": [
14 | "\n",
15 | "import sys\n",
16 | "import datetime\n",
17 | "import time\n",
18 | "import twitter\n",
19 | "\n",
20 | "def oauth_login():\n",
21 | " #This code is taken from https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition \n",
22 | " # XXX: Go to http://twitter.com/apps/new to create an app and get values\n",
23 | " # for these credentials that you'll need to provide in place of these\n",
24 | " # empty string values that are defined as placeholders.\n",
25 | " # See https://dev.twitter.com/docs/auth/oauth for more information \n",
26 | " # on Twitter's OAuth implementation.\n",
27 | " \n",
28 | " CONSUMER_KEY = ''\n",
29 | " CONSUMER_SECRET = ''\n",
30 | " OAUTH_TOKEN = ''\n",
31 | " OAUTH_TOKEN_SECRET = ''\n",
32 | " \n",
33 | " auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,\n",
34 | " CONSUMER_KEY, CONSUMER_SECRET)\n",
35 | " \n",
36 | " twitter_api = twitter.Twitter(auth=auth)\n",
37 | " return twitter_api\n",
38 | "\n",
39 | "def twitter_search(twitter_api, q, max_results, **kw):\n",
40 | "\n",
41 | " # See https://dev.twitter.com/docs/api/1.1/get/search/tweets and \n",
42 | " # https://dev.twitter.com/docs/using-search for details on advanced \n",
43 | " # search criteria that may be useful for keyword arguments\n",
44 | " \n",
45 | " # See https://dev.twitter.com/docs/api/1.1/get/search/tweets \n",
46 | " search_results = twitter_api.search.tweets(q=q, count=10000, **kw)\n",
47 | " \n",
48 | " statuses = search_results['statuses']\n",
49 | " print len(statuses)\n",
50 | " \n",
51 | " # Iterate through batches of results by following the cursor until we\n",
52 | " # reach the desired number of results, keeping in mind that OAuth users\n",
53 | " # can \"only\" make 180 search queries per 15-minute interval. See\n",
54 | " # https://dev.twitter.com/docs/rate-limiting/1.1/limits\n",
55 | " # for details. A reasonable number of results is ~1000, although\n",
56 | " # that number of results may not exist for all queries.\n",
57 | " \n",
58 |     "    # NOTE: this overrides the max_results argument with a hard cap of 10000\n",
59 | " max_results = 10000 #min(100000, max_results)\n",
60 | " \n",
61 |     "    for _ in range(10000): # up to 10,000 paging requests\n",
62 | " try:\n",
63 | " next_results = search_results['search_metadata']['next_results']\n",
64 | " except KeyError, e: # No more results when next_results doesn't exist\n",
65 | " print len(statuses)\n",
66 | " break\n",
67 | " \n",
68 | " # Create a dictionary from next_results, which has the following form:\n",
69 | " # ?max_id=313519052523986943&q=NCAA&include_entities=1\n",
70 | " kwargs = dict([ kv.split('=') \n",
71 | " for kv in next_results[1:].split(\"&\") ])\n",
72 | " \n",
73 | " print len(statuses)\n",
74 | " search_results = twitter_api.search.tweets(**kwargs)\n",
75 | " statuses += search_results['statuses']\n",
76 | " \n",
77 | " if len(statuses) > max_results: \n",
78 | " break\n",
79 | " print len(statuses)\n",
80 | " \n",
81 | " return statuses\n",
82 | "\n"
83 | ],
84 | "language": "python",
85 | "metadata": {},
86 | "outputs": [],
87 | "prompt_number": 6
88 | },
89 | {
90 | "cell_type": "code",
91 | "collapsed": false,
92 | "input": [
93 | "import io, json\n",
94 | "\n",
95 | "def save_json(filename, data):\n",
96 | " with io.open(filename, \n",
97 | " 'w', encoding='utf-8') as f:\n",
98 | " f.write(unicode(json.dumps(data, ensure_ascii=False)))\n",
99 | "\n",
100 | "def load_json(filename):\n",
101 | " with io.open(filename, \n",
102 | " encoding='utf-8') as f:\n",
103 | " return f.read()\n",
104 | "\n",
105 | "# Sample usage\n",
106 | "\n",
107 | "q = '%D8%A7%D9%84%D8%AA%D8%B9%D9%84%D9%8A%D9%85'\n",
108 | "twitter_api = oauth_login()\n",
109 | "results = twitter_search(twitter_api, q, max_results=10000)\n",
110 | "\n",
111 | "\n",
112 | "save_json('Education.json', results)\n"
113 | ],
114 | "language": "python",
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "output_type": "stream",
119 | "stream": "stdout",
120 | "text": [
121 | "100\n",
122 | "100\n",
123 | "187"
124 | ]
125 | },
126 | {
127 | "output_type": "stream",
128 | "stream": "stdout",
129 | "text": [
130 | "\n",
131 | "187\n"
132 | ]
133 | }
134 | ],
135 | "prompt_number": 5
136 | },
137 | {
138 | "cell_type": "code",
139 | "collapsed": false,
140 | "input": [
141 | " "
142 | ],
143 | "language": "python",
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "output_type": "stream",
148 | "stream": "stdout",
149 | "text": [
150 | "100\n",
151 | "100\n",
152 | "200"
153 | ]
154 | },
155 | {
156 | "output_type": "stream",
157 | "stream": "stdout",
158 | "text": [
159 | "\n",
160 | "300"
161 | ]
162 | },
163 | {
164 | "output_type": "stream",
165 | "stream": "stdout",
166 | "text": [
167 | "\n",
168 | "400"
169 | ]
170 | },
171 | {
172 | "output_type": "stream",
173 | "stream": "stdout",
174 | "text": [
175 | "\n",
176 | "500"
177 | ]
178 | },
179 | {
180 | "output_type": "stream",
181 | "stream": "stdout",
182 | "text": [
183 | "\n",
184 | "600"
185 | ]
186 | },
187 | {
188 | "output_type": "stream",
189 | "stream": "stdout",
190 | "text": [
191 | "\n",
192 | "700"
193 | ]
194 | },
195 | {
196 | "output_type": "stream",
197 | "stream": "stdout",
198 | "text": [
199 | "\n",
200 | "800"
201 | ]
202 | },
203 | {
204 | "output_type": "stream",
205 | "stream": "stdout",
206 | "text": [
207 | "\n",
208 | "900"
209 | ]
210 | },
211 | {
212 | "output_type": "stream",
213 | "stream": "stdout",
214 | "text": [
215 | "\n",
216 | "1000"
217 | ]
218 | },
219 | {
220 | "output_type": "stream",
221 | "stream": "stdout",
222 | "text": [
223 | "\n",
224 | "1100"
225 | ]
226 | },
227 | {
228 | "output_type": "stream",
229 | "stream": "stdout",
230 | "text": [
231 | "\n",
232 | "1200"
233 | ]
234 | },
235 | {
236 | "output_type": "stream",
237 | "stream": "stdout",
238 | "text": [
239 | "\n",
240 | "1300"
241 | ]
242 | },
243 | {
244 | "output_type": "stream",
245 | "stream": "stdout",
246 | "text": [
247 | "\n",
248 | "1400"
249 | ]
250 | },
251 | {
252 | "output_type": "stream",
253 | "stream": "stdout",
254 | "text": [
255 | "\n",
256 | "1500"
257 | ]
258 | },
259 | {
260 | "output_type": "stream",
261 | "stream": "stdout",
262 | "text": [
263 | "\n",
264 | "1600"
265 | ]
266 | },
267 | {
268 | "output_type": "stream",
269 | "stream": "stdout",
270 | "text": [
271 | "\n",
272 | "1700"
273 | ]
274 | },
275 | {
276 | "output_type": "stream",
277 | "stream": "stdout",
278 | "text": [
279 | "\n",
280 | "1800"
281 | ]
282 | },
283 | {
284 | "output_type": "stream",
285 | "stream": "stdout",
286 | "text": [
287 | "\n",
288 | "1900"
289 | ]
290 | },
291 | {
292 | "output_type": "stream",
293 | "stream": "stdout",
294 | "text": [
295 | "\n",
296 | "2000"
297 | ]
298 | },
299 | {
300 | "output_type": "stream",
301 | "stream": "stdout",
302 | "text": [
303 | "\n",
304 | "2100"
305 | ]
306 | },
307 | {
308 | "output_type": "stream",
309 | "stream": "stdout",
310 | "text": [
311 | "\n",
312 | "2200"
313 | ]
314 | },
315 | {
316 | "output_type": "stream",
317 | "stream": "stdout",
318 | "text": [
319 | "\n",
320 | "2300"
321 | ]
322 | },
323 | {
324 | "output_type": "stream",
325 | "stream": "stdout",
326 | "text": [
327 | "\n",
328 | "2400"
329 | ]
330 | },
331 | {
332 | "output_type": "stream",
333 | "stream": "stdout",
334 | "text": [
335 | "\n",
336 | "2500"
337 | ]
338 | },
339 | {
340 | "output_type": "stream",
341 | "stream": "stdout",
342 | "text": [
343 | "\n",
344 | "2600"
345 | ]
346 | },
347 | {
348 | "output_type": "stream",
349 | "stream": "stdout",
350 | "text": [
351 | "\n",
352 | "2700"
353 | ]
354 | },
355 | {
356 | "output_type": "stream",
357 | "stream": "stdout",
358 | "text": [
359 | "\n",
360 | "2800"
361 | ]
362 | },
363 | {
364 | "output_type": "stream",
365 | "stream": "stdout",
366 | "text": [
367 | "\n",
368 | "2900"
369 | ]
370 | },
371 | {
372 | "output_type": "stream",
373 | "stream": "stdout",
374 | "text": [
375 | "\n",
376 | "3000"
377 | ]
378 | },
379 | {
380 | "output_type": "stream",
381 | "stream": "stdout",
382 | "text": [
383 | "\n",
384 | "3100"
385 | ]
386 | },
387 | {
388 | "output_type": "stream",
389 | "stream": "stdout",
390 | "text": [
391 | "\n",
392 | "3200"
393 | ]
394 | },
395 | {
396 | "output_type": "stream",
397 | "stream": "stdout",
398 | "text": [
399 | "\n",
400 | "3300"
401 | ]
402 | },
403 | {
404 | "output_type": "stream",
405 | "stream": "stdout",
406 | "text": [
407 | "\n",
408 | "3400"
409 | ]
410 | },
411 | {
412 | "output_type": "stream",
413 | "stream": "stdout",
414 | "text": [
415 | "\n",
416 | "3500"
417 | ]
418 | },
419 | {
420 | "output_type": "stream",
421 | "stream": "stdout",
422 | "text": [
423 | "\n",
424 | "3600"
425 | ]
426 | },
427 | {
428 | "output_type": "stream",
429 | "stream": "stdout",
430 | "text": [
431 | "\n",
432 | "3700"
433 | ]
434 | },
435 | {
436 | "output_type": "stream",
437 | "stream": "stdout",
438 | "text": [
439 | "\n",
440 | "3800"
441 | ]
442 | },
443 | {
444 | "output_type": "stream",
445 | "stream": "stdout",
446 | "text": [
447 | "\n",
448 | "3900"
449 | ]
450 | },
451 | {
452 | "output_type": "stream",
453 | "stream": "stdout",
454 | "text": [
455 | "\n",
456 | "4000"
457 | ]
458 | },
459 | {
460 | "output_type": "stream",
461 | "stream": "stdout",
462 | "text": [
463 | "\n",
464 | "4100"
465 | ]
466 | },
467 | {
468 | "output_type": "stream",
469 | "stream": "stdout",
470 | "text": [
471 | "\n",
472 | "4200"
473 | ]
474 | },
475 | {
476 | "output_type": "stream",
477 | "stream": "stdout",
478 | "text": [
479 | "\n",
480 | "4300"
481 | ]
482 | },
483 | {
484 | "output_type": "stream",
485 | "stream": "stdout",
486 | "text": [
487 | "\n",
488 | "4400"
489 | ]
490 | },
491 | {
492 | "output_type": "stream",
493 | "stream": "stdout",
494 | "text": [
495 | "\n",
496 | "4500"
497 | ]
498 | },
499 | {
500 | "output_type": "stream",
501 | "stream": "stdout",
502 | "text": [
503 | "\n",
504 | "4600"
505 | ]
506 | },
507 | {
508 | "output_type": "stream",
509 | "stream": "stdout",
510 | "text": [
511 | "\n",
512 | "4700"
513 | ]
514 | },
515 | {
516 | "output_type": "stream",
517 | "stream": "stdout",
518 | "text": [
519 | "\n",
520 | "4800"
521 | ]
522 | },
523 | {
524 | "output_type": "stream",
525 | "stream": "stdout",
526 | "text": [
527 | "\n",
528 | "4900"
529 | ]
530 | },
531 | {
532 | "output_type": "stream",
533 | "stream": "stdout",
534 | "text": [
535 | "\n",
536 | "5000"
537 | ]
538 | },
539 | {
540 | "output_type": "stream",
541 | "stream": "stdout",
542 | "text": [
543 | "\n",
544 | "5100"
545 | ]
546 | },
547 | {
548 | "output_type": "stream",
549 | "stream": "stdout",
550 | "text": [
551 | "\n",
552 | "5200"
553 | ]
554 | },
555 | {
556 | "output_type": "stream",
557 | "stream": "stdout",
558 | "text": [
559 | "\n",
560 | "5300"
561 | ]
562 | },
563 | {
564 | "output_type": "stream",
565 | "stream": "stdout",
566 | "text": [
567 | "\n",
568 | "5400"
569 | ]
570 | },
571 | {
572 | "output_type": "stream",
573 | "stream": "stdout",
574 | "text": [
575 | "\n",
576 | "5500"
577 | ]
578 | },
579 | {
580 | "output_type": "stream",
581 | "stream": "stdout",
582 | "text": [
583 | "\n",
584 | "5600"
585 | ]
586 | },
587 | {
588 | "output_type": "stream",
589 | "stream": "stdout",
590 | "text": [
591 | "\n",
592 | "5700"
593 | ]
594 | },
595 | {
596 | "output_type": "stream",
597 | "stream": "stdout",
598 | "text": [
599 | "\n",
600 | "5800"
601 | ]
602 | },
603 | {
604 | "output_type": "stream",
605 | "stream": "stdout",
606 | "text": [
607 | "\n",
608 | "5900"
609 | ]
610 | },
611 | {
612 | "output_type": "stream",
613 | "stream": "stdout",
614 | "text": [
615 | "\n",
616 | "6000"
617 | ]
618 | },
619 | {
620 | "output_type": "stream",
621 | "stream": "stdout",
622 | "text": [
623 | "\n",
624 | "6100"
625 | ]
626 | },
627 | {
628 | "output_type": "stream",
629 | "stream": "stdout",
630 | "text": [
631 | "\n",
632 | "6200"
633 | ]
634 | },
635 | {
636 | "output_type": "stream",
637 | "stream": "stdout",
638 | "text": [
639 | "\n",
640 | "6300"
641 | ]
642 | },
643 | {
644 | "output_type": "stream",
645 | "stream": "stdout",
646 | "text": [
647 | "\n",
648 | "6400"
649 | ]
650 | },
651 | {
652 | "output_type": "stream",
653 | "stream": "stdout",
654 | "text": [
655 | "\n",
656 | "6500"
657 | ]
658 | },
659 | {
660 | "output_type": "stream",
661 | "stream": "stdout",
662 | "text": [
663 | "\n",
664 | "6600"
665 | ]
666 | },
667 | {
668 | "output_type": "stream",
669 | "stream": "stdout",
670 | "text": [
671 | "\n",
672 | "6700"
673 | ]
674 | },
675 | {
676 | "output_type": "stream",
677 | "stream": "stdout",
678 | "text": [
679 | "\n",
680 | "6800"
681 | ]
682 | },
683 | {
684 | "output_type": "stream",
685 | "stream": "stdout",
686 | "text": [
687 | "\n",
688 | "6900"
689 | ]
690 | },
691 | {
692 | "output_type": "stream",
693 | "stream": "stdout",
694 | "text": [
695 | "\n",
696 | "7000"
697 | ]
698 | },
699 | {
700 | "output_type": "stream",
701 | "stream": "stdout",
702 | "text": [
703 | "\n",
704 | "7100"
705 | ]
706 | },
707 | {
708 | "output_type": "stream",
709 | "stream": "stdout",
710 | "text": [
711 | "\n",
712 | "7200"
713 | ]
714 | },
715 | {
716 | "output_type": "stream",
717 | "stream": "stdout",
718 | "text": [
719 | "\n",
720 | "7300"
721 | ]
722 | },
723 | {
724 | "output_type": "stream",
725 | "stream": "stdout",
726 | "text": [
727 | "\n",
728 | "7400"
729 | ]
730 | },
731 | {
732 | "output_type": "stream",
733 | "stream": "stdout",
734 | "text": [
735 | "\n",
736 | "7500"
737 | ]
738 | },
739 | {
740 | "output_type": "stream",
741 | "stream": "stdout",
742 | "text": [
743 | "\n",
744 | "7600"
745 | ]
746 | },
747 | {
748 | "output_type": "stream",
749 | "stream": "stdout",
750 | "text": [
751 | "\n",
752 | "7700"
753 | ]
754 | },
755 | {
756 | "output_type": "stream",
757 | "stream": "stdout",
758 | "text": [
759 | "\n",
760 | "7800"
761 | ]
762 | },
763 | {
764 | "output_type": "stream",
765 | "stream": "stdout",
766 | "text": [
767 | "\n",
768 | "7900"
769 | ]
770 | },
771 | {
772 | "output_type": "stream",
773 | "stream": "stdout",
774 | "text": [
775 | "\n",
776 | "8000"
777 | ]
778 | },
779 | {
780 | "output_type": "stream",
781 | "stream": "stdout",
782 | "text": [
783 | "\n",
784 | "8100"
785 | ]
786 | },
787 | {
788 | "output_type": "stream",
789 | "stream": "stdout",
790 | "text": [
791 | "\n",
792 | "8200"
793 | ]
794 | },
795 | {
796 | "output_type": "stream",
797 | "stream": "stdout",
798 | "text": [
799 | "\n",
800 | "8300"
801 | ]
802 | },
803 | {
804 | "output_type": "stream",
805 | "stream": "stdout",
806 | "text": [
807 | "\n",
808 | "8400"
809 | ]
810 | },
811 | {
812 | "output_type": "stream",
813 | "stream": "stdout",
814 | "text": [
815 | "\n",
816 | "8500"
817 | ]
818 | },
819 | {
820 | "output_type": "stream",
821 | "stream": "stdout",
822 | "text": [
823 | "\n",
824 | "8600"
825 | ]
826 | },
827 | {
828 | "output_type": "stream",
829 | "stream": "stdout",
830 | "text": [
831 | "\n",
832 | "8700"
833 | ]
834 | },
835 | {
836 | "output_type": "stream",
837 | "stream": "stdout",
838 | "text": [
839 | "\n",
840 | "8800"
841 | ]
842 | },
843 | {
844 | "output_type": "stream",
845 | "stream": "stdout",
846 | "text": [
847 | "\n",
848 | "8900"
849 | ]
850 | },
851 | {
852 | "output_type": "stream",
853 | "stream": "stdout",
854 | "text": [
855 | "\n",
856 | "9000"
857 | ]
858 | },
859 | {
860 | "output_type": "stream",
861 | "stream": "stdout",
862 | "text": [
863 | "\n",
864 | "9100"
865 | ]
866 | },
867 | {
868 | "output_type": "stream",
869 | "stream": "stdout",
870 | "text": [
871 | "\n",
872 | "9200"
873 | ]
874 | },
875 | {
876 | "output_type": "stream",
877 | "stream": "stdout",
878 | "text": [
879 | "\n",
880 | "9300"
881 | ]
882 | },
883 | {
884 | "output_type": "stream",
885 | "stream": "stdout",
886 | "text": [
887 | "\n",
888 | "9400"
889 | ]
890 | },
891 | {
892 | "output_type": "stream",
893 | "stream": "stdout",
894 | "text": [
895 | "\n",
896 | "9500"
897 | ]
898 | },
899 | {
900 | "output_type": "stream",
901 | "stream": "stdout",
902 | "text": [
903 | "\n",
904 | "9600"
905 | ]
906 | },
907 | {
908 | "output_type": "stream",
909 | "stream": "stdout",
910 | "text": [
911 | "\n",
912 | "9700"
913 | ]
914 | },
915 | {
916 | "output_type": "stream",
917 | "stream": "stdout",
918 | "text": [
919 | "\n",
920 | "9800"
921 | ]
922 | },
923 | {
924 | "output_type": "stream",
925 | "stream": "stdout",
926 | "text": [
927 | "\n",
928 | "9900"
929 | ]
930 | },
931 | {
932 | "output_type": "stream",
933 | "stream": "stdout",
934 | "text": [
935 | "\n",
936 | "10000"
937 | ]
938 | },
939 | {
940 | "output_type": "stream",
941 | "stream": "stdout",
942 | "text": [
943 | "\n",
944 | "10100"
945 | ]
946 | },
947 | {
948 | "ename": "TypeError",
949 | "evalue": "object of type 'NoneType' has no len()",
950 | "output_type": "pyerr",
951 | "traceback": [
952 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
953 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 33\u001b[1;33m \u001b[0mget_time_series_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mq\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'time-series'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'twitter_world_trends'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
954 | "\u001b[1;32m\u001b[0m in \u001b[0;36mget_time_series_data\u001b[1;34m(q, mongo_db_name, mongo_db_coll, secs_per_interval, max_intervals, **mongo_conn_kw)\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msave_json\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'SmileOct8'\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0minter\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;34m'.json'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 18\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[1;33m>>\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"Write {0} trends\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mids\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 19\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[1;33m>>\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"Zzz...\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[1;33m>>\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
955 | "\u001b[1;31mTypeError\u001b[0m: object of type 'NoneType' has no len()"
956 | ]
957 | },
958 | {
959 | "output_type": "stream",
960 | "stream": "stdout",
961 | "text": [
962 | "\n"
963 | ]
964 | }
965 | ],
966 | "prompt_number": 3
967 | },
968 | {
969 | "cell_type": "code",
970 | "collapsed": false,
971 | "input": [],
972 | "language": "python",
973 | "metadata": {},
974 | "outputs": []
975 | }
976 | ],
977 | "metadata": {}
978 | }
979 | ]
980 | }
--------------------------------------------------------------------------------