├── Dialect Stemming ├── WLF ├── FNAW ├── bin │ └── stemming │ │ └── Stemmer.class ├── .classpath ├── .project ├── .settings │ └── org.eclipse.jdt.core.prefs ├── output.txt └── src │ └── stemming │ └── Stemmer.java ├── Collecting Twitter Data ├── R │ └── R workshopKSU.R └── Python │ ├── UserTweets.ipynb │ └── SearchWord.ipynb ├── README.md └── Cleaning Code └── Main (1).java /Dialect Stemming/WLF: -------------------------------------------------------------------------------- 1 | This file is used to store the users tweets 2 | -------------------------------------------------------------------------------- /Dialect Stemming/FNAW: -------------------------------------------------------------------------------- 1 | ببشت تجوري دلاغات ابجورات دريول فريزر بصمة بصمه بلكونة بلكونه بخت بوفيه تليفون تلفزيون خوش روج ساندويش سندويش شاليه -------------------------------------------------------------------------------- /Dialect Stemming/bin/stemming/Stemmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArwaAlrazooq/Arabic-twitter-analysis/HEAD/Dialect Stemming/bin/stemming/Stemmer.class -------------------------------------------------------------------------------- /Dialect Stemming/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /Dialect Stemming/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Dialect Stemming 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /Dialect Stemming/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | 
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /Collecting Twitter Data/R/R workshopKSU.R: -------------------------------------------------------------------------------- 1 | #install.packages("ROAuth", "RCurl", "stringi", "stringi") 2 | library(ROAuth) # For oauth 3 | library(RCurl) 4 | library(stringi) # for use stri_trans_nfkc 5 | library(streamR) #For opens a connection to Twitter's Streaming API using filterStream function 6 | library(rJava) 7 | library(xlsx) #load the xlsx package 8 | 9 | ## PART 1: Declare Twitter API Credentials & Create Handshake 10 | requestURL = "https://api.twitter.com/oauth/request_token" 11 | accessURL = "https://api.twitter.com/oauth/access_token" 12 | authURL = " https://api.twitter.com/oauth/authorize" 13 | consumerKey = "" # From dev.twitter.com 14 | consumerSecret = "" # From dev.twitter.com 15 | 16 | my_oauth = OAuthFactory$new(consumerKey = consumerKey, 17 | consumerSecret = consumerSecret, 18 | requestURL = requestURL, 19 | accessURL = accessURL, 20 | authURL = authURL) 21 | 22 | #Save the my_oauth data to an .Rdata file 23 | save(my_oauth, file = "my_oauth.Rdata") 24 | load("my_oauth.Rdata") 25 | 26 | my_oauth$handshake(curl=getCurlHandle(), browseUrl=FALSE) 27 | 28 | ## PART 2: collect tweets from twitter streaming API. 
29 | #Add as unicode of Arabic word #To get unicode of a word using this url http://r12a.github.io/apps/conversion/ 30 | 31 | keyword= stri_trans_nfkc('\u0627\u0644\u0647\u0644\u0627\u0644') 32 | 33 | filterStream(oauth=my_oauth, # Save tweets in a json file 34 | timeout = 120, track = keyword, #(timeout) in a second 35 | file.name = "tweets.json") # Use my_oauth file as the OAuth credentials // 36 | 37 | tweetsdf = parseTweets("tweets.json", simplify = TRUE) # parse the json file and save to a data frame called tweets.df 38 | View(tweetsdf) 39 | 40 | #save xlsx file 41 | write.xlsx(tweetsdf , "tweetsTest1.xlsx",sheetName = "tweets", row.names = FALSE) 42 | 43 | #save csv file 44 | write.csv(tweetsdf, "tweetsTest.csv") 45 | 46 | 47 | ################## 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Arabic Twitter Analysis 2 | 3 | #
المقدمة
4 | 5 |
6 | مع تطور الهواتف والأجهزة المحمولة وتوفر اتصال الانترنت عند غالبية الناس تزايد عدد المستخدمين للإنترنت مما ولد كمية كبيرة من البيانات. من أكثر المواقع التي يقوم مستخدمي الإنترنت بزيارتها هي مواقع التواصل الاجتماعي مثل تويتر وفيس بوك و انستجرام وغيرها الكثير والتي أصبحت جزء لا يتجزأ من حياة الأغلبية. لم يقتصر استخدام هذه الشبكات الاجتماعية على الأفراد وإنما نجد الكثير من الحسابات الرسمية والموثقة للجهات الحكومية والشركات والمستشفيات وهلم جرا. يختلف استخدام هذه الشبكات بحسب المستخدم الذي يقوم باستخدامها فعلى مستوى الأفراد يقوم الكثير منهم بالتعبير عن مشاعره ومشاركة يومياته ومتابعة أخبار أصدقائه. ومن جهة أخرى نجد أن المؤسسات قد تستخدمها لنشر أخبارها والتواصل مع عملائها وتلبية طلباتهم والإجابة عن استفساراتهم كما أن بعض الشركات التجارية خصيصًا تستخدم هذه الشبكات لتسويق منتجاتها ومعرفة آراء عملائها ومستوى رضاهم عن الخدمات التي تقدمها خصوصًا أن هذه المنصات هي شبكات اجتماعية تؤثر فيها آراء بعض المستخدمين على المستخدمين الآخرين. تقوم هذه الشبكات بجمع آراء الأشخاص عن منتجاتهم وتحليلها لمعرفة المشاكل وإصلاحها ومعرفة المميزات ودعمها. يعبر الأشخاص غالبًا عن آرائهم ومشاعرهم باستخدام النص المكتوب وبعض الوجوه التعبيرية لإيصال الفكرة بشكل تام ويسهل على البشر فهم النص المكتوب وما يرمي إليه الكاتب وما هي مشاعره الموجهة من خلال هذا النص. ولكن عندما تكون البيانات التي تم جمعها وتحتاج إلى تحليل كبيرة يصعب أداء هذه المهمة بواسطة البشر حيث تعتبر مهمة مكلفة للوقت والجهد بمقابل أن عمليات التسويق وقضايا الرأي العام والتي تحتاج إلى تحليل آراء الأشخاص غالبًا هنالك حاجة لعملها بصورة سريعة. 7 |
8 | 9 | #
ماذا ستقدم هذه المدونة؟
10 |
11 | في هذه الصفحة سنتطرق ونتعرف على كيفية تحليل البيانات التي تم الحصول عليها من منصة التواصل الاجتماعي تويتر 12 | والتي يمكن استخدامها لدراسة مجالات عديدة كدراسة تصرفات معينة لمجموعة من الأشخاص أو علم تحليل المشاعر: وهي عملية تحليلية باستخدام معالجة اللغة الطبيعية وغيرها
13 | 14 | #
الخطوات
15 |
16 | لتطبيق هذه العملية سيقوم الباحث بالمرور بثلاث خطوات رئيسة وهي: 17 |
18 | 19 | 20 | ![](https://i.imgur.com/BzZB9bS.png) 21 |
22 | اتجه إلى صفحة Wiki للتعرف على جميع الخطوات

24 | حيث يوجد فيها معلومات مفصلة عن 25 |

26 | 27 | 1. [Collecting Data](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Collecting-Data) 28 | 29 | تبين الطرق والادوات المطلوبه لسحب البيانات من تويتر 30 | 31 | 2. [Data Preprocessing](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Twitter-Data-Preprocessing) 32 | 33 | تختص بعمليه تنظيف البيانات وتجهيزها لعمليه التحليل 34 | 35 | 3. [Data Analysis](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Data-Analysis) 36 | 37 | تشرح عمليه التحليل باستخدام Weka 38 | 39 | 4. [Arabic Word Preprocessing](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Arabic-Word-Preprocessing) 40 | 41 | 5. [Arabic Sentiment Lexicons](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Arabic-Sentiment-Lexicons) 42 | 43 | 6. [Case Studies](https://github.com/ArwaAlrazooq/Arabic-twitter-analysis/wiki/Case-Studies) 44 | 45 |
46 | 47 | 48 |
49 | 50 | #
هذه الصفحة تمت كتابتها بواسطة
51 |
أروى الرزوق
@Cla22ire
52 |
روان المعثم
@rawan_almatham
53 |
خلود الغامدي
@KhuloodAlGhamdi
54 |
ايمان النخيلان
@EmanAlnkhilan
55 |
بإشراف الدكتورة: منى الرزقان
@Munas02
56 | -------------------------------------------------------------------------------- /Cleaning Code/Main (1).java: -------------------------------------------------------------------------------- 1 | package test; 2 | 3 | import java.io.IOException; 4 | import java.io.UnsupportedEncodingException; 5 | import java.nio.file.Path; 6 | import java.nio.file.Paths; 7 | import java.util.Arrays; 8 | import java.util.Formatter; 9 | import java.util.FormatterClosedException; 10 | import java.util.List; 11 | import java.util.NoSuchElementException; 12 | import java.util.Scanner; 13 | import java.util.regex.Matcher; 14 | import java.util.regex.Pattern; 15 | 16 | 17 | public class Main { 18 | 19 | public static String text; 20 | public static void main(String[] args) throws IOException { 21 | // TODO Auto-generated method stub 22 | Formatter output; 23 | 24 | try{ 25 | Path p=Paths.get("out.txt").toAbsolutePath(); 26 | output = new Formatter(p.toString(), "utf-8"); 27 | 28 | Scanner input = new Scanner(Paths.get("input.txt"),"utf-8"); 29 | 30 | while (input.hasNextLine()) { 31 | String line = input.nextLine(); 32 | //String lineout = new String(); 33 | // List filtered = Arrays.asList(line.split("\\s")); 34 | 35 | line = line.replaceAll("((www\\.[^\\s]+)|(https?://[^\\s]+))", " ");// remove url 36 | 37 | //remove user names 38 | line = line.replaceAll("@[^\\s]+", " ");// remove @ 39 | 40 | //remove # from hash tag 41 | line = line.replaceAll("#[^\\s]+", ""); 42 | 43 | // remove # 44 | 45 | 46 | line = line.replaceAll("\\p{Punct}+", " ");//remove punctuation 47 | line = line.replaceAll("RT", " "); 48 | 49 | line = Main.removeEmojiAndSymbol(line); 50 | 51 | 52 | output.format("%s\n",line); 53 | } 54 | 55 | 56 | input.close(); 57 | output.close(); 58 | // System.out.println(text); 59 | } 60 | catch(NoSuchElementException excp) { 61 | System.err.println(excp.getMessage()); 62 | } catch (FormatterClosedException excp){ 63 | System.err.println(excp.getMessage()); 64 | } catch 
(IllegalStateException excp){ 65 | System.err.println(excp.getMessage()); 66 | } 67 | 68 | 69 | } 70 | // removeEmojiAndSymbol function to remove emojis or you can use emoji4j library 71 | public static String removeEmojiAndSymbol( String content) { 72 | String utf8tweet = ""; 73 | try { 74 | byte[] utf8Bytes = content.getBytes( "UTF-8"); 75 | 76 | utf8tweet = new String( utf8Bytes, "UTF-8"); 77 | } catch (UnsupportedEncodingException e ) { 78 | e.printStackTrace(); 79 | } 80 | Pattern unicodeOutliers = Pattern.compile( 81 | 82 | "(?:[\\u2700-\\u27bf]|" + 83 | 84 | "(?:[\\ud83c\\udde6-\\ud83c\\uddff]){2}|" + 85 | "[\\ud800\\udc00-\\uDBFF\\uDFFF]|[\\u2600-\\u26FF])[\\ufe0e\\ufe0f]?(?:[\\u0300-\\u036f\\ufe20-\\ufe23\\u20d0-\\u20f0]|[\\ud83c\\udffb-\\ud83c\\udfff])?" + 86 | 87 | "(?:\\u200d(?:[^\\ud800-\\udfff]|" + 88 | 89 | "(?:[\\ud83c\\udde6-\\ud83c\\uddff]){2}|" + 90 | "[\\ud800\\udc00-\\uDBFF\\uDFFF]|[\\u2600-\\u26FF])[\\ufe0e\\ufe0f]?(?:[\\u0300-\\u036f\\ufe20-\\ufe23\\u20d0-\\u20f0]|[\\ud83c\\udffb-\\ud83c\\udfff])?)*|" + 91 | 92 | "[\\u0023-\\u0039]\\ufe0f?\\u20e3|\\u3299|\\u3297|\\u303d|\\u3030|\\u24c2|[\\ud83c\\udd70-\\ud83c\\udd71]|[\\ud83c\\udd7e-\\ud83c\\udd7f]|\\ud83c\\udd8e|[\\ud83c\\udd91-\\ud83c\\udd9a]|[\\ud83c\\udde6-\\ud83c\\uddff]|[\\ud83c\\ude01-\\ud83c\\ude02]|\\ud83c\\ude1a|\\ud83c\\ude2f|[\\ud83c\\ude32-\\ud83c\\ude3a]|[\\ud83c\\ude50-\\ud83c\\ude51]|\\u203c|\\u2049|[\\u25aa-\\u25ab]|\\u25b6|\\u25c0|[\\u25fb-\\u25fe]|\\u00a9|\\u00ae|\\u2122|\\u2139|\\ud83c\\udc04|[\\u2600-\\u26FF]|\\u2b05|\\u2b06|\\u2b07|\\u2b1b|\\u2b1c|\\u2b50|\\u2b55|\\u231a|\\u231b|\\u2328|\\u23cf|[\\u23e9-\\u23f3]|[\\u23f8-\\u23fa]|\\ud83c\\udccf|\\u2934|\\u2935|[\\u2190-\\u21ff]", 93 | Pattern.UNICODE_CASE | 94 | Pattern.CANON_EQ | 95 | Pattern.CASE_INSENSITIVE 96 | ); 97 | //Pattern unicodeOutliers = Pattern.compile("U+1F3FB–U+1F3FF"); 98 | 99 | Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); 100 | 101 | utf8tweet = unicodeOutlierMatcher.replaceAll(" 
"); 102 | return utf8tweet; 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /Dialect Stemming/output.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | سجل مع عجز ثل 4 | سجل مع عجز ثل 5 | شرع لعب ReignFal عمل 6 | 7 | شرع زال طير لعب 2D نوع غاز ومغر 8 | 9 | فد قم… 10 | شرع لعب ReignFal عمل 11 | 12 | شرع زال طير لعب 2D نوع غاز ومغر 13 | 14 | فد قم… 15 | ستعرض لعب ReignFal طير 16 | ستعرض لعب ReignFal طير 17 | ستعرض لعب ReignFal طير 18 | ستعرض لعب ReignFal طير 19 | ستعرض لعب ReignFal طير 20 | ستعرض لعب ReignFal طير 21 | ستعرض لعب ReignFal طير 22 | ستعرض لعب ReignFal طير 23 | ستعرض لعب ReignFal طير 24 | لقء تبع نجلإت سيد تسدا وسك، رئس جلس دار سكر ينكس 25 | 26 | عطنا بستر عبت بدأ عجب… 27 | It was a great honr meting and having him try our game An amzing day to remb 28 | شرع لعب ReignFal عمل 29 | 30 | شرع زال طير لعب 2D نوع غاز ومغر 31 | 32 | فد قم… 33 | شرع لعب ReignFal عمل 34 | 35 | شرع زال طير لعب 2D نوع غاز ومغر 36 | 37 | فد قم… 38 | It was a great honr meting and having him try our game An amzing day to remb 39 | ستعرض لعب ReignFal طير 40 | ستعرض لعب ReignFal طير 41 | ستعرض لعب ReignFal طير 42 | لقء تبع نجلإت سيد تسدا وسك، رئس جلس دار سكر ينكس 43 | 44 | عطنا بستر عبت بدأ عجب… 45 | 46 | Layl You are the best 47 | ارب ارزق 48 | فتشب كيد 49 | شيء ضرب خيل 50 | شاء ألف مبر وعقبل فوز نهئ لكم 51 | شاء ألف مبر وعقبل فوز نهئ لكم 52 | بار فيك ⁦ ⁩⁦ ⁩ 53 | مبر لك… 54 | 55 | مبر لستدي نقش ض… 56 | ومبر… 57 | مبر لك… 58 | مبر لك… 59 | مبر لك… 60 | مبر لك… 61 | مبر لك… 62 | مبر لك… 63 | مبر لك… 64 | مبر لك… 65 | 66 | مبر لستدي نقش ض… 67 | 68 | ومبر… 69 | 70 | مبر لستدي نقش ض… 71 | 72 | جمع فئزن 73 | ومبر… 74 | 75 | مبر لستدي نقش ض… 76 | 77 | جمع فئزن 78 | ومبر… 79 | 80 | مبر لستدي نقش ض… 81 | 82 | جمع فئزن 83 | ومبر… 84 | يس 85 | يس 86 | 87 | مبر 88 | مبر 89 | ستعرض لعب ReignFal طير 90 | ستعرض لعب ReignFal طير 91 | ستعرض لعب ReignFal طير 92 | بار فيك ⁦ ⁩ 93 | وا برو برو 94 | وا برو 
برو 95 | بار فيك مود ⁦ ⁩⁦ ⁩ 96 | مبر نتمى لكم زد قدم 97 | مبر نتمى لكم زد قدم 98 | بار فيك 99 | ذن 100 | Why You have an atrcive idea it just ned some polishng 101 | Yeah neds a lots of work But I think this stupid feling is becaus of the presu I fel right now o… 102 | برو وفق 103 | برو وفق 104 | بار فيك عبر ⁦ ⁩⁦ ⁩ 105 | Congratz 106 | Congratz 107 | Thanks 108 | مبر ستهل 109 | مبر ستهل 110 | بار فك، شكرا 111 | 112 | كلم حتج رجم 113 | 114 | 115 | 116 | 117 | ت شر شهر صبر 118 | احبل شكرا 119 | شاء 120 | 121 | ان سمع 122 | BTW 123 | شفت رسم KnowMe اخت، لمس جذب غرب ⁦ ⁩⁦ ⁩⁦ ⁩ 124 | 125 | فتح كمشن له علمي 126 | You goin arw 127 | lol you can just come and have fun you know 128 | 129 | I doubt it I dont know how to draw 130 | You goin arw 131 | You goin arw 132 | lol you can just come and have fun you know 133 | lol you can just come and have fun you know 134 | I doubt it I dont know how to draw 135 | عظم جركم، له رحم وغفر وسكن فسح جن 136 | Square Enix Sory we dint finsh your game Noctis 137 | Noctis Man this some bulshit Fuck yal I got my boy Hard on… 138 | شاء ، وفق مبر أرى 139 | شاء ، وفق مبر أرى 140 | نفس شيء 141 | نفس شيء 142 | نفس شيء 143 | نفس شيء 144 | بار فيك رم ⁦ ⁩ 145 | بار فيك ⁦ ⁦ 146 | حط ربط شي، برنمج هوء بشر 147 | خلص 148 | بث لا زال ستمر قر سعدي خلص 149 | مبر لستدي نقش ض… 150 | روى وصلن، وصلك؟ 151 | رسل لهم رى، أكد هم، كيد سقط سه 152 | وفق عز 153 | عين رى، تسر ذن 154 | تفق 155 | تفق 156 | لحظ وين ؟ 157 | قهر 158 | ودي عرف وش قل، يلا بحث عنه 159 | 160 | جمع فئزن 161 | ومبر… 162 | ثير وصل يمل ؟؟ 163 | اخر شيء وصل طلب بين وتر، فصل لقء لا لسه 164 | Wil do 165 | كيس شفت تو حقت 166 | وي ارب ⁦ ⁩⁦ ⁩ 167 | ذن 168 | كلم رئس سكر إنكس طرن يبد إعجب وتفجأ ستى عال وتنع منص ألعب طر حسب… 169 | دير وسك تسدا دهش عجب جد نوع لعب شاف 170 | وفقكم، شكر جهد ⁦ ⁩ 171 | ذن نك… 172 | مبر لستدي نقش ض… 173 | 174 | جمع فئزن 175 | ومبر… 176 | دير وسك تسدا دهش عجب جد نوع لعب شاف 177 | وفقكم، شكر جهد ⁦ ⁩ 178 | ذن نك… 179 | وفقكم، شكر جهد ⁦ ⁩ 180 | ذن نك… 181 | وفقكم، شكر جهد ⁦ ⁩ 182 | ذن 
نك… 183 | ستعرض لعب ReignFal طير 184 | ستعرض لعب ReignFal طير 185 | ستعرض لعب ReignFal طير 186 | ستعرض لعب ReignFal طير 187 | عف، ساس حصر شكل علم لعب ؟ وضع علم كبر مكن بس… 188 | عف، ساس حصر شكل علم لعب ؟ وضع علم كبر مكن بس… 189 | عف، ساس حصر شكل علم لعب ؟ وضع علم كبر مكن بس… 190 | مبر لستدي نقش ض… 191 | مبر أرى تخل قد ايش فرحن لكم ان خت وفق رب 192 | 193 | جمع فئزن 194 | مبر فئزن سبق نجلإت بمجل ألعبفدي لمث بدعن سعدي عرب أ تسدا وس… 195 | ومبر… 196 | بار فيك ⁦ ⁦ 197 | وفقكم، شكر جهد ⁦ ⁩ 198 | ذن نك… 199 | بار فيك ⁦ ⁩⁦ ⁩ 200 | سعد اربى، شكر لك ⁦ 201 | شءل برو ستهلن 202 | yes 203 | بار فيك 204 | بار فيك 205 | بار فيك بين 206 | بار فيك بين 207 | هذي نفع لما صير مال خلق اس مشن 208 | ستعرض لعب ReignFal طير 209 | لقء تبع نجلإت سيد تسدا وسك، رئس جلس دار سكر ينكس 210 | 211 | عطنا بستر عبت بدأ عجب… 212 | مبر أرى تخل قد ايش فرحن لكم ان خت وفق رب 213 | مبر وفق 214 | شاء 215 | 216 | 217 | ألف برو وبرك لأسمء 218 | 219 | بدعت 220 | 221 | 222 | يلا حذي هرت 223 | ور شغل جمد 224 | بار أرى ستهلو 225 | حمد ه، خبر سعد جدا ⁦ ⁩⁦ ⁩⁦ 226 | ختر لعبت فئزن 227 | 228 | بار فيك ⁦ ⁩⁦ ⁩ 229 | بار فيك ⁦ ⁩⁦ ⁩ ⁩ 230 | 231 | بار فيك ⁦ ⁩ 232 | بار فيك مود ⁦ ⁩⁦ ⁩ 233 | بار فيك 234 | ذن 235 | بار فيك هبه ⁦ ⁩⁦ ⁩⁦ ⁩ 236 | بار فيك عبر ⁦ ⁩⁦ ⁩ 237 | Thanks 238 | بار فك، شكرا 239 | 240 | I doubt it I dont know how to draw 241 | بار فيك رم ⁦ ⁩ 242 | بار فيك بين ⁦ ⁩ 243 | ستعرض لعب ReignFal طير 244 | بار فيك ⁦ ⁩⁦ ⁩ 245 | سعد اربى، شكر لك ⁦ 246 | آمن رب علمن 247 | مبر أرى تخل قد ايش فرحن لكم ان خت وفق رب 248 | عظم اجر له رحم وغفر ودخل فسح جن 249 | بار فيك ⁦ ⁩⁦ ⁩ 250 | سعد اربى، شكر لك ⁦ 251 | حط ربط شي، برنمج هوء بشر 252 | خلص 253 | بث لا زال ستمر قر سعدي خلص 254 | 255 | خلص حضر وطلع بسع كمل قلن؟ gt 256 | عين وفق ارب ف ثن 257 | ارب سهل 258 | لحظ وين ؟ 259 | قهر 260 | ودي عرف وش قل، يلا بحث عنه 261 | Layl You are the best 262 | 263 | ومبر… 264 | جا اخر 265 | يخلف شاء 266 | مين ارب، جزا خير ⁦ ⁩ 267 | لقئ قدم 268 | سجل 269 | 270 | طرلعب 271 | لقئ قدم 272 | سجل 273 | 274 | طرلعب 275 | 45 ≧ω≦ 276 | Me want 277 | رقك 
طلت ؟ 278 | يب مدي 279 | يب مدي 280 | عال خاص 281 | بتع سلحه؟ 282 | يمد طلب شيء نفس حق سيف نكتس ؟ 283 | 284 | بكم سيف نكتس ؟ 285 | Ok 286 | مبر وفق 287 | عال خاص 288 | بتع سلحه؟ 289 | يمد طلب شيء نفس حق سيف نكتس ؟ 290 | 291 | بكم سيف نكتس ؟ 292 | Ok 293 | 294 | نفس حال ضبط 295 | يضح اني بغى حدث 296 | حدث صرع مع سبع فات 297 | 298 | سبع بغى وتب صاح علم 299 | 300 | 301 | ولت وتب 302 | عور 303 | ترجى نت فتح قل 304 | 305 | 306 | سبعن كذا بغل طلع نيح وطب 307 | 308 | وحد شتك نت 309 | 310 | ضح بغى رد شغل 311 | 312 | لا 313 | مو فتح 314 | 315 | خلي جال طلع 316 | 317 | بغى كثر طلع 318 | شاء 319 | 320 | 321 | ألف برو وبرك لأسمء 322 | 323 | بدعت 324 | 325 | 326 | يلا حذي هرت 327 | ور شغل جمد 328 | بندل قه حش حل مكن سعد صمد 329 | 330 | وفق 331 | ا ودي حضر ، قعد فكر أسجل بعدن عتمدا جاز شرع 332 | ا ودي حضر ، قعد فكر أسجل بعدن عتمدا جاز شرع 333 | ا ودي حضر ، قعد فكر أسجل بعدن عتمدا جاز شرع 334 | كنسل وتب زمن عتمد درس تك كتبه، اقل خير بتطل فتح امل 335 | مو نت، خل جال فتح يون در ستدي سشل له نخلص 336 | صدق بغل نسه، وبم… 337 | بار فيك بين ⁦ ⁩ 338 | يب بس 339 | wan go Im not sure I can atend 340 | لقئ قدم 341 | سجل 342 | 343 | طرلعب 344 | wan go Im not sure I can atend 345 | wan go Im not sure I can atend 346 | helo Just wanted to ask do you take comisn D 347 | شءل برو ستهلن 348 | بار فيك هبه ⁦ ⁩⁦ ⁩⁦ ⁩ 349 | لستَ وحد دام دوم برفقت 350 | طلع طيب حين 351 | بار أرى ستهلو 352 | لا لا زين حقه ساع 353 | 354 | ، حق قل نوم 355 | أقل حقئ أنا لزم جرب وزبل ثن 356 | 357 | سلم قلب ، بشف بكر ا… 358 | You can do it I beliv in you Arwa 359 | صح شغل ، إني جلس أحل أركز علرض ماب فيد ، أندر ستدي طلع ركز كنت أش… 360 | بار فيك ⁦ ⁩⁦ ⁩ ⁩ 361 | لا ال VR زاد 362 | حل ساع 363 | فرض 364 | 365 | يلا سلمت ورح نام 366 | نخلي اخر شيء ين نس شوي 367 | 368 | ااخ لزم كذا جلس سنع شغل ونشرب قه 369 | ك شوش كيد 370 | جد ذكر علن ب٢٠١٥ 371 | خل خلص رم جزء 372 | لتنرفز ترى مكن تحس نفس ختم ظرف قول لك 373 | كيد مو شيف نفس غلطن وقت غلط اقش ١٠ دقئ ستظرف راس 374 | عظم أجر أحسن عزئ غفر لهم رحم سكن فسح جن 375 | مزح وتقل رشه، ولا علقت 
رسمي يع غير عقل شوي اشك سلب كان مزح بجد… 376 | ودي وري ما تحط لا ومفق جف 377 | ودي وري ما تحط لا ومفق جف 378 | ه درن وش الي ضحك، ىوى قبل رتن ولا ثل ه 379 | قول هاه نحسب يام الي سلم حق رور كرم ه 380 | لا شفت لعبن 381 | صور سلم ه 382 | حطت ساس لما شفت 383 | ه درن وش الي ضحك، ىوى قبل رتن ولا ثل ه 384 | قول هاه نحسب يام الي سلم حق رور كرم ه 385 | لا شفت لعبن 386 | صور سلم ه 387 | حطت ساس لما شفت 388 | and she rocks 389 | I hope you find what you sek 390 | Thanks Ill chek her works 391 | سمح لي كرك شكرا جزلا ⁦ ⁦ 392 | رسم شاء رئع ⁦ ⁩⁦ ⁩ 393 | and she rocks 394 | I hope you find what you sek 395 | Thanks Ill chek her works 396 | سمح لي كرك شكرا جزلا ⁦ ⁦ 397 | رسم شاء رئع ⁦ ⁩⁦ ⁩ 398 | كنسل طلع لج تغل 399 | تغل شي، بعد SteamVR قرر يشتغل ولزم حدث وبقعد قربا نت رهب 400 | I had zero confide dont know why 401 | حب 402 | عمل شكل عد 403 | نب نسمع عبر انم بزرن طيب 404 | ترى مو فئه وحد وطن فئه شب 405 | EscapeVR 406 | 407 | وجد عمل يون ساع ٣ شاء الي حاب يجي نسلف جرب في لعب فح ⁦ ⁩ 408 | I must prea presntaio and write a paer The dealin for them is tomrw 409 | لقء تبع نجلإت سيد تسدا وسك، رئس جلس دار سكر ينكس 410 | 411 | عطنا بستر عبت و… 412 | بتمنى شيء الي فتح كل شوف بدع طرن شاء عمل وحد ت… 413 | خلص لقء ومعا صدع حين لزم كمل تغل بر سلم 414 | 415 | ارب جز 416 | شيء حلو كمنربي ان جبن مؤد اص ⁦ ⁩ 417 | كمنربي رسمن بستقل comisn ؟ 418 | EscapeVR 419 | زل جرب ولي عملي يستنق، غير تح 420 | 421 | سبب قرب Nongamers وقعا ختلف ومن… 422 | -------------------------------------------------------------------------------- /Collecting Twitter Data/Python/UserTweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "import twitter\n", 15 | "\n", 16 | "def oauth_login():\n", 17 | " #This code is taken from 
https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition \n", 18 | " # XXX: Go to http://twitter.com/apps/new to create an app and get values\n", 19 | " # for these credentials that you'll need to provide in place of these\n", 20 | " # empty string values that are defined as placeholders.\n", 21 | " # See https://dev.twitter.com/docs/auth/oauth for more information \n", 22 | " # on Twitter's OAuth implementation.\n", 23 | " \n", 24 | " CONSUMER_KEY = ''\n", 25 | " CONSUMER_SECRET = ''\n", 26 | " OAUTH_TOKEN = ''\n", 27 | " OAUTH_TOKEN_SECRET = ''\n", 28 | " \n", 29 | " auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,\n", 30 | " CONSUMER_KEY, CONSUMER_SECRET)\n", 31 | " \n", 32 | " twitter_api = twitter.Twitter(auth=auth)\n", 33 | " return twitter_api\n", 34 | "\n", 35 | "def twitter_search(twitter_api, q, max_results, **kw):\n", 36 | "\n", 37 | " # See https://dev.twitter.com/docs/api/1.1/get/search/tweets and \n", 38 | " # https://dev.twitter.com/docs/using-search for details on advanced \n", 39 | " # search criteria that may be useful for keyword arguments\n", 40 | " \n", 41 | " # See https://dev.twitter.com/docs/api/1.1/get/search/tweets \n", 42 | " search_results = twitter_api.search.tweets(q=q, count=10000, **kw)\n", 43 | " \n", 44 | " statuses = search_results['statuses']\n", 45 | " print len(statuses)\n", 46 | " \n", 47 | " # Iterate through batches of results by following the cursor until we\n", 48 | " # reach the desired number of results, keeping in mind that OAuth users\n", 49 | " # can \"only\" make 180 search queries per 15-minute interval. See\n", 50 | " # https://dev.twitter.com/docs/rate-limiting/1.1/limits\n", 51 | " # for details. 
A reasonable number of results is ~1000, although\n", 52 | " # that number of results may not exist for all queries.\n", 53 | " \n", 54 | " # Enforce a reasonable limit\n", 55 | " max_results = 10000 #min(100000, max_results)\n", 56 | " \n", 57 | " for _ in range(10000): # 10*100 = 1000\n", 58 | " try:\n", 59 | " next_results = search_results['search_metadata']['next_results']\n", 60 | " except KeyError, e: # No more results when next_results doesn't exist\n", 61 | " # print len(statuses)\n", 62 | " break\n", 63 | " \n", 64 | " # Create a dictionary from next_results, which has the following form:\n", 65 | " # ?max_id=313519052523986943&q=NCAA&include_entities=1\n", 66 | " kwargs = dict([ kv.split('=') \n", 67 | " for kv in next_results[1:].split(\"&\") ])\n", 68 | " \n", 69 | " print len(statuses)\n", 70 | " search_results = twitter_api.search.tweets(q=q, count=10000, **kw)\n", 71 | " statuses += search_results['statuses']\n", 72 | " \n", 73 | " if len(statuses) > max_results: \n", 74 | " break\n", 75 | " print len(statuses)\n", 76 | " \n", 77 | " return statuses" 78 | ], 79 | "language": "python", 80 | "metadata": {}, 81 | "outputs": [], 82 | "prompt_number": 2 83 | }, 84 | { 85 | "cell_type": "code", 86 | "collapsed": false, 87 | "input": [ 88 | "import io, json\n", 89 | "\n", 90 | "def save_json( data):\n", 91 | " with io.open('WAS.json', \n", 92 | " 'w', encoding='utf-8') as f:\n", 93 | " f.write(unicode(json.dumps(data, ensure_ascii=False)))\n", 94 | "\n", 95 | "def load_json(filename):\n", 96 | " with io.open('WAS.json', \n", 97 | " encoding='utf-8') as f:\n", 98 | " return f.read()\n", 99 | "\n", 100 | "# Sample usage\n", 101 | "\n", 102 | "#q = '%23%D8%A7%D9%8A%D8%AC%D8%A7%D8%A8%D9%8A%D8%A9'\n", 103 | "#twitter_api = oauth_login()\n", 104 | "#results = twitter_search(twitter_api, q, max_results=10000)\n", 105 | "\n", 106 | "#save_json(q, results)\n", 107 | "#results = load_json(q)" 108 | ], 109 | "language": "python", 110 | "metadata": {}, 111 | "outputs": 
[], 112 | "prompt_number": 3 113 | }, 114 | { 115 | "cell_type": "code", 116 | "collapsed": false, 117 | "input": [ 118 | "import sys\n", 119 | "import time\n", 120 | "from urllib2 import URLError\n", 121 | "from httplib import BadStatusLine\n", 122 | "import json\n", 123 | "import twitter\n", 124 | "\n", 125 | "def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw): \n", 126 | " \n", 127 | " # A nested helper function that handles common HTTPErrors. Return an updated\n", 128 | " # value for wait_period if the problem is a 500 level error. Block until the\n", 129 | " # rate limit is reset if it's a rate limiting issue (429 error). Returns None\n", 130 | " # for 401 and 404 errors, which requires special handling by the caller.\n", 131 | " def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):\n", 132 | " \n", 133 | " if wait_period > 3600: # Seconds\n", 134 | " print >> sys.stderr, 'Too many retries. Quitting.'\n", 135 | " raise e\n", 136 | " \n", 137 | " # See https://dev.twitter.com/docs/error-codes-responses for common codes\n", 138 | " \n", 139 | " if e.e.code == 401:\n", 140 | " print >> sys.stderr, 'Encountered 401 Error (Not Authorized)'\n", 141 | " return None\n", 142 | " elif e.e.code == 404:\n", 143 | " print >> sys.stderr, 'Encountered 404 Error (Not Found)'\n", 144 | " return None\n", 145 | " elif e.e.code == 429: \n", 146 | " print >> sys.stderr, 'Encountered 429 Error (Rate Limit Exceeded)'\n", 147 | " if sleep_when_rate_limited:\n", 148 | " print >> sys.stderr, \"Retrying in 15 minutes...ZzZ...\"\n", 149 | " sys.stderr.flush()\n", 150 | " time.sleep(60*15 + 5)\n", 151 | " print >> sys.stderr, '...ZzZ...Awake now and trying again.'\n", 152 | " return 2\n", 153 | " else:\n", 154 | " raise e # Caller must handle the rate limiting issue\n", 155 | " elif e.e.code in (500, 502, 503, 504):\n", 156 | " print >> sys.stderr, 'Encountered %i Error. 
Retrying in %i seconds' % \\\n", 157 | " (e.e.code, wait_period)\n", 158 | " time.sleep(wait_period)\n", 159 | " wait_period *= 1.5\n", 160 | " return wait_period\n", 161 | " else:\n", 162 | " raise e\n", 163 | "\n", 164 | " # End of nested helper function\n", 165 | " \n", 166 | " wait_period = 2 \n", 167 | " error_count = 0 \n", 168 | "\n", 169 | " while True:\n", 170 | " try:\n", 171 | " return twitter_api_func(*args, **kw)\n", 172 | " except twitter.api.TwitterHTTPError, e:\n", 173 | " error_count = 0 \n", 174 | " wait_period = handle_twitter_http_error(e, wait_period)\n", 175 | " if wait_period is None:\n", 176 | " return\n", 177 | " except URLError, e:\n", 178 | " error_count += 1\n", 179 | " time.sleep(wait_period)\n", 180 | " wait_period *= 1.5\n", 181 | " print >> sys.stderr, \"URLError encountered. Continuing.\"\n", 182 | " if error_count > max_errors:\n", 183 | " print >> sys.stderr, \"Too many consecutive errors...bailing out.\"\n", 184 | " raise\n", 185 | " except BadStatusLine, e:\n", 186 | " error_count += 1\n", 187 | " time.sleep(wait_period)\n", 188 | " wait_period *= 1.5\n", 189 | " print >> sys.stderr, \"BadStatusLine encountered. 
Continuing.\"\n", 190 | " if error_count > max_errors:\n", 191 | " print >> sys.stderr, \"Too many consecutive errors...bailing out.\"\n", 192 | " raise\n", 193 | "\n", 194 | "def harvest_user_timeline(twitter_api, screen_name=None, user_id=None, max_results=3000):\n", 195 | " \n", 196 | " assert (screen_name != None) != (user_id != None), \\\n", 197 | " \"Must have screen_name or user_id, but not both\" \n", 198 | " \n", 199 | " kw = { # Keyword args for the Twitter API call\n", 200 | " 'count': 2000,\n", 201 | " 'trim_user': 'true',\n", 202 | " 'include_rts' : 'true',\n", 203 | " 'since_id' : 1\n", 204 | " }\n", 205 | " \n", 206 | " if screen_name:\n", 207 | " kw['screen_name'] = screen_name\n", 208 | " else:\n", 209 | " kw['user_id'] = user_id\n", 210 | " \n", 211 | " max_pages = 16\n", 212 | " results = []\n", 213 | " \n", 214 | " tweets = make_twitter_request(twitter_api.statuses.user_timeline, **kw)\n", 215 | " \n", 216 | " if tweets is None: # 401 (Not Authorized) - Need to bail out on loop entry\n", 217 | " tweets = []\n", 218 | " \n", 219 | " results += tweets\n", 220 | " \n", 221 | " print >> sys.stderr, 'Fetched %i tweets' % len(tweets)\n", 222 | " \n", 223 | " page_num = 1\n", 224 | " \n", 225 | " # Many Twitter accounts have fewer than 200 tweets so you don't want to enter\n", 226 | " # the loop and waste a precious request if max_results = 200.\n", 227 | " \n", 228 | " # Note: Analogous optimizations could be applied inside the loop to try and \n", 229 | " # save requests. e.g. Don't make a third request if you have 287 tweets out of \n", 230 | " # a possible 400 tweets after your second request. Twitter does do some \n", 231 | " # post-filtering on censored and deleted tweets out of batches of 'count', though,\n", 232 | " # so you can't strictly check for the number of results being 200. You might get\n", 233 | " # back 198, for example, and still have many more tweets to go. 
If you have the\n", 234 | " # total number of tweets for an account (by GET /users/lookup/), then you could \n", 235 | " # simply use this value as a guide.\n", 236 | " \n", 237 | " if max_results == kw['count']:\n", 238 | " page_num = max_pages # Prevent loop entry\n", 239 | " \n", 240 | " while page_num < max_pages and len(tweets) > 0 and len(results) < max_results:\n", 241 | " \n", 242 | " # Necessary for traversing the timeline in Twitter's v1.1 API:\n", 243 | " # get the next query's max-id parameter to pass in.\n", 244 | " # See https://dev.twitter.com/docs/working-with-timelines.\n", 245 | " kw['max_id'] = min([ tweet['id'] for tweet in tweets]) - 1 \n", 246 | " \n", 247 | " tweets = make_twitter_request(twitter_api.statuses.user_timeline, **kw)\n", 248 | " results += tweets\n", 249 | "\n", 250 | " print >> sys.stderr, 'Fetched %i tweets' % (len(tweets),)\n", 251 | " \n", 252 | " page_num += 1\n", 253 | " \n", 254 | " print >> sys.stderr, 'Done fetching tweets'\n", 255 | "\n", 256 | " return results[:max_results]\n", 257 | " \n", 258 | "# Sample usage\n", 259 | "\n", 260 | "twitter_api = oauth_login()\n", 261 | "#tweets = harvest_user_timeline(twitter_api, user_id=\"@spagov\", max_results=3000)\n", 262 | "tweets = harvest_user_timeline(twitter_api,screen_name=\"spagov\", max_results=3000)\n", 263 | "save_json(tweets)\n", 264 | "# Save to MongoDB with save_to_mongo or a local file with save_json..." 
265 | ], 266 | "language": "python", 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "output_type": "stream", 271 | "stream": "stderr", 272 | "text": [ 273 | "Fetched 200 tweets\n", 274 | "Fetched 200 tweets" 275 | ] 276 | }, 277 | { 278 | "output_type": "stream", 279 | "stream": "stderr", 280 | "text": [ 281 | "\n", 282 | "Fetched 200 tweets" 283 | ] 284 | }, 285 | { 286 | "output_type": "stream", 287 | "stream": "stderr", 288 | "text": [ 289 | "\n", 290 | "Fetched 200 tweets" 291 | ] 292 | }, 293 | { 294 | "output_type": "stream", 295 | "stream": "stderr", 296 | "text": [ 297 | "\n", 298 | "Fetched 200 tweets" 299 | ] 300 | }, 301 | { 302 | "output_type": "stream", 303 | "stream": "stderr", 304 | "text": [ 305 | "\n", 306 | "Fetched 200 tweets" 307 | ] 308 | }, 309 | { 310 | "output_type": "stream", 311 | "stream": "stderr", 312 | "text": [ 313 | "\n", 314 | "Fetched 200 tweets" 315 | ] 316 | }, 317 | { 318 | "output_type": "stream", 319 | "stream": "stderr", 320 | "text": [ 321 | "\n", 322 | "Fetched 200 tweets" 323 | ] 324 | }, 325 | { 326 | "output_type": "stream", 327 | "stream": "stderr", 328 | "text": [ 329 | "\n", 330 | "Fetched 200 tweets" 331 | ] 332 | }, 333 | { 334 | "output_type": "stream", 335 | "stream": "stderr", 336 | "text": [ 337 | "\n", 338 | "Fetched 200 tweets" 339 | ] 340 | }, 341 | { 342 | "output_type": "stream", 343 | "stream": "stderr", 344 | "text": [ 345 | "\n", 346 | "Fetched 200 tweets" 347 | ] 348 | }, 349 | { 350 | "output_type": "stream", 351 | "stream": "stderr", 352 | "text": [ 353 | "\n", 354 | "Fetched 200 tweets" 355 | ] 356 | }, 357 | { 358 | "output_type": "stream", 359 | "stream": "stderr", 360 | "text": [ 361 | "\n", 362 | "Fetched 200 tweets" 363 | ] 364 | }, 365 | { 366 | "output_type": "stream", 367 | "stream": "stderr", 368 | "text": [ 369 | "\n", 370 | "Fetched 200 tweets" 371 | ] 372 | }, 373 | { 374 | "output_type": "stream", 375 | "stream": "stderr", 376 | "text": [ 377 | "\n", 378 | "Fetched 200 tweets" 
379 | ] 380 | }, 381 | { 382 | "output_type": "stream", 383 | "stream": "stderr", 384 | "text": [ 385 | "\n", 386 | "Done fetching tweets\n" 387 | ] 388 | } 389 | ], 390 | "prompt_number": 4 391 | }, 392 | { 393 | "cell_type": "code", 394 | "collapsed": false, 395 | "input": [], 396 | "language": "python", 397 | "metadata": {}, 398 | "outputs": [], 399 | "prompt_number": 3 400 | }, 401 | { 402 | "cell_type": "code", 403 | "collapsed": false, 404 | "input": [], 405 | "language": "python", 406 | "metadata": {}, 407 | "outputs": [] 408 | } 409 | ], 410 | "metadata": {} 411 | } 412 | ] 413 | } -------------------------------------------------------------------------------- /Dialect Stemming/src/stemming/Stemmer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Eman Alnkhilan- java code is based on rule based algorithm for Gulf dialect 3 | * 4 | */ 5 | 6 | 7 | package stemming; 8 | 9 | import java.io.BufferedWriter; 10 | import java.io.File; 11 | import java.io.FileOutputStream; 12 | import java.io.FileWriter; 13 | import java.io.IOException; 14 | import java.io.OutputStreamWriter; 15 | import java.nio.file.Path; 16 | import java.nio.file.Paths; 17 | import java.util.Arrays; 18 | import java.util.Formatter; 19 | import java.util.FormatterClosedException; 20 | import java.util.LinkedHashSet; 21 | import java.util.List; 22 | import java.util.NoSuchElementException; 23 | import java.util.Scanner; 24 | import java.util.Set; 25 | 26 | public class Stemmer { 27 | 28 | public static void main(String[] args) { 29 | // TODO Auto-generated method stub 30 | 31 | //String fin = "files/WLF.txt"; 32 | 33 | 34 | Scanner input; 35 | Formatter output; 36 | //String fout = "files/out.txt"; 37 | // File newFile = new File("files", "out.txt"); 38 | // BufferedWriter output = null; 39 | try{ 40 | // newFile.createNewFile(); 41 | 42 | //writer.write("Hello world!"); 43 | Path p=Paths.get("output.txt").toAbsolutePath(); 44 | // FileWriter out 
=new FileWriter(p.toString()); 45 | // out.write("Hello world!"); 46 | output = new Formatter(p.toString(), "utf-8"); 47 | input = new Scanner(Paths.get("WLF"),"utf-8"); 48 | 49 | //naw= non Arabic words 50 | List naw=Arrays.asList("بشت", "تجوري", "دلاغ" ,"ابجور" ,"أبجور", "دريول", "فريزر" , "بصم" ,"بلكون ", "بخت", "تليفون" , "تلفزيون" ,"خوش" , "روج" , "ساندويش", "سندويش" );// non Arabic words without prefix & suffix 51 | List stopwordsList = Arrays.asList("ام","أم","بدون" ,"عشرة", "على", "مقابل" ,"حاليا" ,"التى" ,"حوالى" ,"الماضي", 52 | "انا", "ب", "عند" ,"اذا", "هذا", "امس" ,"ايام", "انها", 53 | "انها", "مافيه", "لان" ,"عندما" ,"احد" ,"هاذا", "السابق" ,"خلال","جميع", 54 | "شي" ,"مافي", "اثر" ,"عليه", "برس" ,"بن", "التي" ,"الذين" ,"الوقت", 55 | "ع", "انا" ,"ا" ,"عاما", "بشكل", "الا" ,"الان" ,"اول", 56 | "له" ,"بكرا", "لانهم", "عن", "حتى", "كم", "اما", "امام" ,"ضمن", 57 | "لها" ,"واذا", "انها" ,"عام", "اعلنت", "وغير" ,"اكد", " الذي" ,"انه", 58 | "لو" ,"ي","ا", "عليها", "باسم" ,"به","اكثر", "الاول", "المقبل", 59 | "اللي" ,"لمن" ,"لماذا", "سنوات", "صباح", "ان" ,"ثلاثة" ,"ذلك", "و", 60 | "ليش" ,"لكل", "مافي", "سنة" ,"شخصا" , "اف" ,"ايضا" ,"بين" ,"الى", 61 | "مثل" ,"لهاذا" , "عشر" ,"تم" ,"اطار" ,"او" ,"الذاتي" ,"دون", "كان","كن", 62 | "مره" ,"راح" ,"عدم", "اعادة" ,"اجل" ,"حيث", "الذى", 63 | "يا" ,"انك" ,"عدة" ,"بعد" ,"اخرى" ,"بها" ,"الثاني" ,"حين", "قوة", 64 | "يقول" ,"ان" , "عدد" ,"ف", "اربعة" ,"اي", "الاخيرة" ,"حول" ,"كما", 65 | "يقولون" ,"بل", "فان" ,"كان" ,"هذه" ,"كانت" ,"في", "من", "لها", 66 | "مع", "واحد" ,"قبل" ,"لدى" ,"وان" ,"واوضح" ,"كل" ,"هو", "يكون", 67 | "مساء" ,"واضاف" ,"قال", "نحو" ,"واكد", "مايو" ,"له", "هي" ,"وقالت", 68 | "هذا" ,"واضافت" ,"وكانت", "وقف" ,"فيها" ,"يمكن" ,"مليار" ,"لوكالة", 69 | "منذ" ,"هناك" ,"فيه", "ومن" , "منها" ,"مليون" ,"لكن" ,"وفي" , 70 | "وقد" , "وقال" ,"كلم", "وهو" ,"وهي", "يوم" ,"نهاية" ,"لقاء" ,"ومع","مع", 71 | "نفسه" , "وكان" ,"زيارة" , "ثم", "ايار" ,"الاولى" ,"اليوم" ,"هل","اين","متى","أين","تحت","فوق","بين","بينما","فبينما", 
72 | "لهذا" ,"بكره" ,"بعض" ,"بان" ,"صفر" ,"الثانية","الف","ما","يا","يومياً","كل","إذا","علي","عليك","عليهم","ماذا","قط", 73 | "عليكم","فجرا","فجر","عشان","علشان","فقط","بإذن","بس","عصر","عصرا","ظهر","ظهرا","الجمعه","الخميس","الاربعاء","الأربعاء","الثلاثاء","الأثنين","الاثنين","الاحد","الأحد","السبت"); 74 | List vowels= Arrays.asList ('ا','ي','و'); 75 | List nostemming=Arrays.asList("اللهٍ","اللهَ","اللهُ","الله"); // words must not to be stemmed 76 | int lettercount=0; 77 | String word=null; 78 | while (input.hasNextLine()) { 79 | String line = input.nextLine(); 80 | if(!line.isEmpty()){ 81 | List tokens = Arrays.asList(line.split("\\s")); 82 | // if word is <=3 then it is stemmed 83 | for(int i=0; i3 && !nostemming.contains(word)){ 99 | // to remove prefix 100 | if(word.startsWith("ولل")){ 101 | String pre="ولل"; 102 | word=word.substring ( pre.length ( ) ); 103 | } 104 | else if(word.startsWith("لل")){ 105 | String pre="لل"; 106 | word=word.substring ( pre.length ( ) ); 107 | } 108 | else if(word.startsWith("وال")){ 109 | String pre="وال"; 110 | word=word.substring ( pre.length ( ) ); 111 | } 112 | else if(word.startsWith("بال")){ 113 | String pre="بال"; 114 | word=word.substring ( pre.length ( ) ); 115 | } 116 | else if(word.startsWith("ال")){ 117 | String pre="ال"; 118 | word=word.substring ( pre.length ( ) ); 119 | } 120 | else if( word.startsWith("وش")){ 121 | String pre="وش"; 122 | word=word.substring ( pre.length ( ) ); 123 | } 124 | else if(word.startsWith("اش")){ 125 | String pre="اش"; 126 | word=word.substring ( pre.length ( ) ); 127 | } 128 | 129 | 130 | if(word.length()<=3){ 131 | // System.out.println(word+" "+"هو الجذر"); 132 | output.format( word+" "); } 133 | // to remove suffix 134 | else if(word.length()>3 ){ 135 | if(word.endsWith( "الكم")){ 136 | String suf="الكم"; 137 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 138 | } 139 | else if(word.endsWith( "ونهم")){ 140 | String suf="ونهم"; 141 | word=word.substring ( 0, word.length 
( ) - suf.length ( ) ); 142 | } 143 | else if(word.endsWith( "تني")){ 144 | String suf="تني"; 145 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 146 | } 147 | else if(word.endsWith( "ونه")){ 148 | String suf="ونه"; 149 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 150 | } 151 | else if(word.endsWith( "الك")){ 152 | String suf="الك"; 153 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 154 | } 155 | else if(word.endsWith( "تهم")){ 156 | String suf="تهم"; 157 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 158 | } 159 | else if(word.endsWith( "تهن")){ 160 | String suf="تهن"; 161 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 162 | } 163 | else if(word.endsWith("ات")){ 164 | String suf="ات"; 165 | //word=word.substring ( suf.length ( ) ); 166 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 167 | } 168 | else if(word.endsWith("ون")){ 169 | String suf="ون"; 170 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 171 | } 172 | 173 | else if(word.endsWith("كم")){ 174 | String suf="كم"; 175 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 176 | } 177 | else if(word.endsWith("وك")){ 178 | String suf="وك"; 179 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 180 | } 181 | 182 | else if(word.endsWith( "ته")){ 183 | String suf="ته"; 184 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 185 | } 186 | else if(word.endsWith( "هم")){ 187 | String suf="هم"; 188 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 189 | } 190 | 191 | else if(word.endsWith( "تي")){ 192 | String suf="تي"; 193 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 194 | } 195 | else if(word.endsWith( "ني")){ 196 | String suf="ني"; 197 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 198 | } 199 | else if(word.endsWith( "وا")){ 200 | String suf="وا"; 201 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 202 | } 203 | 
else if(word.endsWith( "ها")){ 204 | String suf="ها"; 205 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 206 | } 207 | else if(word.endsWith( "نا")){ 208 | String suf="نا"; 209 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 210 | } 211 | else if(word.endsWith("ك")){ 212 | String suf="ك"; 213 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 214 | } 215 | else if(word.endsWith( "ه")){ 216 | String suf="ه"; 217 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 218 | } 219 | else if(word.endsWith( "ة")){ 220 | String suf="ة"; 221 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 222 | } 223 | 224 | else if(word.endsWith( "ي")){ 225 | String suf="ي"; 226 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 227 | } 228 | else if(word.endsWith( "و")){ 229 | String suf="و"; 230 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 231 | } 232 | else if(word.endsWith( "ت")){ 233 | String suf="ت"; 234 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 235 | } 236 | 237 | else if(word.endsWith( "اً")){ 238 | String suf="اً"; 239 | word=word.substring ( 0, word.length ( ) - suf.length ( ) ); 240 | } 241 | if (word.length()<=3){ 242 | 243 | if(naw.contains(word)){// to check the word is Arabic or not 244 | System.out.println(word+" "+"ليست من أصل عربي"); 245 | } 246 | else{ 247 | // /System.out.println(word+" "+"هو الجذر"); 248 | if(vowels.contains(word.charAt(word.length()-1))){ 249 | word = new StringBuilder(word).deleteCharAt(word.length()-1).toString();} 250 | output.format( word+" "); 251 | } 252 | } 253 | if(word.length()>3 ){ 254 | if(naw.contains(word)){// to check the word is Arabic or not 255 | System.out.println(word+" "+"ليست من أصل عربي"); 256 | } 257 | if(!naw.contains(word)) { 258 | if(word.startsWith("با")){ 259 | String pre="با"; 260 | word=word.substring ( pre.length ( ) ); 261 | } 262 | else if(word.startsWith("وا")){ 263 | String pre="وا"; 264 | 
word=word.substring ( pre.length ( ) ); 265 | } 266 | 267 | else if(word.startsWith("مت")){ 268 | String pre="مت"; 269 | word=word.substring ( pre.length ( ) ); 270 | } 271 | else if(word.startsWith("ان")){ 272 | String pre="ان"; 273 | word=word.substring ( pre.length ( ) ); 274 | } 275 | 276 | else if(word.startsWith("ما")){ 277 | String pre="ما"; 278 | word=word.substring ( pre.length ( ) ); 279 | } 280 | else if(word.startsWith("من")){ 281 | String pre="من"; 282 | word=word.substring ( pre.length ( ) ); 283 | } 284 | else if(word.startsWith("يت")){ 285 | String pre="يت"; 286 | word=word.substring ( pre.length ( ) ); 287 | } 288 | else if(word.startsWith("ا")){ 289 | String pre="ا"; 290 | word=word.substring ( pre.length ( ) ); 291 | } 292 | 293 | else if(word.startsWith("ت")){ 294 | String pre="ت"; 295 | word=word.substring ( pre.length ( ) ); 296 | } 297 | else if(word.startsWith("ي")){ 298 | String pre="ي"; 299 | word=word.substring ( pre.length ( ) ); 300 | } 301 | else if(word.startsWith("م")){ 302 | String pre="م"; 303 | word=word.substring ( pre.length ( ) ); 304 | } 305 | 306 | if (word.length()<=3){ 307 | // System.out.println(word+" "+"هو الجذر"); 308 | output.format( word+" "); 309 | } 310 | if(word.length()>3 ){ 311 | 312 | char[] ch=word.toCharArray(); 313 | 314 | if(ch[0]=='م'&& ch[2]=='ت'){ 315 | 316 | word = new StringBuilder(word).deleteCharAt(0).toString(); 317 | word = new StringBuilder(word).deleteCharAt(1).toString(); 318 | } 319 | else if(ch[0]=='ا'&& ch[2]=='ت'){ 320 | word = new StringBuilder(word).deleteCharAt(0).toString(); 321 | word = new StringBuilder(word).deleteCharAt(1).toString(); 322 | } 323 | else if(ch[0]=='ن'&& ch[2]=='ت'){ 324 | word = new StringBuilder(word).deleteCharAt(0).toString(); 325 | word = new StringBuilder(word).deleteCharAt(1).toString(); } 326 | else if(ch[0]=='ت'&& ch[2]=='ت'){ 327 | word = new StringBuilder(word).deleteCharAt(0).toString(); 328 | word = new StringBuilder(word).deleteCharAt(1).toString(); 329 | 
} 330 | 331 | if (word.length()<=3){ 332 | // System.out.println(word+" "+"هو الجذر"); 333 | output.format( word+" "); 334 | } 335 | // System.out.println(word); 336 | if(word.length()>3){ 337 | 338 | /* 339 | to remove vowels from a word, If we have a vowel with non-vowels neighbored then 340 | deletes it. 341 | If we have two consecutive vowel letters then we have to delete one of them according to the following 342 | order (ا then و and then ي) 343 | 344 | */ 345 | for(int j=1; j set=new LinkedHashSet(); 391 | for(char c:word.toCharArray()) 392 | { 393 | set.add(Character.valueOf(c)); 394 | } 395 | StringBuilder sb = new StringBuilder(); 396 | for (Character character : set) { 397 | sb.append(character); 398 | } 399 | word=sb.toString(); 400 | //System.out.println(word); 401 | output.format(word+" "); 402 | 403 | 404 | } 405 | } 406 | } 407 | } 408 | } 409 | } 410 | 411 | } 412 | 413 | } 414 | } 415 | output.format( "\n"); 416 | } 417 | input.close(); 418 | output.close(); 419 | } 420 | catch (IOException ioException){ 421 | System.err.println(ioException.getMessage()); 422 | System.exit(-1); 423 | } catch(NoSuchElementException excp) { 424 | System.err.println(excp.getMessage()); 425 | } catch (FormatterClosedException excp){ 426 | System.err.println(excp.getMessage()); 427 | } catch (IllegalStateException excp){ 428 | System.err.println(excp.getMessage()); 429 | } 430 | 431 | } 432 | 433 | } 434 | -------------------------------------------------------------------------------- /Collecting Twitter Data/Python/SearchWord.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "\n", 15 | "import sys\n", 16 | "import datetime\n", 17 | "import time\n", 18 | "import twitter\n", 19 | "\n", 20 | "def oauth_login():\n", 21 | " 
#This code is taken from https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition \n", 22 | " # XXX: Go to http://twitter.com/apps/new to create an app and get values\n", 23 | " # for these credentials that you'll need to provide in place of these\n", 24 | " # empty string values that are defined as placeholders.\n", 25 | " # See https://dev.twitter.com/docs/auth/oauth for more information \n", 26 | " # on Twitter's OAuth implementation.\n", 27 | " \n", 28 | " CONSUMER_KEY = ''\n", 29 | " CONSUMER_SECRET = ''\n", 30 | " OAUTH_TOKEN = ''\n", 31 | " OAUTH_TOKEN_SECRET = ''\n", 32 | " \n", 33 | " auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,\n", 34 | " CONSUMER_KEY, CONSUMER_SECRET)\n", 35 | " \n", 36 | " twitter_api = twitter.Twitter(auth=auth)\n", 37 | " return twitter_api\n", 38 | "\n", 39 | "def twitter_search(twitter_api, q, max_results, **kw):\n", 40 | "\n", 41 | " # See https://dev.twitter.com/docs/api/1.1/get/search/tweets and \n", 42 | " # https://dev.twitter.com/docs/using-search for details on advanced \n", 43 | " # search criteria that may be useful for keyword arguments\n", 44 | " \n", 45 | " # See https://dev.twitter.com/docs/api/1.1/get/search/tweets \n", 46 | " search_results = twitter_api.search.tweets(q=q, count=10000, **kw)\n", 47 | " \n", 48 | " statuses = search_results['statuses']\n", 49 | " print len(statuses)\n", 50 | " \n", 51 | " # Iterate through batches of results by following the cursor until we\n", 52 | " # reach the desired number of results, keeping in mind that OAuth users\n", 53 | " # can \"only\" make 180 search queries per 15-minute interval. See\n", 54 | " # https://dev.twitter.com/docs/rate-limiting/1.1/limits\n", 55 | " # for details. 
A reasonable number of results is ~1000, although\n", 56 | " # that number of results may not exist for all queries.\n", 57 | " \n", 58 | " # Enforce a reasonable limit\n", 59 | " max_results = 10000 #min(100000, max_results)\n", 60 | " \n", 61 | " for _ in range(10000): # 10*100 = 1000\n", 62 | " try:\n", 63 | " next_results = search_results['search_metadata']['next_results']\n", 64 | " except KeyError, e: # No more results when next_results doesn't exist\n", 65 | " print len(statuses)\n", 66 | " break\n", 67 | " \n", 68 | " # Create a dictionary from next_results, which has the following form:\n", 69 | " # ?max_id=313519052523986943&q=NCAA&include_entities=1\n", 70 | " kwargs = dict([ kv.split('=') \n", 71 | " for kv in next_results[1:].split(\"&\") ])\n", 72 | " \n", 73 | " print len(statuses)\n", 74 | " search_results = twitter_api.search.tweets(**kwargs)\n", 75 | " statuses += search_results['statuses']\n", 76 | " \n", 77 | " if len(statuses) > max_results: \n", 78 | " break\n", 79 | " print len(statuses)\n", 80 | " \n", 81 | " return statuses\n", 82 | "\n" 83 | ], 84 | "language": "python", 85 | "metadata": {}, 86 | "outputs": [], 87 | "prompt_number": 6 88 | }, 89 | { 90 | "cell_type": "code", 91 | "collapsed": false, 92 | "input": [ 93 | "import io, json\n", 94 | "\n", 95 | "def save_json(filename, data):\n", 96 | " with io.open(filename, \n", 97 | " 'w', encoding='utf-8') as f:\n", 98 | " f.write(unicode(json.dumps(data, ensure_ascii=False)))\n", 99 | "\n", 100 | "def load_json(filename):\n", 101 | " with io.open(filename, \n", 102 | " encoding='utf-8') as f:\n", 103 | " return f.read()\n", 104 | "\n", 105 | "# Sample usage\n", 106 | "\n", 107 | "q = '%D8%A7%D9%84%D8%AA%D8%B9%D9%84%D9%8A%D9%85'\n", 108 | "twitter_api = oauth_login()\n", 109 | "results = twitter_search(twitter_api, q, max_results=10000)\n", 110 | "\n", 111 | "\n", 112 | "save_json('Education.json', results)\n" 113 | ], 114 | "language": "python", 115 | "metadata": {}, 116 | "outputs": [ 117 
| { 118 | "output_type": "stream", 119 | "stream": "stdout", 120 | "text": [ 121 | "100\n", 122 | "100\n", 123 | "187" 124 | ] 125 | }, 126 | { 127 | "output_type": "stream", 128 | "stream": "stdout", 129 | "text": [ 130 | "\n", 131 | "187\n" 132 | ] 133 | } 134 | ], 135 | "prompt_number": 5 136 | }, 137 | { 138 | "cell_type": "code", 139 | "collapsed": false, 140 | "input": [ 141 | " " 142 | ], 143 | "language": "python", 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "output_type": "stream", 148 | "stream": "stdout", 149 | "text": [ 150 | "100\n", 151 | "100\n", 152 | "200" 153 | ] 154 | }, 155 | { 156 | "output_type": "stream", 157 | "stream": "stdout", 158 | "text": [ 159 | "\n", 160 | "300" 161 | ] 162 | }, 163 | { 164 | "output_type": "stream", 165 | "stream": "stdout", 166 | "text": [ 167 | "\n", 168 | "400" 169 | ] 170 | }, 171 | { 172 | "output_type": "stream", 173 | "stream": "stdout", 174 | "text": [ 175 | "\n", 176 | "500" 177 | ] 178 | }, 179 | { 180 | "output_type": "stream", 181 | "stream": "stdout", 182 | "text": [ 183 | "\n", 184 | "600" 185 | ] 186 | }, 187 | { 188 | "output_type": "stream", 189 | "stream": "stdout", 190 | "text": [ 191 | "\n", 192 | "700" 193 | ] 194 | }, 195 | { 196 | "output_type": "stream", 197 | "stream": "stdout", 198 | "text": [ 199 | "\n", 200 | "800" 201 | ] 202 | }, 203 | { 204 | "output_type": "stream", 205 | "stream": "stdout", 206 | "text": [ 207 | "\n", 208 | "900" 209 | ] 210 | }, 211 | { 212 | "output_type": "stream", 213 | "stream": "stdout", 214 | "text": [ 215 | "\n", 216 | "1000" 217 | ] 218 | }, 219 | { 220 | "output_type": "stream", 221 | "stream": "stdout", 222 | "text": [ 223 | "\n", 224 | "1100" 225 | ] 226 | }, 227 | { 228 | "output_type": "stream", 229 | "stream": "stdout", 230 | "text": [ 231 | "\n", 232 | "1200" 233 | ] 234 | }, 235 | { 236 | "output_type": "stream", 237 | "stream": "stdout", 238 | "text": [ 239 | "\n", 240 | "1300" 241 | ] 242 | }, 243 | { 244 | "output_type": "stream", 245 | 
"stream": "stdout", 246 | "text": [ 247 | "\n", 248 | "1400" 249 | ] 250 | }, 251 | { 252 | "output_type": "stream", 253 | "stream": "stdout", 254 | "text": [ 255 | "\n", 256 | "1500" 257 | ] 258 | }, 259 | { 260 | "output_type": "stream", 261 | "stream": "stdout", 262 | "text": [ 263 | "\n", 264 | "1600" 265 | ] 266 | }, 267 | { 268 | "output_type": "stream", 269 | "stream": "stdout", 270 | "text": [ 271 | "\n", 272 | "1700" 273 | ] 274 | }, 275 | { 276 | "output_type": "stream", 277 | "stream": "stdout", 278 | "text": [ 279 | "\n", 280 | "1800" 281 | ] 282 | }, 283 | { 284 | "output_type": "stream", 285 | "stream": "stdout", 286 | "text": [ 287 | "\n", 288 | "1900" 289 | ] 290 | }, 291 | { 292 | "output_type": "stream", 293 | "stream": "stdout", 294 | "text": [ 295 | "\n", 296 | "2000" 297 | ] 298 | }, 299 | { 300 | "output_type": "stream", 301 | "stream": "stdout", 302 | "text": [ 303 | "\n", 304 | "2100" 305 | ] 306 | }, 307 | { 308 | "output_type": "stream", 309 | "stream": "stdout", 310 | "text": [ 311 | "\n", 312 | "2200" 313 | ] 314 | }, 315 | { 316 | "output_type": "stream", 317 | "stream": "stdout", 318 | "text": [ 319 | "\n", 320 | "2300" 321 | ] 322 | }, 323 | { 324 | "output_type": "stream", 325 | "stream": "stdout", 326 | "text": [ 327 | "\n", 328 | "2400" 329 | ] 330 | }, 331 | { 332 | "output_type": "stream", 333 | "stream": "stdout", 334 | "text": [ 335 | "\n", 336 | "2500" 337 | ] 338 | }, 339 | { 340 | "output_type": "stream", 341 | "stream": "stdout", 342 | "text": [ 343 | "\n", 344 | "2600" 345 | ] 346 | }, 347 | { 348 | "output_type": "stream", 349 | "stream": "stdout", 350 | "text": [ 351 | "\n", 352 | "2700" 353 | ] 354 | }, 355 | { 356 | "output_type": "stream", 357 | "stream": "stdout", 358 | "text": [ 359 | "\n", 360 | "2800" 361 | ] 362 | }, 363 | { 364 | "output_type": "stream", 365 | "stream": "stdout", 366 | "text": [ 367 | "\n", 368 | "2900" 369 | ] 370 | }, 371 | { 372 | "output_type": "stream", 373 | "stream": "stdout", 374 | 
"text": [ 375 | "\n", 376 | "3000" 377 | ] 378 | }, 379 | { 380 | "output_type": "stream", 381 | "stream": "stdout", 382 | "text": [ 383 | "\n", 384 | "3100" 385 | ] 386 | }, 387 | { 388 | "output_type": "stream", 389 | "stream": "stdout", 390 | "text": [ 391 | "\n", 392 | "3200" 393 | ] 394 | }, 395 | { 396 | "output_type": "stream", 397 | "stream": "stdout", 398 | "text": [ 399 | "\n", 400 | "3300" 401 | ] 402 | }, 403 | { 404 | "output_type": "stream", 405 | "stream": "stdout", 406 | "text": [ 407 | "\n", 408 | "3400" 409 | ] 410 | }, 411 | { 412 | "output_type": "stream", 413 | "stream": "stdout", 414 | "text": [ 415 | "\n", 416 | "3500" 417 | ] 418 | }, 419 | { 420 | "output_type": "stream", 421 | "stream": "stdout", 422 | "text": [ 423 | "\n", 424 | "3600" 425 | ] 426 | }, 427 | { 428 | "output_type": "stream", 429 | "stream": "stdout", 430 | "text": [ 431 | "\n", 432 | "3700" 433 | ] 434 | }, 435 | { 436 | "output_type": "stream", 437 | "stream": "stdout", 438 | "text": [ 439 | "\n", 440 | "3800" 441 | ] 442 | }, 443 | { 444 | "output_type": "stream", 445 | "stream": "stdout", 446 | "text": [ 447 | "\n", 448 | "3900" 449 | ] 450 | }, 451 | { 452 | "output_type": "stream", 453 | "stream": "stdout", 454 | "text": [ 455 | "\n", 456 | "4000" 457 | ] 458 | }, 459 | { 460 | "output_type": "stream", 461 | "stream": "stdout", 462 | "text": [ 463 | "\n", 464 | "4100" 465 | ] 466 | }, 467 | { 468 | "output_type": "stream", 469 | "stream": "stdout", 470 | "text": [ 471 | "\n", 472 | "4200" 473 | ] 474 | }, 475 | { 476 | "output_type": "stream", 477 | "stream": "stdout", 478 | "text": [ 479 | "\n", 480 | "4300" 481 | ] 482 | }, 483 | { 484 | "output_type": "stream", 485 | "stream": "stdout", 486 | "text": [ 487 | "\n", 488 | "4400" 489 | ] 490 | }, 491 | { 492 | "output_type": "stream", 493 | "stream": "stdout", 494 | "text": [ 495 | "\n", 496 | "4500" 497 | ] 498 | }, 499 | { 500 | "output_type": "stream", 501 | "stream": "stdout", 502 | "text": [ 503 | "\n", 504 | 
"4600" 505 | ] 506 | }, 507 | { 508 | "output_type": "stream", 509 | "stream": "stdout", 510 | "text": [ 511 | "\n", 512 | "4700" 513 | ] 514 | }, 515 | { 516 | "output_type": "stream", 517 | "stream": "stdout", 518 | "text": [ 519 | "\n", 520 | "4800" 521 | ] 522 | }, 523 | { 524 | "output_type": "stream", 525 | "stream": "stdout", 526 | "text": [ 527 | "\n", 528 | "4900" 529 | ] 530 | }, 531 | { 532 | "output_type": "stream", 533 | "stream": "stdout", 534 | "text": [ 535 | "\n", 536 | "5000" 537 | ] 538 | }, 539 | { 540 | "output_type": "stream", 541 | "stream": "stdout", 542 | "text": [ 543 | "\n", 544 | "5100" 545 | ] 546 | }, 547 | { 548 | "output_type": "stream", 549 | "stream": "stdout", 550 | "text": [ 551 | "\n", 552 | "5200" 553 | ] 554 | }, 555 | { 556 | "output_type": "stream", 557 | "stream": "stdout", 558 | "text": [ 559 | "\n", 560 | "5300" 561 | ] 562 | }, 563 | { 564 | "output_type": "stream", 565 | "stream": "stdout", 566 | "text": [ 567 | "\n", 568 | "5400" 569 | ] 570 | }, 571 | { 572 | "output_type": "stream", 573 | "stream": "stdout", 574 | "text": [ 575 | "\n", 576 | "5500" 577 | ] 578 | }, 579 | { 580 | "output_type": "stream", 581 | "stream": "stdout", 582 | "text": [ 583 | "\n", 584 | "5600" 585 | ] 586 | }, 587 | { 588 | "output_type": "stream", 589 | "stream": "stdout", 590 | "text": [ 591 | "\n", 592 | "5700" 593 | ] 594 | }, 595 | { 596 | "output_type": "stream", 597 | "stream": "stdout", 598 | "text": [ 599 | "\n", 600 | "5800" 601 | ] 602 | }, 603 | { 604 | "output_type": "stream", 605 | "stream": "stdout", 606 | "text": [ 607 | "\n", 608 | "5900" 609 | ] 610 | }, 611 | { 612 | "output_type": "stream", 613 | "stream": "stdout", 614 | "text": [ 615 | "\n", 616 | "6000" 617 | ] 618 | }, 619 | { 620 | "output_type": "stream", 621 | "stream": "stdout", 622 | "text": [ 623 | "\n", 624 | "6100" 625 | ] 626 | }, 627 | { 628 | "output_type": "stream", 629 | "stream": "stdout", 630 | "text": [ 631 | "\n", 632 | "6200" 633 | ] 634 | }, 635 | { 
636 | "output_type": "stream", 637 | "stream": "stdout", 638 | "text": [ 639 | "\n", 640 | "6300" 641 | ] 642 | }, 643 | { 644 | "output_type": "stream", 645 | "stream": "stdout", 646 | "text": [ 647 | "\n", 648 | "6400" 649 | ] 650 | }, 651 | { 652 | "output_type": "stream", 653 | "stream": "stdout", 654 | "text": [ 655 | "\n", 656 | "6500" 657 | ] 658 | }, 659 | { 660 | "output_type": "stream", 661 | "stream": "stdout", 662 | "text": [ 663 | "\n", 664 | "6600" 665 | ] 666 | }, 667 | { 668 | "output_type": "stream", 669 | "stream": "stdout", 670 | "text": [ 671 | "\n", 672 | "6700" 673 | ] 674 | }, 675 | { 676 | "output_type": "stream", 677 | "stream": "stdout", 678 | "text": [ 679 | "\n", 680 | "6800" 681 | ] 682 | }, 683 | { 684 | "output_type": "stream", 685 | "stream": "stdout", 686 | "text": [ 687 | "\n", 688 | "6900" 689 | ] 690 | }, 691 | { 692 | "output_type": "stream", 693 | "stream": "stdout", 694 | "text": [ 695 | "\n", 696 | "7000" 697 | ] 698 | }, 699 | { 700 | "output_type": "stream", 701 | "stream": "stdout", 702 | "text": [ 703 | "\n", 704 | "7100" 705 | ] 706 | }, 707 | { 708 | "output_type": "stream", 709 | "stream": "stdout", 710 | "text": [ 711 | "\n", 712 | "7200" 713 | ] 714 | }, 715 | { 716 | "output_type": "stream", 717 | "stream": "stdout", 718 | "text": [ 719 | "\n", 720 | "7300" 721 | ] 722 | }, 723 | { 724 | "output_type": "stream", 725 | "stream": "stdout", 726 | "text": [ 727 | "\n", 728 | "7400" 729 | ] 730 | }, 731 | { 732 | "output_type": "stream", 733 | "stream": "stdout", 734 | "text": [ 735 | "\n", 736 | "7500" 737 | ] 738 | }, 739 | { 740 | "output_type": "stream", 741 | "stream": "stdout", 742 | "text": [ 743 | "\n", 744 | "7600" 745 | ] 746 | }, 747 | { 748 | "output_type": "stream", 749 | "stream": "stdout", 750 | "text": [ 751 | "\n", 752 | "7700" 753 | ] 754 | }, 755 | { 756 | "output_type": "stream", 757 | "stream": "stdout", 758 | "text": [ 759 | "\n", 760 | "7800" 761 | ] 762 | }, 763 | { 764 | "output_type": "stream", 
765 | "stream": "stdout", 766 | "text": [ 767 | "\n", 768 | "7900" 769 | ] 770 | }, 771 | { 772 | "output_type": "stream", 773 | "stream": "stdout", 774 | "text": [ 775 | "\n", 776 | "8000" 777 | ] 778 | }, 779 | { 780 | "output_type": "stream", 781 | "stream": "stdout", 782 | "text": [ 783 | "\n", 784 | "8100" 785 | ] 786 | }, 787 | { 788 | "output_type": "stream", 789 | "stream": "stdout", 790 | "text": [ 791 | "\n", 792 | "8200" 793 | ] 794 | }, 795 | { 796 | "output_type": "stream", 797 | "stream": "stdout", 798 | "text": [ 799 | "\n", 800 | "8300" 801 | ] 802 | }, 803 | { 804 | "output_type": "stream", 805 | "stream": "stdout", 806 | "text": [ 807 | "\n", 808 | "8400" 809 | ] 810 | }, 811 | { 812 | "output_type": "stream", 813 | "stream": "stdout", 814 | "text": [ 815 | "\n", 816 | "8500" 817 | ] 818 | }, 819 | { 820 | "output_type": "stream", 821 | "stream": "stdout", 822 | "text": [ 823 | "\n", 824 | "8600" 825 | ] 826 | }, 827 | { 828 | "output_type": "stream", 829 | "stream": "stdout", 830 | "text": [ 831 | "\n", 832 | "8700" 833 | ] 834 | }, 835 | { 836 | "output_type": "stream", 837 | "stream": "stdout", 838 | "text": [ 839 | "\n", 840 | "8800" 841 | ] 842 | }, 843 | { 844 | "output_type": "stream", 845 | "stream": "stdout", 846 | "text": [ 847 | "\n", 848 | "8900" 849 | ] 850 | }, 851 | { 852 | "output_type": "stream", 853 | "stream": "stdout", 854 | "text": [ 855 | "\n", 856 | "9000" 857 | ] 858 | }, 859 | { 860 | "output_type": "stream", 861 | "stream": "stdout", 862 | "text": [ 863 | "\n", 864 | "9100" 865 | ] 866 | }, 867 | { 868 | "output_type": "stream", 869 | "stream": "stdout", 870 | "text": [ 871 | "\n", 872 | "9200" 873 | ] 874 | }, 875 | { 876 | "output_type": "stream", 877 | "stream": "stdout", 878 | "text": [ 879 | "\n", 880 | "9300" 881 | ] 882 | }, 883 | { 884 | "output_type": "stream", 885 | "stream": "stdout", 886 | "text": [ 887 | "\n", 888 | "9400" 889 | ] 890 | }, 891 | { 892 | "output_type": "stream", 893 | "stream": "stdout", 894 | 
"text": [ 895 | "\n", 896 | "9500" 897 | ] 898 | }, 899 | { 900 | "output_type": "stream", 901 | "stream": "stdout", 902 | "text": [ 903 | "\n", 904 | "9600" 905 | ] 906 | }, 907 | { 908 | "output_type": "stream", 909 | "stream": "stdout", 910 | "text": [ 911 | "\n", 912 | "9700" 913 | ] 914 | }, 915 | { 916 | "output_type": "stream", 917 | "stream": "stdout", 918 | "text": [ 919 | "\n", 920 | "9800" 921 | ] 922 | }, 923 | { 924 | "output_type": "stream", 925 | "stream": "stdout", 926 | "text": [ 927 | "\n", 928 | "9900" 929 | ] 930 | }, 931 | { 932 | "output_type": "stream", 933 | "stream": "stdout", 934 | "text": [ 935 | "\n", 936 | "10000" 937 | ] 938 | }, 939 | { 940 | "output_type": "stream", 941 | "stream": "stdout", 942 | "text": [ 943 | "\n", 944 | "10100" 945 | ] 946 | }, 947 | { 948 | "ename": "TypeError", 949 | "evalue": "object of type 'NoneType' has no len()", 950 | "output_type": "pyerr", 951 | "traceback": [ 952 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", 953 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 33\u001b[1;33m \u001b[0mget_time_series_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mq\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'time-series'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'twitter_world_trends'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 954 | "\u001b[1;32m\u001b[0m in \u001b[0;36mget_time_series_data\u001b[1;34m(q, mongo_db_name, mongo_db_coll, secs_per_interval, max_intervals, **mongo_conn_kw)\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0msave_json\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'SmileOct8'\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0minter\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;34m'.json'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 18\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[1;33m>>\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"Write {0} trends\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mids\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 19\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[1;33m>>\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"Zzz...\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[1;33m>>\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 955 | "\u001b[1;31mTypeError\u001b[0m: object of type 'NoneType' has no len()" 956 | ] 957 | }, 958 | { 959 | "output_type": "stream", 960 | "stream": "stdout", 961 | "text": [ 962 | "\n" 963 | ] 964 | } 965 | ], 966 | "prompt_number": 3 967 | }, 968 | { 969 | "cell_type": "code", 970 | "collapsed": false, 971 | "input": [], 972 | "language": "python", 973 | "metadata": {}, 974 | "outputs": [] 975 | } 976 | ], 977 | "metadata": {} 978 | } 979 | ] 980 | } --------------------------------------------------------------------------------