├── .gitignore
├── dbtweets.png
├── .github
│   └── FUNDING.yml
├── docker
│   ├── worker
│   │   ├── composer.json
│   │   ├── Dockerfile
│   │   └── worker-redis.php
│   └── twitter
│       ├── Dockerfile-search
│       ├── Dockerfile-stream
│       └── src
│           ├── package.json
│           ├── .env.sample
│           ├── mysave.js
│           ├── mytwitter.js
│           ├── myfunctions.js
│           ├── stream.js
│           └── search.js
├── docker-compose.yml
├── LICENSE
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
docker/twitter/src/.env
docker/twitter/src/node_modules
--------------------------------------------------------------------------------
/dbtweets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tazeg/dbtweets/master/dbtweets.png
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
github: ['Tazeg']
custom: ['https://keybase.io/jeffprod']
--------------------------------------------------------------------------------
/docker/worker/composer.json:
--------------------------------------------------------------------------------
{
  "require": {
    "graphaware/neo4j-php-client": "^4.0",
    "predis/predis": "^1.1"
  }
}
--------------------------------------------------------------------------------
/docker/twitter/Dockerfile-search:
--------------------------------------------------------------------------------
FROM node
RUN apt-get update

RUN apt-get install -y nano
ENV TERM xterm

COPY src /usr/src/app
WORKDIR /usr/src/app
RUN npm install

CMD ["npm", "run", "search"]
--------------------------------------------------------------------------------
/docker/twitter/Dockerfile-stream:
--------------------------------------------------------------------------------
FROM node
RUN apt-get update

RUN apt-get install -y nano
ENV TERM xterm

COPY src /usr/src/app
WORKDIR /usr/src/app
RUN npm install

CMD ["npm", "run", "stream"]
--------------------------------------------------------------------------------
/docker/twitter/src/package.json:
--------------------------------------------------------------------------------
{
  "name": "twitter",
  "version": "0.0.1",
  "description": "Vacuum tweets",
  "private": true,
  "scripts": {
    "search": "node search.js",
    "stream": "node stream.js"
  },
  "author": "@JeffProd",
  "license": "SEE LICENSE IN LICENSE",
  "dependencies": {
    "dotenv": "^6.2.0",
    "moment": "^2.22.2",
    "neo4j-driver": "^1.7.2",
    "redis": "^2.8.0",
    "twit": "^2.2.11"
  }
}
--------------------------------------------------------------------------------
/docker/twitter/src/.env.sample:
--------------------------------------------------------------------------------
# Twitter
CONSUMER_KEY='***'
CONSUMER_SECRET='***'
ACCESS_TOKEN='***'
ACCESS_TOKEN_SECRET='***'

# Twitter REST API parameters
# https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
# https://developer.twitter.com/en/docs/twitter-api/v1/rules-and-filtering/overview/standard-operators
SEARCH_REST_q='realdonaldtrump'
SEARCH_REST_lang=''

# Twitter stream API parameters
# https://developer.twitter.com/en/docs/tweets/filter-realtime/guides/basic-stream-parameters.html
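# Per the Twitter docs linked above, 'track' is a comma-separated list of
# phrases and 'language' is a comma-separated list of BCP 47 codes (e.g. 'en').
# Both values are passed as-is to stream.js.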
SEARCH_STREAM_track='realdonaldtrump'
SEARCH_STREAM_language=''

# Redis host
REDIS_SERVER=redis-twitter
--------------------------------------------------------------------------------
/docker/worker/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:20.04
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get -y install git unzip php7.4-cli php7.4-curl php7.4-bcmath php7.4-mbstring php7.4-json
COPY . /usr/src/app
WORKDIR /usr/src/app
RUN php -r "copy('https://getcomposer.org/installer', 'composer-setup.php');"
RUN php -r "if (hash_file('sha384', 'composer-setup.php') === 'c31c1e292ad7be5f49291169c0ac8f683499edddcfd4e42232982d0fd193004208a58ff6f353fde0012d35fdd72bc394') { echo 'Installer verified'; } else { echo 'Installer corrupt'; unlink('composer-setup.php'); } echo PHP_EOL;"
RUN php composer-setup.php
RUN php -r "unlink('composer-setup.php');"
RUN php composer.phar install
CMD [ "php", "./worker-redis.php" ]
--------------------------------------------------------------------------------
/docker/twitter/src/mysave.js:
--------------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// Web     : https://jeffprod.com
// Twitter : https://twitter.com/JeffProd
//-----------------------------------------------------------------------------
// Save a JSON tweet in Redis.
// A PHP worker will insert them into Neo4j.
//-----------------------------------------------------------------------------

require('dotenv').config();
const redis = require("redis");

// Push tweets to the end of the list.
// The worker will pull from the beginning of the list.
const r = redis.createClient({host: process.env.REDIS_SERVER});
r.on("error", function (err) {
  console.log("Error " + err);
  process.exit(1);
});

let saveTweet = function(status) {
  // IN : status = a tweet as a JSON object
  r.rpush('tweets', JSON.stringify(status));
}; // saveTweet()

exports.saveTweet = saveTweet;
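
mysave.js only produces into the `tweets` list; the consumer side lives in the PHP worker further down. As a rough illustration of the same hand-off pattern, a minimal Node consumer (hypothetical, not part of this repo) could drain the queue with the same `redis` package already listed in package.json:

```
// consumer-sketch.js (hypothetical): drain tweets pushed by mysave.js
require('dotenv').config();
const redis = require('redis');
const client = redis.createClient({host: process.env.REDIS_SERVER});

function drain() {
  // Read up to 100 tweets from the head of the list...
  client.lrange('tweets', 0, 99, function (err, items) {
    if (err) { console.error(err); return; }
    // ...then remove exactly what was read, like the PHP worker's lrange/ltrim pair.
    client.ltrim('tweets', items.length, -1, function () {
      items.forEach(function (json) {
        const tweet = JSON.parse(json);
        console.log(tweet.id_str);
      });
    });
  });
}

setInterval(drain, 1000);
```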
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '2'

services:

  neo4j-twitter:
    image: neo4j:3.5.18
    ports:
      - "7474:7474"
      - "7687:7687"
    volumes:
      - "/home/user/neo4j:/data"
    environment:
      - "NEO4J_AUTH=neo4j/123456"

  redis-twitter:
    image: redis
    ports:
      - "6379:6379"

  node-search:
    build:
      context: ./docker/twitter
      dockerfile: Dockerfile-search
    links:
      - redis-twitter:redis-twitter

  node-stream:
    build:
      context: ./docker/twitter
      dockerfile: Dockerfile-stream
    links:
      - redis-twitter:redis-twitter

  worker:
    build: ./docker/worker
    links:
      - redis-twitter:redis-twitter
      - neo4j-twitter:neo4j-twitter
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 JeffProd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/docker/twitter/src/mytwitter.js:
--------------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// Web     : https://jeffprod.com
// Twitter : https://twitter.com/JeffProd
//-----------------------------------------------------------------------------

let fn = require('./myfunctions');
let mysave = require('./mysave');

let tweetCount = 0;
let timeStart = fn.unixtime();
let timeLast = fn.unixtime();

let onStatusStream = function(status) {
  // Stream API : receiving a tweet
  // 'status' can also be : source: { limit: { track: 13, timestamp_ms: '1494706633814' } } }
  // See : https://developer.twitter.com/en/docs/tutorials/consuming-streaming-data#limit_notices
  if(status.id_str===undefined) {
    console.log(status);
    return;
  }

  // save the JSON tweet
  mysave.saveTweet(status);
  tweetCount++;

  // Log info every 5 secs
  let timeNow = fn.unixtime();
  if(timeNow-timeLast>=5) {
    console.log(tweetCount + " tweets | " + flowrate(timeNow) + " t/s");
    timeLast = timeNow;
  }
};

let flowrate = function(timeNow) {
  return (tweetCount/(timeNow-timeStart)).toFixed(2);
};

exports.onStatusStream = onStatusStream;
--------------------------------------------------------------------------------
/docker/twitter/src/myfunctions.js:
--------------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// Web     : https://jeffprod.com
// Twitter : https://twitter.com/JeffProd
//-----------------------------------------------------------------------------

let moment = require('moment');

let unixtime = function () {
  // Return seconds since 1970-01-01
  return new Date().getTime()/1000;
};

let twitterDateToYMD = function(tweetDate) {
  // tweetDate = 'Fri Jun 12 11:03:22 +0000 2009'
  // OUT : '2009-06-12'
  return moment(tweetDate, 'ddd MMM DD HH:mm:ss ZZ YYYY').format('YYYY-MM-DD');
};

let twitterDateToHMS = function(tweetDate) {
  // tweetDate = 'Fri Jun 12 11:03:22 +0000 2009'
  // OUT : '11:03:22'
  return moment(tweetDate, 'ddd MMM DD HH:mm:ss ZZ YYYY').format('HH:mm:ss');
};

let twitterDateToYMDHMS = function(tweetDate) {
  // tweetDate = 'Fri Jun 12 11:03:22 +0000 2009'
  // OUT : '2009-06-12 11:03:22'
  return moment(tweetDate, 'ddd MMM DD HH:mm:ss ZZ YYYY').format('YYYY-MM-DD HH:mm:ss');
};

exports.unixtime = unixtime;
exports.twitterDateToYMD = twitterDateToYMD;
exports.twitterDateToHMS = twitterDateToHMS;
exports.twitterDateToYMDHMS = twitterDateToYMDHMS;
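
A quick usage check of the conversion helpers above (a hypothetical snippet, not part of the repo), using the sample date from the comments:

```
// Hypothetical usage of the helpers in myfunctions.js
const fn = require('./myfunctions');

const d = 'Fri Jun 12 11:03:22 +0000 2009';   // Twitter's created_at format
console.log(fn.twitterDateToYMD(d));          // '2009-06-12'
console.log(fn.twitterDateToHMS(d));          // '11:03:22'
console.log(fn.twitterDateToYMDHMS(d));       // '2009-06-12 11:03:22'
```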
--------------------------------------------------------------------------------
/docker/twitter/src/stream.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node

//-----------------------------------------------------------------------------
// Web     : https://jeffprod.com
// Twitter : https://twitter.com/JeffProd
//-----------------------------------------------------------------------------
// Get tweets from the STREAM API
// https://developer.twitter.com/en/docs/tweets/filter-realtime/api-reference/post-statuses-filter.html
// Save each tweet with "mysave.js"
//-----------------------------------------------------------------------------

require('dotenv').config();
const Twit = require('twit');
const myTwitter = require('./mytwitter');

let stream; // to have only one stream

let t = new Twit({
  consumer_key: process.env.CONSUMER_KEY,
  consumer_secret: process.env.CONSUMER_SECRET,
  access_token: process.env.ACCESS_TOKEN,
  access_token_secret: process.env.ACCESS_TOKEN_SECRET
});

const params = {
  track: process.env.SEARCH_STREAM_track,
  language: process.env.SEARCH_STREAM_language
};

let startStream = function () {
  let waiting = false;
  stream = t.stream('statuses/filter', params);
  stream.on('tweet', myTwitter.onStatusStream);
  stream.on('error', function(error) {
    if(waiting) {return;} // do not start several streams
    waiting = true;
    console.log(error);
    console.log('retry in 30 secs.');
    setTimeout(startStream, 30000);
  });
};

console.log('Start twitter stream API, params=' + JSON.stringify(params));
startStream();
--------------------------------------------------------------------------------
/docker/twitter/src/search.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node

//-----------------------------------------------------------------------------
// Web     : https://jeffprod.com
// Twitter : https://twitter.com/JeffProd
//-----------------------------------------------------------------------------
// Get tweets from the SEARCH API
// https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets.html
// Save each tweet with "mysave.js"
//-----------------------------------------------------------------------------

require('dotenv').config();
const Twit = require('twit');
const mysave = require('./mysave');

const SLEEP = 2000; // Rate limit APP AUTH : 450 rqt / 15' <=> 1 rqt each 2"

let t = new Twit({
  consumer_key: process.env.CONSUMER_KEY,
  consumer_secret: process.env.CONSUMER_SECRET,
  app_only_auth: true // app auth limit is 450/15', user auth is 180/15'
});

let params = {
  q: process.env.SEARCH_REST_q,
  result_type: 'recent',
  count: 100,
  lang: process.env.SEARCH_REST_lang,
  since_id: ''
};

let search = function() {
  console.log('Search ' + JSON.stringify(params));
  t.get('search/tweets', params, function(error, tweets) {
    if(error) {console.log(error); return;}
    let len = tweets.statuses.length;
    let i = len-1;
    while(i >= 0) {
      mysave.saveTweet(tweets.statuses[i]);
      params.since_id = tweets.statuses[i].id_str; // the next search will start after this last tweet id
      i--;
    }
    console.log(len + ' tweets fetched, sleep ' + (SLEEP/1000) + 's');
  });
}; // search()

// loop
setInterval(search, SLEEP);
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### DB Tweets

Or how to vacuum up a lot of tweets on a specific topic.

It uses both the Twitter SEARCH API (polled at regular intervals) and the STREAM API.
Tweets are finally inserted into a Neo4j database that you can query in real time.

Please read and agree to the [Twitter Developer Agreement and Policy](https://developer.twitter.com/en/developer-terms/agreement-and-policy.html).

![Diagram](dbtweets.png)

### Requirements

- [Docker](https://docs.docker.com/install/)
- [Docker Compose](https://docs.docker.com/compose/)

### Install

```
git clone https://github.com/Tazeg/dbtweets.git
cd dbtweets
```

- Create a local directory for the Neo4j database:

```
mkdir /home/user/neo4j
```

- Rename `docker/twitter/src/.env.sample` to `.env` and edit it.
- Update the Neo4j volume path in `docker-compose.yml` according to the directory you created in the previous step, as shown below.
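
For example, the `neo4j-twitter` service in `docker-compose.yml` ships with this volume mapping; only the host path on the left needs to match the directory you created:

```
    volumes:
      - "/home/user/neo4j:/data"
```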

### Run

```
docker-compose rm
docker-compose build
docker-compose up
(or "docker-compose up -d" for daemon mode)
```
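
If you started in daemon mode, you can follow the worker's output to check that tweets are being inserted (it prints an insert count and rate for each batch):

```
docker-compose logs -f worker
```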

Go to:

- http://localhost:7474 for direct access to the database (login/pass: neo4j/123456)

### Sample Neo4j queries

Just copy/paste them into the Neo4j dashboard:

How many users are in the database:
```
MATCH (n:User) RETURN COUNT(n)
```

How many tweets:
```
MATCH (n:Tweet) RETURN COUNT(n)
```

Tweet count by date:
```
MATCH (t:Tweet)
RETURN t.created_at_YMD, count(*) AS nb
ORDER BY t.created_at_YMD
```

Apps used to post tweets:
```
MATCH (u:User)-[:POSTS|RETWEETS]->(t:Tweet)
WITH COUNT(t.source) AS nb, t.source AS source
RETURN source, nb ORDER BY nb DESC LIMIT 8
```

Tweets having coordinates:
```
MATCH (u:User)-[:POSTS]->(t:Tweet)
WHERE t.latitude<>0 AND t.longitude<>0
RETURN u,t
```

Searching strings in tweets:
```
MATCH (u:User)-[]->(t:Tweet)
WHERE toLower(t.text) CONTAINS 'car'
RETURN t.created_at_YMD, t.created_at_HIS, u.screen_name, t.text
ORDER BY t.created_at_YMD DESC, t.created_at_HIS DESC
LIMIT 20
```

Hashtags and counts:
```
MATCH (h:Hashtag)<-[:TAGS]-(t:Tweet)
WITH h, COUNT(h) AS nb
ORDER BY nb DESC
RETURN h.text AS text, nb
LIMIT 15
```

Most shared links:
```
MATCH (t:Tweet)-[r:CONTAINS]->(l:Link)
WITH l.url AS url, COUNT(r) AS nb
WHERE nb>1
RETURN url, nb
ORDER BY nb DESC
```

Most shared media:
```
MATCH (t:Tweet)
WHERE t.media_url<>""
RETURN t.media_url AS media, COUNT(t.media_url) AS nb
ORDER BY nb DESC
```

Most retweeted tweets:
```
MATCH (u:User)-[:POSTS]->(t:Tweet)
RETURN 'https://twitter.com/' + u.screen_name + '/status/' + t.id_str AS tweeturl, t.text, t.retweet_count
ORDER BY t.retweet_count DESC LIMIT 100
```

Top languages of tweets:
```
MATCH (t:Tweet)
WHERE t.lang<>"und"
RETURN t.lang AS lng, COUNT(t.lang) AS nb
ORDER BY nb DESC
LIMIT 10
```

Most active users:
```
MATCH (u:User)-[r:POSTS|RETWEETS]->(t:Tweet)
RETURN u.screen_name AS screen_name,
COUNT(r) AS tweet_or_rt_count,
u.friends_count AS friends_count,
u.followers_count AS followers_count
ORDER BY tweet_or_rt_count DESC
LIMIT 15
```

Most mentioned users:
```
MATCH (t:Tweet)-[:MENTIONS]->(u:User)
WHERE NOT t.text=~("(?i)^RT @"+u.screen_name+".*")
RETURN u.screen_name AS screen_name,
COUNT(u.screen_name) AS count,
u.friends_count AS friends_count,
u.followers_count AS followers_count
ORDER BY count DESC
LIMIT 15
```

Users who retweet the most:
```
MATCH (u:User)-[r:RETWEETS]->(t:Tweet)
RETURN u.screen_name AS screen_name,
COUNT(r) AS nbRT
ORDER BY nbRT DESC
LIMIT 15
```

Retweets by a given user, sorted by date:
```
MATCH (u1:User)-[r:RETWEETS]->(t:Tweet)<-[:POSTS]-(u2:User)
WHERE u1.id_str='123456789'
RETURN t.created_at_YMD + ' ' + t.created_at_HIS AS dateTweet, r.created_at_YMD + ' ' + r.created_at_HIS AS dateRT, u2.screen_name, t.text
ORDER BY t.created_at_YMD DESC, t.created_at_HIS DESC
LIMIT 15
```

The very first retweeters of a given tweet:
```
MATCH (u:User)-[r:RETWEETS]->(t:Tweet {id_str: '123456789'})
RETURN t.created_at_YMD + ' ' + t.created_at_HIS AS dateTweet, r.created_at_YMD + ' ' + r.created_at_HIS AS dateRT,
duration.between(datetime(t.created_at_YMD + 'T' + t.created_at_HIS), datetime(r.created_at_YMD + 'T' + r.created_at_HIS)).seconds AS seconds,
u.screen_name AS retweeter
ORDER BY seconds
LIMIT 15
```

### Stop

```
CTRL+C
(or "docker-compose stop" if running in daemon mode)
```

### Donate

https://en.jeffprod.com/donate/
--------------------------------------------------------------------------------
/docker/worker/worker-redis.php:
--------------------------------------------------------------------------------
<?php
//-----------------------------------------------------------------------------
// Web     : https://jeffprod.com
// Twitter : https://twitter.com/JeffProd
//-----------------------------------------------------------------------------
// Worker : pop JSON tweets from the Redis queue and insert them into Neo4j.
//-----------------------------------------------------------------------------

require __DIR__.'/vendor/autoload.php';

// NB : these values must match docker-compose.yml (service names and NEO4J_AUTH)
define('REDIS_SERVER', 'redis-twitter');
define('NEO4J_SERVER', 'neo4j-twitter');
define('NEO4J_USER', 'neo4j');
define('NEO4J_PASSWD', '123456');

$redis = new Predis\Client(['host' => REDIS_SERVER]);

// Wait for Neo4j to start, otherwise we get :
// PHP Warning: stream_socket_client(): unable to connect to tcp://neo4j-twitter:7687
echo 'Waiting 10s...'.PHP_EOL;
sleep(10);
$neo4j = GraphAware\Neo4j\Client\ClientBuilder::create()
    ->addConnection('bolt', 'bolt://'.NEO4J_USER.':'.NEO4J_PASSWD.'@'.NEO4J_SERVER.':7687')
    ->build();

// Create constraints and indexes (not on screen_name, which can change)
echo 'Creating Neo4j indexes...'.PHP_EOL;
$neo4j->run('CREATE CONSTRAINT ON (u:User) ASSERT u.id_str IS UNIQUE');
$neo4j->run('CREATE CONSTRAINT ON (t:Tweet) ASSERT t.id_str IS UNIQUE');
$neo4j->run('CREATE CONSTRAINT ON (h:Hashtag) ASSERT h.text IS UNIQUE');
$neo4j->run('CREATE CONSTRAINT ON (l:Link) ASSERT l.url IS UNIQUE');

define('NB_TO_GET', 100);

$done = 0;
$start = get_ms();

echo 'Worker is listening to Redis...'.PHP_EOL;
while(true) {
    $tweets = $redis->lrange('tweets', 0, NB_TO_GET-1); // get up to NB_TO_GET items
    $redis->ltrim('tweets', count($tweets), -1); // and delete exactly the items that were read
    if(empty($tweets)) {
        // empty queue, sleep a little
        sleep(1);
        $done = 0;
        $start = get_ms();
        continue;
    }

    $neo4j->run('BEGIN');
    foreach($tweets as $jsontweet) {
        $done++;
        addtweet_neo4j($jsontweet, $neo4j);
    } // foreach tweets
    $neo4j->run('COMMIT');

    $duration_s = (get_ms()-$start)/1000;
    $flowrate = '0';
    if($duration_s>0) {$flowrate = round($done/$duration_s, 2);}
    echo $done.' tweets inserted - '.$flowrate.' insert/s'.PHP_EOL;
} // while(true)

/**
 * Insert a JSON tweet into Neo4j
 * @param $jsontweet : the native JSON of a tweet
 * @param $neo4j : Neo4j client connection
 */
function addtweet_neo4j($jsontweet, $neo4j) {
    // (User)-[:POSTS]->(Tweet)
    // or if RT : (User)-[:RETWEETS]->(Tweet:['retweeted_status'])<-[:POSTS]-(User:['retweeted_status']['user'])
    // NB : quoted tweets (is_quote_status) are not processed yet
    // (Tweet)-[:MENTIONS]->(User)   ['entities']['user_mentions'][x]['screen_name+name+id_str']
    // (Tweet)-[:TAGS]->(Hashtag)    ['entities']['hashtags'][x]['text']
    // (Tweet)-[:CONTAINS]->(Link)   ['entities']['urls'][x]['expanded_url']
    $jsontweet = json_decode($jsontweet, true);

    if (!empty($jsontweet['retweeted_status'])) {
        // this is a RT
        // (User:['retweeted_status']['user'])-[:POSTS]->(Tweet:['retweeted_status'])
        addtweet($jsontweet['retweeted_status']['user'], 'POSTS', $jsontweet['retweeted_status'], $neo4j);
        // (User)-[:RETWEETS]->(Tweet:['retweeted_status'])
        addtweet($jsontweet['user'], 'RETWEETS', $jsontweet['retweeted_status'], $neo4j);
        // add the RT date and time
        $neo4j->run('MATCH (u:User {id_str:{user_id_str}}), (t:Tweet {id_str:{tweet_id_str}}), (u)-[r:RETWEETS]->(t) SET r.created_at_YMD={created_at_YMD}, r.created_at_HIS={created_at_HIS}', [ // edge
            'user_id_str' => $jsontweet['user']['id_str'],
            'tweet_id_str' => $jsontweet['retweeted_status']['id_str'],
            'created_at_YMD' => strdate_to($jsontweet['created_at'], 'Y-m-d'),
            'created_at_HIS' => strdate_to($jsontweet['created_at'], 'H:i:s')
        ]);
    }
    else {
        // Not a RT
        // (User)-[:POSTS]->(Tweet)
        addtweet($jsontweet['user'], 'POSTS', $jsontweet, $neo4j);
    }
} // addtweet_neo4j

function addtweet($user, $rel, $tweet, $neo4j) {
    $place = '';
    $longitude = 0;
    $latitude = 0;
    $media_url = '';

    if(!empty($tweet['place'])) {
        $place = $tweet['place']['full_name'];
    }
    if(!empty($tweet['coordinates'])) {
        $longitude = $tweet['coordinates']['coordinates'][0];
        $latitude = $tweet['coordinates']['coordinates'][1];
    }
    if(!empty($tweet['entities']['media'])) {
        foreach($tweet['entities']['media'] as $image) {
            $media_url = $image['media_url_https']; // only the last one is kept
        }
    }
    $source = strip_tags($tweet['source']); // source is HTML, ex : '<a ...>Falcon Pro Material</a>'
    // ON MATCH too, so that the retweet count gets updated
    $q = 'MERGE (t:Tweet {id_str:{tweet_id_str}}) ON CREATE SET t+={infos_tweets} ON MATCH SET t+={infos_tweets} '.
         'MERGE (u:User {id_str:{user_id_str}}) ON CREATE SET u+={infos_user} ON MATCH SET u+={infos_user} '.
         'MERGE (u)-[:'.$rel.']->(t)';
    $args = [
        'tweet_id_str' => $tweet['id_str'],
        'infos_tweets' => [
            'id_str' => $tweet['id_str'],
            'created_at_YMD' => strdate_to($tweet['created_at'], 'Y-m-d'),
            'created_at_HIS' => strdate_to($tweet['created_at'], 'H:i:s'),
            'text' => mytrim($tweet['text']),
            'lang' => $tweet['lang'],
            'media_url' => $media_url,
            'source' => $source,
            'in_reply_to_status_id' => $tweet['in_reply_to_status_id'],
            'place' => $place,
            'longitude' => $longitude,
            'latitude' => $latitude,
            'retweet_count' => $tweet['retweet_count'],
            'favorite_count' => $tweet['favorite_count']
        ],
        'user_id_str' => $user['id_str'],
        'infos_user' => [
            'id_str' => $user['id_str'],
            'created_at_YMD' => strdate_to($user['created_at'], 'Y-m-d'),
            'created_at_HIS' => strdate_to($user['created_at'], 'H:i:s'),
            'name' => mytrim($user['name']),
            'screen_name' => strtolower($user['screen_name']),
            'location' => mytrim($user['location']),
            'url' => mytrim($user['url']),
            'description' => mytrim($user['description']),
            'protected' => $user['protected'],
            'verified' => $user['verified'],
            'followers_count' => $user['followers_count'],
            'friends_count' => $user['friends_count'],
            'listed_count' => $user['listed_count'],
            'favourites_count' => $user['favourites_count'],
            'statuses_count' => $user['statuses_count'],
            'utc_offset' => $user['utc_offset'],
            'time_zone' => $user['time_zone'],
            'geo_enabled' => $user['geo_enabled'],
            'lang' => $user['lang'],
            'profile_image_url' => $user['profile_image_url_https'],
            'profile_background_image' => $user['profile_background_image_url_https'],
            'influence' => influence($user['friends_count'], $user['followers_count'])
        ]
    ];
    $neo4j->run($q, $args);

    // hashtags
    foreach($tweet['entities']['hashtags'] as $hashtag) {
        $h = strtolower($hashtag['text']);
        $neo4j->run('MERGE (h:Hashtag {text:{h_text}})', ['h_text' => $h]); // node
        $neo4j->run('MATCH (h:Hashtag {text:{h_text}}), (t:Tweet {id_str:{tweet_id_str}}) MERGE (t)-[:TAGS]->(h)', [ // edge
            'h_text' => $h,
            'tweet_id_str' => $tweet['id_str']
        ]);
    } // foreach hashtags

    // mentions
    foreach($tweet['entities']['user_mentions'] as $mention) {
        $neo4j->run('MERGE (u:User {id_str:{user_id_str}}) ON CREATE SET u += {infos}', [ // node
            'user_id_str' => $mention['id_str'],
            'infos' => [
                'screen_name' => strtolower($mention['screen_name']),
                'name' => mytrim($mention['name'])
            ]]);
        $neo4j->run('MATCH (u:User {id_str:{user_id_str}}), (t:Tweet {id_str:{tweet_id_str}}) MERGE (t)-[:MENTIONS]->(u)', [ // edge
            'user_id_str' => $mention['id_str'],
            'tweet_id_str' => $tweet['id_str']
        ]);
    } // foreach mentions

    // urls : (Tweet)-[:CONTAINS]->(Link), ['entities']['urls'][x]['expanded_url']
    foreach($tweet['entities']['urls'] as $url) {
        // skip URLs that simply point back to a tweet on twitter.com
        if(preg_match('~https://twitter.com/.+/status/[0-9]+~', $url['expanded_url'])) {continue;}
        $neo4j->run('MERGE (l:Link {url:{url}})', ['url' => $url['expanded_url']]); // node
        $neo4j->run('MATCH (l:Link {url:{url}}), (t:Tweet {id_str:{tweet_id_str}}) MERGE (t)-[:CONTAINS]->(l)', [ // edge
            'url' => $url['expanded_url'],
            'tweet_id_str' => $tweet['id_str']
        ]);
    } // foreach urls
} // addtweet

function mytrim($txt) {
    $txt = str_replace("\r", '', $txt);
    $txt = str_replace("\n", ' ', $txt);
    return $txt;
}

function strdate_to($strdate, $format) {
    // IN  : $strdate = 'Sat Jul 21 12:02:47 +0000 2012'
    // IN  : $format = 'Y-m-d H:i:s'
    // OUT : '2012-07-21 12:02:47'
    return date($format, strtotime($strdate));
}

function get_ms() {
    return round(microtime(true) * 1000);
}

function influence($friends_count, $followers_count) {
    // Share of followers in the user's total relations, between 0 and 1
    $somme = $friends_count + $followers_count;
    if($somme>0) {return round($followers_count / $somme, 2);}
    return 0;
}
--------------------------------------------------------------------------------