├── .github └── workflows │ └── codeql.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── documentation └── fulldoc.md ├── requirements.txt ├── samples ├── sample_.env ├── sample_key.json └── sample_s3_config.cfg ├── scraper ├── detector.py ├── requirements.txt └── sns.py └── webapp ├── app.py ├── form.py ├── readme.md ├── requirements.txt ├── setup.sh ├── sqlquery.py ├── static ├── Chart.min.js ├── banner4.jpeg ├── customCharts.js ├── logo-full.png ├── logo-transp.png ├── styles.css ├── table.css ├── userCharts.js └── utils.js ├── templates ├── _formhelpers.html ├── base.html ├── chart.html ├── custom.html ├── head.html ├── index.html ├── result.html └── scrap.html └── utils.py /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '35 23 * * 3' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'javascript', 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | encrypted_conf 2 | video* 3 | key.json 4 | .DS* 5 | \#* 6 | .env 7 | s3_config.cfg 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | COPY ./scraper /app/scraper 4 | COPY ./webapp /app/webapp 5 | COPY ./key.json /app/key.json 6 | COPY ./requirements.txt /app/requirements.txt 7 | 8 | EXPOSE 5000 9 | 10 | RUN apt update 11 | # Install node 12 | RUN apt install curl -y 13 | RUN curl -sL https://deb.nodesource.com/setup_14.x | bash - 14 | RUN apt -y install nodejs 15 | # Install npm and the scraper 16 | RUN curl -qL https://www.npmjs.com/install.sh | sh 17 | RUN npm install -g tiktok-scraper 18 | # Install other python requirements 19 | RUN python3 -m pip install -r /app/requirements.txt 20 | 21 | ENV FLASK_APP=/app/webapp/app.py 22 | 23 | ENTRYPOINT ["flask"] 24 | 25 | CMD ["run", "--host=0.0.0.0"] 26 | 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Victor L. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Social Net SCrap 2 | 3 | A powerful tool that collect data from videos on social media. 4 | 5 | **Possible improvments :** 6 | 7 | - Add a graphic cool visualisation ffor data in the DB 8 | - Automate db creation 9 | - Delete video after usage or send it to s3 10 | - Reduce video quality before sending it to Google with ffmpeg 11 | - Make google API calls optional 12 | - Add other scrapper (insta / snapchat .. ? ) 13 | - Include tiktok-scrapper in this project 14 | ------- 15 | ## Run the program 16 | 17 | Once you have setup the project with the instructions bellow, you can run the project. 18 | 19 | The program takes some time to run (download video then send it to google for inspection). 20 | 21 | ```python 22 | usage: sns.py [-h] [-u USER | --hashtag HASHTAG] [-n NUMBER] 23 | 24 | optional arguments: 25 | -h, --help show this help message and exit 26 | -u USER, --user USER the username of the account you want to scrap. 27 | --hashtag HASHTAG the hashtag you want to scrap (without #). 28 | -n NUMBER, --number NUMBER the number of videos to scrap (default: 10). 29 | ``` 30 | 31 | Examples : 32 | ```bash 33 | python3 ./sns -u -n 3 # will download 3 videos of 34 | python3 ./sns --hashtag -n 3 # will download 3 videos from #hashtag 35 | ``` 36 | 37 | ## Setup to run the program 38 | 39 | In order to run this program you will need to : 40 | * Get your google credentials (optional : needed to call google API) 41 | * Do not forget to enable the API [(see this link)](https://console.developers.google.com/apis/api/videointelligence.googleapis.com/overview) 42 | * Setup Python3 and install the requirements 43 | * Install s3cmd if you want to use with --s3 44 | * Install tiktok-scrapper 45 | * Install mysql and setup the DB 46 | 47 | ### Get Google credentials json file 48 | 49 | You need to get the json file in the GCP console and add it at the root of this repository under the name `key.json`. 50 | 51 | ### Setup Python 52 | 53 | Env is very useful to avoid a mess on your host when installing lot of strange python packets. 54 | 55 | ```bash 56 | apt update 57 | apt install virtualenv 58 | virtualenv env -p python3 59 | # to activate 60 | source env/bin/activate 61 | # to check versions 62 | ls env/lib/ 63 | # to deactivate 64 | deactivate 65 | ``` 66 | 67 | Then install the requirements : 68 | 69 | ```bash 70 | # In your virtual env 71 | python3 -m pip install ./requirements.txt 72 | ``` 73 | 74 | 75 | ### Install tiktok-scrapper 76 | 77 | The tiktok scrapper is a npm command you need to install : 78 | ```bash 79 | npm install -g tiktok-scraper 80 | ``` 81 | 82 | If it is not working try the solution at this link : 83 | https://github.com/drawrowfly/tiktok-scraper/pull/563#issuecomment-852264427 84 | 85 | 86 | ### Install mysql if not done already 87 | 88 | ```bash 89 | # install mysql 90 | apt install mysql 91 | 92 | # Login as root (no password, just press 'enter') 93 | mysql -u root -p 94 | ``` 95 | 96 | To create the DB : 97 | ```sql 98 | CREATE DATABASE sns; 99 | use sns; 100 | ``` 101 | 102 | You will need to store the db information in a `.env` file : 103 | ```bash 104 | SNS_DB_HOST="127.0.0.1" 105 | SNS_DB_USER="sns" 106 | SNS_DB_PASS="yourpass" 107 | SNS_DB_NAME="sns" 108 | SNS_DB_PORT=3306 109 | ``` 110 | 111 | Then you will need to create all the tables : 112 | 113 | #### Create user table : 114 | ```sql 115 | -- User 116 | CREATE TABLE user (id VARCHAR(255) PRIMARY KEY, nickname VARCHAR(255), avatar VARCHAR(1000), name VARCHAR(255), tikid VARCHAR(255), fans INT, secuid VARCHAR(255), signature VARCHAR(1000), digg INT, verified TINYINT, video INT, heart INT, following INT); 117 | 118 | ALTER TABLE user CONVERT TO CHARACTER SET utf8mb4; 119 | 120 | -- Music 121 | CREATE TABLE music (id VARCHAR(50) PRIMARY KEY, musicName VARCHAR(255), duration INT, playUrl VARCHAR(500), musicOriginal TINYINT, coverUrl VARCHAR(500), musicAlbum VARCHAR(255), musicAuthor VARCHAR(255)); 122 | 123 | ALTER TABLE music CONVERT TO CHARACTER SET utf8mb4; 124 | 125 | -- Video 126 | CREATE TABLE video (id VARCHAR(50) PRIMARY KEY, userId VARCHAR(255), shareCount INT, commentCount INT, playCount INT, videoUrl VARCHAR(1000), text VARCHAR(1000), coverDynamic VARCHAR(1000), createTime VARCHAR(255), secretID VARCHAR(255), webVideoUrl VARCHAR(1000), diggCount INT, height INT, width INT, duration INT); 127 | 128 | ALTER TABLE video CONVERT TO CHARACTER SET utf8mb4; 129 | 130 | -- Mention 131 | CREATE TABLE mention (id INT PRIMARY KEY AUTO_INCREMENT, id_video VARCHAR(255), username VARCHAR(255)); 132 | 133 | ALTER TABLE mention CONVERT TO CHARACTER SET utf8mb4; 134 | 135 | -- Hashtag 136 | CREATE TABLE hashtag (id INT PRIMARY KEY AUTO_INCREMENT, id_video VARCHAR(255), name VARCHAR(255), title VARCHAR(255), cover VARCHAR(1000)); 137 | 138 | ALTER TABLE hashtag CONVERT TO CHARACTER SET utf8mb4; 139 | 140 | -- Brand 141 | CREATE TABLE brand (id_video VARCHAR(255) PRIMARY KEY, name VARCHAR(255)); 142 | ALTER TABLE brand CONVERT TO CHARACTER SET utf8mb4; 143 | 144 | -- Theme 145 | CREATE TABLE theme (id_video VARCHAR(255) PRIMARY KEY, name VARCHAR(255)); 146 | 147 | ALTER TABLE theme CONVERT TO CHARACTER SET utf8mb4; 148 | 149 | -- explicit 150 | CREATE TABLE explicit (id_video VARCHAR(255) PRIMARY KEY, explicit VARCHAR(250)); 151 | 152 | ALTER TABLE explicit CONVERT TO CHARACTER SET utf8mb4; 153 | ``` 154 | -------------------------------------------------------------------------------- /documentation/fulldoc.md: -------------------------------------------------------------------------------- 1 | # Social Net SCrap 2 | 3 | Social Net SCrap (later called SNS) is an Open Source Intelligence (OSINT) tool, that collect data and metada from videos on social medias from users' profile. 4 | 5 | This document is here to list the full documentation of the project! 6 | 7 | ## Summary 8 | 9 | - I. Global overview 10 | 1. Why this project 11 | 2. Global architechture 12 | 3. How to install 13 | 4. Open Source 14 | 15 | - II. The scrapper 16 | 1. How it works 17 | 2. MySQL setup 18 | 3. S3 setup 19 | 4. Google Cloud setup 20 | 5. Adding more analysis 21 | 6. Usage example 22 | 23 | - III. The web interface 24 | 1. How it works 25 | 2. Research part 26 | 3. Scrap part 27 | 4. Adding more graphs 28 | 29 | - IV. To go further 30 | 1. Add new scrappers 31 | 2. Things to improve 32 | 33 | ## I. Global Overview 34 | ### 1. Why this project 35 | 36 | We decided to create this tool because we think that social media are underestimated and underused to collect personnal data on a specific user or to make links between a group of people. 37 | 38 | Moreover the data usually collected and analyzed is mostly textual or from pictures but it's rare to find tools that analyze the videos. Nowadays, videos are becoming mainstream, as we can see through the increase of video "stories", apps like Vine or TikTok or platform such as Youtube and Twitch. 39 | Being able to analyze these videos and detect what kind of person is behond, what kind of intentions or ideas are propagated are now a matter. 40 | 41 | SNS is the first step of a bigger project. Where here we scrap only one platform, we want to be able to implement more and more platform. Where here we collect only some types of data and use AI with moderation, we want to collect much more data using AI. And finally where we only display some graphs to present these data, we want to perform a complete deep-analysis of links between people as well as there content and who they are. 42 | 43 | ### 2. Global architecture 44 | 45 | To create SNS, we tried to make each part independent. So the scrapp is working standalone and the web interface too. Also, we packed the project in a container to it is easier to use. Finally we wanted a precise secret management, so we are using environement variable in the container to manage secrets. 46 | 47 | We decided to let the users choose wheter or not they want to store the data in a local or remote database, as well as videos in a object storage or if they wanted to delete them after analysis. 48 | 49 | We also wanted to make SNS usable from the CLI as well as from the web interface. 50 | 51 | Here you can find a schema of the architecture of the project : 52 | 53 | ------------------- put schema here ------------------- 54 | 55 | ### 3. How to install 56 | 57 | The installation is pretty straightforward ! 58 | 59 | ```bash 60 | docker pull sns:latest 61 | 62 | docker run -d -p 5000:5000 sns:latest 63 | 64 | # You can also add env variable like this : 65 | docker run -d -p 5000:5000 sns:latest # ------------------- Add env variable command ------------------- 66 | ``` 67 | Then you will be able to access the web interface on `127.0.0.1:5000`. 68 | 69 | Otherwise if you want to use the CLI : 70 | ```bash 71 | docker pull sns:latest 72 | 73 | docker run -t -i sns:latest /bin/bash 74 | cd /app/scrapper 75 | 76 | # Then you will be in the contaier and you can simply run : 77 | python3 sns.py --help 78 | ``` 79 | 80 | ### 4. Open Source 81 | 82 | We wanted to make this project opensource, as we think this should be edited and improved by everyone. Also the size of the project is too big for a small team like us. 83 | Finally we wanted everyone to be able to use our tool. 84 | 85 | 86 | ## II. The scrapper 87 | ### 1. How it works 88 | 89 | ------------------- Add the last part ------------------- 90 | - explain the process 91 | - talk about the dependancy 92 | 93 | ### 2. MySQL setup 94 | 95 | ------------------- Add the last part ------------------- 96 | - scw account 97 | - create database 98 | - create a user 99 | - connect to database 100 | - set the tables 101 | - set utf8 102 | - setup the env credentials 103 | 104 | 105 | ### 3. S3 setup 106 | 107 | We decided to allow users to save or not the scrapped videos into an s3 bucket. We think that some people might want to reuse the videos later in a custom process. 108 | In this documentation we will be using Scaleway Object Storage, so you need an account to connect to [the console of Scaleway](http://console.scaleway.com/). 109 | 110 | ------------------- Add the last part ------------------- 111 | - download s3cmd 112 | - create a bucket on scw 113 | - get your credentials 114 | - configure s3 cmd 115 | - set the config file at the right place 116 | - verify everything works 117 | 118 | 119 | ### 4. Google Cloud setup 120 | 121 | ------------------- Add the last part ------------------- 122 | - create a google cloud account 123 | - add an IAM role 124 | - get the credentials json file 125 | - set the creds at the right place 126 | - talk abour the GOOGLE_APPLICATION_CREDENTIALS env variable 127 | - authorized the use of the API to the 128 | 129 | ### 5. Adding more analysis 130 | 131 | ------------------- Add the last part ------------------- 132 | - talk about google full potential 133 | - talk about other providers 134 | - how to implement in the code 135 | 136 | ### 6. Usage example 137 | 138 | ------------------- Add the last part ------------------- 139 | - using cli only 140 | - saving to s3 141 | - using hashtag 142 | - using user 143 | 144 | 145 | ## III. The web interface 146 | ### 1. How it works 147 | 148 | ------------------- Add the last part ------------------- 149 | - explain flask 150 | - explain templates and jinja 151 | - explain the 2 parts search / scrap 152 | 153 | ### 2. Research part 154 | 155 | ------------------- Add the last part ------------------- 156 | - how to search 157 | - explain the search engine 158 | 159 | ### 3. Scrap part 160 | 161 | ------------------- Add the last part ------------------- 162 | - how to scrap 163 | - what if i do not see the user i scraped 164 | 165 | ### 4. Adding more graphs 166 | 167 | ------------------- Add the last part ------------------- 168 | - how graph works 169 | - how to add some 170 | 171 | 172 | ## IV. To go further 173 | ### 1. Add new scrappers 174 | 175 | ------------------- Add the last part ------------------- 176 | - link to insta scraper 177 | - how to implement new scraper 178 | 179 | 180 | 181 | ### 2. Things to improve 182 | 183 | ------------------- Add the last part ------------------- 184 | - add ssl certificates to web interface 185 | - create an API 186 | - add more options to the scraper (like music etc) 187 | - add more relevant graphs 188 | - add more scraper 189 | - make the front better 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | flask-wtf 3 | flask 4 | google-cloud-videointelligence==2.0.0 5 | mysql-connector -------------------------------------------------------------------------------- /samples/sample_.env: -------------------------------------------------------------------------------- 1 | SNS_DB_HOST="127.0.0.1" 2 | SNS_DB_USER="your_db_username" 3 | SNS_DB_PASS="your_db_password" 4 | SNS_DB_NAME="your_db_name" 5 | SNS_DB_PORT=3630 6 | 7 | GOOGLE_APPLICATION_CREDENTIALS=key.json 8 | 9 | S3_BUCKET_NAME=sns -------------------------------------------------------------------------------- /samples/sample_key.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "service_account", 3 | "project_id": "xxxx", 4 | "private_key_id": "xxxx", 5 | "private_key": "xxxx", 6 | "client_email": "xxxx", 7 | "client_id": "xxxx", 8 | "auth_uri": "https://accounts.google.com/o/oauth2/auth", 9 | "token_uri": "https://oauth2.googleapis.com/token", 10 | "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", 11 | "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/xxxx" 12 | } 13 | -------------------------------------------------------------------------------- /samples/sample_s3_config.cfg: -------------------------------------------------------------------------------- 1 | [default] 2 | # Object Storage Region NL-AMS 3 | host_base = s3.nl-ams.scw.cloud 4 | host_bucket = %(bucket)s.s3.nl-ams.scw.cloud 5 | bucket_location = nl-ams 6 | use_https = True 7 | 8 | # Login credentials 9 | access_key = XXXX 10 | secret_key = XXXX -------------------------------------------------------------------------------- /scraper/detector.py: -------------------------------------------------------------------------------- 1 | from google.cloud import videointelligence_v1 as videointelligence 2 | 3 | ########################################## 4 | ######## person 5 | ########################################## 6 | def person(annotation_result): 7 | for annotation in annotation_result.person_detection_annotations: 8 | print("Person detected:") 9 | for track in annotation.tracks: 10 | # Attributes include unique pieces of clothing, 11 | # poses, or hair color. 12 | print("Attributes:") 13 | for attribute in timestamped_object.attributes: 14 | print( 15 | "\t{}:{} {}".format( 16 | attribute.name, attribute.value, attribute.confidence 17 | ) 18 | ) 19 | 20 | # Landmarks in person detection include body parts such as 21 | # left_shoulder, right_ear, and right_ankle 22 | print("Landmarks:") 23 | for landmark in timestamped_object.landmarks: 24 | print( 25 | "\t{}: {} (x={}, y={})".format( 26 | landmark.name, 27 | landmark.confidence, 28 | landmark.point.x, # Normalized vertex 29 | landmark.point.y, # Normalized vertex 30 | ) 31 | ) 32 | 33 | ########################################## 34 | ######## FACE 35 | ########################################## 36 | def visage(annotation_result): 37 | for annotation in annotation_result.face_detection_annotations: 38 | print("Face detected:") 39 | for track in annotation.tracks: 40 | # Attributes include glasses, headwear, smiling, direction of gaze 41 | print("Attributes:") 42 | for attribute in timestamped_object.attributes: 43 | print( 44 | "\t{}:{} {}".format( 45 | attribute.name, attribute.value, attribute.confidence 46 | ) 47 | ) 48 | 49 | ########################################## 50 | ######## theme 51 | ########################################## 52 | def theme(annotation_result): 53 | # Process video/segment level label annotations 54 | videos_desc = [] 55 | segment_labels = annotation_result.segment_label_annotations 56 | for i, segment_label in enumerate(segment_labels): 57 | for i, segment in enumerate(segment_label.segments): 58 | confidence = segment.confidence 59 | if confidence >= 0.7 : 60 | videos_desc.append(segment_label.entity.description) 61 | 62 | return videos_desc 63 | 64 | ########################################## 65 | ######## explicit 66 | ########################################## 67 | def explicit(annotation_result): 68 | full = [] 69 | for frame in annotation_result.explicit_annotation.frames: 70 | likelihood = videointelligence.Likelihood(frame.pornography_likelihood) 71 | frame_time = frame.time_offset.seconds + frame.time_offset.microseconds / 1e6 72 | full.append(likelihood.name) 73 | 74 | if "VERY_LIKELY" in full: 75 | return "VERY_LIKELY" 76 | if "LIKELY" in full: 77 | return "LIKELY" 78 | if "POSSIBLE" in full: 79 | return "POSSIBLE" 80 | 81 | 82 | ########################################## 83 | ######## logo 84 | ########################################## 85 | def logo(annotation_result): 86 | logos = [] 87 | # Annotations for list of logos detected, tracked and recognized in video. 88 | for logo_recognition_annotation in annotation_result.logo_recognition_annotations: 89 | entity = logo_recognition_annotation.entity 90 | for track in logo_recognition_annotation.tracks: 91 | confidence = track.confidence 92 | if confidence > 0.92: 93 | logos.append(entity.description) 94 | return logos -------------------------------------------------------------------------------- /scraper/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-videointelligence==2.0.0 2 | mysql-connector 3 | python-dotenv 4 | flask 5 | flask-wtf -------------------------------------------------------------------------------- /scraper/sns.py: -------------------------------------------------------------------------------- 1 | #!/bin/python3 2 | 3 | # Before runnig make sure you have these tables ready : 4 | # Also after creating table run : 5 | # ALTER TABLE theme CONVERT TO CHARACTER SET utf8mb4; 6 | # 7 | # user 8 | # user (id VARCHAR(255) PRIMARY KEY, nickname VARCHAR(255), avatar VARCHAR(1000), name VARCHAR(255), tikid VARCHAR(255), fans INT, 9 | # secuid VARCHAR(255), signature VARCHAR(1000), digg INT, verified TINYINT, video INT, heart INT, following INT) 10 | # 11 | # music 12 | # TABLE music (id VARCHAR(50) PRIMARY KEY, musicName VARCHAR(255), duration INT, playUrl VARCHAR(500), 13 | # musicOriginal TINYINT, coverUrl VARCHAR(500), musicAlbum VARCHAR(255), musicAuthor VARCHAR(255)) 14 | # 15 | # video 16 | # video (id VARCHAR(50) PRIMARY KEY, userId VARCHAR(255), shareCount INT, commentCount INT, playCount INT, 17 | # videoUrl VARCHAR(1000), text VARCHAR(1000), coverDynamic VARCHAR(1000), createTime VARCHAR(255), secretID VARCHAR(255), 18 | # webVideoUrl VARCHAR(1000), diggCount INT, height INT, width INT, duration INT) 19 | # 20 | # mention 21 | # mention (id INT PRIMARY KEY AUTO_INCREMENT, id_video VARCHAR(255), username VARCHAR(255)) 22 | # 23 | # hashtag 24 | # hashtag (id INT PRIMARY KEY AUTO_INCREMENT, id_video VARCHAR(255), name VARCHAR(255), title VARCHAR(255), cover VARCHAR(1000)) 25 | # 26 | # brand 27 | # brand (id_video VARCHAR(255) PRIMARY KEY, name VARCHAR(255)) 28 | # 29 | # theme 30 | # theme (id_video VARCHAR(255) PRIMARY KEY, name VARCHAR(255)) 31 | # 32 | # explicit 33 | # explicit (id_video VARCHAR(255) PRIMARY KEY, explicit VARCHAR(250)) 34 | 35 | # - username / hashtag / number 36 | 37 | import os 38 | import sys 39 | import argparse 40 | # To retreive files from extensions : 41 | import glob 42 | import json 43 | import io 44 | import mysql.connector 45 | import asyncio 46 | from google.cloud import videointelligence_v1 as videointelligence 47 | 48 | # Local import 49 | import detector 50 | 51 | 52 | def fill_args(args): 53 | nb = 10 54 | user = "" 55 | hashtag = "" 56 | is_user = True 57 | delete = False 58 | save = False 59 | 60 | 61 | if args.number: nb = args.number 62 | 63 | if args.user and args.user != "": 64 | user = args.user 65 | is_user = True 66 | 67 | if args.hashtag and args.hashtag != "": 68 | hashtag = args.hashtag 69 | is_user = False 70 | 71 | if args.delete == 1: 72 | delete = True 73 | 74 | if args.s3 == 1: 75 | save = True 76 | 77 | return {"number":nb, "user":user, "hashtag":hashtag, "is_user":is_user, "delete":delete, "save":save} 78 | 79 | def dl_videos(p): 80 | one = ("user " if p["user"] != "" else "hashtag ") 81 | two = (p["user"] if p["user"] != "" else p["hashtag"]) 82 | three = str(p["number"]) 83 | cmd = "tiktok-scraper " + one + two + " -n " + three + " -d -t json" 84 | print(cmd) 85 | os.system(cmd) 86 | 87 | def get_files(p): 88 | dirname = (p["user"] if p["user"] != "" else "/#"+p["hashtag"]) 89 | pwd = "./" + dirname 90 | search_for_videos = pwd + "/*.mp4" 91 | search_for_json = pwd + "/*.json" 92 | videos = glob.glob(search_for_videos) 93 | json = glob.glob(search_for_json) 94 | return (videos, json) 95 | 96 | 97 | def parse_json(jsons, mycursor, mydb): 98 | if len(jsons) == 0: 99 | print("No video found... Quitting") 100 | exit(84) 101 | file = jsons[0] 102 | with open(file, 'r') as f: 103 | data = json.load(f) 104 | 105 | for elem in data: 106 | # Insert or Update 107 | ### User info ### 108 | meta = elem["authorMeta"] 109 | sql = """ 110 | INSERT INTO user (id, name, nickname, avatar, tikid, fans, secuid, signature, digg, verified, video, heart, following) 111 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE 112 | name=%s, nickname=%s, avatar=%s, tikid=%s, fans=%s, secuid=%s, 113 | signature=%s, digg=%s, verified=%s, video=%s, heart=%s, following=%s ; 114 | """ 115 | val = (meta["id"], meta["name"], meta["nickName"], meta["avatar"], meta["id"], 116 | meta["fans"], meta["secUid"], meta["signature"], meta["digg"], 117 | (1 if meta["verified"] == True else 0), meta["video"], meta["heart"], meta["following"], 118 | # from here it's for the update part could not find something simpler 119 | meta["name"], meta["nickName"], meta["avatar"], meta["id"], 120 | meta["fans"], meta["secUid"], meta["signature"], meta["digg"], 121 | (1 if meta["verified"] == True else 0), meta["video"], meta["heart"], meta["following"]) 122 | 123 | mycursor.execute(sql, val) 124 | mydb.commit() 125 | 126 | ### Music info ### 127 | meta = elem["musicMeta"] 128 | sql = """ 129 | INSERT INTO music (id, musicName, duration, playUrl, musicOriginal, coverUrl, musicAlbum, musicAuthor) 130 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE 131 | musicName=%s, duration=%s, playUrl=%s, musicOriginal=%s, coverUrl=%s, musicAlbum=%s, musicAuthor=%s; 132 | """ 133 | val = (meta["musicId"], meta["musicName"], meta["duration"], meta["playUrl"], (1 if meta["musicOriginal"] == True else 0), 134 | meta["coverLarge"], meta["musicAlbum"], meta["musicAuthor"], 135 | # from here it's for the update part could not find something simpler 136 | meta["musicName"], meta["duration"], meta["playUrl"], (1 if meta["musicOriginal"] == True else 0), 137 | meta["coverLarge"], meta["musicAlbum"], meta["musicAuthor"]) 138 | 139 | mycursor.execute(sql, val) 140 | mydb.commit() 141 | 142 | ### Video info ### 143 | meta = elem 144 | sql = """ 145 | INSERT INTO video (id, userId, shareCount, commentCount, playCount, videoUrl, text, coverDynamic, createTime, 146 | secretID, webVideoUrl, diggCount, height, width, duration) 147 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE 148 | userId=%s, shareCount=%s, commentCount=%s, playCount=%s, videoUrl=%s, text=%s, coverDynamic=%s, 149 | createTime=%s, secretID=%s, webVideoUrl=%s, diggCount=%s, height=%s, width=%s, duration=%s; 150 | """ 151 | val = (meta["id"], meta["authorMeta"]["id"], meta["shareCount"], meta["commentCount"], meta["playCount"], meta["videoUrl"], 152 | meta["text"], meta["covers"]["dynamic"], meta["createTime"], meta["secretID"], meta["webVideoUrl"], meta["createTime"], 153 | meta["videoMeta"]["height"], meta["videoMeta"]["width"], meta["videoMeta"]["duration"], 154 | # from here it's for the update part could not find something simpler 155 | meta["authorMeta"]["id"], meta["shareCount"], meta["commentCount"], meta["playCount"], meta["videoUrl"], 156 | meta["text"], meta["covers"]["dynamic"], meta["createTime"], meta["secretID"], meta["webVideoUrl"], meta["createTime"], 157 | meta["videoMeta"]["height"], meta["videoMeta"]["width"], meta["videoMeta"]["duration"]) 158 | 159 | mycursor.execute(sql, val) 160 | mydb.commit() 161 | 162 | ### Mentions info ### 163 | for mention in meta["mentions"]: 164 | username = mention[1:] # This remove the @ before a mention 165 | sql = """ 166 | INSERT INTO mention (id_video, username) 167 | VALUES (%s, %s) 168 | """ 169 | val = (meta["id"], mention) 170 | mycursor.execute(sql, val) 171 | mydb.commit() 172 | 173 | # As we insert anyway here, we need to remove duplicates now : 174 | # This keeps the highest id 175 | sql = """ 176 | DELETE t1 FROM mention t1 177 | INNER JOIN mention t2 178 | WHERE 179 | t1.id < t2.id AND 180 | t1.id_video = t2.id_video AND 181 | t1.username = t2.username; 182 | """ 183 | mycursor.execute(sql) 184 | mydb.commit() 185 | 186 | ### Hashtags info ### 187 | for hashtag in meta["hashtags"]: 188 | sql = """ 189 | INSERT INTO hashtag (id_video, name, title, cover) 190 | VALUES (%s, %s, %s, %s) 191 | """ 192 | val = (meta["id"], hashtag["name"], hashtag["title"], hashtag["cover"]) 193 | mycursor.execute(sql, val) 194 | mydb.commit() 195 | 196 | # As we insert anyway here, we need to remove duplicates now : 197 | # This keeps the highest id 198 | sql = """ 199 | DELETE t1 FROM hashtag t1 200 | INNER JOIN hashtag t2 201 | WHERE 202 | t1.id < t2.id AND 203 | t1.id_video = t2.id_video AND 204 | t1.name = t2.name; 205 | """ 206 | mycursor.execute(sql) 207 | mydb.commit() 208 | 209 | 210 | def get_video_id(video): 211 | # Get video id 212 | tmp = video 213 | p = tmp.find('/') 214 | while p != -1: 215 | tmp = tmp[p+1:] 216 | p = tmp.find('/') 217 | idv = tmp[:len(tmp) - 4] 218 | return idv 219 | 220 | 221 | def delete_video(video): 222 | print("Deleting : " + video) 223 | cmd = "rm " + video 224 | os.system(cmd) 225 | 226 | 227 | 228 | def google_single_video(video, mycursor, mydb): 229 | client = videointelligence.VideoIntelligenceServiceClient() 230 | config = videointelligence.types.PersonDetectionConfig( 231 | include_bounding_boxes=True, 232 | include_attributes=True, 233 | include_pose_landmarks=True, 234 | ) 235 | context = videointelligence.types.VideoContext(person_detection_config=config) 236 | 237 | # Open video 238 | with io.open(video, "rb") as f: 239 | input_content = f.read() 240 | # Start the asynchronous request 241 | print("Sending video " + video + " for analysis...") 242 | 243 | # Maybe here we can send all the videos at the same time 244 | operation = client.annotate_video( 245 | request={ 246 | "features": [videointelligence.Feature.LABEL_DETECTION, videointelligence.Feature.LOGO_RECOGNITION, videointelligence.Feature.LABEL_DETECTION, videointelligence.Feature.PERSON_DETECTION, videointelligence.Feature.FACE_DETECTION, videointelligence.Feature.EXPLICIT_CONTENT_DETECTION], 247 | "input_content": input_content, 248 | "video_context": context, 249 | } 250 | ) 251 | result = operation.result(timeout=90) 252 | 253 | # Retrieve the first result, because a single video was processed. 254 | annotation_result = result.annotation_results[0] 255 | 256 | print("Searching for explicit content...") 257 | explicit = detector.explicit(annotation_result) 258 | 259 | print("Searching for logo...") 260 | logos = detector.logo(annotation_result) 261 | 262 | print("Searching for theme...") 263 | themes = detector.theme(annotation_result) 264 | 265 | # Saving to DB 266 | ## Explicit content 267 | id = get_video_id(video) 268 | sql = """ 269 | INSERT INTO explicit (id_video, explicit) 270 | VALUES (%s, %s) ON DUPLICATE KEY UPDATE 271 | explicit=%s; 272 | """ 273 | val = (id, explicit, explicit) 274 | mycursor.execute(sql, val) 275 | mydb.commit() 276 | 277 | ## Logos 278 | for brand in logos: 279 | sql = """ 280 | INSERT INTO brand (id_video, name) 281 | VALUES (%s, %s) ON DUPLICATE KEY UPDATE 282 | name=%s; 283 | """ 284 | val = (id, brand, brand) 285 | mycursor.execute(sql, val) 286 | mydb.commit() 287 | 288 | ## Theme 289 | for theme in themes: 290 | sql = """ 291 | INSERT INTO theme (id_video, name) 292 | VALUES (%s, %s) ON DUPLICATE KEY UPDATE 293 | name=%s; 294 | """ 295 | val = (id, theme, theme) 296 | mycursor.execute(sql, val) 297 | mydb.commit() 298 | 299 | delete_video(video) 300 | 301 | def google_call(videos, mycursor, mydb, should_delete): 302 | for video in videos: 303 | google_single_video(video, mycursor, mydb) 304 | 305 | def setupDB(): 306 | db_host = os.getenv('SNS_DB_HOST', '127.0.0.1') 307 | db_port = os.getenv('SNS_DB_PORT', 3630) 308 | db_user = os.getenv('SNS_DB_USER', 'toto') 309 | db_pass = os.getenv('SNS_DB_PASS', 'toto') 310 | db_name = os.getenv('SNS_DB_NAME', 'toto') 311 | 312 | print(db_host) 313 | 314 | mydb = mysql.connector.connect( 315 | host = db_host, 316 | user = db_user, 317 | port = db_port, 318 | password = db_pass, 319 | database = db_name, 320 | charset = 'utf8mb4' 321 | ) 322 | return mydb 323 | 324 | 325 | def delete_jsons(jsons): 326 | print("Deleting : ") 327 | for file in jsons: 328 | print("- " + file) 329 | cmd = "rm " + file 330 | os.system(cmd) 331 | 332 | def save_videos(videos): 333 | all_videos = " ".join(videos) 334 | s3_bucket_name = "s3://" + os.getenv("S3_BUCKET_NAME", "sns") 335 | cmd = "s3cmd -c ./s3_config.cfg put " + all_videos + " " + s3_bucket_name 336 | print("sending videos to s3 : " + s3_bucket_name) 337 | os.system(cmd) 338 | 339 | def main(): 340 | parser = argparse.ArgumentParser() 341 | group = parser.add_mutually_exclusive_group() 342 | group.add_argument("-u", "--user", help="the username of the account you want to scrap.") 343 | group.add_argument("--hashtag", help="the hashtag you want to scrap (without #).") 344 | parser.add_argument("-n", "--number", help="the number of videos to scrap (default: 10).", type=int) 345 | parser.add_argument("-d", "--delete", help="present if you want the videos to be deleted after usage (default: not deleted)", default=0, action="count") 346 | parser.add_argument("-s", "--s3", help="if present, save to an s3, need a s3_config to be present (default: not saving", default=0, action="count") 347 | args = parser.parse_args() 348 | 349 | # Get arguments 350 | params = fill_args(args) 351 | 352 | # Dl videos with tiktok-scraper 353 | dl_videos(params) 354 | 355 | # retreive videos and json filesn in tabs 356 | (videos, jsons) = get_files(params) 357 | 358 | # Save videos if needed : 359 | if params["save"]: 360 | save_videos(videos) 361 | 362 | # parse json/csv file and store the result in DB 363 | parse_json(jsons, mycursor, mydb) 364 | 365 | # Add a step here to reduce the video quality ? 366 | # call google api and store result in DB 367 | google_call(videos, mycursor, mydb, params["delete"]) 368 | 369 | if params["delete"] : 370 | delete_jsons(jsons) 371 | 372 | 373 | # Setp db from env conf 374 | # env config can be in a .env file 375 | from dotenv import load_dotenv, find_dotenv 376 | load_dotenv(find_dotenv()) 377 | 378 | mydb = setupDB() 379 | mycursor = mydb.cursor() 380 | 381 | # start the program 382 | main() -------------------------------------------------------------------------------- /webapp/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | from flask import request, redirect, url_for 3 | from flask import render_template 4 | from flask import jsonify 5 | import os 6 | import mysql.connector 7 | 8 | import sqlquery 9 | import utils 10 | 11 | from form import searchform, scrapform 12 | 13 | from dotenv import load_dotenv 14 | 15 | app = Flask(__name__) 16 | SECRET_KEY = os.urandom(32) 17 | app.config['SECRET_KEY'] = SECRET_KEY 18 | 19 | load_dotenv() 20 | 21 | db_host = os.getenv('SNS_DB_HOST', '127.0.0.1') 22 | db_port = os.getenv('SNS_DB_PORT', 3630) 23 | db_user = os.getenv('SNS_DB_USER', 'toto') 24 | db_pass = os.getenv('SNS_DB_PASS', 'toto') 25 | db_name = os.getenv('SNS_DB_NAME', 'toto') 26 | 27 | mydb = mysql.connector.connect( 28 | host = db_host, 29 | user = db_user, 30 | port = db_port, 31 | password = db_pass, 32 | database = db_name, 33 | charset = 'utf8mb4' 34 | ) 35 | 36 | cursor = mydb.cursor() 37 | 38 | 39 | 40 | @app.route("/", methods=['GET', 'POST']) 41 | def chart(): 42 | searchF = searchform() 43 | 44 | if searchF.validate(): 45 | return redirect(url_for('search', item=request.form.get('search'))) 46 | 47 | ## get top 10 users 48 | names, videosNb = sqlquery.getTop10UsersByVideoCount(cursor) 49 | charts = {} 50 | charts["top10User"] = {'title': "Top 10 users with the most number of videos", \ 51 | 'labels':names, \ 52 | 'values':videosNb, \ 53 | 'legend':'Number of video'} 54 | 55 | ## get top 10 brand 56 | brands, brandsCount = sqlquery.getTop10BrandByCount(cursor) 57 | charts["top10Brand"] = {'title': "Top 10 brands detected", \ 58 | 'labels':brands, \ 59 | 'values':brandsCount, \ 60 | 'legend':'Number of times detected'} 61 | 62 | ## get explicit 63 | categories, count = sqlquery.getExplicitCountByCategory(cursor) 64 | charts["explicitCount"] = {'title': "Number of explicit per category", \ 65 | 'labels':categories, \ 66 | 'values':count, \ 67 | 'legend':'Number of video per category'} 68 | 69 | return render_template('base.html', charts=charts, title="SNS", form=searchF) 70 | 71 | 72 | @app.route('/search/') 73 | def search(item="N/A"): 74 | searchF = searchform() 75 | print(item) 76 | users = sqlquery.searchFromBaseUsername(cursor, item) 77 | return render_template('result.html', form=searchF, item=item, users=users) 78 | 79 | 80 | @app.route('/custom/') 81 | def custom(name="N/A"): 82 | searchF = searchform() 83 | userInfo = sqlquery.getUserInfo(cursor, name) 84 | videos = sqlquery.getUserVideos(cursor, name) 85 | videosInfo = utils.computeVideosInfo(videos) 86 | explicits = sqlquery.getExplicitVideoUrlFromUser(cursor, name) 87 | hashtags = sqlquery.getHashtagsCountForUser(cursor, name) 88 | mentions = sqlquery.getMentionsFromUser(cursor, name) 89 | brands = sqlquery.getBrandsCountForUser(cursor, name) 90 | return render_template('custom.html', form=searchF, info=userInfo, vidinfo=videosInfo, \ 91 | explicits=explicits, hashtags=hashtags, mentions=mentions, brands=brands) 92 | 93 | 94 | @app.route('/scrap', methods=['GET', 'POST']) 95 | def scrap(): 96 | searchF = searchform() 97 | scrapF =scrapform() 98 | 99 | if scrapF.validate_on_submit(): 100 | scrapData = {'radio':request.form.get('radio'), \ 101 | 'data':request.form.get('data'), \ 102 | 'number':request.form.get('number')} 103 | utils.launchScrapper(scrapData) 104 | return redirect(url_for('chart')) 105 | 106 | return render_template('scrap.html', form=searchF, scrapForm=scrapF) 107 | 108 | -------------------------------------------------------------------------------- /webapp/form.py: -------------------------------------------------------------------------------- 1 | from flask_wtf import FlaskForm 2 | from wtforms import StringField, validators, RadioField, IntegerField, DecimalField 3 | from wtforms.validators import NumberRange 4 | 5 | 6 | class searchform(FlaskForm): 7 | search = StringField('Search', [validators.Length(min=0, max=50), validators.DataRequired()]) 8 | 9 | 10 | class scrapform(FlaskForm): 11 | radio = RadioField('radio', choices=[('user','Username'),('hashtag','Hashtag')], default='user') 12 | data = StringField('data', [validators.Length(min=0, max=100), validators.DataRequired()]) 13 | number = IntegerField('number', validators=[ 14 | validators.Required(), 15 | validators.NumberRange(min=1, max=20) 16 | ]) 17 | -------------------------------------------------------------------------------- /webapp/readme.md: -------------------------------------------------------------------------------- 1 | # yo 2 | 3 | Ne pas oublier 4 | 5 | ```bash 6 | source env/bin/activate 7 | export FLASK_APP=ffl/app.py 8 | ``` -------------------------------------------------------------------------------- /webapp/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | flask-wtf 3 | flask -------------------------------------------------------------------------------- /webapp/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source env/bin/activate 4 | pip install -r ./requirements.txt 5 | export FLASK_APP=webapp/app.py -------------------------------------------------------------------------------- /webapp/sqlquery.py: -------------------------------------------------------------------------------- 1 | # Recupérer les utilisateurs avec un contenu possiblement explicit 2 | # SELECT user.nickname FROM user INNER JOIN video ON video.userId = user.id INNER JOIN explicit ON explicit.id_video = video.id; 3 | 4 | # Get url of possible explicit videos 5 | # SELECT webVideoUrl from video INNER JOIN explicit ON explicit.id_video = video.id limit 3; 6 | 7 | # convert time in seconds to date format : 8 | # import datetime 9 | # datetime.datetime.fromtimestamp(1549634152).strftime('%c') 10 | 11 | # Get user specific by item 12 | # SELECT name FROM user INNER JOIN video INNER JOIN explicit WHERE explicit.explicit='LIKELY' AND explicit.id_video=video.id AND video.userId=user.id; 13 | 14 | # Get top 10 brand and count 15 | def getTop10BrandByCount(cursor): 16 | cursor.execute("select name, count(*) from brand group by name limit 10;") 17 | result = cursor.fetchall() 18 | return [str(name) for name, count in result], [count for name, count in result] 19 | 20 | # get explicit count 21 | def getExplicitCountByCategory(cursor): 22 | cursor.execute("select explicit, count(*) from explicit group by explicit;") 23 | result = cursor.fetchall() 24 | return [str(name) for name, count in result], [count for name, count in result] 25 | 26 | # get top10 nickname by video count desc 27 | def getTop10UsersByVideoCount(cursor): 28 | cursor.execute("select nickname, video from user order by video desc limit 10;") 29 | users = cursor.fetchall() 30 | return [names for names, nb in users], [nb for names, nb in users] 31 | 32 | 33 | # search engine from username 34 | def searchFromBaseUsername(cursor, username): 35 | username = '%' + username + '%' 36 | cursor.execute("select nickname, name from user where nickname like %s or name like %s", (username,username)) 37 | users = cursor.fetchall() 38 | result = [{k:v} for k, v in users] 39 | return result 40 | 41 | 42 | # get user basic informations 43 | def getUserInfo(cursor, name): 44 | toSelect = ["nickname", "name", "fans", "signature", "verified", "video", "heart", "following"] 45 | cursor.execute("select nickname, name, fans, signature, verified, video, heart, following from user where name=%s;", (name,)) 46 | userInfo = cursor.fetchall() 47 | zipped = zip(toSelect, userInfo[0]) 48 | dicted = dict(zipped) 49 | return dicted 50 | 51 | # get all videos data from a user 52 | def getUserVideos(cursor, name): 53 | cursor.execute("select * from video inner join user where video.userId = user.id and user.name = %s;", (name,)) 54 | results = cursor.fetchall() 55 | num_fields = len(cursor.description) 56 | field_names = [i[0] for i in cursor.description] 57 | final_res = [] 58 | for item in results: 59 | zipped = zip(field_names, item) 60 | final_res.append(dict(zipped)) 61 | return final_res 62 | 63 | # Get all the explicit videos for a specific user 64 | def getExplicitVideoUrlFromUser(cursor, name): 65 | cursor.execute(""" \ 66 | select webVideoUrl, explicit from video join explicit inner join user \ 67 | WHERE user.name=%s and video.userId=user.id and video.id=explicit.id_video \ 68 | and (explicit.explicit='VERY_LIKELY' or explicit.explicit='LIKELY' or explicit.explicit='POSSIBLE'); 69 | """, (name,)) # or explicit.explicit='POSSIBLE' 70 | results = cursor.fetchall() 71 | results = dict(results) 72 | if results == {}: 73 | return {'N/A':'N/A'} 74 | return results 75 | 76 | # Get all the hashtag and the count for a specific user 77 | def getHashtagsCountForUser(cursor, name): 78 | cursor.execute(""" \ 79 | select hashtag.name, count(*) from hashtag inner join video inner join user where \ 80 | user.id=video.userId and hashtag.id_video=video.id and user.name=%s \ 81 | group by hashtag.name; \ 82 | """, (name,)) 83 | results = cursor.fetchall() 84 | results = dict(results) 85 | if results == {}: 86 | return {'N/A':'N/A'} 87 | return results 88 | 89 | # Get all mentions count for a specific user 90 | def getMentionsFromUser(cursor, name): 91 | cursor.execute("""\ 92 | select username, count(*) from mention inner join video inner join user \ 93 | where user.id=video.userId and mention.id_video=video.id and user.name=%s \ 94 | group by username; \ 95 | """, (name,)) 96 | results = cursor.fetchall() 97 | results = dict(results) 98 | if results == {}: 99 | return {'N/A':'N/A'} 100 | return results 101 | 102 | 103 | # get all brand count that appears for a users 104 | def getBrandsCountForUser(cursor, name): 105 | cursor.execute("""\ 106 | select brand.name, count(*) from brand inner join video inner join user \ 107 | where user.id=video.userId and brand.id_video=video.id and user.name=%s \ 108 | group by brand.name; \ 109 | """, (name,)) 110 | 111 | results = cursor.fetchall() 112 | results = dict(results) 113 | if results == {}: 114 | return {'N/A':'N/A'} 115 | return results -------------------------------------------------------------------------------- /webapp/static/banner4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VictorLuc4/Social-Net-SCrap/c6b481e4473ee0e878026080e20fa32ca28245d2/webapp/static/banner4.jpeg -------------------------------------------------------------------------------- /webapp/static/customCharts.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | // ---------------------------------------------- 4 | // explicit Count By Category 5 | // ---------------------------------------------- 6 | categoryColors = getColorsForTab(explicitCountData.values, 0.35) 7 | 8 | var explicitCountByCategory = document.getElementById("explicitCountByCategory").getContext("2d"); 9 | Chart.defaults.global.responsive = false; 10 | 11 | var explicitCountByCategoryChart = new Chart(explicitCountByCategory, { 12 | type: 'doughnut', 13 | data: { 14 | labels : explicitCountData.labels, 15 | datasets: [{ 16 | label: explicitCountData.legend, 17 | fill: true, 18 | lineTension: 0.1, 19 | backgroundColor: categoryColors, 20 | borderColor: categoryColors, 21 | data : explicitCountData.values, 22 | }] 23 | }, 24 | options: { 25 | title: { 26 | display: true, 27 | text: explicitCountData.title 28 | }, 29 | }, 30 | }); 31 | 32 | // ---------------------------------------------- 33 | // Top 10 users by video number : 34 | // ---------------------------------------------- 35 | top10colors = getColorsForTab(top10User.values, 0.16) 36 | 37 | var top10UsersByVideoCount = document.getElementById("top10UsersByVideoCount").getContext("2d"); 38 | //Chart.defaults.global.responsive = false; 39 | 40 | var top10UsersByVideoCountChart = new Chart(top10UsersByVideoCount, { 41 | type: 'bar', 42 | data: { 43 | labels : top10User.labels, 44 | datasets: [{ 45 | label: top10User.legend, 46 | fill: true, 47 | lineTension: 0.1, 48 | backgroundColor: top10colors, 49 | borderColor: top10colors, 50 | borderCapStyle: 'butt', 51 | borderDash: [], 52 | borderDashOffset: 0.0, 53 | borderJoinStyle: 'miter', 54 | pointBorderColor: top10colors, 55 | pointBackgroundColor: "#fff", 56 | pointBorderWidth: 1, 57 | pointHoverRadius: 5, 58 | pointHoverBackgroundColor: top10colors, 59 | pointHoverBorderColor: "rgba(220,220,220,1)", 60 | pointHoverBorderWidth: 2, 61 | pointRadius: 1, 62 | pointHitRadius: 10, 63 | data : top10User.values, 64 | spanGaps: false 65 | }] 66 | }, 67 | options: { 68 | title: { 69 | display: true, 70 | text: top10User.title 71 | }, 72 | }, 73 | }); 74 | // ---------------------------------------------- 75 | 76 | // ---------------------------------------------- 77 | // Top 10 users by video number : 78 | // ---------------------------------------------- 79 | top10colors = getColorsForTab(top10Brand.values, 0.16) 80 | 81 | var top10BrandByCount = document.getElementById("top10BrandByCount").getContext("2d"); 82 | //Chart.defaults.global.responsive = false; 83 | 84 | var top10BrandByCountChart = new Chart(top10BrandByCount, { 85 | type: 'bar', 86 | data: { 87 | labels : top10Brand.labels, 88 | datasets: [{ 89 | label: top10Brand.legend, 90 | fill: true, 91 | lineTension: 0.1, 92 | backgroundColor: top10colors, 93 | borderColor: top10colors, 94 | borderCapStyle: 'butt', 95 | borderDash: [], 96 | borderDashOffset: 0.0, 97 | borderJoinStyle: 'miter', 98 | pointBorderColor: top10colors, 99 | pointBackgroundColor: "#fff", 100 | pointBorderWidth: 1, 101 | pointHoverRadius: 5, 102 | pointHoverBackgroundColor: top10colors, 103 | pointHoverBorderColor: "rgba(220,220,220,1)", 104 | pointHoverBorderWidth: 2, 105 | pointRadius: 1, 106 | pointHitRadius: 10, 107 | data : top10Brand.values, 108 | spanGaps: false 109 | }] 110 | }, 111 | options: { 112 | title: { 113 | display: true, 114 | text: top10Brand.title 115 | }, 116 | }, 117 | }); 118 | // ---------------------------------------------- -------------------------------------------------------------------------------- /webapp/static/logo-full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VictorLuc4/Social-Net-SCrap/c6b481e4473ee0e878026080e20fa32ca28245d2/webapp/static/logo-full.png -------------------------------------------------------------------------------- /webapp/static/logo-transp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VictorLuc4/Social-Net-SCrap/c6b481e4473ee0e878026080e20fa32ca28245d2/webapp/static/logo-transp.png -------------------------------------------------------------------------------- /webapp/static/styles.css: -------------------------------------------------------------------------------- 1 | .wrapper 2 | { 3 | width: 100%; 4 | height: auto; 5 | display: grid; 6 | grid-template-columns: 50%; 7 | grid-template-areas: 8 | "c c" 9 | "c c" 10 | "c c" 11 | "c c"; 12 | } 13 | 14 | .general { 15 | padding: 10px; 16 | } 17 | 18 | canvas { 19 | grid-area: c; 20 | position: relative; 21 | } 22 | 23 | body { 24 | font-family: Arial; 25 | } 26 | 27 | body, html { 28 | height: 100%; 29 | margin: 0; 30 | font-family: Arial, Helvetica, sans-serif; 31 | } 32 | 33 | * { 34 | box-sizing: border-box; 35 | } 36 | 37 | /* The hero image */ 38 | .hero-image { 39 | /* Use "linear-gradient" to add a darken background effect to the image (photographer.jpg). This will make the text easier to read */ 40 | background-image: linear-gradient(rgba(0, 0, 0, 0.0), rgba(0, 0, 0, 0.0)), url("/static/banner4.jpeg"); 41 | 42 | /* Set a specific height */ 43 | height: 100%; 44 | 45 | /* Position and center the image to scale nicely on all screens */ 46 | background-position: center; 47 | background-repeat: no-repeat; 48 | background-size: cover; 49 | position: relative; 50 | } 51 | 52 | /* The hero image header */ 53 | .hero-image-head { 54 | /* Use "linear-gradient" to add a darken background effect to the image (photographer.jpg). This will make the text easier to read */ 55 | background-image: linear-gradient(rgba(0, 0, 0, 0.0), rgba(0, 0, 0, 0.0)), url("/static/banner4.jpeg"); 56 | 57 | /* Set a specific height */ 58 | height: 20%; 59 | 60 | /* Position and center the image to scale nicely on all screens */ 61 | background-position: center; 62 | background-repeat: no-repeat; 63 | background-size: cover; 64 | position: relative; 65 | } 66 | 67 | /* Place text in the middle of the image */ 68 | .hero-text { 69 | text-align: center; 70 | position: absolute; 71 | top: 50%; 72 | left: 50%; 73 | transform: translate(-50%, -50%); 74 | color: black; 75 | } 76 | 77 | /* ----- Search Form ----- */ 78 | 79 | form.example input[type=text] { 80 | padding: 10px; 81 | font-size: 17px; 82 | border: 1px solid grey; 83 | float: left; 84 | width: 80%; 85 | height:40px; 86 | background: #f1f1f1; 87 | } 88 | 89 | form.example button { 90 | float: left; 91 | width: 20%; 92 | padding: 10px; 93 | background: #86C4ECA2; 94 | color: white; 95 | font-size: 20px; 96 | height: 40px; 97 | border: 1px solid grey; 98 | border-left: none; 99 | cursor: pointer; 100 | } 101 | 102 | form.example button:hover { 103 | background: #86C4ECFF; 104 | } 105 | 106 | form.example::after { 107 | content: ""; 108 | clear: both; 109 | display: table; 110 | } 111 | 112 | /* ----- Scraper Form ----- */ 113 | 114 | form.scrapform input[type=text] { 115 | padding: 10px; 116 | font-size: 17px; 117 | border: 1px solid grey; 118 | float: center; 119 | width: 100%; 120 | background: #f1f1f1; 121 | } 122 | 123 | form.scrapform button { 124 | float: center; 125 | width: 100%; 126 | padding: 10px; 127 | background: #86C4ECA2; 128 | color: white; 129 | font-size: 20px; 130 | border: 1px solid grey; 131 | border-left: none; 132 | cursor: pointer; 133 | } 134 | 135 | form.scrapform button:hover { 136 | background: #86C4ECFF; 137 | } 138 | 139 | form.scrapform::after { 140 | content: ""; 141 | clear: both; 142 | display: table; 143 | } 144 | 145 | 146 | /* ----- Radio button ----- */ 147 | 148 | ul { 149 | list-style-type: none; 150 | margin: 0; 151 | padding: 0; 152 | } 153 | 154 | .radiobox { 155 | float: center; 156 | 157 | width: 50em; 158 | padding: 2em; 159 | background: #f6f6f6; 160 | } 161 | 162 | input[type=radio] { 163 | position: absolute; 164 | opacity: 0; 165 | } 166 | 167 | input[type=radio] + label { 168 | display: inline-block; 169 | } 170 | 171 | input[type=radio] + label:before { 172 | content: ""; 173 | display: inline-block; 174 | vertical-align: -0.2em; 175 | width: 1em; 176 | height: 1em; 177 | border: 0.15em solid #86C4ECFF; 178 | border-radius: 0.2em; 179 | margin-right: 0.3em; 180 | background-color: white; 181 | } 182 | input[type=radio] + label:before { 183 | border-radius: 50%; 184 | } 185 | input[type=radio]:checked + label:before { 186 | background-color: #86C4ECFF; 187 | box-shadow: inset 0 0 0 0.15em white; 188 | } 189 | input[type=radio]:focus + label:before { 190 | outline: dotted 1px; 191 | } 192 | 193 | /* ----- Progress bar ----- */ 194 | -------------------------------------------------------------------------------- /webapp/static/table.css: -------------------------------------------------------------------------------- 1 | .styled-table { 2 | border-collapse: collapse; 3 | margin: 10 0; 4 | font-size: 0.9em; 5 | font-family: sans-serif; 6 | width: 60%; 7 | box-shadow: 0 0 20px rgba(0, 0, 0, 0.15); 8 | } 9 | 10 | .styled-table thead tr { 11 | background-color: #86C4ECFF; 12 | color: #ffffff; 13 | text-align: left; 14 | } 15 | 16 | .styled-table th, 17 | .styled-table td { 18 | padding: 12px 15px; 19 | } 20 | 21 | tbody { 22 | display: block; 23 | overflow: auto; 24 | max-height: 200px; 25 | } 26 | 27 | thead, tbody tr { 28 | display: table; 29 | width: 100%; 30 | table-layout: fixed; 31 | } 32 | 33 | .styled-table tbody tr { 34 | border-bottom: 1px solid #8fedbf30; 35 | } 36 | 37 | .styled-table tbody tr:nth-of-type(even) { 38 | background-color: #86C4EC30; 39 | } 40 | 41 | .styled-table tbody tr:last-of-type { 42 | border-bottom: 2px solid #38a2e0a2; 43 | } 44 | 45 | /* ---- results ----*/ 46 | 47 | .styled-res { 48 | border-collapse: collapse; 49 | margin: 10 0; 50 | padding-left: 40px; 51 | padding-right: 40px; 52 | font-size: 0.9em; 53 | font-family: sans-serif; 54 | height: auto; 55 | width: auto; 56 | box-shadow: 0 0 20px rgba(0, 0, 0, 0.15); 57 | } 58 | 59 | .styled-res thead tr { 60 | background-color: #86C4ECFF; 61 | color: #000000; 62 | text-align: center; 63 | } 64 | 65 | .styled-res th, 66 | .styled-res td { 67 | padding: 12px 15px; 68 | } 69 | 70 | tbody { 71 | display: block; 72 | overflow: auto; 73 | max-height: 100%; 74 | } 75 | 76 | thead, tbody tr { 77 | display: table; 78 | width: 100%; 79 | table-layout: fixed; 80 | } 81 | 82 | .styled-res tbody tr { 83 | border-bottom: 1px solid #dddddd; 84 | } 85 | 86 | .styled-res tbody tr:nth-of-type(even) { 87 | background-color: #86C4EC30; 88 | } 89 | 90 | 91 | .styled-res tbody tr:last-of-type { 92 | border-bottom: 2px solid #38a2e0a2; 93 | } 94 | -------------------------------------------------------------------------------- /webapp/static/userCharts.js: -------------------------------------------------------------------------------- 1 | // ---------------------------------------------- 2 | // Preferred Time for posts 3 | // ---------------------------------------------- 4 | //categoryColors = getColorsForTab(explicitCountData.values, 0.35) 5 | 6 | var prefposttime = document.getElementById("prefposttime").getContext("2d"); 7 | Chart.defaults.global.responsive = false; 8 | 9 | var prefposttimeChart = new Chart(prefposttime, { 10 | type: 'bubble', 11 | data: { 12 | datasets: [ 13 | { 14 | label: 'Posts', 15 | data: videoData.bubble, 16 | backgroundColor: "rgba(134,196,236, 0.6)" 17 | }, 18 | ] 19 | }, 20 | options: { 21 | title: { 22 | display: true, 23 | text: "Posts posted times" 24 | }, 25 | scales: { 26 | yAxes: [{ 27 | ticks: { 28 | max: 6, 29 | min: 0, 30 | stepSize: 1, 31 | callback: function(value, index, values) { 32 | days = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}; 33 | return days[index] 34 | } 35 | } 36 | }], 37 | xAxes: [{ 38 | ticks: { 39 | max: 24, 40 | min: 0, 41 | stepSize: 1, 42 | callback: function(value, index, values) { 43 | days = {0:'midnight', 1:'1am', 2:'2am', 3:'3am', 4:'4am', 5:'5am', 6:'6am', 44 | 7:'7am', 8:'8am', 9:'9am', 10:'10am', 11:'11am', 12:'Lunch Time', 45 | 13:'1pm', 14:'2pm', 15:'3pm', 16:'4pm', 17:'5pm', 18:'6pm', 19:'7pm', 46 | 20:'8pm', 21:'9pm', 22:'10pm', 23:'11pm', 24:'midnight too'} 47 | 48 | return days[index] 49 | } 50 | } 51 | }] 52 | }, 53 | }, 54 | }); 55 | 56 | -------------------------------------------------------------------------------- /webapp/static/utils.js: -------------------------------------------------------------------------------- 1 | 2 | function getRandomColor() { 3 | var letters = '0123456789ABCDEF'; 4 | var color = '#'; 5 | for (var i = 0; i < 6; i++) { 6 | color += letters[Math.floor(Math.random() * 16)]; 7 | } 8 | return color; 9 | } 10 | 11 | function approximateColor1ToColor2ByPercent(color1, color2, percent) { 12 | var red1 = parseInt(color1[1] + color1[2], 16); 13 | var green1 = parseInt(color1[3] + color1[4], 16); 14 | var blue1 = parseInt(color1[5] + color1[6], 16); 15 | 16 | var red2 = parseInt(color2[1] + color2[2], 16); 17 | var green2 = parseInt(color2[3] + color2[4], 16); 18 | var blue2 = parseInt(color2[5] + color2[6], 16); 19 | 20 | var red = Math.round(mix(red1, red2, percent)); 21 | var green = Math.round(mix(green1, green2, percent)); 22 | var blue = Math.round(mix(blue1, blue2, percent)); 23 | 24 | return generateHex(red, green, blue); 25 | } 26 | 27 | function generateHex(r, g, b) { 28 | r = r.toString(16); 29 | g = g.toString(16); 30 | b = b.toString(16); 31 | 32 | // to address problem mentioned by Alexis Wilke: 33 | while (r.length < 2) { r = "0" + r; } 34 | while (g.length < 2) { g = "0" + g; } 35 | while (b.length < 2) { b = "0" + b; } 36 | 37 | return "#" + r + g + b; 38 | } 39 | 40 | function mix(start, end, percent) { 41 | return start + ((percent) * (end - start)); 42 | } 43 | 44 | function getColorsForTab(tab, force){ 45 | cols = [] 46 | first = getRandomColor() 47 | sec = '#FFFFFF' 48 | for (i = 0; i < tab.length; i++){ 49 | newcol = approximateColor1ToColor2ByPercent(first, sec, force) 50 | first = newcol 51 | cols.push(newcol) 52 | } 53 | return cols 54 | } -------------------------------------------------------------------------------- /webapp/templates/_formhelpers.html: -------------------------------------------------------------------------------- 1 | {% macro render_field(field) %} 2 |
{{ field.label }} 3 |
{{ field(**kwargs)|safe }} 4 | {% if field.errors %} 5 |
    6 | {% for error in field.errors %} 7 |
  • {{ error }}
  • 8 | {% endfor %} 9 |
10 | {% endif %} 11 |
12 | {% endmacro %} -------------------------------------------------------------------------------- /webapp/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% if title %} 4 | {{ title }} 5 | {% else %} 6 | Welcome to SNS 7 | {% endif %} 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
20 |
21 | 22 |

Research | Scrap

23 |

Not just another OSINT tool

24 |
25 | {{ form.csrf_token }} 26 | {{ form.search(placeholder="Search user..") }} 27 | 28 |
29 |
30 |
31 | {% block content %}{% endblock %} 32 | 33 | -------------------------------------------------------------------------------- /webapp/templates/chart.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 |
5 | 6 | 7 |

General Overview

8 | 9 |
10 |
11 | 12 |
13 | 14 |
15 | 16 |
17 | 18 |
19 | 20 |
21 |
22 | 23 | 30 | 31 |
32 | {% endblock %} -------------------------------------------------------------------------------- /webapp/templates/custom.html: -------------------------------------------------------------------------------- 1 | {% extends "head.html" %} 2 | 3 | {% block content %} 4 | 5 | 6 |
7 |
8 |

{{ info.nickname }}

9 | {{ info.name }} 10 | {% if info.verified == 1 %} 11 | 12 | {% else %} 13 | 14 | {% endif %} 15 |
16 | {{ info.signature }} 17 |
18 |
19 |

From the video analized we found the informations below :

20 |

21 | A video duration average of {{ vidinfo.duration_av }} seconds
22 | An average of {{ vidinfo.share_av }} shares, {{ vidinfo.comments_av}} comments and {{vidinfo.plays_av }} plays per video.
23 | The average size of the description is {{ vidinfo.descSize_av }} 24 |

25 |
26 |
27 |

28 | Following {{ info.following }} people
29 | Has {{ info.video }} videos but only {{ vidinfo.videoNum }} were scrapped and analyzed
30 | Has {{ info.fans }} fans
31 | Has a total of {{ info.heart }} hearts
32 |

33 |
34 | 35 |
36 |

Brands that appears in the videos :

37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | {% for key, value in brands.items() %} 46 | 47 | 48 | 49 | 50 | {% endfor %} 51 | 52 |
BrandsNumber of time
{{key}}{{value}}
53 |
54 |
55 |

People mentionned in the videos :

56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | {% for key, value in mentions.items() %} 65 | 66 | {% if value == 'N/A' %} 67 | 68 | {% else %} 69 | 70 | {% endif %} 71 | 72 | 73 | {% endfor %} 74 | 75 |
MentionnedNumber of time
{{key}}{{key}}{{value}}
76 |
77 |
78 |

Hashtags used in the videos :

79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | {% for key, value in hashtags.items() %} 88 | 89 | 90 | 91 | 92 | {% endfor %} 93 | 94 |
HashtagsNumber of time used
{{key}}{{ value }}
95 |
96 |
97 |
98 |
99 |

Videos with possible explicit content :

100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | {% for key, value in explicits.items() %} 109 | 110 | {% if value == 'N/A' %} 111 | 112 | {% else %} 113 | 114 | {% endif %} 115 | 116 | 117 | {% endfor %} 118 | 119 |
Video UrlExplicit Video
{{key}}{{key}}{{ value }}
120 |
121 |
122 |

Posts map regarding the days and the hours :

123 |
124 |
125 |
126 | 127 | 130 | 131 | {% endblock %} -------------------------------------------------------------------------------- /webapp/templates/head.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% if title %} 4 | {{ title }} 5 | {% else %} 6 | Welcome to SNS 7 | {% endif %} 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
20 |
21 |

Research | Scrap

22 |

Not just another OSINT tool

23 |
24 | {{ form.csrf_token }} 25 | {{ form.search(placeholder="Search user..") }} 26 | 27 |
28 |
29 |
30 |
31 | {% block content %}{% endblock %} 32 |
33 | 34 | -------------------------------------------------------------------------------- /webapp/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 |

Hi, {{ user.name }}!

5 | {% for post in posts %} 6 |

{{ post.author.username }} says: {{ post.body }}

7 | {% endfor %} 8 | {% endblock %} -------------------------------------------------------------------------------- /webapp/templates/result.html: -------------------------------------------------------------------------------- 1 | {% extends "head.html" %} 2 | 3 | {% block content %} 4 |
5 |

Results found for "{{ item }}"

6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {% for dict_item in users %} 18 | 19 | {% for key, value in dict_item.items() %} 20 | 21 | 22 | {% endfor %} 23 | 24 | {% endfor %} 25 | 26 |
NicknameReal Name
{{key}}{{value}}
27 |
28 | 29 | {% endblock %} -------------------------------------------------------------------------------- /webapp/templates/scrap.html: -------------------------------------------------------------------------------- 1 | {% extends "head.html" %} 2 | 3 | {% block content %} 4 | 5 |
6 | {{ scrapForm.csrf_token }} 7 |

Let'scrap some people...

8 |
9 | Scrap a user or a hashtag ?
10 | {{ scrapForm.radio }}
11 | Username or hashtag to scrap :
12 | {{ scrapForm.data(placeholder="usershtag") }}

13 | Number of video to scrap from 1 to 20 :
14 | {{ scrapForm.number(placeholder="3") }}

15 | 16 | 19 |
20 | 21 | {% endblock %} -------------------------------------------------------------------------------- /webapp/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def launchScrapper(scrapData): 4 | print(scrapData) 5 | cmd = "python3 scraper/sns.py --" + scrapData['radio'] + " " + scrapData['data'] +" -n " + scrapData['number'] 6 | print(cmd) 7 | os.system(cmd) 8 | return 9 | 10 | 11 | def computeVideosInfo(videos): 12 | import datetime 13 | 14 | durations = [] 15 | shares = [] 16 | comments = [] 17 | plays = [] 18 | descSize = [] 19 | webUrls = [] 20 | created = [] 21 | weekdays = [] 22 | hours = [] 23 | 24 | for video in videos: 25 | durations.append(video["duration"]) 26 | shares.append(video["shareCount"]) 27 | comments.append(video["commentCount"]) 28 | plays.append(video["playCount"]) 29 | descSize.append(len(video["text"])) 30 | webUrls.append(video["webVideoUrl"]) 31 | created_date = datetime.datetime.fromtimestamp(int(video["createTime"])) 32 | fulldate = created_date.strftime('%c') 33 | day = created_date.strftime('%A') 34 | hour = created_date.strftime('%H') 35 | created.append(fulldate) 36 | weekdays.append(day) 37 | hours.append(hour) 38 | 39 | bubble = getBubbleFromDaysAndHours(weekdays, hours) 40 | videoNum = len(videos) 41 | infos = {'videoNum':videoNum, \ 42 | 'duration_av':int(sum(durations)/videoNum), \ 43 | 'share_av': int(sum(shares)/videoNum), \ 44 | 'plays_av': int(sum(plays)/videoNum), \ 45 | 'comments_av': int(sum(comments)/videoNum), \ 46 | 'descSize_av': int(sum(descSize)/videoNum), \ 47 | 'web_urls' : webUrls, \ 48 | 'created_date': created, \ 49 | 'weekdays' : weekdays, \ 50 | 'hours' : hours, \ 51 | 'bubble': bubble } 52 | 53 | return infos 54 | 55 | def getBubbleFromDaysAndHours(days, hours): 56 | #days = ['Tuesday', 'Tuesday', 'Tuesday', 'Wednesday', 'Wednesday'] 57 | #hours = [2, 8, 8, 2, 2] 58 | daysDict = {'Monday':0, 'Tuesday':1, 'Wednesday':2, 'Thursday':3, 'Friday':4, 'Saturday':5, 'Sunday':6} 59 | infos = {} 60 | for i in range(0, len(days)): 61 | dnum = daysDict[days[i]] 62 | 63 | if dnum in infos.keys(): 64 | # the day exist, so we need to check for the hour now 65 | if hours[i] in infos[dnum].keys(): 66 | # hour for the day exists so we increase the occurence 67 | infos[dnum][hours[i]] += 1 68 | else: 69 | # hour doesn't exist so we add it 70 | infos[dnum][hours[i]] = 1 71 | else: 72 | # the day doesn't exist so we add it with the hour 73 | infos[dnum] = {hours[i]: 1} 74 | 75 | # Then we need to transform in in a x, y, r dictionary 76 | bub = [] 77 | for y, val in infos.items(): 78 | tmp = {} 79 | for x, z in val.items(): 80 | bub.append({'x':x, 'y':y, 'r':z*5}) 81 | 82 | return bub --------------------------------------------------------------------------------