├── .gitignore ├── META-INF └── MANIFEST.MF ├── README.md ├── assets └── logo.png ├── build.gradle ├── checked_proxies.txt ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── proxy_list.xml ├── proxyhandler-v1.md ├── proxyhandler-v2.md ├── src ├── main │ └── java │ │ └── com │ │ └── scrapium │ │ ├── CustomSignalHandler.java │ │ ├── DatabaseConnection.java │ │ ├── Main.java │ │ ├── Scraper.java │ │ ├── ThreadBase.java │ │ ├── TweetThreadTaskProcessor.java │ │ ├── handler.java │ │ ├── proxium │ │ ├── Proxy.java │ │ └── ProxyService.java │ │ ├── tests │ │ └── Benchmark.java │ │ ├── threads │ │ ├── LoggingThread.java │ │ ├── ProducerThread.java │ │ ├── ProxyThread.java │ │ └── TweetThread.java │ │ ├── tweetium │ │ ├── TaskService.java │ │ └── TweetTask.java │ │ └── utils │ │ ├── DebugLogger.java │ │ └── TimeUtils.java └── schema.sql └── unchecked_proxies.txt /.gitignore: -------------------------------------------------------------------------------- 1 | ############################## 2 | ## Java 3 | ############################## 4 | .mtj.tmp/ 5 | *.class 6 | *.jar 7 | *.war 8 | *.ear 9 | *.nar 10 | hs_err_pid* 11 | replay_pid* 12 | 13 | ############################## 14 | ## Maven 15 | ############################## 16 | target/ 17 | pom.xml.tag 18 | pom.xml.releaseBackup 19 | pom.xml.versionsBackup 20 | pom.xml.next 21 | pom.xml.bak 22 | release.properties 23 | dependency-reduced-pom.xml 24 | buildNumber.properties 25 | .mvn/timing.properties 26 | .mvn/wrapper/maven-wrapper.jar 27 | 28 | ############################## 29 | ## Gradle 30 | ############################## 31 | bin/ 32 | build/ 33 | .gradle 34 | .gradletasknamecache 35 | gradle-app.setting 36 | !gradle-wrapper.jar 37 | 38 | ############################## 39 | ## IntelliJ 40 | ############################## 41 | out/ 42 | .idea/ 43 | .idea_modules/ 44 | *.iml 45 | *.ipr 46 | *.iws 47 | 48 | ############################## 49 | ## Eclipse 50 | 
############################## 51 | .settings/ 52 | bin/ 53 | tmp/ 54 | .metadata 55 | .classpath 56 | .project 57 | *.tmp 58 | *.bak 59 | *.swp 60 | *~.nib 61 | local.properties 62 | .loadpath 63 | .factorypath 64 | 65 | ############################## 66 | ## NetBeans 67 | ############################## 68 | nbproject/private/ 69 | build/ 70 | nbbuild/ 71 | dist/ 72 | nbdist/ 73 | nbactions.xml 74 | nb-configuration.xml 75 | 76 | ############################## 77 | ## Visual Studio Code 78 | ############################## 79 | .vscode/ 80 | .code-workspace 81 | 82 | ############################## 83 | ## OS X 84 | ############################## 85 | .DS_Store 86 | proxy_checker/proxyChecker.js 87 | proxy_checker/proxies.txt 88 | proxy_checker/package.json 89 | proxy_checker/package-lock.json 90 | proxy_checker/node_modules/ms/readme.md 91 | proxy_checker/node_modules/ms/package.json 92 | proxy_checker/node_modules/ms/license.md 93 | proxy_checker/node_modules/ms/index.js 94 | proxy_checker/node_modules/mass-proxy-validator/yarn.lock 95 | proxy_checker/node_modules/mass-proxy-validator/README.md 96 | proxy_checker/node_modules/mass-proxy-validator/package.json 97 | proxy_checker/node_modules/mass-proxy-validator/LICENSE 98 | proxy_checker/node_modules/mass-proxy-validator/index.js 99 | proxy_checker/node_modules/mass-proxy-validator/.gitattributes 100 | proxy_checker/node_modules/is-buffer/README.md 101 | proxy_checker/node_modules/is-buffer/package.json 102 | proxy_checker/node_modules/is-buffer/LICENSE 103 | proxy_checker/node_modules/is-buffer/index.js 104 | proxy_checker/node_modules/is-buffer/index.d.ts 105 | proxy_checker/node_modules/follow-redirects/README.md 106 | proxy_checker/node_modules/follow-redirects/package.json 107 | proxy_checker/node_modules/follow-redirects/LICENSE 108 | proxy_checker/node_modules/follow-redirects/index.js 109 | proxy_checker/node_modules/follow-redirects/https.js 110 | proxy_checker/node_modules/follow-redirects/http.js 111 | 
proxy_checker/node_modules/debug/src/node.js 112 | proxy_checker/node_modules/debug/src/index.js 113 | proxy_checker/node_modules/debug/src/debug.js 114 | proxy_checker/node_modules/debug/src/browser.js 115 | proxy_checker/node_modules/debug/README.md 116 | proxy_checker/node_modules/debug/package.json 117 | proxy_checker/node_modules/debug/node.js 118 | proxy_checker/node_modules/debug/Makefile 119 | proxy_checker/node_modules/debug/LICENSE 120 | proxy_checker/node_modules/debug/karma.conf.js 121 | proxy_checker/node_modules/debug/CHANGELOG.md 122 | proxy_checker/node_modules/debug/.travis.yml 123 | proxy_checker/node_modules/debug/.npmignore 124 | proxy_checker/node_modules/debug/.eslintrc 125 | proxy_checker/node_modules/debug/.coveralls.yml 126 | proxy_checker/node_modules/axios/UPGRADE_GUIDE.md 127 | proxy_checker/node_modules/axios/README.md 128 | proxy_checker/node_modules/axios/package.json 129 | proxy_checker/node_modules/axios/LICENSE 130 | proxy_checker/node_modules/axios/lib/utils.js 131 | proxy_checker/node_modules/axios/lib/helpers/spread.js 132 | proxy_checker/node_modules/axios/lib/helpers/README.md 133 | proxy_checker/node_modules/axios/lib/helpers/parseHeaders.js 134 | proxy_checker/node_modules/axios/lib/helpers/normalizeHeaderName.js 135 | proxy_checker/node_modules/axios/lib/helpers/isURLSameOrigin.js 136 | proxy_checker/node_modules/axios/lib/helpers/isAbsoluteURL.js 137 | proxy_checker/node_modules/axios/lib/helpers/deprecatedMethod.js 138 | proxy_checker/node_modules/axios/lib/helpers/cookies.js 139 | proxy_checker/node_modules/axios/lib/helpers/combineURLs.js 140 | proxy_checker/node_modules/axios/lib/helpers/buildURL.js 141 | proxy_checker/node_modules/axios/lib/helpers/bind.js 142 | proxy_checker/node_modules/axios/lib/defaults.js 143 | proxy_checker/node_modules/axios/lib/core/transformData.js 144 | proxy_checker/node_modules/axios/lib/core/settle.js 145 | proxy_checker/node_modules/axios/lib/core/README.md 146 | 
proxy_checker/node_modules/axios/lib/core/InterceptorManager.js 147 | proxy_checker/node_modules/axios/lib/core/enhanceError.js 148 | proxy_checker/node_modules/axios/lib/core/dispatchRequest.js 149 | proxy_checker/node_modules/axios/lib/core/createError.js 150 | proxy_checker/node_modules/axios/lib/core/Axios.js 151 | proxy_checker/node_modules/axios/lib/cancel/isCancel.js 152 | proxy_checker/node_modules/axios/lib/cancel/CancelToken.js 153 | proxy_checker/node_modules/axios/lib/cancel/Cancel.js 154 | proxy_checker/node_modules/axios/lib/axios.js 155 | proxy_checker/node_modules/axios/lib/adapters/xhr.js 156 | proxy_checker/node_modules/axios/lib/adapters/README.md 157 | proxy_checker/node_modules/axios/lib/adapters/http.js 158 | proxy_checker/node_modules/axios/index.js 159 | proxy_checker/node_modules/axios/index.d.ts 160 | proxy_checker/node_modules/axios/CHANGELOG.md 161 | proxy_checker/node_modules/.package-lock.json 162 | -------------------------------------------------------------------------------- /META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Main-Class: com.scrapium.Main 3 | 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | logo 3 |

4 | requests - but as fast as possible.
Proxied asynchronous multi-threaded scraper via concurrent queues written in Java. 5 | 6 | A template for fast web scrapers 7 |

8 | 9 | 10 | 11 |

12 | 13 | contributors 14 | 15 | 16 | 21 | 22 | 23 | forks 24 | 25 | 26 | 27 | stars 28 | 29 | 30 | 31 | open issues 32 | 33 | 34 | 38 |

39 | 40 |

41 | Report Bug 42 | · 43 | Request Feature 44 | · 45 | Support this project 46 |

47 |
48 | 49 | 50 | ## WIP: Information coming soon 51 | 52 | Original Repo: Debug development repo for scraping tweets and market data via RDS. Using optimisations techniques such as (Threading, asynchronous I/O, non-blocking I/O - ConcurrentLinkedQueues, and runnable tasks for making requests and saving tweets to the database. (Rewritten in Java) 53 | 54 | Adjust speed: 55 | 56 | Scraper scraper = new Scraper(2, 2000, 10); 57 | 58 | To adjust scraper rate, ensure your internet is fast enough, the scraper will increase in speed over time, as optimum proxies are found. 59 | 60 | Run the program as is, the debug version does not download from Twitter. Achieved RPS will be shown in the console. 61 | 62 | PLEASE NOTE THE FOLLOWING: 63 | - This code only runs properly on Linux, preferably cloud hosted. 64 | - The program initially takes time to work out which proxies are most successful. 65 | - If settings in Main.java are too high, the program will fail all requests. 66 | - The bot uses public proxies, enter proxies in checked_proxies.txt 67 | - !!! The bot must have excessive internet bandwidth. 68 | - On Linux, max open files should be set to a high number 69 | 70 | Troubleshooting 71 | - Requests drop to 0 72 | - Maybe you're using too many resources - there's a perfect balance. 73 | - Having too many worker threads leads to blocks and switching between threads - slowing the system down. 74 | - Using too much memory and the program can't allocate memory 75 | - There are no proxies available 76 | - There are no tasks left 77 | - You have reached the maximum co-currency 78 | - Something may be causing a hang - ie. in the request handler, ie. 
not updating the coroutine count properly 79 | 80 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/couldbejake/fast/a3932892ef78602637d20d679d8bcd7cead1abef/assets/logo.png -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'java' 3 | } 4 | 5 | repositories { 6 | mavenCentral() 7 | jcenter() 8 | } 9 | 10 | dependencies { 11 | implementation 'com.google.code.gson:gson:2.9.1' 12 | implementation 'net.java.dev.jna:jna:5.10.0' 13 | implementation 'org.apache.httpcomponents.client5:httpclient5:5.2.1' 14 | implementation 'org.slf4j:slf4j-api:2.0.7' 15 | implementation 'org.slf4j:slf4j-simple:2.0.7' 16 | implementation 'org.postgresql:postgresql:42.3.1' 17 | implementation 'org.apache.commons:commons-dbcp2:2.9.0' 18 | implementation 'org.apache.httpcomponents:httpcore-nio:4.4.14' 19 | implementation group: 'org.asynchttpclient', name: 'async-http-client', version: '2.12.3' 20 | implementation 'oauth.signpost:signpost-core:1.2.1.2' 21 | implementation 'com.squareup.okhttp3:okhttp:4.9.1' 22 | 23 | implementation 'org.apache.httpcomponents:httpclient:4.5.13' 24 | implementation 'commons-codec:commons-codec:1.15' 25 | implementation 'org.glassfish:javax.json:1.1.4' 26 | } 27 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/couldbejake/fast/a3932892ef78602637d20d679d8bcd7cead1abef/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: 
-------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.0-bin.zip 4 | networkTimeout=10000 5 | zipStoreBase=GRADLE_USER_HOME 6 | zipStorePath=wrapper/dists 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | # 21 | # Gradle start up script for POSIX generated by Gradle. 22 | # 23 | # Important for running: 24 | # 25 | # (1) You need a POSIX-compliant shell to run this script. 
If your /bin/sh is 26 | # noncompliant, but you have some other compliant shell such as ksh or 27 | # bash, then to run this script, type that shell name before the whole 28 | # command line, like: 29 | # 30 | # ksh Gradle 31 | # 32 | # Busybox and similar reduced shells will NOT work, because this script 33 | # requires all of these POSIX shell features: 34 | # * functions; 35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 37 | # * compound commands having a testable exit status, especially «case»; 38 | # * various built-in commands including «command», «set», and «ulimit». 39 | # 40 | # Important for patching: 41 | # 42 | # (2) This script targets any POSIX shell, so it avoids extensions provided 43 | # by Bash, Ksh, etc; in particular arrays are avoided. 44 | # 45 | # The "traditional" practice of packing multiple parameters into a 46 | # space-separated string is a well documented source of bugs and security 47 | # problems, so this is (mostly) avoided, by progressively accumulating 48 | # options in "$@", and eventually passing that to Java. 49 | # 50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 52 | # see the in-line comments for details. 53 | # 54 | # There are tweaks for specific operating systems such as AIX, CygWin, 55 | # Darwin, MinGW, and NonStop. 56 | # 57 | # (3) This script is generated from the Groovy template 58 | # https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 59 | # within the Gradle project. 60 | # 61 | # You can find Gradle at https://github.com/gradle/gradle/. 
62 | # 63 | ############################################################################## 64 | 65 | # Attempt to set APP_HOME 66 | 67 | # Resolve links: $0 may be a link 68 | app_path=$0 69 | 70 | # Need this for daisy-chained symlinks. 71 | while 72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 73 | [ -h "$app_path" ] 74 | do 75 | ls=$( ls -ld "$app_path" ) 76 | link=${ls#*' -> '} 77 | case $link in #( 78 | /*) app_path=$link ;; #( 79 | *) app_path=$APP_HOME$link ;; 80 | esac 81 | done 82 | 83 | # This is normally unused 84 | # shellcheck disable=SC2034 85 | APP_BASE_NAME=${0##*/} 86 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit 87 | 88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! 
-x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 137 | 138 | Please set the JAVA_HOME variable in your environment to match the 139 | location of your Java installation." 140 | fi 141 | 142 | # Increase the maximum file descriptors if we can. 143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 144 | case $MAX_FD in #( 145 | max*) 146 | # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. 147 | # shellcheck disable=SC3045 148 | MAX_FD=$( ulimit -H -n ) || 149 | warn "Could not query maximum file descriptor limit" 150 | esac 151 | case $MAX_FD in #( 152 | '' | soft) :;; #( 153 | *) 154 | # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. 155 | # shellcheck disable=SC3045 156 | ulimit -n "$MAX_FD" || 157 | warn "Could not set maximum file descriptor limit to $MAX_FD" 158 | esac 159 | fi 160 | 161 | # Collect all arguments for the java command, stacking in reverse order: 162 | # * args from the command line 163 | # * the main class name 164 | # * -classpath 165 | # * -D...appname settings 166 | # * --module-path (only if needed) 167 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
168 | 169 | # For Cygwin or MSYS, switch paths to Windows format before running java 170 | if "$cygwin" || "$msys" ; then 171 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 172 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 173 | 174 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 175 | 176 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 177 | for arg do 178 | if 179 | case $arg in #( 180 | -*) false ;; # don't mess with options #( 181 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 182 | [ -e "$t" ] ;; #( 183 | *) false ;; 184 | esac 185 | then 186 | arg=$( cygpath --path --ignore --mixed "$arg" ) 187 | fi 188 | # Roll the args list around exactly as many times as the number of 189 | # args, so each arg winds up back in the position where it started, but 190 | # possibly modified. 191 | # 192 | # NB: a `for` loop captures its iteration list before it begins, so 193 | # changing the positional parameters here affects neither the number of 194 | # iterations, nor the values presented in `arg`. 195 | shift # remove old arg 196 | set -- "$@" "$arg" # push replacement arg 197 | done 198 | fi 199 | 200 | # Collect all arguments for the java command; 201 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of 202 | # shell script including quotes and variable substitutions, so put them in 203 | # double quotes to make sure that they get re-expanded; and 204 | # * put everything else in single quotes, so that it's not re-expanded. 205 | 206 | set -- \ 207 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 208 | -classpath "$CLASSPATH" \ 209 | org.gradle.wrapper.GradleWrapperMain \ 210 | "$@" 211 | 212 | # Stop when "xargs" is not available. 213 | if ! command -v xargs >/dev/null 2>&1 214 | then 215 | die "xargs is not available" 216 | fi 217 | 218 | # Use "xargs" to parse quoted args. 219 | # 220 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 
221 | # 222 | # In Bash we could simply go: 223 | # 224 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 225 | # set -- "${ARGS[@]}" "$@" 226 | # 227 | # but POSIX shell has neither arrays nor command substitution, so instead we 228 | # post-process each arg (as a line of input to sed) to backslash-escape any 229 | # character that might be a shell metacharacter, then use eval to reverse 230 | # that process (while maintaining the separation between arguments), and wrap 231 | # the whole thing up as a single "set" statement. 232 | # 233 | # This will of course break if any of these variables contains a newline or 234 | # an unmatched quote. 235 | # 236 | 237 | eval "set -- $( 238 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 239 | xargs -n1 | 240 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 241 | tr '\n' ' ' 242 | )" '"$@"' 243 | 244 | exec "$JAVACMD" "$@" 245 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 
15 | @rem 16 | 17 | @if "%DEBUG%"=="" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%"=="" set DIRNAME=. 29 | @rem This is normally unused 30 | set APP_BASE_NAME=%~n0 31 | set APP_HOME=%DIRNAME% 32 | 33 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 34 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 35 | 36 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 37 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 38 | 39 | @rem Find java.exe 40 | if defined JAVA_HOME goto findJavaFromJavaHome 41 | 42 | set JAVA_EXE=java.exe 43 | %JAVA_EXE% -version >NUL 2>&1 44 | if %ERRORLEVEL% equ 0 goto execute 45 | 46 | echo. 47 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 48 | echo. 49 | echo Please set the JAVA_HOME variable in your environment to match the 50 | echo location of your Java installation. 51 | 52 | goto fail 53 | 54 | :findJavaFromJavaHome 55 | set JAVA_HOME=%JAVA_HOME:"=% 56 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 57 | 58 | if exist "%JAVA_EXE%" goto execute 59 | 60 | echo. 61 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 62 | echo. 63 | echo Please set the JAVA_HOME variable in your environment to match the 64 | echo location of your Java installation. 
65 | 66 | goto fail 67 | 68 | :execute 69 | @rem Setup the command line 70 | 71 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 72 | 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if %ERRORLEVEL% equ 0 goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 84 | set EXIT_CODE=%ERRORLEVEL% 85 | if %EXIT_CODE% equ 0 set EXIT_CODE=1 86 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% 87 | exit /b %EXIT_CODE% 88 | 89 | :mainEnd 90 | if "%OS%"=="Windows_NT" endlocal 91 | 92 | :omega 93 | -------------------------------------------------------------------------------- /proxy_list.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | https://raw.githubusercontent.com/zevtyardt/proxy-list/main/http.txt 4 | https://raw.githubusercontent.com/caliphdev/Proxy-List/master/http.txt 5 | https://raw.githubusercontent.com/mmpx12/proxy-list/master/https.txt 6 | https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt 7 | https://raw.githubusercontent.com/Bardiafa/Proxy-Leecher/main/proxies.txt 8 | https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt 9 | https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/http.txt 10 | https://raw.githubusercontent.com/HyperBeats/proxy-list/main/https.txt 11 | https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/https.txt 12 | https://raw.githubusercontent.com/ALIILAPRO/Proxy/main/http.txt 13 | https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/https.txt 14 | https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt 15 | 
https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/https.txt 16 | https://raw.githubusercontent.com/mertguvencli/http-proxy-list/main/proxy-list/data.txt 17 | https://www.proxyscan.io/download?type=https 18 | https://raw.githubusercontent.com/aslisk/proxyhttps/main/https.txt 19 | https://raw.githubusercontent.com/zevtyardt/proxy-list/main/socks4.txt 20 | https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt 21 | https://raw.githubusercontent.com/iptotal/free-proxy-list/master/socks4.txt 22 | https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt 23 | https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt 24 | https://raw.githubusercontent.com/HyperBeats/proxy-list/main/socks4.txt 25 | https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks4.txt 26 | https://raw.githubusercontent.com/ALIILAPRO/Proxy/main/socks4.txt 27 | https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt 28 | https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt 29 | https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks4.txt 30 | https://api.openproxylist.xyz/socks4.txt 31 | https://www.proxy-list.download/api/v1/get?type=socks4 32 | https://www.proxyscan.io/download?type=socks4 33 | https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt 34 | https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks4.txt 35 | https://raw.githubusercontent.com/zevtyardt/proxy-list/main/socks5.txt 36 | https://raw.githubusercontent.com/caliphdev/Proxy-List/master/socks5.txt 37 | https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt 38 | https://raw.githubusercontent.com/iptotal/free-proxy-list/master/socks5.txt 39 | https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt 40 | https://raw.githubusercontent.com/HyperBeats/proxy-list/main/socks5.txt 41 | 
https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks5.txt 42 | https://raw.githubusercontent.com/ALIILAPRO/Proxy/main/socks5.txt 43 | https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt> 44 | https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt 45 | https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks5.txt 46 | https://api.openproxylist.xyz/socks5.txt 47 | https://www.proxy-list.download/api/v1/get?type=socks5 48 | https://www.proxyscan.io/download?type=socks5 49 | https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt 50 | https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt 51 | https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt 52 | https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks5.txt 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /proxyhandler-v1.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | We have outlined a plan for a proxy picker, to pick proxies that haven't been used. 4 | 5 | The following restrictions are required: 6 | 7 | - Proxies must timeout after 500 successful/failed requests in total are made. 8 | - If a proxies invalid attempts exceeds 30 attempts within a 5 minute window, the proxy next_avaliable should be set 10 minutes in the future 9 | - If a proxy is successful, delta should increment. If it fails it should decrement. 10 | - If a GT (Twitter Guest Token) has been recently updated, skip the update process, as it's possible a cocurrent/paralell thread has attempted this recently. 11 | - As threading is used, a single proxy should not be used too many times. 12 | - Proxies should be added to a queue by a proxy gen thread that updates a FILO queue. 
13 | 14 | 15 | ``` 16 | Proxy: 17 | usage_count -> 1 -> 500 # holds information so that one proxy isn't used more than 500 times in quick succession 18 | next_avaliable -> TIME # holds the next avaliable time a proxy can be used 19 | 20 | gt_last_updated -> TIME # holds a guest token (a token used when making a request. It can be invalid.) 21 | success_delta -> 0 LIMIT ( 5, 000 -> 10, 000 ) # holds information on how many times a proxy was successful 22 | failed_count -> 0 (0 -> 100) # holds information on the number of concequative failed attemps 23 | ``` 24 | 25 | ``` 26 | 27 | add_proxy_to_queue(): # proxies are added into a queue, and then used at a later point 28 | 29 | # makes a query to the database to get a free proxy using parameters. 30 | proxy = get_proxy(get a proxy where the next_avaliable time is now, sort by success_delta) 31 | 32 | if(there are proxies avaliable): 33 | 34 | if(usage_count > 500): # checks if the proxy has been used more than 500 times recently 35 | 36 | # we are close to the rate limit of the proxy 37 | 38 | usage_count = 0 # reset the usage count 39 | failed_count = 0 # reset the failed count (should this be done here?) 40 | 41 | next_avaliable = time + 15 minutes # up the next_avaliable time to 15 minutes into the future. 42 | 43 | # get a new proxy, since this one can not be used 44 | (END) 45 | 46 | if(failed_count > 100): # check if there have been over 100 sequential failed requests 47 | 48 | usage_count = 0 # reset the usage count (should this be done here?) 49 | failed_count = 0 # reset the failed count 50 | next_avaliable = time + 10 minutes # up the next_avaliable time to 15 minutes into the future. 
51 | 52 | # get a new proxy, since this one can not be used 53 | (END) 54 | 55 | +1 usage count # add 1 to the usage count 56 | 57 | make_regular_request() 58 | 59 | if(HTTP request success && Proxy is working): # if the HTTP request was a success using proxy 60 | + 1 success_delta # include the success delta 61 | failed_count = 0 # reset the consecutive fail count {p1} 62 | if(guest_token_invalid): # if the Guest token (GT) was invalid 63 | if(gt_last_updated is longer than 2 minutes ago): # and hasn't been updated recently 64 | make_gt_request() # make a separate request to get a new guest token 65 | update_gt() # update the guest token 66 | next_avaliable = 0 # allow this proxy to be used again 67 | else: 68 | save_reqest() # if the guest token was correct, and the HTTP request didn't fail due to a proxy 69 | else: 70 | -1 success_delta # decrease the success delta 71 | +1 failed_count # increase the consecutive fail count 72 | next_avaliable = NOW() + 5 minutes # make the proxy usable again in 5 minutes 73 | 74 | ``` 75 | 76 | Additional suggestions: 77 | - Use a ML model to choose which proxy to use. 78 | - Backoff strategy 2^n seconds timeout. n = consecutive filed attempts -> max n + $r (randomness). 
79 | - Use a rolling time window instead 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /proxyhandler-v2.md: -------------------------------------------------------------------------------- 1 | get_new_proxy(): 2 | 3 | proxy = get_proxy_query() # where next_avaliable is in the future, sort by success_delta 4 | 5 | if(proxy != null): 6 | 7 | if(proxy.usage_count > 500): 8 | update_proxy(proxy, { 9 | usage_count: 0, 10 | failed_count: 0, 11 | next_avaliable: NOW() + 15 minutes 12 | }) 13 | return get_new_proxy() 14 | 15 | if(proxy.failed_count > 100): 16 | update_proxy(proxy, { 17 | usage_count: 0, 18 | failed_count: 0, 19 | next_avaliable: NOW() + 10 minutes 20 | }) 21 | 22 | return get_new_proxy() 23 | 24 | proxy.usage_count += 1 25 | 26 | response = make_request("make search request", proxy) 27 | 28 | if(response.success is True): 29 | proxy.success_delta += 1 30 | proxy.failed_count = 0 31 | if(response.guest_token_invalid == true): 32 | if(proxy.guest_token_last_updated more than 2 minutes ago): 33 | response2 = make_request("make guest token request", proxy) 34 | update_proxy({ 35 | guest_token: response2.guest_token 36 | }) 37 | proxy.next_avliable = 0 38 | else: 39 | save_response_to_db(response) 40 | else: 41 | proxy.success_delta -= 1 42 | proxy.failed_count += 1 43 | -------------------------------------------------------------------------------- /src/main/java/com/scrapium/CustomSignalHandler.java: -------------------------------------------------------------------------------- 1 | package com.scrapium; 2 | 3 | import sun.misc.Signal; 4 | import sun.misc.SignalHandler; 5 | 6 | public class CustomSignalHandler { 7 | public static void handleTSTPSignal(final Runnable onStop) { 8 | Signal.handle(new Signal("TSTP"), new SignalHandler() { 9 | @Override 10 | public void 
handle(Signal signal) { 11 | onStop.run(); 12 | } 13 | }); 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/java/com/scrapium/DatabaseConnection.java: -------------------------------------------------------------------------------- 1 | package com.scrapium; 2 | 3 | //import org.apache.commons.dbcp2.BasicDataSource; 4 | 5 | import javax.sql.DataSource; 6 | import java.sql.Connection; 7 | import java.sql.SQLException; 8 | 9 | public class DatabaseConnection { 10 | private static DataSource dataSource; 11 | 12 | /* 13 | static { 14 | BasicDataSource ds = new BasicDataSource(); 15 | ds.setUrl("jdbc:postgresql://localhost:5432/scrapium_proxies"); 16 | ds.setUsername("scrapium_user"); 17 | ds.setPassword("6F3dNfvz3eL3Vb3ol"); 18 | ds.setInitialSize(5); // Set the initial number of connections in the pool 19 | ds.setMaxTotal(10); // Set the maximum number of connections in the pool 20 | dataSource = ds; 21 | } */ 22 | 23 | public static Connection getConnection() throws SQLException { 24 | return dataSource.getConnection(); 25 | } 26 | 27 | } -------------------------------------------------------------------------------- /src/main/java/com/scrapium/Main.java: -------------------------------------------------------------------------------- 1 | package com.scrapium; 2 | 3 | import com.scrapium.tests.Benchmark; 4 | 5 | import java.util.ArrayList; 6 | // makes sense to use PostgreSQL for data, and Redis for caching & analytics 7 | 8 | /* 9 | Troubleshooting 10 | - Requests drop to 0 11 | - Maybe you're using too many resources - there's a perfect balance. 12 | - Having too many worker threads leads to blocks and switching between threads - slowing the system down. 13 | - Using too much memory and the program can't allocate memory 14 | - There are no proxies available 15 | - There are no tasks left 16 | - You have reached the maximum co-currency 17 | - Something may be causing a hang - ie. in the request handler, ie. 
not updating the coroutine count properly 18 | 19 | */ 20 | 21 | public class Main { 22 | 23 | public static void main(String[] args) { 24 | runService(); 25 | } 26 | 27 | public static void runTest(){ 28 | Benchmark.runTest(); 29 | } 30 | 31 | public static void runService(){ 32 | 33 | // Scraper(consumerCount, maxCoroutineCount, conSocketTimeout) 34 | 35 | // consumerCount - The number of threads running scraper tasks 36 | // maxCoroutineCount - The max amount of asynchronous calls that should be made for each thread 37 | // conSocketTimeout - The amount of time before connectionSocketTimeout will occur. 38 | 39 | // calls 40 | 41 | // scraper.logger.successRequestCount.get() - Will get the amount of total successful requests since .scrape() is called. 42 | // scraper.logger.failedRequestCount.get() - Will get the amount of total failed requests since .scrape() is called. 43 | 44 | // note: The last parameter of Scrape() is not currently used. 45 | 46 | // 6, 5000 - AWS 800 requests per second 47 | 48 | Scraper scraper = new Scraper(2, 2000, 10); 49 | 50 | scraper.scrape(); 51 | 52 | } 53 | 54 | 55 | } -------------------------------------------------------------------------------- /src/main/java/com/scrapium/Scraper.java: -------------------------------------------------------------------------------- 1 | package com.scrapium; 2 | 3 | import com.scrapium.proxium.ProxyService; 4 | import com.scrapium.threads.LoggingThread; 5 | import com.scrapium.threads.ProducerThread; 6 | import com.scrapium.threads.ProxyThread; 7 | import com.scrapium.threads.TweetThread; 8 | import com.scrapium.tweetium.TaskService; 9 | import com.scrapium.utils.DebugLogger; 10 | 11 | import java.util.ArrayList; 12 | import java.util.Iterator; 13 | import java.util.concurrent.*; 14 | 15 | public class Scraper { 16 | 17 | 18 | public ProxyService proxyService; 19 | public long conSocketTimeout; 20 | private int consumerCount; 21 | public int maxCoroutineCount; 22 | 23 | private final 
ExecutorService threadPool; 24 | private TaskService taskService; 25 | 26 | //public AtomicInteger coroutineCount; 27 | public LoggingThread logger; 28 | public ProxyThread proxyThread; 29 | 30 | private ProducerThread producer; 31 | private ArrayList threads; 32 | 33 | // the number of coroutines currently running 34 | 35 | public Scraper(int consumerCount, int maxCoroutineCount, int conSocketTimeout) { 36 | 37 | this.proxyService = new ProxyService(); 38 | 39 | this.consumerCount = consumerCount; 40 | this.maxCoroutineCount = maxCoroutineCount; 41 | this.conSocketTimeout = conSocketTimeout; 42 | 43 | this.threadPool = Executors.newFixedThreadPool(consumerCount + 3); 44 | this.taskService = new TaskService(); 45 | this.threads = new ArrayList(); 46 | 47 | 48 | /* 49 | // Handle the SIGINT signal (CTRL + C) 50 | Runtime.getRuntime().addShutdownHook(new Thread(() -> { 51 | System.out.println("Shutting down gracefully..."); 52 | this.stop(); 53 | })); 54 | 55 | String osName = System.getProperty("os.name"); 56 | if (!osName.toLowerCase().contains("windows")) { 57 | // the environment is not Windows 58 | 59 | // Handle the SIGTSTP signal (CTRL + Z) 60 | CustomSignalHandler.handleTSTPSignal(() -> { 61 | this.stop(); 62 | System.out.println("SIGTSTP signal received!"); 63 | System.exit(0); 64 | }); 65 | } */ 66 | 67 | } 68 | 69 | public void scrape() { 70 | 71 | this.logger = new LoggingThread(this, taskService); 72 | //threads.add(this.logger); 73 | threadPool.submit(this.logger); 74 | 75 | this.proxyThread = new ProxyThread(this, this.proxyService); 76 | //threads.add(this.proxyThread); 77 | threadPool.submit(this.proxyThread); 78 | 79 | this.producer = new ProducerThread(this, taskService); 80 | //threads.add(this.producer); 81 | threadPool.submit(this.producer); 82 | 83 | for (int i = 0; i < consumerCount; i++) { 84 | DebugLogger.log("Scraper: Created consumer thread."); 85 | TweetThread tweetThread = new TweetThread(i + 1, this, taskService); 86 | // 
threads.add(tweetThread); 87 | threadPool.submit(tweetThread); 88 | } 89 | } 90 | 91 | public void stop() { 92 | for (Iterator iterator = threads.iterator(); iterator.hasNext(); ) { 93 | ThreadBase item = iterator.next(); 94 | item.running = false; 95 | } 96 | 97 | try { 98 | System.out.println("Attempting to shutdown thread pool..."); 99 | threadPool.shutdown(); 100 | threadPool.awaitTermination(400, TimeUnit.SECONDS); 101 | } catch (InterruptedException e) { 102 | e.printStackTrace(); 103 | System.err.println("Thread pool termination interrupted."); 104 | } finally { 105 | if (!threadPool.isTerminated()) { 106 | System.err.println("Forcing thread pool shutdown..."); 107 | threadPool.shutdownNow(); 108 | try { 109 | threadPool.awaitTermination(60, TimeUnit.SECONDS); 110 | } catch (InterruptedException e) { 111 | e.printStackTrace(); 112 | throw new RuntimeException(e); 113 | } 114 | } 115 | System.out.println("Thread pool shutdown complete."); 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/main/java/com/scrapium/ThreadBase.java: -------------------------------------------------------------------------------- 1 | package com.scrapium; 2 | 3 | public class ThreadBase { 4 | public volatile boolean running = true; 5 | 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/com/scrapium/TweetThreadTaskProcessor.java: -------------------------------------------------------------------------------- 1 | package com.scrapium; 2 | 3 | import com.scrapium.proxium.Proxy; 4 | import com.scrapium.threads.LoggingThread; 5 | import com.scrapium.tweetium.TaskService; 6 | import com.scrapium.tweetium.TweetTask; 7 | import com.scrapium.utils.DebugLogger; 8 | import io.netty.handler.ssl.SslContext; 9 | import io.netty.handler.ssl.SslContextBuilder; 10 | import org.apache.hc.client5.http.auth.AuthScope; 11 | import org.apache.hc.client5.http.auth.CredentialsProvider; 
12 | import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; 13 | import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider; 14 | import org.asynchttpclient.*; 15 | import org.asynchttpclient.proxy.ProxyServer; 16 | import org.asynchttpclient.proxy.ProxyType; 17 | 18 | import javax.net.ssl.*; 19 | 20 | import static org.asynchttpclient.Dsl.*; 21 | 22 | 23 | import java.io.IOException; 24 | import java.net.Socket; 25 | import java.security.KeyManagementException; 26 | import java.security.NoSuchAlgorithmException; 27 | import java.security.cert.CertificateException; 28 | import java.security.cert.X509Certificate; 29 | import java.time.Duration; 30 | import java.time.Instant; 31 | import java.util.concurrent.*; 32 | import java.util.concurrent.atomic.AtomicInteger; 33 | import java.util.Base64; 34 | 35 | public class TweetThreadTaskProcessor { 36 | private AsyncHttpClient c; 37 | private final DefaultAsyncHttpClientConfig clientConfig; 38 | 39 | /* 40 | Notes TODO: 41 | - AtomicReference isn't efficient (create a new object instead) 42 | */ 43 | 44 | private Scraper scraper; 45 | private TaskService taskService; 46 | private final int threadID; 47 | private volatile boolean tweetThreadRunning; 48 | private AtomicInteger coroutineCount; 49 | 50 | private int requestCount; 51 | private Instant lastCleanup; 52 | 53 | private final boolean DO_CLEANUP = false; 54 | 55 | private SSLContext createSslContext() throws Exception { 56 | X509TrustManager tm = new X509TrustManager() { 57 | 58 | public void checkClientTrusted(X509Certificate[] xcs, 59 | String string) throws CertificateException { 60 | } 61 | 62 | public void checkServerTrusted(X509Certificate[] xcs, 63 | String string) throws CertificateException { 64 | } 65 | 66 | public X509Certificate[] getAcceptedIssuers() { 67 | return null; 68 | } 69 | }; 70 | 71 | SSLContext ctx = SSLContext.getInstance("TLS"); 72 | ctx.init(null, new TrustManager[] { tm }, null); 73 | return ctx; 74 | } 75 | 76 | 
public TweetThreadTaskProcessor(int threadID, boolean running, Scraper scraper, TaskService taskService, AtomicInteger coroutineCount) { 77 | this.threadID = threadID; 78 | this.scraper = scraper; 79 | this.taskService = taskService; 80 | this.coroutineCount = coroutineCount; 81 | this.tweetThreadRunning = running; 82 | 83 | 84 | 85 | this.clientConfig = new DefaultAsyncHttpClientConfig.Builder() 86 | .setConnectTimeout(8000) 87 | .setRequestTimeout(8000) 88 | .setReadTimeout(5000) 89 | .setMaxConnections(5000) 90 | .setMaxRequestRetry(1) 91 | .build(); 92 | 93 | this.c = asyncHttpClient(this.clientConfig); 94 | 95 | 96 | this.lastCleanup = Instant.now(); 97 | 98 | } 99 | 100 | public void doClientCleanupTick(){ 101 | if(DO_CLEANUP){ 102 | if(this.lastCleanup.isBefore(Instant.now().minusSeconds(180))){ 103 | System.out.println("[!] Doing client clean up."); 104 | this.lastCleanup = Instant.now(); 105 | try { 106 | this.c.close(); 107 | } catch (IOException e) { 108 | throw new RuntimeException(e); 109 | } 110 | this.coroutineCount.set(0); 111 | this.c = asyncHttpClient(this.clientConfig); 112 | } 113 | } 114 | } 115 | 116 | /* 117 | Run Continuously 118 | */ 119 | public void processNextTask(){ 120 | 121 | if(!DO_CLEANUP || this.lastCleanup.isBefore(Instant.now().minusSeconds(10))){ 122 | DebugLogger.log("TweetThreadTask: Before attempting to increase request count."); 123 | 124 | if(this.taskService.hasNextTask()){ 125 | Proxy proxy = this.scraper.proxyService.getNewProxy(); 126 | TweetTask task = this.taskService.getNextTask(); 127 | 128 | 129 | if(proxy != null){ 130 | 131 | // Debugging version only makes debug requests! 
132 | Request request1 = new RequestBuilder("GET") 133 | .setUrl("http://httpforever.com") 134 | .setProxyServer(new ProxyServer.Builder(proxy.getIP(), proxy.getPort()).build()) 135 | .build(); 136 | 137 | c.executeRequest(request1, new handler(c, proxy, task, this)); 138 | } else { 139 | System.out.println("No proxies are available!"); 140 | } 141 | } 142 | } 143 | } 144 | 145 | 146 | public Scraper getScraper(){ 147 | return this.scraper; 148 | } 149 | 150 | public LoggingThread getLogger(){ 151 | return this.scraper.logger; 152 | } 153 | 154 | public int getCoroutineCount() { return this.coroutineCount.get(); } 155 | 156 | public void incrementCoroutineCount() { this.coroutineCount.incrementAndGet(); } 157 | public void decrementCoroutineCount() { this.coroutineCount.decrementAndGet(); } 158 | 159 | public TaskService getTaskService(){ 160 | return this.taskService; 161 | } 162 | } -------------------------------------------------------------------------------- /src/main/java/com/scrapium/handler.java: -------------------------------------------------------------------------------- 1 | package com.scrapium; 2 | 3 | import com.scrapium.proxium.Proxy; 4 | import com.scrapium.tweetium.TweetTask; 5 | import io.netty.handler.codec.http.HttpHeaders; 6 | import org.asynchttpclient.*; 7 | 8 | import java.io.IOException; 9 | 10 | public class handler implements AsyncHandler { 11 | private final AsyncHttpClient client; 12 | private final TweetThreadTaskProcessor processor; 13 | private final Proxy proxy; 14 | private final TweetTask task; 15 | private Integer status; 16 | 17 | public handler(AsyncHttpClient client, Proxy proxy, TweetTask task, TweetThreadTaskProcessor tweetThreadTaskProcessor) { 18 | this.client = client; 19 | this.proxy = proxy; 20 | this.task = task; 21 | this.processor = tweetThreadTaskProcessor; 22 | this.processor.incrementCoroutineCount(); 23 | } 24 | @Override 25 | public AsyncHandler.State onStatusReceived(HttpResponseStatus responseStatus) throws 
Exception { 26 | status = responseStatus.getStatusCode(); 27 | if(status >= 200 && status < 300){ 28 | this.processor.getScraper().logger.increaseSuccessRequestCount(); 29 | proxy.onSuccess(); 30 | System.out.print("V"); 31 | processor.getTaskService().successfulTask(task); 32 | } else { 33 | this.processor.getScraper().logger.increaseFailedRequestCount(); 34 | proxy.onFailure(); 35 | System.out.print("X"); 36 | } 37 | //try { c.close(); } catch (IOException e) { throw new RuntimeException(e); } 38 | return State.CONTINUE; 39 | } 40 | 41 | @Override 42 | public State onHeadersReceived(HttpHeaders headers) throws Exception { 43 | //try { c.close(); } catch (IOException e) { throw new RuntimeException(e); } 44 | return State.CONTINUE; 45 | } 46 | 47 | @Override 48 | public AsyncHandler.State onBodyPartReceived(HttpResponseBodyPart bodyPart) throws Exception { 49 | 50 | //try { c.close(); } catch (IOException e) { throw new RuntimeException(e); } 51 | return State.CONTINUE; 52 | 53 | } 54 | @Override 55 | public Integer onCompleted() throws Exception { 56 | this.processor.decrementCoroutineCount(); 57 | return 200; 58 | } 59 | 60 | @Override 61 | public void onThrowable(Throwable t) { 62 | proxy.onFailure(); 63 | //System.out.print("E"); 64 | // Handle exceptions here 65 | this.processor.getScraper().logger.increaseFailedRequestCount(); 66 | this.processor.decrementCoroutineCount(); 67 | processor.getTaskService().failTask(task); 68 | //System.err.println("An error occurred: " + t.getMessage()); 69 | } 70 | 71 | 72 | } -------------------------------------------------------------------------------- /src/main/java/com/scrapium/proxium/Proxy.java: -------------------------------------------------------------------------------- 1 | package com.scrapium.proxium; 2 | 3 | import java.sql.Time; 4 | import java.sql.Timestamp; 5 | import java.util.concurrent.atomic.AtomicInteger; 6 | import java.util.concurrent.atomic.AtomicLong; 7 | import java.util.regex.Matcher; 8 | import 
java.util.regex.Pattern; 9 | 10 | public class Proxy { 11 | 12 | private int id; 13 | private String connectionString; 14 | private AtomicInteger usageCount; 15 | private AtomicInteger successCount; 16 | private AtomicInteger failedCount; 17 | private AtomicInteger failStreak; 18 | private AtomicLong cooldownUntil; 19 | 20 | 21 | public Proxy(int id, String connectionString, int _usageCount, int _successCount, int _failedCount, int _failStreak, Timestamp _cooldownUntil) { 22 | this.id = id; 23 | this.connectionString = connectionString; 24 | this.usageCount = new AtomicInteger(_usageCount); 25 | this.successCount = new AtomicInteger(_successCount); 26 | this.failedCount = new AtomicInteger(_failedCount); 27 | this.failStreak = new AtomicInteger(_failStreak); 28 | 29 | long coolUntil = ( _cooldownUntil == null ) ? 0 : _cooldownUntil.getTime(); 30 | this.cooldownUntil = new AtomicLong(coolUntil); 31 | 32 | } 33 | 34 | public void onSuccess(){ 35 | this.usageCount.incrementAndGet(); 36 | this.successCount.incrementAndGet(); 37 | this.failStreak.set(0); 38 | this.cooldownUntil.set(System.currentTimeMillis()); 39 | } 40 | 41 | public void onFailure() { 42 | this.usageCount.incrementAndGet(); 43 | this.failedCount.incrementAndGet(); 44 | this.failStreak.incrementAndGet(); 45 | 46 | if(this.failStreak.get() > 50){ 47 | 48 | //System.out.println("Proxy fail streak over 50."); 49 | int baseCooldownTime = 1000; 50 | int maxCooldownTime = 120000; 51 | double exponentialFactor = 0.5;//1.2; 52 | long cooldownTime = baseCooldownTime * (long) Math.pow(exponentialFactor, failStreak.get() - 50); 53 | 54 | if(cooldownTime > maxCooldownTime){ 55 | cooldownTime = maxCooldownTime; 56 | } 57 | 58 | long cooldownUntil = System.currentTimeMillis() + cooldownTime; 59 | 60 | 61 | 62 | if(cooldownUntil > this.cooldownUntil.get()){ 63 | //System.out.println("[" + this.id + "] Proxy has failed, setting time to " + cooldownTime + ", fail streak = " + failStreak.get()); 64 | 
this.cooldownUntil.set(System.currentTimeMillis() + cooldownTime); 65 | } 66 | } 67 | } 68 | 69 | public String getConnectionString(){ 70 | return this.connectionString; 71 | } 72 | public String getIP() { 73 | return extractWithPattern(this.connectionString, "(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})"); 74 | } 75 | public int getPort() { 76 | return Integer.valueOf(extractWithPattern(this.connectionString, "(?<=:)(\\d+)")); 77 | } 78 | 79 | public static String extractWithPattern(String input, String pattern) { 80 | Pattern compiledPattern = Pattern.compile(pattern); 81 | Matcher matcher = compiledPattern.matcher(input); 82 | if (matcher.find()) { 83 | return matcher.group(); 84 | } 85 | return ""; 86 | } 87 | 88 | 89 | public int getUsageCount() { 90 | return this.usageCount.get(); 91 | } 92 | 93 | public int getSuccessCount() { 94 | return this.successCount.get(); 95 | } 96 | 97 | public int getFailedCount() { 98 | return this.failedCount.get(); 99 | } 100 | 101 | public int getFailStreak() { 102 | return this.failStreak.get(); 103 | } 104 | 105 | public Timestamp getCooldownUntil() { 106 | return new Timestamp(this.cooldownUntil.get()); 107 | } 108 | 109 | public int getID() { 110 | return this.id; 111 | } 112 | 113 | public void debug_incrementUsageCount() { 114 | this.usageCount.incrementAndGet(); 115 | } 116 | 117 | public int getSuccessDelta(){ 118 | return this.getSuccessCount() - this.getFailedCount(); 119 | } 120 | 121 | public boolean inCoolDown(){ 122 | if(this.cooldownUntil.get() < System.currentTimeMillis()){ 123 | return false; 124 | } 125 | return true; 126 | } 127 | 128 | @Override 129 | public String toString() { 130 | return "Proxy{" + 131 | "id=" + id + 132 | ", connectionString='" + connectionString + '\'' + 133 | ", usageCount=" + usageCount + 134 | ", successCount=" + successCount + 135 | ", failedCount=" + failedCount + 136 | ", failStreak=" + failStreak + 137 | ", cooldownUntil=" + cooldownUntil + 138 | '}'; 139 | } 140 | } 
-------------------------------------------------------------------------------- /src/main/java/com/scrapium/proxium/ProxyService.java: -------------------------------------------------------------------------------- 1 | package com.scrapium.proxium; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.io.IOException; 6 | import java.sql.*; 7 | import java.util.*; 8 | import java.util.concurrent.CopyOnWriteArrayList; 9 | 10 | public class ProxyService { 11 | 12 | 13 | private final Random rand; 14 | private ArrayList proxies; 15 | private ArrayList availableProxies; 16 | 17 | public ProxyService (){ 18 | this.proxies = new ArrayList(); 19 | this.availableProxies = new ArrayList(); 20 | this.rand = new Random(); 21 | } 22 | 23 | public void loadProxies() { 24 | synchronized (this.proxies){ 25 | try (BufferedReader br = new BufferedReader(new FileReader("./checked_proxies.txt"))) { 26 | String _proxy_entry; 27 | 28 | int i = 0; 29 | 30 | while ((_proxy_entry = br.readLine()) != null) { 31 | 32 | String proxy_entry = _proxy_entry.replaceAll("[\\r\\n]+", ""); 33 | String connString = proxy_entry; 34 | 35 | Proxy proxy = new Proxy( 36 | i++, 37 | connString, 38 | 0, 39 | 0, 40 | 0, 41 | 0, 42 | new Timestamp(0) 43 | ); 44 | 45 | //System.out.println("added "); 46 | //System.out.print(proxy.toString()); 47 | 48 | this.proxies.add(proxy); 49 | 50 | } 51 | 52 | System.out.println("Loaded (" + i + ") proxies!"); 53 | /* 54 | } catch (SQLException e) { 55 | e.printStackTrace(); 56 | System.out.println("Failed to get connection!"); 57 | }*/ 58 | 59 | } catch (IOException e) { 60 | System.err.format("IOException: %s%n", e); 61 | } 62 | } 63 | } 64 | 65 | /* 66 | String query = "SELECT id, connection_string, usage_count, success_count, failed_count, fail_streak, cooldown_until " + 67 | "FROM test_proxy " + 68 | "WHERE (cooldown_until IS NULL OR NOW() > cooldown_until) " + 69 | "ORDER BY CASE WHEN usage_count = 0 THEN 1 ELSE success_count / 
usage_count END DESC, last_used ASC " + 70 | "LIMIT 50"; 71 | */ 72 | 73 | public void updateAvailableProxies(){ 74 | synchronized (this.availableProxies){ 75 | this.availableProxies = new ArrayList(); 76 | 77 | // benchmark the below for a better solution. 78 | for (int i = 0; i < this.proxies.size(); i++){ 79 | // TODO: check isCoolDown function 80 | if(!this.proxies.get(i).inCoolDown()){ 81 | availableProxies.add(this.proxies.get(i)); 82 | } 83 | } 84 | } 85 | 86 | //System.out.println("Available proxy count = " + this.availableProxies.size()); 87 | 88 | synchronized (this.availableProxies) { 89 | if (this.availableProxies.size() < 50) { 90 | System.out.println("!! INCREDIBLY LOW AVAILABLE PROXY POOL SIZE (" + availableProxies.size() + ")"); 91 | } 92 | } 93 | 94 | 95 | synchronized (this.availableProxies) { 96 | //System.out.println("[proxyman] (" + availableProxies.size() + ") available proxies."); 97 | Collections.sort(availableProxies, new Comparator() { 98 | public int compare(Proxy p1, Proxy p2) { 99 | return Integer.compare(p2.getSuccessDelta() - p2.getFailedCount(), p1.getSuccessDelta() - p2.getFailedCount()); 100 | } 101 | }); 102 | } 103 | 104 | //System.out.println("[proxyman] Sorted available proxies!"); 105 | } 106 | 107 | // get one of the top 50 proxies, that aren't currently banned. 
108 | public Proxy getNewProxy() { 109 | 110 | boolean proxyInCoolDown = true; 111 | int attempts = 0; 112 | 113 | Proxy randomProxy = null; 114 | 115 | while(proxyInCoolDown && attempts <= 150){ 116 | synchronized (this.availableProxies) { 117 | 118 | if (availableProxies.size() == 0) { 119 | System.out.println("No available Proxies...."); 120 | return randomProxy; 121 | } 122 | int randInd = rand.nextInt(30); 123 | if (randInd > availableProxies.size()) { 124 | randInd = availableProxies.size() - 1; 125 | } 126 | randomProxy = availableProxies.get(randInd); 127 | proxyInCoolDown = randomProxy.inCoolDown(); 128 | attempts++; 129 | } 130 | } 131 | 132 | if(attempts > 100){ 133 | System.out.println("Warning: iterated over 150 random proxies and couldn't find a viable proxy NOT in cooldown."); 134 | 135 | // TODO: reset all proxies 136 | } 137 | 138 | return randomProxy; 139 | } 140 | 141 | public int getAvailableProxyCount(){ 142 | synchronized (this.availableProxies){ 143 | return availableProxies.size(); 144 | } 145 | } 146 | } -------------------------------------------------------------------------------- /src/main/java/com/scrapium/tests/Benchmark.java: -------------------------------------------------------------------------------- 1 | package com.scrapium.tests; 2 | 3 | import com.scrapium.Scraper; 4 | import com.scrapium.utils.TimeUtils; 5 | 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | public class Benchmark { 10 | public static void runTest() { 11 | Map configResults = new HashMap<>(); 12 | String bestConfigKey = ""; 13 | int highestSuccessfulRequests = 0; 14 | 15 | double timePerTest = 5 * 60 * 1000; // 30 seconds 16 | 17 | int totalTestCount = (((6-1)/2) * ((15000-100)/250) * ((28 - 4)/10)); 18 | int totalTestTime = (int) (totalTestCount * timePerTest); 19 | 20 | int testIter = 0; 21 | 22 | System.out.println("\n== Test started ==\n"); 23 | System.out.println("- Total Tests = " + (totalTestCount)); 24 | System.out.println("- Test will 
be completed " + TimeUtils.timeToString((totalTestTime/1000))); 25 | 26 | for (int maxCoroutineCount = 100; maxCoroutineCount <= 15000; maxCoroutineCount += 250) { // 100 -> 2000 27 | for (int consumerCount = 1; consumerCount <= 6; consumerCount += 2) { // 1 -> 8 28 | for (int conSocketTimeout = 6; conSocketTimeout <= 28; conSocketTimeout += 10) { // 4 -> 28 29 | 30 | Scraper scraper = new Scraper(consumerCount, maxCoroutineCount, conSocketTimeout); 31 | scraper.scrape(); 32 | 33 | String configKey = String.format("c_%d_m_%d_t_%d", consumerCount, maxCoroutineCount, conSocketTimeout); 34 | 35 | System.out.println("\n[" + testIter + "/" + totalTestCount + "] Starting test: "+ configKey + "\n"); 36 | 37 | int timeRemaining = (int) (totalTestTime - testIter * timePerTest); 38 | System.out.println("( Test will be completed " + TimeUtils.timeToString(timeRemaining/1000) + " )\n"); 39 | 40 | try { 41 | Thread.sleep((long) timePerTest); 42 | } catch (InterruptedException e) { 43 | e.printStackTrace(); 44 | throw new RuntimeException(e); 45 | } 46 | 47 | int successfulRequests = scraper.logger.successRequestCount.get(); 48 | int failedRequests = scraper.logger.failedRequestCount.get(); 49 | 50 | 51 | configResults.put(configKey, successfulRequests); 52 | 53 | 54 | scraper.stop(); 55 | 56 | System.out.printf("\n"+ "Test (" + testIter + "/" + totalTestCount + ") Finished Configuration: %s | Successful Requests: %d | Failed Requests: %d%n\n", 57 | configKey, successfulRequests, failedRequests); 58 | 59 | 60 | testIter++; 61 | 62 | if (successfulRequests > highestSuccessfulRequests) { 63 | highestSuccessfulRequests = successfulRequests; 64 | bestConfigKey = configKey; 65 | } 66 | 67 | try { 68 | Thread.sleep(10000); 69 | } catch (InterruptedException e) { 70 | e.printStackTrace(); 71 | throw new RuntimeException(e); 72 | } 73 | } 74 | } 75 | } 76 | 77 | System.out.println("\n== All Configuration Results =="); 78 | System.out.println("\n== C=threads m=coroutines t=timeout"); 79 
for (Map.Entry entry : configResults.entrySet()) {
            System.out.printf("Configuration: %s | Successful Requests: %d%n", entry.getKey(), entry.getValue());
        }

        System.out.printf("\nBest Configuration: %s | Highest Successful Requests: %d%n", bestConfigKey, highestSuccessfulRequests);
    }
}
-------------------------------------------------------------------------------- /src/main/java/com/scrapium/threads/LoggingThread.java: --------------------------------------------------------------------------------
package com.scrapium.threads;

import com.scrapium.Scraper;
import com.scrapium.ThreadBase;
import com.scrapium.tweetium.TaskService;

import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Background thread that prints scraper throughput statistics every 2 seconds:
 * total request count, success/failure rates per second, available proxies and
 * total uptime. Worker threads report results via the public increase* methods.
 */
public class LoggingThread extends ThreadBase implements Runnable {

    /** Interval between log lines, in milliseconds. */
    private static final long LOG_INTERVAL_MS = 2000;

    private final Instant scraperStart;
    private final Scraper scraper;
    public final AtomicInteger successRequestCount;
    public final AtomicInteger failedRequestCount;

    private final long startEpoch;   // scraper start, in epoch seconds
    private int lastSuccessCount;    // success snapshot at the previous log tick
    private int lastFailedCount;     // failure snapshot at the previous log tick
    private long lastLogEpoch;       // epoch seconds of the previous log tick

    private final TaskService taskService;

    public LoggingThread(Scraper scraper, TaskService taskService) {
        this.scraper = scraper;
        this.taskService = taskService;
        this.successRequestCount = new AtomicInteger(0);
        this.failedRequestCount = new AtomicInteger(0);
        this.startEpoch = System.currentTimeMillis() / 1000;
        // Initialise to "now" so the first rate window is measured from start-up
        // rather than from epoch 0 (which made the first per-second rates ~0).
        this.lastLogEpoch = this.startEpoch;
        this.scraperStart = Instant.now();
    }

    /**
     * Formats a duration as e.g. "2 days, 3 hours, 5 minutes, 1 second",
     * omitting zero-valued units. A sub-second duration yields "0 seconds"
     * (the original returned an empty string and left a trailing comma).
     */
    public static String format(Duration d) {
        long days = d.toDays();
        d = d.minusDays(days);
        long hours = d.toHours();
        d = d.minusHours(hours);
        long minutes = d.toMinutes();
        d = d.minusMinutes(minutes);
        long seconds = d.getSeconds();

        StringBuilder sb = new StringBuilder();
        appendUnit(sb, days, "day");
        appendUnit(sb, hours, "hour");
        appendUnit(sb, minutes, "minute");
        appendUnit(sb, seconds, "second");
        return sb.length() == 0 ? "0 seconds" : sb.toString();
    }

    // Appends "<value> <unit>(s)", comma-separated, when value is non-zero.
    private static void appendUnit(StringBuilder sb, long value, String unit) {
        if (value == 0) {
            return;
        }
        if (sb.length() > 0) {
            sb.append(", ");
        }
        sb.append(value).append(' ').append(unit).append(value == 1 ? "" : "s");
    }

    @Override
    public void run() {
        while (this.running) {
            long currentEpoch = System.currentTimeMillis() / 1000;

            int successTotal = this.successRequestCount.get();
            int failedTotal = this.failedRequestCount.get();

            // Guard the window length: the original divided by
            // (currentEpoch - lastLogEpoch), which throws ArithmeticException
            // when two ticks land in the same second, and truncated the rates
            // to integers because both operands were integral.
            double windowSeconds = Math.max(1, currentEpoch - this.lastLogEpoch);
            double successPS = (successTotal - this.lastSuccessCount) / windowSeconds;
            double failedPS = (failedTotal - this.lastFailedCount) / windowSeconds;

            double secondsSinceStart = Math.max(1, currentEpoch - this.startEpoch);
            double successPSTotal = successTotal / secondsSinceStart;

            String out = "\n\n=== Tweet Scraper ===\n";
            out += "Requests : " + (successTotal + failedTotal) + "\n";
            out += "Success/s: " + successPS + "\n";
            out += "Success Total/s: " + successPSTotal + "\n";
            out += "Failed/s: " + failedPS + "\n";
            out += "Available Proxies: " + this.scraper.proxyService.getAvailableProxyCount() + "\n";
            out += "Running for: " + format(Duration.between(this.scraperStart, Instant.now()));

            System.out.println(out);

            this.lastSuccessCount = successTotal;
            this.lastFailedCount = failedTotal;
            this.lastLogEpoch = System.currentTimeMillis() / 1000;

            try {
                Thread.sleep(LOG_INTERVAL_MS);
            } catch (InterruptedException e) {
                // Preserve the interrupt flag and stop logging cleanly instead
                // of crashing the thread with a RuntimeException.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    public void increaseSuccessRequestCount() {
        successRequestCount.incrementAndGet();
    }

    public void increaseFailedRequestCount() {
        failedRequestCount.incrementAndGet();
    }
}
-------------------------------------------------------------------------------- /src/main/java/com/scrapium/threads/ProducerThread.java: --------------------------------------------------------------------------------
package com.scrapium.threads;

import com.scrapium.Scraper;
import com.scrapium.ThreadBase;
import com.scrapium.tweetium.TaskService;
import com.scrapium.tweetium.TweetTask;

/**
 * Produces TweetTask work items (consecutive 30-second search windows) and
 * feeds them into the task queue while it has free capacity.
 */
public class ProducerThread extends ThreadBase implements Runnable {

    private final Scraper scraper;
    private final TaskService taskService;

    // Debug fixtures: starting epoch and search term for generated windows.
    private int debug_epoch = 1575072000;
    private final String debug_search = "$BTC";

    public ProducerThread(Scraper scraper, TaskService taskService) {
        this.scraper = scraper;
        this.taskService = taskService;
    }

    @Override
    public void run() {
        while (this.running) {
            if (this.taskService.doesQueueHaveFreeSpace()) {
                TweetTask newTask = new TweetTask(debug_search, debug_epoch, debug_epoch + 30);
                debug_epoch += 30;
                this.taskService.addNewTweetTaskEnd(newTask);
            } else {
                // The original spun in a tight loop while the queue was full,
                // burning a full core; back off briefly instead.
                try {
                    Thread.sleep(50);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }
}
-------------------------------------------------------------------------------- /src/main/java/com/scrapium/threads/ProxyThread.java: --------------------------------------------------------------------------------
package
com.scrapium.threads; 2 | 3 | import com.scrapium.Scraper; 4 | import com.scrapium.ThreadBase; 5 | import com.scrapium.proxium.ProxyService; 6 | 7 | public class ProxyThread extends ThreadBase implements Runnable { 8 | 9 | 10 | private final Scraper scraper; 11 | private final ProxyService proxyService; 12 | 13 | public ProxyThread(Scraper scraper, ProxyService proxyService) { 14 | this.scraper = scraper; 15 | this.proxyService = proxyService; 16 | this.proxyService.loadProxies(); 17 | } 18 | 19 | @Override 20 | public void run() { 21 | while (this.running) { 22 | 23 | this.proxyService.updateAvailableProxies(); 24 | 25 | try { 26 | Thread.sleep(500); 27 | } catch (InterruptedException e) { 28 | throw new RuntimeException(e); 29 | } 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /src/main/java/com/scrapium/threads/TweetThread.java: -------------------------------------------------------------------------------- 1 | package com.scrapium.threads; 2 | 3 | import com.scrapium.Scraper; 4 | import com.scrapium.ThreadBase; 5 | import com.scrapium.TweetThreadTaskProcessor; 6 | import com.scrapium.tweetium.TaskService; 7 | import com.scrapium.utils.DebugLogger; 8 | 9 | import java.util.concurrent.atomic.AtomicInteger; 10 | 11 | public class TweetThread extends ThreadBase implements Runnable { 12 | 13 | 14 | 15 | private final Scraper scraper; 16 | private final int threadID; 17 | private TaskService taskService; 18 | private AtomicInteger coroutineCount; 19 | 20 | private final TweetThreadTaskProcessor taskProcessor; 21 | 22 | // possibly move maxCoroutineCount to scraper, so it doesn't need to be updated in each class - blocking. 
23 | 24 | public TweetThread(int i, Scraper scraper, TaskService taskService) { 25 | this.threadID = i; 26 | this.scraper = scraper; 27 | this.taskService = taskService; 28 | this.coroutineCount = new AtomicInteger(0); 29 | this.taskProcessor = new TweetThreadTaskProcessor(this.threadID, this.running, this.scraper, taskService, this.coroutineCount); 30 | } 31 | 32 | @Override 33 | public void run() { 34 | while (this.running) { 35 | // move tick to different thread for code speed-up 36 | this.taskProcessor.doClientCleanupTick(); 37 | 38 | if (this.coroutineCount.get() < scraper.maxCoroutineCount) { 39 | DebugLogger.log("TweetThread: Ran cycle"); 40 | DebugLogger.log("TweetThread: Task Taken"); 41 | this.taskProcessor.processNextTask(); 42 | DebugLogger.log("Decrementing counter"); 43 | 44 | } else { 45 | if(this.coroutineCount.get() >= scraper.maxCoroutineCount){ 46 | //System.out.println("Skipping thread execution!"); 47 | //System.out.println(" Reason: MAX CO-ROUTINES (" + this.coroutineCount.get() + "/" + scraper.maxCoroutineCount + ")"); 48 | } 49 | 50 | try { 51 | Thread.sleep(150); // Sleep when the maximum number of tasks are being executed 52 | } catch (InterruptedException e) { 53 | throw new RuntimeException(e); 54 | } 55 | } 56 | 57 | /* 58 | try { 59 | //System.out.println("tweetThread coroutine count = " + this.coroutineCount.get()); 60 | //System.out.println("tweetThread taskQueue = " + this.taskQueue.size()); 61 | 62 | if (this.taskQueue.size() > 0 && this.coroutineCount.get() < scraper.maxCoroutineCount) { 63 | DebugLogger.log("TweetThread: Ran cycle"); 64 | DebugLogger.log("TweetThread: Task Taken"); 65 | this.taskProcessor.processNextTask(); 66 | DebugLogger.log("Decrementing counter"); 67 | 68 | } else { 69 | 70 | if(this.taskQueue.size() == 0){ 71 | //System.out.println("Skipping thread execution!"); 72 | //System.out.println(" Reason: QUEUE EMPTY"); 73 | } 74 | if(this.coroutineCount.get() >= scraper.maxCoroutineCount){ 75 | 
//System.out.println("Skipping thread execution!"); 76 | //System.out.println(" Reason: MAX CO-ROUTINES (" + this.coroutineCount.get() + "/" + scraper.maxCoroutineCount + ")"); 77 | } 78 | 79 | Thread.sleep(150); // Sleep when the maximum number of tasks are being executed 80 | } 81 | } catch (Exception e) { 82 | e.printStackTrace(); 83 | DebugLogger.log("Interrupted Exception!"); 84 | } */ 85 | } 86 | 87 | // TODO: readd close request client; 88 | //System.out.println("closeRequestClient called"); 89 | //this.taskProcessor.closeRequestClient(); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/java/com/scrapium/tweetium/TaskService.java: -------------------------------------------------------------------------------- 1 | package com.scrapium.tweetium; 2 | 3 | import java.util.concurrent.ConcurrentLinkedDeque; 4 | import java.util.concurrent.ConcurrentLinkedQueue; 5 | 6 | public class TaskService { 7 | 8 | private ConcurrentLinkedDeque backlogTweetQueue; 9 | private ConcurrentLinkedQueue inProcessingTweetQueue; 10 | 11 | public TaskService() { 12 | backlogTweetQueue = new ConcurrentLinkedDeque<>(); 13 | inProcessingTweetQueue = new ConcurrentLinkedQueue<>(); 14 | } 15 | 16 | public boolean doesQueueHaveFreeSpace(){ 17 | return (backlogTweetQueue.size() < 5000); 18 | } 19 | 20 | // maybe separate this into two lists. 
21 | public TweetTask getNextTask(){ 22 | if(backlogTweetQueue.size() == 0){ 23 | System.out.println("[X] No tasks in the queue!"); 24 | return null; 25 | } 26 | TweetTask task = backlogTweetQueue.poll(); 27 | //task.setState(TweetTask.TweetTaskState.PROCESSING); 28 | inProcessingTweetQueue.add(task); 29 | return task; 30 | } 31 | 32 | public boolean hasNextTask(){ 33 | return (backlogTweetQueue.size() > 0); 34 | } 35 | 36 | public void successfulTask(TweetTask task){ 37 | inProcessingTweetQueue.remove(task); 38 | if(task.hasContinuation()/* && task.getState() == TweetTask.TweetTaskState.COMPLETED */){ 39 | // continue with next request 40 | TweetTask continuationTask = task.getConsecutiveRequest(); 41 | //continuationTask.setState(TweetTask.TweetTaskState.PROCESSING); 42 | this.backlogTweetQueue.addFirst(continuationTask); 43 | } else { 44 | // no continuation 45 | } 46 | } 47 | 48 | public void failTask(TweetTask task){ 49 | inProcessingTweetQueue.remove(task); 50 | this.backlogTweetQueue.addFirst(task); 51 | } 52 | 53 | // fix branch prediction failures 54 | public void cleanup(){ 55 | // do cleanup, if items in visible for more than 4 minutes, push to front of tweet queue 56 | } 57 | 58 | public void addNewTweetTaskEnd(TweetTask tweetTask) { 59 | this.backlogTweetQueue.addLast(tweetTask); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/scrapium/tweetium/TweetTask.java: -------------------------------------------------------------------------------- 1 | package com.scrapium.tweetium; 2 | 3 | public class TweetTask { 4 | 5 | private final String searchTerm; 6 | private final int fromEpoch; 7 | private final int toEpoch; 8 | private final String cursor; 9 | 10 | public TweetTask(String searchTerm, int fromEpoch, int toEpoch) { 11 | this.searchTerm = searchTerm; 12 | this.fromEpoch = fromEpoch; 13 | this.toEpoch = toEpoch; 14 | this.cursor = ""; 15 | } 16 | 17 | public TweetTask(String 
searchTerm, int fromEpoch, int toEpoch, String cursor) { 18 | this.searchTerm = searchTerm; 19 | this.fromEpoch = fromEpoch; 20 | this.toEpoch = toEpoch; 21 | this.cursor = cursor; 22 | } 23 | 24 | public TweetTask getConsecutiveRequest() { 25 | return null; 26 | } 27 | 28 | 29 | public boolean hasContinuation() { 30 | return false; 31 | } 32 | 33 | @Override 34 | public String toString() { 35 | return "TweetTask{" + 36 | "searchTerm='" + searchTerm + '\'' + 37 | ", fromEpoch=" + fromEpoch + 38 | ", toEpoch=" + toEpoch + 39 | ", cursor='" + cursor + '\'' + 40 | '}'; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/scrapium/utils/DebugLogger.java: -------------------------------------------------------------------------------- 1 | package com.scrapium.utils; 2 | 3 | public class DebugLogger { 4 | public static void log(String message) { 5 | //System.out.println(message); 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/com/scrapium/utils/TimeUtils.java: -------------------------------------------------------------------------------- 1 | package com.scrapium.utils; 2 | 3 | import java.sql.Timestamp; 4 | import java.util.Calendar; 5 | 6 | public class TimeUtils { 7 | 8 | public static String timeToString(int seconds) { 9 | if (seconds < 60) { 10 | return "in " + seconds + " seconds"; 11 | } else if (seconds < 3600) { 12 | int minutes = seconds / 60; 13 | return "in " + minutes + " minute" + (minutes == 1 ? "" : "s"); 14 | } else if (seconds < 86400) { 15 | int hours = seconds / 3600; 16 | return "in " + hours + " hour" + (hours == 1 ? "" : "s"); 17 | } else { 18 | int days = seconds / 86400; 19 | return "in " + days + " day" + (days == 1 ? 
"" : "s"); 20 | } 21 | } 22 | 23 | public static Timestamp nowPlusMinutes(int minutes){ 24 | Timestamp currentTimestamp = new Timestamp(System.currentTimeMillis()); 25 | 26 | // Create a Calendar instance and set the time to the current timestamp 27 | Calendar calendar = Calendar.getInstance(); 28 | calendar.setTime(currentTimestamp); 29 | 30 | // Add 15 minutes to the calendar 31 | calendar.add(Calendar.MINUTE, minutes); 32 | 33 | // Get the new timestamp with the updated time 34 | Timestamp newTimeStamp = new Timestamp(calendar.getTimeInMillis()); 35 | 36 | return newTimeStamp; 37 | } 38 | 39 | 40 | } -------------------------------------------------------------------------------- /src/schema.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE proxies ( 2 | id SERIAL PRIMARY KEY, 3 | conn_string VARCHAR(255) NOT NULL, 4 | ip_address VARCHAR(255) NOT NULL, 5 | port INTEGER NOT NULL, 6 | is_socks BOOLEAN NOT NULL, 7 | usage_count INTEGER DEFAULT 0, 8 | retry_count INTEGER DEFAULT 0, 9 | next_available TIMESTAMP DEFAULT '1970-01-01 00:00:00'::TIMESTAMP, 10 | guest_token VARCHAR(255), 11 | guest_token_updated TIMESTAMP DEFAULT '1970-01-01 00:00:00'::TIMESTAMP, 12 | success_delta INTEGER DEFAULT 0, 13 | failed_count INTEGER DEFAULT 0, 14 | last_updated TIMESTAMP DEFAULT '1970-01-01 00:00:00'::TIMESTAMP 15 | ); 16 | 17 | GRANT SELECT, INSERT, UPDATE, DELETE ON TABLE proxies TO scrapium_user; 18 | GRANT USAGE, SELECT ON SEQUENCE proxies_id_seq1 TO scrapium_user; 19 | 20 | # 21 | 22 | -- Create the test_proxy table 23 | CREATE TABLE test_proxy ( 24 | id SERIAL PRIMARY KEY, 25 | connection_string VARCHAR(255) UNIQUE, 26 | usage_count INTEGER DEFAULT 0, 27 | success_count INTEGER DEFAULT 0, 28 | failed_count INTEGER DEFAULT 0, 29 | fail_streak INTEGER DEFAULT 0, 30 | cooldown_until TIMESTAMPTZ 31 | ); 32 | 33 | -- Grant privileges to scrapium_user 34 | GRANT ALL PRIVILEGES ON TABLE test_proxy TO scrapium_user; 35 | GRANT 
USAGE, SELECT ON SEQUENCE test_proxy_id_seq TO scrapium_user; 36 | 37 | 38 | # 39 | 40 | 41 | 42 | CREATE TABLE test_proxy ( 43 | id SERIAL PRIMARY KEY, 44 | connection_string VARCHAR(255), 45 | usage_count INTEGER NOT NULL DEFAULT 0, 46 | success_count INTEGER NOT NULL DEFAULT 0, 47 | failed_count INTEGER NOT NULL DEFAULT 0, 48 | fail_streak INTEGER NOT NULL DEFAULT 0, 49 | cooldown_until TIMESTAMP WITH TIME ZONE, 50 | last_used TIMESTAMP WITH TIME ZONE, 51 | status VARCHAR(50) NOT NULL DEFAULT 'active' 52 | ); 53 | 54 | CREATE INDEX idx_cooldown ON test_proxy (cooldown_until); 55 | CREATE INDEX idx_usage_count ON test_proxy (usage_count); 56 | CREATE INDEX idx_last_used ON test_proxy (last_used); 57 | GRANT ALL PRIVILEGES ON TABLE test_proxy TO scrapium_user; 58 | GRANT USAGE, SELECT ON SEQUENCE test_proxy_id_seq TO scrapium_user; --------------------------------------------------------------------------------