├── sample-data ├── graph.png ├── histo10.png └── histo10b.png ├── repextract.awk ├── CMakeLists.txt ├── LICENSE ├── ext └── sqlitewriter │ ├── minipsql.hh │ ├── psqlwriter.hh │ ├── sqlwriter.hh │ ├── minipsql.cc │ ├── psqlwriter.cc │ └── sqlwriter.cc ├── audience-minutes.js ├── audience.ipynb ├── access2sql.cc └── README.md /sample-data/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berthubert/audience-minutes/main/sample-data/graph.png -------------------------------------------------------------------------------- /sample-data/histo10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berthubert/audience-minutes/main/sample-data/histo10.png -------------------------------------------------------------------------------- /sample-data/histo10b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berthubert/audience-minutes/main/sample-data/histo10b.png -------------------------------------------------------------------------------- /repextract.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | # 1 2 3 4 5 6 7 8 9 10 11 3 | # 0.0.0.0 - - [26/Jun/2021:00:01:08 +0200] "GET /articles/report.json?scrollPerc=4&count=5 HTTP/1.1" 200 0 "https://berthub.eu/articles/posts/reverse-engineering-source-code-of-the-biontech-pfizer-vaccine/" 4 | 5 | BEGIN { printf("url,perc,count\n"); } 6 | { 7 | split($11, parts, "[\"?]"); 8 | url=parts[2]; 9 | 10 | # 0 1 2 3 4 11 | # /articles/report.json?scrollPerc=4&count=5 12 | if($7 ~ /report.json/) { 13 | split($7, parts, "[?=&]"); 14 | 15 | printf("\"%s\",%s,%s\n", url, parts[3], parts[5]); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | set(CMAKE_CXX_FLAGS "-Wall -Wextra -O3") 3 | 4 | project(audience VERSION 1.0 5 | DESCRIPTION "Hello deep learning" 6 | LANGUAGES CXX) 7 | 8 | 9 | #add_compile_options(-fsanitize=address) 10 | #add_link_options(-fsanitize=address) 11 | 12 | set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard to use") 13 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 14 | set(CMAKE_CXX_EXTENSIONS ON) 15 | 16 | set(CMAKE_THREAD_PREFER_PTHREAD TRUE) 17 | set(THREADS_PREFER_PTHREAD_FLAG TRUE) 18 | find_package(Threads REQUIRED) 19 | 20 | add_executable(access2sql access2sql.cc ext/sqlitewriter/sqlwriter.cc) 21 | target_link_libraries(access2sql sqlite3 Threads::Threads) 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 bert hubert 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ext/sqlitewriter/minipsql.hh: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | class MiniPSQL 7 | { 8 | public: 9 | MiniPSQL(std::string_view fname); 10 | ~MiniPSQL(); 11 | std::vector> getSchema(const std::string& table); 12 | void addColumn(const std::string& table, std::string_view name, std::string_view type); 13 | 14 | //!< execute a random query, for example a PRAGMA 15 | std::vector> exec(const std::string& query); 16 | // std::vector> exec(const std::string& query, const std::vector& params); 17 | std::vector> exec(const std::string& query, std::vector params); 18 | 19 | //!< set the prepared statement for a table, question marks as placeholder 20 | void prepare(const std::string& table, std::string_view str, unsigned int paramsize); 21 | // offset from 1!! 
22 | template 23 | void bindPrep(const std::string& table, int idx, const T& value) 24 | { 25 | bindPrep(table, idx, std::to_string(value)); 26 | } 27 | 28 | void bindPrep(const std::string& table, int idx, const std::string& value) 29 | { 30 | int pos = idx-1; 31 | if(d_params[table].size() <= pos) 32 | d_params[table].resize(pos+1); 33 | d_params[table][pos]=value; 34 | } 35 | 36 | //!< execute the prepared & bound statement 37 | void execPrep(const std::string& table); 38 | 39 | void begin(); 40 | void commit(); 41 | void cycle(); 42 | 43 | //!< do we have a prepared statement for this table 44 | bool isPrepared(const std::string& table) const 45 | { 46 | return d_stmts.find(table) != d_stmts.end(); 47 | } 48 | 49 | private: 50 | PGconn* d_conn; 51 | 52 | std::unordered_map d_stmts; // keyed on table name 53 | std::unordered_map> d_params; // keyed on table name 54 | 55 | bool d_intransaction{false}; 56 | bool haveTable(const std::string& table); 57 | }; 58 | -------------------------------------------------------------------------------- /audience-minutes.js: -------------------------------------------------------------------------------- 1 | // this small script is intended to give you some insight in how people spend 2 | // time on your site. 3 | // It does not track users, there are no cookies or any stored data, and it only gives some probabilistic insights 4 | 5 | 6 | // here you can pick the granularity in time 7 | var intervalSeconds=60; 8 | // and here the reporting probability. 
The busier your site is the lower you can set this & 9 | // still get sufficient reports 10 | var reportingProbability=0.1; 11 | // state consumed by reportOrNot(); hadActivity and scrollPerc are set by the event listeners below 12 | var hadActivity=false; 13 | var scrollPerc=0; 14 | var activityCount=0; 15 | 16 | // called every intervalSeconds (via setInterval): if there was activity since the previous call, count one active interval and, with probability reportingProbability, GET report.json with the rounded scroll percentage and the running count 17 | function reportOrNot() 18 | { 19 | if(hadActivity) { 20 | activityCount++; 21 | if(Math.random() < reportingProbability) { 22 | let url = '/articles/report.json?scrollPerc='+Math.round(scrollPerc)+"&count="+activityCount; 23 | var oReq = new XMLHttpRequest(); 24 | oReq.open("GET", url); 25 | oReq.setRequestHeader("Cache-Control", "no-cache, no-store, max-age=0"); 26 | 27 | // fallbacks for IE and older browsers: 28 | oReq.setRequestHeader("Expires", "Tue, 01 Jan 1980 1:00:00 GMT"); 29 | oReq.setRequestHeader("Pragma", "no-cache"); 30 | 31 | oReq.send(); 32 | } 33 | hadActivity=false; 34 | } 35 | } 36 | 37 | document.addEventListener("DOMContentLoaded", function(event) { 38 | document.addEventListener('scroll', function(e) { 39 | hadActivity=true; 40 | // I found this on Stack Overflow somewhere..
41 | let scrollHeight = Math.max( 42 | document.body.scrollHeight, document.documentElement.scrollHeight, 43 | document.body.offsetHeight, document.documentElement.offsetHeight, 44 | document.body.clientHeight, document.documentElement.clientHeight 45 | ); 46 | scrollPerc=100.0*window.pageYOffset/(scrollHeight-window.innerHeight); 47 | }); 48 | 49 | document.addEventListener('mousemove', function(e) { 50 | hadActivity=true; 51 | }); 52 | 53 | setInterval(reportOrNot, intervalSeconds * 1000); 54 | }); 55 | -------------------------------------------------------------------------------- /ext/sqlitewriter/psqlwriter.hh: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | /* ok for a remote database, two things are very different: 9 | 1) We need to stream or batch our inserts, otherwise we get killed by latency 10 | 2) We need isolation from database server failures 11 | 12 | The way to do this is to send all the inserts to a worker thread, which takes 13 | care of the batching, sending, reconnecting etc. 14 | 15 | There can be multiple tables, which means the batching needs to happen per 16 | table. There could be multiple insert signatures, which makes batching harder. 17 | It may be possibe to use NULLs to create unified signatures though. 
18 | 19 | */ 20 | 21 | 22 | class PSQLWriter 23 | { 24 | 25 | public: 26 | explicit PSQLWriter(std::string_view fname) 27 | { 28 | pipe2(d_pipe, 0); //O_NONBLOCK); 29 | d_thread = std::thread(&PSQLWriter::commitThread, this); 30 | } 31 | typedef std::variant var_t; 32 | void addValue(const std::initializer_list>& values, const std::string& table="data") 33 | { 34 | addValueGeneric(table, values); 35 | } 36 | 37 | void addValue(const std::vector>& values, const std::string& table="data") 38 | { 39 | addValueGeneric(table, values); 40 | } 41 | 42 | template 43 | void addValueGeneric(const std::string& table, const T& values); 44 | ~PSQLWriter() 45 | { 46 | // std::cerr<<"Destructor called"<>> d_columns; 59 | std::unordered_map> d_lastsig; 60 | bool haveColumn(const std::string& table, std::string_view name); 61 | 62 | struct Message 63 | { 64 | std::string table; 65 | std::unordered_map values; 66 | }; 67 | }; 68 | 69 | 70 | template 71 | void PSQLWriter::addValueGeneric(const std::string& table, const T& values) 72 | { 73 | auto msg = new Message({table}); 74 | for(const auto& v : values) { 75 | msg->values[v.first] = v.second; 76 | } 77 | write(d_pipe[1], &msg, sizeof(msg)); 78 | } 79 | -------------------------------------------------------------------------------- /audience.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib notebook\n", 10 | "%precision 2\n", 11 | "import matplotlib\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "plt.rcParams['figure.figsize'] = [9.5, 6]\n", 14 | "\n", 15 | "import pandas" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "percs=pandas.read_csv(\"percs.csv\")\n", 25 | "print(percs.head())\n", 26 | "percs[\"perc\"]=pandas.to_numeric(percs[\"perc\"], 
errors='coerce') # NaN strings\n", 27 | "percs=percs[(~percs.perc.isna()) & (percs.perc <=100)] # remove NaN\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "busy=percs.groupby([\"url\"]).count().sort_values([\"count\"], ascending=False).head(15)\n", 37 | "busy" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# given 10% sampling and once a minute measurements, one sample is 10 minutes\n", 47 | "# so this gives a table with reading hours\n", 48 | "busy.perc*10/60" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "scrolled": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# you can raise or lower the number of bins depending on how much data you have\n", 60 | "for url in busy.index:\n", 61 | " plt.figure()\n", 62 | " plt.hist(sep.perc, bins=10, density=True)\n", 63 | " plt.grid()\n", 64 | " plt.ylabel(\"Density\")\n", 65 | " plt.xlabel(\"Location as percentage of page length\")\n", 66 | " plt.title(\"Sampled density of readership for\\n\"+url)\n", 67 | " print(url)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3 (ipykernel)", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.9.7" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 4 99 | } 100 | -------------------------------------------------------------------------------- /ext/sqlitewriter/sqlwriter.hh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | struct sqlite3; 11 | struct sqlite3_stmt; 12 | 13 | class MiniSQLite 14 | { 15 | public: 16 | MiniSQLite(std::string_view fname); 17 | ~MiniSQLite(); 18 | std::vector> getSchema(const std::string& table); 19 | void addColumn(const std::string& table, std::string_view name, std::string_view type); 20 | std::vector> exec(std::string_view query); 21 | void prepare(const std::string& table, std::string_view str); 22 | void bindPrep(const std::string& table, int idx, bool value); 23 | void bindPrep(const std::string& table, int idx, int value); 24 | void bindPrep(const std::string& table, int idx, uint32_t value); 25 | void bindPrep(const std::string& table, int idx, long value); 26 | void bindPrep(const std::string& table, int idx, unsigned long value); 27 | void bindPrep(const std::string& table, int idx, long long value); 28 | void bindPrep(const std::string& table, int idx, unsigned long long value); 29 | void bindPrep(const std::string& table, int idx, double value); 30 | void bindPrep(const std::string& table, int idx, const std::string& value); 31 | void execPrep(const std::string& table); 32 | void begin(); 33 | void commit(); 34 | void cycle(); 35 | bool isPrepared(const std::string& table) const 36 | { 37 | if(auto iter = d_stmts.find(table); iter == d_stmts.end()) 38 | return false; 39 | else 40 | return iter->second != nullptr; 41 | } 42 | 43 | private: 44 | sqlite3* d_sqlite; 45 | std::unordered_map d_stmts; 46 | std::vector> d_rows; // for exec() 47 | static int helperFunc(void* ptr, int cols, char** colvals, char** colnames); 48 | bool d_intransaction{false}; 49 | bool haveTable(const std::string& table); 50 | }; 51 | 52 | class SQLiteWriter 53 | { 54 | 55 | public: 56 | explicit SQLiteWriter(std::string_view fname) : d_db(fname) 57 | { 58 | // for(const auto& 
c : d_columns) 59 | // cout < var_t; 66 | void addValue(const std::initializer_list>& values, const std::string& table="data"); 67 | void addValue(const std::vector>& values, const std::string& table="data"); 68 | 69 | template 70 | void addValueGeneric(const std::string& table, const T& values); 71 | ~SQLiteWriter() 72 | { 73 | // std::cerr<<"Destructor called"<>> d_columns; 85 | std::unordered_map> d_lastsig; 86 | bool haveColumn(const std::string& table, std::string_view name); 87 | 88 | }; 89 | -------------------------------------------------------------------------------- /access2sql.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "ext/sqlitewriter/sqlwriter.hh" 8 | 9 | /* You can pipe a typical access.log into this, and it will populate sqlite3 database ('access.sqlite3') for you, in streaming fashion. 10 | You can safely access that sqlite database while the program runs, see https://berthub.eu/articles/posts/big-data-storage/ 11 | 12 | A fun view to create: 13 | create view botfree as select * from data where agent like 'Mozilla/5.0 %' and agent not like '%bot%' and agent not like '%miniflux%' 14 | 15 | This filters out the bulk of bots right now. 
16 | */ 17 | 18 | using namespace std; 19 | // Minimal pull parser for access.log lines read from a FILE*; the readers defined here throw EofException at end of input 20 | struct Parser { 21 | explicit Parser(FILE* fp) : d_fp(fp) 22 | {} 23 | 24 | FILE* d_fp; 25 | 26 | struct EofException{}; 27 | // consume a run of ' ' characters, leaving the first other character unread 28 | void skipSpaces() 29 | { 30 | int c; 31 | for(;;) { 32 | c = getc(d_fp); 33 | if(c==EOF) 34 | throw EofException(); 35 | if(c!=' ') { 36 | ungetc(c, d_fp); 37 | break; 38 | } 39 | } 40 | } 41 | void skipToEol() // discard input up to and including the next '\n' 42 | { 43 | int c; 44 | for(;;) { 45 | c = getc(d_fp); 46 | if(c==EOF) 47 | throw EofException(); 48 | if(c=='\n') { 49 | break; 50 | } 51 | } 52 | } 53 | // return the next space-delimited word; the terminating ' ' is consumed, '\n' is not treated as a delimiter 54 | string getWord() 55 | { 56 | skipSpaces(); 57 | string ret; 58 | int c; 59 | for(;;) { 60 | c = getc(d_fp); 61 | if(c==EOF) 62 | throw EofException(); 63 | if(c==' ') 64 | break; 65 | ret.append(1, (char)c); 66 | } 67 | return ret; 68 | } 69 | // return the text between start and stop (both consumed); throws runtime_error if the first non-space character is not start, or if '\n' is seen before stop 70 | string getDelim(char start, char stop) 71 | { 72 | skipSpaces(); 73 | string ret; 74 | int c; 75 | c = getc(d_fp); 76 | if(c==EOF) 77 | throw EofException(); 78 | 79 | if(c!=start) 80 | throw runtime_error("Wrong delimiter, skipping"); 81 | for(;;) { 82 | c = getc(d_fp); 83 | if(c==EOF) 84 | throw EofException(); 85 | 86 | if(c==EOF) 87 | throw EofException(); 88 | if(c==stop) 89 | break; 90 | if(c=='\n') 91 | throw runtime_error("Delimiter not found on line, skipping"); 92 | ret.append(1, (char)c); 93 | } 94 | return ret; 95 | } 96 | // parse the next word with stol(), which throws on non-numeric input 97 | int64_t getNumber() 98 | { 99 | string word = getWord(); 100 | return stol(word); 101 | } 102 | string getQuotedWord(); 103 | }; 104 | // convert an access.log timestamp such as the one below to epoch seconds; throws runtime_error if it does not parse 105 | // 19/Mar/2023:00:00:10 +0100 106 | time_t getTime(const string& in) 107 | { 108 | struct tm tm{}; 109 | if(!strptime(in.c_str(), "%d/%b/%Y:%H:%M:%S %z", &tm)) throw runtime_error("Unable to parse timestamp '"+in+"', skipping"); 110 | const long gmtoff = tm.tm_gmtoff; // seconds east of UTC as left by strptime's %z (glibc/BSD); read before timegm() normalizes tm 111 | // timegm() interprets the fields as UTC; subtracting the offset gives the actual epoch time regardless of the local timezone 112 | return timegm(&tm) - gmtoff; 113 | } 114 | // true if str begins with prefix 115 | bool starts_with(const std::string& str, const std::string& prefix) 116 | { 117 | return str.compare(0, prefix.length(), prefix) == 0; 118 | } 119 | 120 | // written by ChatGPT!
121 | vector split_string(const string& input) 122 | { 123 | istringstream iss(input); 124 | vector tokens; 125 | string token; 126 | while (iss >> token) 127 | tokens.push_back(token); 128 | return tokens; 129 | } 130 | 131 | int main(int argc, char** argv) 132 | try 133 | { 134 | Parser p(stdin); 135 | SQLiteWriter sqw(argc > 1 ? argv[1] : "access.sqlite3"); 136 | 137 | // ::ffff:194.117.254.60 - - [19/Mar/2023:00:00:10 +0100] "GET /articles/posts/nerdfluisteraar/ HTTP/1.1" 200 16627 "-" "Friendica 'Giant Rhubarb' 2023.01-1502; https://friendica.se1.eu" 138 | 139 | for(;;) { 140 | try { 141 | string ip = p.getWord(); 142 | string ign1 = p.getWord(); 143 | string ign2 = p.getWord(); 144 | if(starts_with(ip, "::ffff:")) 145 | ip = ip.substr(7); 146 | string t = p.getDelim('[', ']'); 147 | time_t tim = getTime(t); 148 | string req = p.getDelim('"', '"'); 149 | int64_t stat = p.getNumber(); 150 | int64_t size = p.getNumber(); 151 | string ref = p.getDelim('"', '"'); 152 | string agent = p.getDelim('"', '"'); 153 | auto parts = split_string(req); 154 | string url; 155 | string params; 156 | if(parts.size() >= 2) { 157 | url = parts[1]; 158 | 159 | if(auto pos = url.find('?'); pos != string::npos) { 160 | params = url.substr(pos+1); 161 | url.resize(pos); 162 | } 163 | } 164 | sqw.addValue({{"timestamp", tim}, {"ip", ip}, {"url", url}, 165 | {"params", params}, {"agent", agent}, {"ref", ref}, 166 | {"stat", stat}, {"siz", size}}); 167 | } 168 | catch(std::exception& i) { 169 | cerr< 4 | using namespace std; 5 | 6 | 7 | MiniPSQL::MiniPSQL(std::string_view fname) 8 | { 9 | d_conn = PQconnectdb(&fname[0]); 10 | if (PQstatus(d_conn) != CONNECTION_OK) { 11 | throw std::runtime_error("Error connecting to postgresql: "+ string(PQerrorMessage(d_conn))); 12 | } 13 | } 14 | 15 | MiniPSQL::~MiniPSQL() 16 | { 17 | if(d_intransaction) 18 | commit(); 19 | 20 | PQfinish(d_conn); 21 | } 22 | 23 | 24 | struct QueryResult 25 | { 26 | explicit QueryResult(PGconn* conn, const std::string& 
query) 27 | { 28 | d_res = PQexec(conn, query.c_str()); 29 | if(PQresultStatus(d_res) == PGRES_COMMAND_OK) { 30 | d_ntuples = d_nfields = 0; 31 | } 32 | else if (PQresultStatus(d_res) != PGRES_TUPLES_OK) { 33 | PQclear(d_res); 34 | throw std::runtime_error(string("query error: ") + PQerrorMessage(conn)); 35 | } 36 | d_ntuples = PQntuples(d_res); 37 | d_nfields = PQnfields(d_res); 38 | } 39 | 40 | explicit QueryResult(PGconn* conn, const std::string& table, const std::string& query, int paramsize) 41 | { 42 | d_res = PQprepare(conn, ("procedure_"+table).c_str(), query.c_str(), paramsize, NULL); 43 | 44 | if(PQresultStatus(d_res) != PGRES_COMMAND_OK) { 45 | PQclear(d_res); 46 | throw std::runtime_error(string("prepare error: ") + PQerrorMessage(conn)); 47 | } 48 | PQclear(d_res); 49 | d_res=0; 50 | } 51 | 52 | explicit QueryResult(PGconn* conn, const std::string& query, const std::vector& params) 53 | { 54 | d_res = PQexecParams(conn, query.c_str(), params.size(), NULL, ¶ms[0], NULL, NULL, 0); 55 | 56 | if (PQresultStatus(d_res) == PGRES_COMMAND_OK) { 57 | d_ntuples = d_nfields = 0; 58 | } 59 | else if (PQresultStatus(d_res) != PGRES_TUPLES_OK) { 60 | PQclear(d_res); 61 | throw std::runtime_error(string("parameter query error: ") + PQerrorMessage(conn)); 62 | } 63 | d_ntuples = PQntuples(d_res); 64 | d_nfields = PQnfields(d_res); 65 | } 66 | 67 | explicit QueryResult(PGconn* conn, const std::string& table, const std::vector& params) 68 | { 69 | vector pms; 70 | for(const auto& p : params) { 71 | // cout<<"Adding param: '"< getRow() 90 | { 91 | vector ret; 92 | 93 | if(d_row < d_ntuples) { 94 | for (unsigned int j = 0; j < d_nfields; j++) 95 | ret.push_back(PQgetvalue(d_res, d_row, j)); 96 | } 97 | d_row++; 98 | return ret; 99 | } 100 | 101 | ~QueryResult() 102 | { 103 | if(d_res) 104 | PQclear(d_res); 105 | } 106 | PGresult* d_res=0; 107 | unsigned int d_row=0; 108 | unsigned int d_ntuples; 109 | unsigned int d_nfields; 110 | }; 111 | 112 | std::vector> 
MiniPSQL::exec(const std::string& query) 113 | { 114 | std::vector> ret; 115 | 116 | QueryResult qr(d_conn, query); 117 | for(;;) { 118 | auto row = qr.getRow(); 119 | if(row.empty()) 120 | break;; 121 | ret.push_back(row); 122 | } 123 | 124 | return ret; 125 | } 126 | 127 | std::vector> MiniPSQL::exec(const std::string& query, vector params) 128 | { 129 | std::vector> ret; 130 | 131 | QueryResult qr(d_conn, query, params); 132 | for(;;) { 133 | auto row = qr.getRow(); 134 | if(row.empty()) 135 | break;; 136 | ret.push_back(row); 137 | } 138 | 139 | return ret; 140 | } 141 | 142 | 143 | void MiniPSQL::execPrep(const std::string& table) 144 | { 145 | QueryResult qr(d_conn, table, d_params[table]); 146 | d_params[table].clear(); 147 | } 148 | 149 | void MiniPSQL::addColumn(const std::string& table, std::string_view name, std::string_view type) 150 | { 151 | // SECURITY PROBLEM - somehow we can't do prepared statements here 152 | 153 | if(!haveTable(table)) { 154 | exec("create table if not exists "+table+" ( "+(string)name+" "+(string)type+" )"); 155 | } else { 156 | // cout<<"Adding column "< > MiniPSQL::getSchema(const std::string& table) 164 | { 165 | vector> ret; 166 | 167 | auto rows = exec("SELECT column_name, udt_name FROM information_schema.columns where table_name='"+table+"'"); 168 | 169 | for(const auto& r : rows) { 170 | ret.push_back({r[0], r[1]}); 171 | } 172 | sort(ret.begin(), ret.end(), [](const auto& a, const auto& b) { 173 | return a.first < b.first; 174 | }); 175 | 176 | // cout<<"returning "< “If we have data, let’s look at data. If all we have are opinions, let’s go with mine.” - Jim Barksdale. 6 | 7 | ![There is some sample data linked from the end of this file](/sample-data/graph.png) 8 | 9 | What is this? 10 | ------------- 11 | If you publish articles, you probably want to know if people are reading 12 | them. You can count raw hits but especially if you aren't drawing a lot of 13 | traffic, many of these hits are actually bots and crawlers. 
14 | 15 | In addition, that the page got loaded doesn't tell you if the visitor 16 | actually read your work. 17 | 18 | Some time ago I was involved with a Dutch newspaper article, and they could 19 | tell me how many minutes of reading time it had generated. And this made me 20 | somewhat jealous. 21 | 22 | With these three scripts, you can instrument your pages with a tiny bit of 23 | javascript that probabilistically samples if a reader was active over the 24 | past minute (mouse/touch movement or scrolling). In the default settings, 25 | 10% of every active minute will be reported. 26 | 27 | Or in other words, if you get 10 reports, something like 100 minutes was 28 | spent reading your site. Sorta. 29 | 30 | In addition, these reports measure at what percentage of the content your 31 | reader is positioned. This helps you determine if people are making it to 32 | the end of your page or not. 33 | 34 | At the end of this page I've included some discussion how to interpret the 35 | data. 36 | 37 | Privacy 38 | ------- 39 | These scripts use no cookies and no local storage. There are no identifiers. 40 | You can run this without having to add a cookie or GDPR banner etc. However, 41 | I personally prefer to sample as little as possible. Many sites will track 42 | your every click, and even note if you switch to another tab. I find that 43 | somewhat upsetting. 44 | 45 | You can tailor your level of intrusiveness with the `reportingProbability` 46 | setting. The busier your site is the lower you can set this at and still 47 | have decent statistics. 48 | 49 | JavaScript 50 | ---------- 51 | Insert or link [audience-minutes.js](audience-minutes.js) from all pages you 52 | want to measure on. There are some settings at the beginning of the file 53 | where you can tweak how intrusive you want the measurements to be. In other 54 | words, do you want to sample 10% of every viewing minute? Or 1%? The busier 55 | your site is the lower you can set this. 
56 | 57 | In the same place where you put the javascript file, also put an empty file 58 | called `report.json`. This will receive reports of active minutes. If you 59 | don't generate this file, the browser console will show 404s which is ugly. 60 | 61 | The easiest way to parse the results is to grab them from an access.log 62 | file. 63 | 64 | AWK 65 | --- 66 | Yes, AWK! With [this little script](repextract.awk) you can trawl your access.log files and 67 | generate a CSV file that only has URLs, scroll percentages and minute 68 | counts. 69 | 70 | This CSV file has no privacy considerations, there are no IP addresses in 71 | there. Unlike your original access.log. 72 | 73 | Jupyter notebook 74 | ---------------- 75 | To turn the CSV file into graphs, use [this Jupyter 76 | script](audience.ipynb), from which you can also extract Python 3 if you 77 | don't want to run Jupyter. It is based on Pandas and Matplotlib. 78 | 79 | A few words on 'density' 80 | ------------------------ 81 | The "density" is which proportion of the samples were from that part of the 82 | page. So if everyone reads the page straight to the end, at a constant 83 | speed, the density is flat for the whole page. If it drops off, it means 84 | some people abandoned the page. If there is a particularly difficult part 85 | that takes people a long time to read, this would show up as a positive bump 86 | in the density. Conversely, if there is a list of bullet points that people 87 | can go through quickly, that would show up as a negative bump in the 88 | density. 89 | 90 | 91 | How to interpret the results 92 | ---------------------------- 93 | For starters, you'll notice that even quite high visitor numbers translate 94 | into not that many "audience minutes". This is not due to this script, it is 95 | a common disappointment. So if your calculations show that an article had 96 | 100 reading hours, this actually is quite a lot.
97 | 98 | For obvious reasons, professional media operations are not very forthcoming 99 | with these statistics on their readership. One big newspaper article I 100 | worked on garnered 75 confirmed reading hours, for example. 101 | 102 | So why are these numbers so low? For starters, you may not be seeing all 103 | readers. Perhaps the script doesn't fire on all devices. I've done some 104 | research, I don't think this is a major factor. But it could be. 105 | 106 | On the other hand, there is a ton of automated traffic coming to sites these 107 | days, lots of crawlers, bots, strange scanners etc. Mind you, some of these 108 | will even execute JavaScript! But most won't. This non-human traffic may 109 | have been inflating your numbers previously. 110 | 111 | In terms of the graph, if you have a ton of data, interpretation is easy. I 112 | find that you can even see where you put big photos in an article - these 113 | sections do not get a lot of reading minutes. 114 | 115 | If you have less data, you need to reduce the number of 'bins' in the 116 | histogram. I find that 10 bins work pretty well for general conclusions, and 117 | that might get you this: 118 | 119 | ![](/sample-data/histo10.png) 120 | 121 | If you get this, you can conclude that 1) most people that visited the URL 122 | actually wanted to read this kind of content and 2) most readers made it to 123 | the end of your article. The profile is mostly flat, with only some drop-off 124 | near the end, and no suspicious peak at the beginning. 125 | 126 | Contrast this with: 127 | 128 | ![](/sample-data/histo10b.png) 129 | 130 | This was an article that was extremely popular on HackerNews and a few other 131 | places. But we can see that readership clearly peaked in the first 10%. Lots 132 | of people decided that they had read enough at that point. This is 133 | caused by how the article got promoted, and it is not necessarily the 134 | "fault" of the writer. 
135 | 136 | If we ignore the "mistargeting", from that point on, almost everyone makes 137 | it to the end of the article. 138 | 139 | 140 | Sample data 141 | ----------- 142 | If you want to help improve these small tools, but you don't yet have a lot 143 | of data to work with, please find attached two weeks of data collected by 144 | [berthub.eu/articles](https://berthub.eu/articles) in the 145 | [sample-data](sample-data/) directory. 146 | 147 | -------------------------------------------------------------------------------- /ext/sqlitewriter/psqlwriter.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "minipsql.hh" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "psqlwriter.hh" 17 | #include 18 | using namespace std; 19 | 20 | struct DTime 21 | { 22 | void start() 23 | { 24 | d_start = std::chrono::steady_clock::now(); 25 | } 26 | uint32_t lapUsec() 27 | { 28 | auto usec = std::chrono::duration_cast(std::chrono::steady_clock::now()- d_start).count(); 29 | start(); 30 | return usec; 31 | } 32 | 33 | std::chrono::time_point d_start; 34 | }; 35 | 36 | // seconds 37 | static int waitForRWData(int fd, bool waitForRead, double* timeout, bool* error=0, bool* disconnected=0) 38 | { 39 | int ret; 40 | 41 | struct pollfd pfd; 42 | memset(&pfd, 0, sizeof(pfd)); 43 | pfd.fd = fd; 44 | 45 | if(waitForRead) 46 | pfd.events=POLLIN; 47 | else 48 | pfd.events=POLLOUT; 49 | 50 | ret = poll(&pfd, 1, timeout ? 
(*timeout * 1000) : -1); 51 | if ( ret == -1 ) { 52 | throw std::runtime_error("Waiting for data: "+std::string(strerror(errno))); 53 | } 54 | if(ret > 0) { 55 | if (error && (pfd.revents & POLLERR)) { 56 | *error = true; 57 | } 58 | if (disconnected && (pfd.revents & POLLHUP)) { 59 | *disconnected = true; 60 | } 61 | 62 | } 63 | return ret; 64 | } 65 | 66 | int waitForData(int fd, double timeout) 67 | { 68 | return waitForRWData(fd, true, &timeout); 69 | } 70 | 71 | void SetNonBlocking(int sock, bool to) 72 | { 73 | int flags=fcntl(sock,F_GETFL,0); 74 | if(flags<0) 75 | std::runtime_error(string("Retrieving socket flags: ")+ strerror(errno)); 76 | 77 | // so we could optimize to not do it if nonblocking already set, but that would be.. semantics 78 | if(to) { 79 | flags |= O_NONBLOCK; 80 | } 81 | else 82 | flags &= (~O_NONBLOCK); 83 | 84 | if(fcntl(sock, F_SETFL, flags) < 0) 85 | std::runtime_error(string("Setting socket flags: ")+ strerror(errno)); 86 | } 87 | 88 | 89 | void PSQLWriter::commitThread() 90 | { 91 | MiniPSQL mp(""); 92 | mp.exec("begin"); 93 | 94 | map>> schemas; 95 | 96 | // so how does this work 97 | // we get a stream of messages, each aimed at a single table 98 | // we group the messages by table, and check if the table exists, and if all fields exist 99 | // if not, we create tables and fields to match 100 | SetNonBlocking(d_pipe[0], true); 101 | bool needWait = false; 102 | time_t prevcommit=time(0); 103 | for(;;) { 104 | map> tabwork; // group by table 105 | DTime dt; 106 | dt.start(); 107 | int lim=0; 108 | int sumparms = 0; 109 | for(; lim < 10000 && sumparms < 60000; ++lim) { 110 | Message* msg; 111 | if(needWait) { 112 | cout<<"Waiting for data.."<values.size(); 131 | tabwork[msg->table].push_back(msg); 132 | needWait=false; 133 | } 134 | cout<<"Received "< fields; 142 | 143 | for(const auto& m : work) { 144 | for(const auto& f : m->values) { 145 | if(auto iter = fields.find(f.first); iter == fields.end()) { 146 | fields.insert(f.first); 
147 | pair cmp{f.first, std::string()}; 148 | if(!binary_search(schemas[table].begin(), schemas[table].end(), cmp, 149 | [](const auto& a, const auto& b) 150 | { 151 | return a.first < b.first; 152 | })) { 153 | cout<<"shit, we miss "<(&f.second)) { 155 | mp.addColumn(table, f.first, "REAL"); 156 | schemas[table].push_back({f.first, "REAL"}); 157 | } 158 | else if(std::get_if(&f.second)) { 159 | mp.addColumn(table, f.first, "TEXT"); 160 | schemas[table].push_back({f.first, "TEXT"}); 161 | } else { 162 | mp.addColumn(table, f.first, "BIGINT"); 163 | schemas[table].push_back({f.first, "BIGINT"}); 164 | } 165 | 166 | sort(schemas[table].begin(), schemas[table].end()); 167 | 168 | } 169 | } 170 | } 171 | } 172 | 173 | string query="insert into "+table+" ("; 174 | 175 | bool first=true; 176 | for(const auto& f : fields) { 177 | if(!first) 178 | query+=','; 179 | first=false; 180 | query+=f; 181 | cout<<" "<> allstrings; 207 | allstrings.reserve(work.size()*fields.size()); 208 | 209 | for(const auto& m : work) { 210 | for(const auto& f: fields) { 211 | if(auto iter = m->values.find(f); iter!= m->values.end()) { 212 | std::visit([&allstrings](auto&&arg) { 213 | using T = std::decay_t; 214 | if constexpr (std::is_same_v) 215 | allstrings.push_back(arg); 216 | else 217 | allstrings.push_back(to_string(arg)); 218 | }, iter->second); 219 | } 220 | else { 221 | allstrings.push_back(std::optional()); 222 | } 223 | } 224 | } 225 | cout< allptrs; 231 | allptrs.reserve(allstrings.size()); 232 | for(const auto& p : allstrings) { 233 | if(p) 234 | allptrs.push_back(p->c_str()); 235 | else 236 | allptrs.push_back(0); 237 | } 238 | 239 | // cout<<"params2 took "< 3 | #include 4 | #include "sqlite3.h" 5 | using namespace std; 6 | 7 | MiniSQLite::MiniSQLite(std::string_view fname) 8 | { 9 | if ( sqlite3_open(&fname[0], &d_sqlite)!=SQLITE_OK ) { 10 | throw runtime_error("Unable to open "+(string)fname+" for sqlite"); 11 | } 12 | exec("PRAGMA journal_mode='wal'"); 13 | 
sqlite3_busy_timeout(d_sqlite, 60000); 14 | } 15 | 16 | //! Get field names and types from a table 17 | vector > MiniSQLite::getSchema(const std::string& table) 18 | { 19 | vector> ret; 20 | 21 | auto rows = exec("SELECT cid,name,type FROM pragma_table_xinfo('"+table+"')"); 22 | 23 | for(const auto& r : rows) { 24 | ret.push_back({r[1], r[2]}); 25 | } 26 | sort(ret.begin(), ret.end(), [](const auto& a, const auto& b) { 27 | return a.first < b.first; 28 | }); 29 | 30 | // cout<<"returning "< row; 37 | row.reserve(cols); 38 | for(int n=0; n < cols ; ++n) 39 | row.push_back(colvals[n]); 40 | ((MiniSQLite*)ptr)->d_rows.push_back(row); 41 | return 0; 42 | } 43 | 44 | vector> MiniSQLite::exec(std::string_view str) 45 | { 46 | char *errmsg; 47 | std::string errstr; 48 | // int (*callback)(void*,int,char**,char**) 49 | d_rows.clear(); 50 | int rc = sqlite3_exec(d_sqlite, &str[0], helperFunc, this, &errmsg); 51 | if (rc != SQLITE_OK) { 52 | errstr = errmsg; 53 | sqlite3_free(errmsg); 54 | throw std::runtime_error("Error executing sqlite3 query '"+(string)str+"': "+errstr); 55 | } 56 | return d_rows; 57 | } 58 | 59 | void MiniSQLite::bindPrep(const std::string& table, int idx, bool value) { sqlite3_bind_int(d_stmts[table], idx, value ? 
1 : 0); } 60 | void MiniSQLite::bindPrep(const std::string& table, int idx, int value) { sqlite3_bind_int(d_stmts[table], idx, value); } 61 | void MiniSQLite::bindPrep(const std::string& table, int idx, uint32_t value) { sqlite3_bind_int64(d_stmts[table], idx, value); } 62 | void MiniSQLite::bindPrep(const std::string& table, int idx, long value) { sqlite3_bind_int64(d_stmts[table], idx, value); } 63 | void MiniSQLite::bindPrep(const std::string& table, int idx, unsigned long value) { sqlite3_bind_int64(d_stmts[table], idx, value); } 64 | void MiniSQLite::bindPrep(const std::string& table, int idx, long long value) { sqlite3_bind_int64(d_stmts[table], idx, value); } 65 | void MiniSQLite::bindPrep(const std::string& table, int idx, unsigned long long value) { sqlite3_bind_int64(d_stmts[table], idx, value); } 66 | void MiniSQLite::bindPrep(const std::string& table, int idx, double value) { sqlite3_bind_double(d_stmts[table], idx, value); } 67 | void MiniSQLite::bindPrep(const std::string& table, int idx, const std::string& value) { sqlite3_bind_text(d_stmts[table], idx, value.c_str(), value.size(), SQLITE_TRANSIENT); } 68 | 69 | 70 | void MiniSQLite::prepare(const std::string& table, string_view str) 71 | { 72 | if(d_stmts[table]) { 73 | sqlite3_finalize(d_stmts[table]); 74 | d_stmts[table] = 0; 75 | } 76 | const char* pTail; 77 | 78 | if (sqlite3_prepare_v2(d_sqlite, &str[0], -1, &d_stmts[table], &pTail ) != SQLITE_OK) { 79 | throw runtime_error("Unable to prepare query "+(string)str); 80 | } 81 | } 82 | 83 | void MiniSQLite::execPrep(const std::string& table) 84 | { 85 | int rc; 86 | for(;;) { 87 | rc = sqlite3_step(d_stmts[table]); // XXX this needs to be an error checking loop 88 | if(rc == SQLITE_DONE) 89 | break; 90 | else 91 | throw runtime_error("Sqlite error: "+std::to_string(rc)); 92 | } 93 | rc= sqlite3_reset(d_stmts[table]); 94 | if(rc != SQLITE_OK) 95 | throw runtime_error("Sqlite error: "+std::to_string(rc)); 96 | sqlite3_clear_bindings(d_stmts[table]); 
97 | } 98 | 99 | void MiniSQLite::begin() 100 | { 101 | d_intransaction=true; 102 | exec("begin"); 103 | } 104 | void MiniSQLite::commit() 105 | { 106 | d_intransaction=false; 107 | exec("commit"); 108 | } 109 | 110 | void MiniSQLite::cycle() 111 | { 112 | exec("commit;begin"); 113 | } 114 | 115 | bool MiniSQLite::haveTable(const string& table) 116 | { 117 | return !getSchema(table).empty(); 118 | } 119 | 120 | 121 | //! Add a column to a table with a certain type 122 | void MiniSQLite::addColumn(const string& table, string_view name, string_view type) 123 | { 124 | // SECURITY PROBLEM - somehow we can't do prepared statements here 125 | 126 | if(!haveTable(table)) { 127 | #if SQLITE_VERSION_NUMBER >= 3037001 128 | exec("create table if not exists '"+table+"' ( '"+(string)name+"' "+(string)type+" ) STRICT"); 129 | #else 130 | exec("create table if not exists '"+table+"' ( '"+(string)name+"' "+(string)type+" )"); 131 | #endif 132 | } else { 133 | // cout<<"Adding column "< lock(d_mutex); 147 | d_db.cycle(); 148 | } 149 | n++; 150 | } 151 | // cerr<<"Thread exiting"< cmp{name, std::string()}; 162 | return binary_search(d_columns[table].begin(), d_columns[table].end(), cmp, 163 | [](const auto& a, const auto& b) 164 | { 165 | return a.first < b.first; 166 | }); 167 | 168 | } 169 | 170 | 171 | void SQLiteWriter::addValue(const initializer_list>& values, const std::string& table) 172 | { 173 | addValueGeneric(table, values); 174 | } 175 | 176 | void SQLiteWriter::addValue(const std::vector>& values, const std::string& table) 177 | { 178 | addValueGeneric(table, values); 179 | } 180 | 181 | 182 | template 183 | void SQLiteWriter::addValueGeneric(const std::string& table, const T& values) 184 | { 185 | std::lock_guard lock(d_mutex); 186 | if(!d_db.isPrepared(table) || !equal(values.begin(), values.end(), 187 | d_lastsig[table].cbegin(), d_lastsig[table].cend(), 188 | [](const auto& a, const auto& b) 189 | { 190 | return a.first == b; 191 | })) { 192 | // cout<<"Starting a 
new prepared statement"<(&p.second)) { 199 | d_db.addColumn(table, p.first, "REAL"); 200 | d_columns[table].push_back({p.first, "REAL"}); 201 | } 202 | else if(std::get_if(&p.second)) { 203 | d_db.addColumn(table, p.first, "TEXT"); 204 | d_columns[table].push_back({p.first, "TEXT"}); 205 | } else { 206 | d_db.addColumn(table, p.first, "INT"); 207 | d_columns[table].push_back({p.first, "INT"}); 208 | } 209 | 210 | sort(d_columns[table].begin(), d_columns[table].end()); 211 | } 212 | if(!first) { 213 | q+=", "; 214 | qmarks += ", "; 215 | } 216 | first=false; 217 | q+="'"+string(p.first)+"'"; 218 | qmarks +="?"; 219 | } 220 | q+= ") values ("+qmarks+")"; 221 | 222 | d_db.prepare(table, q); 223 | 224 | d_lastsig[table].clear(); 225 | for(const auto& p : values) 226 | d_lastsig[table].push_back(p.first); 227 | } 228 | 229 | int n = 1; 230 | for(const auto& p : values) { 231 | std::visit([this, &n, &table](auto&& arg) { 232 | d_db.bindPrep(table, n, arg); 233 | }, p.second); 234 | n++; 235 | } 236 | d_db.execPrep(table); 237 | } 238 | 239 | MiniSQLite::~MiniSQLite() 240 | { 241 | // needs to close down d_sqlite3 242 | if(d_intransaction) 243 | commit(); 244 | 245 | for(auto& stmt: d_stmts) 246 | if(stmt.second) 247 | sqlite3_finalize(stmt.second); 248 | 249 | sqlite3_close(d_sqlite); // same 250 | } 251 | --------------------------------------------------------------------------------