#!/bin/sh
#
# Estimate how many subscribers read the feed with each feed reader,
# based on the web server access log.
#
# Inputs (all relative to the current directory):
#   ./get_last_days   helper, invoked as "./get_last_days 7 access_log"
#   access_log        each line is expected to hold a dotted-quad client
#                     address followed by a space, and to end with the
#                     double-quoted user agent
#   agents_to_ignore  extended regexps (one per line) of agents to drop
# Output (stdout):
#   "<count> <reader>" lines, count right aligned, sorted by ascending
#   count and truncated to 80 characters.

# Byte-wise matching and sorting, for speed.  LC_ALL is exported as well
# because an inherited LC_ALL would otherwise override LANG.
export LANG=C LC_ALL=C

feed="/feed/rss2.xml"

# A literal tab.  "\t" inside sed expressions and "echo -e" are not portable
# across /bin/sh and sed implementations, so the character is built once here.
tab=$(printf '\t')

# Reduce a log line to "<first 16 bits of ip><TAB><user agent>".
net_and_agent() {
  sed 's/\([0-9]*\.[0-9]*\)\.[0-9]*\.[0-9]* .*"\([^"]*\)"$/\1'"$tab"'\2/'
}

# Collapse the version-specific agent strings of common readers to one name.
# Input lines are "<count> <net><TAB><user agent>"; everything after the tab
# is replaced by the short reader name.
merge_reader_variants() {
  sed "
    s/\([^$tab]\)$tab.*Firefox.*/\1${tab}Firefox/
    s/\([^$tab]\)$tab.*MSIE 7.0.*/\1${tab}IE7/
    s/\([^$tab]\)$tab.*Opera.*/\1${tab}Opera/
    s/\([^$tab]\)$tab.*Akregator.*/\1${tab}Akregator/
    s/\([^$tab]\)$tab.*Thunderbird.*/\1${tab}Thunderbird/
    s/\([^$tab]\)$tab.*Liferea.*/\1${tab}Liferea/
    s/\([^$tab]\)$tab.*Google Desktop.*/\1${tab}Google Desktop/
  "
}

# Rewrite lines of online readers that report "N subscribers" in their agent
# string to "<N> <reader url>", then keep one line per reader.
online_reader_counts() {
  # 1st expression: append the numeric feed id to the feedfetcher URL so that
  #                 different feeds read by google stay distinguishable.
  # 2nd expression: replace the request count with the reported subscriber
  #                 count and keep only the reader URL.
  sed -e 's/\(.*\)\(feedfetcher.html\)\(.*\)id=\([0-9]*\).*/\1\2.\4\3/' \
      -e 's/ *[0-9]* .*\(http[^;]*\).* \([0-9]*\) subscriber.*/     \2 \1/' |
  # uniq only merges adjacent lines, and the reported subscriber count comes
  # before the feed id in the original agent string, so lines of one reader
  # are not necessarily adjacent here.  Group by reader, highest count first,
  # so the largest count reported in the period is the one that is kept.
  sort -k2 -k1,1nr |
  uniq -f1
}

# NOTE(review): presumably get_last_days prints the last 7 days of
# access_log; the estimate assumes all subscribers check once a week.
./get_last_days 7 access_log |
  # filter on those accessing feed URL
  grep -F -- "GET $feed" |
  # exclude browsers that refer to (click) feed from site
  # NOTE(review): the pattern needs a space directly after the browser token;
  # "rv:" and "AppleWebKit/" are normally followed by a version number, so
  # those two alternatives may never match -- confirm the intent.
  grep -vE "pixelbeat.org.*(rv:|MSIE|AppleWebKit/|Konqueror|Opera) .* " |
  # extract first 16 bits of ip & user_agent
  net_and_agent |
  # sort by agent, then by ip net
  sort -k2 -k1,1 |
  # merge and count all requests from same user agent at a particular net
  uniq -c |
  # ignore single requests from browsers (anchored, and independent of how
  # wide uniq pads its count column)
  grep -vE "^ *1 .*(rv:|MSIE|AppleWebKit/|Konqueror|Opera)" |
  # ignore bots
  grep -vE -f agents_to_ignore |
  # merge reader variants
  merge_reader_variants |
  # select just agent strings (tab is cut's default delimiter)
  cut -f2 |
  # group agent strings
  sort |
  # count number of subscribers using each agent
  uniq -c |
  # use the subscriber counts reported by online readers
  online_reader_counts |
  # sort by subscriber numbers
  sort -k1,1n |
  # right align numbers, then truncate lines to 80 chars
  # (80 is hard coded because $COLUMNS is not exported)
  sed -e "s/^/      /; s/ *\([ 0-9]\{7,\}\) \([^ ].*\)/\1 \2/" \
      -e "s/\(.\{80\}\).*/\1/"
