1 Read log data

  • readr::read_log is more forgiving than data.table::fread
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
# Read the sample Apache access log straight from GitHub. Rows that do not fit
# the expected layout are reported as parsing failures rather than stopping the read.
rawlogs <- read_log(
  file = "https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs"
)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character(),
##   X4 = col_character(),
##   X5 = col_character(),
##   X6 = col_integer(),
##   X7 = col_integer(),
##   X8 = col_character(),
##   X9 = col_character()
## )
## Warning: 16 parsing failures.
##  row col   expected     actual                                                                                        file
## 4031  -- 9 columns  5 columns  'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 4192  -- 9 columns  5 columns  'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 8897  X6 an integer U;         'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 8897  X7 an integer )          'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 8897  -- 9 columns  11 columns 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## .... ... .......... .......... ...........................................................................................
## See problems(...) for more details.
library(data.table)
# Work with the parsed log as a data.table from here on, and preview the
# first six rows as a formatted table.
logs <- data.table(rawlogs)
knitr::kable(x = head(logs, n = 6L))
X1 X2 X3 X4 X5 X6 X7 X8 X9
83.149.9.216 NA NA 17/May/2015:10:05:03 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1 200 203023 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:43 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1 200 171717 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:47 +0000 GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1 200 26185 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:12 +0000 GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1 200 7697 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:07 +0000 GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1 200 2892 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:34 +0000 GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1 200 430406 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36

1.1 Rename columns

Logs usually don’t have headers so you need to update the default column titles to something more expressive.

# Field meanings, after
# http://stackoverflow.com/questions/9234699/understanding-apache-access-log
#   ip       (%h)  remote host, i.e. the client IP
#   identd   (%l)  identity of the user determined by identd
#                  (not usually used since not reliable)
#   uname    (%u)  user name determined by HTTP authentication
#   time     (%t)  time the request was received
#   request  (%r)  request line from the client ("GET / HTTP/1.0")
#   status   (%>s) status code sent from the server to the client (200, 404 etc.)
#   respsize (%b)  size of the response to the client (in bytes)
#   referer        page that linked to this URL
#   agent          browser identification string (User-agent)
setnames(
  logs,
  old = names(logs),
  new = c(
    "ip", "identd", "uname", "time", "request",
    "status", "respsize", "referer", "agent"
  )
)

knitr::kable(x = head(logs))
ip identd uname time request status respsize referer agent
83.149.9.216 NA NA 17/May/2015:10:05:03 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1 200 203023 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:43 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1 200 171717 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:47 +0000 GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1 200 26185 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:12 +0000 GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1 200 7697 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:07 +0000 GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1 200 2892 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
83.149.9.216 NA NA 17/May/2015:10:05:34 +0000 GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1 200 430406 http://semicomplete.com/presentations/logstash-monitorama-2013/ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36

2 Time handling

R has date handling capability out of the box; however, the lubridate package makes it easier to convert strings to dates and to perform manipulations on them.

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
# Parse the day/month/year hour:minute:second log timestamp into a date-time.
logs[, time := dmy_hms(time)]

# Derive the calendar features in a single pass. The weekday number is
# computed once and reused for the weekend flag (days 1 and 7; the sample
# output shows Sunday 17 May 2015 as day 1).
logs[, c("hour", "wday", "morning", "weekend") := {
  day_of_week <- wday(time)
  list(hour(time), day_of_week, am(time), day_of_week %in% c(1, 7))
}]

3 Geolocation packages

There are a few packages for resolving IPs: - rgeolocate - ggmap - iptools - ipapi (gh: hrbrmstr/ipapi)

Which one to use depends on API preferences, plus any additional requirements.

Play it smart - don’t call for every record, call for every unique record. Cache values where possible!

# Install ipapi from GitHub only when it is not already available.
# requireNamespace() tests for the package without attaching it and without
# the misleading "Loading required package" side effect of require(); the
# package is attached explicitly by the library() call that follows.
if (!requireNamespace("ipapi", quietly = TRUE)) {
  devtools::install_github("hrbrmstr/ipapi")
}
## Loading required package: ipapi
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'ipapi'
## Using GitHub PAT from envvar GITHUB_PAT
## Downloading GitHub repo hrbrmstr/ipapi@master
## from URL https://api.github.com/repos/hrbrmstr/ipapi/zipball/master
## Installing ipapi
## Installing pbapply
## '/home/travis/R-bin/lib/R/bin/R' --no-site-file --no-environ --no-save  \
##   --no-restore --quiet CMD INSTALL  \
##   '/tmp/RtmpOwVkYn/devtools100d738e37cc3/pbapply'  \
##   --library='/home/travis/R/Library' --install-tests
## 
## '/home/travis/R-bin/lib/R/bin/R' --no-site-file --no-environ --no-save  \
##   --no-restore --quiet CMD INSTALL  \
##   '/tmp/RtmpOwVkYn/devtools100d7740c4e06/hrbrmstr-ipapi-c612329'  \
##   --library='/home/travis/R/Library' --install-tests
## 
library(ipapi)
# Look up each distinct IP only once rather than once per log line.
ips <- logs[, unique(ip)]

# When TRUE, use a pre-computed sample lookup table instead of the live API.
example <- TRUE
iptblloc <- "https://raw.githubusercontent.com/stephlocke/lazyCDN/master/sampleIPtbl.csv"

# An `if` without `else` yields NULL, which would make the join below fail
# whenever `example` is FALSE, so the live lookup is the explicit fallback.
ip_tbl <- if (example) {
  fread(iptblloc)
} else {
  ipapi::geolocate(ips)[, status := NULL]
}

# Join IP results to log data
logs <- logs[ip_tbl, on = c(ip = "query")]
head(logs)
##              ip identd uname                time
## 1: 83.149.9.216     NA    NA 2015-05-17 10:05:03
## 2: 83.149.9.216     NA    NA 2015-05-17 10:05:43
## 3: 83.149.9.216     NA    NA 2015-05-17 10:05:47
## 4: 83.149.9.216     NA    NA 2015-05-17 10:05:12
## 5: 83.149.9.216     NA    NA 2015-05-17 10:05:07
## 6: 83.149.9.216     NA    NA 2015-05-17 10:05:34
##                                                                               request
## 1:      GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1
## 2:  GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1
## 3: GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1
## 4:        GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1
## 5:         GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1
## 6:          GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1
##    status respsize
## 1:    200   203023
## 2:    200   171717
## 3:    200    26185
## 4:    200     7697
## 5:    200     2892
## 6:    200   430406
##                                                            referer
## 1: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 2: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 3: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 4: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 5: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 6: http://semicomplete.com/presentations/logstash-monitorama-2013/
##                                                                                                                      agent
## 1: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 2: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 3: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 4: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 5: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 6: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
##    hour wday morning weekend                   as   city country
## 1:   10    1    TRUE    TRUE AS25159 PJSC MegaFon Moscow  Russia
## 2:   10    1    TRUE    TRUE AS25159 PJSC MegaFon Moscow  Russia
## 3:   10    1    TRUE    TRUE AS25159 PJSC MegaFon Moscow  Russia
## 4:   10    1    TRUE    TRUE AS25159 PJSC MegaFon Moscow  Russia
## 5:   10    1    TRUE    TRUE AS25159 PJSC MegaFon Moscow  Russia
## 6:   10    1    TRUE    TRUE AS25159 PJSC MegaFon Moscow  Russia
##    countryCode          isp     lat     lon          org region regionName
## 1:          RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon    MOW     Moscow
## 2:          RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon    MOW     Moscow
## 3:          RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon    MOW     Moscow
## 4:          RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon    MOW     Moscow
## 5:          RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon    MOW     Moscow
## 6:          RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon    MOW     Moscow
##         timezone    zip
## 1: Europe/Moscow 101194
## 2: Europe/Moscow 101194
## 3: Europe/Moscow 101194
## 4: Europe/Moscow 101194
## 5: Europe/Moscow 101194
## 6: Europe/Moscow 101194

4 URL handling

The format of the Apache request log means that the request component needs splitting up. The values are not always in quite the right format so you should always check for errors.

# Split each request line on spaces and keep the first three pieces as
# verb, url and scheme.
logs[, c("verb", "url", "scheme") := tstrsplit(request, " ")[1:3]]

# Flag a row when the verb is not three or more capital letters, or the
# scheme does not start with "HTTP"; set flagged rows aside in `errors`.
issues <- logs[, !(verb %like% "^[A-Z]{3,}$") | !(scheme %like% "^HTTP")]
errors <- logs[issues]
logs <- logs[!issues]

library(urltools)
# Break the URL into its path and its query parameters.
logs[, `:=`(path = path(url), params = parameters(url))]

5 Event steps

Often you need to worry about steps taken over time. The data.table package gives you an easy way to add IDs to rows or groups.

# Treat each (ip, agent) pair as one visit. Rows are processed in time order,
# so `order` is the step number within the visit and `visit` is the group id.
# seq_len(.N) yields the same 1..n sequence as .SD[, .I] without building
# .SD for every group.
logs[order(time), `:=`(order = seq_len(.N), visit = .GRP), by = .(ip, agent)]

5.1 Most common landing pages

# Landing page = the first request (order == 1) of each visit.
# head() caps the table at ten rows; indexing with [1:10] would pad the
# result with NA rows whenever fewer than ten distinct paths exist.
knitr::kable(
  logs[order == 1, .N, path][head(order(-N), 10L)]
)
path N
favicon.ico 221
NA 145
presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png 112
images/web/2009/banner.png 95
reset.css 91
style2.css 91
images/googledotcom.png 89
robots.txt 79
images/jordan-80.png 73
projects/xdotool/xdotool.xhtml 54

5.2 Most common exit pages

# Exit page = the last request (highest order) of each visit.
# head() caps the table at ten rows; indexing with [1:10] would pad the
# result with NA rows whenever fewer than ten distinct paths exist.
knitr::kable(
  logs[, .SD[which.max(order)], visit][, .N, path][head(order(-N), 10L)]
)
path N
favicon.ico 215
NA 149
presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png 111
images/googledotcom.png 94
images/jordan-80.png 90
images/web/2009/banner.png 90
style2.css 83
robots.txt 75
reset.css 63
projects/xdotool/xdotool.xhtml 63

5.3 Most common bounce pages

# Bounce = a visit whose last request is also its first (order == 1), i.e.
# a single-request visit.
# head() caps the table at ten rows; indexing with [1:10] would pad the
# result with NA rows whenever fewer than ten distinct paths exist.
knitr::kable(
  logs[, .SD[which.max(order)], visit][order == 1, .N, path][head(order(-N), 10L)]
)
path N
presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png 105
images/googledotcom.png 89
favicon.ico 67
NA 62
robots.txt 43
presentations/logstash-scale11x/images/logstash.png 18
projects/xdotool/ 15
images/jordan-80.png 11
articles/dynamic-dns-with-dhcp/ 11
blog/geekery/ssl-latency.html 10

5.4 Most common error pages

# Paths that returned a 5xx status, most frequent first, at most ten rows.
# head() also handles the no-errors case, where 1:pmin(10, .N) would become
# c(1, 0) and produce an NA row.
logs[status >= 500, .N, .(path, status)][head(order(-N), 10L)]
##                  path status N
## 1: misc/Title.php.txt    500 2
## 2:  projects/xdotool/    500 1

5.5 Time since last request

# Gap between consecutive requests within a visit; the first request of a
# visit has no predecessor and gets NA.
# The units are fixed to seconds: `time - shift(time)` lets each group pick
# its own unit (secs, mins, hours, days), so a grouped := could store values
# measured in different units in one column.
logs[
  order(order),
  timesinceprevrequest := difftime(time, shift(time), units = "secs"),
  by = visit
]
logs[visit == 1, .(order, time, timesinceprevrequest)]
##     order                time timesinceprevrequest
##  1:     2 2015-05-17 10:05:03               3 secs
##  2:    14 2015-05-17 10:05:43               9 secs
##  3:    16 2015-05-17 10:05:47               1 secs
##  4:     5 2015-05-17 10:05:12               1 secs
##  5:     3 2015-05-17 10:05:07               4 secs
##  6:    13 2015-05-17 10:05:34               1 secs
##  7:    22 2015-05-17 10:05:57               1 secs
##  8:    17 2015-05-17 10:05:50               3 secs
##  9:     7 2015-05-17 10:05:24               5 secs
## 10:    18 2015-05-17 10:05:50               0 secs
## 11:    15 2015-05-17 10:05:46               3 secs
## 12:     4 2015-05-17 10:05:11               4 secs
## 13:     6 2015-05-17 10:05:19               7 secs
## 14:    11 2015-05-17 10:05:33               3 secs
## 15:     1 2015-05-17 10:05:00              NA secs
## 16:     9 2015-05-17 10:05:25               1 secs
## 17:    23 2015-05-17 10:05:59               2 secs
## 18:    10 2015-05-17 10:05:30               5 secs
## 19:    19 2015-05-17 10:05:53               3 secs
## 20:     8 2015-05-17 10:05:24               0 secs
## 21:    20 2015-05-17 10:05:54               1 secs
## 22:    12 2015-05-17 10:05:33               0 secs
## 23:    21 2015-05-17 10:05:56               2 secs
##     order                time timesinceprevrequest

6 Visualising

6.1 Mapping

library(ggmap)
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
## 
##     wind
## The following object is masked from 'package:magrittr':
## 
##     inset
# World outline with one point per distinct (lon, lat) pair, sized by the
# number of log rows at that location.
ggplot(data = map_data("world")) +
  geom_polygon(
    mapping = aes(x = long, y = lat, group = group),
    fill = "grey90",
    colour = "white"
  ) +
  geom_point(
    mapping = aes(x = lon, y = lat, size = N),
    data = logs[, .N, by = .(lon, lat)],
    colour = "#2165B6"
  ) +
  labs(x = "", y = "") +
  theme_minimal() +
  theme(legend.position = "top")

6.2 Heatmap

library(ggplot2)

#' Turn a prepared ggplot object into a tile heatmap.
#'
#' Adds white-bordered tiles, an equal-aspect coordinate system, x-axis
#' breaks every 6 units from 0 to 24, a default fill gradient, and removes
#' the axis labels and title.
#'
#' @param ggplot A ggplot object. The tile layer sets no aesthetics of its
#'   own, so the mappings presumably come from this object.
#' @param size Not used by the body; kept so existing calls still work.
#' @return The input plot with the heatmap components added.
heatmap <- function(ggplot, size = 20) {
  ggplot +
    geom_tile(color = "white", size = 0.1) +
    coord_equal() +
    scale_x_continuous(breaks = seq(from = 0, to = 24, by = 6)) +
    scale_fill_gradient() +
    labs(x = NULL, y = NULL, title = NULL)
}

# Request counts per country and hour of day.
ip_activity <- logs[, .N, by = .(country, hour)]

# Plot only the countries whose name starts with "A".
ga <- ggplot(
  data = ip_activity[country %like% "^A"],
  mapping = aes(x = hour, y = country, fill = N)
)
heatmap(ga)

6.3 Flow Diagram

library(DiagrammeR)

# Give every distinct path a node id (its row number in the per-path count
# table), then keep only the paths requested more than 50 times. The ids are
# assigned before filtering, so they are not renumbered afterwards.
URLids <- logs[, .N, by = .(labels_col = path)][, nodes := .I][N > 50]

# Attach the node id to each log row; rows whose path was filtered out above
# have no node id and are dropped.
activity <- URLids[logs, on = c(labels_col = "path")][
  !is.na(nodes), list(visit, order, nodes)
]

# Pair every request with every other request of the same visit
moves <- activity[activity, on = "visit", allow.cartesian = TRUE][
  # keep only the pairs where one request directly follows the other
  order == i.order - 1
][
  # count the transitions between each pair of nodes
  , list(tooltip = .N), by = list(from = nodes, to = i.nodes)
][
  # scale the edge width relative to the most frequent transition
  , penwidth := 10 * tooltip / max(tooltip)
]

## Bug in DiagrammeR latest v :-/ 
#
#gr<-create_graph(URLids, moves)
#
#render_graph(gr)

7 Time series

# The ten countries with the most requests. head() avoids the NA entries
# that [1:10] would produce when fewer than ten countries are present.
top10 <- logs[, .N, country][head(order(-N), 10L), country]

# Requests per country in 5-minute buckets. The bucket column is named `xts`
# explicitly: the plot below maps x to `xts`, which previously only worked
# because data.table happened to auto-name the unnamed grouping expression
# that way.
tz_ts <- logs[
  country %in% top10,
  .N,
  by = .(country, xts = xts::align.time(time, n = 60 * 5))
]

ggplot(tz_ts, aes(x = xts, y = N, group = 1)) +
  geom_line() +
  geom_smooth() +
  facet_wrap(~country, scales = "free_y")
## `geom_smooth()` using method = 'loess'

library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following objects are masked from 'package:data.table':
## 
##     first, last
# Requests per aligned 60-second bucket, in chronological order, turned into
# an xts series and plotted.
ts <- logs[order(time), .N, by = .(time = xts::align.time(time, n = 60))]
xts_df <- xts(x = ts[["N"]], order.by = ts[["time"]])
plot(xts_df)

# Install AnomalyDetection from GitHub only when it is missing, instead of
# downloading and rebuilding it on every run.
if (!requireNamespace("AnomalyDetection", quietly = TRUE)) {
  devtools::install_github("twitter/AnomalyDetection")
}
## Using GitHub PAT from envvar GITHUB_PAT
## Downloading GitHub repo twitter/AnomalyDetection@master
## from URL https://api.github.com/repos/twitter/AnomalyDetection/zipball/master
## Installing AnomalyDetection
## '/home/travis/R-bin/lib/R/bin/R' --no-site-file --no-environ --no-save  \
##   --no-restore --quiet CMD INSTALL  \
##   '/tmp/RtmpOwVkYn/devtools100d75b506f5f/twitter-AnomalyDetection-1f5deaa'  \
##   --library='/home/travis/R/Library' --install-tests
## 
library(AnomalyDetection)
# Count requests per aligned time bucket and look for anomalies in both
# directions, allowing at most 5% of points to be flagged.
# Rows are ordered by time before aggregating so the series is chronological;
# the earlier join leaves `logs` in lookup-table order, not time order.
AnomalyDetectionTs(
  setDF(logs[order(time), .N, align.time(time)]),
  max_anoms = 0.05,
  direction = "both",
  plot = TRUE
)
## $anoms
##             timestamp anoms
## 1 2015-05-17 10:06:00    74
## 2 2015-05-18 10:06:00   132
## 3 2015-05-19 14:06:00   134
## 4 2015-05-20 21:06:00    78
## 
## $plot