library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
# Read the sample Apache access log straight from GitHub. The log has no
# header row, so readr names the parsed columns X1 through X9.
rawlogs <- read_log(
  file = "https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs"
)
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_integer(),
## X7 = col_integer(),
## X8 = col_character(),
## X9 = col_character()
## )
## Warning: 16 parsing failures.
## row col expected actual file
## 4031 -- 9 columns 5 columns 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 4192 -- 9 columns 5 columns 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 8897 X6 an integer U; 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 8897 X7 an integer ) 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 8897 -- 9 columns 11 columns 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## .... ... .......... .......... ...........................................................................................
## See problems(...) for more details.
library(data.table)

# Wrap the parsed log in a data.table; the later steps add and rename columns
# on it by reference.
logs <- data.table(rawlogs)

# Preview the first six rows as a markdown table.
knitr::kable(head(logs, n = 6L))
X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 |
---|---|---|---|---|---|---|---|---|
83.149.9.216 | NA | NA | 17/May/2015:10:05:03 +0000 | GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1 | 200 | 203023 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:43 +0000 | GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1 | 200 | 171717 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:47 +0000 | GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1 | 200 | 26185 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:12 +0000 | GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1 | 200 | 7697 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:07 +0000 | GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1 | 200 | 2892 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:34 +0000 | GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1 | 200 | 430406 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
Logs usually don’t have headers, so you need to rename the default columns to something more expressive.
# Apache access log fields, in column order
# (http://stackoverflow.com/questions/9234699/understanding-apache-access-log):
#   ip        %h   the remote host (i.e. the client IP)
#   identd    %l   the identity of the user determined by identd
#                  (not usually used since not reliable)
#   uname     %u   the user name determined by HTTP authentication
#   time      %t   the time the request was received
#   request   %r   the request line from the client ("GET / HTTP/1.0")
#   status    %>s  the status code sent from the server to the client
#                  (200, 404 etc.)
#   respsize  %b   the size of the response to the client (in bytes)
#   referer        the page that linked to this URL
#   agent          the browser identification string (User-agent)
setnames(
  logs,
  old = colnames(logs),
  new = c(
    "ip", "identd", "uname", "time", "request",
    "status", "respsize", "referer", "agent"
  )
)

# Preview again to confirm the new column names.
knitr::kable(head(logs, n = 6L))
ip | identd | uname | time | request | status | respsize | referer | agent |
---|---|---|---|---|---|---|---|---|
83.149.9.216 | NA | NA | 17/May/2015:10:05:03 +0000 | GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1 | 200 | 203023 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:43 +0000 | GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1 | 200 | 171717 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:47 +0000 | GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1 | 200 | 26185 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:12 +0000 | GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1 | 200 | 7697 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:07 +0000 | GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1 | 200 | 2892 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:34 +0000 | GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1 | 200 | 430406 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
R has date handling capability out of the box; however, the lubridate package makes it easier to convert strings to dates and to perform manipulations on them.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
# Parse the timestamp text into a date-time, replacing the column in place.
logs[, time := dmy_hms(time)]

# Derive calendar features from the parsed time: hour of day, day of week,
# and whether the request happened before noon.
logs[, c("hour", "wday", "morning") := list(hour(time), wday(time), am(time))]

# Day-of-week values 1 and 7 are treated as the weekend.
logs[, weekend := is.element(wday, c(1, 7))]
There are a few packages for resolving IPs:

- rgeolocate
- ggmap
- iptools
- ipapi (gh: hrbrmstr/ipapi)

Which one to use depends on API preferences, plus any additional requirements.
Play it smart - don’t call for every record, call for every unique record. Cache values where possible!
# Install ipapi from GitHub only when it is not already available.
# requireNamespace() checks for the package without attaching it as a side
# effect; the library(ipapi) call that follows does the actual loading and
# fails loudly if the install did not work.
if (!requireNamespace("ipapi", quietly = TRUE)) {
  devtools::install_github("hrbrmstr/ipapi")
}
## Loading required package: ipapi
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'ipapi'
## Using GitHub PAT from envvar GITHUB_PAT
## Downloading GitHub repo hrbrmstr/ipapi@master
## from URL https://api.github.com/repos/hrbrmstr/ipapi/zipball/master
## Installing ipapi
## Installing pbapply
## '/home/travis/R-bin/lib/R/bin/R' --no-site-file --no-environ --no-save \
## --no-restore --quiet CMD INSTALL \
## '/tmp/RtmpOwVkYn/devtools100d738e37cc3/pbapply' \
## --library='/home/travis/R/Library' --install-tests
##
## '/home/travis/R-bin/lib/R/bin/R' --no-site-file --no-environ --no-save \
## --no-restore --quiet CMD INSTALL \
## '/tmp/RtmpOwVkYn/devtools100d7740c4e06/hrbrmstr-ipapi-c612329' \
## --library='/home/travis/R/Library' --install-tests
##
library(ipapi)

# Only the distinct addresses need resolving, not every log row.
ips <- logs[, unique(ip)]

# When TRUE, read a pre-computed lookup table instead of calling the
# geolocation API.
example <- TRUE
iptblloc <- "https://raw.githubusercontent.com/stephlocke/lazyCDN/master/sampleIPtbl.csv"

# Both branches are now live: previously the API lookup was commented out, so
# ip_tbl was NULL whenever example was FALSE and the join below failed.
ip_tbl <- if (example) {
  fread(iptblloc)
} else {
  # NOTE(review): the lookup's own `status` column is dropped, presumably so it
  # does not clash with the HTTP `status` column already in logs - confirm.
  as.data.table(ipapi::geolocate(ips))[, status := NULL]
}

# Join IP results to log data. The result has one row per match for each row
# of ip_tbl, so log rows whose ip is absent from ip_tbl are not kept.
logs <- logs[ip_tbl, on = c(ip = "query")]
head(logs)
## ip identd uname time
## 1: 83.149.9.216 NA NA 2015-05-17 10:05:03
## 2: 83.149.9.216 NA NA 2015-05-17 10:05:43
## 3: 83.149.9.216 NA NA 2015-05-17 10:05:47
## 4: 83.149.9.216 NA NA 2015-05-17 10:05:12
## 5: 83.149.9.216 NA NA 2015-05-17 10:05:07
## 6: 83.149.9.216 NA NA 2015-05-17 10:05:34
## request
## 1: GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1
## 2: GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1
## 3: GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1
## 4: GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1
## 5: GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1
## 6: GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1
## status respsize
## 1: 200 203023
## 2: 200 171717
## 3: 200 26185
## 4: 200 7697
## 5: 200 2892
## 6: 200 430406
## referer
## 1: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 2: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 3: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 4: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 5: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 6: http://semicomplete.com/presentations/logstash-monitorama-2013/
## agent
## 1: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 2: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 3: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 4: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 5: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 6: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## hour wday morning weekend as city country
## 1: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 2: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 3: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 4: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 5: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 6: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## countryCode isp lat lon org region regionName
## 1: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 2: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 3: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 4: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 5: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 6: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## timezone zip
## 1: Europe/Moscow 101194
## 2: Europe/Moscow 101194
## 3: Europe/Moscow 101194
## 4: Europe/Moscow 101194
## 5: Europe/Moscow 101194
## 6: Europe/Moscow 101194
The format of the Apache request log means that the request component needs splitting up. The values are not always in quite the right format so you should always check for errors.
# Break the request line into its space-separated pieces and keep the first
# three as verb, url and scheme.
logs[, c("verb", "url", "scheme") := tstrsplit(request, " ")[1:3]]

# isolate issues!
# A row is flagged when its verb is not three or more capital letters, or when
# its scheme does not start with "HTTP".
issues <- logs[, !(verb %like% "^[A-Z]{3,}$") | !(scheme %like% "^HTTP")]

# Flagged rows are set aside in `errors`; only the clean rows stay in `logs`.
errors <- logs[issues]
logs <- logs[!issues]

library(urltools)

# Split each url into its path and its query parameters.
logs[, `:=`(path = path(url), params = parameters(url))]
Often you need to worry about steps taken over time. The data.table package gives you an easy way to add IDs to rows or groups.
# Number each request within an (ip, agent) pair in time order, and give every
# pair its own visit id. seq_len(.N) produces the same 1..n sequence as
# .SD[, .I] without having to build .SD for every group.
logs[order(time), `:=`(order = seq_len(.N), visit = .GRP), by = .(ip, agent)]

# Ten most common landing pages (the first request of each visit).
# head() returns at most ten positions, so no all-NA padding rows appear when
# there are fewer than ten distinct paths (order(-N)[1:10] would add them).
knitr::kable(logs[order == 1, .N, by = path][head(order(-N), 10)])
path | N |
---|---|
favicon.ico | 221 |
NA | 145 |
presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png | 112 |
images/web/2009/banner.png | 95 |
reset.css | 91 |
style2.css | 91 |
images/googledotcom.png | 89 |
robots.txt | 79 |
images/jordan-80.png | 73 |
projects/xdotool/xdotool.xhtml | 54 |
# Ten most common exit pages: take the last request of each visit (the row
# with the highest order), then count those rows by path.
# head() returns at most ten positions, so no all-NA padding rows appear when
# there are fewer than ten distinct paths (order(-N)[1:10] would add them).
knitr::kable(
  logs[, .SD[which.max(order)], by = visit][
    , .N, by = path
  ][head(order(-N), 10)]
)
path | N |
---|---|
favicon.ico | 215 |
NA | 149 |
presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png | 111 |
images/googledotcom.png | 94 |
images/jordan-80.png | 90 |
images/web/2009/banner.png | 90 |
style2.css | 83 |
robots.txt | 75 |
reset.css | 63 |
projects/xdotool/xdotool.xhtml | 63 |
# Ten most common pages among single-request visits: take the last request of
# each visit, keep it only when it is also the first (order == 1), and count
# by path.
# head() returns at most ten positions, so no all-NA padding rows appear when
# there are fewer than ten distinct paths (order(-N)[1:10] would add them).
knitr::kable(
  logs[, .SD[which.max(order)], by = visit][
    order == 1, .N, by = path
  ][head(order(-N), 10)]
)
path | N |
---|---|
presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png | 105 |
images/googledotcom.png | 89 |
favicon.ico | 67 |
NA | 62 |
robots.txt | 43 |
presentations/logstash-scale11x/images/logstash.png | 18 |
projects/xdotool/ | 15 |
images/jordan-80.png | 11 |
articles/dynamic-dns-with-dhcp/ | 11 |
blog/geekery/ssl-latency.html | 10 |
# Paths that returned a status of 500 or above, most frequent first, capped at
# ten rows.
# head() also copes with an empty result: 1:pmin(10, .N) becomes 1:0 there,
# which indexes position 1 of an empty ordering and yields a spurious NA row.
logs[status >= 500, .N, by = .(path, status)][head(order(-N), 10)]
## path status N
## 1: misc/Title.php.txt 500 2
## 2: projects/xdotool/ 500 1
# Gap between each request and the previous one within the same visit; the
# first request of a visit has no predecessor and gets NA.
# Units are pinned to seconds. `time - shift(time)` lets difftime choose its
# units separately for every group ("secs", "mins", ...), while the grouped
# assignment keeps a single units label for the whole column, so gaps from
# different visits could silently end up on different scales.
logs[
  order(order),
  timesinceprevrequest := difftime(time, shift(time), units = "secs"),
  by = visit
]

# Inspect the gaps for the first visit.
logs[visit == 1, .(order, time, timesinceprevrequest)]
## order time timesinceprevrequest
## 1: 2 2015-05-17 10:05:03 3 secs
## 2: 14 2015-05-17 10:05:43 9 secs
## 3: 16 2015-05-17 10:05:47 1 secs
## 4: 5 2015-05-17 10:05:12 1 secs
## 5: 3 2015-05-17 10:05:07 4 secs
## 6: 13 2015-05-17 10:05:34 1 secs
## 7: 22 2015-05-17 10:05:57 1 secs
## 8: 17 2015-05-17 10:05:50 3 secs
## 9: 7 2015-05-17 10:05:24 5 secs
## 10: 18 2015-05-17 10:05:50 0 secs
## 11: 15 2015-05-17 10:05:46 3 secs
## 12: 4 2015-05-17 10:05:11 4 secs
## 13: 6 2015-05-17 10:05:19 7 secs
## 14: 11 2015-05-17 10:05:33 3 secs
## 15: 1 2015-05-17 10:05:00 NA secs
## 16: 9 2015-05-17 10:05:25 1 secs
## 17: 23 2015-05-17 10:05:59 2 secs
## 18: 10 2015-05-17 10:05:30 5 secs
## 19: 19 2015-05-17 10:05:53 3 secs
## 20: 8 2015-05-17 10:05:24 0 secs
## 21: 20 2015-05-17 10:05:54 1 secs
## 22: 12 2015-05-17 10:05:33 0 secs
## 23: 21 2015-05-17 10:05:56 2 secs
## order time timesinceprevrequest
library(ggmap)
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
## The following object is masked from 'package:magrittr':
##
## inset
# World map with one point per distinct (lon, lat) pair in the logs, sized by
# the number of requests from that location.
ggplot(data = map_data("world")) +
  geom_polygon(
    mapping = aes(x = long, y = lat, group = group),
    fill = "grey90",
    colour = "white"
  ) +
  geom_point(
    mapping = aes(x = lon, y = lat, size = N),
    data = logs[, .N, by = .(lon, lat)],
    color = "#2165B6"
  ) +
  xlab("") +
  ylab("") +
  theme_minimal() +
  theme(legend.position = "top")
library(ggplot2)
# Add tile-heatmap layers to an existing ggplot object.
#
# ggplot: the plot object the layers are added to.
# size:   accepted for callers that pass it, but not used in the body.
#
# Returns the plot with an equal-aspect coordinate system, white-bordered
# tiles, no axis or title labels, x breaks every 6 units from 0 to 24, and the
# default gradient fill scale.
heatmap <- function(ggplot, size = 20) {
  # Components are added to the plot in the order they are listed here.
  tile_parts <- list(
    coord_equal(),
    geom_tile(color = "white", size = 0.1),
    labs(x = NULL, y = NULL, title = NULL),
    scale_x_continuous(breaks = seq(0, 24, 6)),
    scale_fill_gradient()
  )
  ggplot + tile_parts
}
# Number of requests for every (country, hour of day) combination.
ip_activity <- logs[, .N, by = .(country, hour)]

# Base plot restricted to countries whose name starts with "A"; the tile
# layers are added by heatmap().
ga <- ggplot(
  data = ip_activity[like(country, "^A")],
  mapping = aes(x = hour, y = country, fill = N)
)
heatmap(ga)
library(DiagrammeR)
# Build node and edge tables describing how visitors move between popular pages.
#
# Nodes: count requests per path, number every path by its row position in that
# per-path table, then keep only paths requested more than 50 times.
URLids<-logs[,.N,.(labels_col=path)][,nodes:=.I][N>50]
# Attach the node id to every log row by path; rows whose path was filtered out
# above get no node id and are dropped. Only visit, order and node id are kept.
activity<-URLids[logs, on=c(labels_col="path")][
!is.na(nodes),.(visit, order, nodes)]
# Get a cross join of activity: every kept request is paired with every kept
# request of the same visit. Columns from the right-hand copy carry an "i." prefix.
moves<-activity[activity, on=c("visit"), allow.cartesian=TRUE][
# Filter to only include next site: keep pairs where the left-hand request
# comes exactly one step before the right-hand (i.) request.
order==i.order-1][
# Get nodes and position: count the transitions for each (from, to) node pair,
# then scale the counts into a pen width so that the largest count maps to 10.
,.(tooltip=.N),.(from=nodes,to=i.nodes)][,penwidth:=10*tooltip/max(tooltip)]
## Bug in DiagrammeR latest v :-/
# The graph rendering below is left disabled because of that bug.
#
#gr<-create_graph(URLids, moves)
#
#render_graph(gr)
# The ten countries with the most requests.
# head() returns at most ten positions, so top10 holds no NA entries when the
# logs cover fewer than ten countries (order(-N)[1:10] would add them, and the
# %in% filter below would then also match rows with a missing country).
top10 <- logs[, .N, by = country][head(order(-N), 10), country]

# Requests per country in five-minute buckets.
# The bucket column is named explicitly. It was previously called "xts" only
# because data.table derived a name from the grouping expression, apparently
# from its xts:: prefix. The name is kept so that the plot below, and anything
# else reading tz_ts, keeps working.
tz_ts <- logs[
  country %in% top10,
  .N,
  by = .(country, xts = xts::align.time(time, n = 60 * 5))
]

# One panel per country, each with its own y scale, showing the raw counts and
# a smoothed trend.
ggplot(tz_ts, aes(x = xts, y = N, group = 1)) +
  geom_line() +
  geom_smooth() +
  facet_wrap(~country, scales = "free_y")
## `geom_smooth()` using method = 'loess'
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following objects are masked from 'package:data.table':
##
## first, last
# Requests per one-minute bucket, counted over the rows in time order.
ts <- logs[order(time), .N, by = .(time = xts::align.time(time, n = 60))]

# Turn the counts into an xts series indexed by bucket time and plot it.
xts_df <- xts(x = ts[["N"]], order.by = ts[["time"]])
plot(xts_df)
# Install AnomalyDetection from GitHub only when it is missing, instead of
# downloading and rebuilding it on every run.
if (!requireNamespace("AnomalyDetection", quietly = TRUE)) {
  devtools::install_github("twitter/AnomalyDetection")
}
## Using GitHub PAT from envvar GITHUB_PAT
## Downloading GitHub repo twitter/AnomalyDetection@master
## from URL https://api.github.com/repos/twitter/AnomalyDetection/zipball/master
## Installing AnomalyDetection
## '/home/travis/R-bin/lib/R/bin/R' --no-site-file --no-environ --no-save \
## --no-restore --quiet CMD INSTALL \
## '/tmp/RtmpOwVkYn/devtools100d75b506f5f/twitter-AnomalyDetection-1f5deaa' \
## --library='/home/travis/R/Library' --install-tests
##
library(AnomalyDetection)

# Detect anomalies in the per-minute request counts.
# The rows are ordered by time before counting, so the buckets reach the
# detector as a chronological series. Groups come out in first-appearance
# order and the log rows are not in time order, so without the ordering the
# series would be shuffled.
# setDF() converts the result to a plain data.frame: first column the bucket
# time, second column the count.
AnomalyDetectionTs(
  setDF(logs[order(time), .N, by = align.time(time)]),
  max_anoms = 0.05,
  direction = "both",
  plot = TRUE
)
## $anoms
## timestamp anoms
## 1 2015-05-17 10:06:00 74
## 2 2015-05-18 10:06:00 132
## 3 2015-05-19 14:06:00 134
## 4 2015-05-20 21:06:00 78
##
## $plot