library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
# Read the sample Apache access log straight from GitHub. read_log() guesses
# the column types; a handful of malformed rows are reported as parsing
# failures (inspect them with problems(rawlogs)).
rawlogs <- read_log(
  file = "https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs"
)
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_integer(),
## X7 = col_integer(),
## X8 = col_character(),
## X9 = col_character()
## )
## Warning: 16 parsing failures.
## row col expected actual file
## 4031 -- 9 columns 5 columns 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 4192 -- 9 columns 5 columns 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 8897 X6 an integer U; 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 8897 X7 an integer ) 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## 8897 -- 9 columns 11 columns 'https://raw.githubusercontent.com/elastic/examples/master/ElasticStack_apache/apache_logs'
## .... ... .......... .......... ...........................................................................................
## See problems(...) for more details.
library(data.table)
# Wrap the parsed log in a data.table so later steps can add columns and join
# by reference.
logs <- data.table(rawlogs)
# Show the first six rows as a formatted table.
knitr::kable(head(logs, n = 6L))
X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 |
---|---|---|---|---|---|---|---|---|
83.149.9.216 | NA | NA | 17/May/2015:10:05:03 +0000 | GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1 | 200 | 203023 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:43 +0000 | GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1 | 200 | 171717 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:47 +0000 | GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1 | 200 | 26185 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:12 +0000 | GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1 | 200 | 7697 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:07 +0000 | GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1 | 200 | 2892 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:34 +0000 | GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1 | 200 | 430406 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
Logs usually don’t have headers, so you need to replace the default column titles with something more expressive.
# Replace the default X1..X9 headers with names matching the Apache log
# format fields, in column order
# (http://stackoverflow.com/questions/9234699/understanding-apache-access-log):
#   ip       - %h, the remote host (i.e. the client IP)
#   identd   - %l, the identity of the user determined by identd
#              (not usually used since not reliable)
#   uname    - %u, the user name determined by HTTP authentication
#   time     - %t, the time the request was received
#   request  - %r, the request line from the client ("GET / HTTP/1.0")
#   status   - %>s, the status code sent from the server to the client
#              (200, 404 etc.)
#   respsize - %b, the size of the response to the client (in bytes)
#   referer  - Referer, the page that linked to this URL
#   agent    - User-agent, the browser identification string
setnames(
  logs,
  old = names(logs),
  new = c("ip", "identd", "uname", "time", "request", "status", "respsize", "referer", "agent")
)
knitr::kable(head(logs))
ip | identd | uname | time | request | status | respsize | referer | agent |
---|---|---|---|---|---|---|---|---|
83.149.9.216 | NA | NA | 17/May/2015:10:05:03 +0000 | GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1 | 200 | 203023 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:43 +0000 | GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1 | 200 | 171717 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:47 +0000 | GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1 | 200 | 26185 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:12 +0000 | GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1 | 200 | 7697 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:07 +0000 | GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1 | 200 | 2892 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
83.149.9.216 | NA | NA | 17/May/2015:10:05:34 +0000 | GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1 | 200 | 430406 | http://semicomplete.com/presentations/logstash-monitorama-2013/ | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36 |
R has date handling capability out of the box; however, the lubridate package makes it easier to convert strings to dates and to perform manipulations on them.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
# Parse the raw "17/May/2015:10:05:03 +0000" strings into date-times in place.
logs[, time := dmy_hms(time)]
# Derive hour of day, day of week and an AM flag from the parsed timestamp.
logs[, c("hour", "wday", "morning") := list(hour(time), wday(time), am(time))]
# Flag weekends; relies on lubridate's default numbering, where 1 is Sunday
# and 7 is Saturday.
logs[, weekend := wday %in% c(1, 7)]
There are a few packages for resolving IPs: rgeolocate, ggmap, iptools, and ipapi (gh: hrbrmstr/ipapi).
Which one to use depends on API preferences, plus any additional requirements.
Play it smart - don’t call the lookup service for every record; call it once for each unique IP address. Cache values where possible!
# Install ipapi from GitHub only when it is not already installed.
# requireNamespace() tests for availability without attaching the package and
# without the "there is no package called" warning that require() emits; the
# package is attached afterwards with library().
if (!requireNamespace("ipapi", quietly = TRUE)) {
  devtools::install_github("hrbrmstr/ipapi")
}
## Loading required package: ipapi
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'ipapi'
## Using GitHub PAT from envvar GITHUB_PAT
## Downloading GitHub repo hrbrmstr/ipapi@master
## from URL https://api.github.com/repos/hrbrmstr/ipapi/zipball/master
## Installing ipapi
## Installing pbapply
## '/home/travis/R-bin/lib/R/bin/R' --no-site-file --no-environ --no-save \
## --no-restore --quiet CMD INSTALL \
## '/tmp/RtmpOwVkYn/devtools100d738e37cc3/pbapply' \
## --library='/home/travis/R/Library' --install-tests
##
## '/home/travis/R-bin/lib/R/bin/R' --no-site-file --no-environ --no-save \
## --no-restore --quiet CMD INSTALL \
## '/tmp/RtmpOwVkYn/devtools100d7740c4e06/hrbrmstr-ipapi-c612329' \
## --library='/home/travis/R/Library' --install-tests
##
library(ipapi)
# Resolve each distinct IP only once rather than once per log row.
ips <- logs[, unique(ip)]

# When TRUE, use a cached sample lookup table instead of calling the live API.
example <- TRUE
iptblloc <- "https://raw.githubusercontent.com/stephlocke/lazyCDN/master/sampleIPtbl.csv"

# Both branches must yield a table keyed by `query`; without the else branch
# ip_tbl would be NULL when example is FALSE and the join below would fail.
ip_tbl <- if (example) {
  fread(iptblloc)
} else {
  # NOTE(review): assumes geolocate() returns a data.table with `query` and
  # `status` columns, as in the sample table - confirm against ipapi.
  ipapi::geolocate(ips)[, status := NULL]
}

# Join IP results to log data. The result is driven by the rows of ip_tbl, so
# log rows whose IP is missing from ip_tbl are not kept.
logs <- logs[ip_tbl, on = c(ip = "query")]
head(logs)
## ip identd uname time
## 1: 83.149.9.216 NA NA 2015-05-17 10:05:03
## 2: 83.149.9.216 NA NA 2015-05-17 10:05:43
## 3: 83.149.9.216 NA NA 2015-05-17 10:05:47
## 4: 83.149.9.216 NA NA 2015-05-17 10:05:12
## 5: 83.149.9.216 NA NA 2015-05-17 10:05:07
## 6: 83.149.9.216 NA NA 2015-05-17 10:05:34
## request
## 1: GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1
## 2: GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1
## 3: GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1
## 4: GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1
## 5: GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1
## 6: GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1
## status respsize
## 1: 200 203023
## 2: 200 171717
## 3: 200 26185
## 4: 200 7697
## 5: 200 2892
## 6: 200 430406
## referer
## 1: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 2: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 3: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 4: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 5: http://semicomplete.com/presentations/logstash-monitorama-2013/
## 6: http://semicomplete.com/presentations/logstash-monitorama-2013/
## agent
## 1: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 2: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 3: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 4: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 5: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## 6: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36
## hour wday morning weekend as city country
## 1: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 2: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 3: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 4: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 5: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## 6: 10 1 TRUE TRUE AS25159 PJSC MegaFon Moscow Russia
## countryCode isp lat lon org region regionName
## 1: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 2: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 3: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 4: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 5: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Moscow
## 6: RU PJSC MegaFon 55.7522 37.6156 PJSC MegaFon MOW Mos