Built with blockbuilder.org
My 8-year-old son ran the 2016 Mercedes Marathon 1 mile kids’ race in Birmingham, Alabama this past weekend, February 13, 2016. He put in a 6:42 time, which I considered quite good, especially since he did no training. The intent of the race is to promote health, but he said, “That’s worthless. I want to know how I did.” So out comes the R
+ xml2
+ StackOverflow to parse and sort the ugly <span>
table results. I have posted the code and a more usable csv in this gist in case somebody else has a kid that is as competitive as my son. By the way, his 6:42 placed him in a tie for 29th out of 3,915. I couldn’t determine how to split by grade/age.
library(xml2)
library(dplyr)
library(pipeR)
library(ggplot2)
# Fetch and parse the published results page; everything below works on
# this parsed document.
race_html <- xml2::read_html(
  x = "http://www.besttimescct.com/results/Mercedes16_kids.HTML"
)
# http://stackoverflow.com/questions/1389428/dealing-with-time-periods-such-as-5-minutes-and-30-seconds-in-r
#
# Convert time strings to a number of seconds.
#
# time - vector of times in the format "dd hh:mm:ss" (coerced with
#        as.character()). Leading fields may be omitted, so "ss", "mm:ss"
#        and "hh:mm:ss" are accepted as well.
#        (That's the format used in csv export from Alcatel CCS reports)
#
# Returns a numeric vector with one value per element of `time`:
#   * an element that splits into zero or more than four fields gives NaN;
#   * a field that is not a number gives NA (as.numeric() warns as usual).
# The function is vectorised over `time`; zero-length input gives numeric(0).
time.to.seconds <- function(time) {
  # Seconds per day, hour, minute and second; the last `n` entries are the
  # weights for a value made of `n` fields.
  unit_seconds <- c(24 * 60 * 60, 60 * 60, 60, 1)

  fields_to_seconds <- function(fields) {
    n <- length(fields)
    if (n < 1L || n > 4L) {
      return(NaN)
    }
    sum(as.numeric(fields) * utils::tail(unit_seconds, n))
  }

  # Fields are separated by a space (after the day count) or by a colon.
  field_list <- strsplit(as.character(time), " |:")
  vapply(field_list, fields_to_seconds, numeric(1), USE.NAMES = FALSE)
}
# Pull the result rows out of the <p> elements of the page and load them
# into a two-column data frame. The intermediate vectors live inside local()
# so that race_results is the only object created.
race_results <- local({
  # text content of every <p> node in the document
  page_text <- xml_text(xml_find_all(race_html, "//p"))

  # drop every character that is not a digit, "|", ":" or a blank
  stripped <- gsub("([^0-9|:[:blank:]])", "", page_text)

  # cut on the literal "|" and flatten to a single character vector
  cells <- unlist(strsplit(stripped, "\\|"))

  # get rid of all the date stuff
  # NOTE(review): this drops ANY cell containing "2016", not only dates;
  # a result row whose digits include 2016 would be lost too - confirm
  # against the page.
  cells <- cells[!grepl("(2016)", cells)]

  # only get the stuff with numbers
  cells <- cells[grepl("[0-9]", cells)]

  # remove the leading space (no longer necessary)
  cells <- gsub("^[^0-9]+", "", cells)

  # remove the trailing space (no longer necessary)
  cells <- gsub("[^0-9]$", "", cells)

  # replace everything not numbers or colons with space
  cells <- gsub("[^0-9:]", " ", cells)

  # each remaining cell is parsed as one whitespace-separated row
  read.table(text = cells, sep = "", stringsAsFactors = FALSE)
})
colnames(race_results) <- c("bib", "time_char")
# Add time_sec: every time_char value passed through time.to.seconds().
race_results$time_sec <- unlist(
  lapply(race_results$time_char, time.to.seconds)
)
# Add rank: ordering by time_sec (smallest first); tied rows all receive the
# lowest rank of their group (ties.method = "min").
race_results[["rank"]] <- rank(
  race_results[["time_sec"]],
  ties.method = "min"
)
# Exploratory look at the results. ggplot(), aes() and the geom_*() functions
# come from ggplot2, which has to be attached for this section to run.

# Density of the finishing times in seconds (this plot used to be emitted
# twice by two identical statements; once is enough).
ggplot(race_results, aes(x = time_sec)) +
  geom_density()

# Finishing time in seconds against bib number.
ggplot(race_results, aes(x = bib, y = time_sec)) +
  geom_point()

# Rank of every row whose time text is exactly "6:42". Uses the rank column
# already stored on race_results (rank() of time_sec with ties.method = "min")
# instead of computing the same ranking a second time.
race_results$rank[which(race_results$time_char == "6:42")]

# Sorted finishing times against their position in the sorted order.
plot(
  seq_len(nrow(race_results)),
  sort(race_results$time_sec),
  xlab = "position (sorted by time)",
  ylab = "time (seconds)"
)