block by timelyportfolio 2d1972cd7b6eb011c9fd

mercedes marathon kids race 2016

Built with blockbuilder.org

My 8 year old son ran the 2016 Mercedes Marathon 1 mile kids’ race in Birmingham, Alabama this past weekend, February 13, 2016. He put in a 6:42 time, which I considered quite good, especially since he did no training. The intent of the race is to promote health, but he said, “That’s worthless. I want to know how I did.” So out comes the R + xml2 + StackOverflow to parse and sort the ugly <span> table results. I have posted the code and a more usable csv in this gist in case somebody else has a kid that is as competitive as my son. By the way, his 6:42 placed him in a tie for 29th out of 3,915. I couldn’t determine how to split by grade/age.

library(xml2)
library(dplyr)
library(pipeR)
library(ggplot2)

# Download and parse the published results page for the kids' race.
race_html <- read_html(
  x = "http://www.besttimescct.com/results/Mercedes16_kids.HTML"
)

# Convert clock-style durations into a number of seconds.
#
# Adapted from
# http://stackoverflow.com/questions/1389428/dealing-with-time-periods-such-as-5-minutes-and-30-seconds-in-r
#
# time - vector of durations (anything as.character() accepts) written as
#        "ss", "mm:ss", "hh:mm:ss" or "dd hh:mm:ss"
#        (the last is the format used in csv exports from Alcatel CCS reports)
#
# Returns a numeric vector with one element per element of `time`.
# An element that splits into zero or more than four fields gives NaN;
# a field that is not a number gives NA (with the usual coercion warning).
time.to.seconds <- function(time) {
  # Seconds per field, smallest unit first: second, minute, hour, day.
  unit_seconds <- c(1, 60, 60 * 60, 24 * 60 * 60)

  # Weight the fields of one duration; the right-most field is seconds.
  fields_to_seconds <- function(fields) {
    n_fields <- length(fields)
    if (n_fields < 1L || n_fields > length(unit_seconds)) {
      return(NaN)
    }
    sum(as.numeric(fields) * rev(unit_seconds[seq_len(n_fields)]))
  }

  # trimws() and " +" keep stray or repeated blanks from producing empty
  # fields, which would otherwise be miscounted and coerced to NA.
  fields <- strsplit(trimws(as.character(time)), " +|:")

  vapply(fields, fields_to_seconds, numeric(1), USE.NAMES = FALSE)
}

# Build the raw results table: one row per runner, holding the bib number
# and the finish time as text.
race_results <- local({
  # Text content of every <p> node on the results page.
  page_text <- xml_text(xml_find_all(race_html, "//p"))

  # Keep only digits, pipes, colons and blanks, then break on the pipes.
  digits_only <- gsub("([^0-9|:[:blank:]])", "", page_text)
  cells <- unlist(strsplit(digits_only, "\\|"))

  # Drop the date stuff and anything that holds no number at all.
  # NOTE(review): the "(2016)" test also drops any cell whose bib or time
  # contains the digits 2016 -- confirm against the source page.
  is_date <- grepl("(2016)", cells)
  has_number <- grepl("[0-9]", cells)
  cells <- cells[!is_date & has_number]

  # Strip leading non-digits and one trailing non-digit (no longer
  # necessary), then turn everything that is not a digit or colon into a
  # space so the cells parse as whitespace-separated fields.
  cells <- gsub("^[^0-9]+", "", cells)
  cells <- gsub("[^0-9]$", "", cells)
  cells <- gsub("[^0-9:]", " ", cells)

  read.table(text = cells, sep = "", stringsAsFactors = FALSE)
})

colnames(race_results) <- c("bib", "time_char")

# convert time to seconds
# vapply() guarantees a numeric column with exactly one value per row (even
# for an empty table), unlike unlist(lapply(...)) which collapses to NULL.
race_results$time_sec <- vapply(
  race_results$time_char,
  time.to.seconds,
  numeric(1),
  USE.NAMES = FALSE
)

# get rank: the smallest time gets rank 1; tied times share the lowest rank
race_results$rank <- rank(race_results$time_sec, ties.method = "min")

# print() is explicit throughout so the output also appears when the file is
# run with source(), which does not auto-print top-level values.

# Density of finish times (seconds) across all runners.
print(
  ggplot(race_results, aes(x = time_sec)) +
    geom_density()
)

# Finish time against bib number.
print(
  ggplot(race_results, aes(x = bib, y = time_sec)) +
    geom_point()
)

# Rank of every runner who finished in 6:42 (ties share the lowest rank).
print(race_results$rank[which(race_results$time_char == "6:42")])

# Finish times in ascending order. sort() drops NA times, so the x axis is
# taken from the sorted vector itself rather than from nrow(race_results).
plot(
  sort(race_results$time_sec),
  xlab = "finish position",
  ylab = "time (seconds)"
)