Saturday, March 15, 2014

The Bukowski Sequence:

Like any other 17-year-old guy, I found Bukowski a really fun read. Different era, different perspective. Slums, women, booze, fights: all make for potent youth catnip. It got me back into reading, so I have been grateful for that. More recently, and partially due to a location change, I've chased down another Bukowski fixture: the horses. As I like uncertainty and probabilistic estimates, I figured I'd throw my hat into the ring.

The first step was to acquire the dataset. Some copying and pasting, plus the following script, helped out with that:
## ---- Setup: inputs, empty result containers, and chunk boundaries ----------
## `direcHere` and `textFileContainingInfo` are placeholders that are not
## defined in this script; they must be bound to the directory prefix and the
## name of the text file before it is run. The two are pasted together with no
## separator, so `direc` has to end with the path separator.
direc <- direcHere
realFile <- textFileContainingInfo

## library() stops immediately when stringr is missing; require() would only
## return FALSE and let the script fail later at the first str_trim() call.
library(stringr)

## One row per horse per race; filled in by the parsing loop further down.
resultsAttr <- c("raceID", "lastRaceDT", "lastRaceNum", "lastTrack", "lastFin",
                 "pgm", "horseNM", "jockeyNM", "Wgt", "ME", "PP", "fin", "odds",
                 "cmmt")
resultsFrame <- as.data.frame(matrix(NA, nrow = 1, ncol = length(resultsAttr)))
colnames(resultsFrame) <- resultsAttr
## resFrame starts as a copy of the single all-NA row above.
resFrame <- resultsFrame

notesFrame <- as.data.frame(matrix(NA, nrow = 1, ncol = 2))

## One row per race. The parsing loop addresses these columns by position.
## NOTE(review): "Track" appears twice (positions 1 and 18), so name-based
## access is ambiguous, and "Valueist" is presumably meant to be "Value1st".
## Both are left unchanged here because later code may rely on these names.
raceAttr <- c("Track", "Date", "Race Number", "Race Type", "HorseType",
              "ClaimPriceHigh", "ClaimPriceLow", "Race Length", "Record Time",
              "Record Date", "Purse", "Plus",
              "Available Money", "Valueist", "Value2nd", "Value3rd", "Weather",
              "Track", "Time", "Start",
              "2FTime1", "2FTime2", "3FTime1", "3FTime2", "3FTime3",
              "4FTime1", "4FTime2", "4FTime3", "4FTime4",
              "t_Final",
              "2SplT1", "2SplT2", "3SplT1", "3SplT2", "3SplT3",
              "4SplT1", "4SplT2", "4SplT3", "4SplT4",
              "Run-up", "WPS_Pool", "ID")
raceFrame <- as.data.frame(matrix(NA, nrow = 1, ncol = length(raceAttr)))
colnames(raceFrame) <- raceAttr

## Read the whole input file as a character vector of lines.
fileName <- realFile
testRun <- readLines(paste0(direc, fileName))

## Every line containing "Copyright" closes one chunk; the leading 0 makes the
## first chunk start at the top of the file.
chunkSplits <- c(0, grep("Copyright", testRun))
## ---- Parse every chunk into raceFrame / resFrame and dump its footnotes ----
## Inputs (defined earlier in the script): `testRun` (lines of the input file),
## `chunkSplits` (chunk boundary line numbers), `raceFrame`, `resultsFrame`
## and `resFrame`. Side effect: footnotes are appended to the `output` file.

## Loop-invariant path of the footnote dump file.
output <- "C:\\Users\\Coleman\\Desktop\\Projects\\Horses\\footNoteFile.txt"

## Trimmed text of the line(s) directly below every line matching `label`.
lineAfter <- function(lines, label) {
  str_trim(lines[grep(label, lines) + 1])
}

## Dash-separated fields of the first element of `x`, trimmed on both sides.
dashFields <- function(x) {
  str_trim(regmatches(x, gregexpr("([^-])+", x))[[1]], side = "both")
}

## Target columns for fractional / split times, keyed by how many were found.
fracCols <- list("2" = 21:22, "3" = 23:25, "4" = 26:29)
splitCols <- list("2" = 31:32, "3" = 33:35, "4" = 36:39)

## seq_len() yields an empty sequence when no "Copyright" line was found,
## where seq(1, 0) would have iterated over c(1, 0).
for (i in seq_len(length(chunkSplits) - 1)) {
  print(i)
  ## Sets the current chunk to be parsed.
  chunk <- testRun[chunkSplits[i]:chunkSplits[i + 1]]

  ## Columns 1-3: dash-separated fields of the "GOLDEN GATE FIELDS" line.
  testGrep <- grep("GOLDEN GATE FIELDS", chunk)
  raceFrame[i, 1:3] <- dashFields(chunk[testGrep])
  raceFrame[i, 2] <- as.character(as.Date(raceFrame[i, 2],
                                          format = "%B %d, %Y"))

  ## Columns 4-5: dash-separated fields of the line right below it.
  raceFrame[i, 4:5] <- dashFields(chunk[testGrep + 1])

  ## Columns 6-7: fields of the line after "Claiming Price:", "$" removed;
  ## NA when the chunk has no such label.
  priceGrep <- grep("Claiming Price:", chunk) + 1
  if (length(priceGrep) != 0) {
    raceFrame[i, 6:7] <- gsub("\\$", "", dashFields(chunk[priceGrep]))
  } else {
    raceFrame[i, 6:7] <- NA
  }

  ## Column 8: first line matching the distance pattern.
  lengthGrep <- grep("[Ff]urlong | [Mm]ile | [Ff]urlongs | [Mm]iles", chunk)[1]
  raceFrame[i, 8] <- str_trim(chunk[lengthGrep], side = "both")

  ## Columns 9-10: 2nd and 3rd dash-separated fields of the line after
  ## "Track Record"; column 10 loses its last character, then is parsed
  ## as a date.
  recordLine <- chunk[grep("Track Record", chunk) + 1]
  raceFrame[i, 9:10] <- dashFields(recordLine)[2:3]
  raceFrame[i, 10] <- substr(raceFrame[i, 10], 1, nchar(raceFrame[i, 10]) - 1)
  raceFrame[i, 10] <- as.character(as.Date(raceFrame[i, 10],
                                           format = "%B %d, %Y"))

  ## Columns 11-13: purse, "Plus" (always stored as 0), available money.
  raceFrame[i, 11] <- gsub("\\$", "", lineAfter(chunk, "Purse"))
  raceFrame[i, 12] <- 0
  raceFrame[i, 13] <- gsub("\\$", "", lineAfter(chunk, "Available Money:"))

  ## Columns 14-16: first three comma-grouped numbers on the line after
  ## "Value of Race:". An earlier assignment to the same cells used a looser
  ## pattern and was overwritten immediately, so it has been removed.
  valueLine <- chunk[grep("Value of Race:", chunk) + 1]
  raceFrame[i, 14:16] <- regmatches(
    valueLine,
    gregexpr("[0-9^\\,]+[0-9]{3}", valueLine)
  )[[1]][c(1, 2, 3)]

  ## Columns 17-20: single-line values that follow their labels.
  raceFrame[i, 17] <- lineAfter(chunk, "Weather:")
  raceFrame[i, 18] <- lineAfter(chunk, "Track:")
  raceFrame[i, 19] <- lineAfter(chunk, "Off at:")
  raceFrame[i, 20] <- lineAfter(chunk, "Start:")

  ## Columns 21-29: fractional times; the column range depends on whether
  ## 2, 3 or 4 values were found. Any other count is left unset.
  fTimes <- str_trim(
    strsplit(chunk[grep("Fractional Times", chunk) + 1], " ")[[1]],
    side = "both"
  )
  fCols <- fracCols[[as.character(length(fTimes))]]
  if (!is.null(fCols)) {
    raceFrame[i, fCols] <- fTimes
  }

  raceFrame[i, 30] <- lineAfter(chunk, "Final Time:")

  ## Columns 31-39: split times with parentheses stripped, placed like the
  ## fractional times.
  sTimes <- strsplit(chunk[grep("Split Times", chunk) + 1], " ")[[1]]
  sTimes <- gsub("\\)", "", str_trim(gsub("\\(", "", sTimes), side = "both"))
  sCols <- splitCols[[as.character(length(sTimes))]]
  if (!is.null(sCols)) {
    raceFrame[i, sCols] <- sTimes
  }

  raceFrame[i, 40] <- lineAfter(chunk, "Run-Up:")
  raceFrame[i, 41] <- lineAfter(chunk, "Total WPS Pool:")

  ## The race ID is simply the position of the chunk in the file.
  raceID <- i
  raceFrame[i, 42] <- raceID

  ## ---- Footnotes: appended to `output` under a "Race ID:" header ----------
  notesRange <- c(grep("Footnotes", chunk) + 1,
                  grep("Equibase Company LLC\\.", chunk) - 1)
  cat(paste0("Race ID:", raceID), file = output, append = TRUE)
  cat("\n", file = output, append = TRUE)
  if (length(notesRange) > 0) {
    ## The old test `length(notesRange != 2)` was true for any non-empty
    ## vector, so the last marker was always used; that is now explicit, and
    ## finalNote can no longer be a stale value from a previous chunk.
    finalNote <- tail(notesRange, n = 1)
    cat(chunk[notesRange[1]:finalNote], file = output, append = TRUE)
  } else {
    ## Previously this case aborted the whole script with an NA index error.
    warning("Race ", raceID, ": no footnote markers found; footnotes skipped",
            call. = FALSE)
  }
  cat("\n \n \n", file = output, append = TRUE)

  ##--------Begins result information.
  ## Result rows sit between "Last Raced" and "Fractional Times".
  smaller <- c(grep("Last Raced", chunk),
               grep("Fractional Times", chunk))
  subChunk <- chunk[smaller[1]:smaller[2]]

  ## Lines containing "( ... )", minus the first such line. `[-1]` stays
  ## empty for 0 or 1 matches, where `[2:length(x)]` produced NA indices or
  ## brought the first line back.
  grepName <- grep("\\((.*)\\)", subChunk)[-1]

  ## Lines containing a decimal number; the k-th one is paired with the k-th
  ## result row. Does not depend on k, so it is computed once per race.
  decGrep <- grep("[0-9]*\\.[0-9]*", subChunk)

  ## Rows parsed for this race; bound to resFrame once, after the loop.
  raceRows <- list()

  for (k in seq_along(grepName)) {
    tryCatch({
      entryLine <- subChunk[grepName[k]]
      resultsFrame[1, ] <- NA

      ## Columns 1-5: race ID plus the four lines above the entry line.
      resultsFrame[1, 1:5] <- c(raceID,
                                subChunk[grepName[k] - 4],
                                subChunk[grepName[k] - 3],
                                subChunk[grepName[k] - 2],
                                subChunk[grepName[k] - 1])

      ## Column 6: first character of the entry line; a leading "-" blanks
      ## the four last-race columns.
      resultsFrame[1, 6] <- substr(entryLine, 1, 1)
      if (resultsFrame[1, 6] == "-") {
        resultsFrame[1, 2:5] <- NA
      }

      ## Column 7: text in front of the "(". Subtracting 1 moves the match
      ## start one character left while the match length stays the same, so
      ## the trailing "(" falls outside the extracted text.
      nameMatch <- gregexpr("[A-Za-z](.*) \\(", entryLine)[[1]] - 1
      resultsFrame[1, 7] <- str_trim(regmatches(entryLine, nameMatch))

      ## Column 8: text inside the parentheses.
      parenMatch <- gregexpr("\\((.*)\\)", entryLine)[[1]]
      resultsFrame[1, 8] <- str_trim(
        gsub("\\(|\\)", "", regmatches(entryLine, parenMatch))
      )

      ## Column 9: digits that follow the closing parenthesis.
      wgtMatch <- gregexpr("\\) [0-9]* ", entryLine)[[1]]
      weight <- str_trim(gsub("\\)", "", regmatches(entryLine, wgtMatch)))
      resultsFrame[1, 9] <- weight

      ## Column 10: letter codes that follow the weight, digits removed.
      meMatch <- gregexpr(paste(weight, "([A-Za-z]* )*", sep = " "),
                          entryLine)[[1]]
      meRaw <- str_trim(regmatches(entryLine, meMatch))
      resultsFrame[1, 10] <- meRaw
      medEquip <- str_trim(gsub("[0-9]", "", meRaw))
      resultsFrame[1, 10] <- medEquip

      ## Column 11: the digit after the weight when there are no letter
      ## codes, otherwise the digit after the codes.
      if (medEquip == "") {
        ppMatch <- gregexpr(paste(weight, "[0-9]", sep = " "), entryLine)[[1]]
        ppRaw <- str_trim(regmatches(entryLine, ppMatch))
        resultsFrame[1, 11] <- ppRaw
        resultsFrame[1, 11] <- str_trim(strsplit(ppRaw, " ")[[1]][2])
      } else {
        ppMatch <- gregexpr(paste0(medEquip, " [0-9]"), entryLine)[[1]]
        resultsFrame[1, 11] <- str_trim(
          gsub("[A-Za-z]", "", regmatches(entryLine, ppMatch))
        )
      }

      ## Column 12: position of the entry within the result block.
      resultsFrame[1, 12] <- k

      ## Columns 13-14: the decimal number on the k-th decimal line, and the
      ## rest of that line with the number removed.
      oddsLine <- subChunk[decGrep[k]]
      oddsMatch <- gregexpr("[0-9]*\\.[0-9]*", oddsLine)[[1]]
      resultsFrame[1, 13] <- str_trim(regmatches(oddsLine, oddsMatch))
      resultsFrame[1, 14] <- str_trim(gsub("[0-9]*\\.[0-9]*", "", oddsLine))

      ## Only rows that parsed without any warning or error are kept.
      raceRows[[length(raceRows) + 1]] <- resultsFrame
    }, warning = function(war) {
      ## Report which row was dropped and why, not just a bare "problem".
      print(paste0("problem in race ", raceID, ", row ", k, ": ",
                   conditionMessage(war)))
    }, error = function(err) {
      print(paste0("error in race ", raceID, ", row ", k, ": ",
                   conditionMessage(err)))
    })
  }

  ## One bind per race instead of one per row; resFrame is still current
  ## after every race.
  resFrame <- do.call(rbind, c(list(resFrame), raceRows))
}
view raw gistfile1.txt hosted with ❤ by GitHub

No comments:

Post a Comment