The first step was to acquire the dataset. Copying and pasting, together with the following script, took care of that:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---- Input location ----------------------------------------------------------
# NOTE(review): `direcHere` and `textFileContainingInfo` are not defined anywhere
# in the visible script; they look like placeholders to be replaced with the
# directory prefix and the name of the text file to parse. Because the two are
# concatenated with no separator below, `direc` must end with a path separator.
direc <- direcHere
realFile <- textFileContainingInfo

# library() stops immediately when stringr is missing, whereas require() only
# warns and returns FALSE, postponing the failure to the first str_trim() call.
library(stringr)

# ---- Per-horse result rows ---------------------------------------------------
# Column names of the per-horse results table.
resultsAttr <- c(
  "raceID", "lastRaceDT", "lastRaceNum", "lastTrack", "lastFin",
  "pgm", "horseNM", "jockeyNM", "Wgt", "ME", "PP", "fin", "odds", "cmmt"
)

# One-row, all-NA template that is refilled for every horse.
resultsFrame <- as.data.frame(matrix(NA, nrow = 1, ncol = length(resultsAttr)))
colnames(resultsFrame) <- resultsAttr

# Accumulator for the parsed rows; it starts out as a copy of the all-NA
# template, so its first row stays NA.
resFrame <- resultsFrame

# Two-column NA frame; it is not referenced again in the visible script.
notesFrame <- as.data.frame(matrix(NA, nrow = 1, ncol = 2))

# ---- Per-race rows -----------------------------------------------------------
# Column names of the per-race table. The parsing code addresses these columns
# by position, so the order matters. NOTE(review): "Track" appears twice
# (positions 1 and 18), so name-based access to that column is ambiguous.
raceAttr <- c(
  "Track", "Date", "Race Number", "Race Type", "HorseType",
  "ClaimPriceHigh", "ClaimPriceLow", "Race Length", "Record Time",
  "Record Date", "Purse", "Plus",
  "Available Money", "Valueist", "Value2nd", "Value3rd", "Weather",
  "Track", "Time", "Start",
  "2FTime1", "2FTime2", "3FTime1", "3FTime2", "3FTime3",
  "4FTime1", "4FTime2", "4FTime3", "4FTime4",
  "t_Final",
  "2SplT1", "2SplT2", "3SplT1", "3SplT2", "3SplT3",
  "4SplT1", "4SplT2", "4SplT3", "4SplT4",
  "Run-up", "WPS_Pool", "ID"
)
raceFrame <- as.data.frame(matrix(NA, nrow = 1, ncol = length(raceAttr)))
colnames(raceFrame) <- raceAttr

# ---- Read the raw text -------------------------------------------------------
# Every line of the input file becomes one element of `testRun`.
fileName <- realFile
testRun <- readLines(paste0(direc, fileName))
# Parse every race contained in `testRun`.
#
# The text is cut into chunks at each line containing "Copyright". For every
# chunk, one row of `raceFrame` (race-level facts, addressed by column position)
# is filled, the footnotes are appended to the text file `output`, and one row
# per horse is appended to `resFrame`.
#
# Most look-ups follow the same pattern: find the line holding a label with
# grep() and read the value from the line right after it (index + 1).

# Footnotes of all races are appended to this file.
output <- "C:\\Users\\Coleman\\Desktop\\Projects\\Horses\\footNoteFile.txt"

# Chunk boundaries; the leading 0 makes the first chunk start at the first line
# (a 0 index is silently dropped when subsetting).
chunkSplits <- c(0, grep("Copyright", testRun))

# seq_len() yields an empty sequence when no "Copyright" line exists, whereas
# seq(1, 0) would count down and run the body with invalid indices.
for (i in seq_len(length(chunkSplits) - 1)) {
  print(i)

  ## Sets the current chunk to be parsed.
  chunk <- testRun[chunkSplits[i]:chunkSplits[i + 1]]

  ## -------- Race header: columns 1-3 (Track, Date, Race Number).
  # The header line is split on "-" into its dash-free pieces.
  testGrep <- grep("GOLDEN GATE FIELDS", chunk)
  testGreg <- gregexpr("([^-])+", chunk[testGrep])
  raceFrame[i, 1:3] <- str_trim(
    regmatches(chunk[testGrep], testGreg)[[1]],
    side = "both"
  )
  # Normalise the date to ISO text. %B is matched against locale month names.
  raceFrame[i, 2] <- as.character(
    as.Date(raceFrame[i, 2], format = "%B %d, %Y")
  )

  ## -------- Columns 4-5 (Race Type, HorseType): line after the header.
  raceFrame[i, 4:5] <- str_trim(
    regmatches(
      chunk[testGrep + 1],
      gregexpr("([^-])+", chunk[testGrep + 1])
    )[[1]],
    side = "both"
  )

  ## -------- Columns 6-7 (claiming prices), NA when the label is absent.
  priceGrep <- grep("Claiming Price:", chunk) + 1
  if (length(priceGrep) != 0) {
    raceFrame[i, 6:7] <- str_trim(
      regmatches(
        chunk[priceGrep],
        gregexpr("([^-])+", chunk[priceGrep])
      )[[1]],
      side = "both"
    )
    raceFrame[i, 6:7] <- gsub("\\$", "", raceFrame[i, 6:7])
  } else {
    raceFrame[i, 6:7] <- NA
  }

  ## -------- Column 8 (Race Length): first line mentioning furlongs/miles.
  # The blanks around each "|" are part of the alternatives, so most of them
  # only match when the word is surrounded by spaces.
  lengthGrep <- grep(
    "[Ff]urlong | [Mm]ile | [Ff]urlongs | [Mm]iles",
    chunk
  )[1]
  raceFrame[i, 8] <- str_trim(chunk[lengthGrep], side = "both")
  ## raceFrame[i,8] <- str_trim(chunk[priceGrep+2],side='both')

  ## -------- Columns 9-10 (Record Time, Record Date).
  # Pieces 2 and 3 of the dash-separated line after "Track Record".
  recordGrep <- grep("Track Record", chunk)
  raceFrame[i, 9:10] <- str_trim(
    regmatches(
      chunk[recordGrep + 1],
      gregexpr("([^-])+", chunk[recordGrep + 1])
    )[[1]][2:3],
    side = "both"
  )
  # Drop the last character of the date piece before parsing it
  # (presumably a closing parenthesis -- confirm against the raw text).
  raceFrame[i, 10] <- substr(raceFrame[i, 10], 1, nchar(raceFrame[i, 10]) - 1)
  raceFrame[i, 10] <- as.character(
    as.Date(raceFrame[i, 10], format = "%B %d, %Y")
  )

  ## -------- Columns 11-13 (Purse, Plus, Available Money).
  raceFrame[i, 11] <- gsub("\\$", "", str_trim(chunk[grep("Purse", chunk) + 1]))
  # "Plus" is fixed at 0; the look-up below was disabled by the author.
  raceFrame[i, 12] <- 0
  ## str_trim(chunk[grep("Plus:",chunk)+1] )
  raceFrame[i, 13] <- gsub(
    "\\$", "",
    str_trim(chunk[grep("Available Money:", chunk) + 1])
  )

  ## -------- Columns 14-16 (value for 1st/2nd/3rd).
  # First three comma-grouped amounts on the line after "Value of Race:".
  # An earlier assignment to the same cells with a looser pattern was
  # overwritten immediately by this one and has been removed.
  raceValGrep <- grep("Value of Race:", chunk)
  raceFrame[i, 14:16] <- regmatches(
    chunk[raceValGrep + 1],
    gregexpr("[0-9^\\,]+[0-9]{3}", chunk[raceValGrep + 1])
  )[[1]][1:3]

  ## -------- Columns 17-20 (Weather, Track, Time, Start).
  raceFrame[i, 17] <- str_trim(chunk[grep("Weather:", chunk) + 1])
  raceFrame[i, 18] <- str_trim(chunk[grep("Track:", chunk) + 1])
  raceFrame[i, 19] <- str_trim(chunk[grep("Off at:", chunk) + 1])
  raceFrame[i, 20] <- str_trim(chunk[grep("Start:", chunk) + 1])

  ## -------- Columns 21-29 (fractional times).
  # The number of space-separated times decides which column group is filled;
  # any other count leaves all three groups untouched.
  fTimes <- str_trim(
    strsplit(chunk[grep("Fractional Times", chunk) + 1], " ")[[1]],
    side = "both"
  )
  if (length(fTimes) == 2) {
    raceFrame[i, 21:22] <- fTimes
  }
  if (length(fTimes) == 3) {
    raceFrame[i, 23:25] <- fTimes
  }
  if (length(fTimes) == 4) {
    raceFrame[i, 26:29] <- fTimes
  }

  ## -------- Column 30 (t_Final).
  raceFrame[i, 30] <- str_trim(chunk[grep("Final Time:", chunk) + 1])

  ## -------- Columns 31-39 (split times), parentheses removed.
  sTimes <- gsub(
    "\\)", "",
    str_trim(
      gsub("\\(", "", strsplit(chunk[grep("Split Times", chunk) + 1], " ")[[1]]),
      side = "both"
    )
  )
  if (length(sTimes) == 2) {
    raceFrame[i, 31:32] <- sTimes
  }
  if (length(sTimes) == 3) {
    raceFrame[i, 33:35] <- sTimes
  }
  if (length(sTimes) == 4) {
    raceFrame[i, 36:39] <- sTimes
  }

  ## -------- Columns 40-42 (Run-up, WPS_Pool, ID).
  raceFrame[i, 40] <- str_trim(chunk[grep("Run-Up:", chunk) + 1])
  raceFrame[i, 41] <- str_trim(chunk[grep("Total WPS Pool:", chunk) + 1])
  # The chunk index doubles as the race identifier.
  raceID <- i
  raceFrame[i, 42] <- raceID

  ## -------- Footnotes: appended to `output` under a "Race ID:" heading.
  # Candidate limits: the line after each "Footnotes" match and the line before
  # each "Equibase Company LLC." match.
  notesRange <- c(
    grep("Footnotes", chunk) + 1,
    grep("Equibase Company LLC\\.", chunk) - 1
  )
  cat(paste0("Race ID:", raceID), file = output, append = TRUE)
  cat("\n", file = output, append = TRUE)
  # The original test `length(notesRange != 2)` had a misplaced parenthesis and
  # was effectively a non-empty check; with no markers at all the range below
  # was built from NA and aborted the whole script. The notes are now written
  # only when at least one limit was found, from the first to the last limit.
  if (length(notesRange) > 0) {
    finalNote <- tail(notesRange, n = 1)
    cat(chunk[notesRange[1]:finalNote], file = output, append = TRUE)
  }
  cat("\n \n \n", file = output, append = TRUE)

  ##--------Begins result information.
  # The results table lies between "Last Raced" and "Fractional Times".
  smaller <- c(
    grep("Last Raced", chunk),
    grep("Fractional Times", chunk)
  )
  subChunk <- chunk[smaller[1]:smaller[2]]

  # Lines containing a parenthesised part; the first such line is skipped
  # (presumably the table header -- confirm against the raw text). `[-1]`
  # gives an empty vector when there are fewer than two matches, where
  # `2:length(...)` would have produced NA indices.
  grepName <- grep("\\((.*)\\)", subChunk)
  grepName <- grepName[-1]

  # Lines containing a "." (the digits around it are optional in the pattern);
  # the k-th one supplies odds and comment of the k-th horse. It does not
  # depend on k, so it is computed once per race.
  decGrep <- grep("[0-9]*\\.[0-9]*", subChunk)

  for (k in seq_along(grepName)) {
    # A warning or an error anywhere in the body abandons the current horse;
    # its row is then not appended to `resFrame`.
    tryCatch({
      horseLine <- subChunk[grepName[k]]

      resultsFrame[1, ] <- NA

      # Columns 1-5: race id plus the four lines preceding the horse line.
      resultsFrame[1, 1:5] <- c(
        raceID,
        subChunk[grepName[k] - 4],
        subChunk[grepName[k] - 3],
        subChunk[grepName[k] - 2],
        subChunk[grepName[k] - 1]
      )

      # Column 6 (pgm): first character of the horse line. A "-" there blanks
      # the four last-race columns.
      resultsFrame[1, 6] <- substr(horseLine, 1, 1)
      if (resultsFrame[1, 6] == "-") {
        resultsFrame[1, 2:5] <- NA
      }

      # Column 7 (horseNM): text from the first letter up to " (". Subtracting
      # 1 from the match position shifts the extracted window one character to
      # the left, which leaves out the "(" that ends the match.
      resultsFrame[1, 7] <- str_trim(
        regmatches(
          horseLine,
          gregexpr("[A-Za-z](.*) \\(", horseLine)[[1]] - 1
        )
      )

      # Column 8 (jockeyNM): the parenthesised text without its parentheses.
      resultsFrame[1, 8] <- str_trim(
        gsub(
          "\\(|\\)", "",
          regmatches(horseLine, gregexpr("\\((.*)\\)", horseLine)[[1]])
        )
      )

      # Column 9 (Wgt): the digits that follow ") ".
      resultsFrame[1, 9] <- str_trim(
        gsub(
          "\\)", "",
          regmatches(horseLine, gregexpr("\\) [0-9]* ", horseLine)[[1]])
        )
      )

      # Column 10 (ME): letter groups right after the weight, digits removed.
      resultsFrame[1, 10] <- str_trim(
        regmatches(
          horseLine,
          gregexpr(
            paste(resultsFrame[1, 9], "([A-Za-z]* )*", sep = " "),
            horseLine
          )[[1]]
        )
      )
      resultsFrame[1, 10] <- str_trim(gsub("[0-9]", "", resultsFrame[1, 10]))

      # Column 11 (PP): the digit following the ME letters or, when there are
      # none, the digit following the weight.
      if (resultsFrame[1, 10] == "") {
        resultsFrame[1, 11] <- str_trim(
          regmatches(
            horseLine,
            gregexpr(paste(resultsFrame[1, 9], "[0-9]", sep = " "),
                     horseLine)[[1]]
          )
        )
        resultsFrame[1, 11] <- str_trim(
          strsplit(resultsFrame[1, 11], " ")[[1]][2]
        )
      } else {
        resultsFrame[1, 11] <- str_trim(
          gsub(
            "[A-Za-z]", "",
            regmatches(
              horseLine,
              gregexpr(paste0(resultsFrame[1, 10], " [0-9]"), horseLine)[[1]]
            )
          )
        )
      }

      # Column 12 (fin): position of the horse within the table.
      resultsFrame[1, 12] <- k

      # Columns 13-14 (odds, cmmt): the decimal number on the k-th decimal
      # line, and whatever remains of that line once the number is removed.
      resultsFrame[1, 13] <- str_trim(
        regmatches(
          subChunk[decGrep[k]],
          gregexpr("[0-9]*\\.[0-9]*", subChunk[decGrep[k]])[[1]]
        )
      )
      resultsFrame[1, 14] <- str_trim(
        gsub("[0-9]*\\.[0-9]*", "", subChunk[decGrep[k]])
      )

      resFrame <- rbind(resFrame, resultsFrame)
    }, warning = function(war) {
      # Report which horse was skipped and why instead of a bare "problem".
      print(paste0("problem (race ", raceID, ", row ", k, "): ",
                   conditionMessage(war)))
    }, error = function(err) {
      print(paste0("error (race ", raceID, ", row ", k, "): ",
                   conditionMessage(err)))
    })
  }
}
No comments:
Post a Comment