I am learning R for text mining. I have a TV program schedule in the form of a CSV file. The programs usually start at 06:00 AM and go on until 05:00 AM the next day, which is called a broadcast day.
It's a bit of a mess, but it seems to work:
# Parse a pipe-delimited TV schedule into one row per programme, each with a
# full start timestamp.
#
# Expected layout of the raw text (fields separated by "|"):
#   <weekday name>|
#   <date as dd-Mon-yy>|
#   <start time as H.MM>|<programme title>
#   <synopsis>|
#   ...time/title and synopsis lines repeat until the next weekday line.
# Start times that wrap around (a later row has a smaller time than the row
# before it) are assigned to the following calendar date.

txt <- "Sunday|\n 01-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|\nMonday|\n 02-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|"

# `text =` lets read.table() open and close its own connection;
# `strip.white` drops the indentation in front of every field.
df <- read.table(
  text = txt, header = FALSE, sep = "|",
  stringsAsFactors = FALSE, strip.white = TRUE
)
cat(txt)

# English weekday names mark the first line of each day's listing. They are
# spelled out so the match does not depend on the session locale.
weekdays <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)

# Convert "dd-Mon-yy" strings (English month abbreviations) to Date.
# LC_TIME is switched to "C" only for the duration of the call so that "%b"
# matches English month names on every platform, then restored.
parse_schedule_date <- function(x) {
  old_lc_time <- Sys.getlocale("LC_TIME")
  on.exit(Sys.setlocale("LC_TIME", old_lc_time), add = TRUE)
  Sys.setlocale("LC_TIME", "C")
  as.Date(x, format = "%d-%b-%y")
}

# Number of calendar days to add to each start time of one day's listing.
# Every time the clock value drops relative to the previous row the listing
# has crossed midnight, and that row *and all rows after it* belong to the
# next date; hence the cumulative sum rather than a single flagged row.
day_offsets <- function(times) {
  cumsum(c(FALSE, diff(times) < 0))[seq_along(times)]
}

# Turn the rows of a single day (weekday line, date line, then alternating
# "time|title" and "synopsis|" lines) into a data frame with columns
# Date (POSIXct start time), Synopsis and Program.
parse_day <- function(day_df) {
  air_date <- parse_schedule_date(day_df[2, 1])
  entries <- day_df[-(1:2), , drop = FALSE]

  # A row with a non-empty second field is a "time|title" row; the rows that
  # follow it, up to the next such row, hold its synopsis.
  is_title <- !is.na(entries[[2]]) & entries[[2]] != ""
  slot <- cumsum(is_title)
  start <- as.numeric(entries[[1]][is_title])

  synopsis <- vapply(
    split(
      entries[[1]][!is_title],
      factor(slot[!is_title], levels = seq_len(sum(is_title)))
    ),
    paste, character(1),
    collapse = " "
  )

  calendar_date <- air_date + day_offsets(start)

  data.frame(
    # Times are stored as H.MM numbers (23.3 means 23:30), so pad to two
    # decimals before parsing them as "%H.%M".
    Date = as.POSIXct(
      paste(calendar_date, sprintf("%.2f", start)),
      format = "%Y-%m-%d %H.%M"
    ),
    Synopsis = unname(synopsis),
    Program = entries[[2]][is_title],
    stringsAsFactors = FALSE
  )
}

# One chunk of rows per day: a new chunk starts at every weekday line.
day_blocks <- split(df, cumsum(df$V1 %in% weekdays))
days <- do.call(rbind, lapply(day_blocks, parse_day))
rownames(days) <- NULL
days[, c("Date", "Program", "Synopsis")]
# Date Program Synopsis
# 1 2015-11-01 06:00:00 Tom some information about the program
# 2 2015-11-01 23:30:00 Jerry some information about the program
# 3 2015-11-02 05:00:00 Avatar some information about the program
# 4 2015-11-02 06:00:00 Tom some information about the program
# 5 2015-11-02 23:30:00 Jerry some information about the program
# 6 2015-11-03 05:00:00 Avatar some information about the program