# Three ways to load the same CSV, each followed by the time noted for it.
# Input size as noted: 1.8 Gb; 1.7 million rows.

# Base R: 121 seconds
train <- read.csv("train.csv")

# readr: 25 seconds
library(readr)
train <- read_csv("train.csv")

# data.table: 35 seconds
library(data.table)
train <- fread("train.csv")
Create the variables that go into the model.
# Base R equivalent of the first assignment below:
# train$new_col <- train$old_col1 / train$old_col2
train <- as.data.table(train)

# `:=` adds the column to the data.table by reference.
# The denominator is old_col2, as in the base R form above; dividing
# old_col1 by itself would only ever produce 1 (or NaN).
train[, new_col := old_col1 / old_col2]

# Several columns can be added in one call: a character vector of names on
# the left of `:=`, a list of values on the right.
train[, c("new_col1", "new_col2") := list(old_col1^2, sin(old_col2))]
# Interpret TIMESTAMP as seconds counted from 1970-01-01 and build GMT
# date-times from it.
thedate <- as.POSIXct(
  as.numeric(train$TIMESTAMP),
  origin = "1970-01-01",
  tz = "GMT"
)

# Weekday ("%a") and month ("%b") labels derived from the date-time.
train$DAY <- format(thedate, "%a")
train$MONTH <- format(thedate, "%b")

# Time of day as fractional hours: hour plus minutes / 60.
train$TIME <- as.numeric(format(thedate, "%H")) +
  as.numeric(format(thedate, "%M")) / 60
import math
import random

# Augment the polyline file: for every input line, write one or more
# truncated copies that keep only a random-length prefix of its GPS points.
with open("./train_polyline.csv") as read_file, \
        open("./train_aug_polyline_raw.csv", 'w') as write_file:
    for line in read_file:
        # Points are separated by "],[", so the point count is separators + 1.
        points = line.split('],[')
        n_gps = line.count('],[') + 1

        # The number of truncated copies grows with the log of the point count.
        # NOTE(review): `/` is true division on Python 3 but integer division
        # on Python 2 -- confirm which interpreter this was written for.
        num_samples = int(math.ceil(math.log(n_gps / 5 + 1) + 1))

        for sample in range(0, num_samples):
            if n_gps == 1:
                num_gps_samp = 1
            else:
                # Keep between 1 and n_gps - 1 leading points.
                num_gps_samp = int(math.ceil(random.uniform(1, n_gps - 1)))

            # NOTE(review): when n_gps == 1 the single piece still carries the
            # line's trailing newline, which ends up inside the quotes --
            # confirm whether downstream parsing expects that.
            new_poly = points[0:num_gps_samp]
            write_file.write("\"" + "\",\"".join(new_poly) + "\"\n")
id1, [1, 2], [3, 4], [5, 6], ...
id2, [7, 8], [9, 10], ...
id1, 1, 2,
id1, 3, 4,
id1, 5, 6,
...
id2, 7, 8,
id2, 9, 10,
...
id, clust
id1, A
id1, A
id1, B
id1, B
...
id1, N
id2, C
id2, D
...
id2, G
id, first, second, last
id1, A, B, N
id2, C, D, G
...
#' Summarise a sequence of cluster labels by its first, second and last
#' distinct values.
#'
#' @param X A vector of cluster labels. Duplicates are dropped with
#'   `unique()`, which keeps labels in order of first occurrence.
#' @return A named list with elements `first`, `second` and `last`. For an
#'   atomic vector each element has length one; a position that does not
#'   exist (e.g. `second` when there is only one distinct label, or every
#'   element when `X` is empty) is `NA`.
cluststat <- function(X) {
  clust <- unique(X)
  # Index position 1 at minimum, so an empty input gives NA for `last`
  # (as `clust[1]` and `clust[2]` already do) rather than a zero-length value.
  last_pos <- max(length(clust), 1L)
  list(
    first = clust[1],
    second = clust[2],
    last = clust[last_pos]
  )
}
# Apply cluststat() to each id's cluster labels; the list it returns becomes
# the columns first, second and last, one row per id.
# Referring to `clust` directly avoids building the per-group `.SD` table
# just to extract a single column from it.
cluster_data[, cluststat(clust), by = id]
library(h2o)

# Initialise H2O; nthreads = -1 means use all threads.
h2oserver <- h2o.init(nthreads = -1)

# Import the training file into an H2O frame.
h2otrain <- h2o.importFile(path = "train.csv")
# Alternative: convert the in-memory R object instead.
# h2otrain <- as.h2o(train)

# Predictor column names and the response column name.
x <- c(
  "CALL_TYPE", "TAXI_ID", "MONTH", "DAY", "TIME",
  "FIRST", "SECOND", "LAST", "POINTS_SO_FAR"
)
y <- "POINTS_LEFT"

# Fit a GLM with the Poisson family on the H2O training frame.
h2o.fit <- h2o.glm(
  x = x,
  y = y,
  training_frame = h2otrain,
  family = "poisson"
)