Я подумал поделиться своим решением здесь на случай, если оно кому-то поможет. Единственное отличие этого решения в том, что я добавил дополнительный столбец positions
в качестве фильтра. Вы можете удалить этот фильтр, если ваша проблема похожа на мою, как указано выше. В противном случае этот фильтр помогает отсортировать hours
по position id
.
#loading libraries
library(lubridate)
library(readxl)
library(stringr)
library(tidyr)
#read data
df <- read_excel('data_sample.xlsx',
col_types = c('numeric', 'text', 'date', 'guess', 'guess','skip', 'numeric'))
#convert clocks to float for faster comparisons
df$`Clock In` = hour(df$`Clock In`) + minute(df$`Clock In`) / 60
df$`Clock Out` = hour(df$`Clock Out`) + minute(df$`Clock Out`) / 60
#remove shallow records
df <- df[!(df$`Clock In` == df$`Clock Out`),]
#24 hours strings
hours = lapply(0 : 23, function(i) str_pad(paste(toString(i), ':00', sep=""), 5, pad = '0'))
#empty presence matrix
hours_mat = c()
#fill the presence matrix with ines when exists in work place
for (r in 1:nrow(df)) {
lis = c()
for (i in 0:23) {
if (i < df[r, 'Clock In'] | i >= df[r, 'Clock Out']){
lis[i + 1] <- 0
}
else {
lis[i + 1] <- 1
}
}
hours_mat <- rbind(hours_mat, lis)
}
#convert matrix to dataframe
hours_df = as.data.frame(hours_mat)
colnames(hours_df) <- hours
#bind the matrix to the original dataframe
final_df <- cbind(df, hours_df)
#aggregate presence count over date in every position
result <- aggregate(final_df[1:nrow(final_df),7:ncol(final_df)],
by=list(Date = final_df$Date, Position = final_df$Position),
FUN=sum)
#factorize dates for converting from wide to long format
result$Date = factor(result$Date)
#wide to long format
long_result <- gather(result, Timestamp, Count, '00:00':'23:00',
factor_key = TRUE)
#long to wide format using dates
result_wide <- spread(long_result, Date, Count)
#to select a particular position, uncomment this line :
#result_wide <- result_wide[result_wide$Position == 'your required position as number']
#write the final output to "output.csv"
write.csv(result_wide, 'output.csv')
и образцы данных следующие:
structure(list(`Employee Number` = c(1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1), Day = c("THU", "FRI", "SAT", "SUN",
"WED", "THU", "FRI", "SAT", "SUN", "WED", "THU", "THU", "FRI",
"SAT", "SUN", "WED", "THU", "THU"), Date = structure(c(1577923200,
1578009600, 1578096000, 1578182400, 1578441600, 1578528000, 1578614400,
1578700800, 1578787200, 1579046400, 1579132800, 1579132800, 1579219200,
1579305600, 1579392000, 1579651200, 1579737600, 1579737600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), ClockIn = structure(c(-2209021200,
-2209021200, -2209021200, -2209021200, -2209021200, -2209023000,
-2209021200, -2209021200, -2209021200, -2209021200, -2209075200,
-2209021200, -2209021200, -2209021200, -2209021200, -2209021200,
-2209075200, -2209021200), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`Clock Out` = structure(c(-2208992400, -2208992400, -2208992400,
-2208992400, -2208992400, -2208994200, -2208992400, -2208992400,
-2208992400, -2208992400, -2209075200, -2208999600, -2208992400,
-2208992400, -2208992400, -2208992400, -2209075200, -2208999600
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Department = c(20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20), Position = c(35, 35, 35, 35, 35, 35, 35, 35, 35,
35, 35, 35, 35, 35, 35, 35, 35, 35)), row.names = c(NA, -18L
), class = c("tbl_df", "tbl", "data.frame"))
person
bp41
schedule
09.07.2020
15:00
количество для меня равно 4 - person akrun   schedule 01.07.2020