Kaggle 데이터를 활용한 DataTable 문서화.

2019. 3. 17. 20:25분석 R

DataTable
다 돌아가지만, Posting 하기 위해서는 주석으로 올려야해서 주석으로 올립니다. 

html이나 rmd가 필요하다면 댓글에 글 남겨주세요.






library(data.table)

Data Loading



  • fread를 사용하면 큰 데이터를 더 빠르게 읽을 수 있다.
time <- system.time(Data <- fread("./../Data/data.csv", header = TRUE))



time1 <- system.time(Data1 <- read.csv("./../Data/data.csv"))

to_table <- data.table(Data1)
to_table_2 <- setDT(Data1)


## 결측치 제거 및 Type 변환 


## R에서는 잘 되지만 포스팅 할 때는 문제가 발새
#Str_to_int <- colnames(Data)[4:6]
#Data[, (Str_to_int) := lapply(.SD, as.numeric), .SDcols = Str_to_int]

Data$`Video Uploads` <- as.numeric(Data$`Video Uploads`)
Data$Subscribers <- as.numeric(Data$Subscribers)
Data$`Video views` <- as.numeric(Data$`Video views`)

colSums(is.na(Data))
##          Rank         Grade  Channel name Video Uploads   Subscribers 
##             0             0             0            15           344 
##   Video views 
##           263
Data <- na.omit(Data)

print(paste(class(Data) , " == " , class(Data1)))
## [1] "data.table  ==  data.table" "data.frame  ==  data.frame"
print(paste(time[3] , " < " , time1[3] ))
## [1] "0  <  0.100000000000364"







Select




  • Keeping Columns
Data[,1]
##    [1] "1st"     "2nd"     "3rd"     "4th"     "5th"     "6th"    
##    [7] "7th"     "8th"     "9th"     "10th"    "11th"    "12th"   
##   [13] "13th"    "14th"    "15th"    "16th"    "17th"    "19th"   
##   [19] "20th"    "21st"    "22nd"    "23rd"    "24th"    "25th"   
##   [25] "26th"    "27th"    "28th"    "29th"    "30th"    "31st"   
##   [31] "32nd"    "33rd"    "34th"    "35th"    "36th"    "37th"   
##   [37] "38th"    "39th"    "40th"    "41st"    "42nd"    "43rd"   
##   [43] "44th"    "45th"    "46th"    "47th"    "48th"    "49th"   
##   [49] "50th"    "51st"    "52nd"    "53rd"    "54th"    "55th"   
##   [55] "56th"    "58th"    "59th"    "60th"    "61st"    "62nd"   
##   [61] "63rd"    "64th"    "65th"    "66th"    "67th"    "68th"   
##   [67] "69th"    "70th"    "71st"    "72nd"    "73rd"    "74th"   

Data[,1, with = F]
##    [1] "1st"     "2nd"     "3rd"     "4th"     "5th"     "6th"    
##    [7] "7th"     "8th"     "9th"     "10th"    "11th"    "12th"   
##   [13] "13th"    "14th"    "15th"    "16th"    "17th"    "19th"   
##   [19] "20th"    "21st"    "22nd"    "23rd"    "24th"    "25th"   
##   [25] "26th"    "27th"    "28th"    "29th"    "30th"    "31st"   
##   [31] "32nd"    "33rd"    "34th"    "35th"    "36th"    "37th"   
##   [37] "38th"    "39th"    "40th"    "41st"    "42nd"    "43rd"   
##   [43] "44th"    "45th"    "46th"    "47th"    "48th"    "49th"   
##   [49] "50th"    "51st"    "52nd"    "53rd"    "54th"    "55th"   
##   [55] "56th"    "58th"    "59th"    "60th"    "61st"    "62nd"   
##   [61] "63rd"    "64th"    "65th"    "66th"    "67th"    "68th"   
##   [67] "69th"    "70th"    "71st"    "72nd"    "73rd"    "74th"   
##   [73] "75th"    "76th"    "77th"    "78th"    "80th"    "81st"   
##   [79] "82nd"    "83rd"    "84th"    "85th"    "86th"    "87th"   
##   [85] "88th"    "90th"    "91st"    "92nd"    "93rd"    "94th"   

#Select <- Data[, Rank]

#print(head(Select))
#print(class(Select))

## multiple select

#Data[, .(Rank, Grade)]
Data[, c(1:2)]
##          Rank Grade
##    1:     1st   A++
##    2:     2nd   A++
##    3:     3rd   A++
##    4:     4th   A++
##    5:     5th   A++
##   ---              
## 3783: 4,996th    B+
## 3784: 4,997th    B+
## 3785: 4,998th    B+
## 3786: 4,999th    B+
## 3787: 5,000th    B+


  • Drop Columns
tmp2 <- Data[,with = F]
#tmp2[, !c("Rank" , "Grade")]
tmp2[, !c(1:2)]
## Null data.table (0 rows and 0 cols)


Order



mydata01 = Data

setorder(Data ,  Subscribers )
head(mydata01)
##       Rank Grade            Channel name Video Uploads Subscribers
## 1: 4,097th    B+           Xiaomi France            17         301
## 2:    93rd     A           hairong zheng           492         394
## 3: 3,517th    B+             Jianpeng Li           232         437
## 4: 3,161st    B+     Ron Funches - Topic            21         453
## 5: 2,288th    B+ Christian Nodal - Topic            30         471
## 6: 3,181st    B+   vendezvotrevoiture.fr            12         554
##    Video views
## 1:    12943662
## 2:   254252482
## 3:    70635195
## 4:    13932203
## 5:    17868274
## 6:    17603860
setorder(Data ,  Grade, -Subscribers )
head(mydata01)
##     Rank Grade  Channel name Video Uploads Subscribers Video views
## 1: 175th     A  Dude Perfect           185    34489432  6095981611
## 2:  64th     A    Ed Sheeran           122    33961184 14521211040
## 3: 127th     A  TheEllenShow          9066    27765423 13403860667
## 4: 211th     A KatyPerryVEVO           125    26712128 16244535352
## 5: 150th     A    EminemVEVO            80    26650488 11317532576
## 6:  84th     A   Felipe Neto          1276    25069826  4455604361

Filter




* one condition


#head(Data[Grade == "A++"])
#head(Data[Grade == "A++",])

#head(Data[`Video views`> 50000,])


  • mutiple condition


# head(Data[Grade == "A++" & `Video views`> 10^10 , ])
  • mutiple condition

  • Logica operator %in%


#Data[Grade %in% c("A++", "B+")]
#Data[!Grade %in% c("A++", "B+")]


  • J Expression

  • 기존 방식보다 더 빠르게 Filtering 가능


setkey(Data , Grade )
#Data[J("A++")]
tables()
##          NAME  NROW NCOL MB
## 1:       Data 3,787    6  1
## 2:      Data1 4,393    6  1
## 3:   mydata01 3,787    6  1
## 4:       tmp2 3,787    6  1
## 5:   to_table 4,393    6  1
## 6: to_table_2 4,393    6  1
##                                                             COLS   KEY
## 1: Rank,Grade,Channel name,Video Uploads,Subscribers,Video views Grade
## 2: Rank,Grade,Channel.name,Video.Uploads,Subscribers,Video.views      
## 3: Rank,Grade,Channel name,Video Uploads,Subscribers,Video views Grade
## 4: Rank,Grade,Channel name,Video Uploads,Subscribers,Video views      
## 5: Rank,Grade,Channel.name,Video.Uploads,Subscribers,Video.views      
## 6: Rank,Grade,Channel.name,Video.Uploads,Subscribers,Video.views      
## Total: 6MB
setkey(Data , Grade , Rank)
tables()
##          NAME  NROW NCOL MB
## 1:       Data 3,787    6  1
## 2:      Data1 4,393    6  1
## 3:   mydata01 3,787    6  1
## 4:       tmp2 3,787    6  1
## 5:   to_table 4,393    6  1
## 6: to_table_2 4,393    6  1
##                                                             COLS
## 1: Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
## 2: Rank,Grade,Channel.name,Video.Uploads,Subscribers,Video.views
## 3: Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
## 4: Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
## 5: Rank,Grade,Channel.name,Video.Uploads,Subscribers,Video.views
## 6: Rank,Grade,Channel.name,Video.Uploads,Subscribers,Video.views
##           KEY
## 1: Grade,Rank
## 2:           
## 3: Grade,Rank
## 4:           
## 5:           
## 6:           
## Total: 6MB
#Data[J("A++", "1st")]

#Data[J("A++")]


  • Condition & Summarise


# Data[Grade == "A++", mean(`Video Uploads`)]
# Data[J("A++") , mean(`Video Uploads`)]


Summarise







  • .N : Count
  • D[,][,] Pipe 연산자와 같은 효과
# Data[, .(Rank, Grade)][,.N, by=Grade]
# Data[, tmp := "NULL"][, tmp := NULL]

.SD , .SDcols





  • .SD : Subset of Data
# Data[, .(mean(`Subscribers`), mean(`Video views`))]
# Data[, lapply(.SD, mean)]
# Data[, lapply(.SD, mean), .SDcols = c("Subscribers","Video views")]


Groupby







#names(Data)
#Data[ , mean(`Video Uploads`) , by ="Grade"]
#Data[ , max(`Video Uploads`) , by ="Grade"]
#Data[ , list(min(`Video views`) , max(`Video views`)) , by ="Grade"]


Mutate





  • D[ , key := func(value) , ]


## 1번째 방법

#Data[ , tmp := "Null"]
#Data[ , `:=`(tmp , "NULL")]

#Data2 <- Data[ , tmp := ifelse(`Video views` > 10^10 , "Top" , "Down")]


## 2번째 방법

#Data2[ , tmp := "Top"]
#Data2[`Video views` <= 10^10 , `:=`(tmp, "Bottom")]


#head(Data2)

#Data2[ , tmp :=NULL]

#names(Data2)


Reshape


dcast





DT <- data.table(mtcars)

dcast(DT, gear ~ cyl, 
      value.var = c("disp", "hp"))
## Aggregate function missing, defaulting to 'length'
##    gear disp_4 disp_6 disp_8 hp_4 hp_6 hp_8
## 1:    3      1      2     12    1    2   12
## 2:    4      8      4      0    8    4    0
## 3:    5      2      1      2    2    1    2
DT2 <- dcast(DT, gear ~ cyl, 
      value.var = c("disp", "hp"),
      fun = list(mean, sum))

melt





melt(DT2,
     id.vars = c("gear"),
     measure.vars = patterns("^disp", "^hp"),
     variable.name = "cyl",
     value.name = c("disp", "hp"))
##     gear cyl      disp        hp
##  1:    3   1  120.1000   97.0000
##  2:    4   1  102.6250   76.0000
##  3:    5   1  107.7000  102.0000
##  4:    3   2  241.5000  107.5000
##  5:    4   2  163.8000  116.5000
##  6:    5   2  145.0000  175.0000
##  7:    3   3  357.6167  194.1667
##  8:    4   3       NaN       NaN
##  9:    5   3  326.0000  299.5000
## 10:    3   4  120.1000   97.0000
## 11:    4   4  821.0000  608.0000
## 12:    5   4  215.4000  204.0000
## 13:    3   5  483.0000  215.0000
## 14:    4   5  655.2000  466.0000
## 15:    5   5  145.0000  175.0000
## 16:    3   6 4291.4000 2330.0000
## 17:    4   6    0.0000    0.0000
## 18:    5   6  652.0000  599.0000


Merging / Joins




(dt1 <- data.table(A = letters[rep(1:3, 2)], X = 1:6, key = "A"))
##    A X
## 1: a 1
## 2: a 4
## 3: b 2
## 4: b 5
## 5: c 3
## 6: c 6
(dt2 <- data.table(A = letters[rep(2:4, 2)], Y = 6:1, key = "A"))
##    A Y
## 1: b 6
## 2: b 3
## 3: c 5
## 4: c 2
## 5: d 4
## 6: d 1
  • inner join
merge(dt1, dt2, by="A")
##    A X Y
## 1: b 2 6
## 2: b 2 3
## 3: b 5 6
## 4: b 5 3
## 5: c 3 5
## 6: c 3 2
## 7: c 6 5
## 8: c 6 2
  • left join
merge(dt1, dt2, by="A", all.x = TRUE)
##     A X  Y
##  1: a 1 NA
##  2: a 4 NA
##  3: b 2  6
##  4: b 2  3
##  5: b 5  6
##  6: b 5  3
##  7: c 3  5
##  8: c 3  2
##  9: c 6  5
## 10: c 6  2
  • Right Join
merge(dt1, dt2, by="A", all.y = TRUE)
##     A  X Y
##  1: b  2 6
##  2: b  2 3
##  3: b  5 6
##  4: b  5 3
##  5: c  3 5
##  6: c  3 2
##  7: c  6 5
##  8: c  6 2
##  9: d NA 4
## 10: d NA 1
  • Full Join
merge(dt1, dt2, all=TRUE)
##     A  X  Y
##  1: a  1 NA
##  2: a  4 NA
##  3: b  2  6
##  4: b  2  3
##  5: b  5  6
##  6: b  5  3
##  7: c  3  5
##  8: c  3  2
##  9: c  6  5
## 10: c  6  2
## 11: d NA  4
## 12: d NA  1


Rename



colnames(tmp2)
## [1] "Rank"          "Grade"         "Channel name"  "Video Uploads"
## [5] "Subscribers"   "Video views"
setnames(tmp2, c("Rank"), c("랭크"))
colnames(tmp2)
## [1] "랭크"          "Grade"         "Channel name"  "Video Uploads"
## [5] "Subscribers"   "Video views"

Example


## 2개 변수 같이 고려하기 

#Data2[,mean_upload := mean(`Video Uploads`)]


#myfun2 = function(arg1,arg2) {
#  temp1 <- ifelse(arg1 >= arg2, "평균 이상", "평균 미만")
#  temp2 <- ifelse(arg1 > arg2, arg1 - arg2, arg1 / arg2)
#  list(temp1,temp2)
#}

#Data2 <- Data2[, c(paste0("tmp", 4:5)) := myfun2(`Video Uploads` , mean_upload)]
#head(Data2)


#Data2[Grade == "B+", mean(Subscribers), by= "tmp4" ]
#Data2[Grade == "B+", .N , by= "tmp4"]

#Data2[Grade == "A", mean(Subscribers), by= "tmp4"]
#Data2[Grade == "A", .N, by= "tmp4"]
#Data2[,c(paste0("tmp", 4:5)) := list(NULL , NULL)]


#Data2[Grade == "A", .N]
#Data2[, length(which(Grade == "A"))]


728x90

'분석 R' 카테고리의 다른 글

[R] magick package 설치 에러 (ubuntu)  (0) 2020.05.26
R 최신 버전 설치 관련 자료  (0) 2020.04.29
[ R ] roc curve 패키지 비교  (0) 2019.05.01
Tidyverse (dplyr , tidyr)  (0) 2019.03.17
알고리즘 체인과 파이프라인  (0) 2018.01.25