Tidyverse (ggplot)
2019. 3. 19. 00:25ㆍ분석 R/EDA
<html 필요하시면 댓글에 글 남겨주세요.>
2. Data Visualization
2.1 Grammar of Graphics by ggplot2 package
- developed by Hadley Wickham
- based on the philosophy of hierarchical architecture
Library Loading
# Attach MASS (source of the Cars93 data used further down) BEFORE tidyverse.
# Packages attached later win name clashes, so loading MASS last would let
# MASS::select() mask dplyr::select().
library(MASS)
library(tidyverse)
#library(ggplot2)
2.2 Understanding ggplot2 via iris data
2.2.1 iris data
# Preview the first rows of the built-in iris data set.
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Show the structure: four numeric measurement columns plus the Species factor.
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
2.2.2 Point plot = scatter plot
# One-shot form: data, aesthetic mapping and the point layer in one expression.
ggplot(iris, mapping = aes(Sepal.Length, Sepal.Width)) +
  geom_point()

# Keep the layer-less plot in a variable so different layers can be added to it.
myplot <- ggplot(iris, mapping = aes(Sepal.Length, Sepal.Width))
myplot + geom_point()                # points only
myplot + geom_line()                 # line only
myplot + geom_point() + geom_line()  # both layers on the same plot
size
# Constant size: every point is drawn at size 3; colour follows Species.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point(size = 3)
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point(size = 3)

# Mapped size: inside aes(), so the point size varies with Sepal.Length.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(mapping = aes(size = Sepal.Length))
color
# Constant colour: set outside aes(), so every point is drawn in skyblue.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(color = "skyblue")

# Mapped colour: inside aes(), so colour follows Species; alpha stays constant.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(mapping = aes(color = Species), alpha = 0.5)
alpha
# Constant transparency applied to every point.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(alpha = 0.6)

# Transparency mapped to a variable: alpha varies with Sepal.Length.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(mapping = aes(alpha = Sepal.Length))
shape
# Constant shape: every point uses plotting symbol 5.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(shape = 5)

# Mapped shape: one plotting symbol per Species level.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(mapping = aes(shape = Species))
Example
# Several aesthetics on one point layer: shape, alpha and fill are mapped to
# data, while the colour is fixed to red for every point.
# NOTE(review): fill is presumably only drawn for the fillable point shapes
# (21-25), so it may have no visible effect on the points here — confirm.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point(
    mapping = aes(shape = Species, alpha = Sepal.Length, fill = Sepal.Length),
    color = "red"
  )
2.2.3 Box plot
# Plain box plot of Petal.Length per Species.
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_boxplot()

# Overlay the individual observations, jittered to reduce overplotting.
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_boxplot() +
  geom_jitter()

# Same plot with the boxes filled by Species.
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_boxplot(mapping = aes(fill = Species)) +
  geom_jitter()

# Same again with the x and y axes swapped.
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_boxplot(mapping = aes(fill = Species)) +
  geom_jitter() +
  coord_flip()
2.2.4 Bar plot
- stat = "count"
- 해당 값의 빈도를 나타냄
- stat = "identity"
- aes(x,y) x에 해당하는 y값
# stat = "count": bar height is the frequency of each x value.
ggplot(iris, aes(x = Species)) +
  geom_bar(stat = "count")
ggplot(iris, aes(x = Petal.Length)) +
  geom_bar(stat = "count")
ggplot(iris, aes(x = Species)) +
  geom_bar(mapping = aes(fill = Species), stat = "count")

# stat = "identity": bar height is taken from the y aesthetic.
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_bar(stat = "identity")
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_bar(stat = "identity", position = "stack")
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_bar(mapping = aes(fill = Species), stat = "identity")

# Same filled bars with the x and y axes swapped.
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_bar(mapping = aes(fill = Species), stat = "identity") +
  coord_flip()
gather를 사용해 변수를 모아서 한번에 표현하기
# gather() is a tidyr function.
# library(tidyr)
# Collect the first four columns into key/value pairs; Species is kept as is.
long_iris <- iris %>% gather(key = "key", value = "value", 1:4)
head(long_iris)
## Species key value
## 1 setosa Sepal.Length 5.1
## 2 setosa Sepal.Length 4.9
## 3 setosa Sepal.Length 4.7
## 4 setosa Sepal.Length 4.6
## 5 setosa Sepal.Length 5.0
## 6 setosa Sepal.Length 5.4
- position
stack
fill
dodge
# position = "stack": the bars for each key are stacked on top of each other.
ggplot(long_iris, aes(x = Species, y = value)) +
  geom_bar(mapping = aes(fill = key), stat = "identity", position = "stack")

# position = "fill"
ggplot(long_iris, aes(x = Species, y = value)) +
  geom_bar(mapping = aes(fill = key), stat = "identity", position = "fill")

# position = "dodge"
ggplot(long_iris, aes(x = Species, y = value)) +
  geom_bar(mapping = aes(fill = key), stat = "identity", position = "dodge")
- 주의
- Default : position = "stack"
- y value를 누적해서 쌓은 값이 y축값이 됨.
- Default : position =
# No position argument is given, so the default ("stack") is used.
ggplot(long_iris, aes(x = Species, y = value)) +
  geom_bar(
    mapping = aes(fill = key),
    stat = "identity",
    alpha = 0.9,
    col = "red"
  )
Example
- 주의
- 그림에서 보이는 바와 같이 아래쪽에는 진하게 표시되고 위로 갈수록 하얘짐
- 이유 : y값들이 중복적으로 겹쳐서 진하게 표시되는 것.
# Drop four columns, then reshape every remaining column except gear into
# variable/value pairs.
mtcars2 <- dplyr::select(mtcars, -c(vs, am, cyl, carb))
long <- mtcars2 %>% gather(key = "variable", value = "value", -gear)

# Dodged bars, drawn nearly transparent with a yellow outline.
base <- ggplot(long, aes(x = gear, y = value, fill = variable))
base +
  geom_bar(
    stat = "identity",
    position = "dodge",
    color = "yellow",
    alpha = 0.1
  )

# List the rows that feed a single bar: every disp value where gear is 3.
long %>% filter(variable == "disp", gear == 3)
## gear variable value
## 1 3 disp 258.0
## 2 3 disp 360.0
## 3 3 disp 225.0
## 4 3 disp 360.0
## 5 3 disp 275.8
## 6 3 disp 275.8
## 7 3 disp 275.8
## 8 3 disp 472.0
## 9 3 disp 460.0
## 10 3 disp 440.0
## 11 3 disp 120.1
## 12 3 disp 318.0
## 13 3 disp 304.0
## 14 3 disp 350.0
## 15 3 disp 400.0
2.2.5 Trellis plot
- facet_grid(x~y)
- 격자 표시로 비교하기가 용이함.
- ncol 조정 불가능
- 꽉 찬 격자형태로만 나옴.
- facet_wrap(x~y)
- 개별적으로 그려주기 때문에 비교하기가 용이하지 않음.
- 빈공간이 나올 수 있음.
- ncol 조정 가능
# One scatter-plot panel per Species, laid out side by side.
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(size = 3) +
  facet_grid(. ~ Species)
# A linear regression can be fitted separately for each facet.
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(mapping = aes(color = Species), size = 3) +
  facet_grid(. ~ Species) +
  stat_smooth(method = "lm")
# The theme can be customised as well: here the major grid lines are gray.
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(mapping = aes(color = Species), size = 3) +
  facet_grid(. ~ Species) +
  stat_smooth(method = "lm") +
  theme(
    panel.grid.major = element_line(color = "gray")
  )
# facet_grid: cyl defines the rows and am the columns of the panel grid.
ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point(colour = "black", shape = 21) +
  facet_grid(cyl ~ am)
# facet_wrap: one panel per cyl/am combination, wrapped into 3 columns.
ggplot(mtcars, aes(x = mpg, y = disp)) +
  geom_point(colour = "black", shape = 21) +
  facet_wrap(cyl ~ am, ncol = 3)
- 비교 facet_wrap , facet_grid
# facet_grid on Cars93: Origin defines the rows, Type the columns.
ggplot(Cars93, aes(x = Weight, y = MPG.highway)) +
  geom_point(colour = "black", shape = 21) +
  facet_grid(Origin ~ Type)

# facet_wrap on the same data: panels wrapped into 6 columns.
ggplot(Cars93, aes(x = Weight, y = MPG.highway)) +
  geom_point(colour = "black", shape = 21) +
  facet_wrap(Origin ~ Type, ncol = 6)
********************************
728x90
'분석 R > EDA' 카테고리의 다른 글
Data Handling Practice (0) | 2019.04.18 |
---|---|
Kaggle BlackFriday 데이터를 활용한 EDA (0) | 2019.03.16 |
Kaggle 올림픽 데이터를 활용한 EDA 2번째 (0) | 2019.03.16 |
Kaggle 올림픽 데이터를 활용한 EDA 1번째 (0) | 2019.03.16 |