> head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
> cor(iris[1:4])   # check the correlation coefficients
             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
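The strong correlations between the petal and sepal measurements are what make dimensionality reduction attractive here. A quick way to see the same structure visually is a scatterplot matrix; this is only an illustrative sketch using base graphics, not part of the original session:

pairs(iris[1:4], col = iris$Species,
      main = "Pairwise scatterplots of the four iris measurements")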
> log.ir <- log(iris[, 1:4])   # the data are skewed, so log-transform them
> ir.species <- iris[, 5]      # Species is the target (y) variable
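Before trusting the log transform, it can help to look at the raw distributions; the sketch below (base graphics, illustrative only) draws one histogram per measurement so any skew is visible:

par(mfrow = c(2, 2))
for (col in names(iris)[1:4]) {
  hist(iris[[col]], main = col, xlab = col)
}
par(mfrow = c(1, 1))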
> # Principal component analysis
> ir.pca <- prcomp(log.ir, center = TRUE, scale. = TRUE)
> print(ir.pca)
Standard deviations (1, .., p=4):
[1] 1.7124583 0.9523797 0.3647029 0.1656840

Rotation (n x k) = (4 x 4):
                    PC1         PC2        PC3         PC4
Sepal.Length  0.5038236 -0.45499872  0.7088547  0.19147575
Sepal.Width  -0.3023682 -0.88914419 -0.3311628 -0.09125405
Petal.Length  0.5767881 -0.03378802 -0.2192793 -0.78618732
Petal.Width   0.5674952 -0.03545628 -0.5829003  0.58044745
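The printed standard deviations are the square roots of each component's variance; dividing their squares by the total gives the proportion of variance each PC explains, which should reproduce the Proportion of Variance row in summary(ir.pca) further below. An illustrative sketch:

pve <- ir.pca$sdev^2 / sum(ir.pca$sdev^2)
round(pve, 4)          # proportion of variance per component
round(cumsum(pve), 4)  # cumulative proportion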
> plot(ir.pca, type = "l")
> # The curve bends sharply at one point; this is called the elbow point,
> # and the principal components before that point are the ones usually kept.
>
> summary(ir.pca)
Importance of components:
                          PC1    PC2     PC3     PC4
Standard deviation     1.7125 0.9524 0.36470 0.16568
Proportion of Variance 0.7331 0.2268 0.03325 0.00686
Cumulative Proportion  0.7331 0.9599 0.99314 1.00000
> # The Cumulative Proportion row shows that PC1 alone explains about 73% of the variance,
> # and adding PC2 raises the cumulative total to about 96%.
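To see where the cumulative explained variance levels off, it can also be plotted directly; a minimal base-graphics sketch (the 95% reference line is just an illustrative threshold):

cum_pve <- cumsum(ir.pca$sdev^2) / sum(ir.pca$sdev^2)
plot(cum_pve, type = "b", xlab = "Principal component",
     ylab = "Cumulative proportion of variance", ylim = c(0, 1))
abline(h = 0.95, lty = 2)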
>
> # Build the linear combinations by matrix-multiplying the original data with the loadings.
> # The loadings (linear coefficients) are stored in ir.pca$rotation.
> PRC <- as.matrix(log.ir) %*% ir.pca$rotation
> head(PRC)
            PC1       PC2      PC3        PC4
[1,] -0.2772209 -1.809493 1.604387 -1.0010840
[2,] -0.2507663 -1.654229 1.627078 -0.9946772
[3,] -0.3340210 -1.690148 1.592416 -0.9502831
[4,] -0.2527176 -1.656968 1.556306 -1.0640079
[5,] -0.2957159 -1.825531 1.581020 -1.0074464
[6,]  0.2242011 -1.962854 1.162457 -0.7503219
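A note on this step: because prcomp() was called with center = TRUE and scale. = TRUE, the scores it computes internally come from the centered and scaled data, so they are not identical to PRC above (which multiplies the raw log values by the loadings). If the standardized scores are wanted instead, they are already available; a small sketch of two equivalent ways to get them:

scores_internal <- ir.pca$x                          # scores stored by prcomp
scores_predict  <- predict(ir.pca, newdata = log.ir) # same values via predict()
all.equal(scores_internal, scores_predict, check.attributes = FALSE)  # should be TRUE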
>
> train1 <- cbind(ir.species, as.data.frame(PRC))
> train1[, 1] <- as.factor(train1[, 1])
> colnames(train1)[1] <- "label"
>
> head(train1)
   label        PC1       PC2      PC3        PC4
1 setosa -0.2772209 -1.809493 1.604387 -1.0010840
2 setosa -0.2507663 -1.654229 1.627078 -0.9946772
3 setosa -0.3340210 -1.690148 1.592416 -0.9502831
4 setosa -0.2527176 -1.656968 1.556306 -1.0640079
5 setosa -0.2957159 -1.825531 1.581020 -1.0074464
6 setosa  0.2242011 -1.962854 1.162457 -0.7503219
>
> # Regression analysis
> fit1 <- lm(label ~ PC1 + PC2, data = train1)
Warning messages:
1: In model.response(mf, "numeric") :
  using type = "numeric" with a factor response will be ignored
2: In Ops.factor(y, z$residuals) : ‘-’ not meaningful for factors
> fit1

Call:
lm(formula = label ~ PC1 + PC2, data = train1)

Coefficients:
(Intercept)          PC1          PC2
     0.6696       0.7680      -0.2548
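The warnings arise because the response passed to lm() is a factor; the model still fits, but it effectively uses the factor's underlying integer codes (setosa = 1, versicolor = 2, virginica = 3), which is why the fitted values below land near 1, 2 and 3. A sketch that makes this explicit and avoids the warnings (the coefficients should match fit1, under that assumption):

fit1_explicit <- lm(as.numeric(label) ~ PC1 + PC2, data = train1)
coef(fit1_explicit)   # compare with coef(fit1)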
> # To check how well this regression model predicts, run predict() on the training data.
> fit1_pred <- predict(fit1, newdata = train1)
> fit1_pred
        1         2         3         4         5         6         7
0.9177087 0.8984698 0.8436802 0.8976691 0.9075902 1.3418786 1.0463684
        8         9        10        11        12        13        14
0.9390723 0.8445396 0.6210744 0.9772913 0.9476893 0.5797361 0.4155148
       15        16        17        18        19        20        21
0.9120119 1.3118574 1.2207336 1.0980907 1.2412319 1.1287785 1.0342953
       22        23        24        25        26        27        28
1.2569135 0.7137137 1.4133590 1.0252950 0.9689303 1.2765822 0.9586297
       29        30        31        32        33        34        35
0.9276385 0.9374478 0.9482156 1.2861381 0.6493632 0.9546398 0.9294394
       36        37        38        39        40        41        42
0.8386486 0.9222122 0.5890660 0.8108801 0.9490303 1.0546664 1.0040768
       43        44        45        46        47        48        49
0.8105124 1.4567990 1.3635121 1.0684832 0.9775414 0.8663319 0.9678917
       50        51        52        53        54        55        56
0.9080860 2.4900652 2.4560586 2.5325228 2.2648798 2.4745413 2.3349098
       57        58        59        60        61        62        63
2.4963131 2.0029576 2.4183566 2.2572967 2.0407272 2.3843643 2.1921680
       64        65        66        67        68        69        70
2.4214215 2.2250403 2.4384336 2.3892785 2.1851043 2.4422283 2.1877138
       71        72        73        74        75        76        77
2.5254085 2.3158257 2.4880022 2.3530434 2.3724269 2.4310584 2.4857568
       78        79        80        81        82        83        84
2.5827240 2.4241655 2.1051215 2.1671554 2.1127110 2.2436308 2.5098066
       85        86        87        88        89        90        91
2.3709906 2.4519709 2.4989128 2.3762102 2.2835777 2.2644047 2.2716131
       92        93        94        95        96        97        98
2.4115164 2.2552790 2.0133592 2.2950601 2.2677511 2.3035534 2.3564617
       99       100       101       102       103       104       105
2.0222024 2.2928712 2.8051324 2.5692110 2.7806345 2.6285684 2.7492116
      106       107       108       109       110       111       112
2.8654869 2.3788516 2.7558419 2.6762160 2.8792490 2.6483604 2.6360837
      113       114       115       116       117       118       119
2.7272214 2.5747804 2.6729336 2.7201117 2.6359540 2.8982001 2.9334209
      120       121       122       123       124       125       126
2.4733191 2.7907960 2.5561109 2.8575388 2.5686743 2.7353583 2.7263118
      127       128       129       130       131       132       133
2.5511097 2.5518512 2.7052657 2.6589710 2.7723682 2.8480064 2.7259613
      134       135       136       137       138       139       140
2.5054224 2.5011639 2.8769550 2.7556452 2.6279707 2.5342278 2.7260895
      141       142       143       144       145       146       147
2.7871266 2.7407486 2.5692110 2.7990284 2.8129241 2.7349133 2.6022894
      148       149       150
2.6574971 2.7122422 2.5531536
>
> b <- round(fit1_pred)   # round predictions to the nearest integer
>
> b[b == 0 | b == 1] <- "setosa"
> b[b == 2] <- "Versicolor"
> b[b == 3] <- "Virginica"
>
> a <- ir.species
> table(b, a)
             a
b             setosa versicolor virginica
  setosa          50          0         0
  Versicolor       0         46         2
  Virginica        0          4        48
>
> # setosa:     50 of 50 predicted correctly
> # versicolor: 46 of 50 predicted correctly
> # virginica:  48 of 50 predicted correctly
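As a single summary number, the overall accuracy can be read off the diagonal of that confusion table; a small sketch, assuming the row/column order shown above:

tab <- table(b, a)
sum(diag(tab)) / sum(tab)   # (50 + 46 + 48) / 150 = 0.96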
Source: https://m.blog.naver.com/PostView.nhn?blogId=leedk1110&logNo=220783514855&proxyReferer=https%3A%2F%2Fwww.google.co.kr%2F