Code R.

Transcription

Code R.
## Read the fuel data from a comma-separated file whose first row holds
## the column names, then display the resulting data frame.
carburant <- read.csv(
  "exemples/fuel2001.csv",
  header = TRUE,
  sep = ","
)
carburant
+ carburant
    X  Drivers    FuelC Income  Miles      MPC      Pop   Tax
1  AL  3559897  2382507  23471  94440 12737.00  3451586 18.00
2  AK   472211   235400  30064  13628  7639.16   457728  8.00
3  AZ  3550367  2428430  25578  55245  9411.55  3907526 18.00
4  AR  1961883  1358174  22257  98132 11268.40  2072622 21.70
5  CA 21623793 14691753  32275 168771  8923.89 25599275 18.00
6  CO  3287922  2048664  32949  85854  9722.73  3322455 22.00
7  CT  2650374  1458279  40640  20910  9021.35  2651452 25.00
8  DE   564099   382043  31255   5814 10891.30   610269 23.00
9  DC   328094   148769  37383   1534  6555.94   468575 20.00
10 FL 12743403  7471117  28145 117299  9531.23 12741821 13.60
11 GA  5833802  4693703  27940 115534 13248.60  6250708  7.50
12 HI   787820   404684  28221   4278  7108.75   949184 16.00
13 ID   896666   609051  24180  46310 10879.40   969166 25.00
14 IL  7809500  5015217  32259 138359  8239.09  9530327 19.00
15 IN  4116924  3120985  27011  94038 12916.90  4682392 15.00
16 IA  1978748  1475812  26723 113437 10258.40  2281002 20.00
17 KS  1871301  1236951  27816 134725 10656.70  2058489 21.00
18 KY  2756634  2085629  24294  78914 11301.70  3161283 16.40
19 LA  2718209  2151437  23334  60829  9505.31  3394854 20.00
20 ME   942556   590093  25623  22672 11320.00  1010273 22.00
21 MD  3451966  2460545  33872  30622  9673.67  4085342 23.50
22 MA  4610666  2720510  37992  35408  8310.86  5008007 21.00
23 MI  6976982  4904689  29612 121790  9959.45  7628170 19.00
24 MN  2961236  2545530  32101 132280 10728.30  3782817 20.00
25 MS  1859487  1476477  20993  73701 12286.80  2160165 18.40
26 MO  3862300  2958880  27445 124324 12012.80  4292175 17.00
27 MT   683351   467567  22569  69503 10965.00   701423 27.00
28 NE  1267284   812247  27829  92766 10567.40  1314974 24.50
29 NV  1420714   945643  30529  38658  9796.15  1537896 24.75
30 NH   941829   662475  33332  15508  9923.45   960593 19.50
31 NJ  5715089  3911837  36983  36175  8099.59  6545471 10.50
32 NM  1231701   885829  22203  59883 12571.40  1370134 18.50
33 NY 11014805  5536612  34547 112961  6876.49 14797284 22.00
34 NC  5884651  4060592  27194 101195 11186.00  6291182 24.10
35 ND   455921   334544  25068  86591 11322.40   502176 21.00
36 OH  7736115  5028276  28400 117267  9371.29  8789530 22.00
37 OK  2172394  1751701  23517 112694 12612.90  2665966 17.00
38 OR  2534464  1487269  28350  66784  9907.26  2673283 24.00
39 PA  8226202  5024671  29539 119985  8092.07  9693987 26.00
40 RI   660435   399113  29685   6053  7625.00   827474 29.00
41 SC  2849885  2217141  24321  66167 11469.60  3115130 16.00
42 SD   544997   402472  26115  83560 11298.90   577391 22.00
43 TN  4188317  2837567  26239  87826 11610.60  4445987 20.00
44 TX 13045727 10637488  27871 300767 10458.40 15618097 20.00
45 UT  1495887   945531  23907  42208 11291.30  1598531 24.50
46 VT   515348   331183  26901  14291 15688.40   479265 20.00
47 VA  4920753  3765718  31162  70721 10259.50  5529436 17.50
48 WA  4237845  2622633  31528  80985  8981.59  4552631 23.00
49 WV  1316955   818516  21915  36997 10946.10  1455370 25.65
50 WI  3667497  2418289  28232 112663 10337.40  4156609 27.30
51 WY   370713   321847  27230  27292 17494.90   381882 14.00
## Create derived variables: FuelC and Drivers, each divided by Pop.
carburant$Fuel <- with(carburant, FuelC / Pop)
carburant$Dlic <- with(carburant, Drivers / Pop)
names(carburant)
 [1] "X"       "Drivers" "FuelC"   "Income"  "Miles"   "MPC"     "Pop"     "Tax"
 [9] "Fuel"    "Dlic"
## Pick the variables to use, by column position in `carburant`.
vars.imp <- c(9, 8, 10, 4, 5)
names(carburant)[vars.imp]
[1] "Fuel"   "Tax"    "Dlic"   "Income" "Miles"
## Working data frame restricted to the selected columns; show rows 1 to 10.
carbrnt <- carburant[, vars.imp]
carbrnt[seq_len(10), ]
        Fuel  Tax      Dlic Income  Miles
1  0.6902644 18.0 1.0313801  23471  94440
2  0.5142792  8.0 1.0316411  30064  13628
3  0.6214751 18.0 0.9085972  25578  55245
4  0.6552927 21.7 0.9465706  22257  98132
5  0.5739129 18.0 0.8447033  32275 168771
6  0.6166115 22.0 0.9896062  32949  85854
7  0.5499926 25.0 0.9995934  40640  20910
8  0.6260239 23.0 0.9243448  31255   5814
9  0.3174924 20.0 0.7001953  37383   1534
10 0.5863461 13.6 1.0001242  28145 117299
### Look at the basic descriptive statistics of every selected variable.
summary(carbrnt)
      Fuel             Tax             Dlic            Income          Miles
 Min.   :0.3175   Min.   : 7.50   Min.   :0.7002   Min.   :20993   Min.   :  1534
 1st Qu.:0.5750   1st Qu.:18.00   1st Qu.:0.8641   1st Qu.:25323   1st Qu.: 36586
 Median :0.6260   Median :20.00   Median :0.9091   Median :27871   Median : 78914
 Mean   :0.6131   Mean   :20.15   Mean   :0.9037   Mean   :28404   Mean   : 77419
 3rd Qu.:0.6666   3rd Qu.:23.25   3rd Qu.:0.9430   3rd Qu.:31208   3rd Qu.:112828
 Max.   :0.8428   Max.   :29.00   Max.   :1.0753   Max.   :40640   Max.   :300767
### Look at the correlation plot (the scatterplot matrix).
## NOTE(review): the PDF device opened here is not closed in this part of
## the script, so later plots keep going to it until another device is
## opened -- confirm that this is intended.
pdf("exemples/carburantGraphiques.pdf")
pairs(carbrnt, pch = 16)
### Correlation matrix, rounded to 3 decimals.
round(cor(carbrnt), 3)
         Fuel    Tax   Dlic Income  Miles
Fuel    1.000 -0.259  0.469 -0.464  0.208
Tax    -0.259  1.000 -0.086 -0.011 -0.065
Dlic    0.469 -0.086  1.000 -0.176 -0.212
Income -0.464 -0.011 -0.176  1.000 -0.135
Miles   0.208 -0.065 -0.212 -0.135  1.000
## The variable most strongly correlated with Fuel is Dlic, so start with
## a model of Fuel on a smooth of Dlic alone.
library(mgcv)
carbrnt.mgcv1 <- gam(Fuel ~ s(Dlic), data = carbrnt)
summary(carbrnt.mgcv1)
Family: gaussian
Link function: identity

Formula:
Fuel ~ s(Dlic)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  0.61313    0.01012   60.56   <2e-16 ***
---
Approximate significance of smooth terms:
          edf Ref.df     F  p-value
s(Dlic) 2.636  3.335 7.832 0.000156 ***
---
R-sq.(adj) =   0.34   Deviance explained = 37.4%
GCV = 0.005628  Scale est. = 0.0052267  n = 51
## Residual plots.
## Helper that draws the two residual diagnostics of a fitted model, one
## above the other: residuals against fitted values (with a horizontal
## line at zero), then a normal Q-Q plot of the residuals with its
## reference line.
##
## Arguments:
##   data.lm  fitted model object; only fitted() and resid() are called on it.
##   titre    main title of the residuals-vs-fitted plot (default "").
##
## Side effect: the layout of the current device is set to 2 rows x 1
## column while drawing and is put back to a single panel afterwards.
## Value: the invisible result of the final par() call, as before.
plotResidus <- function(data.lm, titre = "") {
  par(mfrow = c(2, 1))
  # Safety net: if one of the plotting calls below fails, the device must
  # not be left in the 2 x 1 layout.
  on.exit(par(mfrow = c(1, 1)), add = TRUE)

  plot(
    fitted(data.lm), resid(data.lm),
    pch = 16, cex = 1.5,
    main = titre,
    xlab = "VALEURS PREDITES", ylab = "RESIDUS"
  )
  abline(h = 0)

  qqnorm(
    resid(data.lm),
    pch = 16,
    xlab = "QUANTILES THEORIQUES",
    ylab = "QUANTILES EMPIRIQUES"
  )
  qqline(resid(data.lm))

  # Normal-path reset; kept as the last expression so the function's
  # value is unchanged.
  par(mfrow = c(1, 1))
}
## Diagnostics for the Dlic-only model: custom residual plots, then the
## checks provided by mgcv.
plotResidus(carbrnt.mgcv1, titre = "CONSOMMATION DE CARBURANT")
gam.check(carbrnt.mgcv1)
## Scatterplot of Fuel against Dlic, overlaid with a smoothing spline
## drawn through the fitted values of the model.
plot(Fuel ~ Dlic, pch = 16, data = carbrnt)
lines(
  smooth.spline(
    carbrnt$Dlic,
    fitted(carbrnt.mgcv1)
  )
)
## Tax and Dlic are not strongly correlated.
## Could Tax help to better explain fuel consumption?
## Regression of Tax on a smooth of Dlic.
tax.gam <- gam(Tax ~ s(Dlic), data = carbrnt)
summary(tax.gam)
Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  20.1549     0.6405   31.47   <2e-16 ***
---
Approximate significance of smooth terms:
        edf Ref.df     F p-value
s(Dlic)   1      1 0.364   0.549

R-sq.(adj) = -0.0129   Deviance explained = 0.737%
GCV = 21.775  Scale est. = 20.921  n = 51
4
## Residual diagnostics for the regression of Tax on Dlic.
## The title is kept on one line so that no line break ends up inside
## the string literal.
plotResidus(tax.gam, titre = "REGRESSION DE Tax EN FONCTION DE Dlic")
#################################################################
#
## What Dlic leaves unexplained in the regression of Fuel.
err.fuel.dlic <- resid(carbrnt.mgcv1)
## What Dlic leaves unexplained in the regression of Tax.
err.tax.dlic <- resid(tax.gam)
## Added-variable model: residuals of Fuel on a smooth of the residuals of Tax.
tax.ajoutee.gam <- gam(err.fuel.dlic ~ s(err.tax.dlic))
summary(tax.ajoutee.gam)
Parametric coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.976e-16  9.266e-03       0        1

Approximate significance of smooth terms:
                  edf Ref.df     F p-value
s(err.tax.dlic) 1.886  2.359 2.938  0.0478 *
---
R-sq.(adj) =  0.116   Deviance explained = 14.9%
GCV = 0.004641  Scale est. = 0.0043784  n = 51
## Residual diagnostics of the added-variable model. The title is written
## on one line (no line break inside the literal), matching the title of
## the plot below.
plotResidus(tax.ajoutee.gam, titre = "VARIABLE AJOUTEE: Tax APRES Dlic")
## Added-variable plot: residuals of Fuel against residuals of Tax (both
## taken from models on Dlic), on a new device with a 2 x 1 layout.
dev.new()
par(mfrow = c(2, 1))
plot(
  err.tax.dlic, err.fuel.dlic,
  pch = 16, cex = 1.5,
  main = "VARIABLE AJOUTEE: Tax APRES Dlic",
  xlab = "residus de Tax ajustée par Dlic",
  ylab = "residus de Fuel ajustée par Dlic"
)
## Overlay a smoothing spline through the fitted values of the
## added-variable model.
lines(smooth.spline(err.tax.dlic, fitted(tax.ajoutee.gam)), lwd = 2)
#################################################################
#####
## Regression of Fuel on smooths of Dlic and Tax.
carbrnt.mgcv2 <- gam(
  Fuel ~ s(Dlic) + s(Tax),
  data = carbrnt
)
summary(carbrnt.mgcv2)
5
Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.613129   0.008792   69.74   <2e-16 ***
---
Approximate significance of smooth terms:
          edf Ref.df     F  p-value
s(Dlic) 7.172  8.206 5.651 3.85e-05 ***
s(Tax)  1.925  2.379 4.631   0.0108 *
---
R-sq.(adj) =  0.502   Deviance explained = 59.2%
GCV = 0.0049156  Scale est. = 0.0039424  n = 51
## Residual diagnostics for the Dlic + Tax model (title kept on one line
## so that no line break ends up inside the string literal).
plotResidus(carbrnt.mgcv2, titre = "CONSOMMATION DE CARBURANT: Dlic + Tax")
## Plot the terms of the model in a 2 x 1 layout.
par(mfrow = c(2, 1))
plot(carbrnt.mgcv2, pch = 16)
## Miles is not strongly correlated with Dlic and Tax.
## Could Miles help to better explain fuel consumption?
## What Dlic and Tax leave unexplained in the regression of Fuel.
err.fuel.dlictax <- resid(carbrnt.mgcv2)
## Regression of Miles on smooths of Dlic and Tax.
Miles.gam <- gam(Miles ~ s(Dlic) + s(Tax), data = carbrnt)
summary(Miles.gam)
Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)    77419       7118   10.88 2.48e-14 ***
---
Approximate significance of smooth terms:
          edf Ref.df     F p-value
s(Dlic) 2.219  2.814 1.430   0.197
s(Tax)  1.559  1.927 0.496   0.548

R-sq.(adj) =  0.0794   Deviance explained = 14.9%
GCV = 2.8515e+09  Scale est. = 2.5843e+09  n = 51
#################################################################
######
##variable ajoutée
6
## Added-variable model: residuals of Fuel (given Dlic and Tax) on a
## smooth of the residuals of Miles (given Dlic and Tax).
Miles.ajoutee.gam <- gam(
  err.fuel.dlictax ~ s(resid(Miles.gam)),
  data = carbrnt
)
summary(Miles.ajoutee.gam)
Parametric coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.069e-16  7.345e-03       0        1

Approximate significance of smooth terms:
                      edf Ref.df     F p-value
s(resid(Miles.gam)) 3.815  4.632 2.147  0.0742 .
---
R-sq.(adj) =  0.147   Deviance explained = 21.2%
GCV = 0.0030379  Scale est. = 0.0027511  n = 51
## Residual diagnostics of the added-variable model. The title is written
## on one line (no line break inside the literal), matching the title of
## the plot below.
plotResidus(
  Miles.ajoutee.gam,
  titre = "VARIABLE AJOUTEE: Miles APRES Dlic + Tax"
)
## Added-variable plot. The x values are the residuals of Miles after
## Dlic and Tax, not Miles itself, so the axis label says so (same wording
## as the Tax added-variable plot).
plot(
  resid(Miles.gam), err.fuel.dlictax,
  pch = 16, cex = 1.5,
  main = "VARIABLE AJOUTEE: Miles APRES Dlic + Tax",
  xlab = "residus de Miles ajustée par Dlic et Tax",
  ylab = "residus de Fuel ajustée par Dlic et Tax"
)
## Overlay a smoothing spline through the fitted values of the
## added-variable model.
lines(
  smooth.spline(resid(Miles.gam), fitted(Miles.ajoutee.gam)),
  lwd = 2
)
plot(Miles.ajoutee.gam)
#################################################################
###############
## Regression of Fuel on smooths of Dlic, Tax and Miles.
carbrnt.mgcv3 <- gam(
  Fuel ~ s(Dlic) + s(Tax) + s(Miles),
  data = carbrnt
)
summary(carbrnt.mgcv3)
Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.613129   0.008292   73.94   <2e-16 ***
---
Approximate significance of smooth terms:
           edf Ref.df     F  p-value
s(Dlic)  8.203  8.809 5.322 6.16e-05 ***
s(Tax)   1.753  2.164 4.898   0.0106 *
s(Miles) 2.504  3.038 2.055   0.1205
---
R-sq.(adj) =  0.557   Deviance explained = 66.7%
GCV = 0.0047638  Scale est. = 0.0035065  n = 51
## Plot the three terms of the full model on a new device, one per row.
dev.new()
par(mfrow = c(3, 1))
plot(carbrnt.mgcv3)
#### ANOVA: does adding s(Miles) to s(Dlic) + s(Tax) reduce the deviance?
anova(carbrnt.mgcv2, carbrnt.mgcv3)
Model 1: Fuel ~ s(Dlic) + s(Tax)
Model 2: Fuel ~ s(Dlic) + s(Tax) + s(Miles)
  Resid. Df Resid. Dev    Df Deviance
1    40.903    0.16126
2    37.539    0.13163 3.364 0.029629
## Compare the Dlic-only model with the Dlic + Tax model.
anova(carbrnt.mgcv1, carbrnt.mgcv2)
Model 1: Fuel ~ s(Dlic)
Model 2: Fuel ~ s(Dlic) + s(Tax)
  Resid. Df Resid. Dev     Df Deviance
1    47.364    0.24756
2    40.903    0.16126 6.4605 0.086299
#################################################################
###############
## Regression of Fuel on smooths of Dlic and Miles (Tax left out).
carbrnt.mgcv4 <- gam(
  Fuel ~ s(Dlic) + s(Miles),
  data = carbrnt
)
summary(carbrnt.mgcv4)
Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.613129   0.009855   62.21   <2e-16 ***
---
Approximate significance of smooth terms:
           edf Ref.df     F  p-value
s(Dlic)  2.236  2.846 9.385 5.79e-05 ***
s(Miles) 1.000  1.001 5.061   0.0291 *
---
R-sq.(adj) =  0.374   Deviance explained = 41.5%
GCV = 0.0054021  Scale est. = 0.0049533  n = 51
8
#### Compare the Dlic + Miles model with the full three-term model.
anova(carbrnt.mgcv4, carbrnt.mgcv3)
Model 1: Fuel ~ s(Dlic) + s(Miles)
Model 2: Fuel ~ s(Dlic) + s(Tax) + s(Miles)
  Resid. Df Resid. Dev     Df Deviance
1    46.763    0.23164
2    37.539    0.13163 9.2241      0.1
##### Compare the Dlic-only model with the Dlic + Miles model.
anova(carbrnt.mgcv1, carbrnt.mgcv4)
Model 1: Fuel ~ s(Dlic)
Model 2: Fuel ~ s(Dlic) + s(Miles)
  Resid. Df Resid. Dev      Df Deviance
1    47.364    0.24756
2    46.763    0.23164 0.60039 0.015922
## Semi-parametric variant: smooth term for Dlic, linear terms for Miles
## and Tax; then compare it with the model that smooths all three.
carbrnt.mgcv5 <- gam(
  Fuel ~ s(Dlic) + Miles + Tax,
  data = carbrnt
)
summary(carbrnt.mgcv5)
anova(carbrnt.mgcv5, carbrnt.mgcv3)
Analysis of Deviance Table

Model 1: Fuel ~ s(Dlic) + Miles + Tax
Model 2: Fuel ~ s(Dlic) + s(Tax) + s(Miles)
  Resid. Df Resid. Dev    Df Deviance
1    39.654    0.14708
2    37.539    0.13163 2.115 0.015446
#########################################################
## Regression tree (method = "anova") for Fuel on the four predictors.
library(rpart)
carbrnt.tree <- rpart(
  Fuel ~ Dlic + Miles + Tax + Income,
  method = "anova",
  data = carbrnt
)
## Two panels side by side; xpd = NA because otherwise on some devices
## the text is clipped.
par(mfrow = c(1, 2), xpd = NA)
summary(carbrnt.tree)
9
Call:
rpart(formula = Fuel ~ Dlic + Miles + Tax + Income, data =
carbrnt,
method = "anova")
  n= 51

          CP nsplit rel error    xerror      xstd
1 0.32187635      0 1.0000000 1.0320695 0.2993860
2 0.15276296      1 0.6781236 0.8681936 0.2120283
3 0.06191811      2 0.5253607 0.9872748 0.2483880
4 0.01709497      3 0.4634426 0.9411495 0.2264639
5 0.01000000      4 0.4463476 0.9639883 0.2384718

Variable importance
Income   Dlic  Miles    Tax
    38     28     21     14
Node number 1: 51 observations,
complexity param=0.3218764
mean=0.6131288, MSE=0.007758707
left son=2 (24 obs) right son=3 (27 obs)
Primary splits:
Income < 28042.5
to the right, improve=0.32187640, (0
missing)
Dlic
< 0.8326463 to the left, improve=0.24871170, (0
missing)
Miles < 21791
to the left, improve=0.13845350, (0
missing)
Tax
< 21.85
to the right, improve=0.07445155, (0
missing)
Surrogate splits:
Miles < 36586
to the left, agree=0.667, adj=0.292, (0
split)
Dlic < 0.854698 to the left, agree=0.647, adj=0.250, (0
split)
Tax
< 21.85
to the right, agree=0.608, adj=0.167, (0
split)
Node number 2: 24 observations,
complexity param=0.152763
mean=0.560124, MSE=0.007885153
left son=4 (7 obs) right son=5 (17 obs)
Primary splits:
Dlic
< 0.8448336 to the left, improve=0.31941570, (0
missing)
Miles < 25766
to the left, improve=0.10557910, (0
missing)
Income < 33140.5
to the right, improve=0.06470725, (0
missing)
Tax
< 19.75
to the right, improve=0.03657008, (0
missing)
Surrogate splits:
Miles < 127035
to the right, agree=0.833, adj=0.429, (0
10
split)
Node number 3: 27 observations,
complexity param=0.06191811
mean=0.6602442, MSE=0.002929105
left son=6 (20 obs) right son=7 (7 obs)
Primary splits:
Tax
< 17.5
to the right, improve=0.30979790, (0
missing)
Dlic
< 0.9429701 to the left, improve=0.13929780, (0
missing)
Income < 26812
to the left, improve=0.12501640, (0
missing)
Miles < 63498
to the left, improve=0.04143225, (0
missing)
Node number 4: 7 observations
mean=0.4819146, MSE=0.01266335
Node number 5: 17 observations
mean=0.5923278, MSE=0.00236193
Node number 6: 20 observations,
complexity param=0.01709497
mean=0.6424229, MSE=0.001382062
left son=12 (8 obs) right son=13 (12 obs)
Primary splits:
Miles < 65166
to the left, improve=0.24472050, (0
missing)
Dlic
< 0.9389165 to the left, improve=0.20149910, (0
missing)
Tax
< 20.5
to the right, improve=0.16676220, (0
missing)
Income < 25869
to the left, improve=0.04711045, (0
missing)
Surrogate splits:
Income < 25869
to the left, agree=0.70, adj=0.250, (0
split)
Dlic
< 0.9341762 to the left, agree=0.65, adj=0.125, (0
split)
Tax
< 24.3
to the right, agree=0.65, adj=0.125, (0
split)
Node number 7: 7 observations
mean=0.7111624, MSE=0.003849139
Node number 12: 8 observations
mean=0.6198989, MSE=0.001428473
Node number 13: 12 observations
mean=0.6574388, MSE=0.000787423
####################################
11
## Complexity-parameter table of the first tree.
printcp(carbrnt.tree)
Regression tree:
rpart(formula = Fuel ~ Dlic + Miles + Tax + Income, data =
carbrnt,
method = "anova")
Variables actually used in tree construction:
[1] Dlic
Income Miles Tax
Root node error: 0.39569/51 = 0.0077587
n= 51
        CP nsplit rel error  xerror    xstd
1 0.321876      0   1.00000 1.03207 0.29939
2 0.152763      1   0.67812 0.86819 0.21203
3 0.061918      2   0.52536 0.98727 0.24839
4 0.017095      3   0.46344 0.94115 0.22646
5 0.010000      4   0.44635 0.96399 0.23847
## Nicer plot of the tree (written to a PostScript file).
post(
  carbrnt.tree,
  file = "exemples/FuelTree1.ps",
  title = "ARBRE DE REGRESSION: FUEL",
  horizontal = FALSE
)
#####################################################
## Grow a deliberately large tree: no complexity penalty (cp = 0), 5
## cross-validations (xval = 5) and at least 10 observations in a node
## before a split is attempted (minsplit = 10).
## The argument was previously spelled `xvar`, which rpart.control() does
## not have; it was absorbed by `...`, so the default number of
## cross-validations was used instead of 5.
carbrnt.control <- rpart.control(cp = 0, xval = 5, minsplit = 10)
carbrnt.tree <- rpart(
  Fuel ~ Dlic + Miles + Tax + Income,
  method = "anova",
  data = carbrnt,
  control = carbrnt.control
)
printcp(carbrnt.tree)
Regression tree:
rpart(formula = Fuel ~ Dlic + Miles + Tax + Income, data =
carbrnt,
method = "anova", control = carbrnt.control)
Variables actually used in tree construction:
[1] Dlic
Income Miles Tax
Root node error: 0.39569/51 = 0.0077587
n= 51
         CP nsplit rel error  xerror    xstd
1 0.3218764      0   1.00000 1.02255 0.29468
2 0.1769079      1   0.67812 1.34004 0.35462
3 0.0812552      2   0.50122 1.07670 0.28303
4 0.0219343      3   0.41996 0.94282 0.24798
5 0.0178144      4   0.39803 1.04884 0.25173
6 0.0061884      6   0.36240 1.05389 0.25167
7 0.0038397      7   0.35621 1.06093 0.25510
8 0.0000000      8   0.35237 1.06023 0.25516
## Nicer plot of the large tree (written to a PostScript file).
post(
  carbrnt.tree,
  file = "exemples/FuelTree2.ps",
  title = "ARBRE DE REGRESSION: FUEL",
  horizontal = FALSE
)
## Save the rsq.rpart() diagnostic plots to a PDF file, then close that
## device.
pdf("exemples/FuelParametredeLissage.pdf")
rsq.rpart(carbrnt.tree)
dev.off()
## Prune at the CP value of the cptable row with the smallest
## cross-validated error (xerror).
carbrnt.prune <- prune(
  carbrnt.tree,
  cp = carbrnt.tree$cptable[which.min(carbrnt.tree$cptable[, "xerror"]), "CP"]
)
## Nicer plot of the pruned tree (written to a PostScript file).
post(
  carbrnt.prune,
  file = "exemples/FuelTreepruned.ps",
  title = "ARBRE DE REGRESSION: FUEL (élagué)",
  horizontal = FALSE
)
printcp(carbrnt.prune)
Regression tree:
rpart(formula = Fuel ~ Dlic + Miles + Tax + Income, data =
carbrnt,
method = "anova", control = carbrnt.control)
Variables actually used in tree construction:
[1] Dlic
Income Tax
Root node error: 0.39569/51 = 0.0077587
n= 51
        CP nsplit rel error  xerror    xstd
1 0.321876      0   1.00000 1.02255 0.29468
2 0.176908      1   0.67812 1.34004 0.35462
3 0.081255      2   0.50122 1.07670 0.28303
4 0.021934      3   0.41996 0.94282 0.24798