Code R.
Transcription
Code R.
# Fuel consumption analysis (fuel2001 data), part 1: load the data, derive
# the Fuel and Dlic ratios, explore the variables, and fit a first additive
# model Fuel ~ s(Dlic).
#
# Lines starting with "#>" reproduce the console output recorded in the
# original session transcript (page numbers removed, spilled table columns
# put back in place).

library(mgcv)

# Load the data ----

carburant <- read.csv("exemples/fuel2001.csv", header = TRUE, sep = ",")
carburant
#>     X  Drivers    FuelC Income  Miles      MPC      Pop   Tax
#> 1  AL  3559897  2382507  23471  94440 12737.00  3451586 18.00
#> 2  AK   472211   235400  30064  13628  7639.16   457728  8.00
#> 3  AZ  3550367  2428430  25578  55245  9411.55  3907526 18.00
#> 4  AR  1961883  1358174  22257  98132 11268.40  2072622 21.70
#> 5  CA 21623793 14691753  32275 168771  8923.89 25599275 18.00
#> 6  CO  3287922  2048664  32949  85854  9722.73  3322455 22.00
#> 7  CT  2650374  1458279  40640  20910  9021.35  2651452 25.00
#> 8  DE   564099   382043  31255   5814 10891.30   610269 23.00
#> 9  DC   328094   148769  37383   1534  6555.94   468575 20.00
#> 10 FL 12743403  7471117  28145 117299  9531.23 12741821 13.60
#> 11 GA  5833802  4693703  27940 115534 13248.60  6250708  7.50
#> 12 HI   787820   404684  28221   4278  7108.75   949184 16.00
#> 13 ID   896666   609051  24180  46310 10879.40   969166 25.00
#> 14 IL  7809500  5015217  32259 138359  8239.09  9530327 19.00
#> 15 IN  4116924  3120985  27011  94038 12916.90  4682392 15.00
#> 16 IA  1978748  1475812  26723 113437 10258.40  2281002 20.00
#> 17 KS  1871301  1236951  27816 134725 10656.70  2058489 21.00
#> 18 KY  2756634  2085629  24294  78914 11301.70  3161283 16.40
#> 19 LA  2718209  2151437  23334  60829  9505.31  3394854 20.00
#> 20 ME   942556   590093  25623  22672 11320.00  1010273 22.00
#> 21 MD  3451966  2460545  33872  30622  9673.67  4085342 23.50
#> 22 MA  4610666  2720510  37992  35408  8310.86  5008007 21.00
#> 23 MI  6976982  4904689  29612 121790  9959.45  7628170 19.00
#> 24 MN  2961236  2545530  32101 132280 10728.30  3782817 20.00
#> 25 MS  1859487  1476477  20993  73701 12286.80  2160165 18.40
#> 26 MO  3862300  2958880  27445 124324 12012.80  4292175 17.00
#> 27 MT   683351   467567  22569  69503 10965.00   701423 27.00
#> 28 NE  1267284   812247  27829  92766 10567.40  1314974 24.50
#> 29 NV  1420714   945643  30529  38658  9796.15  1537896 24.75
#> 30 NH   941829   662475  33332  15508  9923.45   960593 19.50
#> 31 NJ  5715089  3911837  36983  36175  8099.59  6545471 10.50
#> 32 NM  1231701   885829  22203  59883 12571.40  1370134 18.50
#> 33 NY 11014805  5536612  34547 112961  6876.49 14797284 22.00
#> 34 NC  5884651  4060592  27194 101195 11186.00  6291182 24.10
#> 35 ND   455921   334544  25068  86591 11322.40   502176 21.00
#> 36 OH  7736115  5028276  28400 117267  9371.29  8789530 22.00
#> 37 OK  2172394  1751701  23517 112694 12612.90  2665966 17.00
#> 38 OR  2534464  1487269  28350  66784  9907.26  2673283 24.00
#> 39 PA  8226202  5024671  29539 119985  8092.07  9693987 26.00
#> 40 RI   660435   399113  29685   6053  7625.00   827474 29.00
#> 41 SC  2849885  2217141  24321  66167 11469.60  3115130 16.00
#> 42 SD   544997   402472  26115  83560 11298.90   577391 22.00
#> 43 TN  4188317  2837567  26239  87826 11610.60  4445987 20.00
#> 44 TX 13045727 10637488  27871 300767 10458.40 15618097 20.00
#> 45 UT  1495887   945531  23907  42208 11291.30  1598531 24.50
#> 46 VT   515348   331183  26901  14291 15688.40   479265 20.00
#> 47 VA  4920753  3765718  31162  70721 10259.50  5529436 17.50
#> 48 WA  4237845  2622633  31528  80985  8981.59  4552631 23.00
#> 49 WV  1316955   818516  21915  36997 10946.10  1455370 25.65
#> 50 WI  3667497  2418289  28232 112663 10337.40  4156609 27.30
#> 51 WY   370713   321847  27230  27292 17494.90   381882 14.00

# Create variables ----

# Fuel is FuelC divided by Pop; Dlic is Drivers divided by Pop.
carburant$Fuel <- carburant$FuelC / carburant$Pop
carburant$Dlic <- carburant$Drivers / carburant$Pop
names(carburant)
#>  [1] "X"       "Drivers" "FuelC"   "Income"  "Miles"   "MPC"     "Pop"
#>  [8] "Tax"     "Fuel"    "Dlic"

# Select the variables to use ----

# Look the columns up by name instead of hard-coding positions, so the
# selection does not silently change if the CSV column order does. With the
# column order shown above this yields c(9, 8, 10, 4, 5).
vars.imp <- match(c("Fuel", "Tax", "Dlic", "Income", "Miles"), names(carburant))
stopifnot(!anyNA(vars.imp))
names(carburant)[vars.imp]
#> [1] "Fuel"   "Tax"    "Dlic"   "Income" "Miles"

carbrnt <- carburant[, vars.imp]
carbrnt[1:10, ]
#>         Fuel  Tax      Dlic Income  Miles
#> 1  0.6902644 18.0 1.0313801  23471  94440
#> 2  0.5142792  8.0 1.0316411  30064  13628
#> 3  0.6214751 18.0 0.9085972  25578  55245
#> 4  0.6552927 21.7 0.9465706  22257  98132
#> 5  0.5739129 18.0 0.8447033  32275 168771
#> 6  0.6166115 22.0 0.9896062  32949  85854
#> 7  0.5499926 25.0 0.9995934  40640  20910
#> 8  0.6260239 23.0 0.9243448  31255   5814
#> 9  0.3174924 20.0 0.7001953  37383   1534
#> 10 0.5863461 13.6 1.0001242  28145 117299

# Basic statistics ----

summary(carbrnt)
#>       Fuel             Tax             Dlic            Income          Miles
#>  Min.   :0.3175   Min.   : 7.50   Min.   :0.7002   Min.   :20993   Min.   :  1534
#>  1st Qu.:0.5750   1st Qu.:18.00   1st Qu.:0.8641   1st Qu.:25323   1st Qu.: 36586
#>  Median :0.6260   Median :20.00   Median :0.9091   Median :27871   Median : 78914
#>  Mean   :0.6131   Mean   :20.15   Mean   :0.9037   Mean   :28404   Mean   : 77419
#>  3rd Qu.:0.6666   3rd Qu.:23.25   3rd Qu.:0.9430   3rd Qu.:31208   3rd Qu.:112828
#>  Max.   :0.8428   Max.   :29.00   Max.   :1.0753   Max.   :40640   Max.   :300767

# Scatterplot matrix and correlations ----

# NOTE: the PDF device is deliberately not closed here; the plots that follow
# in the script are written to the same file.
pdf("exemples/carburantGraphiques.pdf")
pairs(carbrnt, pch = 16)

# Correlation matrix
round(cor(carbrnt), 3)
#>          Fuel    Tax   Dlic Income  Miles
#> Fuel    1.000 -0.259  0.469 -0.464  0.208
#> Tax    -0.259  1.000 -0.086 -0.011 -0.065
#> Dlic    0.469 -0.086  1.000 -0.176 -0.212
#> Income -0.464 -0.011 -0.176  1.000 -0.135
#> Miles   0.208 -0.065 -0.212 -0.135  1.000

# First additive model: Fuel ~ s(Dlic) ----

# The variable most correlated with Fuel is Dlic.
carbrnt.mgcv1 <- gam(Fuel ~ s(Dlic), data = carbrnt)
summary(carbrnt.mgcv1)
#> Family: gaussian
#> Link function: identity
#>
#> Formula:
#> Fuel ~ s(Dlic)
#>
#> Parametric coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  0.61313    0.01012   60.56   <2e-16 ***
#> ---
#> Approximate significance of smooth terms:
#>           edf Ref.df     F  p-value
#> s(Dlic) 2.636  3.335 7.832 0.000156 ***
#> ---
#> R-sq.(adj) =   0.34   Deviance explained = 37.4%
#> GCV = 0.005628  Scale est. = 0.0052267  n = 51

# Residual plots ----

# Draw two residual diagnostics for a fitted model, stacked vertically:
# residuals against fitted values (with a horizontal line at 0), then a
# normal Q-Q plot of the residuals with its reference line.
#
# Arguments:
#   data.lm  fitted model object; must support fitted() and resid().
#   titre    main title of the residuals-vs-fitted panel (default "").
# Value:
#   NULL, invisibly. Called for its plotting side effect.
plotResidus <- function(data.lm, titre = "") {
  par(mfrow = c(2, 1))
  # Go back to a single-panel layout on exit, even if a plot call fails.
  on.exit(par(mfrow = c(1, 1)), add = TRUE)

  res <- resid(data.lm)

  plot(
    fitted(data.lm), res,
    pch = 16, cex = 1.5, main = titre,
    xlab = "VALEURS PREDITES", ylab = "RESIDUS"
  )
  abline(h = 0)

  qqnorm(
    res,
    pch = 16,
    xlab = "QUANTILES THEORIQUES", ylab = "QUANTILES EMPIRIQUES"
  )
  qqline(res)

  invisible(NULL)
}

plotResidus(carbrnt.mgcv1, titre = "CONSOMMATION DE CARBURANT")
gam.check(carbrnt.mgcv1)

# Data with the fitted curve overlaid (fitted values smoothed against Dlic).
plot(Fuel ~ Dlic, pch = 16, data = carbrnt)
lines(smooth.spline(carbrnt$Dlic, fitted(carbrnt.mgcv1)))

# Tax and Dlic are not strongly correlated.
# Could Tax help explain fuel consumption better?
# Fuel consumption analysis (fuel2001 data), part 2: added-variable analyses
# for Tax and Miles, comparison of additive models, and regression trees.
#
# Relies on objects created earlier in the script: carbrnt (data frame with
# Fuel, Tax, Dlic, Income, Miles), carbrnt.mgcv1 (gam fit of Fuel ~ s(Dlic))
# and plotResidus(); mgcv must already be attached.
#
# Lines starting with "#>" reproduce the console output recorded in the
# original session transcript (page numbers removed, spilled table columns
# put back in place).

library(rpart)

# Added variable: Tax after Dlic ----

# Regression of Tax on Dlic
tax.gam <- gam(Tax ~ s(Dlic), data = carbrnt)
summary(tax.gam)
#> Parametric coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  20.1549     0.6405   31.47   <2e-16 ***
#> ---
#> Approximate significance of smooth terms:
#>         edf Ref.df     F p-value
#> s(Dlic)   1      1 0.364   0.549
#>
#> R-sq.(adj) = -0.0129   Deviance explained = 0.737%
#> GCV = 21.775  Scale est. = 20.921  n = 51

plotResidus(tax.gam, titre = "REGRESSION DE Tax EN FONCTION DE Dlic")

# What Dlic does not explain in the regression of Fuel
err.fuel.dlic <- resid(carbrnt.mgcv1)
# What Dlic does not explain in the regression of Tax
err.tax.dlic <- resid(tax.gam)

tax.ajoutee.gam <- gam(err.fuel.dlic ~ s(err.tax.dlic))
summary(tax.ajoutee.gam)
#> Parametric coefficients:
#>              Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 1.976e-16  9.266e-03       0        1
#>
#> Approximate significance of smooth terms:
#>                   edf Ref.df     F p-value
#> s(err.tax.dlic) 1.886  2.359 2.938  0.0478 *
#> ---
#> R-sq.(adj) =  0.116   Deviance explained = 14.9%
#> GCV = 0.004641  Scale est. = 0.0043784  n = 51

plotResidus(tax.ajoutee.gam, titre = "VARIABLE AJOUTEE: Tax APRES Dlic")

# Added-variable plot
dev.new()
par(mfrow = c(2, 1))
plot(
  err.tax.dlic, err.fuel.dlic,
  pch = 16, cex = 1.5,
  main = "VARIABLE AJOUTEE: Tax APRES Dlic",
  xlab = "residus de Tax ajustée par Dlic",
  ylab = "residus de Fuel ajustée par Dlic"
)
lines(smooth.spline(err.tax.dlic, fitted(tax.ajoutee.gam)), lwd = 2)

# Fuel as a function of Dlic and Tax ----

carbrnt.mgcv2 <- gam(Fuel ~ s(Dlic) + s(Tax), data = carbrnt)
summary(carbrnt.mgcv2)
#> Parametric coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.613129   0.008792   69.74   <2e-16 ***
#> ---
#> Approximate significance of smooth terms:
#>           edf Ref.df     F  p-value
#> s(Dlic) 7.172  8.206 5.651 3.85e-05 ***
#> s(Tax)  1.925  2.379 4.631   0.0108 *
#> ---
#> R-sq.(adj) =  0.502   Deviance explained = 59.2%
#> GCV = 0.0049156  Scale est. = 0.0039424  n = 51

plotResidus(carbrnt.mgcv2, titre = "CONSOMMATION DE CARBURANT: Dlic + Tax")

par(mfrow = c(2, 1))
plot(carbrnt.mgcv2, pch = 16)

# Added variable: Miles after Dlic + Tax ----

# Miles is not strongly correlated with Dlic and Tax.
# Could Miles help explain fuel consumption better?

# What Dlic and Tax do not explain in the regression of Fuel
err.fuel.dlictax <- resid(carbrnt.mgcv2)

# Regression of Miles on Dlic and Tax
Miles.gam <- gam(Miles ~ s(Dlic) + s(Tax), data = carbrnt)
summary(Miles.gam)
#> Parametric coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)    77419       7118   10.88 2.48e-14 ***
#> ---
#> Approximate significance of smooth terms:
#>           edf Ref.df     F p-value
#> s(Dlic) 2.219  2.814 1.430   0.197
#> s(Tax)  1.559  1.927 0.496   0.548
#>
#> R-sq.(adj) =  0.0794   Deviance explained = 14.9%
#> GCV = 2.8515e+09  Scale est. = 2.5843e+09  n = 51

# Added variable
Miles.ajoutee.gam <- gam(err.fuel.dlictax ~ s(resid(Miles.gam)), data = carbrnt)
summary(Miles.ajoutee.gam)
#> Parametric coefficients:
#>               Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -1.069e-16  7.345e-03       0        1
#>
#> Approximate significance of smooth terms:
#>                       edf Ref.df     F p-value
#> s(resid(Miles.gam)) 3.815  4.632 2.147  0.0742 .
#> ---
#> R-sq.(adj) =  0.147   Deviance explained = 21.2%
#> GCV = 0.0030379  Scale est. = 0.0027511  n = 51

plotResidus(
  Miles.ajoutee.gam,
  titre = "VARIABLE AJOUTEE: Miles APRES Dlic + Tax"
)

# Added-variable plot. The x axis shows the residuals of Miles.gam, not raw
# Miles, so it is labelled the same way as the Tax added-variable plot.
plot(
  resid(Miles.gam), err.fuel.dlictax,
  pch = 16, cex = 1.5,
  main = "VARIABLE AJOUTEE: Miles APRES Dlic + Tax",
  xlab = "residus de Miles ajustée par Dlic et Tax",
  ylab = "residus de Fuel ajustée par Dlic et Tax"
)
lines(smooth.spline(resid(Miles.gam), fitted(Miles.ajoutee.gam)), lwd = 2)
plot(Miles.ajoutee.gam)

# Fuel as a function of Dlic, Tax and Miles ----

carbrnt.mgcv3 <- gam(Fuel ~ s(Dlic) + s(Tax) + s(Miles), data = carbrnt)
summary(carbrnt.mgcv3)
#> Parametric coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.613129   0.008292   73.94   <2e-16 ***
#> ---
#> Approximate significance of smooth terms:
#>            edf Ref.df     F  p-value
#> s(Dlic)  8.203  8.809 5.322 6.16e-05 ***
#> s(Tax)   1.753  2.164 4.898   0.0106 *
#> s(Miles) 2.504  3.038 2.055   0.1205
#> ---
#> R-sq.(adj) =  0.557   Deviance explained = 66.7%
#> GCV = 0.0047638  Scale est. = 0.0035065  n = 51

dev.new()
par(mfrow = c(3, 1))
plot(carbrnt.mgcv3)

# Model comparisons (anova)
anova(carbrnt.mgcv2, carbrnt.mgcv3)
#> Model 1: Fuel ~ s(Dlic) + s(Tax)
#> Model 2: Fuel ~ s(Dlic) + s(Tax) + s(Miles)
#>   Resid. Df Resid. Dev    Df Deviance
#> 1    40.903    0.16126
#> 2    37.539    0.13163 3.364 0.029629

anova(carbrnt.mgcv1, carbrnt.mgcv2)
#> Model 1: Fuel ~ s(Dlic)
#> Model 2: Fuel ~ s(Dlic) + s(Tax)
#>   Resid. Df Resid. Dev     Df Deviance
#> 1    47.364    0.24756
#> 2    40.903    0.16126 6.4605 0.086299

# Fuel as a function of Dlic and Miles ----

carbrnt.mgcv4 <- gam(Fuel ~ s(Dlic) + s(Miles), data = carbrnt)
summary(carbrnt.mgcv4)
#> Parametric coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.613129   0.009855   62.21   <2e-16 ***
#> ---
#> Approximate significance of smooth terms:
#>            edf Ref.df     F  p-value
#> s(Dlic)  2.236  2.846 9.385 5.79e-05 ***
#> s(Miles) 1.000  1.001 5.061   0.0291 *
#> ---
#> R-sq.(adj) =  0.374   Deviance explained = 41.5%
#> GCV = 0.0054021  Scale est. = 0.0049533  n = 51

anova(carbrnt.mgcv4, carbrnt.mgcv3)
#> Model 1: Fuel ~ s(Dlic) + s(Miles)
#> Model 2: Fuel ~ s(Dlic) + s(Tax) + s(Miles)
#>   Resid. Df Resid. Dev     Df Deviance
#> 1    46.763    0.23164
#> 2    37.539    0.13163 9.2241      0.1

anova(carbrnt.mgcv1, carbrnt.mgcv4)
#> Model 1: Fuel ~ s(Dlic)
#> Model 2: Fuel ~ s(Dlic) + s(Miles)
#>   Resid. Df Resid. Dev      Df Deviance
#> 1    47.364    0.24756
#> 2    46.763    0.23164 0.60039 0.015922

# Smooth term for Dlic only; Miles and Tax enter as parametric terms.
carbrnt.mgcv5 <- gam(Fuel ~ s(Dlic) + Miles + Tax, data = carbrnt)
summary(carbrnt.mgcv5)
anova(carbrnt.mgcv5, carbrnt.mgcv3)
#> Analysis of Deviance Table
#>
#> Model 1: Fuel ~ s(Dlic) + Miles + Tax
#> Model 2: Fuel ~ s(Dlic) + s(Tax) + s(Miles)
#>   Resid. Df Resid. Dev    Df Deviance
#> 1    39.654    0.14708
#> 2    37.539    0.13163 2.115 0.015446

# Regression trees ----

carbrnt.tree <- rpart(
  Fuel ~ Dlic + Miles + Tax + Income,
  method = "anova", data = carbrnt
)
par(mfrow = c(1, 2), xpd = NA) # otherwise on some devices the text is clipped
summary(carbrnt.tree)
#> Call:
#> rpart(formula = Fuel ~ Dlic + Miles + Tax + Income, data = carbrnt,
#>     method = "anova")
#>   n= 51
#>
#>           CP nsplit rel error    xerror      xstd
#> 1 0.32187635      0 1.0000000 1.0320695 0.2993860
#> 2 0.15276296      1 0.6781236 0.8681936 0.2120283
#> 3 0.06191811      2 0.5253607 0.9872748 0.2483880
#> 4 0.01709497      3 0.4634426 0.9411495 0.2264639
#> 5 0.01000000      4 0.4463476 0.9639883 0.2384718
#>
#> Variable importance
#> Income   Dlic  Miles    Tax
#>     38     28     21     14
#>
#> Node number 1: 51 observations,    complexity param=0.3218764
#>   mean=0.6131288, MSE=0.007758707
#>   left son=2 (24 obs) right son=3 (27 obs)
#>   Primary splits:
#>       Income < 28042.5   to the right, improve=0.32187640, (0 missing)
#>       Dlic   < 0.8326463 to the left,  improve=0.24871170, (0 missing)
#>       Miles  < 21791     to the left,  improve=0.13845350, (0 missing)
#>       Tax    < 21.85     to the right, improve=0.07445155, (0 missing)
#>   Surrogate splits:
#>       Miles < 36586    to the left,  agree=0.667, adj=0.292, (0 split)
#>       Dlic  < 0.854698 to the left,  agree=0.647, adj=0.250, (0 split)
#>       Tax   < 21.85    to the right, agree=0.608, adj=0.167, (0 split)
#>
#> Node number 2: 24 observations,    complexity param=0.152763
#>   mean=0.560124, MSE=0.007885153
#>   left son=4 (7 obs) right son=5 (17 obs)
#>   Primary splits:
#>       Dlic   < 0.8448336 to the left,  improve=0.31941570, (0 missing)
#>       Miles  < 25766     to the left,  improve=0.10557910, (0 missing)
#>       Income < 33140.5   to the right, improve=0.06470725, (0 missing)
#>       Tax    < 19.75     to the right, improve=0.03657008, (0 missing)
#>   Surrogate splits:
#>       Miles < 127035 to the right, agree=0.833, adj=0.429, (0 split)
#>
#> Node number 3: 27 observations,    complexity param=0.06191811
#>   mean=0.6602442, MSE=0.002929105
#>   left son=6 (20 obs) right son=7 (7 obs)
#>   Primary splits:
#>       Tax    < 17.5      to the right, improve=0.30979790, (0 missing)
#>       Dlic   < 0.9429701 to the left,  improve=0.13929780, (0 missing)
#>       Income < 26812     to the left,  improve=0.12501640, (0 missing)
#>       Miles  < 63498     to the left,  improve=0.04143225, (0 missing)
#>
#> Node number 4: 7 observations
#>   mean=0.4819146, MSE=0.01266335
#>
#> Node number 5: 17 observations
#>   mean=0.5923278, MSE=0.00236193
#>
#> Node number 6: 20 observations,    complexity param=0.01709497
#>   mean=0.6424229, MSE=0.001382062
#>   left son=12 (8 obs) right son=13 (12 obs)
#>   Primary splits:
#>       Miles  < 65166     to the left,  improve=0.24472050, (0 missing)
#>       Dlic   < 0.9389165 to the left,  improve=0.20149910, (0 missing)
#>       Tax    < 20.5      to the right, improve=0.16676220, (0 missing)
#>       Income < 25869     to the left,  improve=0.04711045, (0 missing)
#>   Surrogate splits:
#>       Income < 25869     to the left,  agree=0.70, adj=0.250, (0 split)
#>       Dlic   < 0.9341762 to the left,  agree=0.65, adj=0.125, (0 split)
#>       Tax    < 24.3      to the right, agree=0.65, adj=0.125, (0 split)
#>
#> Node number 7: 7 observations
#>   mean=0.7111624, MSE=0.003849139
#>
#> Node number 12: 8 observations
#>   mean=0.6198989, MSE=0.001428473
#>
#> Node number 13: 12 observations
#>   mean=0.6574388, MSE=0.000787423

printcp(carbrnt.tree)
#> Regression tree:
#> rpart(formula = Fuel ~ Dlic + Miles + Tax + Income, data = carbrnt,
#>     method = "anova")
#>
#> Variables actually used in tree construction:
#> [1] Dlic   Income Miles  Tax
#>
#> Root node error: 0.39569/51 = 0.0077587
#>
#> n= 51
#>
#>         CP nsplit rel error  xerror    xstd
#> 1 0.321876      0   1.00000 1.03207 0.29939
#> 2 0.152763      1   0.67812 0.86819 0.21203
#> 3 0.061918      2   0.52536 0.98727 0.24839
#> 4 0.017095      3   0.46344 0.94115 0.22646
#> 5 0.010000      4   0.44635 0.96399 0.23847

# Nicer drawing of the tree, written to a PostScript file. Argument names
# are spelled out in full (`filename`, `title.`) instead of relying on
# partial matching of `file` / `title`.
post(
  carbrnt.tree,
  filename = "exemples/FuelTree1.ps",
  title. = "ARBRE DE REGRESSION: FUEL",
  horizontal = FALSE
)

# Larger tree, then pruning ----

# `xval` is the number of cross-validations. The original call spelled it
# `xvar`, which rpart.control() swallows through `...`, so the default
# (xval = 10) was used instead of the intended 5.
carbrnt.control <- rpart.control(cp = 0, xval = 5, minsplit = 10)
carbrnt.tree <- rpart(
  Fuel ~ Dlic + Miles + Tax + Income,
  method = "anova", data = carbrnt, control = carbrnt.control
)
printcp(carbrnt.tree)
# Recorded output from the original session (run with the misspelled
# argument, hence with the default xval). The xerror and xstd columns come
# from random cross-validation and differ from run to run.
#> Regression tree:
#> rpart(formula = Fuel ~ Dlic + Miles + Tax + Income, data = carbrnt,
#>     method = "anova", control = carbrnt.control)
#>
#> Variables actually used in tree construction:
#> [1] Dlic   Income Miles  Tax
#>
#> Root node error: 0.39569/51 = 0.0077587
#>
#> n= 51
#>
#>          CP nsplit rel error  xerror    xstd
#> 1 0.3218764      0   1.00000 1.02255 0.29468
#> 2 0.1769079      1   0.67812 1.34004 0.35462
#> 3 0.0812552      2   0.50122 1.07670 0.28303
#> 4 0.0219343      3   0.41996 0.94282 0.24798
#> 5 0.0178144      4   0.39803 1.04884 0.25173
#> 6 0.0061884      6   0.36240 1.05389 0.25167
#> 7 0.0038397      7   0.35621 1.06093 0.25510
#> 8 0.0000000      8   0.35237 1.06023 0.25516

# Nicer drawing of the tree (PostScript file)
post(
  carbrnt.tree,
  filename = "exemples/FuelTree2.ps",
  title. = "ARBRE DE REGRESSION: FUEL",
  horizontal = FALSE
)

pdf("exemples/FuelParametredeLissage.pdf")
rsq.rpart(carbrnt.tree)
dev.off()

# Prune at the complexity parameter with the smallest cross-validated error.
cp.table <- carbrnt.tree$cptable
cp.best <- cp.table[which.min(cp.table[, "xerror"]), "CP"]
carbrnt.prune <- prune(carbrnt.tree, cp = cp.best)

# Nicer drawing of the pruned tree (PostScript file)
post(
  carbrnt.prune,
  filename = "exemples/FuelTreepruned.ps",
  title. = "ARBRE DE REGRESSION: FUEL (élagué)",
  horizontal = FALSE
)
printcp(carbrnt.prune)
#> Regression tree:
#> rpart(formula = Fuel ~ Dlic + Miles + Tax + Income, data = carbrnt,
#>     method = "anova", control = carbrnt.control)
#>
#> Variables actually used in tree construction:
#> [1] Dlic   Income Tax
#>
#> Root node error: 0.39569/51 = 0.0077587
#>
#> n= 51
#>
#>         CP nsplit rel error  xerror    xstd
#> 1 0.321876      0   1.00000 1.02255 0.29468
#> 2 0.176908      1   0.67812 1.34004 0.35462
#> 3 0.081255      2   0.50122 1.07670 0.28303
#> 4 0.021934      3   0.41996 0.94282 0.24798

# Close every graphics device still open so that all plot files are flushed
# and complete.
graphics.off()