Aplicación 3.8: Falta de observaciones
Code
# Lectura de librerías
library(tidyverse)
# Lectura de datos
# library(faraway)
# help(chredlin, package="faraway")
<- read.csv("data/SEGUROS_CHICAGO.csv", header=TRUE, row.names="zip")
chmiss # Usando tidyverse: read_csv("data/SEGUROS_CHICAGO.csv")
# %>% column_to_rownames(., var = "zip")
chmiss
race fire theft age involact income
60626 10.0 6.2 29 60.4 NA 11.744
60640 22.2 9.5 44 76.5 0.1 9.323
60613 19.6 10.5 36 NA 1.2 9.948
60657 17.3 7.7 37 NA 0.5 10.656
60614 24.5 8.6 53 81.4 0.7 9.730
60610 54.0 34.1 68 52.6 0.3 8.231
60611 4.9 11.0 75 42.6 0.0 21.480
60625 7.1 6.9 18 78.5 0.0 11.104
60618 5.3 7.3 31 90.1 NA 10.694
60647 21.5 15.1 NA 89.8 1.1 9.631
60622 43.1 29.1 34 82.7 1.9 7.995
60631 1.1 2.2 14 40.2 0.0 13.722
60646 NA 5.7 11 27.9 0.0 16.250
60656 1.7 2.0 11 7.7 0.0 13.686
60630 1.6 2.5 22 63.8 0.0 12.405
60634 1.5 3.0 NA 51.2 0.0 12.198
60641 1.8 5.4 27 85.1 0.0 11.600
60635 1.0 2.2 9 44.4 0.0 12.765
60639 2.5 7.2 29 84.2 0.2 11.084
60651 NA 15.1 30 89.8 0.8 10.510
60644 59.8 16.5 40 NA 0.8 9.784
60624 94.4 18.4 32 72.9 1.8 7.342
60612 86.2 36.2 41 63.1 1.8 6.565
60607 50.2 NA 147 83.0 0.9 7.459
60623 74.2 18.5 22 78.3 1.9 8.014
60608 55.5 NA 29 79.0 1.5 8.177
60616 NA 12.2 46 48.0 0.6 8.212
60632 4.4 5.6 23 71.5 0.3 11.230
60609 46.2 21.8 4 73.1 1.3 NA
60653 99.7 21.6 31 65.0 0.9 5.583
60615 73.5 9.0 39 75.4 0.4 8.564
60638 10.7 3.6 15 20.8 0.0 12.102
60629 1.5 5.0 32 61.8 NA 11.876
60636 48.8 28.6 27 78.1 1.4 9.742
60621 98.9 17.4 NA 68.6 2.2 7.520
60637 90.6 11.3 34 73.4 0.8 7.388
60652 1.4 3.4 17 2.0 0.0 13.842
60620 71.2 11.9 46 NA 0.9 11.040
60619 94.1 10.5 42 55.9 0.9 10.332
60649 66.1 10.7 NA 67.5 0.4 10.908
60617 NA 10.8 34 58.0 0.9 11.156
60655 1.0 4.8 19 15.2 0.0 13.323
60643 42.5 10.4 25 40.8 0.5 12.960
60628 35.1 15.6 28 57.8 1.0 NA
60627 47.4 7.0 3 11.4 0.2 10.080
60633 34.0 7.1 23 49.2 0.3 11.428
60645 3.1 4.9 27 NA 0.0 13.731
Code
# Detección de datos perdidos
# Por filas (casos)
rowSums(is.na(chmiss))
60626 60640 60613 60657 60614 60610 60611 60625 60618 60647 60622 60631 60646
1 0 1 1 0 0 0 0 1 1 0 0 1
60656 60630 60634 60641 60635 60639 60651 60644 60624 60612 60607 60623 60608
0 0 1 0 0 0 1 1 0 0 1 0 1
60616 60632 60609 60653 60615 60638 60629 60636 60621 60637 60652 60620 60619
1 0 1 0 0 0 1 0 1 0 0 1 0
60649 60617 60655 60643 60628 60627 60633 60645
1 1 0 0 1 0 0 1
Code
image(is.na(chmiss),axes=FALSE)
axis(2, at=0:5/5, labels=colnames(chmiss),las=2)
axis(1, at=0:46/46, labels=row.names(chmiss),las=2)
Code
# Por columnas (variables)
colSums(is.na(chmiss))
race fire theft age involact income
4 2 4 5 3 2
Code
library(mi)
<- missing_data.frame(chmiss)
md image(md)
Code
summary(md@patterns)
nothing involact age theft race fire income
27 3 5 4 4 2 2
Code
# Tratamiento de datos perdidos
# Omisión de observaciones con NAs (opción por defecto en R):
# resultados con los datos completos
<- lm(involact ~ race+fire+theft+age+log(income), chmiss)
mod_1 summary(mod_1)
Call:
lm(formula = involact ~ race + fire + theft + age + log(income),
data = chmiss)
Residuals:
Min 1Q Median 3Q Max
-0.59598 -0.19593 -0.05729 0.13641 0.63481
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.407202 1.419160 -1.696 0.104621
race 0.011129 0.003443 3.232 0.003994 **
fire 0.045003 0.010696 4.208 0.000395 ***
theft -0.016062 0.005550 -2.894 0.008678 **
age 0.009129 0.003443 2.652 0.014926 *
log(income) 0.844261 0.531601 1.588 0.127196
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3392 on 21 degrees of freedom
(20 observations deleted due to missingness)
Multiple R-squared: 0.7899, Adjusted R-squared: 0.7399
F-statistic: 15.79 on 5 and 21 DF, p-value: 1.688e-06
Code
# Imputación simple:
# Uso de la media para predecir los datos que faltan de los regresores
<- colMeans(chmiss,na.rm=TRUE)) (cmeans
race fire theft age involact income
35.6093023 11.4244444 32.6511628 59.9690476 0.6477273 10.7358667
Code
<- chmiss
imput_1 for(i in c(1:4,6)) imput_1[is.na(chmiss[,i]),i] <- cmeans[i] # Sólo var. expl.
<- lm(involact ~ race+fire+theft+age+log(income), imput_1)
mod_2 summary(mod_2)
Call:
lm(formula = involact ~ race + fire + theft + age + log(income),
data = imput_1)
Residuals:
Min 1Q Median 3Q Max
-1.00582 -0.15772 -0.00159 0.22214 0.80666
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.529635 1.087923 0.487 0.62917
race 0.006975 0.002845 2.452 0.01893 *
fire 0.028208 0.009541 2.956 0.00532 **
theft -0.003310 0.002745 -1.206 0.23541
age 0.006244 0.003160 1.976 0.05543 .
log(income) -0.315826 0.390255 -0.809 0.42339
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3845 on 38 degrees of freedom
(3 observations deleted due to missingness)
Multiple R-squared: 0.6813, Adjusted R-squared: 0.6394
F-statistic: 16.25 on 5 and 38 DF, p-value: 1.461e-08
Code
# Uso de regresiones auxiliares para predecir los datos que faltan
# de las variables explicativas
<- lm(race ~ fire+theft+age+income,chmiss)
mod_race is.na(chmiss$race),] chmiss[
race fire theft age involact income
60646 NA 5.7 11 27.9 0.0 16.250
60651 NA 15.1 30 89.8 0.8 10.510
60616 NA 12.2 46 48.0 0.6 8.212
60617 NA 10.8 34 58.0 0.9 11.156
Code
predict(mod_race,chmiss[is.na(chmiss$race),])
60646 60651 60616 60617
-17.84722 26.35970 70.39441 32.62029
Code
# Como la predicción para la zona 60646 es negativa,
# se puede usar la transformación logit ( log(y/(1-y)) )
# para evitar que las predicciones se salgan del intervalo [0,1]
# y luego aplicar la transformación inversa a la predicción
library(faraway)
<- lm(logit(race/100) ~ fire+theft+age+income,chmiss)
mod_race_2 ilogit(predict(mod_race_2,chmiss[is.na(chmiss$race),]))*100
60646 60651 60616 60617
0.4190909 14.7320193 84.2653995 21.3121261
Code
<- lm(fire ~ race+theft+age+income,chmiss)
mod_fire is.na(chmiss$fire),] chmiss[
race fire theft age involact income
60607 50.2 NA 147 83 0.9 7.459
60608 55.5 NA 29 79 1.5 8.177
Code
predict(mod_fire,chmiss[is.na(chmiss$fire),])
60607 60608
50.23688 15.40465
Code
<- lm(theft ~ race+fire+age+income,chmiss)
mod_theft is.na(chmiss$theft),] chmiss[
race fire theft age involact income
60647 21.5 15.1 NA 89.8 1.1 9.631
60634 1.5 3.0 NA 51.2 0.0 12.198
60621 98.9 17.4 NA 68.6 2.2 7.520
60649 66.1 10.7 NA 67.5 0.4 10.908
Code
predict(mod_theft,chmiss[is.na(chmiss$theft),])
60647 60634 60621 60649
35.82849 20.86813 35.39362 38.04624
Code
<- lm(age ~ race+fire+theft+income,chmiss)
mod_age is.na(chmiss$age),] chmiss[
race fire theft age involact income
60613 19.6 10.5 36 NA 1.2 9.948
60657 17.3 7.7 37 NA 0.5 10.656
60644 59.8 16.5 40 NA 0.8 9.784
60620 71.2 11.9 46 NA 0.9 11.040
60645 3.1 4.9 27 NA 0.0 13.731
Code
predict(mod_age,chmiss[is.na(chmiss$age),])
60613 60657 60644 60620 60645
74.11181 71.75477 64.29939 59.13367 45.83281
Code
<- lm(income ~ race+fire+theft+age,chmiss)
mod_income is.na(chmiss$income),] chmiss[
race fire theft age involact income
60609 46.2 21.8 4 73.1 1.3 NA
60628 35.1 15.6 28 57.8 1.0 NA
Code
predict(mod_income,chmiss[is.na(chmiss$income),])
60609 60628
6.338472 10.190580
Code
# Generación de valores para los datos perdidos por variable
<- chmiss
imput_2 $race[is.na(chmiss$race)] <-
imput_2ilogit(predict(mod_race_2,chmiss[is.na(chmiss$race),]))*100
$fire[is.na(chmiss$fire)] <-
imput_2predict(mod_fire,chmiss[is.na(chmiss$fire),])
$theft[is.na(chmiss$theft)] <-
imput_2predict(mod_theft,chmiss[is.na(chmiss$theft),])
$age[is.na(chmiss$age)] <-
imput_2predict(mod_age,chmiss[is.na(chmiss$age),])
$income[is.na(chmiss$income)] <-
imput_2predict(mod_income,chmiss[is.na(chmiss$income),])
#
<- lm(involact ~ race+fire+theft+age+log(income), imput_2)
mod_3 summary(mod_3)
Call:
lm(formula = involact ~ race + fire + theft + age + log(income),
data = imput_2)
Residuals:
Min 1Q Median 3Q Max
-0.7475 -0.1841 -0.0582 0.2203 0.8484
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.329475 1.212672 -1.096 0.279839
race 0.009550 0.002627 3.635 0.000820 ***
fire 0.039664 0.009974 3.977 0.000303 ***
theft -0.012791 0.003615 -3.539 0.001080 **
age 0.009848 0.003071 3.207 0.002723 **
log(income) 0.408210 0.443478 0.920 0.363132
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3581 on 38 degrees of freedom
(3 observations deleted due to missingness)
Multiple R-squared: 0.7235, Adjusted R-squared: 0.6872
F-statistic: 19.89 on 5 and 38 DF, p-value: 1.067e-09
Code
# Imputación múltiple:
# Enfoque AMELIA (https://cran.r-project.org/web/packages/Amelia/index.html)
library(Amelia)
<- amelia(chmiss, m=10) imput_3
-- Imputation 1 --
1 2 3 4 5 6 7 8
-- Imputation 2 --
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
41 42 43
-- Imputation 3 --
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
-- Imputation 4 --
1 2 3 4 5 6 7 8 9 10 11
-- Imputation 5 --
1 2 3 4 5 6 7 8 9 10 11
-- Imputation 6 --
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24
-- Imputation 7 --
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
61 62 63 64 65 66 67 68
-- Imputation 8 --
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
41
-- Imputation 9 --
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
-- Imputation 10 --
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23
Code
summary(imput_3)
Amelia output with 10 imputed datasets.
Return code: 1
Message: Normal EM convergence.
Chain Lengths:
--------------
Imputation 1: 8
Imputation 2: 43
Imputation 3: 18
Imputation 4: 11
Imputation 5: 11
Imputation 6: 24
Imputation 7: 68
Imputation 8: 41
Imputation 9: 37
Imputation 10: 23
Rows after Listwise Deletion: 27
Rows after Imputation: 47
Patterns of missingness in the data: 7
Fraction Missing for original variables:
-----------------------------------------
Fraction Missing
race 0.08510638
fire 0.04255319
theft 0.08510638
age 0.10638298
involact 0.06382979
income 0.04255319
Code
<- NULL
betas <- NULL
se_betas for(i in 1:imput_3$m){
<- lm(involact ~ race+fire+theft+age+log(income), imput_3$imputations[[i]])
lmod <- rbind(betas ,coef(lmod))
betas <- rbind(se_betas ,coef(summary(lmod))[,2])
se_betas
}# Estimaciones, desviaciones estándar estimadas y estadísticos t
# (individuales y combinadas)
betas; se_betas
(Intercept) race fire theft age log(income)
[1,] -1.6801450 0.010820252 0.04048431 -0.010039725 0.008124351 0.5301754
[2,] -0.9312928 0.008317184 0.03575807 -0.009244454 0.010494513 0.2176623
[3,] -1.2833810 0.010251147 0.03971323 -0.016894806 0.010211915 0.4133626
[4,] -1.0238843 0.008817896 0.03942862 -0.008400617 0.008059900 0.2743848
[5,] -1.1039979 0.008141964 0.04148049 -0.011802865 0.008323422 0.3497556
[6,] -1.7960198 0.010180713 0.04564098 -0.011738443 0.008921833 0.5609202
[7,] 0.2148243 0.005361182 0.03637236 -0.005573936 0.005897951 -0.1636501
[8,] -0.5640028 0.008599732 0.03473224 -0.012382377 0.009097477 0.1373044
[9,] -1.0049162 0.007957480 0.03936905 -0.008824682 0.008246736 0.2978977
[10,] -1.4004437 0.010089209 0.04021558 -0.012000704 0.007800557 0.4563504
(Intercept) race fire theft age log(income)
[1,] 1.1756156 0.002626828 0.008706705 0.002929358 0.002801494 0.4303396
[2,] 1.1271998 0.002487445 0.009201853 0.002869491 0.002839147 0.4067779
[3,] 0.7219403 0.001961061 0.007986642 0.003451006 0.002464368 0.2645855
[4,] 1.0478966 0.002369027 0.008791722 0.002547343 0.002679163 0.3801683
[5,] 1.0992003 0.002385818 0.008560116 0.002791145 0.002521390 0.4040653
[6,] 1.1157473 0.002436982 0.008855348 0.002785670 0.002633433 0.4063557
[7,] 0.8491166 0.002248528 0.009359403 0.002352570 0.002546133 0.3087461
[8,] 1.0076226 0.002377513 0.008928770 0.003357053 0.002535886 0.3741138
[9,] 1.1328720 0.002438547 0.008809766 0.002562930 0.002543884 0.4156849
[10,] 1.0382054 0.002347864 0.008129222 0.002832482 0.002632433 0.3788353
Code
<- mi.meld(q=betas,se=se_betas)) (cr
$q.mi
(Intercept) race fire theft age log(income)
[1,] -1.057326 0.008853676 0.03931949 -0.01069026 0.008517866 0.3074163
$se.mi
(Intercept) race fire theft age log(income)
[1,] 1.203008 0.00290546 0.009346149 0.004282435 0.002953978 0.4410524
Code
$q.mi/cr$se.mi) (cr
(Intercept) race fire theft age log(income)
[1,] -0.878902 3.047255 4.207026 -2.496304 2.883523 0.6970064
Code
# Enfoque MICE (https://cran.r-project.org/web/packages/mice/)
library(mice)
# Patrón de datos perdidos
md.pattern(chmiss)
fire income involact race theft age
27 1 1 1 1 1 1 0
5 1 1 1 1 1 0 1
4 1 1 1 1 0 1 1
4 1 1 1 0 1 1 1
3 1 1 0 1 1 1 1
2 1 0 1 1 1 1 1
2 0 1 1 1 1 1 1
2 2 3 4 4 5 20
Code
# Imputación simple con la media de los datos completos
<- mice(chmiss, method = "mean", m = 1, maxit = 1) imp_4_0
iter imp variable
1 1 race fire theft age involact income
Code
<- complete(imp_4_0) # Se obtiene el mismo resultado que
imput_1_2 # en el fichero imput_1, pera también se imputan
summary(with(imput_1_2, lm(involact ~ race+fire+theft+age+log(income))))
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Residuals:
Min 1Q Median 3Q Max
-0.99102 -0.18765 0.01174 0.21644 0.82392
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.628604 1.075959 0.584 0.56227
race 0.006248 0.002765 2.259 0.02923 *
fire 0.027110 0.009419 2.878 0.00632 **
theft -0.003339 0.002716 -1.229 0.22599
age 0.006754 0.003044 2.219 0.03209 *
log(income) -0.345200 0.386098 -0.894 0.37650
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.381 on 41 degrees of freedom
Multiple R-squared: 0.6624, Adjusted R-squared: 0.6212
F-statistic: 16.09 on 5 and 41 DF, p-value: 9.149e-09
Code
# Imputación por el método de regresión (1):
# Predicción simple
<- mice(chmiss, method = "norm.predict", m = 1, maxit = 1) imp_4_1
iter imp variable
1 1 race fire theft age involact income
Code
<- complete(imp_4_1) # Comparar con el fichero imput_2
imput_2_1 # Imputación por el método de regresión (2):
# Predicción estocástica (se añade un error aleatorio)
<- mice(chmiss, method = "norm.nob", m = 1, maxit = 1) imp_4_2
iter imp variable
1 1 race fire theft age involact income
Code
<- complete(imp_4_2) # Comparar con el fichero imput_2
imput_2_2 # Imputación por el método de regresión (3):
# Predicción usando booostrap
<- mice(chmiss, method = "norm.boot", m = 1, maxit = 1) imp_4_3
iter imp variable
1 1 race fire theft age involact income
Code
<- complete(imp_4_3) # Comparar con el fichero imput_2
imput_2_3 # MICE: Multiple Imputation Chained Equation
<- mice(chmiss, m = 10, print=F)
imp_4 summary(imp_4)
Class: mids
Number of multiple imputations: 10
Imputation methods:
race fire theft age involact income
"pmm" "pmm" "pmm" "pmm" "pmm" "pmm"
PredictorMatrix:
race fire theft age involact income
race 0 1 1 1 1 1
fire 1 0 1 1 1 1
theft 1 1 0 1 1 1
age 1 1 1 0 1 1
involact 1 1 1 1 0 1
income 1 1 1 1 1 0
Code
class(imp_4) # mids: multiply imputed data set
[1] "mids"
Code
attributes(imp_4)
$names
[1] "data" "imp" "m" "where"
[5] "blocks" "call" "nmis" "method"
[9] "predictorMatrix" "visitSequence" "formulas" "post"
[13] "blots" "ignore" "seed" "iteration"
[17] "lastSeedValue" "chainMean" "chainVar" "loggedEvents"
[21] "version" "date"
$class
[1] "mids"
Code
# Para ver los datos imputados de una variable
$imp # Todas las imputaciones imp_4
$race
1 2 3 4 5 6 7 8 9 10
60646 1.6 3.1 3.1 1.6 3.1 1.8 1.8 7.1 7.1 1.8
60651 17.3 34.0 47.4 47.4 24.5 1.5 66.1 59.8 1.4 17.3
60616 59.8 55.5 54.0 19.6 19.6 54.0 48.8 48.8 55.5 94.1
60617 71.2 24.5 59.8 73.5 24.5 47.4 71.2 24.5 21.5 10.0
$fire
1 2 3 4 5 6 7 8 9 10
60607 29.1 17.4 18.4 36.2 18.4 18.4 18.5 17.4 29.1 18.5
60608 17.4 34.1 34.1 28.6 12.2 18.4 34.1 10.5 18.5 10.5
$theft
1 2 3 4 5 6 7 8 9 10
60647 28 41 18 29 46 41 31 27 9 37
60634 14 22 27 53 34 32 34 42 34 34
60621 15 29 4 11 31 19 15 18 11 14
60649 27 41 29 30 30 29 40 27 11 147
$age
1 2 3 4 5 6 7 8 9 10
60613 81.4 81.4 90.1 78.3 82.7 81.4 78.3 78.3 81.4 58.0
60657 71.5 84.2 48.0 73.1 73.1 76.5 76.5 65.0 57.8 78.1
60644 75.4 51.2 48.0 73.4 57.8 58.0 73.4 71.5 76.5 78.1
60620 40.8 7.7 78.1 75.4 89.8 75.4 71.5 89.8 89.8 60.4
60645 2.0 11.4 71.5 44.4 60.4 11.4 89.8 7.7 44.4 75.4
$involact
1 2 3 4 5 6 7 8 9 10
60626 0.0 0.0 0 0.0 0.3 0.0 0.5 0.3 0.0 0.3
60618 0.1 1.2 0 0.7 0.1 0.5 0.2 0.5 0.3 0.3
60629 0.3 0.0 0 0.0 0.0 0.3 0.3 0.0 0.0 0.0
$income
1 2 3 4 5 6 7 8 9 10
60609 5.583 9.948 7.995 10.510 8.231 8.564 11.040 9.730 7.520 8.564
60628 11.040 9.323 9.323 11.104 9.948 9.948 11.428 11.104 9.948 10.510
Code
# Imputaciones para una variable:
# cambiar race por otra variable para ver el resultado
$imp$race imp_4
1 2 3 4 5 6 7 8 9 10
60646 1.6 3.1 3.1 1.6 3.1 1.8 1.8 7.1 7.1 1.8
60651 17.3 34.0 47.4 47.4 24.5 1.5 66.1 59.8 1.4 17.3
60616 59.8 55.5 54.0 19.6 19.6 54.0 48.8 48.8 55.5 94.1
60617 71.2 24.5 59.8 73.5 24.5 47.4 71.2 24.5 21.5 10.0
Code
# Para guardar los datos completos de una imputación
<- complete(imp_4,1) # cambiar 1 por cualquier valor entre 1 y 10
imp_4_m_1 #
# Inspección de la calidad de las imputaciones
# Convergencia del algoritmo (iterative Markov Chain Monte Carlo)
plot(imp_4)
Code
# Gráfica conjunta
stripplot(imp_4)
Code
# Gráfica individual
stripplot(imp_4, involact, pch = 20, xlab = "Imputation number")
Code
stripplot(imp_4, race, pch = 20, xlab = "Imputation number")
Code
stripplot(imp_4, fire, pch = 20, xlab = "Imputation number")
Code
stripplot(imp_4, theft, pch = 20, xlab = "Imputation number")
Code
stripplot(imp_4, age, pch = 20, xlab = "Imputation number")
Code
stripplot(imp_4, income, pch = 20, xlab = "Imputation number")
Code
# Distribución de los datos completos y los completados
xyplot(imp_4,involact ~ race+fire+theft+age+income)
Code
densityplot(imp_4)
Code
# Ajuste de los modelos con los datos completados (m modelos)
<- with(imp_4, lm(involact ~ race+fire+theft+age+log(income)))
fit # Pool y resumen de resultados
# Resultado de la regresión para cada conjunto de datos imputados fit
call :
with.mids(data = imp_4, expr = lm(involact ~ race + fire + theft +
age + log(income)))
call1 :
mice(data = chmiss, m = 10, printFlag = F)
nmis :
[1] 4 2 4 5 3 2
analyses :
[[1]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
-0.603183 0.008307 0.038416 -0.007757 0.006613 0.135504
[[2]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
-0.142384 0.007765 0.030247 -0.004289 0.006778 -0.050869
[[3]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
-0.032372 0.008018 0.032584 -0.005442 0.005715 -0.093720
[[4]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
-0.195929 0.006959 0.036476 -0.008743 0.007240 -0.015716
[[5]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
0.326679 0.007137 0.031465 -0.004461 0.006082 -0.233780
[[6]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
-0.181802 0.007207 0.033901 -0.005391 0.007380 -0.047510
[[7]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
0.139669 0.007180 0.032120 -0.005132 0.005517 -0.143618
[[8]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
0.429395 0.006452 0.030840 -0.005017 0.006214 -0.257550
[[9]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
-0.198052 0.006885 0.038232 -0.007550 0.006819 -0.027053
[[10]]
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Coefficients:
(Intercept) race fire theft age log(income)
0.036659 0.007366 0.032308 -0.004966 0.006492 -0.116944
Code
# (m=10 en el ejemplo)
# Para ver el resultado de una regresión concreta (la primera por ejemplo)
summary(fit$analyses[[1]])
Call:
lm(formula = involact ~ race + fire + theft + age + log(income))
Residuals:
Min 1Q Median 3Q Max
-0.96139 -0.18833 -0.00346 0.21901 0.70249
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.603183 0.987787 -0.611 0.544808
race 0.008307 0.002346 3.541 0.001010 **
fire 0.038416 0.009453 4.064 0.000213 ***
theft -0.007757 0.002694 -2.879 0.006305 **
age 0.006613 0.002575 2.568 0.013975 *
log(income) 0.135504 0.359314 0.377 0.708031
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3442 on 41 degrees of freedom
Multiple R-squared: 0.7363, Adjusted R-squared: 0.7042
F-statistic: 22.9 on 5 and 41 DF, p-value: 6.687e-11
Code
# Pool de resultados de regresión
summary(pool(fit))
term estimate std.error statistic df p.value
1 (Intercept) -0.042131972 1.096805208 -0.03841336 35.09091 0.969575821
2 race 0.007327550 0.002623725 2.79280405 36.81363 0.008240559
3 fire 0.033658770 0.009446099 3.56324537 33.18881 0.001134112
4 theft -0.005874735 0.003008512 -1.95270472 21.94873 0.063723560
5 age 0.006484934 0.002763027 2.34703935 36.60385 0.024446172
6 log(income) -0.085125616 0.400547523 -0.21252314 34.61619 0.832945471
Code
# Comparación de los resultados a posteriori, con la muestra con datos completos
data(chredlin, package="faraway")
<- lm(involact ~ race+fire+theft+age+log(income), chredlin)
mod_full summary(mod_full)
Call:
lm(formula = involact ~ race + fire + theft + age + log(income),
data = chredlin)
Residuals:
Min 1Q Median 3Q Max
-0.85393 -0.16922 -0.03088 0.17890 0.81228
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.185540 1.100255 -1.078 0.287550
race 0.009502 0.002490 3.817 0.000449 ***
fire 0.039856 0.008766 4.547 4.76e-05 ***
theft -0.010295 0.002818 -3.653 0.000728 ***
age 0.008336 0.002744 3.038 0.004134 **
log(income) 0.345762 0.400123 0.864 0.392540
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3345 on 41 degrees of freedom
Multiple R-squared: 0.7517, Adjusted R-squared: 0.7214
F-statistic: 24.83 on 5 and 41 DF, p-value: 2.009e-11
Code
# Lectura de librerías
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Lectura de datos
= pd.read_csv("data/SEGUROS_CHICAGO.csv", index_col=0)
chmiss round(2) chmiss.describe().
race fire theft age involact income
count 43.00 45.00 43.00 42.00 44.00 45.00
mean 35.61 11.42 32.65 59.97 0.65 10.74
std 33.26 8.36 23.12 23.62 0.64 2.79
min 1.00 2.00 3.00 2.00 0.00 5.58
25% 3.75 5.60 22.00 48.30 0.00 8.56
50% 24.50 9.50 29.00 64.40 0.50 10.69
75% 57.65 15.10 38.00 78.25 0.92 12.10
max 99.70 36.20 147.00 90.10 2.20 21.48
Code
# Detección de datos perdidos
# Por filas (casos)
sum(axis=1) chmiss.isna().
zip
60626 1
60640 0
60613 1
60657 1
60614 0
60610 0
60611 0
60625 0
60618 1
60647 1
60622 0
60631 0
60646 1
60656 0
60630 0
60634 1
60641 0
60635 0
60639 0
60651 1
60644 1
60624 0
60612 0
60607 1
60623 0
60608 1
60616 1
60632 0
60609 1
60653 0
60615 0
60638 0
60629 1
60636 0
60621 1
60637 0
60652 0
60620 1
60619 0
60649 1
60617 1
60655 0
60643 0
60628 1
60627 0
60633 0
60645 1
dtype: int64
Code
~chmiss.isna(), aspect='auto')
plt.imshow("variables")
plt.xlabel("cases")
plt.ylabel(
plt.gray() plt.show()
Code
# Por columnas (variables)
sum(axis=0) chmiss.isna().
race 4
fire 2
theft 4
age 5
involact 3
income 2
dtype: int64
Code
# Tratamiento de datos perdidos
# Omisión de los datos perdidos: resultados con los datos completos
= smf.ols(formula='involact ~ race + fire + theft + age + np.log(income)',
mod_1 =chmiss).fit()
dataprint(mod_1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: involact R-squared: 0.790
Model: OLS Adj. R-squared: 0.740
Method: Least Squares F-statistic: 15.79
Date: Sun, 09 Feb 2025 Prob (F-statistic): 1.69e-06
Time: 15:27:24 Log-Likelihood: -5.7253
No. Observations: 27 AIC: 23.45
Df Residuals: 21 BIC: 31.23
Df Model: 5
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept -2.4072 1.419 -1.696 0.105 -5.359 0.544
race 0.0111 0.003 3.232 0.004 0.004 0.018
fire 0.0450 0.011 4.208 0.000 0.023 0.067
theft -0.0161 0.006 -2.894 0.009 -0.028 -0.005
age 0.0091 0.003 2.652 0.015 0.002 0.016
np.log(income) 0.8443 0.532 1.588 0.127 -0.261 1.950
==============================================================================
Omnibus: 1.216 Durbin-Watson: 2.453
Prob(Omnibus): 0.545 Jarque-Bera (JB): 0.935
Skew: 0.441 Prob(JB): 0.626
Kurtosis: 2.768 Cond. No. 1.89e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.89e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Code
# Imputación simple:
# Uso de la media para predecir los datos que faltan de los regresores
= chmiss.mean(axis=0)
cmeans cmeans
race 35.609302
fire 11.424444
theft 32.651163
age 59.969048
involact 0.647727
income 10.735867
dtype: float64
Code
= chmiss.copy()
imput_1 'race'],inplace=True)
imput_1.race.fillna(cmeans['fire'],inplace=True)
imput_1.fire.fillna(cmeans['theft'],inplace=True)
imput_1.theft.fillna(cmeans['age'],inplace=True)
imput_1.age.fillna(cmeans['income'],inplace=True)
imput_1.income.fillna(cmeans[= smf.ols(formula='involact ~ race + fire + theft + age + np.log(income)', data=imput_1).fit()
mod_2 print(mod_2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: involact R-squared: 0.681
Model: OLS Adj. R-squared: 0.639
Method: Least Squares F-statistic: 16.25
Date: Sun, 09 Feb 2025 Prob (F-statistic): 1.46e-08
Time: 15:27:24 Log-Likelihood: -17.153
No. Observations: 44 AIC: 46.31
Df Residuals: 38 BIC: 57.01
Df Model: 5
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 0.5296 1.088 0.487 0.629 -1.673 2.732
race 0.0070 0.003 2.452 0.019 0.001 0.013
fire 0.0282 0.010 2.956 0.005 0.009 0.048
theft -0.0033 0.003 -1.206 0.235 -0.009 0.002
age 0.0062 0.003 1.976 0.055 -0.000 0.013
np.log(income) -0.3158 0.390 -0.809 0.423 -1.106 0.474
==============================================================================
Omnibus: 1.687 Durbin-Watson: 2.026
Prob(Omnibus): 0.430 Jarque-Bera (JB): 0.817
Skew: -0.219 Prob(JB): 0.665
Kurtosis: 3.504 Cond. No. 1.68e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.68e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Code
# Imputación múltiple: enfoque MICE
# (https://www.statsmodels.org/stable/imputation.html)
# NOTA: la librería statsmodels incluye el método MICE, pero no es
# exactamente el mismo que el usado en R
import statsmodels.imputation.mice as smimice
= smimice.MICEData(chmiss)
imp = 'involact ~ race + fire + theft + age + np.log(income)'
formula = smimice.MICE(formula, sm.OLS, imp)
mod_3 = mod_3.fit(10, 10)
results print(results.summary())
Results: MICE
====================================================================
Method: MICE Sample size: 47
Model: OLS Scale 0.14
Dependent variable: involact Num. imputations 10
--------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975] FMI
--------------------------------------------------------------------
Intercept 0.2436 1.2045 0.2022 0.8397 -2.1172 2.6043 0.1600
race 0.0070 0.0029 2.4425 0.0146 0.0014 0.0127 0.1239
fire 0.0314 0.0095 3.3225 0.0009 0.0129 0.0499 0.0287
theft -0.0042 0.0030 -1.3838 0.1664 -0.0101 0.0017 0.2629
age 0.0059 0.0033 1.8233 0.0683 -0.0004 0.0123 0.2162
np.log(income) -0.1975 0.4348 -0.4541 0.6497 -1.0497 0.6548 0.1554
====================================================================