R version 2.15.1 (2012-06-22) -- "Roasted Marshmallows" Copyright (C) 2012 The R Foundation for Statistical Computing ISBN 3-900051-07-0 Platform: i386-apple-darwin9.8.0/i386 (32-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. [R.app GUI 1.52 (6188) i386-apple-darwin9.8.0] > # STAT 100B chapter 7 simulation > # Purpose: To understand random sampling, central limit theorem and confidence intervals > hospitals=read.table("http://www.stat.ucla.edu/~hqxu/stat100B/hospitals.txt", h=T) > summary(hospitals) discharges beds Min. : 14.0 Min. : 10.0 1st Qu.: 311.0 1st Qu.:102.0 Median : 713.0 Median :233.0 Mean : 814.6 Mean :274.8 3rd Qu.:1186.0 3rd Qu.:393.0 Max. :2844.0 Max. :986.0 > discharges = hospitals$discharges > sort(discharges) [1] 14 22 23 35 36 41 42 48 49 52 56 57 58 64 64 66 75 76 78 79 81 84 85 [24] 87 87 89 90 91 92 95 95 98 100 100 103 108 109 115 118 120 120 121 124 125 134 141 [47] 153 170 173 174 182 186 194 200 209 210 213 215 216 220 222 225 227 228 229 231 231 233 233 [70] 233 239 240 242 243 243 244 244 247 253 255 258 259 260 265 266 270 273 283 296 297 297 297 [93] 298 301 308 308 309 310 311 315 318 321 322 323 326 327 337 345 346 355 360 360 362 362 368 [116] 371 373 373 376 377 381 383 384 389 402 410 411 414 414 416 426 427 431 439 439 440 444 446 [139] 453 456 467 467 470 471 475 479 481 486 487 490 498 504 505 518 531 534 535 538 539 543 557 [162] 557 558 573 586 587 590 590 592 592 594 601 609 610 611 621 622 625 629 632 635 637 652 662 [185] 665 669 670 670 684 689 690 695 697 703 705 707 713 715 726 726 732 744 754 764 767 773 778 [208] 778 787 788 795 808 810 811 814 824 828 834 842 852 855 858 861 872 876 876 884 885 887 889 [231] 896 906 912 915 918 925 926 928 931 935 941 944 946 951 955 956 956 956 958 966 974 976 985 [254] 986 989 995 999 1007 1009 1009 1011 1012 1016 1028 1029 1031 1036 1040 1042 1049 1060 1063 1076 1084 1089 1093 [277] 1095 1097 1097 1098 1105 1106 1126 1133 1137 1149 1150 1152 1153 1156 1160 1166 1173 1175 1186 1201 1210 1218 1219 [300] 1219 1225 1226 1231 1232 1232 1239 1239 1258 1272 1283 1287 1301 1315 1322 1326 1346 1347 1350 1355 1369 1370 1373 [323] 1376 1376 1386 1389 1390 1394 1400 1413 1415 1418 1420 1425 1453 1463 1478 1504 1509 1522 1527 1534 1547 1583 1584 [346] 1606 1617 1620 1624 1632 1634 1645 1648 1657 1665 1669 1678 1684 1705 1706 1707 1715 1719 1744 1765 1766 1780 1785 [369] 1785 1789 1805 1828 1835 1893 1894 1948 2031 2034 2051 2058 2089 2116 2135 2150 2154 2171 2190 2240 2268 2700 2766 [392] 2818 2844 > mean(discharges) # mean mu=sum(x_i)/N [1] 814.6031 > var(discharges) # sample variance = sum((x_i-mu)^2)/(N-1) [1] 347766.4 > sd(discharges) # sample standard deviation [1] 589.7173 > hist(discharges) # Figure 7.1 > > # define a function to compute popu. variance > pop.var=function(x) mean(x^2) - mean(x)^2 > pop.sd=function(x) sqrt(pop.var(x)) > pop.var(discharges) # population variance sigma^2 = sum((x_i-mu)^2)/N [1] 346881.5 > pop.sd(discharges) # population standard deviation [1] 588.9665 > ## note: the population standard deviation should be 588.9665. The text gives the sample standard deviation=589.7173. > ## the difference is small. > > ?sd # var and sd use denominator n - 1 not n. starting httpd help server ... done > 392*var(discharges) - 393*pop.var(discharges) # should be 0 [1] 0 > > # take simple random samples with function sample() > ?sample > sample(1:10, 5) [1] 5 9 7 4 10 > sample(1:10, 5) [1] 2 9 5 4 10 > > # simple random sample of size n > pop = discharges # population > (N=length(pop)) # popu size [1] 393 > n=64 # sample size 8, 16, 32, 64 in Figure 7.2 > > (x=sample(pop, n)) [1] 57 2135 1029 90 1894 416 637 228 467 1160 535 1031 2818 345 14 239 2051 194 444 297 270 247 1219 [24] 377 134 887 487 587 592 1390 1634 538 573 1149 225 928 504 1522 629 470 301 431 1076 1415 23 1097 [47] 89 999 381 346 1828 912 308 79 1226 389 1400 1133 744 1632 2154 174 622 1084 > mean(x) [1] 785.7188 > sd(x) [1] 624.4496 > > (x=sample(pop, n)) [1] 884 355 726 121 1301 103 1715 373 1287 906 594 120 244 587 925 951 89 200 1175 535 1373 926 995 [24] 989 1648 1097 92 411 1645 1415 222 1315 852 227 632 85 467 100 240 273 1547 1504 1186 1785 1042 1272 [47] 2844 1744 186 1418 814 231 896 265 247 592 876 2154 402 360 1007 498 118 778 > mean(x) [1] 811.8906 > sd(x) [1] 595.1521 > > K=1000; > xbar=rep(0,K) > for(i in 1:K){ + (x=sample(pop, n)) + xbar[i]=mean(x) + } > summary(xbar) Min. 1st Qu. Median Mean 3rd Qu. Max. 628.6 768.6 814.5 815.2 860.5 1020.0 > mean(xbar) # mean of xbar from K simulations [1] 815.231 > sd(xbar) # empirical standard error of xbar from K simulations [1] 67.67512 > # examine the distribution of K sample means (xbar). > hist(xbar, main=paste("Histogram of xbar, n=",n)) # Figure 7.2 approximately normal for large n > > # proportion of xbar differed by more than 100 from mu > sum(abs(xbar-mean(pop))>100)/K [1] 0.148 > > mean(pop) # population mean [1] 814.6031 > pop.sd(pop)/sqrt(n)*sqrt((N-n)/(N-1)) # standard error of xbar [1] 67.44589 > pop.sd(pop)/sqrt(n) # standard error of xbar, ignoring the finite population correction factor [1] 73.62081 > > > # STAT 100B: confidence interval simulation > # > # population: patient discharges > # simple random sample of size n > pop = discharges # population > (N=length(pop)) # popu size [1] 393 > mu=mean(pop); sigma=pop.sd(pop); # > n=25; # sample size n > (sigma.xbar = pop.sd(pop)/sqrt(n)*sqrt((N-n)/(N-1))) # standard error of xbar [1] 114.1304 > > # 95% CI margin=1.96*sigma.xbar > (margin=1.96*sigma.xbar) [1] 223.6956 > > # draw a simple random sample of n > (x=sample(pop, n)) [1] 2051 1287 467 243 42 1583 995 498 1707 872 444 377 446 621 778 999 153 590 1705 381 1893 611 98 [24] 505 90 > mean(x) [1] 777.44 > # 95% CI for mu, z(.025)=1.96 > c(mean(x) - 1.96*sigma.xbar, mean(x) + 1.96*sigma.xbar) [1] 553.7444 1001.1356 > > ## or estiamte sigma.bar by s.xbar > (s.xbar = sd(x)/sqrt(n)*sqrt(1-n/N)) # estimated standard error of xbar [1] 116.0533 > c(mean(x) - 1.96*s.xbar, mean(x) + 1.96*s.xbar) [1] 549.9756 1004.9044 > > # do it again, > (x=sample(pop, n)) [1] 92 98 1029 14 315 453 767 824 855 1665 216 935 36 1744 1126 247 1418 665 1219 610 1386 1322 471 [24] 1063 918 > mean(x) # sample mean [1] 779.52 > # 95% CI for mu is xbar +/- margin; > c(mean(x) - margin, mean(x) + margin) [1] 555.8244 1003.2156 > > # do it again, > (x=sample(pop, n)) [1] 1369 999 966 1084 1705 2034 1948 109 1684 311 100 439 889 92 2116 787 690 14 925 590 1355 1418 1606 [24] 1028 255 > mean(x) # sample mean [1] 980.52 > # 95% CI for mu is xbar +/- margin; > c(mean(x) - margin, mean(x) + margin) [1] 756.8244 1204.2156 > > ## simulation A: use sigma.bar > ### repeat the procedure K times, save the K sample means in xbar > ### > K=100; xbar = rep(0, K) > for(i in 1:K) { x=sample(pop, n); xbar[i] = mean(x) } > > (margin=1.96*sigma.xbar) # margin is a number [1] 223.6956 > > # K 95% CI's for mu is xbar +/- margin; > ci=cbind(xbar - margin, xbar + margin) > ci[1:min(20,K),] # show first 20 CI's [,1] [,2] [1,] 545.1044 992.4956 [2,] 454.5844 901.9756 [3,] 494.2644 941.6556 [4,] 803.1844 1250.5756 [5,] 829.4644 1276.8556 [6,] 562.1044 1009.4956 [7,] 511.8644 959.2556 [8,] 574.9844 1022.3756 [9,] 398.8244 846.2156 [10,] 587.8244 1035.2156 [11,] 816.2644 1263.6556 [12,] 788.5444 1235.9356 [13,] 510.1844 957.5756 [14,] 681.1844 1128.5756 [15,] 755.7844 1203.1756 [16,] 569.9844 1017.3756 [17,] 525.5444 972.9356 [18,] 637.7044 1085.0956 [19,] 582.0644 1029.4556 [20,] 592.1444 1039.5356 > > # plot the K CI's > matplot(cbind(xbar - margin, xbar + margin), type="n") > for(i in 1:K) lines(c(i,i),c(xbar[i] - margin, xbar[i] + margin)) > abline(h=mu) # add a line > > # count the number of CI's containing mu, which is not exactly 100(1-alpha) > contain.mu = xbar-margin < mu & mu < xbar + margin > sum(contain.mu) # number of CI's containing mu [1] 91 > (1:K)[!contain.mu] # list of CI's do not contain mu [1] 5 11 26 27 39 41 42 74 80 > > > ## simulation B: estiamte sigma.bar by (s.xbar = sd(x)/sqrt(n)*sqrt(1-n/N)) > # repeat the procedure K times, save the K sample means in xbar > K=100; sd.x = xbar = rep(0, K) > for(i in 1:K) { x=sample(pop, n); xbar[i] = mean(x); sd.x[i]=sd(x) } > > (s.xbar = sd.x/sqrt(n)*sqrt(1-n/N)) # estimated standard error of xbar [1] 149.04532 95.87504 107.66076 132.87547 95.85226 105.59134 102.57643 107.75369 130.38309 111.13549 84.24756 [12] 119.32188 85.32578 102.84166 103.02316 116.00645 110.32553 118.40502 123.90642 114.41863 136.74604 126.46866 [23] 128.76736 99.79612 120.09234 107.33954 118.27789 90.80950 103.07687 107.79002 108.23991 95.70246 111.94577 [34] 120.34387 127.12453 124.86641 128.18698 115.70020 130.48205 104.34907 100.05659 99.41573 143.34400 116.36737 [45] 127.34119 103.17494 130.38976 109.77661 129.59046 106.21062 141.49158 87.21732 117.24083 120.87569 115.17148 [56] 108.41477 110.21362 106.67716 77.92286 124.80054 110.76185 100.88888 125.53959 120.21340 140.25634 112.89065 [67] 140.20494 130.41759 123.18462 79.93993 112.17361 93.60985 102.81457 139.64338 84.61022 96.46418 112.89077 [78] 99.50410 130.67686 116.13125 98.56003 105.98414 85.58291 111.43724 118.15411 137.88032 109.97058 133.93434 [89] 93.95223 107.28861 104.74959 115.95465 107.70985 126.39463 104.95491 124.18902 114.05880 99.05772 86.22106 [100] 140.20415 > (margin = 1.96*s.xbar) # margin is a vector now [1] 292.1288 187.9151 211.0151 260.4359 187.8704 206.9590 201.0498 211.1972 255.5509 217.8256 165.1252 233.8709 167.2385 [14] 201.5696 201.9254 227.3727 216.2380 232.0738 242.8566 224.2605 268.0222 247.8786 252.3840 195.6004 235.3810 210.3855 [27] 231.8247 177.9866 202.0307 211.2684 212.1502 187.5768 219.4137 235.8740 249.1641 244.7382 251.2465 226.7724 255.7448 [40] 204.5242 196.1109 194.8548 280.9542 228.0800 249.5887 202.2229 255.5639 215.1622 253.9973 208.1728 277.3235 170.9460 [53] 229.7920 236.9164 225.7361 212.4929 216.0187 209.0872 152.7288 244.6091 217.0932 197.7422 246.0576 235.6183 274.9024 [66] 221.2657 274.8017 255.6185 241.4419 156.6823 219.8603 183.4753 201.5166 273.7010 165.8360 189.0698 221.2659 195.0280 [79] 256.1266 227.6173 193.1777 207.7289 167.7425 218.4170 231.5821 270.2454 215.5423 262.5113 184.1464 210.2857 205.3092 [92] 227.2711 211.1113 247.7335 205.7116 243.4105 223.5553 194.1531 168.9933 274.8001 > > # K 95% CI's for mu is xbar +/- margin; > ci=cbind(xbar - margin, xbar + margin) > ci[1:min(20,K),] # show first 20 CI's [,1] [,2] [1,] 639.8712 1224.1288 [2,] 654.0449 1029.8751 [3,] 398.5849 820.6151 [4,] 556.2041 1077.0759 [5,] 475.8896 851.6304 [6,] 522.3210 936.2390 [7,] 538.1902 940.2898 [8,] 682.9628 1105.3572 [9,] 721.2091 1232.3109 [10,] 664.0944 1099.7456 [11,] 483.5548 813.8052 [12,] 620.7691 1088.5109 [13,] 580.3215 914.7985 [14,] 696.1104 1099.2496 [15,] 483.2746 887.1254 [16,] 620.7073 1075.4527 [17,] 491.6020 924.0780 [18,] 514.6862 978.8338 [19,] 638.9434 1124.6566 [20,] 533.2995 981.8205 > > # plot the K CI's > matplot(cbind(xbar - margin, xbar + margin), type="n") > for(i in 1:K) lines(c(i,i),c(xbar[i] - margin[i], xbar[i] + margin[i])) > abline(h=mu) # add a line > > # count the number of CI's containing mu, which is not exactly 100(1-alpha) > contain.mu = xbar-margin < mu & mu < xbar + margin > sum(contain.mu) # number of CI's containing mu [1] 93 > (1:K)[!contain.mu] # list of CI's do not contain mu [1] 11 41 70 79 81 98 99