Saturday, July 5, 2008

Bootstrap code for forming confidence intervals in Max Entropy estimation

Maximum entropy estimation has become an important tool when few observations are available. Some references are:

Confidence intervals can be formed using the bootstrap or resampling approach. The percentile method is used to estimate the confidence intervals. The model below illustrates how this can be implemented in GAMS.


$ontext

   Bootstrap GME model
 
   Copyright 2007 Erwin Kalvelagen, Amsterdam Optimization
 
   References:
     [1] Maximum entropy estimation in economic models with linear
         inequality restrictions, Randall C. Campbell, R. Carter Hill
         Department of Economics, Louisiana State University,
         Baton Rouge, LA 70803,USA, 2001
 
     [2] Ramanathan, R., 2002. Introductory econometrics with applications
         (Harcourt College Publishers, Fort Worth).
 
     [3] Bardley Efron, Robert J. Tibshirani, "An Introduction to the Bootstrap",
         Chapman & Hall, 1993
 
   Files:
     bootstrap-gme.gms    -- gams file
     poverty.inc          -- data file
     tc.inc (optional)    -- file with quantiles for Student t distribution
 
 
$offtext
 
 
set i /case1*case116/
    k0 /
        constant 'constant term in regression'
        povrate  'poverty level'
        urb      'not used'
        famsize  'average household size'
        unemp    'percentage unemployment rate'
        highschl 'percentage of population age 25 and over with high school degree only'
        college  'percentage of population age 25 and over that completed 4 or more years of college'
        medinc   'median household income'
        D90      'dummy: one for the 1990 Census and zero for the 1980 Census'
      /
   k1(k0) 'data used' /povrate,famsize,unemp,highschl,college,medinc,D90/
;
 
*------------------------------------------------------------------------------
* input data from Ramanthan (2002, Data 7-6, p. 653)
*------------------------------------------------------------------------------
 
table data(i,k0) '-- input data from Ramanthan [2] (2002, Data 7-6, p. 653)'
$include poverty.inc
;
display data;
 
 
*------------------------------------------------------------------------------
* critical values t distribution
* this table is available from amsterdamoptimization.com
* if not available you can use a normal approximation
*------------------------------------------------------------------------------
 
$if exist tc.inc $include tc.inc
$if defined tc $goto tcok
 
 
* we have no access to tc.inc so we use a normal approximation
 
set prob /p1,p2,p3,p4,p5,p6/;
parameter probval(prob) /
  p1 0.10,  p2 0.05,  p3 0.025,  p4 0.01,  p5 0.005,  p6 0.001
/;
 
parameter qnorm(prob) /
  p1 1.281552, p2 1.644854, p3 1.959964, p4 2.326348, p5 2.575829, p6 3.090232
/;
 
$label tcok
 
 
*------------------------------------------------------------------------------
* create some summary statistics
*------------------------------------------------------------------------------
 
parameter summary(k1,*) '-- summary statistics (table 1 in [1])';
summary(k1,'mean') = sum(i,data(i,k1))/card(i);
summary(k1,'min') = smin(i,data(i,k1));
summary(k1,'max') = smax(i,data(i,k1));
summary(k1,'stdev') = sqrt[sum(i,sqr[data(i,k1)-summary(k1,'mean')])/(card(i)-1)];
summary(k1,'coeffvar') = summary(k1,'stdev')/summary(k1,'mean');
display summary;
 
*------------------------------------------------------------------------------
* linear regression using OLS
*------------------------------------------------------------------------------
 
set k(k0) 'regression coefficients'
    /constant,famsize,unemp,highschl,college,medinc,D90/;
data(i,'constant') = 1;
 
variables
  beta(k)  'regression coefficients'
  sse      'sum of squared errors'
;
equations
  dummyobj  'dummy objective'
  fit(i)    'fit linear equations'
;
 
dummyobj..   sse =n= 0;
fit(i)..     data(i,'povrate') =e=  sum(k, beta(k)*data(i,k));
 
model ols /dummyobj,fit/;
 
option lp=ls;
ols.limrow = 0;
ols.limcol = 0;
ols.solprint = 0;
solve ols using lp minimizing sse;
 
* Note: compare OLS results to page 8 in [1]
 
parameter estim(k,*,*) 'estimates';
estim(k,'OLS','-') = beta.l(k);
 
*------------------------------------------------------------------------------
* Calculate OLS confidence intervals
*------------------------------------------------------------------------------
 
parameter ols_se(k) 'standard errors';
ols_se(k) = beta.m(k);
display ols_se;
 
set ival 'confidence interval' /lo,up/;
scalar ndf 'degrees of freedom';
ndf = card(i) - card(k);
scalar alpha 'significance level' /0.025/;
scalar qt 'critical value';
$if defined tc        qt = sum((df,prob)$(dfval(df)=ndf and probval(prob)=0.025), tc(df,prob));
$if not defined tc    abort$(ndf<30) "Normal approximation not valid";
$if not defined tc    qt = sum(prob$(probval(prob)=0.025), qnorm(prob));
display ndf,alpha,qt;
 
parameter ols_conf_ival(k,ival);
ols_conf_ival(k,'lo') = beta.l(k) - qt*ols_se(k);
ols_conf_ival(k,'up') = beta.l(k) + qt*ols_se(k);
display ols_conf_ival;
 
* Note: these can also be imported from the GDX file LS.GDX
* See http://amsterdamoptimization.com/pdf/nlregression.pdf
 
*------------------------------------------------------------------------------
* GME model
*------------------------------------------------------------------------------
 
set j 'support points' /j1*j5/;
 
variables p(k,j), w(i,j);
p.lo(k,j) = 0.0001;
w.lo(i,j) = 0.0001;
 
variable
  entrpy      'entropy'
  e(i)        'residuals'
;
equations
  parmsupp(k) 'parameter support'
  errsupp(i)  'error support'
  normp(k)    'normalize p'
  normw(i)    'normalize w'
  obj         'maximize entropy'
  linear(i)   'linear model'
;
 
parameter
  z(k,j)      'parameter support matrix'
  v(i,j)      'error support matrix'
;
 
obj..          entrpy =e= -sum((k,j), p(k,j)*log(p(k,j)))-sum((i,j), w(i,j)*log(w(i,j)));
linear(i)..    data(i,'povrate') =e= sum(k, beta(k)*data(i,k)) + e(i);
parmsupp(k)..  beta(k) =e= sum(j, z(k,j)*p(k,j));
errsupp(i)..   e(i) =e= sum(j, v(i,j)*w(i,j));
normp(k)..     sum(j, p(k,j)) =e= 1;
normw(i)..     sum(j, w(i,j)) =e= 1;
 
model gme /obj,parmsupp,errsupp,linear,normp,normw/;
 
 
*------------------------------------------------------------------------------
* GME model support data
*------------------------------------------------------------------------------
set s 'error support scenarios' / s3, s4 /;
set gmex 'parameter support scenarios' /gme1,gme2,gme3/;
 
table z_all(gmex,k,j) 'parameter support for GME model'
                      j1   j2   j3   j4   j5
   gme1.constant     -50  -25    0   25   50
   gme1.famsize      -20  -10    0   10   20
   gme1.unemp        -20  -10    0   10   20
   gme1.highschl     -20  -10    0   10   20
   gme1.college      -20  -10    0   10   20
   gme1.medinc       -20  -10    0   10   20
   gme1.d90          -20  -10    0   10   20
 
   gme2.constant     -50  -25    0   25   50
   gme2.famsize      -10   -5    0    5   10
   gme2.unemp         -2   -1    0    1    2
   gme2.highschl      -2   -1    0    1    2
   gme2.college       -2   -1    0    1    2
   gme2.medinc       -10   -5    0    5   10
   gme2.d90          -20  -10    0   10   20
 
   gme3.constant     -50  -25    0   25   50
   gme3.famsize       -5    0    5   10   15
   gme3.unemp         -1    0    1    2    3
   gme3.highschl      -3   -2   -1    0    1
   gme3.college       -3   -2   -1    0    1
   gme3.medinc       -15  -10   -5    0    5
   gme3.d90          -20  -10    0   10   20
 
;
 
table esup_all(s,j) 'error support'
        j1    j2  j3    j4  j5
   s3  -10  -5.0   0   5.0  10
   s4  -13  -6.5   0   6.5  13
;
 
display "GME support data",z_all,esup_all;
 
*------------------------------------------------------------------------------
* run GME scenarios
*------------------------------------------------------------------------------
 
gme.limrow=0;
gme.limcol=0;
gme.solprint=2;
gme.solvelink=2;
loop((s,gmex),
  z(k,j) = z_all(gmex,k,j);
  v(i,j) = esup_all(s,j);
  solve gme maximizing entrpy using nlp;
  estim(k,s,gmex)=beta.l(k);
);
 
*------------------------------------------------------------------------------
* report results
*------------------------------------------------------------------------------
 
option estim:3:1:2;
display "---------------------------------------------------",
        "OLS and GME estimates (table 5 in [1])",
        "---------------------------------------------------",
         estim;
 
*------------------------------------------------------------------------------
* create bootstrap sample
*------------------------------------------------------------------------------
 
 
set nbs 'bootstrap samples' /sample1*sample400/;
 
parameter random(nbs,i) 'random draws with replacement';
random(nbs,i) = uniformint(1,card(i));
display random;
 
alias (i,ii);
set srandom(nbs,i,ii) 'as random but now expressed as a set';
srandom(nbs,i,ii)$(ord(ii)=random(nbs,i)) = yes;
display srandom;
 
 
*------------------------------------------------------------------------------
* run bootstrap method
*------------------------------------------------------------------------------
 
 
parameter newdata(i,k0);
 
equation linear2(i) 'linear model';
 
linear2(i)..    newdata(i,'povrate') =e= sum(k, beta(k)*newdata(i,k)) + e(i);
 
model gme2 /obj,parmsupp,errsupp,linear2,normp,normw/;
 
gme2.limrow=0;
gme2.limcol=0;
gme2.solprint=2;
gme2.solvelink=2;
 
parameter estim2(nbs,k,s,gmex) 'estimates';
 
loop(nbs,
    newdata(i,k0) = 0;
    newdata(i,k0) =  sum(srandom(nbs,i,ii), data(ii,k0));
 
    loop((s,gmex),
       z(k,j) = z_all(gmex,k,j);
       v(i,j) = esup_all(s,j);
       solve gme2 maximizing entrpy using nlp;
       estim2(nbs,k,s,gmex)=beta.l(k);
    );
);
 
display estim2;
 
*------------------------------------------------------------------------------
* calculate bootstrap confidence intervals
* percentile method
*------------------------------------------------------------------------------
 
 
* call rank first not from within a loop so declarations can happen
set idummy/a/;
parameters
   xdummy(idummy) /a 1/
   rdummy(idummy)
;
$libinclude rank xdummy idummy rdummy
 
 
parameter bconfival2(k,*,*,ival) 'confidence interval percentile method -- table 13 in [1]';
bconfival2(k,'OLS','-',ival) = ols_conf_ival(k,ival);
 
parameter
   bb(nbs)    'values to be sorted'
   pct(ival)  'percentiles'
   r(nbs)     'rank'
;
loop((k,s,gmex),
   bb(nbs) = estim2(nbs,k,s,gmex);
   pct('LO') = 2.5;
   pct('UP') = 97.5;
$libinclude rank bb nbs r pct
   bconfival2(k,s,gmex,ival) = pct(ival);
);
 
option bconfival2:3:1:3;
display bconfival2;

References: