Date: Mon, 3 Nov 1997 09:13:12 +1100
Reply-To: "Bruce Bradbury (remove deletethis from address)"
<b.bradbury@DELETETHIS.UNSW.EDU.AU>
Sender: "SAS(r) Discussion" <SAS-L@UGA.CC.UGA.EDU>
From: "Bruce Bradbury (remove deletethis from address)"
<b.bradbury@DELETETHIS.UNSW.EDU.AU>
Organization: Social Policy Research Centre
Subject: Re: help: lorenz curves and sas
Luiz Eduardo Miranda Cruz wrote in message
<345AE7CB.7BB6@econ.Berkeley.edu>...
>does anyone has a program to calculate lorenz curves using SAS?
>
This you could modify this macro. The cumulative share variable in the
output dataset gives you 10 points on the Lorenz curve (the cumulative mean
is also a simple transformation of the generalised Lorenz curve).
If your data is not weighted, the program could be a lot simpler than the
one shown here.
%macro decile(y=,w=1,dsin=_last_,dout=,gout=,suffix=,print=1);
%* -------------------------------------------------------------------
Calculates decile shares etc for variable y with weight w.
(gini coefficients are also calculated).
Special Note: cases with weights which place them on the decile
boundaries are split in order to provide exact decile
shares. However this does not deal with tied cases.
If you want to allocate these you should add a small
random variable to the y variable.
Input: y variable whose distribution is to be described.
w weight variable (default = 1).
dsin data set containing these variables (program is most
efficient if no other variables are present)
(default = _last_).
dout name of output dataset for deciles
gout name of output dataset for gini
suffix Specify two letters (or numbers) to be appended to
the output variable names. This is useful if you
want to combine output from separate runs of this
program.
print Specify 0 if you do not want printed output (def = 1)
Note: The program uses the library format decile.
Output: Decile boundaries, mean incomes, shares, cumulative
shares and cumulative shares*mean.
Also gini coefficient.
The dataset dsout will contain 11 cases representing
the 10 deciles plus a total (placed last) with the
following variables (where zz represents the suffix
if specified),
decile - decile 1...10 and 99 for the total.
wcaseszz-number of weighted cases in decile.
lbzz - lower bound of decile,
ubzz - upper bound
meanzz - mean income in decile,
sharezz- income share of decile (in %)
csharezz-cumulative share (in %)
cmzz - cumulative mean.
In addition a data set with name gout will be
created with one case with the following variables,
ginizz - gini coefficient
meanzz, wcaseszz - as defined above,
nzz - unweighted cases.
Author: BWB Sept 90.
Modifications BWB May 91, CM rather than CSM output.
--------------------------------------------------------------------;
* sort file
---------;
proc sort data=&dsin out=sorttmp;by &y;
* Get the grand totals of the weight variable
-------------------------------------------;
proc means noprint;
var &w;
output out=gtotals sum=gtwgts;
* Calculate decile rankings
-------------------------;
data frank;
set sorttmp;
if _n_=1 then set gtotals;
* check for missing cases;
if &y=. | &w=. then error "missing values found";
* initialise some variables;
retain cs 0;
length decile 3;
retain cy 0;
cs = cs+&w; /* cumulative sum of case weights */
frank = cs/gtwgts; /* fractional rank */
* determine the decile of the present case;
select;
when (frank<=0.10) decile = 1; /* bottom decile */
when (frank<=0.20) decile = 2; /* next decile */
when (frank<=0.30) decile = 3;
when (frank<=0.40) decile = 4;
when (frank<=0.50) decile = 5;
when (frank<=0.60) decile = 6;
when (frank<=0.70) decile = 7;
when (frank<=0.80) decile = 8;
when (frank<=0.90) decile = 9;
otherwise decile = 10; /* top decile */
end;
* Gini calculations;
wy = &y * &w; /* total income represented by present case */
cy = cy + wy; /* cumulative income */
area = (cy - wy/2) * &w; /* un-normalised lorenz area */
original = 1; /* flag for original cases */
length original 3;
keep &y &w decile area wy original;
* adjustments for cases which straddle the quantile boundary;
ldecile = lag(decile);
if ldecile^=. & ldecile^=decile then do;
* i.e. just for the first case over the boundary;
* find the fractional rank of the boundary just passed;
select;
when (decile = 2) bound = .1 ;
when (decile = 3) bound = .2 ;
when (decile = 4) bound = .3;
when (decile = 5) bound = .4;
when (decile = 6) bound = .5;
when (decile = 7) bound = .6;
when (decile = 8) bound = .7;
when (decile = 9) bound = .8;
when (decile = 10) bound = .9;
end;
* create temporary variables;
tempw = &w;
tdecile = decile;
* output a case below boundary;
decile = ldecile; /* decile is value below boundary */
&w = &w-(frank-bound)*gtwgts;
* e.g. if the boundary is 0.1, and the present cases fractional
rank is 0.1004, and there are a total of 1,000,000 weighted cases
and the weight of the present case is 1000, then this will split
into 1000-(0.1004-0.1)*1,000,000 = 400 cases below the boundary
and 600 above.;
output;
* output case above quantile boundary;
decile = tdecile; /* i.e. the original decile variable */
&w = (frank-bound)*gtwgts;
area = 0; /* not needed a second time for ginis */
wy = 0;
original = 0; /* used in counting no of unweighted cases */
output;
end; /* of cases straddling the boundary */
else output; /* output other cases */
run;
* stats for deciles
-----------------;
proc summary data=frank ; /* this outputs one case per decile + */
weight &w; /* a case for the whole file */
class decile;
var &y ;
output sumwgt(&y)=wcases&suffix mean(&y)=mean&suffix sum(&y)=totaly
min(&y)=lb&suffix max(&y)=ub&suffix
;
data ;
set _last_;
if _type_=0 then do; /* ie grand total */
gtw=wcases&suffix; /* grand total weighted cases */
gty=totaly; /* grand total weighted y */
gtm=mean&suffix; /* overall mean y */
decile=99;
end;
retain gtw gty gtm; /* grand totals to be used */
share&suffix= 100*totaly/gty; /* income share of decile in %*/
retain cshare&suffix 0;
cshare&suffix = share&suffix + cshare&suffix;
/* cumulative income share */
if _type_=0 then cshare&suffix = 0;/* reinitialise for totals case */
cm&suffix = (cshare&suffix*gtm/100)/(decile/10);
keep _type_ decile share&suffix mean&suffix wcases&suffix
cshare&suffix cm&suffix lb&suffix ub&suffix;
* these are the variables to be found in the dataset dout;
* Put the totals last;
proc sort;
by decile;
data &dout;
set _last_;
if decile=99 then do;
cshare&suffix=.;
cm&suffix=.;
end;
run;
* print decile shares;
%if &print %then %do;
proc print split='|';
Title3 "Decile Shares for &y";
title4 "from data set &dsin with weight variable &w";
format decile 3.0
lb&suffix ub&suffix comma9.0
mean&suffix comma8.0 share&suffix cshare&suffix 6.2
cm&suffix comma8.0 wcases&suffix comma10.0;
id decile;
var lb&suffix ub&suffix mean&suffix share&suffix cshare&suffix
cm&suffix wcases&suffix ;
label lb&suffix = 'Lower|Bound';
label ub&suffix = 'Upper|Bound';
label mean&suffix = 'Mean';
label share&suffix = 'Share';
label cshare&suffix = 'Cumulative|Share';
label cm&suffix = 'Cumulative|Mean';
label wcases&suffix = 'Weighted|Cases';
%end;
* do gini calculation
-------------------;
proc summary data=frank; /* produces just one case */
var area wy &w original;
output sum(area)=tarea sum(wy)=ty
sum(&w)=sumwgt sum(original)=n&suffix;
data &gout;
set _last_;
wcases&suffix = sumwgt; /* grand total weighted cases */
mean&suffix = ty/sumwgt; /* mean y */
gini&suffix = 1 - 2*(tarea/(ty*sumwgt));
keep gini&suffix mean&suffix wcases&suffix n&suffix ;
run;
* print gini statistics;
%if &print %then %do;
proc print;
title3 "Gini Coefficient for variable &y weighted by &w";
title4 "from data set &dsin";
format gini&suffix 5.3 ;
format mean&suffix comma10.0 ;
format wcases&suffix comma20.0;
format n&suffix comma8.0;
run;
%end;
* reset the titles;
title3;
%mend decile;