Date: Wed, 3 Nov 2004 12:03:42 -0700
Reply-To: Michael Murff <mjm33@MSM1.BYU.EDU>
Sender: "SAS(r) Discussion" <SAS-L@LISTSERV.UGA.EDU>
From: Michael Murff <mjm33@MSM1.BYU.EDU>
Subject: Re: Output first record of consecutive recordstreak(s) with
countof each streak
Content-Type: text/plain; charset=US-ASCII
Hi SAS-L,
I would like to warmly thank (in no particular order) Kevin, Howard,
Puddin, Nat, and Richard for their expert assistance on this problem.
For summary purposes I include three working approaches below. The goal
was to count up the number of streaks or runs within a by group. I have
tested these programs for their CPU efficiency on a PC, but did not
bother to do so on our Linux server.
I ran two test, one with i=100 and one with i=100K. Note that N is
roughly equal to i*50 (as an upper limit). The performance was as
follows:
(rd) - (small n: ~.18 sec; large n: ~20 sec)
(hs) - (small n: ~.07 sec; large n: ~5 sec)
(kv) - (small n: ~.18 sec; large n: ~21 sec)
These numbers are the sum of each step, where more than one is used. I
have to say that the Double DoW is pretty darn impressive both in speed
and syntactical brevity. Can anyone think of a case where a triple DoW
or some higher order DoW might be useful?
Best,
Michael Murff
**********************************************;
%let fromdate=01jan1980;
%let todate=01nov2004;
/* data simulation */
data test;
do i=1 to 100;
do j = 1 to int(50*ranuni(10));
coname=compress("coname"||put(i,z3.));
date1=ranuni(1)*("&todate"d-"&fromdate"d)+"&fromdate"d;
date2=ranuni(2)*("&todate"d-"&fromdate"d)+"&fromdate"d;
a = ranuni(3);
b = ranuni(4);
if a > b then streak = 1;
else streak = 0;
obs +1;
output;
format date1 date2 date9.;
end;
end;
drop i j;
run;
/************/
/* Richard Devenzia's Solution */
/* two-step approach */
/************/
data rdtemp;
length streakid markcount 8;
set test;
by coname;
if first.coname then markcount = 0;
if a>b
then markcount + 1;
else markcount = 0;
if markcount = 1 then
streakid + 1;
run;
proc sql;
create table rdevenzia as
select coname, date1,date2, a,b, obs, max(markcount) as streaklength
from rdtemp
group by coname, streakid
having markcount=1 and max(markcount) > 3
;
quit;
/************/
/* Howard Schreier's Solution */
/* Double DoW */
/************/
data hschreier(drop=n);
do cnt = 1 by 1 until(last.streak);
set hstemp;
by coname streak notsorted;
end;
do n = 1 to cnt;
set hstemp;
by coname streak notsorted;
if n=1 and streak and cnt>3 then output;
end;
run;
/************/
/* Kevin Viel's Solution */
/* SQL / DoW with Array processing */
/************/
proc sql noprint ;
select put( max( count ) , 8. ) into : nobs
from ( select count( * ) as count
from test
group by coname
)
;
select put( count( distinct coname ) , 8. ) into : n_coname
from test
;
quit;
%let streak = 4 ;
data kviel ( drop = _m_ ) ;
array O_ ( &n_coname. , &nobs. ) _temporary_ ;
array C_ ( &n_coname. , &nobs. ) _temporary_ ;
do _m_ = 1 by 1 until ( end ) ;
count = 0 ;
do _n_ = 1 by 1 until ( last.coname ) ;
set test end = end ;
by coname ;
if last.coname = 0 then
do ;
if a <= b and count => &streak. then
do ;
O_( _m_ , _n_ - count ) = 1 ;
C_( _m_ , _n_ - count ) = count ;
count = 0 ;
end ;
else if a > b then count + 1 ;
else if a <= b then count = 0 ;
end ;
else
do ;
if a > b and count => %eval( &streak. - 1 ) then
do ;
O_( _m_ , _n_ - count ) = 1 ;
C_( _m_ , _n_ - count ) = count ;
count = 0 ;
end ;
else if a <= b and count => &streak. then
do ;
O_( _m_ , _n_ - count ) = 1 ;
C_( _m_ , _n_ - count ) = count ;
end ;
end ;
end ;
end ;
do _m_ = 1 by 1 until ( end1 ) ;
do _n_ = 1 by 1 until ( last.coname ) ;
set test end = end1 ;
by coname ;
*obs + 1 ;
if O_( _m_ , _n_ ) = 1 then
do ;
count = C_( _m_ , _n_ ) ;
output ;
end ;
end ;
end ;
run ;
proc compare base=hschreier compare=kviel;
var obs date1 date2;
run;
proc compare base=hschreier compare=rdevenzia;
var obs date1 date2;
run;