Date: Sat, 4 Sep 2010 08:33:30 -0400
Reply-To: Richard DeVenezia <rdevenezia@GMAIL.COM>
Sender: "SAS(r) Discussion" <SAS-L@LISTSERV.UGA.EDU>
From: Richard DeVenezia <rdevenezia@GMAIL.COM>
Subject: Re: Conditional Column Drop
In-Reply-To: <201009040041.o83I2p7d003981@willow.cc.uga.edu>
Content-Type: text/plain; charset=ISO-8859-1
On Fri, Sep 3, 2010 at 8:41 PM, oloolo <dynamicpanel@yahoo.com> wrote:
> for small to mid size data set, Art's and Mike's PROC FREQ is the fastest
> and most convenient. Go over array in data step is much slower than internal
> functions in PROC FREQ.
...
>> do over _n;
>> if _n = . and findw(drop_names,vname(_n)) = 0 then
>> drop_names = catx(' ',drop_names, vname(_n));
>> end;
The use of FINDW as a lookup engine is extremely inefficient,
especially if the variable name list is going to be quite large and
the DROP_NAMES variables takes on longer lengths (16k-32k). I have
found that repeated long length string handling techniques are
frustratingly slow.
The DO OVER _N approach can still be used and greatly improved if the
list of found items is maintained in a hash table.
do _n_ = 1 to dim(_n);
if _n(_n_) = . then do;
varname = vname (_n(_n_));
if hnv.find() ne 0 then
hnv.add();
end;
end;
There is a detriment in this approach. The determination of the array
address of element _n(_n_) is a runtime computation. Whilst concise
and understandle, it can add significantly to runtime depending on the
number of elements and number of rows.
An unrolling technique can be used to determine the variable names
apriori to the DATA Step, and then macro is used to codegen source
with explicit variable names. This removes the run-time address
computation load and places it in the compilation stage. The downside
is increased complexity, the upside being a faster running step. I
have found in general that the increased compilation load does not
exceed the benefit of a reduced run-time. Unrolling should be used
sparingly and only in cases where optimal edge performance is
mandated.
Your performance numbers will vary from mine, but hopefully be in same
relative order
----------
* For example, I used a data set with 130K observations and 334
numerical variables ;
data foo;
do rowid = 1 to 1.3E5;
array v(334);
output;
end;
run;
*---- A character variable and FINDW are inefficient, 3ish minutes;
data _null_;
do until(eof);
set foo end = eof;
array _n _numeric_;
array _c _character_;
length drop_names $ 2000;
do over _n;
if _n = . and findw(drop_names,vname(_n)) = 0 then
drop_names = catx(' ',drop_names, vname(_n));
end;
do over _c;
if _c = ' ' and findw(drop_names,vname(_c)) = 0 then
drop_names = catx(' ',drop_names, vname(_c));
end;
end;
if eof then call symputx('drop_names',drop_names);
run;
*---- A hash table much better for managing found items, slight
concerns about array address computations in loop within loop
situation - 20ish seconds;
data _null_;
declare hash hnv();
hnv.defineKey ('varname');
hnv.defineData ('varname');
hnv.defineDone();
do until(eof);
set foo end = eof;
array _n _numeric_;
array _c _character_;
length varname $32;
do _n_ = 1 to dim(_n);
if _n(_n_) = . then do;
varname = vname (_n(_n_));
if hnv.find() ne 0 then
hnv.add();
end;
end;
do _n_ = 1 to dim(_c);
if _c(_n_) = . then do;
varname = vname (_c(_n_));
if hnv.find() ne 0 then
hnv.add();
end;
end;
end;
if eof then hnv.output(dataset:'varswithmissing');
run;
*---- Stay with hash table, unroll inner loop by way of macro code -
15ish seconds, small tweaks could perhaps improve performance another
10% ;
%macro numvars (data=);
%local dsid i varname;
%let dsid = %sysfunc(OPEN(&data));
%do i = 1 %to %sysfunc(ATTRN(&dsid,NVARS));
%if (%sysfunc(VARTYPE(&dsid,&i)) = N) %then
%sysfunc(VARNAME(&dsid,&i))
;
%end;
%let dsid = %sysfunc(CLOSE(&dsid));
%mend;
%macro charvars (data=);
%local dsid i varname;
%let dsid = %sysfunc(OPEN(&data));
%do i = 1 %to %sysfunc(ATTRN(&dsid,NVARS));
%if (%sysfunc(VARTYPE(&dsid,&i)) = C) %then
%sysfunc(VARNAME(&dsid,&i))
;
%end;
%let dsid = %sysfunc(CLOSE(&dsid));
%mend;
%macro unrolled_nullcheck(data=);
%local nvars cvars i vname;
%let nvars = %numvars(data=&data);
%let cvars = %charvars(data=&data);
data _null_;
declare hash hnv();
hnv.defineKey ('varname');
hnv.defineData ('varname');
hnv.defineDone();
length varname $32;
do until(eof);
set foo end = eof;
%let i = 1;
%do %while (%length(%scan(&nvars,&i)));
%let vname = %scan(&nvars,&i);
if &vname = . then do;
varname = "&vname";
if hnv.find() ne 0 then
hnv.add();
end;
%let i = %eval (&i+1);
%end;
%let i = 1;
%do %while (%length(%scan(&cvars,&i)));
%let vname = %scan(&cvars,&i);
if &vname = '' then do;
varname = "&vname";
if hnv.find() ne 0 then
hnv.add();
end;
%let i = %eval (&i+1);
%end;
end;
if eof then hnv.output(dataset:'varswithmissing2');
run;
%mend;
options mprint;
%unrolled_nullcheck(data=foo);
----------
Richard A. DeVenezia
http://www.devenezia.com