Date: Tue, 29 Sep 2009 17:35:17 -0400
Reply-To: "Keintz, H. Mark" <mkeintz@WHARTON.UPENN.EDU>
Sender: "SAS(r) Discussion" <SAS-L@LISTSERV.UGA.EDU>
From: "Keintz, H. Mark" <mkeintz@WHARTON.UPENN.EDU>
Subject: Re: Cross-Summing problem
In-Reply-To: <2fc7f3340909282058x718b6e4bo61900812cf7bcbbd@mail.gmail.com>
Content-Type: text/plain; charset="us-ascii"
Here's a solution that accumulates all the desired company counts customer by customer. For each CID it populates a small hash table of AMT by company. At the end of each CID it adds the counts to a larger (but still very small) hash table of totals amounts.
Note the use of 2 hash iterators applied to the same hash object.
Regards,
Mark
data _null_;
/* Total AMT for each COMPANY, for customers who also bought from COMPANY2 */
dcl hash totals (ordered:'a');
totals.definekey('company2','company');
totals.definedata('company2','company','total_amt');
totals.definedone();
/** AMT, by company, for the current CID **/
dcl hash cid_amt();
cid_amt.definekey('company');
cid_amt.definedata('company','amt');
cid_amt.definedone();
dcl hiter ci1('cid_amt');
dcl hiter ci2('cid_amt');
do until (end_of_data);
** For each CID, retrieve all AMTs, by company, into CID_AMT **;
do until (last.cid);
set have end=end_of_data;
by cid;
cid_amt.add();
end;
do i=ci2.first() by 0 while (i=0); ** For each company in CID_AMT ... **;
company2=company; ** Set it as company2 **;
do j=ci1.first()by 0 while (j=0); ** For this COMPANY2, pass through all COMPANYs **;
total_amt=0;
rt=totals.find(); ** Get current TOTAL_AMT **;
total_amt=sum(total_amt,amt); ** Update TOTAL_AMT **;
rt=totals.replace(); ** .. and put it back into TOTALS **;
j=ci1.next(); ** next company **;
end;
i=ci2.next(); ** next company2 **;
end;
cid_amt.clear();
end; ** until end_of_data **;
totals.output(dataset:'TEMP');
run;
> -----Original Message-----
> From: SAS(r) Discussion [mailto:SAS-L@LISTSERV.UGA.EDU] On Behalf Of
> Muthia Kachirayan
> Sent: Monday, September 28, 2009 11:58 PM
> To: SAS-L@LISTSERV.UGA.EDU
> Subject: Re: Cross-Summing problem
>
> For those who happen to read this thread may find it easy to follow the
> sequence in one place.
>
> Paul Phan posted the following on Thu, Sep 24, 2009 at 4:49 AM:
>
> HAVE:
>
> Row(R) CID COMPANY Amt
> 1 111 A 4
> 2 111 D 2
> 3 111 G 3
> 4 111 X 4
> 5 112 A 6
> 6 112 D 2
> 7 112 G 8
> 8 113 G 4
> 9 113 X 2
> 10 114 A 5
> 11 115 X 5
>
> WANT:
> A D G X
> A 15 4 11 4
> D 10 4 11 4
> G 10 4 15 6
> X 4 2 7 11
>
> Ya Haung responded with the Proc SQL solution:
>
> proc sql;
> create table xx1 as
> select distinct a.company as row,b.company as col,sum(b.amt) as tot
> from xx a, xx b
> where a.cid=b.cid
> group by a.company,b.company
> order by a.company,b.company
> ;
> proc transpose data=xx1 out=xx2 (drop=_name_);
> by row;
> id col;
> var tot;
> run;
>
> Paul Dorfman improved the speed of the Proc SQL with code:
>
> proc sql ;
> create table pre_sum as
> select company
> , cid
> , sum (amt) as pre_sum
> , count (amt) as pre_cnt
> from test
> group company
> , cid
> ;
> create table totals as
> select a.company as row
> , b.company as col
> , sum (b.pre_sum) as sum
> , count (b.pre_cnt) as cnt
> from pre_sum a, pre_sum b
> where a.cid = b.cid
> group a.company
> , b.company
> order a.company
> , b.company
> ;
> quit ;
> proc transpose data = totals out=sum_tran (drop=_name_) ;
> by row ;
> id col ;
> var sum ;
> run ;
> proc transpose data = totals out=cnt_tran (drop=_name_) ;
> by row ;
> id col ;
> var cnt ;
> run ;
>
> Wei Wang posted a hash solution with :
>
> *assuming data is not aggregated at the customer and company level;
> data pre_sum_hash (rename = (company = row) drop = amt)
> pre_sum_hash2 (rename = (company = col) drop = amt);
> **********************************************************
> Define variables
> **********************************************************;
> if 0 then set xx;
> **********************************************************
> Define Hash
> **********************************************************;
> dcl hash h(hashexp:4, ordered: 'a');
> h.defineKey('CID', 'COMPANY');
> h.defineData('CID', 'COMPANY', 'presum', 'precnt');
> h.defineDone();
> **********************************************************
> Calculate Sum and count
> **********************************************************;
> do while(not done);
> set xx end=done;
> by cid company;
> if h.find() ne 0 then do;
> presum = 0;
> precnt = 0;
> end;
> presum + amt;
> precnt + 1;
> if last.company then output;
> end;
> **********************************************************
> Output
> **********************************************************;
> stop;
> run;
>
> *get the cartesian join in data step;
> *it might be possible to conbine the next two steps into one;
> *for readability, I keep them seperated;
> data _null_;
> if 0 then set pre_sum_hash2;
> **********************************************************
> Define Hashes
> **********************************************************;
> dcl hash h (dataset: 'pre_sum_hash2', hashexp:4, ordered: 'Y');
> h.defineKey('CID', 'col');
> h.defineDone();
> dcl hash hout (ordered: 'a');
> hout.defineKey('row', 'col', 'n');
> hout.definedata('row', 'col', 'presum', 'precnt');
> hout.defineDone();
> do while(not done);
> set pre_sum_hash end=done;
> HoldCID = CID;
> declare hiter ih("h");
> rc=ih.first();
> do while(rc=0);
> If CID=HoldCID then do;
> hout.add();
> n+1;
> end;
> rc=ih.next();
> end;
> end;
> hout.output(dataset : 'CartesianJoin');
> stop;
> drop holdcid rc;
> run;
>
> data hash_totals;
> set CartesianJoin;
> by row col;
> if first.col then do;
> sum = 0;
> cnt = 0;
> end;
> sum + presum;
> cnt + precnt;
> if last.col then output;
> drop presum precnt;
> run;
>
> Paul Phan replied with:
>
> "Thank you Paul and Wei for your help. I was able to run Paul code but
> the
> data hash solution from Wei generate this error message
> ERROR: Hash object added 2228208 items when memory failure occurred.
> FATAL: Insufficient memory to execute data step program. Aborted during
> the EXECUTION phase."
>
> This shows that the hash object could not find sufficient memory to
> hold
> data.
>
> Another version of hash is given below which will provide an output
> similar
> to the PROC SQL code given by Paul. Further, using Proc Tanspose the
> output
> table can be processed to give square matrices of SUMS as well as
> COUNTS or
> using other datastep.
>
> Paul Phan may check this code with the original data. My guess is that
> when
> Paul's Proc SQL has done the job, this will do the job in a slightly
> lesser
> time.
>
>
>
> data test (drop = Row);
> input Row CID COMPANY $ Amt ;
> cards ;
> 1 111 A 4
> 2 111 D 2
> 3 111 G 3
> 4 111 X 4
> 5 112 A 6
> 6 112 D 2
> 7 112 G 8
> 8 113 G 4
> 9 113 X 2
> 10 114 A 5
> 11 115 X 5
> ;
> run ;
>
> The following is oneway to get the unique number of Companies. One may
> use
> datastep or Proc sql for this.
>
> data _null_;
> if _n_ = 1 then do;
> declare hash h();
> h.definekey('company');
> h.definedone();
> end;
> do until(eof);
> set test (keep = company) end = eof;
> if h.find() ne 0 then h.add();
> end;
> call symputx('num_companies',h.num_items);
> run;
> %put &num_companies;
>
> The fullstimer option may be chosen to see the memory usage.
>
> The dataset, TEST, must be presorted by CID. Within each CID, Company
> and
> Amt are stored into temporary arrays, TR[ ] and TAMT[ ]. Array TR[ ] is
> used
> to make the cartesian products and simultaneously TOT_AMT and COUNT are
> accumulated and saved into the hash table. Note the use of NROWS which
> helps
> to know how many elements are used for the current CID to place into
> the
> array, TR[ ] saving us from reinitializing it for every CID.
>
>
> *options fullstimer;
> data _null_;
> if _n_ = 1 then do;
> if 0 then set test;
> declare hash h(ordered:'a');
> h.definekey('row','col');
> h.definedata('row','col','tot_amt','count');
> h.definedone();
> end;
> do until(eof);
> nrows = 0;
> do until(last.CID);
> set test (keep = cid company amt rename = (company = row)) end
> = eof
> ;
> by CID;
> array tr[&num_companies.]$ _temporary_;
> array tamt[&num_companies.] _temporary_;
> nrows + 1;
> tr[nrows] = row;
> tamt[nrows] = amt;
> end;
>
> do j = 1 to nrows;
> row = tr[j];
> do k = 1 to nrows;
> col = tr[k];
> if h.find() ne 0 then do;
> tot_amt = 0;
> count = 0;
> end;
> tot_amt + tamt[k];
> count + 1;
> h.replace();
> end;
> end;
> end;
> h.output(dataset:'want');
> stop;
> run;
>
> Muthia Kachirayan
|