std::ostream& operator<<(std::ostream& o, const tensor_ranges& r) {
  // print each range as an interval [0..n]
  for (size_type i = 0; i < r.size(); ++i)
    o << "[0.." << r[i] << "]";
  return o;
}
/* iterates simultaneously over several tensors which share (some of) their
   indexes: for every shared index it keeps, per tensor, the stride to apply
   when that index is incremented */
template<typename IT> class basic_multi_iterator {
  // ...
  tensor_strides strides;
  index_set ilst2idxnums;
  std::vector<const tensor_strides*> slst;
  // ...
  const tensor_ranges& getcnt() const { return cnt; }
  basic_multi_iterator() {
    // ...
    ilst2idxnums.reserve(64); iter.reserve(4);
  }
  const tensor_ranges& all_ranges() const { return ranges; }
  const index_set& all_indexes() const { return idxnums; }
  /* note: the stride arrays have one more element than idxs and r
     (they come from tensor_masks) */
  void insert(const index_set& idxs, const tensor_ranges& r, const tensor_strides& s, IT it_) {
    assert(idxs.size() == r.size()); assert(s.size() == r.size()+1);
    // ...
    /* merge the index list of this tensor into the global index list,
       checking that shared indexes have matching ranges */
    for (unsigned int i=0; i < idxs.size(); ++i) {
      index_set::const_iterator f=std::find(idxnums.begin(), idxnums.end(), idxs[i]);
      if (f == idxnums.end()) {
        ilst2idxnums.push_back(dim_type(idxnums.size()));
        idxnums.push_back(idxs[i]);
        ranges.push_back(r[i]);
      } else {
        ilst2idxnums.push_back(dim_type(f-idxnums.begin()));
        assert(ranges[ilst2idxnums.back()] == r[i]);
      }
    }
    // ...
    /* build the stride table, strides[idx*N + tensor], and reset the counters */
    strides.assign(N*idxnums.size(),0);
    for (unsigned int i=0; i < N; ++i) {
      for (unsigned int j=0; j < slst[i]->size()-1; ++j) {
        stride_type s = (*slst[i])[j];
        strides[ilst2idxnums[c]*N + i] = s;   /* c runs over the concatenated index lists */
        // ...
      }
    }
    // ...
    for (unsigned int i=0; i < idxnums.size(); ++i) {
      for (unsigned int j=0; j < N; ++j) {
        // ...
      }
    }
    cnt.assign(idxnums.size(),0);
  /* advance one step: increment the counter of the current index and update
     the iterators of tensors b..NN-1 by the corresponding strides; returns
     false once every position has been visited */
  bool next(unsigned int b) { return next(b,N); }
  bool next(unsigned int b, unsigned int NN) {
    // ...
      if (++cnt[i0] < ranges[i0]) {
        for (unsigned int i=b; i < NN; ++i)
          iter[i] += strides[i0*NN+i];
        // ...
      } else {
        /* overflow of this counter: rewind the iterators and move on */
        for (unsigned int i=b; i < NN; ++i)
          iter[i] -= strides[i0*NN+i]*(ranges[i0]-1);
        // ...
      }
    // ...
  }
  template<unsigned b, unsigned NN> bool qnext() { return next(b,NN); }
  IT it(unsigned int b) const { return iter[b]; }
};
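/* Illustrative sketch (not part of the library): the same "odometer" advance,
   written with plain std::vector types so it compiles on its own. The function
   and parameter names below are invented for the example; only the technique
   (one shared counter per index, one precomputed stride per (index, tensor)
   pair) reflects the class above. */
#include <cstddef>
#include <vector>

// Advance the multi-index 'cnt' (with ranges 'rng') by one position and update
// one linear offset per tensor using strides[dim][tensor]. Returns false once
// every position has been visited.
inline bool odometer_next(std::vector<std::size_t>& cnt,
                          const std::vector<std::size_t>& rng,
                          const std::vector<std::vector<long> >& strides,
                          std::vector<long>& offsets) {
  for (std::size_t d = cnt.size(); d-- > 0; ) {
    if (++cnt[d] < rng[d]) {
      for (std::size_t t = 0; t < offsets.size(); ++t)
        offsets[t] += strides[d][t];                 // step forward along dimension d
      return true;
    }
    cnt[d] = 0;                                      // overflow: rewind dimension d
    for (std::size_t t = 0; t < offsets.size(); ++t)
      offsets[t] -= strides[d][t] * long(rng[d] - 1);
  }
  return false;
}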
tensor_mask::tensor_mask(const tensor_mask& tm1, const tensor_mask& tm2, bool and_op) {
  assign(tm1,tm2,and_op); unset_card();
}
/* straightforward version: scan the whole cartesian product of the ranges
   (an optimized variant based on basic_multi_iterator follows) */
void tensor_mask::assign(const tensor_mask& tm1, const tensor_mask& tm2, bool and_op) {
  clear(); unset_card();
  if (tm1.ndim()==0) { assign(tm2); return; }
  if (tm2.ndim()==0) { assign(tm1); return; }
  // ...
  idxs = tm1.indexes();
  // ...
  /* build the global ranges, indexed by tensor index number */
  tensor_ranges global_r(std::max(tm1.max_dim(),tm2.max_dim())+1, index_type(1));
  // ...
  for (index_type i = 0; i < tm1.indexes().size(); ++i)
    global_r[tm1.indexes()[i]] = tm1.ranges()[i];
  // ...
  /* add the indexes of tm2 which do not already appear in tm1 */
  for (index_type i = 0; i < tm2.indexes().size(); ++i) {
    index_set::const_iterator f=std::find(tm1.indexes().begin(), tm1.indexes().end(), tm2.indexes()[i]);
    if (f == tm1.indexes().end()) {
      assert(global_r[tm2.indexes()[i]]==1);
      global_r[tm2.indexes()[i]] = tm2.ranges()[i];
      r.push_back(tm2.ranges()[i]);
      idxs.push_back(tm2.indexes()[i]);
    }
    else assert(global_r[tm2.indexes()[i]] == tm2.ranges()[i]);
  }
  // ...
  m.assign(size(), false);
  for (tensor_ranges_loop l(global_r); !l.finished(); l.next()) {
    if (and_op) {
      if (tm1(l.cnt) && tm2(l.cnt)) { /* ... mark this point ... */ }
    } else {
      if (tm1(l.cnt) || tm2(l.cnt)) { /* ... mark this point ... */ }
    }
  }
  // ...
}
/* optimized version, using basic_multi_iterator instead of a scan of the
   whole cartesian product */
void tensor_mask::assign(const tensor_mask& tm1, const tensor_mask& tm2, bool and_op) {
  clear(); unset_card();
  // ...
  if (tm1.ndim()==0) { assign(tm2); return; }
  if (tm2.ndim()==0) { assign(tm1); return; }
  // ...
  /* same index sets, ranges and strides: combine the bit vectors directly */
  if (tm1.indexes() == tm2.indexes() &&
      tm1.ranges() == tm2.ranges() &&
      tm1.strides() == tm2.strides()) {
    r = tm1.ranges(); idxs=tm1.indexes(); s=tm1.strides();
    assert(tm1.m.size() == tm2.m.size());
    // ...
    if (and_op) {
      for (index_type i=0; i < tm2.m.size(); ++i)
        { /* ... m[i] = tm1.m[i] AND tm2.m[i] ... */ }
    } else {
      for (index_type i=0; i < tm2.m.size(); ++i)
        { /* ... m[i] = tm1.m[i] OR tm2.m[i] ... */ }
    }
  } else {
    /* general case: iterate simultaneously over tm1, tm2 and the result */
    basic_multi_iterator<unsigned> bmit;
    bmit.insert(tm1.indexes(), tm1.ranges(), tm1.strides(), 0);
    bmit.insert(tm2.indexes(), tm2.ranges(), tm2.strides(), 0);
    r = bmit.all_ranges(); idxs = bmit.all_indexes(); eval_strides();
    // ...
    m.assign(size(), false);
    bmit.insert(indexes(), ranges(), strides(), 0);
    // ...
    if (and_op) {
      do {
        if (tm1.m[bmit.it(0)]) {
          do {
            if (tm2.m[bmit.it(1)])
              { /* ... set the bit of the result mask ... */ }
            // ...
          } while (bmit.qnext<1,3>());
        }
        // ...
      } while (bmit.qnext<0,3>());
    } else {
      do {
        bool v1 = tm1.m[bmit.it(0)];
        do {
          if (v1 || tm2.m[bmit.it(1)])
            { /* ... set the bit of the result mask ... */ }
          // ...
        } while (bmit.qnext<1,3>());
        // ...
      } while (bmit.qnext<0,3>());
    }
  }
  // ...
}
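/* Illustrative sketch (not from the library): combining two boolean masks that
   live on different subsets of a global index space, with AND or OR, by a full
   scan of the global cartesian product. The type and function names are
   invented for the example; the optimized code above avoids this full scan by
   walking the three masks simultaneously with basic_multi_iterator. */
#include <cstddef>
#include <vector>

// A toy mask: the dimension ids it covers, their ranges, and a dense boolean
// table stored in row-major order over those dimensions.
struct toy_mask {
  std::vector<std::size_t> dims, ranges;
  std::vector<bool> on;
  bool at(const std::vector<std::size_t>& global_pt) const {
    std::size_t p = 0;
    for (std::size_t k = 0; k < dims.size(); ++k)
      p = p * ranges[k] + global_pt[dims[k]];
    return on[p];
  }
};

// Result mask over the full global index space (row-major over global_ranges).
inline std::vector<bool> combine_masks(const toy_mask& a, const toy_mask& b,
                                       const std::vector<std::size_t>& global_ranges,
                                       bool and_op) {
  std::size_t total = 1;
  for (std::size_t k = 0; k < global_ranges.size(); ++k) total *= global_ranges[k];
  std::vector<bool> out(total);
  std::vector<std::size_t> pt(global_ranges.size(), 0);
  for (std::size_t lin = 0; lin < total; ++lin) {
    out[lin] = and_op ? (a.at(pt) && b.at(pt)) : (a.at(pt) || b.at(pt));
    for (std::size_t d = global_ranges.size(); d-- > 0; ) {  // advance pt (last dim fastest)
      if (++pt[d] < global_ranges[d]) break;
      pt[d] = 0;
    }
  }
  return out;
}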
tensor_mask::tensor_mask(const std::vector<const tensor_mask*>& tm) {
  // ...
}

/* build a mask as the (tensor) product of several masks defined on disjoint
   sets of indexes (straightforward version; an optimized variant based on
   basic_multi_iterator follows) */
void tensor_mask::assign(const std::vector<const tensor_mask*> & tm) {
  index_set mask_start; unset_card();
  // ...
  r.reserve(255); idxs.reserve(255); mask_start.reserve(255);
  for (dim_type i=0; i < tm.size(); ++i) {
    mask_start.push_back(r.size());
    for (dim_type j=0; j < tm[i]->ndim(); ++j) {
      r.push_back(tm[i]->ranges()[j]);
      idxs.push_back(tm[i]->indexes()[j]);
    }
  }
  // ...
  eval_strides(); assert(size());
  m.assign(size(), false);
  for (tensor_ranges_loop l(r); !l.finished(); l.next()) {
    bool is_in = true;
    for (dim_type i=0; is_in && i < tm.size(); ++i) {
      /* position of the current point inside mask tm[i] */
      index_type s_ = 0;
      for (dim_type j=0; j < tm[i]->ndim(); ++j)
        s_ += l.cnt[j+mask_start[i]]*tm[i]->strides()[j];
      if (!tm[i]->m[s_]) is_in = false;
    }
    if (is_in) m.add(lpos(l.cnt));
  }
}
/* optimized version */
void tensor_mask::assign(const std::vector<const tensor_mask*> & tm) {
  // ...
  if (tm.size() == 0) { clear(); return; }
  if (tm.size() == 1) { assign(*tm[0]); return; }
  // ...
  basic_multi_iterator<unsigned> bmit;
  for (dim_type i=0; i < tm.size(); ++i)
    bmit.insert(tm[i]->indexes(), tm[i]->ranges(), tm[i]->strides(), 0);
  r = bmit.all_ranges(); idxs = bmit.all_indexes(); eval_strides();
  // ...
  m.assign(size(), false);
  bmit.insert(indexes(), ranges(), strides(), 0);
  // ...
  /* a point of the result is set iff it belongs to every mask tm[b] */
  unsigned N = unsigned(tm.size());
  bool finished = false;
  while (!finished) {
    bool is_in = true;
    unsigned int b;
    for (b=0; b < N; ++b) {
      if (!tm[b]->m[bmit.it(b)]) {
        is_in = false; break;
      }
    }
    if (is_in) { m[bmit.it(N)] = 1; b = N-1; }
    while (!finished && !bmit.next(b)) { if (b == 0) finished = true; b--; }
  }
}
/* expand the packed strides (one value per point of the mask) into a vector
   indexed by the global (unmasked) position */
void tensor_mask::unpack_strides(const tensor_strides& packed, tensor_strides& unpacked) const {
  assert(packed.size() == card());
  unpacked.assign(size(),INT_MIN);
  index_type i = 0;
  for (tensor_ranges_loop l(r); !l.finished(); l.next()) {
    if (m[lpos(l.cnt)]) unpacked[lpos(l.cnt)] = packed[i++];
  }
}
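/* Illustrative sketch (not from the library): the same packed/unpacked stride
   layout with plain standard containers. The function name is invented; only
   the idea (inactive slots keep a sentinel, active slots receive the packed
   values in order) mirrors unpack_strides above. */
#include <climits>
#include <cstddef>
#include <vector>

inline std::vector<int> unpack(const std::vector<bool>& mask,
                               const std::vector<int>& packed) {
  std::vector<int> unpacked(mask.size(), INT_MIN);  // sentinel for masked-out slots
  std::size_t k = 0;
  for (std::size_t i = 0; i < mask.size(); ++i)
    if (mask[i]) unpacked[i] = packed[k++];         // one packed value per active slot
  return unpacked;
}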
void tensor_mask::check_assertions() const {
  GMM_ASSERT3(r.size() == idxs.size(), "");
  GMM_ASSERT3(s.size() == idxs.size()+1, "");
  GMM_ASSERT3(m.size() == size(), "");
  // ...
  for (dim_type i=0; i < idxs.size(); ++i) {
    GMM_ASSERT3(!bv.is_in(i), "");
    // ...
  }
}
tensor_mask::tensor_mask(const std::vector<const tensor_mask*> tm1,
                         const std::vector<const tensor_mask*> tm2, bool and_op) {
  assign(tensor_mask(tm1), tensor_mask(tm2), and_op);
}
/* restrict a tensor_ref to the sub-shape 'sub': rebuild the masks and strides
   of *this so that they still address the data of 'tr' */
void tensor_ref::set_sub_tensor(const tensor_ref& tr, const tensor_shape& sub) {
  // ...
  strides_.resize(masks().size());
  for (dim_type i = 0; i < strides_.size(); ++i)
    strides_[i].resize(mask(i).card());
  pbase_ = tr.pbase_; base_shift_ = tr.base_shift();
  // ...
  /* unpack the strides of tr so that they can be addressed by global position */
  std::vector<tensor_strides> trstrides_unpacked(tr.masks().size());
  for (dim_type im = 0; im < tr.masks().size(); ++im) {
    tr.mask(im).check_assertions();
    tr.mask(im).unpack_strides(tr.strides()[im], trstrides_unpacked[im]);
  }
  // ...
  /* for each mask of the new shape, accumulate the strides of the masks of tr
     which share an index with it */
  for (dim_type im = 0; im < masks().size(); ++im) {
    const tensor_mask& m = masks()[im];
    // ...
    /* list of the masks of tr which intersect m */
    std::vector<dim_type> trmasks; trmasks.reserve(tr.masks().size());
    for (dim_type i=0; i < m.indexes().size(); ++i) {
      if (tr.index_is_valid(m.indexes()[i])) {
        dim_type im2 = tr.index_to_mask_num(m.indexes()[i]);
        if (std::find(trmasks.begin(), trmasks.end(), im2)==trmasks.end())
          trmasks.push_back(im2);
      }
    }
    // ...
    tensor_ranges gcnt(tr.ndim(),0);
    stride_type stcnt = 0;
    for (tensor_ranges_loop l(m.ranges()); !l.finished(); l.next()) {
      /* copy the current position into the global counter gcnt */
      for (dim_type i=0; i < m.ranges().size(); ++i)
        gcnt[m.indexes()[i]] = l.index(i);
      // ...
      /* in_m / in_trm: does the current point belong to the new mask / to all
         the intersecting masks of tr ? */
      stride_type tr_s = 0;
      // ...
      for (dim_type i=0; i < trmasks.size(); ++i) {
        const tensor_mask &mm = tr.mask(trmasks[i]);
        // ...
        if (mm(gcnt)) {
          tr_s += trstrides_unpacked[trmasks[i]][mm.pos(gcnt)];
          assert(trstrides_unpacked[trmasks[i]][mm.pos(gcnt)]>=0);
        } else { in_trm = false; break; }
      }
      // ...
      if (in_m) assert(in_trm);
      if (!in_trm) assert(!in_m);
      // ...
        strides_[im][stcnt++] = tr_s;
      // ...
    }
    // ...
    assert(stcnt == stride_type(m.card()));
  }
}
/* slice: fix the value of one index (slice.dim == slice.i0) and remove that
   dimension from the tensor */
tensor_ref::tensor_ref(const tensor_ref& tr, tensor_mask::Slice slice) {
  set_sub_tensor(tr, tr.slice_shape(slice));
  // ...
  /* rebuild the mask which contained the sliced dimension */
  const tensor_mask& m1(index_to_mask(slice.dim));
  dim_type mdim = index_to_mask_dim(slice.dim);
  // ...
  tensor_ranges r(m1.ranges()); r.erase(r.begin()+mdim);
  index_set idx(m1.indexes()); idx.erase(idx.begin()+mdim);
  tensor_mask m2(r,idx);
  index_type pos1 = 0, pos2 = 0;
  for (tensor_ranges_loop l(m1.ranges()); !l.finished(); l.next()) {
    if (l.index(mdim) == slice.i0)
      m2.set_mask_val(pos2++, m1(pos1));
    else
      assert(m1(pos1) == 0);
    pos1++;
  }
  // ...
  /* either the reduced mask replaces the original one ... */
  assert(index_to_mask_num(slice.dim) < masks().size());
  masks()[index_to_mask_num(slice.dim)] = m2;
  // ...
  /* ... or, when no dimension is left, the mask is removed altogether */
  remove_mask(index_to_mask_num(slice.dim));
  // ...
  /* shift the numbers of the indexes above the sliced one */
  shift_dim_num_ge(slice.dim,-1);
  set_ndim_noclean(dim_type(ndim()-1));
  // ...
}
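/* Illustrative sketch (not from the library): slicing a dimension out of a
   plain dense strided view. The struct and function names are invented; the
   masked case above additionally has to rebuild the tensor_mask of the sliced
   index, but the underlying idea is the same. */
#include <cstddef>
#include <vector>

// A dense strided view: data pointer plus one (range, stride) pair per dimension.
struct dense_view {
  const double* base;
  std::vector<std::size_t> ranges;
  std::vector<long> strides;
};

// Fix dimension 'dim' at index 'i0': shift the base pointer and drop the dimension.
inline dense_view slice(dense_view v, std::size_t dim, std::size_t i0) {
  v.base += v.strides[dim] * long(i0);
  v.ranges.erase(v.ranges.begin() + dim);
  v.strides.erase(v.strides.begin() + dim);
  return v;
}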
/* ordering of the packed ranges used by multi_tensor_iterator: sort first by
   the rank of the first tensor which uses the index, then by decreasing mean
   stride increment */
struct compare_packed_range {
  const std::vector<packed_range_info>& pri;
  std::vector<stride_type> mean_inc;
  compare_packed_range(const std::vector<packed_range_info>& pri_) : pri(pri_) {}
  bool operator()(dim_type a, dim_type b) {
    if (pri[a].n < pri[b].n) return true;
    else if (pri[a].n > pri[b].n) return false;
    else { /* same n: compare the mean increments */
      if (pri[a].mean_increm > pri[b].mean_increm) return true;
    }
    return false;
  }
};
void multi_tensor_iterator::init(std::vector<tensor_ref> trtab, bool with_index_values) {
  N = index_type(trtab.size());
  index_type N_packed_idx = 0;
  // ...
  /* build the intersection of the shapes of all tensors */
  tensor_shape ts(trtab[0]);
  for (dim_type i=1; i < trtab.size(); ++i)
    { /* ... merge trtab[i] into ts ... */ }
  // ...
  /* apply the common shape to every tensor */
  for (dim_type i = 0; i < N; ++i) {
    tensor_ref tmp = trtab[i];
    trtab[i].set_sub_tensor(tmp,ts);
  }
  // ...
  /* find the "packed" indexes, i.e. the masks whose cardinality is not 1 */
  dal::bit_vector packed_idx; packed_idx.sup(0,ts.ndim());
  for (index_type mi=0; mi < ts.masks().size(); ++mi) {
    if (ts.masks()[mi].card() != 1) {
      packed_idx.add(mi); N_packed_idx++;
    } else {
      /* masks of cardinality 1 do not move any iterator: their strides must be
         empty or null */
      for (dim_type j=0; j < N; ++j) {
        if (trtab[j].strides()[mi].size() != 0) {
          assert(trtab[j].strides()[mi].size() == 1);
          assert(trtab[j].strides()[mi][0] == 0);
        }
      }
    }
  }
  pr.resize(N_packed_idx);
  pri.resize(N_packed_idx);
  /* fill the packed_range_info of each packed index */
  index_type pmi = 0;
  for (dim_type mi = 0; mi < ts.masks().size(); ++mi) {
    if (packed_idx[mi]) {
      index_type n;
      pri[pmi].original_masknum = mi;
      pri[pmi].range = ts.masks()[mi].card();
      /* n: rank of the first tensor which uses this index */
      for (n = 0; n < N; n++)
        if (trtab[n].index_is_valid(mi)) break;
      pri[pmi].n = dim_type(n);
      // ...
      pmi++;
    }
  }
  // ...
  /* sort the packed indexes by their rank n */
  std::sort(pri.begin(), pri.end());
  // ...
  /* bloc_rank[i]: end of the block of packed indexes whose first tensor is < i;
     bloc_nelt[i]: number of packed indexes whose first tensor is i-1 */
  bloc_rank.resize(N+1); bloc_rank[0] = 0;
  bloc_nelt.resize(N+1); bloc_nelt[0] = 0;
  for (index_type i=1; i <= N; ++i) {
    bloc_rank[i] = bloc_rank[i-1];
    while (bloc_rank[i] < pri.size() && pri[bloc_rank[i]].n == i-1) bloc_rank[i]++;
    bloc_nelt[i] = bloc_rank[i] - bloc_rank[i-1];
  }
  /* precompute the iterator increments associated with each packed index */
  for (pmi = 0; pmi < pri.size(); ++pmi) {
    index_type mi = pri[pmi].original_masknum;
    pri[pmi].mean_increm = 0;
    pri[pmi].inc.assign(pri[pmi].range*(N-pri[pmi].n), 0);
    pri[pmi].have_regular_strides.reset();
    pri[pmi].have_regular_strides = std::bitset<32>((1 << N)-1);
    for (dim_type n=pri[pmi].n; n < N; ++n) {
      index_type pos0 = (n - pri[pmi].n);
      for (index_type i = 0; i < pri[pmi].range; ++i) {
        index_type pos = i * (N-pri[pmi].n) + pos0;
        if (i != pri[pmi].range-1) {
          /* increment from position i to position i+1 */
          stride_type increm = trtab[n].strides()[mi][i+1] - trtab[n].strides()[mi][i];
          pri[pmi].inc[pos] = increm;
          if (pri[pmi].inc[pos] != pri[pmi].inc[pos0])
            pri[pmi].have_regular_strides[n] = false;
          pri[pmi].mean_increm += increm;
        } else {
          /* last position: negative increment bringing the iterator back */
          pri[pmi].inc[pos] = -trtab[n].strides()[mi][i];
        }
      }
    }
    /* average increment, used to order the loops */
    pri[pmi].mean_increm /= (N-pri[pmi].n)*(pri[pmi].range-1);
  }
  /* reorder the packed ranges according to compare_packed_range */
  index_set pr_reorder(pri.size());
  for (pmi = 0; pmi < pri.size(); ++pmi) {
    pr_reorder[pmi]=dim_type(pmi);
  }
  std::sort(pr_reorder.begin(), pr_reorder.end(), compare_packed_range(pri));
  {
    std::vector<packed_range> tmppr(pr);
    std::vector<packed_range_info> tmppri(pri);
    for (dim_type i =0; i < pri.size(); ++i) {
      pr[i]  = tmppr [pr_reorder[i]];
      pri[i] = tmppri[pr_reorder[i]];
    }
  }
  /* setup of the index-value bookkeeping (only when requested, since it slows
     the iteration down) */
  if (with_index_values) {
    idxval.resize(ts.ndim());
    // ...
    for (dim_type i=0; i < ts.ndim(); ++i) {
      idxval[i].ppinc = NULL;
      idxval[i].pposbase = NULL;
      // ...
    }
    // ...
    for (index_type mi = 0; mi < ts.masks().size(); ++mi) {
      tensor_strides v;
      /* pmi: rank of this mask among the packed ones, or -1 if it is not packed */
      pmi = index_type(-1);
      for (dim_type i=0; i < pr.size(); ++i)
        if (pri[i].original_masknum == mi)
          pmi = i;
      // ...
      if (pmi != index_type(-1)) {
        ts.masks()[mi].gen_mask_pos(pri[pmi].mask_pos);
      } else {
        ts.masks()[mi].gen_mask_pos(v);
        // ...
      }
      for (dim_type i=0; i < ts.masks()[mi].indexes().size(); ++i) {
        dim_type ii = ts.masks()[mi].indexes()[i];
        idxval[ii].cnt_num = dim_type(pmi);
        idxval[ii].pos_ = (pmi != index_type(-1)) ? 0 : v[0];
        idxval[ii].mod = ts.masks()[mi].strides()[i+1];
        idxval[ii].div = ts.masks()[mi].strides()[i];
      }
    }
  }
  /* look for a vectorizable innermost loop: a group of trailing packed ranges
     whose strides are regular for every tensor, which can then be merged into
     a single strided loop */
  vectorized_strides_.resize(N); vectorized_size_ = 0;
  std::fill(vectorized_strides_.begin(), vectorized_strides_.end(), 0);
  vectorized_pr_dim = index_type(pri.size());
  for (vectorized_pr_dim = index_type(pri.size()-1); vectorized_pr_dim != index_type(-1); vectorized_pr_dim--) {
    std::vector<packed_range_info>::const_iterator pp = pri.begin() + vectorized_pr_dim;
    if (vectorized_pr_dim == pri.size()-1) {
      if (pp->have_regular_strides.count() == N) vectorized_size_ = pp->range;
      for (dim_type n=pp->n; n < N; ++n) {
        GMM_ASSERT3(n < pp->inc.size(), "");
        vectorized_strides_[n] = pp->inc[n];
      }
    } else {
      if (pp->have_regular_strides.count() != N) break;
      bool still_ok = true;
      for (dim_type n=pp->n; n < N; ++n) {
        if (stride_type(vectorized_strides_[n]*vectorized_size_) != pp->inc[n]) still_ok = false;
      }
      // ...
      vectorized_size_ *= pp->range;
      // ...
    }
  }
  /* record the base iterators of every tensor */
  it.resize(N); pit0.resize(N); itbase.resize(N);
  for (dim_type n=0; n < N; ++n) {
    pit0[n]=trtab[n].pbase();
    itbase[n]=trtab[n].base_shift();
  }
  // ...
}
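/* Illustrative sketch (not from the library): detecting how many trailing
   dimensions of a single strided array can be collapsed into one constant-step
   loop. The function name is invented; it shows, for one tensor, the same idea
   as the vectorized_size_ / vectorized_strides_ detection above. */
#include <cstddef>
#include <vector>

// Returns the number of elements of the longest contiguous-step run formed by
// the trailing dimensions (innermost dimension last in 'ranges'/'strides').
inline std::size_t fusable_length(const std::vector<std::size_t>& ranges,
                                  const std::vector<long>& strides) {
  if (ranges.empty()) return 0;
  std::size_t d = ranges.size() - 1;
  std::size_t len = ranges[d];
  long step = strides[d];
  while (d > 0) {
    --d;
    if (strides[d] != step * long(len)) break;  // not contiguous with the inner run
    len *= ranges[d];
  }
  return len;
}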
void multi_tensor_iterator::print() const {
  cout << "MTI(N=" << N << "): ";
  for (dim_type i=0; i < pr.size(); ++i)
    cout << " pri[" << int(i) << "]: n=" << int(pri[i].n)
         << ", range=" << pri[i].range << ", mean_increm="
         << pri[i].mean_increm << ", regular = " << pri[i].have_regular_strides
         << ", inc=" << vref(pri[i].inc) << "\n";
  cout << "bloc_rank: " << vref(bloc_rank) << ", bloc_nelt: " << vref(bloc_nelt) << "\n";
  cout << "vectorized_size : " << vectorized_size_ << ", strides = " << vref(vectorized_strides_) << ", pr_dim=" << vectorized_pr_dim << "\n";
}
void tensor_reduction::insert(const tensor_ref& tr_, const std::string& s) {
  tensor_shape ts(tr_);
  // ...
  trtab.push_back(tref_or_reduction(tensor_ref(tr_, ts), s));
  // ...
}

void tensor_reduction::insert(const tref_or_reduction& t, const std::string& s) {
  if (!t.is_reduction()) {
    // ...
  }
  trtab.push_back(t); trtab.back().ridx = s;
  // ...
}
/* collect the set of reduction indexes (every character other than ' ');
   an index which appears twice in the same tensor (diagonal access) gets
   renamed to a fresh upper-case letter */
void tensor_reduction::update_reduction_chars() {
  reduction_chars.clear();
  for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
    assert(it->ridx.size() == it->tr().ndim());
    for (unsigned i =0; i < it->ridx.size(); ++i) {
      if (it->ridx[i] != ' ' &&
          reduction_chars.find(it->ridx[i]) == std::string::npos)
        reduction_chars.push_back(it->ridx[i]);
    }
  }
  // ...
  for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
    it->gdim.resize(it->ridx.size());
    for (unsigned i =0; i < it->ridx.size(); ++i) {
      char c = it->ridx[i];
      if (c != ' ' && it->ridx.find(c) != i) {
        /* second occurrence of this index inside the same tensor: pick an
           unused upper-case letter for it */
        for (c = 'A'; c <= 'Z'; ++c)
          if (reduction_chars.find(c) == std::string::npos) break;
        // ...
        reduction_chars.push_back(it->ridx[i]);
      }
    }
  }
}
/* assign to each dimension of each tensor its position in the reduced
   (output) tensor, or dim_type(-1) if that dimension is reduced */
void tensor_reduction::pre_prepare() {
  for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
    assert(it->ridx.size() == it->tr().ndim());
    it->rdim.resize(it->ridx.size());
    // ...
    for (dim_type i =0; i < it->ridx.size(); ++i) {
      if (it->ridx[i] == ' ') {
        /* non-reduced dimension: appended to the output tensor */
        reduced_range.push_back(it->tr().dim(i));
        it->rdim[i] = dim_type(reduced_range.size()-1);
      } else {
        it->rdim[i] = dim_type(-1);
      }
    }
  }
}
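/* Illustrative sketch (not from the library): the reduction-string convention
   used here gives one character per dimension of each tensor, ' ' for a
   dimension kept in the result and any other character for a reduction index
   shared by the tensors that carry it (e.g. " j" for the matrix and "j" for
   the vector of a matrix-vector product). The function name below is invented;
   it only gathers the ranges of the kept dimensions, as pre_prepare() does
   with reduced_range. */
#include <cstddef>
#include <string>
#include <vector>

inline std::vector<std::size_t>
output_ranges(const std::vector<std::vector<std::size_t> >& dims,  // per-tensor dimensions
              const std::vector<std::string>& ridx) {              // per-tensor index strings
  std::vector<std::size_t> out;
  for (std::size_t t = 0; t < dims.size(); ++t)
    for (std::size_t i = 0; i < ridx[t].size(); ++i)
      if (ridx[t][i] == ' ') out.push_back(dims[t][i]);            // kept dimension
  return out;
}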
/* look for a sub-group of the tensors on which a partial reduction can be done
   first; the group giving the smallest intermediate result is returned in
   best_lst (tensor numbers) and best_idxset (reduction indexes) */
size_type tensor_reduction::find_best_sub_reduction(dal::bit_vector &best_lst, std::string &best_idxset) {
  dal::bit_vector lst;
  std::string idxset;
  best_lst.clear(); best_idxset.clear();
  // ...
  update_reduction_chars();
  // ...
  GMM_ASSERT2(trtab.size() <= 32, "wow it was assumed that nobody would "
              "ever need a reduction on more than 32 tensors..");

  /* for each reduction index, the set of tensors in which it appears */
  std::vector<std::bitset<32> > idx_occurences(reduction_chars.size());
  for (unsigned ir=0; ir < reduction_chars.size(); ++ir) {
    char c = reduction_chars[ir];
    for (unsigned tnum=0; tnum < trtab.size(); ++tnum)
      idx_occurences[ir][tnum] = (trtab[tnum].ridx.find(c) != std::string::npos);
  }
  // ...
  for (unsigned ir=0; ir < reduction_chars.size(); ++ir) {
    /* candidate sub-reduction built around index ir ... */
    lst.clear(); lst.add(ir);
    idxset.resize(0); idxset.push_back(reduction_chars[ir]);
    /* ... plus every other index appearing in a subset of the same tensors */
    for (unsigned ir2=0; ir2 < reduction_chars.size(); ++ir2) {
      // ...
      if ((idx_occurences[ir2] | idx_occurences[ir]) == idx_occurences[ir]) {
        lst.add(ir2);
        idxset.push_back(reduction_chars[ir2]);
      }
    }
    // ...
    /* evaluate the size of the intermediate tensor of this candidate */
    for (unsigned tnum=0; tnum < trtab.size(); ++tnum) {
      if (!idx_occurences[ir][tnum]) continue;
      std::bitset<int(32)> once((int)reduction_chars.size());
      for (dim_type i=0; i < trtab[tnum].tr().ndim(); ++i) {
        // ...
        for (dal::bv_visitor j(lst); !j.finished(); ++j) {
          if (trtab[tnum].ridx[i] == reduction_chars[(size_t)j]) {
            if (once[j]) ignore = true;
            // ...
          }
        }
        // ...
        redsz *= trtab[tnum].tr().dim(i);
      }
    }
    // ...
    /* keep the cheapest candidate */
    if (redsz < best_redsz) {
      // ...
      for (unsigned i=0; i < trtab.size(); ++i)
        if (idx_occurences[ir][i]) best_lst.add(i);
      best_idxset = idxset;
      // ...
    }
  }
  // ...
}
/* extract the cheapest sub-reduction found by find_best_sub_reduction and
   replace the tensors involved by a single nested tensor_reduction */
void tensor_reduction::make_sub_reductions() {
  dal::bit_vector bv; std::string red;
  // ...
  find_best_sub_reduction(bv,red);
  if (bv.card() < trtab.size() && red.size()) {
    /* the tensors of bv are moved into a sub-reduction over the indexes of red */
    auto sub = std::make_shared<tensor_reduction>();
    std::vector<dim_type> new_rdim; new_rdim.reserve(16);
    std::string new_reduction;
    for (dal::bv_visitor tnum(bv); !tnum.finished(); ++tnum) {
      tref_or_reduction &t = trtab[tnum];
      std::string re = t.ridx; t.ridx.clear();
      for (unsigned i = 0; i < re.size(); ++i) {
        bool reduced = false;
        // ...
        if (red.find(re[i]) == std::string::npos) c = ' ';
        // ...
        /* indexes which are not part of the sub-reduction stay visible in the
           outer reduction */
        t.ridx.push_back(re[i]);
        new_rdim.push_back(t.rdim[i]);
        new_reduction.push_back(re[i]);
        // ...
      }
      // ...
      sub->insert(trtab[tnum], re);
      // ...
    }
    // ...
    /* the first tensor of bv is replaced by the sub-reduction, the others are
       removed from trtab */
    trtab[bv.first_true()] = tref_or_reduction(sub, new_reduction);
    trtab[bv.first_true()].rdim = new_rdim;
    std::vector<tref_or_reduction> trtab2; trtab2.reserve(trtab.size() - bv.card() + 1);
    for (size_type i=0; i < trtab.size(); ++i)
      if (!bv.is_in(i) || i == bv.first_true())
        trtab2.push_back(trtab[i]);
    // ...
  }
  // ...
}
void tensor_reduction::prepare(const tensor_ref* tr_out) {
  // ...
  make_sub_reductions();
  // ...
  /* create the output tensor */
  if (tr_out == NULL) {
    trres = tensor_ref(reduced_range);
    out_data.resize(trres.card());
    pout_data = &out_data[0];
    trres.set_base(pout_data);
  } else {
    /* check that the supplied output tensor has the expected shape */
    GMM_ASSERT3(tr_out->ndim() == reduced_range.size(), "");
    for (dim_type i=0; i < tr_out->ndim(); ++i)
      GMM_ASSERT3(tr_out->dim(i) == reduced_range[i], "");
    // ...
  }
  /* build the global range: the non-reduced (output) dimensions come first,
     then one dimension per reduction index */
  tensor_ranges global_range;
  std::string global_chars; /* one character per dimension of the global range */
  // ...
  global_range.reserve(16);
  global_range.assign(reduced_range.begin(), reduced_range.end());
  global_chars.insert(size_type(0), reduced_range.size(), ' ');
  for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
    assert(it->rdim.size() == it->tr().ndim());
    // ...
    for (dim_type i=0; i < it->ridx.size(); ++i) {
      if (it->rdim[i] == dim_type(-1)) {
        /* reduced dimension: find or create its slot in the global range */
        assert(it->ridx[i] != ' ');
        size_type p = global_chars.find(it->ridx[i]);
        if (p == std::string::npos) {
          global_chars.push_back(it->ridx[i]);
          global_range.push_back(it->tr().dim(i));
          it->gdim[i] = dim_type(global_range.size() - 1);
        } else {
          GMM_ASSERT1(it->tr().dim(i) == global_range[p],
                      "inconsistent dimensions for reduction index "
                      << it->ridx[i] << "(" << int(it->tr().dim(i))
                      << " != " << int(global_range[p]) << ")");
          it->gdim[i] = dim_type(p);
        }
      }
    }
  }
  /* permute the tensors (and the result) so that they all follow the global
     dimension numbering */
  std::vector<dim_type> reorder(global_chars.size(), dim_type(-1));
  /* the result tensor carries the output dimensions, in their original order */
  for (dim_type i=0; i < reduced_range.size(); ++i) reorder[i] = i;
  trres.permute(reorder);
  std::vector<tensor_ref> tt; tt.reserve(trtab.size()+1);
  // ...
  for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
    std::fill(reorder.begin(), reorder.end(), dim_type(-1));
    for (dim_type i=0; i < it->gdim.size(); ++i) {
      reorder[it->gdim[i]] = i;
    }
    // ...
    it->tr().permute(reorder);
    tt.push_back(it->tr());
    // ...
  }
  // ...
}
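/* Illustrative sketch (not from the library): computing, for one tensor, the
   permutation from global dimension numbers to local ones, given the character
   naming each global dimension and each local dimension. The function name is
   invented and it assumes every dimension has a distinct name; the code above
   does the same job through the gdim vectors instead of characters. */
#include <cstddef>
#include <string>
#include <vector>

inline std::vector<std::size_t>
global_to_local(const std::string& global_chars, const std::string& local_chars) {
  std::vector<std::size_t> reorder(global_chars.size(), std::string::npos);
  for (std::size_t i = 0; i < local_chars.size(); ++i) {
    std::size_t g = global_chars.find(local_chars[i]);
    if (g != std::string::npos) reorder[g] = i;   // global dim g maps to local dim i
  }
  return reorder;                                  // npos: this tensor ignores that dim
}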
/* reduction with a single input tensor: accumulate the values of tensor 1
   into tensor 0 (the result) */
static void do_reduction1(bgeot::multi_tensor_iterator &mti) {
  do {
    // ...
    do {
      // ...
    } while (mti.bnext(1));
    // ...
  } while (mti.bnext(0));
}
/* vectorized part of the reduction of two tensors: when the innermost block
   has regular strides for p(0) and p(1) and a constant p(2), it reduces to a
   strided axpy, done with BLAS daxpy when available */
static bool do_reduction2v(bgeot::multi_tensor_iterator &mti) {
  // ... (n: size of the vectorized block)
  const std::vector<stride_type> &s = mti.vectorized_strides();
  if (n && s[0] && s[1] && s[2] == 0) {
#if defined(GMM_USES_BLAS)
    BLAS_INT nn = n, incx = s[1], incy = s[0];
#else
    dim_type incx = dim_type(s[1]), incy = dim_type(s[0]);
#endif
    // ...
    do {
#if defined(GMM_USES_BLAS)
      gmm::daxpy_(&nn, &mti.p(2), &mti.p(1), &incx, &mti.p(0), &incy);
#else
      /* manual strided axpy: p(0)[i*incy] += p(2) * p(1)[i*incx] */
      scalar_type* itx = &mti.p(1);
      scalar_type* ity = &mti.p(0);
      // ...
#endif
    } while (mti.vnext());
    return true;
  } else return false;
}
/* scalar fallback: p(0) += p(1)*p(2) over the whole iteration */
static void do_reduction2a(bgeot::multi_tensor_iterator &mti) {
  if (!do_reduction2v(mti)) {
    do {
      mti.p(0) += mti.p(1)*mti.p(2);
    } while (mti.bnext(0));
  }
}
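/* Illustrative sketch (not from the library): the non-BLAS branch of the
   vectorized reductions amounts to a strided "axpy". The function name is
   invented and positive increments are assumed; it shows the inner kernel
   that gmm::daxpy_ replaces when BLAS is enabled. */
#include <cstddef>

inline void strided_axpy(std::size_t n, double a,
                         const double* x, std::size_t incx,
                         double* y, std::size_t incy) {
  for (std::size_t i = 0; i < n; ++i)
    y[i * incy] += a * x[i * incx];   // y <- y + a*x over the vectorized block
}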
/* general reduction of two tensors: three nested loops over the
   multi_tensor_iterator */
static void do_reduction2b(bgeot::multi_tensor_iterator &mti) {
  do {
    // ...
    do {
      // ...
      do {
        // ...
      } while (mti.bnext(2));
      // ...
    } while (mti.bnext(1));
    // ...
  } while (mti.bnext(0));
}
/* vectorized part of the reduction of three tensors:
   p(0) += (p(2)*p(3)) * p(1) on the innermost block, with BLAS daxpy when
   available */
static bool do_reduction3v(bgeot::multi_tensor_iterator &mti) {
  // ... (n: size of the vectorized block)
  const std::vector<stride_type> &s = mti.vectorized_strides();
  if (n && s[0] && s[1] && s[2] == 0 && s[3] == 0) {
#if defined(GMM_USES_BLAS)
    BLAS_INT nn = n, incx = s[1], incy = s[0];
#else
    dim_type incx = dim_type(s[1]), incy = dim_type(s[0]);
#endif
    // ...
    do {
      double a = mti.p(2)*mti.p(3);
#if defined(GMM_USES_BLAS)
      gmm::daxpy_(&nn, &a, &mti.p(1), &incx, &mti.p(0), &incy);
#else
      scalar_type* itx = &mti.p(1);
      scalar_type* ity = &mti.p(0);
      // ...
#endif
    } while (mti.vnext());
    return true;
  } else return false;
}
static void do_reduction3a(bgeot::multi_tensor_iterator &mti) {
  if (!do_reduction3v(mti)) {
    do {
      mti.p(0) += mti.p(1)*mti.p(2)*mti.p(3);
    } while (mti.bnext(0));
  }
}
/* general reduction of three tensors: four nested loops */
static void do_reduction3b(bgeot::multi_tensor_iterator &mti) {
  do {
    // ...
    do {
      // ...
      do {
        // ...
        do {
          // ...
        } while (mti.bnext(3));
        // ...
      } while (mti.bnext(2));
      // ...
    } while (mti.bnext(1));
    // ...
  } while (mti.bnext(0));
}
void tensor_reduction::do_reduction() {
  /* reset the output data and evaluate the nested sub-reductions first */
  // ...
  if (out_data.size()) memset(&out_data[0], 0, out_data.size()*sizeof(out_data[0]));
  // ...
  for (unsigned i=0; i < trtab.size(); ++i) {
    if (trtab[i].is_reduction()) {
      trtab[i].reduction->do_reduction();
      trtab[i].reduction->result(trtab[i].tr());
      // ...
    }
  }
  // ...
  /* dispatch on the number of tensors involved */
  dim_type N = dim_type(trtab.size());
  // ... (the N == 1 case calls do_reduction1)
  } else if (N == 2) {
    if (!mti.bnext_useful(2) && !mti.bnext_useful(1)) {
      do_reduction2a(mti);
    } else {
      do_reduction2b(mti);
    }
  } else if (N == 3) {
    if (!mti.bnext_useful(1) && (!mti.bnext_useful(2)) && !mti.bnext_useful(3)) {
      do_reduction3a(mti);
    } else {
      do_reduction3b(mti);
    }
  } else if (N == 4) {
    /* generic four-tensor case: five nested loops over the multi-iterator */
    do {
      // ...
      do {
        // ...
        do {
          // ...
          do {
            // ...
            do {
              // ...
            } while (mti.bnext(4));
            // ...
          } while (mti.bnext(3));
          // ...
        } while (mti.bnext(2));
        // ...
      } while (mti.bnext(1));
      // ...
    } while (mti.bnext(0));
  } else {
    GMM_ASSERT1(false, "unhandled reduction case ! (N=" << int(N) << ")");
  }
}
void tensor_reduction::clear() {
  // ...
  reduced_range.resize(0);
  reduction_chars.clear();
  // ...
  pout_data = 0; trtab.reserve(10);
}
void tensor_mask::print(std::ostream &o) const {
  index_type c=card(true);
  // ...
  o << " mask : card=" << c << "(card_=" << card_ << ", uptodate=" << card_uptodate << "), indexes=";
  for (dim_type i=0; i < idxs.size(); ++i)
    o << (i==0?"":", ") << int(idxs[i]) << ":" << int(r[i]);
  // ...
  if (c == size()) o << " FULL" << endl;
  else {
    // ...
    if (idxs.size() == 1) {
      for (index_type i=0; i < m.size(); ++i)
        o << (m[i] ? 1 : 0);
    } else {
      /* print the mask plane by plane */
      for (tensor_ranges_loop l(r); !l.finished(); l.next()) {
        if (l.cnt[0] == 0 && l.cnt[1] == 0 && r.size()>2) {
          // ... (print the indexes of the current plane)
          for (dim_type i=2; i < r.size(); ++i)
            o << "," << l.cnt[i];
          // ...
        }
        o << (m[lpos(l.cnt)] ? 1 : 0);
        if (l.cnt[0] == r[0]-1) {
          if (l.cnt[1] != r[1]-1) o << ",";
          else if (idxs.size() > 2) o << "}";
        }
      }
    }
    // ...
  }
}
void tensor_shape::print(std::ostream& o) const {
  o << " tensor_shape: n=" << idx2mask.size() << ", idx2mask=";
  for (dim_type i=0; i < idx2mask.size(); ++i) {
    // ...
    if (idx2mask[i].is_valid()) {
      o << "r" << dim(i) << ":m" << int(idx2mask[i].mask_num) << "/" << int(idx2mask[i].mask_dim);
    } else o << " (na) ";
    // ...
  }
  // ...
  for (dim_type i=0; i < masks_.size(); ++i) o << masks_[i];
  o << " ^-- end tensor_shape" << endl;
}
void tensor_ref::print(std::ostream& o) const {
  o << "tensor_ref, n=" << int(ndim()) << ", card=" << card(true) << ", base=" << base() << endl;
  for (dim_type i=0; i < strides().size(); ++i) {
    o << " * strides["<<i<<"]={";
    for (size_type j=0; j < strides()[i].size(); ++j) o << (j>0?",":"") << strides()[i][j];
    // ...
  }
  multi_tensor_iterator mti(*this, true);
  do {
    for (dim_type i = 0; i < mti.ndim(); ++i) {
      o << (i==0?"(":",");
      if (index_is_valid(i))
        { /* ... print the value of index i ... */ }
      // ...
    }
    // ...
    /* print the value and its location: relative to the base pointer when
       there is one, as a raw address otherwise */
    if (base()) {
      o << " = " << mti.p(0) << "\t@base+" << &mti.p(0) - base();
    }
    else o << "\t@" << size_t(&mti.p(0))/sizeof(scalar_type);
    // ...
  } while (mti.qnext1());
  o << "^---- end tensor_ref" << endl;
}
std::ostream& operator<<(std::ostream& o, const tensor_mask& m)
{ m.print(o); return o; }

std::ostream& operator<<(std::ostream& o, const tensor_shape& ts)
{ ts.print(o); return o; }

std::ostream& operator<<(std::ostream& o, const tensor_ref& tr)
{ tr.print(o); return o; }

void print_tm(const tensor_mask& tm) { tm.print_(); }
void print_ts(const tensor_shape& ts) { ts.print_(); }
void print_tr(const tensor_ref& tr) { tr.print_(); }