
GeneralBlockPanelKernel.h

00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
00005 //
00006 // Eigen is free software; you can redistribute it and/or
00007 // modify it under the terms of the GNU Lesser General Public
00008 // License as published by the Free Software Foundation; either
00009 // version 3 of the License, or (at your option) any later version.
00010 //
00011 // Alternatively, you can redistribute it and/or
00012 // modify it under the terms of the GNU General Public License as
00013 // published by the Free Software Foundation; either version 2 of
00014 // the License, or (at your option) any later version.
00015 //
00016 // Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
00017 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00018 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
00019 // GNU General Public License for more details.
00020 //
00021 // You should have received a copy of the GNU Lesser General Public
00022 // License and a copy of the GNU General Public License along with
00023 // Eigen. If not, see <http://www.gnu.org/licenses/>.
00024 
00025 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
00026 #define EIGEN_GENERAL_BLOCK_PANEL_H
00027 
00028 namespace internal {
00029 
00030 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
00031 class gebp_traits;
00032 
00033 /** \internal */
00034 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
00035 {
00036   static std::ptrdiff_t m_l1CacheSize = 0;
00037   static std::ptrdiff_t m_l2CacheSize = 0;
00038   if(m_l1CacheSize==0)
00039   {
00040     m_l1CacheSize = queryL1CacheSize();
00041     m_l2CacheSize = queryTopLevelCacheSize();
00042 
00043     if(m_l1CacheSize<=0) m_l1CacheSize = 8 * 1024;
00044     if(m_l2CacheSize<=0) m_l2CacheSize = 1 * 1024 * 1024;
00045   }
00046 
00047   if(action==SetAction)
00048   {
00049     // set the cpu cache sizes (in bytes) from which all block sizes are derived
00050     eigen_internal_assert(l1!=0 && l2!=0);
00051     m_l1CacheSize = *l1;
00052     m_l2CacheSize = *l2;
00053   }
00054   else if(action==GetAction)
00055   {
00056     eigen_internal_assert(l1!=0 && l2!=0);
00057     *l1 = m_l1CacheSize;
00058     *l2 = m_l2CacheSize;
00059   }
00060   else
00061   {
00062     eigen_internal_assert(false);
00063   }
00064 }
00065 
00066 /** \brief Computes the blocking parameters for an m x k times k x n matrix product
00067   *
00068   * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
00069   * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
00070   * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
00071   *
00072   * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
00073   * this function computes the blocking size parameters along the respective dimensions
00074   * for matrix products and related algorithms. The blocking sizes depend on various
00075   * parameters:
00076   * - the L1 and L2 cache sizes,
00077   * - the register level blocking sizes defined by gebp_traits,
00078   * - the number of scalars that fit into a packet (when vectorization is enabled).
00079   *
00080   * \sa setCpuCacheSizes */
00081 template<typename LhsScalar, typename RhsScalar, int KcFactor>
00082 void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
00083 {
00084   // Explanations:
00085   // Let's recall that the product algorithm forms kc x nc horizontal panels B' on the rhs and
00086   // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
00087   // per kc x nr vertical small panels where nr is the blocking size along the n dimension
00088   // at the register level. For vectorization purposes, these small vertical panels are unpacked,
00089   // i.e., each coefficient is replicated to fill a packet. This small vertical panel has to
00090   // stay in L1 cache.
00091   std::ptrdiff_t l1, l2;
00092 
00093   typedef gebp_traits<LhsScalar,RhsScalar> Traits;
00094   enum {
00095     kdiv = KcFactor * 2 * Traits::nr
00096          * Traits::RhsProgress * sizeof(RhsScalar),
00097     mr = gebp_traits<LhsScalar,RhsScalar>::mr,
00098     mr_mask = (0xffffffff/mr)*mr
00099   };
00100 
00101   manage_caching_sizes(GetAction, &l1, &l2);
00102   k = std::min<std::ptrdiff_t>(k, l1/kdiv);
00103   std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
00104   if(_m<m) m = _m & mr_mask;
00105   n = n; // no blocking is applied along the n dimension
00106 }
00107 
00108 template<typename LhsScalar, typename RhsScalar>
00109 inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
00110 {
00111   computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
00112 }
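
A minimal caller-side sketch of how these routines are meant to be used (not part of this header; the starting dimensions are arbitrary, and the resulting blocking sizes depend on the detected cache sizes and on the architecture-dependent gebp_traits constants):

    // Start from the full problem dimensions and let the routine shrink them
    // to cache-friendly panel sizes before packing and running the kernel.
    std::ptrdiff_t k = 1024, m = 1024, n = 1024;
    internal::computeProductBlockingSizes<double,double>(k, m, n);
    // On return, k is bounded by l1/kdiv; m, if it had to be reduced, is rounded
    // down to a multiple of Traits::mr so that an m x k lhs block fits in the L2
    // cache; n is left unchanged.
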
00113 
00114 #ifdef EIGEN_HAS_FUSE_CJMADD
00115   #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
00116 #else
00117 
00118   // FIXME (a bit overkill maybe ?)
00119 
00120   template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
00121     EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
00122     {
00123       c = cj.pmadd(a,b,c);
00124     }
00125   };
00126 
00127   template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
00128     EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t)
00129     {
00130       t = b; t = cj.pmul(a,t); c = padd(c,t);
00131     }
00132   };
00133 
00134   template<typename CJ, typename A, typename B, typename C, typename T>
00135   EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
00136   {
00137     gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
00138   }
00139 
00140   #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
00141 //   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
00142 #endif
00143 
00144 /* Vectorization logic
00145  *  real*real: unpack rhs to constant packets, ...
00146  * 
00147  *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
00148  *          storing each res packet into two packets (2x2),
00149  *          at the end combine them: swap the second and addsub them 
00150  *  cf*cf : same but with 2x4 blocks
00151  *  cplx*real : unpack rhs to constant packets, ...
00152  *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
00153  */
00154 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
00155 class gebp_traits
00156 {
00157 public:
00158   typedef _LhsScalar LhsScalar;
00159   typedef _RhsScalar RhsScalar;
00160   typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00161 
00162   enum {
00163     ConjLhs = _ConjLhs,
00164     ConjRhs = _ConjRhs,
00165     Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
00166     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00167     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00168     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
00169     
00170     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
00171 
00172     // register block size along the N direction (must be either 2 or 4)
00173     nr = NumberOfRegisters/4,
00174 
00175     // register block size along the M direction (currently, this one cannot be modified)
00176     mr = 2 * LhsPacketSize,
00177     
00178     WorkSpaceFactor = nr * RhsPacketSize,
00179 
00180     LhsProgress = LhsPacketSize,
00181     RhsProgress = RhsPacketSize
00182   };
00183 
00184   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
00185   typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
00186   typedef typename packet_traits<ResScalar>::type  _ResPacket;
00187 
00188   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00189   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00190   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00191 
00192   typedef ResPacket AccPacket;
00193   
00194   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
00195   {
00196     p = pset1<ResPacket>(ResScalar(0));
00197   }
00198 
00199   EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
00200   {
00201     for(DenseIndex k=0; k<n; k++)
00202       pstore(&b[k*RhsPacketSize], pset1<RhsPacket>(rhs[k]));
00203   }
00204 
00205   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
00206   {
00207     dest = pload<RhsPacket>(b);
00208   }
00209 
00210   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
00211   {
00212     dest = pload<LhsPacket>(a);
00213   }
00214 
00215   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
00216   {
00217     tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
00218   }
00219 
00220   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
00221   {
00222     r = pmadd(c,alpha,r);
00223   }
00224 
00225 protected:
00226 //   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
00227 //   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
00228 };
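
The members above form the micro-kernel vocabulary driven by gebp_kernel further down in this file. The following non-unrolled sketch (not part of this header, names illustrative) shows how they are meant to be chained for a single accumulator register; it mirrors the rows-peeled path of gebp_kernel::operator() and ignores the complex specializations, loop peeling and prefetching:

    template<typename Traits>
    void toy_micro_kernel(const typename Traits::LhsScalar* blA,  // packed lhs
                          const typename Traits::RhsScalar* blB,  // unpacked rhs
                          typename Traits::ResScalar* r,          // one column of res
                          typename Traits::ResScalar alpha, int depth)
    {
      Traits traits;
      typename Traits::AccPacket C0;
      traits.initAcc(C0);                                  // C0 = 0
      for(int k=0; k<depth; ++k)
      {
        typename Traits::LhsPacket A0;
        typename Traits::RhsPacket B0, T0;
        traits.loadLhs(blA + k*Traits::LhsProgress, A0);   // next lhs packet
        traits.loadRhs(blB + k*Traits::RhsProgress, B0);   // next replicated rhs coeff
        traits.madd(A0, B0, C0, T0);                       // C0 += A0 * B0
      }
      typename Traits::ResPacket R0     = ploadu<typename Traits::ResPacket>(r);
      typename Traits::ResPacket alphav = pset1<typename Traits::ResPacket>(alpha);
      traits.acc(C0, alphav, R0);                          // R0 += alpha * C0
      pstoreu(r, R0);
    }
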
00229 
00230 template<typename RealScalar, bool _ConjLhs>
00231 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
00232 {
00233 public:
00234   typedef std::complex<RealScalar> LhsScalar;
00235   typedef RealScalar RhsScalar;
00236   typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00237 
00238   enum {
00239     ConjLhs = _ConjLhs,
00240     ConjRhs = false,
00241     Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
00242     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00243     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00244     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
00245     
00246     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
00247     nr = NumberOfRegisters/4,
00248     mr = 2 * LhsPacketSize,
00249     WorkSpaceFactor = nr*RhsPacketSize,
00250 
00251     LhsProgress = LhsPacketSize,
00252     RhsProgress = RhsPacketSize
00253   };
00254 
00255   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
00256   typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
00257   typedef typename packet_traits<ResScalar>::type  _ResPacket;
00258 
00259   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00260   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00261   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00262 
00263   typedef ResPacket AccPacket;
00264 
00265   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
00266   {
00267     p = pset1<ResPacket>(ResScalar(0));
00268   }
00269 
00270   EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
00271   {
00272     for(DenseIndex k=0; k<n; k++)
00273       pstore(&b[k*RhsPacketSize], pset1<RhsPacket>(rhs[k]));
00274   }
00275 
00276   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
00277   {
00278     dest = pload<RhsPacket>(b);
00279   }
00280 
00281   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
00282   {
00283     dest = pload<LhsPacket>(a);
00284   }
00285 
00286   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
00287   {
00288     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
00289   }
00290 
00291   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
00292   {
00293     tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
00294   }
00295 
00296   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
00297   {
00298     c += a * b;
00299   }
00300 
00301   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
00302   {
00303     r = cj.pmadd(c,alpha,r);
00304   }
00305 
00306 protected:
00307   conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
00308 };
00309 
00310 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
00311 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
00312 {
00313 public:
00314   typedef std::complex<RealScalar>  Scalar;
00315   typedef std::complex<RealScalar>  LhsScalar;
00316   typedef std::complex<RealScalar>  RhsScalar;
00317   typedef std::complex<RealScalar>  ResScalar;
00318   
00319   enum {
00320     ConjLhs = _ConjLhs,
00321     ConjRhs = _ConjRhs,
00322     Vectorizable = packet_traits<RealScalar>::Vectorizable
00323                 && packet_traits<Scalar>::Vectorizable,
00324     RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
00325     ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
00326     
00327     nr = 2,
00328     mr = 2 * ResPacketSize,
00329     WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
00330 
00331     LhsProgress = ResPacketSize,
00332     RhsProgress = Vectorizable ? 2*ResPacketSize : 1
00333   };
00334   
00335   typedef typename packet_traits<RealScalar>::type RealPacket;
00336   typedef typename packet_traits<Scalar>::type     ScalarPacket;
00337   struct DoublePacket
00338   {
00339     RealPacket first;
00340     RealPacket second;
00341   };
00342 
00343   typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
00344   typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
00345   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
00346   typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
00347   
00348   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
00349 
00350   EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
00351   {
00352     p.first   = pset1<RealPacket>(RealScalar(0));
00353     p.second  = pset1<RealPacket>(RealScalar(0));
00354   }
00355 
00356   /* Unpack the rhs coefficients such that each complex coefficient is spread into
00357    * two packets containing respectively the real and the imaginary part,
00358    * duplicated as many times as needed: (x+iy) => [x, ..., x] [y, ..., y]
00359    */
00360   EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
00361   {
00362     for(DenseIndex k=0; k<n; k++)
00363     {
00364       if(Vectorizable)
00365       {
00366         pstore((RealScalar*)&b[k*ResPacketSize*2+0], pset1<RealPacket>(real(rhs[k])));
00367         pstore((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], pset1<RealPacket>(imag(rhs[k])));
00368       }
00369       else
00370         b[k] = rhs[k];
00371     }
00372   }
00373 
00374   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
00375 
00376   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
00377   {
00378     dest.first  = pload<RealPacket>((const RealScalar*)b);
00379     dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
00380   }
00381 
00382   // nothing special here
00383   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
00384   {
00385     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
00386   }
00387 
00388   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
00389   {
00390     c.first   = padd(pmul(a,b.first), c.first);
00391     c.second  = padd(pmul(a,b.second),c.second);
00392   }
00393 
00394   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
00395   {
00396     c = cj.pmadd(a,b,c);
00397   }
00398   
00399   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
00400   
00401   EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
00402   {
00403     // assemble c
00404     ResPacket tmp;
00405     if((!ConjLhs)&&(!ConjRhs))
00406     {
00407       tmp = pcplxflip(pconj(ResPacket(c.second)));
00408       tmp = padd(ResPacket(c.first),tmp);
00409     }
00410     else if((!ConjLhs)&&(ConjRhs))
00411     {
00412       tmp = pconj(pcplxflip(ResPacket(c.second)));
00413       tmp = padd(ResPacket(c.first),tmp);
00414     }
00415     else if((ConjLhs)&&(!ConjRhs))
00416     {
00417       tmp = pcplxflip(ResPacket(c.second));
00418       tmp = padd(pconj(ResPacket(c.first)),tmp);
00419     }
00420     else if((ConjLhs)&&(ConjRhs))
00421     {
00422       tmp = pcplxflip(ResPacket(c.second));
00423       tmp = psub(pconj(ResPacket(c.first)),tmp);
00424     }
00425     
00426     r = pmadd(tmp,alpha,r);
00427   }
00428 
00429 protected:
00430   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
00431 };
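
For reference, what the acc() overload taking a DoublePacket reconstructs in the non-conjugated branch: with a lhs packet holding (a_r, a_i, ...) and a rhs coefficient b = b_r + i*b_i unpacked by unpackRhs() into duplicated real and imaginary parts, the two half-products accumulated by madd() are

    c.first  = (a_r*b_r, a_i*b_r, ...)    // lhs times the duplicated real part
    c.second = (a_r*b_i, a_i*b_i, ...)    // lhs times the duplicated imaginary part

Then pconj(c.second) = (a_r*b_i, -a_i*b_i, ...), pcplxflip swaps the two entries of each complex to give (-a_i*b_i, a_r*b_i, ...), and adding that to c.first yields (a_r*b_r - a_i*b_i, a_i*b_r + a_r*b_i, ...), which is exactly the complex product a*b; it is finally scaled by alpha and accumulated into r with pmadd. The other three branches only move the conjugations around accordingly.
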
00432 
00433 template<typename RealScalar, bool _ConjRhs>
00434 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
00435 {
00436 public:
00437   typedef std::complex<RealScalar>  Scalar;
00438   typedef RealScalar  LhsScalar;
00439   typedef Scalar      RhsScalar;
00440   typedef Scalar      ResScalar;
00441 
00442   enum {
00443     ConjLhs = false,
00444     ConjRhs = _ConjRhs,
00445     Vectorizable = packet_traits<RealScalar>::Vectorizable
00446                 && packet_traits<Scalar>::Vectorizable,
00447     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00448     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00449     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
00450     
00451     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
00452     nr = 4,
00453     mr = 2*ResPacketSize,
00454     WorkSpaceFactor = nr*RhsPacketSize,
00455 
00456     LhsProgress = ResPacketSize,
00457     RhsProgress = ResPacketSize
00458   };
00459 
00460   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
00461   typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
00462   typedef typename packet_traits<ResScalar>::type  _ResPacket;
00463 
00464   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00465   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00466   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00467 
00468   typedef ResPacket AccPacket;
00469 
00470   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
00471   {
00472     p = pset1<ResPacket>(ResScalar(0));
00473   }
00474 
00475   EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
00476   {
00477     for(DenseIndex k=0; k<n; k++)
00478       pstore(&b[k*RhsPacketSize], pset1<RhsPacket>(rhs[k]));
00479   }
00480 
00481   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
00482   {
00483     dest = pload<RhsPacket>(b);
00484   }
00485 
00486   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
00487   {
00488     dest = ploaddup<LhsPacket>(a);
00489   }
00490 
00491   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
00492   {
00493     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
00494   }
00495 
00496   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
00497   {
00498     tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
00499   }
00500 
00501   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
00502   {
00503     c += a * b;
00504   }
00505 
00506   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
00507   {
00508     r = cj.pmadd(alpha,c,r);
00509   }
00510 
00511 protected:
00512   conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
00513 };
00514 
00515 /* optimized GEneral packed Block * packed Panel product kernel
00516  *
00517  * Mixing type logic: C += A * B
00518  *  |  A  |  B  | comments
00519  *  |real |cplx | no vectorization yet, would require packing A with duplication
00520  *  |cplx |real | easy vectorization
00521  */
00522 template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
00523 struct gebp_kernel
00524 {
00525   typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
00526   typedef typename Traits::ResScalar ResScalar;
00527   typedef typename Traits::LhsPacket LhsPacket;
00528   typedef typename Traits::RhsPacket RhsPacket;
00529   typedef typename Traits::ResPacket ResPacket;
00530   typedef typename Traits::AccPacket AccPacket;
00531 
00532   enum {
00533     Vectorizable  = Traits::Vectorizable,
00534     LhsProgress   = Traits::LhsProgress,
00535     RhsProgress   = Traits::RhsProgress,
00536     ResPacketSize = Traits::ResPacketSize
00537   };
00538 
00539   EIGEN_FLATTEN_ATTRIB
00540   void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
00541                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
00542   {
00543     Traits traits;
00544     
00545     if(strideA==-1) strideA = depth;
00546     if(strideB==-1) strideB = depth;
00547     conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00548 //     conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00549     Index packet_cols = (cols/nr) * nr;
00550     const Index peeled_mc = (rows/mr)*mr;
00551     // FIXME:
00552     const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
00553     const Index peeled_kc = (depth/4)*4;
00554 
00555     if(unpackedB==0)
00556       unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
00557 
00558     // loops on each micro vertical panel of rhs (depth x nr)
00559     for(Index j2=0; j2<packet_cols; j2+=nr)
00560     {
00561       traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); 
00562 
00563       // loops on each largest micro horizontal panel of lhs (mr x depth)
00564       // => we select an mr x nr micro block of res which is entirely
00565       //    stored into mr/packet_size x nr registers.
00566       for(Index i=0; i<peeled_mc; i+=mr)
00567       {
00568         const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
00569         prefetch(&blA[0]);
00570 
00571         // gets res block as register
00572         AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
00573                   traits.initAcc(C0);
00574                   traits.initAcc(C1);
00575         if(nr==4) traits.initAcc(C2);
00576         if(nr==4) traits.initAcc(C3);
00577                   traits.initAcc(C4);
00578                   traits.initAcc(C5);
00579         if(nr==4) traits.initAcc(C6);
00580         if(nr==4) traits.initAcc(C7);
00581 
00582         ResScalar* r0 = &res[(j2+0)*resStride + i];
00583         ResScalar* r1 = r0 + resStride;
00584         ResScalar* r2 = r1 + resStride;
00585         ResScalar* r3 = r2 + resStride;
00586 
00587         prefetch(r0+16);
00588         prefetch(r1+16);
00589         prefetch(r2+16);
00590         prefetch(r3+16);
00591 
00592         // performs "inner" product
00593         // TODO let's check whether the following peeled loop could not be
00594         //      optimized via optimal prefetching from one loop to the other
00595         const RhsScalar* blB = unpackedB;
00596         for(Index k=0; k<peeled_kc; k+=4)
00597         {
00598           if(nr==2)
00599           {
00600             LhsPacket A0, A1;
00601             RhsPacket B0;
00602             RhsPacket T0;
00603             
00604 EIGEN_ASM_COMMENT("mybegin2");
00605             traits.loadLhs(&blA[0*LhsProgress], A0);
00606             traits.loadLhs(&blA[1*LhsProgress], A1);
00607             traits.loadRhs(&blB[0*RhsProgress], B0);
00608             traits.madd(A0,B0,C0,T0);
00609             traits.madd(A1,B0,C4,B0);
00610             traits.loadRhs(&blB[1*RhsProgress], B0);
00611             traits.madd(A0,B0,C1,T0);
00612             traits.madd(A1,B0,C5,B0);
00613 
00614             traits.loadLhs(&blA[2*LhsProgress], A0);
00615             traits.loadLhs(&blA[3*LhsProgress], A1);
00616             traits.loadRhs(&blB[2*RhsProgress], B0);
00617             traits.madd(A0,B0,C0,T0);
00618             traits.madd(A1,B0,C4,B0);
00619             traits.loadRhs(&blB[3*RhsProgress], B0);
00620             traits.madd(A0,B0,C1,T0);
00621             traits.madd(A1,B0,C5,B0);
00622 
00623             traits.loadLhs(&blA[4*LhsProgress], A0);
00624             traits.loadLhs(&blA[5*LhsProgress], A1);
00625             traits.loadRhs(&blB[4*RhsProgress], B0);
00626             traits.madd(A0,B0,C0,T0);
00627             traits.madd(A1,B0,C4,B0);
00628             traits.loadRhs(&blB[5*RhsProgress], B0);
00629             traits.madd(A0,B0,C1,T0);
00630             traits.madd(A1,B0,C5,B0);
00631 
00632             traits.loadLhs(&blA[6*LhsProgress], A0);
00633             traits.loadLhs(&blA[7*LhsProgress], A1);
00634             traits.loadRhs(&blB[6*RhsProgress], B0);
00635             traits.madd(A0,B0,C0,T0);
00636             traits.madd(A1,B0,C4,B0);
00637             traits.loadRhs(&blB[7*RhsProgress], B0);
00638             traits.madd(A0,B0,C1,T0);
00639             traits.madd(A1,B0,C5,B0);
00640 EIGEN_ASM_COMMENT("myend");
00641           }
00642           else
00643           {
00644 EIGEN_ASM_COMMENT("mybegin4");
00645             LhsPacket A0, A1;
00646             RhsPacket B0, B1, B2, B3;
00647             RhsPacket T0;
00648             
00649             traits.loadLhs(&blA[0*LhsProgress], A0);
00650             traits.loadLhs(&blA[1*LhsProgress], A1);
00651             traits.loadRhs(&blB[0*RhsProgress], B0);
00652             traits.loadRhs(&blB[1*RhsProgress], B1);
00653 
00654             traits.madd(A0,B0,C0,T0);
00655             traits.loadRhs(&blB[2*RhsProgress], B2);
00656             traits.madd(A1,B0,C4,B0);
00657             traits.loadRhs(&blB[3*RhsProgress], B3);
00658             traits.loadRhs(&blB[4*RhsProgress], B0);
00659             traits.madd(A0,B1,C1,T0);
00660             traits.madd(A1,B1,C5,B1);
00661             traits.loadRhs(&blB[5*RhsProgress], B1);
00662             traits.madd(A0,B2,C2,T0);
00663             traits.madd(A1,B2,C6,B2);
00664             traits.loadRhs(&blB[6*RhsProgress], B2);
00665             traits.madd(A0,B3,C3,T0);
00666             traits.loadLhs(&blA[2*LhsProgress], A0);
00667             traits.madd(A1,B3,C7,B3);
00668             traits.loadLhs(&blA[3*LhsProgress], A1);
00669             traits.loadRhs(&blB[7*RhsProgress], B3);
00670             traits.madd(A0,B0,C0,T0);
00671             traits.madd(A1,B0,C4,B0);
00672             traits.loadRhs(&blB[8*RhsProgress], B0);
00673             traits.madd(A0,B1,C1,T0);
00674             traits.madd(A1,B1,C5,B1);
00675             traits.loadRhs(&blB[9*RhsProgress], B1);
00676             traits.madd(A0,B2,C2,T0);
00677             traits.madd(A1,B2,C6,B2);
00678             traits.loadRhs(&blB[10*RhsProgress], B2);
00679             traits.madd(A0,B3,C3,T0);
00680             traits.loadLhs(&blA[4*LhsProgress], A0);
00681             traits.madd(A1,B3,C7,B3);
00682             traits.loadLhs(&blA[5*LhsProgress], A1);
00683             traits.loadRhs(&blB[11*RhsProgress], B3);
00684 
00685             traits.madd(A0,B0,C0,T0);
00686             traits.madd(A1,B0,C4,B0);
00687             traits.loadRhs(&blB[12*RhsProgress], B0);
00688             traits.madd(A0,B1,C1,T0);
00689             traits.madd(A1,B1,C5,B1);
00690             traits.loadRhs(&blB[13*RhsProgress], B1);
00691             traits.madd(A0,B2,C2,T0);
00692             traits.madd(A1,B2,C6,B2);
00693             traits.loadRhs(&blB[14*RhsProgress], B2);
00694             traits.madd(A0,B3,C3,T0);
00695             traits.loadLhs(&blA[6*LhsProgress], A0);
00696             traits.madd(A1,B3,C7,B3);
00697             traits.loadLhs(&blA[7*LhsProgress], A1);
00698             traits.loadRhs(&blB[15*RhsProgress], B3);
00699             traits.madd(A0,B0,C0,T0);
00700             traits.madd(A1,B0,C4,B0);
00701             traits.madd(A0,B1,C1,T0);
00702             traits.madd(A1,B1,C5,B1);
00703             traits.madd(A0,B2,C2,T0);
00704             traits.madd(A1,B2,C6,B2);
00705             traits.madd(A0,B3,C3,T0);
00706             traits.madd(A1,B3,C7,B3);
00707           }
00708 
00709           blB += 4*nr*RhsProgress;
00710           blA += 4*mr;
00711         }
00712         // process remaining peeled loop
00713         for(Index k=peeled_kc; k<depth; k++)
00714         {
00715           if(nr==2)
00716           {
00717             LhsPacket A0, A1;
00718             RhsPacket B0;
00719             RhsPacket T0;
00720 
00721             traits.loadLhs(&blA[0*LhsProgress], A0);
00722             traits.loadLhs(&blA[1*LhsProgress], A1);
00723             traits.loadRhs(&blB[0*RhsProgress], B0);
00724             traits.madd(A0,B0,C0,T0);
00725             traits.madd(A1,B0,C4,B0);
00726             traits.loadRhs(&blB[1*RhsProgress], B0);
00727             traits.madd(A0,B0,C1,T0);
00728             traits.madd(A1,B0,C5,B0);
00729           }
00730           else
00731           {
00732             LhsPacket A0, A1;
00733             RhsPacket B0, B1, B2, B3;
00734             RhsPacket T0;
00735 
00736             traits.loadLhs(&blA[0*LhsProgress], A0);
00737             traits.loadLhs(&blA[1*LhsProgress], A1);
00738             traits.loadRhs(&blB[0*RhsProgress], B0);
00739             traits.loadRhs(&blB[1*RhsProgress], B1);
00740 
00741             traits.madd(A0,B0,C0,T0);
00742             traits.loadRhs(&blB[2*RhsProgress], B2);
00743             traits.madd(A1,B0,C4,B0);
00744             traits.loadRhs(&blB[3*RhsProgress], B3);
00745             traits.madd(A0,B1,C1,T0);
00746             traits.madd(A1,B1,C5,B1);
00747             traits.madd(A0,B2,C2,T0);
00748             traits.madd(A1,B2,C6,B2);
00749             traits.madd(A0,B3,C3,T0);
00750             traits.madd(A1,B3,C7,B3);
00751           }
00752 
00753           blB += nr*RhsProgress;
00754           blA += mr;
00755         }
00756 
00757         ResPacket R0, R1, R2, R3, R4, R5, R6, R7;
00758         ResPacket alphav = pset1<ResPacket>(alpha);
00759 
00760                   R0 = ploadu<ResPacket>(r0);
00761                   R1 = ploadu<ResPacket>(r1);
00762         if(nr==4) R2 = ploadu<ResPacket>(r2);
00763         if(nr==4) R3 = ploadu<ResPacket>(r3);
00764                   R4 = ploadu<ResPacket>(r0 + ResPacketSize);
00765                   R5 = ploadu<ResPacket>(r1 + ResPacketSize);
00766         if(nr==4) R6 = ploadu<ResPacket>(r2 + ResPacketSize);
00767         if(nr==4) R7 = ploadu<ResPacket>(r3 + ResPacketSize);
00768 
00769                   traits.acc(C0, alphav, R0);
00770                   traits.acc(C1, alphav, R1);
00771         if(nr==4) traits.acc(C2, alphav, R2);
00772         if(nr==4) traits.acc(C3, alphav, R3);
00773                   traits.acc(C4, alphav, R4);
00774                   traits.acc(C5, alphav, R5);
00775         if(nr==4) traits.acc(C6, alphav, R6);
00776         if(nr==4) traits.acc(C7, alphav, R7);
00777 
00778                   pstoreu(r0, R0);
00779                   pstoreu(r1, R1);
00780         if(nr==4) pstoreu(r2, R2);
00781         if(nr==4) pstoreu(r3, R3);
00782                   pstoreu(r0 + ResPacketSize, R4);
00783                   pstoreu(r1 + ResPacketSize, R5);
00784         if(nr==4) pstoreu(r2 + ResPacketSize, R6);
00785         if(nr==4) pstoreu(r3 + ResPacketSize, R7);
00786       }
00787       
00788       if(rows-peeled_mc>=LhsProgress)
00789       {
00790         Index i = peeled_mc;
00791         const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
00792         prefetch(&blA[0]);
00793 
00794         // gets res block as register
00795         AccPacket C0, C1, C2, C3;
00796                   traits.initAcc(C0);
00797                   traits.initAcc(C1);
00798         if(nr==4) traits.initAcc(C2);
00799         if(nr==4) traits.initAcc(C3);
00800 
00801         // performs "inner" product
00802         const RhsScalar* blB = unpackedB;
00803         for(Index k=0; k<peeled_kc; k+=4)
00804         {
00805           if(nr==2)
00806           {
00807             LhsPacket A0;
00808             RhsPacket B0, B1;
00809 
00810             traits.loadLhs(&blA[0*LhsProgress], A0);
00811             traits.loadRhs(&blB[0*RhsProgress], B0);
00812             traits.loadRhs(&blB[1*RhsProgress], B1);
00813             traits.madd(A0,B0,C0,B0);
00814             traits.loadRhs(&blB[2*RhsProgress], B0);
00815             traits.madd(A0,B1,C1,B1);
00816             traits.loadLhs(&blA[1*LhsProgress], A0);
00817             traits.loadRhs(&blB[3*RhsProgress], B1);
00818             traits.madd(A0,B0,C0,B0);
00819             traits.loadRhs(&blB[4*RhsProgress], B0);
00820             traits.madd(A0,B1,C1,B1);
00821             traits.loadLhs(&blA[2*LhsProgress], A0);
00822             traits.loadRhs(&blB[5*RhsProgress], B1);
00823             traits.madd(A0,B0,C0,B0);
00824             traits.loadRhs(&blB[6*RhsProgress], B0);
00825             traits.madd(A0,B1,C1,B1);
00826             traits.loadLhs(&blA[3*LhsProgress], A0);
00827             traits.loadRhs(&blB[7*RhsProgress], B1);
00828             traits.madd(A0,B0,C0,B0);
00829             traits.madd(A0,B1,C1,B1);
00830           }
00831           else
00832           {
00833             LhsPacket A0;
00834             RhsPacket B0, B1, B2, B3;
00835 
00836             traits.loadLhs(&blA[0*LhsProgress], A0);
00837             traits.loadRhs(&blB[0*RhsProgress], B0);
00838             traits.loadRhs(&blB[1*RhsProgress], B1);
00839 
00840             traits.madd(A0,B0,C0,B0);
00841             traits.loadRhs(&blB[2*RhsProgress], B2);
00842             traits.loadRhs(&blB[3*RhsProgress], B3);
00843             traits.loadRhs(&blB[4*RhsProgress], B0);
00844             traits.madd(A0,B1,C1,B1);
00845             traits.loadRhs(&blB[5*RhsProgress], B1);
00846             traits.madd(A0,B2,C2,B2);
00847             traits.loadRhs(&blB[6*RhsProgress], B2);
00848             traits.madd(A0,B3,C3,B3);
00849             traits.loadLhs(&blA[1*LhsProgress], A0);
00850             traits.loadRhs(&blB[7*RhsProgress], B3);
00851             traits.madd(A0,B0,C0,B0);
00852             traits.loadRhs(&blB[8*RhsProgress], B0);
00853             traits.madd(A0,B1,C1,B1);
00854             traits.loadRhs(&blB[9*RhsProgress], B1);
00855             traits.madd(A0,B2,C2,B2);
00856             traits.loadRhs(&blB[10*RhsProgress], B2);
00857             traits.madd(A0,B3,C3,B3);
00858             traits.loadLhs(&blA[2*LhsProgress], A0);
00859             traits.loadRhs(&blB[11*RhsProgress], B3);
00860 
00861             traits.madd(A0,B0,C0,B0);
00862             traits.loadRhs(&blB[12*RhsProgress], B0);
00863             traits.madd(A0,B1,C1,B1);
00864             traits.loadRhs(&blB[13*RhsProgress], B1);
00865             traits.madd(A0,B2,C2,B2);
00866             traits.loadRhs(&blB[14*RhsProgress], B2);
00867             traits.madd(A0,B3,C3,B3);
00868 
00869             traits.loadLhs(&blA[3*LhsProgress], A0);
00870             traits.loadRhs(&blB[15*RhsProgress], B3);
00871             traits.madd(A0,B0,C0,B0);
00872             traits.madd(A0,B1,C1,B1);
00873             traits.madd(A0,B2,C2,B2);
00874             traits.madd(A0,B3,C3,B3);
00875           }
00876 
00877           blB += nr*4*RhsProgress;
00878           blA += 4*LhsProgress;
00879         }
00880         // process remaining peeled loop
00881         for(Index k=peeled_kc; k<depth; k++)
00882         {
00883           if(nr==2)
00884           {
00885             LhsPacket A0;
00886             RhsPacket B0, B1;
00887 
00888             traits.loadLhs(&blA[0*LhsProgress], A0);
00889             traits.loadRhs(&blB[0*RhsProgress], B0);
00890             traits.loadRhs(&blB[1*RhsProgress], B1);
00891             traits.madd(A0,B0,C0,B0);
00892             traits.madd(A0,B1,C1,B1);
00893           }
00894           else
00895           {
00896             LhsPacket A0;
00897             RhsPacket B0, B1, B2, B3;
00898 
00899             traits.loadLhs(&blA[0*LhsProgress], A0);
00900             traits.loadRhs(&blB[0*RhsProgress], B0);
00901             traits.loadRhs(&blB[1*RhsProgress], B1);
00902             traits.loadRhs(&blB[2*RhsProgress], B2);
00903             traits.loadRhs(&blB[3*RhsProgress], B3);
00904 
00905             traits.madd(A0,B0,C0,B0);
00906             traits.madd(A0,B1,C1,B1);
00907             traits.madd(A0,B2,C2,B2);
00908             traits.madd(A0,B3,C3,B3);
00909           }
00910 
00911           blB += nr*RhsProgress;
00912           blA += LhsProgress;
00913         }
00914 
00915         ResPacket R0, R1, R2, R3;
00916         ResPacket alphav = pset1<ResPacket>(alpha);
00917 
00918         ResScalar* r0 = &res[(j2+0)*resStride + i];
00919         ResScalar* r1 = r0 + resStride;
00920         ResScalar* r2 = r1 + resStride;
00921         ResScalar* r3 = r2 + resStride;
00922 
00923                   R0 = ploadu<ResPacket>(r0);
00924                   R1 = ploadu<ResPacket>(r1);
00925         if(nr==4) R2 = ploadu<ResPacket>(r2);
00926         if(nr==4) R3 = ploadu<ResPacket>(r3);
00927 
00928                   traits.acc(C0, alphav, R0);
00929                   traits.acc(C1, alphav, R1);
00930         if(nr==4) traits.acc(C2, alphav, R2);
00931         if(nr==4) traits.acc(C3, alphav, R3);
00932 
00933                   pstoreu(r0, R0);
00934                   pstoreu(r1, R1);
00935         if(nr==4) pstoreu(r2, R2);
00936         if(nr==4) pstoreu(r3, R3);
00937       }
00938       for(Index i=peeled_mc2; i<rows; i++)
00939       {
00940         const LhsScalar* blA = &blockA[i*strideA+offsetA];
00941         prefetch(&blA[0]);
00942 
00943         // gets a 1 x nr res block as registers
00944         ResScalar C0(0), C1(0), C2(0), C3(0);
00945         // TODO directly use blockB ???
00946         const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
00947         for(Index k=0; k<depth; k++)
00948         {
00949           if(nr==2)
00950           {
00951             LhsScalar A0;
00952             RhsScalar B0, B1;
00953 
00954             A0 = blA[k];
00955             B0 = blB[0];
00956             B1 = blB[1];
00957             MADD(cj,A0,B0,C0,B0);
00958             MADD(cj,A0,B1,C1,B1);
00959           }
00960           else
00961           {
00962             LhsScalar A0;
00963             RhsScalar B0, B1, B2, B3;
00964 
00965             A0 = blA[k];
00966             B0 = blB[0];
00967             B1 = blB[1];
00968             B2 = blB[2];
00969             B3 = blB[3];
00970 
00971             MADD(cj,A0,B0,C0,B0);
00972             MADD(cj,A0,B1,C1,B1);
00973             MADD(cj,A0,B2,C2,B2);
00974             MADD(cj,A0,B3,C3,B3);
00975           }
00976 
00977           blB += nr;
00978         }
00979                   res[(j2+0)*resStride + i] += alpha*C0;
00980                   res[(j2+1)*resStride + i] += alpha*C1;
00981         if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
00982         if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
00983       }
00984     }
00985     // process remaining rhs/res columns one at a time
00986     // => do the same but with nr==1
00987     for(Index j2=packet_cols; j2<cols; j2++)
00988     {
00989       // unpack B
00990       {
00991         traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
00992 //         const RhsScalar* blB = &blockB[j2*strideB+offsetB];
00993 //         for(Index k=0; k<depth; k++)
00994 //           pstore(&unpackedB[k*RhsPacketSize], pset1<RhsPacket>(blB[k]));
00995       }
00996 
00997       for(Index i=0; i<peeled_mc; i+=mr)
00998       {
00999         const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
01000         prefetch(&blA[0]);
01001 
01002         // TODO move the res loads to the stores
01003 
01004         // get res block as registers
01005         AccPacket C0, C4;
01006         traits.initAcc(C0);
01007         traits.initAcc(C4);
01008 
01009         const RhsScalar* blB = unpackedB;
01010         for(Index k=0; k<depth; k++)
01011         {
01012           LhsPacket A0, A1;
01013           RhsPacket B0;
01014           RhsPacket T0;
01015 
01016           traits.loadLhs(&blA[0*LhsProgress], A0);
01017           traits.loadLhs(&blA[1*LhsProgress], A1);
01018           traits.loadRhs(&blB[0*RhsProgress], B0);
01019           traits.madd(A0,B0,C0,T0);
01020           traits.madd(A1,B0,C4,B0);
01021 
01022           blB += RhsProgress;
01023           blA += 2*LhsProgress;
01024         }
01025         ResPacket R0, R4;
01026         ResPacket alphav = pset1<ResPacket>(alpha);
01027 
01028         ResScalar* r0 = &res[(j2+0)*resStride + i];
01029 
01030         R0 = ploadu<ResPacket>(r0);
01031         R4 = ploadu<ResPacket>(r0+ResPacketSize);
01032 
01033         traits.acc(C0, alphav, R0);
01034         traits.acc(C4, alphav, R4);
01035 
01036         pstoreu(r0,               R0);
01037         pstoreu(r0+ResPacketSize, R4);
01038       }
01039       if(rows-peeled_mc>=LhsProgress)
01040       {
01041         Index i = peeled_mc;
01042         const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
01043         prefetch(&blA[0]);
01044 
01045         AccPacket C0;
01046         traits.initAcc(C0);
01047 
01048         const RhsScalar* blB = unpackedB;
01049         for(Index k=0; k<depth; k++)
01050         {
01051           LhsPacket A0;
01052           RhsPacket B0;
01053           traits.loadLhs(blA, A0);
01054           traits.loadRhs(blB, B0);
01055           traits.madd(A0, B0, C0, B0);
01056           blB += RhsProgress;
01057           blA += LhsProgress;
01058         }
01059 
01060         ResPacket alphav = pset1<ResPacket>(alpha);
01061         ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
01062         traits.acc(C0, alphav, R0);
01063         pstoreu(&res[(j2+0)*resStride + i], R0);
01064       }
01065       for(Index i=peeled_mc2; i<rows; i++)
01066       {
01067         const LhsScalar* blA = &blockA[i*strideA+offsetA];
01068         prefetch(&blA[0]);
01069 
01070         // gets a 1 x 1 res block as registers
01071         ResScalar C0(0);
01072         // FIXME directly use blockB ??
01073         const RhsScalar* blB = &blockB[j2*strideB+offsetB];
01074         for(Index k=0; k<depth; k++)
01075         {
01076           LhsScalar A0 = blA[k];
01077           RhsScalar B0 = blB[k];
01078           MADD(cj, A0, B0, C0, B0);
01079         }
01080         res[(j2+0)*resStride + i] += alpha*C0;
01081       }
01082     }
01083   }
01084 };
01085 
01086 #undef MADD
01087 
01088 // pack a block of the lhs
01089 // The traversal is as follows (mr==4):
01090 //   0  4  8 12 ...
01091 //   1  5  9 13 ...
01092 //   2  6 10 14 ...
01093 //   3  7 11 15 ...
01094 //
01095 //  16 20 24 28 ...
01096 //  17 21 25 29 ...
01097 //  18 22 26 30 ...
01098 //  19 23 27 31 ...
01099 //
01100 //  32 33 34 35 ...
01101 //  36 37 38 39 ...
01102 template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
01103 struct gemm_pack_lhs
01104 {
01105   void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
01106                   Index stride=0, Index offset=0)
01107   {
01108 //     enum { PacketSize = packet_traits<Scalar>::size };
01109     eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
01110     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
01111     const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
01112     Index count = 0;
01113     Index peeled_mc = (rows/Pack1)*Pack1;
01114     for(Index i=0; i<peeled_mc; i+=Pack1)
01115     {
01116       if(PanelMode) count += Pack1 * offset;
01117       for(Index k=0; k<depth; k++)
01118         for(Index w=0; w<Pack1; w++)
01119           blockA[count++] = cj(lhs(i+w, k));
01120       if(PanelMode) count += Pack1 * (stride-offset-depth);
01121     }
01122     if(rows-peeled_mc>=Pack2)
01123     {
01124       if(PanelMode) count += Pack2*offset;
01125       for(Index k=0; k<depth; k++)
01126         for(Index w=0; w<Pack2; w++)
01127           blockA[count++] = cj(lhs(peeled_mc+w, k));
01128       if(PanelMode) count += Pack2 * (stride-offset-depth);
01129       peeled_mc += Pack2;
01130     }
01131     for(Index i=peeled_mc; i<rows; i++)
01132     {
01133       if(PanelMode) count += offset;
01134       for(Index k=0; k<depth; k++)
01135         blockA[count++] = cj(lhs(i, k));
01136       if(PanelMode) count += (stride-offset-depth);
01137     }
01138   }
01139 };
01140 
01141 // copy a complete panel of the rhs
01142 // this version is optimized for column major matrices
01143 // The traversal order is as follows (nr==4):
01144 //  0  1  2  3   12 13 14 15   24 27
01145 //  4  5  6  7   16 17 18 19   25 28
01146 //  8  9 10 11   20 21 22 23   26 29
01147 //  .  .  .  .    .  .  .  .    .  .
01148 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
01149 struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
01150 {
01151   typedef typename packet_traits<Scalar>::type Packet;
01152   enum { PacketSize = packet_traits<Scalar>::size };
01153   void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
01154                   Index stride=0, Index offset=0)
01155   {
01156     eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
01157     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
01158     Index packet_cols = (cols/nr) * nr;
01159     Index count = 0;
01160     for(Index j2=0; j2<packet_cols; j2+=nr)
01161     {
01162       // skip what we have before
01163       if(PanelMode) count += nr * offset;
01164       const Scalar* b0 = &rhs[(j2+0)*rhsStride];
01165       const Scalar* b1 = &rhs[(j2+1)*rhsStride];
01166       const Scalar* b2 = &rhs[(j2+2)*rhsStride];
01167       const Scalar* b3 = &rhs[(j2+3)*rhsStride];
01168       for(Index k=0; k<depth; k++)
01169       {
01170                   blockB[count+0] = cj(b0[k]);
01171                   blockB[count+1] = cj(b1[k]);
01172         if(nr==4) blockB[count+2] = cj(b2[k]);
01173         if(nr==4) blockB[count+3] = cj(b3[k]);
01174         count += nr;
01175       }
01176       // skip what we have after
01177       if(PanelMode) count += nr * (stride-offset-depth);
01178     }
01179 
01180     // copy the remaining columns one at a time (nr==1)
01181     for(Index j2=packet_cols; j2<cols; ++j2)
01182     {
01183       if(PanelMode) count += offset;
01184       const Scalar* b0 = &rhs[(j2+0)*rhsStride];
01185       for(Index k=0; k<depth; k++)
01186       {
01187         blockB[count] = cj(b0[k]);
01188         count += 1;
01189       }
01190       if(PanelMode) count += (stride-offset-depth);
01191     }
01192   }
01193 };
01194 
01195 // this version is optimized for row major matrices
01196 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
01197 struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
01198 {
01199   enum { PacketSize = packet_traits<Scalar>::size };
01200   void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
01201                   Index stride=0, Index offset=0)
01202   {
01203     eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
01204     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
01205     Index packet_cols = (cols/nr) * nr;
01206     Index count = 0;
01207     for(Index j2=0; j2<packet_cols; j2+=nr)
01208     {
01209       // skip what we have before
01210       if(PanelMode) count += nr * offset;
01211       for(Index k=0; k<depth; k++)
01212       {
01213         const Scalar* b0 = &rhs[k*rhsStride + j2];
01214                   blockB[count+0] = cj(b0[0]);
01215                   blockB[count+1] = cj(b0[1]);
01216         if(nr==4) blockB[count+2] = cj(b0[2]);
01217         if(nr==4) blockB[count+3] = cj(b0[3]);
01218         count += nr;
01219       }
01220       // skip what we have after
01221       if(PanelMode) count += nr * (stride-offset-depth);
01222     }
01223     // copy the remaining columns one at a time (nr==1)
01224     for(Index j2=packet_cols; j2<cols; ++j2)
01225     {
01226       if(PanelMode) count += offset;
01227       const Scalar* b0 = &rhs[j2];
01228       for(Index k=0; k<depth; k++)
01229       {
01230         blockB[count] = cj(b0[k*rhsStride]);
01231         count += 1;
01232       }
01233       if(PanelMode) count += stride-offset-depth;
01234     }
01235   }
01236 };
01237 
01238 } // end namespace internal
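
A rough sketch of how the packing helpers and the kernel defined above fit together (not part of this header; the actual driver, with its blocking loops over mc/kc panels and aligned workspace management, lives in GeneralMatrixMatrix.h, and all names below are illustrative):

    #include <cstddef>
    #include <vector>

    // C (mc x n, column major) += alpha * A_block (mc x kc) * B_panel (kc x n)
    inline void toy_gebp_driver(const double* A, std::ptrdiff_t lhsStride,
                                const double* B, std::ptrdiff_t rhsStride,
                                double* C, std::ptrdiff_t resStride,
                                std::ptrdiff_t mc, std::ptrdiff_t kc, std::ptrdiff_t n,
                                double alpha)
    {
      typedef internal::gebp_traits<double,double> Traits;

      // Packed operands plus the unpacked-rhs workspace used by the kernel.
      // aligned_allocator keeps the packed buffers packet-aligned for pload/pstore.
      std::vector<double, aligned_allocator<double> > blockA(mc*kc), blockB(kc*n),
                                                       unpackedB(kc*Traits::WorkSpaceFactor);

      internal::gemm_pack_lhs<double, std::ptrdiff_t, Traits::mr, Traits::LhsProgress,
                              ColMajor, false, false> pack_lhs;
      internal::gemm_pack_rhs<double, std::ptrdiff_t, Traits::nr,
                              ColMajor, false, false> pack_rhs;
      internal::gebp_kernel  <double, double, std::ptrdiff_t, Traits::mr, Traits::nr,
                              false, false> gebp;

      pack_lhs(&blockA[0], A, lhsStride, kc, mc);   // pack an mc x kc block of the lhs
      pack_rhs(&blockB[0], B, rhsStride, kc, n);    // pack a  kc x n  panel of the rhs
      gebp(C, resStride, &blockA[0], &blockB[0], mc, kc, n, alpha,
           -1, -1, 0, 0, &unpackedB[0]);            // C += alpha * A_block * B_panel
    }
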
01239 
01240 /** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
01241   * \sa setCpuCacheSizes */
01242 inline std::ptrdiff_t l1CacheSize()
01243 {
01244   std::ptrdiff_t l1, l2;
01245   internal::manage_caching_sizes(GetAction, &l1, &l2);
01246   return l1;
01247 }
01248 
01249 /** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
01250   * \sa setCpuCacheSizes */
01251 inline std::ptrdiff_t l2CacheSize()
01252 {
01253   std::ptrdiff_t l1, l2;
01254   internal::manage_caching_sizes(GetAction, &l1, &l2);
01255   return l2;
01256 }
01257 
01258 /** Set the cpu L1 and L2 cache sizes (in bytes).
01259   * These values are used to adjust the size of the blocks
01260   * for the algorithms working per block.
01261   *
01262   * \sa computeProductBlockingSizes */
01263 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
01264 {
01265   internal::manage_caching_sizes(SetAction, &l1, &l2);
01266 }
01267 
01268 #endif // EIGEN_GENERAL_BLOCK_PANEL_H
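
Finally, a short usage sketch of the cache-size API declared just above (not part of this header; the functions live in the enclosing Eigen namespace, which is opened by the including headers, and the sizes used here are only examples):

    #include <cstdio>

    void report_and_tune_caches()
    {
      // Query the values currently used to derive the blocking sizes.
      std::printf("L1: %ld bytes, L2: %ld bytes\n",
                  (long)l1CacheSize(), (long)l2CacheSize());

      // Override the detected values, e.g. for a CPU with 64 KB of L1 and 2 MB of
      // L2; subsequent products pick these up through computeProductBlockingSizes().
      setCpuCacheSizes(64*1024, 2*1024*1024);
    }
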


