libflame  revision_anchor
Functions
bl1_gemm.c File Reference

(r)

Functions

void bl1_sgemm (trans1_t transa, trans1_t transb, int m, int k, int n, float *alpha, float *a, int a_rs, int a_cs, float *b, int b_rs, int b_cs, float *beta, float *c, int c_rs, int c_cs)
 
void bl1_dgemm (trans1_t transa, trans1_t transb, int m, int k, int n, double *alpha, double *a, int a_rs, int a_cs, double *b, int b_rs, int b_cs, double *beta, double *c, int c_rs, int c_cs)
 
void bl1_cgemm (trans1_t transa, trans1_t transb, int m, int k, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs, scomplex *beta, scomplex *c, int c_rs, int c_cs)
 
void bl1_zgemm (trans1_t transa, trans1_t transb, int m, int k, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs, dcomplex *beta, dcomplex *c, int c_rs, int c_cs)
 
void bl1_sgemm_blas (trans1_t transa, trans1_t transb, int m, int n, int k, float *alpha, float *a, int lda, float *b, int ldb, float *beta, float *c, int ldc)
 
void bl1_dgemm_blas (trans1_t transa, trans1_t transb, int m, int n, int k, double *alpha, double *a, int lda, double *b, int ldb, double *beta, double *c, int ldc)
 
void bl1_cgemm_blas (trans1_t transa, trans1_t transb, int m, int n, int k, scomplex *alpha, scomplex *a, int lda, scomplex *b, int ldb, scomplex *beta, scomplex *c, int ldc)
 
void bl1_zgemm_blas (trans1_t transa, trans1_t transb, int m, int n, int k, dcomplex *alpha, dcomplex *a, int lda, dcomplex *b, int ldb, dcomplex *beta, dcomplex *c, int ldc)
 

Function Documentation

◆ bl1_cgemm()

void bl1_cgemm ( trans1_t  transa,
trans1_t  transb,
int  m,
int  k,
int  n,
scomplex alpha,
scomplex a,
int  a_rs,
int  a_cs,
scomplex b,
int  b_rs,
int  b_cs,
scomplex beta,
scomplex c,
int  c_rs,
int  c_cs 
)

References bl1_c0(), bl1_c1(), bl1_callocm(), bl1_caxpymt(), bl1_cconjm(), bl1_ccopymt(), bl1_ccreate_contigm(), bl1_ccreate_contigmt(), bl1_cfree(), bl1_cfree_contigm(), bl1_cfree_saved_contigm(), bl1_cgemm_blas(), bl1_cscalm(), bl1_is_col_storage(), bl1_is_conjnotrans(), bl1_zero_dim3(), BLIS1_CONJ_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, and BLIS1_TRANSPOSE.

Referenced by FLA_Gemm_external().

536 {
537  int m_save = m;
538  int n_save = n;
539  scomplex* a_save = a;
540  scomplex* b_save = b;
541  scomplex* c_save = c;
542  int a_rs_save = a_rs;
543  int a_cs_save = a_cs;
544  int b_rs_save = b_rs;
545  int b_cs_save = b_cs;
546  int c_rs_save = c_rs;
547  int c_cs_save = c_cs;
548  scomplex zero = bl1_c0();
549  scomplex one = bl1_c1();
550  scomplex* a_unswap;
551  scomplex* b_unswap;
552  scomplex* a_conj;
553  scomplex* b_conj;
554  scomplex* c_trans;
555  int lda, inca;
556  int ldb, incb;
557  int ldc, incc;
558  int lda_conj, inca_conj;
559  int ldb_conj, incb_conj;
560  int ldc_trans, incc_trans;
561  int m_gemm, n_gemm;
562  int gemm_needs_axpyt = FALSE;
563  int a_was_copied;
564  int b_was_copied;
565 
566  // Return early if possible.
567  if ( bl1_zero_dim3( m, k, n ) )
568  {
570  m,
571  n,
572  beta,
573  c, c_rs, c_cs );
574  return;
575  }
576 
577  // If necessary, allocate, initialize, and use a temporary contiguous
578  // copy of each matrix rather than the original matrices.
579  bl1_ccreate_contigmt( transa,
580  m,
581  k,
582  a_save, a_rs_save, a_cs_save,
583  &a, &a_rs, &a_cs );
584 
585  bl1_ccreate_contigmt( transb,
586  k,
587  n,
588  b_save, b_rs_save, b_cs_save,
589  &b, &b_rs, &b_cs );
590 
592  n,
593  c_save, c_rs_save, c_cs_save,
594  &c, &c_rs, &c_cs );
595 
596  // Figure out whether A and/or B was copied to contiguous memory. This
597  // is used later to prevent redundant copying.
598  a_was_copied = ( a != a_save );
599  b_was_copied = ( b != b_save );
600 
601  // These are used to track the original values of a and b prior to any
602  // operand swapping that might take place. This is necessary for proper
603  // freeing of memory when one is a temporary contiguous matrix.
604  a_unswap = a;
605  b_unswap = b;
606 
607  // These are used to track the dimensions of the product of the
608  // A and B operands to the BLAS invocation of gemm. These differ
609  // from m and n when the operands need to be swapped.
610  m_gemm = m;
611  n_gemm = n;
612 
613  // Initialize with values assuming column-major storage.
614  lda = a_cs;
615  inca = a_rs;
616  ldb = b_cs;
617  incb = b_rs;
618  ldc = c_cs;
619  incc = c_rs;
620 
621  // Adjust the parameters based on the storage of each matrix.
622  if ( bl1_is_col_storage( c_rs, c_cs ) )
623  {
624  if ( bl1_is_col_storage( a_rs, a_cs ) )
625  {
626  if ( bl1_is_col_storage( b_rs, b_cs ) )
627  {
628  // requested operation: C_c += tr( A_c ) * tr( B_c )
629  // effective operation: C_c += tr( A_c ) * tr( B_c )
630  }
631  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
632  {
633 
634  // requested operation: C_c += tr( A_c ) * tr( B_r )
635  // effective operation: C_c += tr( A_c ) * tr( B_c )^T
636  bl1_swap_ints( ldb, incb );
637 
638  bl1_toggle_trans( transb );
639  }
640  }
641  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
642  {
643  if ( bl1_is_col_storage( b_rs, b_cs ) )
644  {
645  // requested operation: C_c += tr( A_r ) * tr( B_c )
646  // effective operation: C_c += tr( A_r )^T * tr( B_c )
647  bl1_swap_ints( lda, inca );
648 
649  bl1_toggle_trans( transa );
650  }
651  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
652  {
653  // requested operation: C_c += tr( A_r ) * tr( B_r )
654  // effective operation: C_c += ( tr( B_c ) * tr( A_c ) )^T
655  bl1_swap_ints( lda, inca );
656  bl1_swap_ints( ldb, incb );
657 
658  bl1_cswap_pointers( a, b );
659  bl1_swap_ints( a_was_copied, b_was_copied );
660  bl1_swap_ints( lda, ldb );
661  bl1_swap_ints( inca, incb );
662  bl1_swap_trans( transa, transb );
663 
664  gemm_needs_axpyt = TRUE;
665  bl1_swap_ints( m_gemm, n_gemm );
666  }
667  }
668  }
669  else // if ( bl1_is_row_storage( c_rs, c_cs ) )
670  {
671  if ( bl1_is_col_storage( a_rs, a_cs ) )
672  {
673  if ( bl1_is_col_storage( b_rs, b_cs ) )
674  {
675  // requested operation: C_r += tr( A_c ) * tr( B_c )
676  // effective operation: C_c += ( tr( A_c ) * tr( B_c ) )^T
677  bl1_swap_ints( ldc, incc );
678 
679  bl1_swap_ints( m, n );
680 
681  gemm_needs_axpyt = TRUE;
682  }
683  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
684  {
685  // requested operation: C_r += tr( A_c ) * tr( B_r )
686  // effective operation: C_c += tr( B_c ) * tr( A_c )^T
687  bl1_swap_ints( ldc, incc );
688  bl1_swap_ints( ldb, incb );
689 
690  bl1_toggle_trans( transa );
691 
692  bl1_swap_ints( m, n );
693  bl1_swap_ints( m_gemm, n_gemm );
694  bl1_cswap_pointers( a, b );
695  bl1_swap_ints( a_was_copied, b_was_copied );
696  bl1_swap_ints( lda, ldb );
697  bl1_swap_ints( inca, incb );
698  bl1_swap_trans( transa, transb );
699  }
700  }
701  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
702  {
703  if ( bl1_is_col_storage( b_rs, b_cs ) )
704  {
705  // requested operation: C_r += tr( A_r ) * tr( B_c )
706  // effective operation: C_c += tr( B_c )^T * tr( A_c )
707  bl1_swap_ints( ldc, incc );
708  bl1_swap_ints( lda, inca );
709 
710  bl1_toggle_trans( transb );
711 
712  bl1_swap_ints( m, n );
713  bl1_swap_ints( m_gemm, n_gemm );
714  bl1_cswap_pointers( a, b );
715  bl1_swap_ints( a_was_copied, b_was_copied );
716  bl1_swap_ints( lda, ldb );
717  bl1_swap_ints( inca, incb );
718  bl1_swap_trans( transa, transb );
719  }
720  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
721  {
722  // requested operation: C_r += tr( A_r ) * tr( B_r )
723  // effective operation: C_c += tr( B_c ) * tr( A_c )
724  bl1_swap_ints( lda, inca );
725  bl1_swap_ints( ldb, incb );
726  bl1_swap_ints( ldc, incc );
727 
728  bl1_swap_ints( m, n );
729  bl1_swap_ints( m_gemm, n_gemm );
730  bl1_cswap_pointers( a, b );
731  bl1_swap_ints( a_was_copied, b_was_copied );
732  bl1_swap_ints( lda, ldb );
733  bl1_swap_ints( inca, incb );
734  bl1_swap_trans( transa, transb );
735  }
736  }
737  }
738 
739  // We need a temporary matrix for the case where A is conjugated.
740  a_conj = a;
741  lda_conj = lda;
742  inca_conj = inca;
743 
744  // If transa indicates conjugate-no-transpose and A was not already
745  // copied, then copy and conjugate it to a temporary matrix. Otherwise,
746  // if transa indicates conjugate-no-transpose and A was already copied,
747  // just conjugate it.
748  if ( bl1_is_conjnotrans( transa ) && !a_was_copied )
749  {
750  a_conj = bl1_callocm( m_gemm, k );
751  lda_conj = m_gemm;
752  inca_conj = 1;
753 
755  m_gemm,
756  k,
757  a, inca, lda,
758  a_conj, inca_conj, lda_conj );
759  }
760  else if ( bl1_is_conjnotrans( transa ) && a_was_copied )
761  {
762  bl1_cconjm( m_gemm,
763  k,
764  a_conj, inca_conj, lda_conj );
765  }
766 
767  // We need a temporary matrix for the case where B is conjugated.
768  b_conj = b;
769  ldb_conj = ldb;
770  incb_conj = incb;
771 
772  // If transb indicates conjugate-no-transpose and B was not already
773  // copied, then copy and conjugate it to a temporary matrix. Otherwise,
774  // if transb indicates conjugate-no-transpose and B was already copied,
775  // just conjugate it.
776  if ( bl1_is_conjnotrans( transb ) && !b_was_copied )
777  {
778  b_conj = bl1_callocm( k, n_gemm );
779  ldb_conj = k;
780  incb_conj = 1;
781 
783  k,
784  n_gemm,
785  b, incb, ldb,
786  b_conj, incb_conj, ldb_conj );
787  }
788  else if ( bl1_is_conjnotrans( transb ) && b_was_copied )
789  {
790  bl1_cconjm( k,
791  n_gemm,
792  b_conj, incb_conj, ldb_conj );
793  }
794 
795  // There are two cases where we need to perform the gemm and then axpy
796  // the result into C with a transposition. We handle those cases here.
797  if ( gemm_needs_axpyt )
798  {
799  // We need a temporary matrix for holding C^T. Notice that m and n
800  // represent the dimensions of C, while m_gemm and n_gemm are the
801  // dimensions of the actual product op(A)*op(B), which may be n-by-m
802  // since the operands may have been swapped.
803  c_trans = bl1_callocm( m_gemm, n_gemm );
804  ldc_trans = m_gemm;
805  incc_trans = 1;
806 
807  // Compute tr( A ) * tr( B ), where A and B may have been swapped
808  // to reference the other, and store the result in C_trans.
809  bl1_cgemm_blas( transa,
810  transb,
811  m_gemm,
812  n_gemm,
813  k,
814  alpha,
815  a_conj, lda_conj,
816  b_conj, ldb_conj,
817  &zero,
818  c_trans, ldc_trans );
819 
820  // Scale C by beta.
822  m,
823  n,
824  beta,
825  c, incc, ldc );
826 
827  // And finally, accumulate the matrix product in C_trans into C
828  // with a transpose.
830  m,
831  n,
832  &one,
833  c_trans, incc_trans, ldc_trans,
834  c, incc, ldc );
835 
836  // Free the temporary matrix for C.
837  bl1_cfree( c_trans );
838  }
839  else // no extra axpyt step needed
840  {
841  bl1_cgemm_blas( transa,
842  transb,
843  m_gemm,
844  n_gemm,
845  k,
846  alpha,
847  a_conj, lda_conj,
848  b_conj, ldb_conj,
849  beta,
850  c, ldc );
851  }
852 
853  if ( bl1_is_conjnotrans( transa ) && !a_was_copied )
854  bl1_cfree( a_conj );
855 
856  if ( bl1_is_conjnotrans( transb ) && !b_was_copied )
857  bl1_cfree( b_conj );
858 
859  // Free any temporary contiguous matrices, copying the result back to
860  // the original matrix.
861  bl1_cfree_contigm( a_save, a_rs_save, a_cs_save,
862  &a_unswap, &a_rs, &a_cs );
863 
864  bl1_cfree_contigm( b_save, b_rs_save, b_cs_save,
865  &b_unswap, &b_rs, &b_cs );
866 
867  bl1_cfree_saved_contigm( m_save,
868  n_save,
869  c_save, c_rs_save, c_cs_save,
870  &c, &c_rs, &c_cs );
871 }
void bl1_cfree_saved_contigm(int m, int n, scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition: bl1_free_saved_contigm.c:59
Definition: blis_type_defs.h:81
scomplex bl1_c1(void)
Definition: bl1_constants.c:61
void bl1_ccreate_contigm(int m, int n, scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigm.c:81
scomplex bl1_c0(void)
Definition: bl1_constants.c:125
int bl1_is_conjnotrans(trans1_t trans)
Definition: bl1_is.c:25
Definition: blis_type_defs.h:55
void bl1_ccreate_contigmt(trans1_t trans_dims, int m, int n, scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigmt.c:89
void bl1_caxpymt(trans1_t trans, int m, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs)
Definition: bl1_axpymt.c:149
int bl1_zero_dim3(int m, int k, int n)
Definition: bl1_is.c:123
int bl1_is_col_storage(int rs, int cs)
Definition: bl1_is.c:90
void bl1_cconjm(int m, int n, scomplex *a, int a_rs, int a_cs)
Definition: bl1_conjm.c:23
Definition: blis_type_defs.h:56
void bl1_ccopymt(trans1_t trans, int m, int n, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs)
Definition: bl1_copymt.c:215
Definition: blis_type_defs.h:132
void bl1_cscalm(conj1_t conj, int m, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs)
Definition: bl1_scalm.c:169
scomplex * bl1_callocm(unsigned int m, unsigned int n)
Definition: bl1_allocm.c:40
void bl1_cgemm_blas(trans1_t transa, trans1_t transb, int m, int n, int k, scomplex *alpha, scomplex *a, int lda, scomplex *b, int ldb, scomplex *beta, scomplex *c, int ldc)
Definition: bl1_gemm.c:1295
void bl1_cfree(scomplex *p)
Definition: bl1_free.c:40
void bl1_cfree_contigm(scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition: bl1_free_contigm.c:45

◆ bl1_cgemm_blas()

void bl1_cgemm_blas ( trans1_t  transa,
trans1_t  transb,
int  m,
int  n,
int  k,
scomplex alpha,
scomplex a,
int  lda,
scomplex b,
int  ldb,
scomplex beta,
scomplex c,
int  ldc 
)

References bl1_param_map_to_netlib_trans(), cblas_cgemm(), CblasColMajor, and F77_cgemm().

Referenced by bl1_cgemm().

1296 {
1297 #ifdef BLIS1_ENABLE_CBLAS_INTERFACES
1298  enum CBLAS_ORDER cblas_order = CblasColMajor;
1299  enum CBLAS_TRANSPOSE cblas_transa;
1300  enum CBLAS_TRANSPOSE cblas_transb;
1301 
1302  bl1_param_map_to_netlib_trans( transa, &cblas_transa );
1303  bl1_param_map_to_netlib_trans( transb, &cblas_transb );
1304 
1305  cblas_cgemm( cblas_order,
1306  cblas_transa,
1307  cblas_transb,
1308  m,
1309  n,
1310  k,
1311  alpha,
1312  a, lda,
1313  b, ldb,
1314  beta,
1315  c, ldc );
1316 #else
1317  char blas_transa;
1318  char blas_transb;
1319 
1320  bl1_param_map_to_netlib_trans( transa, &blas_transa );
1321  bl1_param_map_to_netlib_trans( transb, &blas_transb );
1322 
1323  F77_cgemm( &blas_transa,
1324  &blas_transb,
1325  &m,
1326  &n,
1327  &k,
1328  alpha,
1329  a, &lda,
1330  b, &ldb,
1331  beta,
1332  c, &ldc );
1333 #endif
1334 }
CBLAS_ORDER
Definition: blis_prototypes_cblas.h:17
CBLAS_TRANSPOSE
Definition: blis_prototypes_cblas.h:18
void bl1_param_map_to_netlib_trans(trans1_t blis_trans, void *blas_trans)
Definition: bl1_param_map.c:15
void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc)
void F77_cgemm(char *transa, char *transb, int *m, int *n, int *k, scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb, scomplex *beta, scomplex *c, int *ldc)
Definition: blis_prototypes_cblas.h:17

◆ bl1_dgemm()

void bl1_dgemm ( trans1_t  transa,
trans1_t  transb,
int  m,
int  k,
int  n,
double *  alpha,
double *  a,
int  a_rs,
int  a_cs,
double *  b,
int  b_rs,
int  b_cs,
double *  beta,
double *  c,
int  c_rs,
int  c_cs 
)

References bl1_d0(), bl1_d1(), bl1_dallocm(), bl1_daxpymt(), bl1_dcreate_contigm(), bl1_dcreate_contigmt(), bl1_dfree(), bl1_dfree_contigm(), bl1_dfree_saved_contigm(), bl1_dgemm_blas(), bl1_dscalm(), bl1_is_col_storage(), bl1_zero_dim3(), BLIS1_NO_CONJUGATE, and BLIS1_TRANSPOSE.

Referenced by FLA_Bsvd_v_opd_var2(), FLA_Bsvd_v_opz_var2(), FLA_Gemm_external(), FLA_Tevd_v_opd_var2(), and FLA_Tevd_v_opz_var2().

275 {
276  int m_save = m;
277  int n_save = n;
278  double* a_save = a;
279  double* b_save = b;
280  double* c_save = c;
281  int a_rs_save = a_rs;
282  int a_cs_save = a_cs;
283  int b_rs_save = b_rs;
284  int b_cs_save = b_cs;
285  int c_rs_save = c_rs;
286  int c_cs_save = c_cs;
287  double zero = bl1_d0();
288  double one = bl1_d1();
289  double* a_unswap;
290  double* b_unswap;
291  double* c_trans;
292  int lda, inca;
293  int ldb, incb;
294  int ldc, incc;
295  int ldc_trans, incc_trans;
296  int m_gemm, n_gemm;
297  int gemm_needs_axpyt = FALSE;
298 
299  // Return early if possible.
300  if ( bl1_zero_dim3( m, k, n ) )
301  {
303  m,
304  n,
305  beta,
306  c, c_rs, c_cs );
307  return;
308  }
309 
310  // If necessary, allocate, initialize, and use a temporary contiguous
311  // copy of each matrix rather than the original matrices.
312  bl1_dcreate_contigmt( transa,
313  m,
314  k,
315  a_save, a_rs_save, a_cs_save,
316  &a, &a_rs, &a_cs );
317 
318  bl1_dcreate_contigmt( transb,
319  k,
320  n,
321  b_save, b_rs_save, b_cs_save,
322  &b, &b_rs, &b_cs );
323 
325  n,
326  c_save, c_rs_save, c_cs_save,
327  &c, &c_rs, &c_cs );
328 
329  // These are used to track the original values of a and b prior to any
330  // operand swapping that might take place. This is necessary for proper
331  // freeing of memory when one is a temporary contiguous matrix.
332  a_unswap = a;
333  b_unswap = b;
334 
335  // These are used to track the dimensions of the product of the
336  // A and B operands to the BLAS invocation of gemm. These differ
337  // from m and n when the operands need to be swapped.
338  m_gemm = m;
339  n_gemm = n;
340 
341  // Initialize with values assuming column-major storage.
342  lda = a_cs;
343  inca = a_rs;
344  ldb = b_cs;
345  incb = b_rs;
346  ldc = c_cs;
347  incc = c_rs;
348 
349  // Adjust the parameters based on the storage of each matrix.
350  if ( bl1_is_col_storage( c_rs, c_cs ) )
351  {
352  if ( bl1_is_col_storage( a_rs, a_cs ) )
353  {
354  if ( bl1_is_col_storage( b_rs, b_cs ) )
355  {
356  // requested operation: C_c += tr( A_c ) * tr( B_c )
357  // effective operation: C_c += tr( A_c ) * tr( B_c )
358  }
359  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
360  {
361 
362  // requested operation: C_c += tr( A_c ) * tr( B_r )
363  // effective operation: C_c += tr( A_c ) * tr( B_c )^T
364  bl1_swap_ints( ldb, incb );
365 
366  bl1_toggle_trans( transb );
367  }
368  }
369  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
370  {
371  if ( bl1_is_col_storage( b_rs, b_cs ) )
372  {
373  // requested operation: C_c += tr( A_r ) * tr( B_c )
374  // effective operation: C_c += tr( A_r )^T * tr( B_c )
375  bl1_swap_ints( lda, inca );
376 
377  bl1_toggle_trans( transa );
378  }
379  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
380  {
381  // requested operation: C_c += tr( A_r ) * tr( B_r )
382  // effective operation: C_c += ( tr( B_c ) * tr( A_c ) )^T
383  bl1_swap_ints( lda, inca );
384  bl1_swap_ints( ldb, incb );
385 
386  bl1_dswap_pointers( a, b );
387  bl1_swap_ints( lda, ldb );
388  bl1_swap_ints( inca, incb );
389  bl1_swap_trans( transa, transb );
390 
391  gemm_needs_axpyt = TRUE;
392  bl1_swap_ints( m_gemm, n_gemm );
393  }
394  }
395  }
396  else // if ( bl1_is_row_storage( c_rs, c_cs ) )
397  {
398  if ( bl1_is_col_storage( a_rs, a_cs ) )
399  {
400  if ( bl1_is_col_storage( b_rs, b_cs ) )
401  {
402  // requested operation: C_r += tr( A_c ) * tr( B_c )
403  // effective operation: C_c += ( tr( A_c ) * tr( B_c ) )^T
404  bl1_swap_ints( ldc, incc );
405 
406  bl1_swap_ints( m, n );
407 
408  gemm_needs_axpyt = TRUE;
409  }
410  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
411  {
412  // requested operation: C_r += tr( A_c ) * tr( B_r )
413  // effective operation: C_c += tr( B_c ) * tr( A_c )^T
414  bl1_swap_ints( ldc, incc );
415  bl1_swap_ints( ldb, incb );
416 
417  bl1_toggle_trans( transa );
418 
419  bl1_swap_ints( m, n );
420  bl1_swap_ints( m_gemm, n_gemm );
421  bl1_dswap_pointers( a, b );
422  bl1_swap_ints( lda, ldb );
423  bl1_swap_ints( inca, incb );
424  bl1_swap_trans( transa, transb );
425  }
426  }
427  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
428  {
429  if ( bl1_is_col_storage( b_rs, b_cs ) )
430  {
431  // requested operation: C_r += tr( A_r ) * tr( B_c )
432  // effective operation: C_c += tr( B_c )^T * tr( A_c )
433  bl1_swap_ints( ldc, incc );
434  bl1_swap_ints( lda, inca );
435 
436  bl1_toggle_trans( transb );
437 
438  bl1_swap_ints( m, n );
439  bl1_swap_ints( m_gemm, n_gemm );
440  bl1_dswap_pointers( a, b );
441  bl1_swap_ints( lda, ldb );
442  bl1_swap_ints( inca, incb );
443  bl1_swap_trans( transa, transb );
444  }
445  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
446  {
447  // requested operation: C_r += tr( A_r ) * tr( B_r )
448  // effective operation: C_c += tr( B_c ) * tr( A_c )
449  bl1_swap_ints( lda, inca );
450  bl1_swap_ints( ldb, incb );
451  bl1_swap_ints( ldc, incc );
452 
453  bl1_swap_ints( m, n );
454  bl1_swap_ints( m_gemm, n_gemm );
455  bl1_dswap_pointers( a, b );
456  bl1_swap_ints( lda, ldb );
457  bl1_swap_ints( inca, incb );
458  bl1_swap_trans( transa, transb );
459  }
460  }
461  }
462 
463  // There are two cases where we need to perform the gemm and then axpy
464  // the result into C with a transposition. We handle those cases here.
465  if ( gemm_needs_axpyt )
466  {
467  // We need a temporary matrix for holding C^T. Notice that m and n
468  // represent the dimensions of C, while m_gemm and n_gemm are the
469  // dimensions of the actual product op(A)*op(B), which may be n-by-m
470  // since the operands may have been swapped.
471  c_trans = bl1_dallocm( m_gemm, n_gemm );
472  ldc_trans = m_gemm;
473  incc_trans = 1;
474 
475  // Compute tr( A ) * tr( B ), where A and B may have been swapped
476  // to reference the other, and store the result in C_trans.
477  bl1_dgemm_blas( transa,
478  transb,
479  m_gemm,
480  n_gemm,
481  k,
482  alpha,
483  a, lda,
484  b, ldb,
485  &zero,
486  c_trans, ldc_trans );
487 
488  // Scale C by beta.
490  m,
491  n,
492  beta,
493  c, incc, ldc );
494 
495  // And finally, accumulate the matrix product in C_trans into C
496  // with a transpose.
498  m,
499  n,
500  &one,
501  c_trans, incc_trans, ldc_trans,
502  c, incc, ldc );
503 
504  // Free the temporary matrix for C.
505  bl1_dfree( c_trans );
506  }
507  else // no extra axpyt step needed
508  {
509  bl1_dgemm_blas( transa,
510  transb,
511  m_gemm,
512  n_gemm,
513  k,
514  alpha,
515  a, lda,
516  b, ldb,
517  beta,
518  c, ldc );
519  }
520 
521  // Free any temporary contiguous matrices, copying the result back to
522  // the original matrix.
523  bl1_dfree_contigm( a_save, a_rs_save, a_cs_save,
524  &a_unswap, &a_rs, &a_cs );
525 
526  bl1_dfree_contigm( b_save, b_rs_save, b_cs_save,
527  &b_unswap, &b_rs, &b_cs );
528 
529  bl1_dfree_saved_contigm( m_save,
530  n_save,
531  c_save, c_rs_save, c_cs_save,
532  &c, &c_rs, &c_cs );
533 }
Definition: blis_type_defs.h:81
double bl1_d0(void)
Definition: bl1_constants.c:118
void bl1_dfree(double *p)
Definition: bl1_free.c:35
void bl1_dcreate_contigm(int m, int n, double *a_save, int a_rs_save, int a_cs_save, double **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigm.c:47
Definition: blis_type_defs.h:55
int bl1_zero_dim3(int m, int k, int n)
Definition: bl1_is.c:123
void bl1_dcreate_contigmt(trans1_t trans_dims, int m, int n, double *a_save, int a_rs_save, int a_cs_save, double **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigmt.c:51
int bl1_is_col_storage(int rs, int cs)
Definition: bl1_is.c:90
void bl1_dfree_saved_contigm(int m, int n, double *a_save, int a_rs_save, int a_cs_save, double **a, int *a_rs, int *a_cs)
Definition: bl1_free_saved_contigm.c:36
void bl1_dfree_contigm(double *a_save, int a_rs_save, int a_cs_save, double **a, int *a_rs, int *a_cs)
Definition: bl1_free_contigm.c:29
double * bl1_dallocm(unsigned int m, unsigned int n)
Definition: bl1_allocm.c:35
void bl1_dgemm_blas(trans1_t transa, trans1_t transb, int m, int n, int k, double *alpha, double *a, int lda, double *b, int ldb, double *beta, double *c, int ldc)
Definition: bl1_gemm.c:1254
void bl1_daxpymt(trans1_t trans, int m, int n, double *alpha, double *a, int a_rs, int a_cs, double *b, int b_rs, int b_cs)
Definition: bl1_axpymt.c:81
double bl1_d1(void)
Definition: bl1_constants.c:54
void bl1_dscalm(conj1_t conj, int m, int n, double *alpha, double *a, int a_rs, int a_cs)
Definition: bl1_scalm.c:65

◆ bl1_dgemm_blas()

void bl1_dgemm_blas ( trans1_t  transa,
trans1_t  transb,
int  m,
int  n,
int  k,
double *  alpha,
double *  a,
int  lda,
double *  b,
int  ldb,
double *  beta,
double *  c,
int  ldc 
)

References bl1_param_map_to_netlib_trans(), cblas_dgemm(), CblasColMajor, and F77_dgemm().

Referenced by bl1_dgemm().

1255 {
1256 #ifdef BLIS1_ENABLE_CBLAS_INTERFACES
1257  enum CBLAS_ORDER cblas_order = CblasColMajor;
1258  enum CBLAS_TRANSPOSE cblas_transa;
1259  enum CBLAS_TRANSPOSE cblas_transb;
1260 
1261  bl1_param_map_to_netlib_trans( transa, &cblas_transa );
1262  bl1_param_map_to_netlib_trans( transb, &cblas_transb );
1263 
1264  cblas_dgemm( cblas_order,
1265  cblas_transa,
1266  cblas_transb,
1267  m,
1268  n,
1269  k,
1270  *alpha,
1271  a, lda,
1272  b, ldb,
1273  *beta,
1274  c, ldc );
1275 #else
1276  char blas_transa;
1277  char blas_transb;
1278 
1279  bl1_param_map_to_netlib_trans( transa, &blas_transa );
1280  bl1_param_map_to_netlib_trans( transb, &blas_transb );
1281 
1282  F77_dgemm( &blas_transa,
1283  &blas_transb,
1284  &m,
1285  &n,
1286  &k,
1287  alpha,
1288  a, &lda,
1289  b, &ldb,
1290  beta,
1291  c, &ldc );
1292 #endif
1293 }
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc)
CBLAS_ORDER
Definition: blis_prototypes_cblas.h:17
CBLAS_TRANSPOSE
Definition: blis_prototypes_cblas.h:18
void bl1_param_map_to_netlib_trans(trans1_t blis_trans, void *blas_trans)
Definition: bl1_param_map.c:15
Definition: blis_prototypes_cblas.h:17
void F77_dgemm(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc)

◆ bl1_sgemm()

void bl1_sgemm ( trans1_t  transa,
trans1_t  transb,
int  m,
int  k,
int  n,
float *  alpha,
float *  a,
int  a_rs,
int  a_cs,
float *  b,
int  b_rs,
int  b_cs,
float *  beta,
float *  c,
int  c_rs,
int  c_cs 
)

References bl1_is_col_storage(), bl1_s0(), bl1_s1(), bl1_sallocm(), bl1_saxpymt(), bl1_screate_contigm(), bl1_screate_contigmt(), bl1_sfree(), bl1_sfree_contigm(), bl1_sfree_saved_contigm(), bl1_sgemm_blas(), bl1_sscalm(), bl1_zero_dim3(), BLIS1_NO_CONJUGATE, and BLIS1_TRANSPOSE.

Referenced by FLA_Gemm_external().

14 {
15  int m_save = m;
16  int n_save = n;
17  float* a_save = a;
18  float* b_save = b;
19  float* c_save = c;
20  int a_rs_save = a_rs;
21  int a_cs_save = a_cs;
22  int b_rs_save = b_rs;
23  int b_cs_save = b_cs;
24  int c_rs_save = c_rs;
25  int c_cs_save = c_cs;
26  float zero = bl1_s0();
27  float one = bl1_s1();
28  float* a_unswap;
29  float* b_unswap;
30  float* c_trans;
31  int lda, inca;
32  int ldb, incb;
33  int ldc, incc;
34  int ldc_trans, incc_trans;
35  int m_gemm, n_gemm;
36  int gemm_needs_axpyt = FALSE;
37 
38  // Return early if possible.
39  if ( bl1_zero_dim3( m, k, n ) )
40  {
42  m,
43  n,
44  beta,
45  c, c_rs, c_cs );
46  return;
47  }
48 
49  // If necessary, allocate, initialize, and use a temporary contiguous
50  // copy of each matrix rather than the original matrices.
51  bl1_screate_contigmt( transa,
52  m,
53  k,
54  a_save, a_rs_save, a_cs_save,
55  &a, &a_rs, &a_cs );
56 
57  bl1_screate_contigmt( transb,
58  k,
59  n,
60  b_save, b_rs_save, b_cs_save,
61  &b, &b_rs, &b_cs );
62 
64  n,
65  c_save, c_rs_save, c_cs_save,
66  &c, &c_rs, &c_cs );
67 
68  // These are used to track the original values of a and b prior to any
69  // operand swapping that might take place. This is necessary for proper
70  // freeing of memory when one is a temporary contiguous matrix.
71  a_unswap = a;
72  b_unswap = b;
73 
74  // These are used to track the dimensions of the product of the
75  // A and B operands to the BLAS invocation of gemm. These differ
76  // from m and n when the operands need to be swapped.
77  m_gemm = m;
78  n_gemm = n;
79 
80  // Initialize with values assuming column-major storage.
81  lda = a_cs;
82  inca = a_rs;
83  ldb = b_cs;
84  incb = b_rs;
85  ldc = c_cs;
86  incc = c_rs;
87 
88  // Adjust the parameters based on the storage of each matrix.
89  if ( bl1_is_col_storage( c_rs, c_cs ) )
90  {
91  if ( bl1_is_col_storage( a_rs, a_cs ) )
92  {
93  if ( bl1_is_col_storage( b_rs, b_cs ) )
94  {
95  // requested operation: C_c += tr( A_c ) * tr( B_c )
96  // effective operation: C_c += tr( A_c ) * tr( B_c )
97  }
98  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
99  {
100 
101  // requested operation: C_c += tr( A_c ) * tr( B_r )
102  // effective operation: C_c += tr( A_c ) * tr( B_c )^T
103  bl1_swap_ints( ldb, incb );
104 
105  bl1_toggle_trans( transb );
106  }
107  }
108  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
109  {
110  if ( bl1_is_col_storage( b_rs, b_cs ) )
111  {
112  // requested operation: C_c += tr( A_r ) * tr( B_c )
113  // effective operation: C_c += tr( A_r )^T * tr( B_c )
114  bl1_swap_ints( lda, inca );
115 
116  bl1_toggle_trans( transa );
117  }
118  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
119  {
120  // requested operation: C_c += tr( A_r ) * tr( B_r )
121  // effective operation: C_c += ( tr( B_c ) * tr( A_c ) )^T
122  bl1_swap_ints( lda, inca );
123  bl1_swap_ints( ldb, incb );
124 
125  bl1_sswap_pointers( a, b );
126  bl1_swap_ints( lda, ldb );
127  bl1_swap_ints( inca, incb );
128  bl1_swap_trans( transa, transb );
129 
130  gemm_needs_axpyt = TRUE;
131  bl1_swap_ints( m_gemm, n_gemm );
132  }
133  }
134  }
135  else // if ( bl1_is_row_storage( c_rs, c_cs ) )
136  {
137  if ( bl1_is_col_storage( a_rs, a_cs ) )
138  {
139  if ( bl1_is_col_storage( b_rs, b_cs ) )
140  {
141  // requested operation: C_r += tr( A_c ) * tr( B_c )
142  // effective operation: C_c += ( tr( A_c ) * tr( B_c ) )^T
143  bl1_swap_ints( ldc, incc );
144 
145  bl1_swap_ints( m, n );
146 
147  gemm_needs_axpyt = TRUE;
148  }
149  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
150  {
151  // requested operation: C_r += tr( A_c ) * tr( B_r )
152  // effective operation: C_c += tr( B_c ) * tr( A_c )^T
153  bl1_swap_ints( ldc, incc );
154  bl1_swap_ints( ldb, incb );
155 
156  bl1_toggle_trans( transa );
157 
158  bl1_swap_ints( m, n );
159  bl1_swap_ints( m_gemm, n_gemm );
160  bl1_sswap_pointers( a, b );
161  bl1_swap_ints( lda, ldb );
162  bl1_swap_ints( inca, incb );
163  bl1_swap_trans( transa, transb );
164  }
165  }
166  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
167  {
168  if ( bl1_is_col_storage( b_rs, b_cs ) )
169  {
170  // requested operation: C_r += tr( A_r ) * tr( B_c )
171  // effective operation: C_c += tr( B_c )^T * tr( A_c )
172  bl1_swap_ints( ldc, incc );
173  bl1_swap_ints( lda, inca );
174 
175  bl1_toggle_trans( transb );
176 
177  bl1_swap_ints( m, n );
178  bl1_swap_ints( m_gemm, n_gemm );
179  bl1_sswap_pointers( a, b );
180  bl1_swap_ints( lda, ldb );
181  bl1_swap_ints( inca, incb );
182  bl1_swap_trans( transa, transb );
183  }
184  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
185  {
186  // requested operation: C_r += tr( A_r ) * tr( B_r )
187  // effective operation: C_c += tr( B_c ) * tr( A_c )
188  bl1_swap_ints( lda, inca );
189  bl1_swap_ints( ldb, incb );
190  bl1_swap_ints( ldc, incc );
191 
192  bl1_swap_ints( m, n );
193  bl1_swap_ints( m_gemm, n_gemm );
194  bl1_sswap_pointers( a, b );
195  bl1_swap_ints( lda, ldb );
196  bl1_swap_ints( inca, incb );
197  bl1_swap_trans( transa, transb );
198  }
199  }
200  }
201 
202  // There are two cases where we need to perform the gemm and then axpy
203  // the result into C with a transposition. We handle those cases here.
204  if ( gemm_needs_axpyt )
205  {
206  // We need a temporary matrix for holding C^T. Notice that m and n
207  // represent the dimensions of C, while m_gemm and n_gemm are the
208  // dimensions of the actual product op(A)*op(B), which may be n-by-m
209  // since the operands may have been swapped.
210  c_trans = bl1_sallocm( m_gemm, n_gemm );
211  ldc_trans = m_gemm;
212  incc_trans = 1;
213 
214  // Compute tr( A ) * tr( B ), where A and B may have been swapped
215  // to reference the other, and store the result in C_trans.
216  bl1_sgemm_blas( transa,
217  transb,
218  m_gemm,
219  n_gemm,
220  k,
221  alpha,
222  a, lda,
223  b, ldb,
224  &zero,
225  c_trans, ldc_trans );
226 
227  // Scale C by beta.
229  m,
230  n,
231  beta,
232  c, incc, ldc );
233 
234  // And finally, accumulate the matrix product in C_trans into C
235  // with a transpose.
237  m,
238  n,
239  &one,
240  c_trans, incc_trans, ldc_trans,
241  c, incc, ldc );
242 
243  // Free the temporary matrix for C.
244  bl1_sfree( c_trans );
245  }
246  else // no extra axpyt step needed
247  {
248  bl1_sgemm_blas( transa,
249  transb,
250  m_gemm,
251  n_gemm,
252  k,
253  alpha,
254  a, lda,
255  b, ldb,
256  beta,
257  c, ldc );
258  }
259 
260  // Free any temporary contiguous matrices, copying the result back to
261  // the original matrix.
262  bl1_sfree_contigm( a_save, a_rs_save, a_cs_save,
263  &a_unswap, &a_rs, &a_cs );
264 
265  bl1_sfree_contigm( b_save, b_rs_save, b_cs_save,
266  &b_unswap, &b_rs, &b_cs );
267 
268  bl1_sfree_saved_contigm( m_save,
269  n_save,
270  c_save, c_rs_save, c_cs_save,
271  &c, &c_rs, &c_cs );
272 }
void bl1_sfree_saved_contigm(int m, int n, float *a_save, int a_rs_save, int a_cs_save, float **a, int *a_rs, int *a_cs)
Definition: bl1_free_saved_contigm.c:13
float * bl1_sallocm(unsigned int m, unsigned int n)
Definition: bl1_allocm.c:30
float bl1_s1(void)
Definition: bl1_constants.c:47
Definition: blis_type_defs.h:81
void bl1_sfree(float *p)
Definition: bl1_free.c:30
void bl1_screate_contigm(int m, int n, float *a_save, int a_rs_save, int a_cs_save, float **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigm.c:13
Definition: blis_type_defs.h:55
void bl1_sgemm_blas(trans1_t transa, trans1_t transb, int m, int n, int k, float *alpha, float *a, int lda, float *b, int ldb, float *beta, float *c, int ldc)
Definition: bl1_gemm.c:1213
int bl1_zero_dim3(int m, int k, int n)
Definition: bl1_is.c:123
int bl1_is_col_storage(int rs, int cs)
Definition: bl1_is.c:90
void bl1_sfree_contigm(float *a_save, int a_rs_save, int a_cs_save, float **a, int *a_rs, int *a_cs)
Definition: bl1_free_contigm.c:13
void bl1_sscalm(conj1_t conj, int m, int n, float *alpha, float *a, int a_rs, int a_cs)
Definition: bl1_scalm.c:13
void bl1_saxpymt(trans1_t trans, int m, int n, float *alpha, float *a, int a_rs, int a_cs, float *b, int b_rs, int b_cs)
Definition: bl1_axpymt.c:13
void bl1_screate_contigmt(trans1_t trans_dims, int m, int n, float *a_save, int a_rs_save, int a_cs_save, float **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigmt.c:13
float bl1_s0(void)
Definition: bl1_constants.c:111

◆ bl1_sgemm_blas()

void bl1_sgemm_blas ( trans1_t  transa,
trans1_t  transb,
int  m,
int  n,
int  k,
float *  alpha,
float *  a,
int  lda,
float *  b,
int  ldb,
float *  beta,
float *  c,
int  ldc 
)

References bl1_param_map_to_netlib_trans(), cblas_sgemm(), CblasColMajor, and F77_sgemm().

Referenced by bl1_sgemm().

1214 {
1215 #ifdef BLIS1_ENABLE_CBLAS_INTERFACES
1216  enum CBLAS_ORDER cblas_order = CblasColMajor;
1217  enum CBLAS_TRANSPOSE cblas_transa;
1218  enum CBLAS_TRANSPOSE cblas_transb;
1219 
1220  bl1_param_map_to_netlib_trans( transa, &cblas_transa );
1221  bl1_param_map_to_netlib_trans( transb, &cblas_transb );
1222 
1223  cblas_sgemm( cblas_order,
1224  cblas_transa,
1225  cblas_transb,
1226  m,
1227  n,
1228  k,
1229  *alpha,
1230  a, lda,
1231  b, ldb,
1232  *beta,
1233  c, ldc );
1234 #else
1235  char blas_transa;
1236  char blas_transb;
1237 
1238  bl1_param_map_to_netlib_trans( transa, &blas_transa );
1239  bl1_param_map_to_netlib_trans( transb, &blas_transb );
1240 
1241  F77_sgemm( &blas_transa,
1242  &blas_transb,
1243  &m,
1244  &n,
1245  &k,
1246  alpha,
1247  a, &lda,
1248  b, &ldb,
1249  beta,
1250  c, &ldc );
1251 #endif
1252 }
CBLAS_ORDER
Definition: blis_prototypes_cblas.h:17
CBLAS_TRANSPOSE
Definition: blis_prototypes_cblas.h:18
void F77_sgemm(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc)
void bl1_param_map_to_netlib_trans(trans1_t blis_trans, void *blas_trans)
Definition: bl1_param_map.c:15
Definition: blis_prototypes_cblas.h:17
void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc)

◆ bl1_zgemm()

void bl1_zgemm ( trans1_t  transa,
trans1_t  transb,
int  m,
int  k,
int  n,
dcomplex alpha,
dcomplex a,
int  a_rs,
int  a_cs,
dcomplex b,
int  b_rs,
int  b_cs,
dcomplex beta,
dcomplex c,
int  c_rs,
int  c_cs 
)

References bl1_is_col_storage(), bl1_is_conjnotrans(), bl1_z0(), bl1_z1(), bl1_zallocm(), bl1_zaxpymt(), bl1_zconjm(), bl1_zcopymt(), bl1_zcreate_contigm(), bl1_zcreate_contigmt(), bl1_zero_dim3(), bl1_zfree(), bl1_zfree_contigm(), bl1_zfree_saved_contigm(), bl1_zgemm_blas(), bl1_zscalm(), BLIS1_CONJ_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, and BLIS1_TRANSPOSE.

Referenced by FLA_Gemm_external().

874 {
875  int m_save = m;
876  int n_save = n;
877  dcomplex* a_save = a;
878  dcomplex* b_save = b;
879  dcomplex* c_save = c;
880  int a_rs_save = a_rs;
881  int a_cs_save = a_cs;
882  int b_rs_save = b_rs;
883  int b_cs_save = b_cs;
884  int c_rs_save = c_rs;
885  int c_cs_save = c_cs;
886  dcomplex zero = bl1_z0();
887  dcomplex one = bl1_z1();
888  dcomplex* a_unswap;
889  dcomplex* b_unswap;
890  dcomplex* a_conj;
891  dcomplex* b_conj;
892  dcomplex* c_trans;
893  int lda, inca;
894  int ldb, incb;
895  int ldc, incc;
896  int lda_conj, inca_conj;
897  int ldb_conj, incb_conj;
898  int ldc_trans, incc_trans;
899  int m_gemm, n_gemm;
900  int gemm_needs_axpyt = FALSE;
901  int a_was_copied;
902  int b_was_copied;
903 
904  // Return early if possible.
905  if ( bl1_zero_dim3( m, k, n ) )
906  {
908  m,
909  n,
910  beta,
911  c, c_rs, c_cs );
912  return;
913  }
914 
915  // If necessary, allocate, initialize, and use a temporary contiguous
916  // copy of each matrix rather than the original matrices.
917  bl1_zcreate_contigmt( transa,
918  m,
919  k,
920  a_save, a_rs_save, a_cs_save,
921  &a, &a_rs, &a_cs );
922 
923  bl1_zcreate_contigmt( transb,
924  k,
925  n,
926  b_save, b_rs_save, b_cs_save,
927  &b, &b_rs, &b_cs );
928 
930  n,
931  c_save, c_rs_save, c_cs_save,
932  &c, &c_rs, &c_cs );
933 
934  // Figure out whether A and/or B was copied to contiguous memory. This
935  // is used later to prevent redundant copying.
936  a_was_copied = ( a != a_save );
937  b_was_copied = ( b != b_save );
938 
939  // These are used to track the original values of a and b prior to any
940  // operand swapping that might take place. This is necessary for proper
941  // freeing of memory when one is a temporary contiguous matrix.
942  a_unswap = a;
943  b_unswap = b;
944 
945  // These are used to track the dimensions of the product of the
946  // A and B operands to the BLAS invocation of gemm. These differ
947  // from m and n when the operands need to be swapped.
948  m_gemm = m;
949  n_gemm = n;
950 
951  // Initialize with values assuming column-major storage.
952  lda = a_cs;
953  inca = a_rs;
954  ldb = b_cs;
955  incb = b_rs;
956  ldc = c_cs;
957  incc = c_rs;
958 
959  // Adjust the parameters based on the storage of each matrix.
960  if ( bl1_is_col_storage( c_rs, c_cs ) )
961  {
962  if ( bl1_is_col_storage( a_rs, a_cs ) )
963  {
964  if ( bl1_is_col_storage( b_rs, b_cs ) )
965  {
966  // requested operation: C_c += tr( A_c ) * tr( B_c )
967  // effective operation: C_c += tr( A_c ) * tr( B_c )
968  }
969  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
970  {
971 
972  // requested operation: C_c += tr( A_c ) * tr( B_r )
973  // effective operation: C_c += tr( A_c ) * tr( B_c )^T
974  bl1_swap_ints( ldb, incb );
975 
976  bl1_toggle_trans( transb );
977  }
978  }
979  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
980  {
981  if ( bl1_is_col_storage( b_rs, b_cs ) )
982  {
983  // requested operation: C_c += tr( A_r ) * tr( B_c )
984  // effective operation: C_c += tr( A_r )^T * tr( B_c )
985  bl1_swap_ints( lda, inca );
986 
987  bl1_toggle_trans( transa );
988  }
989  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
990  {
991  // requested operation: C_c += tr( A_r ) * tr( B_r )
992  // effective operation: C_c += ( tr( B_c ) * tr( A_c ) )^T
993  bl1_swap_ints( lda, inca );
994  bl1_swap_ints( ldb, incb );
995 
996  bl1_zswap_pointers( a, b );
997  bl1_swap_ints( a_was_copied, b_was_copied );
998  bl1_swap_ints( lda, ldb );
999  bl1_swap_ints( inca, incb );
1000  bl1_swap_trans( transa, transb );
1001 
1002  gemm_needs_axpyt = TRUE;
1003  bl1_swap_ints( m_gemm, n_gemm );
1004  }
1005  }
1006  }
1007  else // if ( bl1_is_row_storage( c_rs, c_cs ) )
1008  {
1009  if ( bl1_is_col_storage( a_rs, a_cs ) )
1010  {
1011  if ( bl1_is_col_storage( b_rs, b_cs ) )
1012  {
1013  // requested operation: C_r += tr( A_c ) * tr( B_c )
1014  // effective operation: C_c += ( tr( A_c ) * tr( B_c ) )^T
1015  bl1_swap_ints( ldc, incc );
1016 
1017  bl1_swap_ints( m, n );
1018 
1019  gemm_needs_axpyt = TRUE;
1020  }
1021  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
1022  {
1023  // requested operation: C_r += tr( A_c ) * tr( B_r )
1024  // effective operation: C_c += tr( B_c ) * tr( A_c )^T
1025  bl1_swap_ints( ldc, incc );
1026  bl1_swap_ints( ldb, incb );
1027 
1028  bl1_toggle_trans( transa );
1029 
1030  bl1_swap_ints( m, n );
1031  bl1_swap_ints( m_gemm, n_gemm );
1032  bl1_zswap_pointers( a, b );
1033  bl1_swap_ints( a_was_copied, b_was_copied );
1034  bl1_swap_ints( lda, ldb );
1035  bl1_swap_ints( inca, incb );
1036  bl1_swap_trans( transa, transb );
1037  }
1038  }
1039  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
1040  {
1041  if ( bl1_is_col_storage( b_rs, b_cs ) )
1042  {
1043  // requested operation: C_r += tr( A_r ) * tr( B_c )
1044  // effective operation: C_c += tr( B_c )^T * tr( A_c )
1045  bl1_swap_ints( ldc, incc );
1046  bl1_swap_ints( lda, inca );
1047 
1048  bl1_toggle_trans( transb );
1049 
1050  bl1_swap_ints( m, n );
1051  bl1_swap_ints( m_gemm, n_gemm );
1052  bl1_zswap_pointers( a, b );
1053  bl1_swap_ints( a_was_copied, b_was_copied );
1054  bl1_swap_ints( lda, ldb );
1055  bl1_swap_ints( inca, incb );
1056  bl1_swap_trans( transa, transb );
1057  }
1058  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
1059  {
1060  // requested operation: C_r += tr( A_r ) * tr( B_r )
1061  // effective operation: C_c += tr( B_c ) * tr( A_c )
1062  bl1_swap_ints( lda, inca );
1063  bl1_swap_ints( ldb, incb );
1064  bl1_swap_ints( ldc, incc );
1065 
1066  bl1_swap_ints( m, n );
1067  bl1_swap_ints( m_gemm, n_gemm );
1068  bl1_zswap_pointers( a, b );
1069  bl1_swap_ints( a_was_copied, b_was_copied );
1070  bl1_swap_ints( lda, ldb );
1071  bl1_swap_ints( inca, incb );
1072  bl1_swap_trans( transa, transb );
1073  }
1074  }
1075  }
1076 
1077  // We need a temporary matrix for the case where A is conjugated.
1078  a_conj = a;
1079  lda_conj = lda;
1080  inca_conj = inca;
1081 
1082  // If transa indicates conjugate-no-transpose and A was not already
1083  // copied, then copy and conjugate it to a temporary matrix. Otherwise,
1084  // if transa indicates conjugate-no-transpose and A was already copied,
1085  // just conjugate it.
1086  if ( bl1_is_conjnotrans( transa ) && !a_was_copied )
1087  {
1088  a_conj = bl1_zallocm( m_gemm, k );
1089  lda_conj = m_gemm;
1090  inca_conj = 1;
1091 
1093  m_gemm,
1094  k,
1095  a, inca, lda,
1096  a_conj, inca_conj, lda_conj );
1097  }
1098  else if ( bl1_is_conjnotrans( transa ) && a_was_copied )
1099  {
1100  bl1_zconjm( m_gemm,
1101  k,
1102  a_conj, inca_conj, lda_conj );
1103  }
1104 
1105  // We need a temporary matrix for the case where B is conjugated.
1106  b_conj = b;
1107  ldb_conj = ldb;
1108  incb_conj = incb;
1109 
1110  // If transb indicates conjugate-no-transpose and B was not already
1111  // copied, then copy and conjugate it to a temporary matrix. Otherwise,
1112  // if transb indicates conjugate-no-transpose and B was already copied,
1113  // just conjugate it.
1114  if ( bl1_is_conjnotrans( transb ) && !b_was_copied )
1115  {
1116  b_conj = bl1_zallocm( k, n_gemm );
1117  ldb_conj = k;
1118  incb_conj = 1;
1119 
1121  k,
1122  n_gemm,
1123  b, incb, ldb,
1124  b_conj, incb_conj, ldb_conj );
1125  }
1126  else if ( bl1_is_conjnotrans( transb ) && b_was_copied )
1127  {
1128  bl1_zconjm( k,
1129  n_gemm,
1130  b_conj, incb_conj, ldb_conj );
1131  }
1132 
1133  // There are two cases where we need to perform the gemm and then axpy
1134  // the result into C with a transposition. We handle those cases here.
1135  if ( gemm_needs_axpyt )
1136  {
1137  // We need a temporary matrix for holding C^T. Notice that m and n
1138  // represent the dimensions of C, while m_gemm and n_gemm are the
1139  // dimensions of the actual product op(A)*op(B), which may be n-by-m
1140  // since the operands may have been swapped.
1141  c_trans = bl1_zallocm( m_gemm, n_gemm );
1142  ldc_trans = m_gemm;
1143  incc_trans = 1;
1144 
1145  // Compute tr( A ) * tr( B ), where A and B may have been swapped
1146  // to reference the other, and store the result in C_trans.
1147  bl1_zgemm_blas( transa,
1148  transb,
1149  m_gemm,
1150  n_gemm,
1151  k,
1152  alpha,
1153  a_conj, lda_conj,
1154  b_conj, ldb_conj,
1155  &zero,
1156  c_trans, ldc_trans );
1157 
1158  // Scale C by beta.
1160  m,
1161  n,
1162  beta,
1163  c, incc, ldc );
1164 
1165  // And finally, accumulate the matrix product in C_trans into C
1166  // with a transpose.
1168  m,
1169  n,
1170  &one,
1171  c_trans, incc_trans, ldc_trans,
1172  c, incc, ldc );
1173 
1174  // Free the temporary matrix for C.
1175  bl1_zfree( c_trans );
1176  }
1177  else // no extra axpyt step needed
1178  {
1179  bl1_zgemm_blas( transa,
1180  transb,
1181  m_gemm,
1182  n_gemm,
1183  k,
1184  alpha,
1185  a_conj, lda_conj,
1186  b_conj, ldb_conj,
1187  beta,
1188  c, ldc );
1189  }
1190 
1191  if ( bl1_is_conjnotrans( transa ) && !a_was_copied )
1192  bl1_zfree( a_conj );
1193 
1194  if ( bl1_is_conjnotrans( transb ) && !b_was_copied )
1195  bl1_zfree( b_conj );
1196 
1197  // Free any temporary contiguous matrices, copying the result back to
1198  // the original matrix.
1199  bl1_zfree_contigm( a_save, a_rs_save, a_cs_save,
1200  &a_unswap, &a_rs, &a_cs );
1201 
1202  bl1_zfree_contigm( b_save, b_rs_save, b_cs_save,
1203  &b_unswap, &b_rs, &b_cs );
1204 
1205  bl1_zfree_saved_contigm( m_save,
1206  n_save,
1207  c_save, c_rs_save, c_cs_save,
1208  &c, &c_rs, &c_cs );
1209 }
void bl1_zcreate_contigm(int m, int n, dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigm.c:115
void bl1_zcreate_contigmt(trans1_t trans_dims, int m, int n, dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigmt.c:127
dcomplex bl1_z0(void)
Definition: bl1_constants.c:133
Definition: blis_type_defs.h:81
dcomplex * bl1_zallocm(unsigned int m, unsigned int n)
Definition: bl1_allocm.c:45
void bl1_zscalm(conj1_t conj, int m, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs)
Definition: bl1_scalm.c:273
void bl1_zfree(dcomplex *p)
Definition: bl1_free.c:45
int bl1_is_conjnotrans(trans1_t trans)
Definition: bl1_is.c:25
Definition: blis_type_defs.h:55
int bl1_zero_dim3(int m, int k, int n)
Definition: bl1_is.c:123
void bl1_zaxpymt(trans1_t trans, int m, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs)
Definition: bl1_axpymt.c:248
int bl1_is_col_storage(int rs, int cs)
Definition: bl1_is.c:90
void bl1_zfree_saved_contigm(int m, int n, dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition: bl1_free_saved_contigm.c:82
Definition: blis_type_defs.h:56
void bl1_zcopymt(trans1_t trans, int m, int n, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs)
Definition: bl1_copymt.c:286
void bl1_zfree_contigm(dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition: bl1_free_contigm.c:61
void bl1_zconjm(int m, int n, dcomplex *a, int a_rs, int a_cs)
Definition: bl1_conjm.c:72
void bl1_zgemm_blas(trans1_t transa, trans1_t transb, int m, int n, int k, dcomplex *alpha, dcomplex *a, int lda, dcomplex *b, int ldb, dcomplex *beta, dcomplex *c, int ldc)
Definition: bl1_gemm.c:1336
Definition: blis_type_defs.h:137
dcomplex bl1_z1(void)
Definition: bl1_constants.c:69

◆ bl1_zgemm_blas()

void bl1_zgemm_blas ( trans1_t  transa,
trans1_t  transb,
int  m,
int  n,
int  k,
dcomplex alpha,
dcomplex a,
int  lda,
dcomplex b,
int  ldb,
dcomplex beta,
dcomplex c,
int  ldc 
)

References bl1_param_map_to_netlib_trans(), cblas_zgemm(), CblasColMajor, and F77_zgemm().

Referenced by bl1_zgemm().

1337 {
1338 #ifdef BLIS1_ENABLE_CBLAS_INTERFACES
1339  enum CBLAS_ORDER cblas_order = CblasColMajor;
1340  enum CBLAS_TRANSPOSE cblas_transa;
1341  enum CBLAS_TRANSPOSE cblas_transb;
1342 
1343  bl1_param_map_to_netlib_trans( transa, &cblas_transa );
1344  bl1_param_map_to_netlib_trans( transb, &cblas_transb );
1345 
1346  cblas_zgemm( cblas_order,
1347  cblas_transa,
1348  cblas_transb,
1349  m,
1350  n,
1351  k,
1352  alpha,
1353  a, lda,
1354  b, ldb,
1355  beta,
1356  c, ldc );
1357 #else
1358  char blas_transa;
1359  char blas_transb;
1360 
1361  bl1_param_map_to_netlib_trans( transa, &blas_transa );
1362  bl1_param_map_to_netlib_trans( transb, &blas_transb );
1363 
1364  F77_zgemm( &blas_transa,
1365  &blas_transb,
1366  &m,
1367  &n,
1368  &k,
1369  alpha,
1370  a, &lda,
1371  b, &ldb,
1372  beta,
1373  c, &ldc );
1374 #endif
1375 }
CBLAS_ORDER
Definition: blis_prototypes_cblas.h:17
void F77_zgemm(char *transa, char *transb, int *m, int *n, int *k, dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb, dcomplex *beta, dcomplex *c, int *ldc)
CBLAS_TRANSPOSE
Definition: blis_prototypes_cblas.h:18
void bl1_param_map_to_netlib_trans(trans1_t blis_trans, void *blas_trans)
Definition: bl1_param_map.c:15
Definition: blis_prototypes_cblas.h:17
void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc)