Tile-level partitioning in jr/ir loops (ex-trsm). (#695)
Details:
- Reimplemented parallelization of the JR loop in gemmt (which is
recycled for herk, her2k, syrk, and syr2k). Previously, the
rectangular region of the current MC x NC panel of C would be
    parallelized separately from the diagonal region of that same
submatrix, with the rectangular portion being assigned to threads via
slab or round-robin (rr) partitioning (as determined at configure-
time) and the diagonal region being assigned via round-robin. This
approach did not work well when extracting lots of parallelism from
the JR loop and was often suboptimal even for smaller degrees of
parallelism. This commit implements tile-level load balancing (tlb) in
which the IR loop is effectively subjugated in service of more
equitably dividing work in the JR loop. This approach is especially
    potent for certain situations where the diagonal region of the MC x NC
    panel of C is significant relative to the entire region. However, it
also seems to benefit many problem sizes of other level-3 operations
(excluding trsm, which has an inherent algorithmic dependency in the
IR loop that prevents the application of tlb). For now, tlb is
implemented as _var2b.c macrokernels for gemm (which forms the basis
for gemm, hemm, and symm), gemmt (which forms the basis of herk,
her2k, syrk, and syr2k), and trmm (which forms the basis of trmm and
trmm3). Which function pointers (_var2() or _var2b()) are embedded in
the control tree will depend on whether the BLIS_ENABLE_JRIR_TLB cpp
macro is defined, which is controlled by the value passed to the
existing --thread-part-jrir=METHOD (or -r METHOD) configure option.
    The configure script now accepts 'tlb' as a valid value alongside the
    previously supported values of 'slab' and 'rr'. ('slab' is still the
    default.)
Thanks to Leick Robinson for abstractly inspiring this work, and to
Minh Quan Ho for inquiring (in PR #562, and before that in Issue #437)
about the possibility of improved load balance in macrokernel loops,
and even prototyping what it might look like, long before I fully
understood the problem.
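  The core idea of tlb can be sketched as follows. This is an illustrative
  fragment, not the actual BLIS code: the names tlb_range() and
  tile_range_t are hypothetical, and the real macrokernels must also skip
  microtiles that fall outside the stored region of C. The sketch shows how
  the jr and ir loops are fused into one flat microtile index space that is
  then slab-partitioned evenly across threads.

  ```c
  #include <stddef.h>

  /* Hypothetical helper (not the BLIS API): a contiguous range of
     microtiles, counted across the fused jr/ir loops. */
  typedef struct { size_t start; size_t end; } tile_range_t;

  /* Flat tile index space: tile t maps to jr = t / m_iter and
     ir = t % m_iter, where m_iter x n_iter microtiles cover the current
     MC x NC panel of C. Each thread receives a contiguous slab of tiles,
     so work is balanced at tile granularity rather than by whole JR
     iterations. */
  tile_range_t tlb_range( size_t m_iter, size_t n_iter,
                          size_t nt, size_t tid )
  {
      const size_t n_tiles = m_iter * n_iter;
      const size_t lo      = n_tiles / nt;  /* tiles per thread (floor) */
      const size_t rem     = n_tiles % nt;  /* leftover tiles           */

      /* Threads 0..rem-1 each take one extra tile so that no thread has
         more than one tile beyond any other. */
      const size_t start = tid * lo + ( tid < rem ? tid : rem );
      const size_t end   = start + lo + ( tid < rem ? 1 : 0 );

      return ( tile_range_t ){ start, end };
  }
  ```

  For example, 12 tiles over 5 threads split as 3, 3, 2, 2, 2 consecutive
  tiles, rather than whole jr-loop columns of possibly very different cost.
  
  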
- In bli_thread_range_weighted_sub(), tweaked the way we compute the
area of the current MC x NC trapezoidal panel of C by better taking
into account the microtile structure along the diagonal. Previously,
    it was an underestimate, as it assumed MR = NR = 1 (that is, it
    assumed that the microtile column of C that overlapped with the
    diagonal coincided exactly with the diagonal). Now, we only assume
    MR = NR.
This is still a slight underestimate when MR != NR, so the additional
area is scaled by 1.5 in a hackish attempt to compensate for this, as
well as other additional effects that are difficult to model (such as
the increased cost of writing to temporary tiles before finally
updating C). The net effect of this better estimation of the
trapezoidal area should be (on average) slightly larger regions
assigned to threads that have little or no overlap with the diagonal
region (and correspondingly slightly smaller regions in the diagonal
region), which we expect will lead to slightly better load balancing
in most situations.
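  A minimal sketch of this style of area estimate, under the stated
  assumption MR = NR. The function name trap_area_est() is hypothetical
  and this is not the bli_thread_range_weighted_sub() code itself; it only
  illustrates counting the exact scalar area of a lower trapezoid, then
  adding 1.5x the overage that comes from rounding the diagonal up to
  whole NR-wide microtile columns.

  ```c
  /* Hedged sketch (not the BLIS implementation): estimate the weighted
     area of a lower-stored m x n region whose diagonal starts at column
     0, accounting for whole diagonal microtile columns of width nr. */
  double trap_area_est( int m, int n, int nr )
  {
      double exact = 0.0, rounded = 0.0;

      for ( int j = 0; j < n; ++j )
      {
          /* Exact scalar area: column j stores rows j..m-1. */
          int rows = m - j; if ( rows < 0 ) rows = 0;
          exact += rows;

          /* Microtile-aware area: round the diagonal offset down to a
             multiple of nr so that every microtile column touching the
             diagonal is counted in full. */
          int rows_r = m - ( j / nr ) * nr; if ( rows_r < 0 ) rows_r = 0;
          rounded += rows_r;
      }

      /* Scale the extra (microtile-induced) area by 1.5 to compensate
         for effects that are hard to model, per the commit message. */
      return exact + 1.5 * ( rounded - exact );
  }
  ```

  With nr = 1 the estimate reduces to the exact trapezoid area; larger nr
  inflates the diagonal region, nudging threads with diagonal overlap
  toward smaller assigned ranges.
  
  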
- Spun off the contents of bli_thread.[ch] that relate to computing
thread ranges into one of three source/header file pairs:
- bli_thread_range.[ch], which define functions that are not specific
to the jr/ir loops;
- bli_thread_range_slab_rr.[ch], which define functions that implement
slab or round-robin partitioning for the jr/ir loops;
- bli_thread_range_tlb.[ch], which define functions that implement
tlb for the jr/ir loops.
- Fixed the computation of a_next in the last iteration of the IR loop
in bli_gemmt_l_ker_var2(). Previously, it always "wrapped" back around
to the first micropanel of the current MC x KC packed block of A.
However, this is almost never actually the micropanel that is used
next. A new macro, bli_gemmt_l_wrap_a_upanel(), computes a_next
correctly, with a similarly named bli_gemmt_u_wrap_a_upanel() for use
in the upper-stored case (which *does* actually always choose the
first micropanel of A as its a_next at the end of the IR loop).
- Removed adjustments for a_next/b_next (a2/b2) for the diagonal-
intersecting case of gemmt_l_ker_var2() and the above-diagonal case
of gemmt_u_ker_var2() since these cases will only coincide with the
last iteration of the IR loop in very small problems.
- Defined bli_is_last_iter_l() and bli_is_last_iter_u(), the latter of
which explicitly considers whether the current microtile is the last
tile that intersects the diagonal. (The former does the same, but the
computation coincides with the original bli_is_last_iter().) These
functions are now used in gemmt to test when a_next (or a2) should
"wrap" (as discussed above). Also defined bli_is_last_iter_tlb_l()
and bli_is_last_iter_tlb_u(), which are similar to the aforementioned
functions but are used when employing tlb in gemmt.
- Redefined macros in bli_packm_thrinfo.h, which test whether an
iteration of work is assigned to a thread, as static inline functions
in bli_param_macro_defs.h (and then deleted bli_packm_thrinfo.h).
In the process of redefining these macros, I also renamed them from
bli_packm_my_iter_rr/sl() to bli_is_my_iter_rr/sl().
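  The behavior of these predicates can be sketched as static inline
  functions. The signatures below are assumptions for illustration, not
  copies of the BLIS definitions: they decide whether iteration i of a
  parallelized loop belongs to the caller's work id.

  ```c
  #include <stdbool.h>

  /* Round-robin: iterations are dealt out cyclically among n_way
     workers, so worker work_id owns iterations work_id, work_id + n_way,
     work_id + 2*n_way, and so on. */
  static inline bool is_my_iter_rr( int i, int work_id, int n_way )
  {
      return ( i % n_way ) == work_id;
  }

  /* Slab: the worker owns a contiguous [start, end) range of iterations
     that was previously computed by a thread-range function. */
  static inline bool is_my_iter_sl( int i, int start, int end )
  {
      return start <= i && i < end;
  }
  ```
  
  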
- Renamed
bli_thread_range_jrir_rr() -> bli_thread_range_rr()
bli_thread_range_jrir_sl() -> bli_thread_range_sl()
bli_thread_range_jrir() -> bli_thread_range_slrr()
- Renamed
bli_is_last_iter() -> bli_is_last_iter_slrr()
- Defined
bli_info_get_thread_jrir_tlb()
and renamed:
- bli_info_get_thread_part_jrir_slab() ->
bli_info_get_thread_jrir_slab()
- bli_info_get_thread_part_jrir_rr() ->
bli_info_get_thread_jrir_rr()
- Modified bli_rntm_set_ways_for_op() to redirect IR loop parallelism
into the JR loop when tlb is enabled for non-trsm level-3 operations.
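  The redirection amounts to folding the IR ways into the JR ways. The
  following is a hedged sketch with an assumed function name, not the
  actual bli_rntm_set_ways_for_op() logic: when tlb is active for a
  non-trsm operation, the IR loop is no longer an independent source of
  parallelism, so its ways multiply into the JR loop.

  ```c
  #include <stdbool.h>

  /* Illustrative only (hypothetical name and signature): fold IR-loop
     parallelism into the JR loop when tlb is enabled and the operation
     is not trsm. */
  void fold_ir_into_jr( int* jr_way, int* ir_way,
                        bool is_trsm, bool tlb_enabled )
  {
      if ( tlb_enabled && !is_trsm )
      {
          *jr_way *= *ir_way;  /* JR absorbs the IR-level parallelism */
          *ir_way  = 1;        /* IR loop runs sequentially per thread */
      }
  }
  ```
  
  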
- Added a sanity check to prevent bli_prune_unref_mparts() from being
used on packed objects. This prohibition is necessary because the
current implementation does not take into account the atomicity of
packed micropanel widths relative to the diagonal of structured
matrices. That is, the function prunes greedily without regard to
whether doing so would prune off part of a micropanel *which has
already been packed* and assigned to a thread for inclusion in the
computation.
- Further restricted early returns in bli_prune_unref_mparts() to
situations where the primary matrix is not only of general structure
but also dense (in terms of its uplo_t value). The addition of the
matrix's dense-ness to the conditional is required because gemmt is
somewhat unusual in that its C matrix has general structure but is
marked as lower- or upper-stored via its uplo_t. By only checking
for general structure, attempts to prune gemmt C matrices would
incorrectly result in early returns, even though that operation
effectively treats the matrix as symmetric (and stored in only one
triangle).
- Fixed a latent bug in bli_thread_range_rr() wherein incorrect ranges
were computed when 1 < bf. Thankfully, this bug was not yet
manifesting since all current invocations used bf == 1.
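  The intended bf > 1 semantics can be sketched as a block-cyclic
  ownership test. This is an illustrative predicate with an assumed name,
  not the repaired bli_thread_range_rr() itself: iterations are grouped
  into chunks of bf, and chunks (rather than single iterations) are dealt
  to threads cyclically.

  ```c
  #include <stdbool.h>

  /* Hypothetical sketch: does iteration i belong to thread tid when
     nt threads deal out chunks of bf consecutive iterations
     round-robin? The bug class fixed in the commit is mishandling
     the bf > 1 case. */
  static inline bool rr_owns( int i, int tid, int nt, int bf )
  {
      return ( ( i / bf ) % nt ) == tid;
  }
  ```

  With bf == 2 and nt == 2, thread 0 owns iterations 0, 1, 4, 5, ... and
  thread 1 owns 2, 3, 6, 7, ...; with bf == 1 this reduces to the plain
  round-robin assignment that all current call sites exercised.
  
  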
- Fixed a latent bug in some unexercised code in bli_?gemmt_l_ker_var2()
that would perform incorrect pruning of unreferenced regions above
where the diagonal of a lower-stored matrix intersects the right edge.
Thankfully, the bug was not harming anything since those unreferenced
regions were being pruned prior to the macrokernel.
- Rewrote slab/rr-based gemmt macrokernels so that they no longer carved
C into rectangular and diagonal regions prior to parallelizing each
separately. The new macrokernels use a unified loop structure where
quadratic (slab) partitioning is used.
- Updated all level-3 macrokernels to have a more uniform coding style,
  such as combining variable declarations with initializations as
well as the use of const.
- Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and
bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and
bli_thrinfo_thread_id(), respectively. This change probably should
have been included in aeb5f0c.
- Removed old prototypes in bli_gemmt_var.h and bli_trmm_var.h that
corresponded to functions that were removed in aeb5f0c.
- Other very minor cleanups.
- Comment updates.
- (cherry picked from commit 2e1ba9d)