/*******************************************************************************
*                                                                              *
*   (C) 1997-2021 by Ernst W. Mayer.                                           *
*                                                                              *
*  This program is free software; you can redistribute it and/or modify it     *
*  under the terms of the GNU General Public License as published by the       *
*  Free Software Foundation; either version 2 of the License, or (at your      *
*  option) any later version.                                                  *
*                                                                              *
*  This program is distributed in the hope that it will be useful, but WITHOUT *
*  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
*  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
*  more details.                                                               *
*                                                                              *
*  You should have received a copy of the GNU General Public License along     *
*  with this program; see the file GPL.txt.  If not, you may view one at       *
*  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
*  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
*  02111-1307, USA.                                                            *
*                                                                              *
*******************************************************************************/

#include "align.h"
#include "util.h"
#include "factor.h"	// Needed for twopmodq64() prototype
#include "imul_macro.h"
#ifdef TEST_SIMD
	#include "dft_macro.h"
  #ifdef USE_SSE2
	#include "sse2_macro_gcc64.h"
	#include "radix16_dif_dit_pass_asm.h"
  #endif
#endif
#ifdef USE_GPU
	#include "gpu_iface.h"
#endif

#if 0
	#define USE_FMADD
	#warning USE_FMADD local-defined!
#endif
/**********************************/
/******* INFO, WARN, ASSERT *******/
/**********************************/

/*
 * Emit an informational message tagged with a source location.
 *
 *   line, file  : source location reported in the "At line ... of file ..." header.
 *   info_string : message text, printed on its own line after the header.
 *   info_file   : name of a file to append the message to. The file branch is taken only
 *                 when STRNEQ(info_file, "") is true (presumably: when the name is non-empty).
 *   copy2stderr : if nonzero, the message is echoed to stderr even when it was
 *                 successfully appended to info_file.
 *
 * If the message could not be appended to a file (no file requested, or the open failed)
 * it is always printed to stderr, so it is never silently dropped.
 */
void INFO(long line, char*file, char*info_string, char*info_file, int copy2stderr) {
	int written_to_file = 0;

	if(STRNEQ(info_file, "")) {
		FILE *fp = mlucas_fopen(info_file,"a");
		if(fp) {
			fprintf(fp,"INFO: At line %ld of file %s:\n", line, file);
			fprintf(fp,"%s\n", info_string);
			fclose(fp);
			written_to_file = 1;
		} else {
			fprintf(stderr,"WARNING: unable to open file %s in call to DEBUG_INFO.\n", info_file);
		}
	}
	/* Decide on a dedicated flag rather than on the FILE pointer: the stream is already
	closed by this point, so its pointer cannot tell us whether the file write happened. */
	if(copy2stderr || !written_to_file) {
		fprintf(stderr,"INFO: At line %ld of file %s:\n", line, file);
		fprintf(stderr,"%s\n", info_string);
		fflush(stderr);
	}
}

/*
 * Emit a warning message tagged with a source location.
 *
 *   line, file  : source location reported in the "At line ... of file ..." header.
 *   warn_string : message text, printed on its own line after the header.
 *   warn_file   : name of a file to append the message to. The file branch is taken only
 *                 when STRNEQ(warn_file, "") is true (presumably: when the name is non-empty).
 *   copy2stderr : if nonzero, the message is echoed to stderr even when it was
 *                 successfully appended to warn_file.
 *
 * If the message could not be appended to a file (no file requested, or the open failed)
 * it is always printed to stderr, so it is never silently dropped.
 */
void WARN(long line, char*file, char*warn_string, char*warn_file, int copy2stderr) {
	int written_to_file = 0;

	if(STRNEQ(warn_file, "")) {
		FILE *fp = mlucas_fopen(warn_file,"a");
		if(fp) {
			fprintf(fp,"WARN: At line %ld of file %s:\n", line, file);
			fprintf(fp,"%s\n", warn_string);
			fclose(fp);
			written_to_file = 1;
		} else {
			fprintf(stderr,"WARNING: unable to open file %s in call to DBG_WARN.\n", warn_file);
		}
	}
	/* Decide on a dedicated flag rather than on the FILE pointer: the stream is already
	closed by this point, so its pointer cannot tell us whether the file write happened. */
	if(copy2stderr || !written_to_file) {
		fprintf(stderr,"WARN: At line %ld of file %s:\n", line, file);
		fprintf(stderr,"%s\n", warn_string);
		fflush(stderr);
	}
}

#ifdef __CUDA_ARCH__
	/* No-op for GPU device-code compiles: */
	/* Selected when __CUDA_ARCH__ is defined. The body is empty, so a false expr is simply
	ignored here: nothing is printed and execution continues. This variant takes the 4-argument
	(line, file, expr, assert_string) form, i.e. no function-name argument. */
	__device__ void ASSERT(long line, char*file, int expr, char*assert_string) {}
#else

  #ifdef USE_C99

	/*
	 * Runtime assertion, USE_C99 variant (additionally reports the calling function's name).
	 *
	 *   func, line, file : location of the assertion, echoed in the error message.
	 *   expr             : condition being asserted; any nonzero value passes and the call returns.
	 *   assert_string    : text printed after "Assertion failed: " when expr is zero.
	 *
	 * On failure the message goes to stderr, all output streams are flushed, and the
	 * process terminates via exit(EXIT_FAILURE); the call does not return in that case.
	 */
	void ASSERT(char*func, long line, char*file, int expr, char*assert_string) {
		if(expr)
			return;
		/* Define a convenient spot to set a breakpoint: */
		fprintf(stderr,"ERROR: Function %s, at line %ld of file %s\n", func, line, file);
		fprintf(stderr,"Assertion failed: %s\n", assert_string);
		/* Flush all output streams prior to asserting. We replace the original assert(0) call with
		an exit(EXIT_FAILURE), since some compilers seem to like to optimize away assertions. */
		fflush(NULL);
		exit(EXIT_FAILURE);
	}

  #else

	/*
	 * Runtime assertion, non-C99 variant (no function-name argument).
	 *
	 *   line, file    : location of the assertion, echoed in the error message.
	 *   expr          : condition being asserted; any nonzero value passes and the call returns.
	 *   assert_string : text printed after "Assertion failed: " when expr is zero.
	 *
	 * On failure the message goes to stderr, all output streams are flushed, and the
	 * process terminates via exit(EXIT_FAILURE); the call does not return in that case.
	 */
	void ASSERT(long line, char*file, int expr, char*assert_string) {
		if(expr)
			return;
		/* Define a convenient spot to set a breakpoint: */
		fprintf(stderr,"ERROR: at line %ld of file %s\n", line, file);
		fprintf(stderr,"Assertion failed: %s\n", assert_string);
		/* Flush all output streams prior to asserting. We replace the original assert(0) call with
		an exit(EXIT_FAILURE), since some compilers seem to like to optimize away assertions. */
		fflush(NULL);
		exit(EXIT_FAILURE);
	}

  #endif

#endif	// __CUDA_ARCH__ ?

/***************/

/* ewm: Not sure what I intended this for... */
/*
 * Walk a type-descriptor string and fetch one variadic argument per character:
 *   'i' -> int32,  'u' -> uint32,  'd' -> double.
 * The fetched values are only stored in locals (handy to inspect under a debugger) and are
 * not otherwise used. Any other character in typelist triggers ASSERT with a false condition.
 */
void	VAR_WARN(char *typelist, ...)
{
	 int32 arg_i;
	uint32 arg_u;
	double arg_d;
	char *spec = typelist;
	va_list ap;

	va_start(ap, typelist);
	/* Define a convenient spot to set a breakpoint: */
	while(*spec)
	{
		const char code = *spec++;
		if(code == 'i') {
			arg_i = va_arg(ap, int32);
		} else if(code == 'u') {
			arg_u = va_arg(ap,uint32);
		} else if(code == 'd') {
			arg_d = va_arg(ap,double);
		} else {
			/* Unrecognized type code: */
			ASSERT(HERE, 0,"0");
		}
	}
	va_end(ap);
}

/*
 * Sleep for the interval in *req, restarting the sleep whenever nanosleep() fails with EINTR.
 *
 * The caller's struct is copied first, and that copy is handed to nanosleep() as both the
 * request and the remaining-time output, so each retry uses whatever nanosleep() left in it.
 *
 * Returns the result of the last nanosleep() call: 0 on success, or -1 with errno set
 * for any failure other than EINTR.
 */
int mlucas_nanosleep(const struct timespec *req)
{
	struct timespec remaining = *req;

	for(;;) {
		const int status = nanosleep(&remaining, &remaining);
		/* Anything other than an interrupted sleep ends the retry loop: */
		if(status != -1 || errno != EINTR)
			return status;
	}
}

/******* Supplements to stdio (e.g. binary-formatted output) *********/

/*
 * Copy the string stored at index `byte` of the bytestr[] lookup table into ostr,
 * terminating NUL included (strcpy semantics). ostr must be large enough to hold
 * that entry; no bounds checking is done here.
 */
void byte_bitstr(const uint8 byte, char*ostr)
{
	const char *entry = bytestr[byte];
	strcpy(ostr, entry);
}

/*
 * Write the bit-string form of a 32-bit value into ostr, one byte at a time via byte_bitstr().
 * Bytes are emitted most-significant first, each at an 8-character stride, so the high
 * byte occupies the leftmost 8 chars of the output.
 * NOTE(review): assumes each byte_bitstr() result is exactly 8 chars plus a NUL, so ostr
 * needs room for 32 chars plus the terminator - confirm against the bytestr[] table.
 */
void	ui32_bitstr(const uint32 ui32, char*ostr)
{
	int shift;
	char *dst = ostr;
	// Shift counts 24,16,8,0 select the bytes from high to low:
	for(shift = 24; shift >= 0; shift -= 8, dst += 8) {
		byte_bitstr((uint8)(ui32 >> shift), dst);
	}
}

/*
 * Write the bit-string form of a 64-bit value into ostr, one byte at a time via byte_bitstr().
 * Bytes are emitted most-significant first, each at an 8-character stride, so the high
 * byte occupies the leftmost 8 chars of the output.
 * NOTE(review): assumes each byte_bitstr() result is exactly 8 chars plus a NUL, so ostr
 * needs room for 64 chars plus the terminator - confirm against the bytestr[] table.
 */
void	ui64_bitstr(const uint64 ui64, char*ostr)
{
	int shift;
	char *dst = ostr;
	// Shift counts 56,48,...,8,0 select the bytes from high to low:
	for(shift = 56; shift >= 0; shift -= 8, dst += 8) {
		byte_bitstr((uint8)(ui64 >> shift), dst);
	}
}

// Def ENABLE_MPRIME_PM1_SMOOTH at compile time to enable Mp p-1 smoothness code:
#ifdef ENABLE_MPRIME_PM1_SMOOTH

	#undef psmooth
	// Record type used by the Mp p-1 smoothness code that ENABLE_MPRIME_PM1_SMOOTH enables.
	// The psmooth_cmp_b() and psmooth_cmp_r() qsort predicates compare such records by their
	// b and r fields, respectively.
	struct psmooth {
		uint32 p;	// NOTE(review): presumably the Mersenne exponent p this record describes - confirm against the code that fills it
		uint32 b;	// Standard B-smooth measure based on largest prime factor
		double r;	// L2 "roughness" metric in (0,1] defined by L2 norm of log factor sizes
	};

	// Decimal-print M(p) = 2^p - 1 to the file 'p[p]_decimal.txt'. The digits are generated in 27-digit
	// chunks, low-order chunk first, by repeated division by 10^27; each chunk occupies a 28-char stride
	// of the output string. M(p) must have at least 27 decimal digits (checked via ASSERT).
	void print_mp_dec(const uint32 p)
	{
		char *str = 0x0, fname[STR_MAX_LEN];	fname[0] = 'p'; fname[1] = '\0';
		FILE*fp = 0x0;
		uint32 i, lenX, lenD, nchars,nc;
		uint64 *x,*y,*d,*r;
		i = convert_uint64_base10_char(fname+1, (uint64)p);
		// Slide the converted digits left by the returned offset i so they directly follow the leading 'p'.
		// Source and destination lie in the same buffer and may overlap, hence memmove rather than strcpy:
		memmove(fname+1, fname+i+1, strlen(fname+i+1)+1);
		strcat(fname,"_decimal.txt");
		// Allocate the array containing first M(p) and then subsequent divide-by-10^27 results.
		// Due to the requirement in mi64_div() that dividend and quotient arrays may not point
		// to the same memory, bounce successive-divide results between 2 arrays, x and y:
		lenX = (p>>6);
		// M(p) needs lenX all-ones words plus one partial high word, i.e. lenX+1 words. Round THAT count up
		// to a multiple of 4 (zero-padded, allowing 64-bit DIV algo to use 4-way-folded loops); rounding up
		// just lenX would leave no room for the high word whenever lenX is itself a multiple of 4:
		x = (uint64 *)calloc(((lenX + 4) & ~3u), sizeof(uint64));
		ASSERT(HERE, x != 0x0, "Failed to allocate dividend array x!");
		memset(x,0xFF,((size_t)lenX<<3));	x[lenX++] = (1ull << (p&63)) - 1;
		nchars = ceil(p * log(2.0)/log(10.));
		fprintf(stderr,"Generating decimal printout of M(%u), which has [%u] decimal digits; will write results to file '%s'...\n",p,nchars,fname);

		// Until have generic-FFT-based mi64_divrem algo in place, use mod-10^27, the largest power of 10 whose
		// odd factor (5^27) fits in a uint64, thus allowing the core div-and-mod loops to use 1-word arguments:
		nc = nchars + (nchars/27) + 1;	// Add newlines to count
		// The first chunk is written at offset nc-28; for a too-short number that unsigned offset would wrap:
		ASSERT(HERE, nc >= 28, "M(p) has fewer than 27 decimal digits!");
		str = (char *)calloc(nc, sizeof(char));
		ASSERT(HERE, str != 0x0, "Failed to allocate output string!");
		y = (uint64 *)calloc(lenX + 1, sizeof(uint64));
		ASSERT(HERE, y != 0x0, "Failed to allocate quotient array y!");
		// 10^27 has 90 bits, thus needs 2 uint64s, as do the mod-10^27 remainders,
		// but we allow the convert_base10_char_mi64() utility to do the allocation of the former for us:
		lenD = 0; ASSERT(HERE, 0x0 != (d = convert_base10_char_mi64("1000000000000000000000000000", &lenD)) && (lenD == 2), "0");
		r = (uint64 *)calloc(lenD, sizeof(uint64));
		ASSERT(HERE, r != 0x0, "Failed to allocate remainder array r!");
		nc -= 28;		// starting char of first 27-digit chunk
		for(i = 0; ; i+=2) {	// i = #divides counter; do 2 divs per loop exec in attempt to get some modest pipelining
			mi64_div(x, d, lenX, lenD, y, r);	// dividend in x; quotient in y, remainder in r
			convert_mi64_base10_char_print_lead0(str + nc, r, lenD, 27,0);	nc -= 28;	mi64_clear(r, lenD);
			lenX = mi64_getlen(y, lenX);
			// Once the quotient is smaller than the divisor it is the leading (partial) chunk:
			if( (lenX < lenD) || ((lenX == lenD) && mi64_cmpult(y,d,lenX)) ) {
				convert_mi64_base10_char(str, y, lenX, 27);
				break;
			}
			mi64_div(y, d, lenX, lenD, x, r);	// dividend in y; quotient in x, remainder in r
			convert_mi64_base10_char_print_lead0(str + nc, r, lenD, 27,0);	nc -= 28;	mi64_clear(r, lenD);
			lenX = mi64_getlen(x, lenX);
			if( (lenX < lenD) || ((lenX == lenD) && mi64_cmpult(x,d,lenX)) ) {
				convert_mi64_base10_char(str, x, lenX, 27);
				break;
			}
			if((i % 1023) == 0)	// 1M digits ~= 37037 loop execs
				fprintf(stderr,"At digit %u of %5.2fM...\n",27*i,(float)nchars/1000000);
		}
		nc = nchars + (nchars/27) + 1;	// Add newlines to count
		str[nc-1] = '\0';
		fp = mlucas_fopen(fname, "w");
		ASSERT(HERE, fp != 0x0, "Null file pointer!");
		fprintf(fp,"%s\n", str);
		fclose(fp);	fp = 0x0;
		// Release all work arrays, including the divisor d allocated for us by convert_base10_char_mi64():
		free(x);	free(y);	free(d);	free(r);	free(str);
		fprintf(stderr,"Done writing %s.",fname);
	}

	// Binary predicates for use of stdlib qsort() on the b-subfield of the above psmooth struct:
	int psmooth_cmp_b(const void *x, const void *y)	// Default-int compare predicate
	{
		uint32 a = ((struct psmooth*)x)->b, b = ((struct psmooth*)y)->b;
		return ncmp_uint32( (void*)&a, (void*)&b );
	}

	// Binary predicates for use of stdlib qsort() on the r-subfield of the above psmooth struct:
	int psmooth_cmp_r(const void *x, const void *y)	// Default-int compare predicate
	{
		double two53float = (double)1.0*0x08000000*0x04000000;
		uint64 a = two53float*((struct psmooth*)x)->r, b = two53float*((struct psmooth*)y)->r;
		return ncmp_uint64( (void*)&a, (void*)&b );
	}

	/* Here are the 9366 base-2 Fermat pseudoprimes < 2^32 not divisible by 3 or 5: */
	const uint32 fbase2psp[9366] = {
	341u,1387u,1729u,2047u,2701u,2821u,3277u,4033u,4369u,4681u,5461u,6601u,7957u,8321u,8911u,10261u,13741u,13747u,13981u,14491u,15709u,15841u,18721u,19951u,23377u,29341u,30121u,30889u,31417u,31609u,31621u,35333u,41041u,42799u,46657u,49141u,49981u,52633u,57421u,60701u,60787u,63973u,65077u,65281u,68101u,75361u,80581u,83333u,85489u,88357u,88561u,90751u,91001u,93961u,101101u,104653u,113201u,115921u,123251u,126217u,129889u,130561u,137149u,149281u,150851u,158369u,162193u,162401u,164737u,172081u,176149u,181901u,188057u,188461u,194221u,196021u,196093u,204001u,215749u,219781u,220729u,226801u,228241u,233017u,241001u,249841u,252601u,253241u,256999u,258511u,264773u,271951u,272251u,275887u,276013u,280601u,282133u,284581u,285541u,294271u,
	294409u,314821u,318361u,323713u,334153u,340561u,341497u,348161u,357761u,367081u,387731u,390937u,396271u,399001u,401401u,410041u,422659u,423793u,435671u,443719u,448921u,452051u,458989u,476971u,481573u,486737u,488881u,489997u,493697u,512461u,513629u,514447u,530881u,534061u,552721u,556169u,563473u,574561u,574861u,580337u,582289u,587861u,604117u,611701u,617093u,622909u,625921u,635401u,642001u,647089u,653333u,657901u,658801u,665281u,665333u,665401u,670033u,672487u,679729u,680627u,683761u,688213u,710533u,711361u,721801u,722201u,722261u,729061u,738541u,741751u,742813u,745889u,748657u,769567u,769757u,786961u,818201u,838201u,838861u,841681u,847261u,852481u,852841u,873181u,875161u,877099u,916327u,934021u,950797u,976873u,983401u,997633u,1004653u,1016801u,1018921u,1023121u,1024651u,1033669u,1052503u,1052929u,
	1053761u,1064053u,1073021u,1082401u,1082809u,1092547u,1093417u,1104349u,1109461u,1128121u,1132657u,1139281u,1141141u,1145257u,1152271u,1157689u,1168513u,1193221u,1194649u,1207361u,1251949u,1252697u,1277179u,1293337u,1302451u,1306801u,1325843u,1333333u,1357441u,1357621u,1373653u,1397419u,1398101u,1419607u,1433407u,1441091u,1457773u,1459927u,1461241u,1463749u,1472353u,1493857u,1500661u,1507561u,1507963u,1509709u,1530787u,1533601u,1533961u,1534541u,1537381u,1549411u,1569457u,1579249u,1584133u,1615681u,1678541u,1690501u,1711381u,1719601u,1730977u,1735841u,1746289u,1755001u,1773289u,1801969u,1809697u,1811573u,1826203u,1827001u,1837381u,1839817u,1840357u,1857241u,1876393u,1896961u,1907851u,1909001u,1937881u,1969417u,1987021u,1993537u,1994689u,2004403u,2008597u,2035153u,2081713u,2085301u,2089297u,2100901u,2113921u,2121301u,2134277u,2144521u,2163001u,2165801u,2171401u,2181961u,2184571u,2205967u,
	2233441u,2261953u,2264369u,2269093u,2284453u,2299081u,2304167u,2313697u,2327041u,2350141u,2387797u,2433601u,2434651u,2455921u,2487941u,2491637u,2503501u,2508013u,2510569u,2528921u,2537641u,2603381u,2609581u,2615977u,2617451u,2626177u,2628073u,2649029u,2649361u,2670361u,2704801u,2719981u,2722681u,2746477u,2746589u,2748023u,2757241u,2773981u,2780731u,2797921u,2811271u,2827801u,2867221u,2880361u,2899801u,2909197u,2921161u,2940337u,2944261u,2953711u,2976487u,2977217u,2987167u,3048841u,3057601u,3059101u,3073357u,3090091u,3094273u,3116107u,3125281u,3146221u,3165961u,3186821u,3225601u,3235699u,3316951u,3336319u,3337849u,3345773u,3363121u,3375041u,3375487u,3400013u,3413533u,3429037u,3471071u,3539101u,3542533u,3567481u,3568661u,3581761u,3605429u,3656449u,3679201u,3726541u,3755521u,3763801u,3814357u,3828001u,3898129u,3911197u,3916261u,3936691u,3985921u,4005001u,4014361u,4038673u,4069297u,4072729u,
	4082653u,4097791u,4101637u,4151869u,4154161u,4154977u,4181921u,4188889u,4209661u,4314967u,4335241u,4360621u,4361389u,4363261u,4415251u,4463641u,4469471u,4480477u,4504501u,4513841u,4567837u,4650049u,4670029u,4682833u,4698001u,4706821u,4714201u,4767841u,4806061u,4827613u,4835209u,4863127u,4864501u,4868701u,4869313u,4877641u,4903921u,4909177u,4917331u,4917781u,4922413u,4974971u,4984001u,5016191u,5031181u,5034601u,5044033u,5049001u,5095177u,5131589u,5148001u,5173169u,5173601u,5176153u,5187637u,5193721u,5250421u,5256091u,5258701u,5271841u,5284333u,5310721u,5351537u,5423713u,5444489u,5456881u,5481451u,5489641u,5524693u,5551201u,5575501u,5590621u,5672041u,5681809u,5733649u,5758273u,5766001u,5804821u,5859031u,5872361u,5919187u,5968261u,5968873u,5977153u,6027193u,6049681u,6118141u,6122551u,6140161u,6159301u,6183601u,6189121u,6226193u,6233977u,6236257u,6236473u,6255341u,6278533u,6309901u,6313681u,
	6334351u,6350941u,6368689u,6386993u,6474691u,6539527u,6617929u,6631549u,6658669u,6732817u,6733693u,6749021u,6779137u,6787327u,6836233u,6840001u,6868261u,6886321u,6912079u,6952037u,6955541u,6973057u,6973063u,6998881u,7008001u,7017193u,7207201u,7215481u,7232321u,7259161u,7273267u,7295851u,7306261u,7306561u,7414333u,7416289u,7428421u,7429117u,7455709u,7462001u,7516153u,7519441u,7546981u,7656721u,7674967u,7725901u,7759937u,7803769u,7808593u,7814401u,7820201u,7883731u,7995169u,8036033u,8043841u,8095447u,8134561u,8137633u,8180461u,8209657u,8231653u,8239477u,8280229u,8321671u,8341201u,8355841u,8362201u,8384513u,8388607u,8462233u,8534233u,8640661u,8646121u,8650951u,8719309u,8719921u,8725753u,8727391u,8745277u,8812273u,8830801u,8902741u,8916251u,8927101u,8992201u,9006401u,9037729u,9040013u,9046297u,9056501u,9069229u,9073513u,9084223u,9106141u,9131401u,9143821u,9273547u,9345541u,9371251u,9439201u,
	9480461u,9494101u,9533701u,9564169u,9567673u,9585541u,9588151u,9591661u,9613297u,9692453u,9724177u,9729301u,9774181u,9834781u,9863461u,9890881u,9908921u,9920401u,9995671u,10004681u,10024561u,10031653u,10033777u,10079521u,10084177u,10134601u,10185841u,10226161u,10251473u,10266001u,10267951u,10317601u,10323769u,10331141u,10386241u,10393201u,10402237u,10402561u,10403641u,10425511u,10505701u,10513261u,10545991u,10606681u,10610063u,10635751u,10700761u,10712857u,10763653u,10802017u,10837321u,10877581u,10956673u,10958221u,10974881u,11075857u,11081459u,11115037u,11157721u,11205601u,11328409u,11335501u,11367137u,11433301u,11541307u,11585293u,11592397u,11644921u,11767861u,11777599u,11972017u,12032021u,12096613u,12261061u,12262321u,12263131u,12273769u,12322133u,12327121u,12376813u,12407011u,12490201u,12498061u,12584251u,12599233u,12643381u,12659989u,12711007u,12757361u,12783811u,12854437u,12932989u,12936763u,12939121u,13057787u,13073941u,13216141u,13295281u,13333441u,13338371u,
	13357981u,13421773u,13446253u,13448593u,13500313u,13554781u,13635289u,13635649u,13694761u,13696033u,13747361u,13757653u,13773061u,13838569u,13856417u,13942081u,13971841u,13991647u,13996951u,14012797u,14026897u,14154337u,14179537u,14282143u,14324473u,14469841u,14589901u,14609401u,14671801u,14676481u,14684209u,14709241u,14794081u,14796289u,14865121u,14870801u,14898631u,14899751u,14913991u,14980411u,15082901u,15101893u,15124969u,15139199u,15162941u,15188557u,15207361u,15220951u,15247621u,15248773u,15268501u,15472441u,15479777u,15510041u,15525241u,15560461u,15583153u,15603391u,15621409u,15698431u,15700301u,15716041u,15732721u,15757741u,15802681u,15829633u,15888313u,15913261u,15976747u,15978007u,16046641u,16053193u,16070429u,16132321u,16149169u,16149601u,16153633u,16158331u,16324001u,16349477u,16360381u,16435747u,16539601u,16666651u,16705021u,16717061u,16773121u,16778881u,16818877u,16822081u,16843009u,16853077u,16879501u,16973393u,16998961u,17020201u,17098369u,17116837u,17134043u,17208601u,
	17236801u,17316001u,17327773u,17375249u,17405537u,17429861u,17450569u,17509501u,17585969u,17586361u,17590957u,17641207u,17698241u,17759681u,17777191u,17812081u,17870561u,17895697u,18003349u,18067501u,18073817u,18151861u,18162001u,18300241u,18307381u,18366937u,18443701u,18454921u,18468901u,18487267u,18490381u,18535177u,18541441u,18595801u,18607009u,18653353u,18736381u,18740971u,18779761u,18900973u,18985627u,19020191u,19054933u,19149571u,19328653u,19384289u,19404139u,19471033u,19607561u,19683001u,19734157u,19985269u,20081953u,20099017u,20117467u,20140129u,20202481u,20234341u,20261251u,20417311u,20489239u,20647621u,20770621u,20964961u,20968501u,21042001u,21224401u,21303343u,21306157u,21355951u,21359521u,21397381u,21400481u,21414169u,21417991u,21459361u,21474181u,21559741u,21585313u,21623659u,21654533u,21715681u,21789901u,21814417u,21880801u,21907009u,22066201u,22075579u,22087477u,22137809u,22203181u,22215961u,22351249u,22369621u,22397497u,22432201u,22480381u,22487101u,22509691u,22513457u,
	22564081u,22591301u,22669501u,22711873u,22848541u,22849481u,22885129u,22899097u,22953673u,23247901u,23261713u,23283037u,23286781u,23315977u,23382529u,23386441u,23405341u,23464033u,23577497u,23634181u,23734901u,23808721u,23822329u,23828017u,23872213u,23963869u,23966011u,24037021u,24158641u,24214051u,24356377u,24726773u,24776557u,24904153u,24913681u,24929281u,25080101u,25150501u,25276421u,25326001u,25457833u,25520833u,25540291u,25557121u,25603201u,25629913u,25640641u,25696133u,25768261u,25840081u,25846913u,25873381u,25909453u,25947959u,26254801u,26280073u,26377921u,26465089u,26470501u,26474581u,26553241u,26634301u,26758057u,26813221u,26821601u,26840269u,26877421u,26886817u,26921089u,26932081u,26977001u,27062101u,27108397u,27118601u,27128201u,27168337u,27218269u,27219697u,27271151u,27279409u,27331921u,27336673u,27380831u,27392041u,27401401u,27402481u,27409541u,27476641u,27491237u,27492581u,27509653u,27600001u,27664033u,27700609u,27714961u,27798461u,27808463u,27846721u,27966709u,28011001u,
	28029001u,28071121u,28172629u,28312921u,28325881u,28406953u,28449961u,28527049u,28572961u,28629613u,28717483u,29020321u,29111881u,29137021u,29143633u,29214541u,29581501u,29593159u,29732221u,29878381u,30022129u,30058381u,30069721u,30090817u,30185569u,30219757u,30295141u,30296761u,30338593u,30388753u,30418957u,30529693u,30576151u,30662497u,30718441u,30739969u,30740417u,30881551u,30894307u,30951181u,30958201u,30971161u,30992401u,30996001u,31040833u,31118221u,31146661u,31150351u,31166803u,31198693u,31405501u,31436123u,31735621u,31759121u,31766983u,31794241u,32080651u,32091781u,32095057u,32158621u,32168117u,32264029u,32285041u,32368609u,32497921u,32676481u,32701297u,32756581u,32899201u,32914441u,33146717u,33193117u,33298337u,33302401u,33596641u,33600533u,33627301u,33704101u,33840397u,33848311u,33872593u,33965261u,34003061u,34043101u,34100821u,34111441u,34124641u,34196401u,34386121u,34540801u,34581457u,34657141u,34856167u,34890481u,34901461u,34944001u,35428141u,35498467u,35571601u,35576599u,
	35626501u,35703361u,35820937u,35851037u,35926801u,35932441u,35976721u,36255451u,36291193u,36307981u,36338653u,36354449u,36448387u,36721021u,36724591u,36765901u,36852481u,36861901u,36919681u,36942157u,36974341u,36981601u,37109467u,37167361u,37280881u,37376509u,37439201u,37469701u,37491301u,37727341u,37769887u,37938901u,37962541u,37964809u,37988497u,38010307u,38046817u,38118763u,38151361u,38171953u,38210323u,38239741u,38342071u,38404501u,38439523u,38560861u,38584801u,38624041u,38637361u,38801089u,38903287u,38971661u,39016741u,39052333u,39117439u,39126313u,39465091u,39467377u,39512773u,39573073u,39655153u,39684157u,39789841u,40094341u,40160737u,40165093u,40238797u,40315441u,40325041u,40361197u,40374901u,40430401u,40622401u,40629601u,40778989u,40782589u,40801861u,40827473u,40841821u,40917241u,40928701u,40987201u,41017681u,41073241u,41121433u,41262073u,41341321u,41396921u,41471521u,41541241u,41568101u,41590297u,41604109u,41607721u,41642681u,41662297u,41840809u,41866001u,41987111u,42009217u,
	42344609u,42485119u,42490801u,42623017u,42694279u,42697873u,42702661u,42709591u,42763501u,42984589u,43039501u,43136821u,43224397u,43235641u,43286881u,43331401u,43363601u,43397551u,43584481u,43620409u,43661257u,43798457u,43914949u,44070841u,44081101u,44238481u,44314129u,44347381u,44465221u,44472001u,44482901u,44521301u,44671001u,44695211u,44731051u,44823241u,44824501u,44912701u,44953441u,44963029u,45100177u,45175201u,45219329u,45393601u,45414433u,45485881u,45563027u,45819541u,45830161u,45877861u,45879941u,45890209u,46045117u,46055851u,46094401u,46104697u,46256489u,46325029u,46386589u,46469809u,46483633u,46517857u,46657181u,46679761u,46860001u,46878601u,47063611u,47220367u,47253781u,47349373u,47356171u,47647117u,47734141u,47744209u,47759041u,47903701u,47918581u,47930023u,47953621u,48064021u,48191653u,48269761u,48277081u,48316321u,48316969u,48321001u,48369727u,48400753u,48448661u,48462301u,48506581u,48551161u,48563089u,48592393u,48628801u,48650641u,48656869u,49075417u,49084321u,49303801u,
	49333201u,49411801u,49439237u,49459801u,49472281u,49664207u,50155733u,50176477u,50193793u,50201089u,50376019u,50443201u,50473501u,50523661u,51030601u,51129781u,51283501u,51302353u,51340807u,51500521u,51509161u,51627817u,51803821u,52072021u,52119289u,52142221u,52181407u,52204237u,52365457u,52869601u,53154337u,53245921u,53283169u,53399449u,53542147u,53560801u,53656021u,53675623u,53695721u,53711113u,53728921u,53795521u,54029741u,54060721u,54177949u,54215161u,54448153u,54449431u,54468001u,54545821u,54651961u,54767881u,54772381u,55035001u,55109401u,55176097u,55200181u,55318957u,55324801u,55462177u,55610837u,55729957u,56052361u,56420033u,56479897u,56608201u,56643181u,56669041u,56687821u,56810137u,57172501u,57421219u,57707209u,57762433u,58003213u,58422409u,58449847u,58489201u,58509977u,58679941u,58755877u,58903741u,59408497u,59426221u,59469697u,59586241u,59589973u,59631211u,59840537u,59913157u,59953741u,59955331u,59999011u,60155201u,60352921u,60514129u,60547831u,60558841u,60566431u,60581401u,
	60696661u,60738257u,60761701u,60769051u,60925879u,60957361u,60998653u,61198921u,61201009u,61219789u,61309333u,61330291u,61377109u,61754941u,61755751u,61794709u,61832377u,62176661u,62248649u,62289541u,62490901u,62633371u,62756641u,63001801u,63002501u,63065281u,63127681u,63167743u,63294553u,63318169u,63328469u,63337393u,63346999u,63388033u,63526519u,63729073u,63781901u,63884521u,63919801u,64009261u,64148717u,64162693u,64238021u,64377991u,64477249u,64605041u,64735897u,64774081u,64940041u,65037817u,65144501u,65241793u,65254393u,65301013u,65350801u,65359477u,65427701u,65523781u,65565457u,66024901u,66096253u,66296401u,66384121u,66760471u,66790057u,66849091u,66886417u,66932851u,66976273u,66977281u,67194401u,67213801u,67529437u,67541761u,67559383u,67642513u,67653433u,67763803u,67902031u,67928221u,67940449u,67994641u,68033801u,68075177u,68102641u,68154001u,68165761u,68275873u,68512867u,68621701u,68776561u,68800501u,68830021u,68839597u,69030901u,69128641u,69176647u,69228967u,69231061u,69331969u,
	69414301u,69485281u,69612061u,69615793u,69678307u,69705529u,69741001u,69885649u,69917371u,70006021u,70030501u,70147561u,70149631u,70195501u,70461331u,70463489u,70488001u,70541099u,70561921u,70593931u,70626301u,70695769u,70728121u,71079661u,71107681u,71570017u,71572957u,71711839u,71734417u,71804161u,72108421u,72286501u,72348409u,72498253u,72543547u,72595951u,72680431u,72734041u,72803809u,72805981u,72884701u,73522051u,73562833u,73645001u,73721341u,73988641u,74193769u,74217487u,74329399u,74411131u,74658629u,74705401u,74874869u,74927161u,74945953u,75140137u,75143251u,75151441u,75187669u,75501793u,75565873u,75676861u,75681541u,75765313u,75927853u,76595761u,76704433u,76725091u,76745101u,76923461u,76969621u,77295961u,77334769u,77477401u,77518981u,77533123u,77576401u,77594653u,77648941u,77812153u,77817979u,77826001u,77858221u,78073801u,78091201u,78120001u,78206917u,78526729u,78671651u,78795181u,78844801u,78905989u,78939089u,79398901u,79411201u,79417801u,79464533u,79525261u,79539197u,79624621u,
	79625701u,79739713u,79786523u,79854409u,79895401u,80142761u,80146909u,80282161u,80375707u,80556337u,80687881u,80787421u,80891009u,80918281u,80927821u,81189433u,81433591u,81480601u,81638401u,81680941u,81789301u,81926461u,81954133u,82023649u,82139471u,82219801u,82268033u,82273201u,82279741u,82416101u,82452061u,82506439u,82870517u,82882753u,82929001u,82995421u,83058481u,83083001u,83099521u,83103329u,83204801u,83814151u,83966401u,84164033u,84311569u,84350561u,84375761u,84421081u,84487457u,84758389u,84809521u,84998503u,85030921u,85207669u,85328717u,85374577u,85400701u,85519337u,85759147u,85761481u,85823401u,85875361u,86027329u,86067241u,86114117u,86204977u,86438857u,86484269u,86530621u,86968981u,86999837u,87211573u,87318001u,87483241u,87499651u,87558127u,87571121u,87694261u,88099339u,88123141u,88256449u,88368853u,88407361u,88466521u,88615801u,88661861u,88689601u,88735921u,88930463u,89244901u,89308771u,89670001u,89784581u,89896717u,89961421u,90014653u,90270613u,90278161u,90341197u,90532141u,
	90665789u,90670861u,90698401u,90803429u,91367431u,91433281u,91587289u,91659283u,92139433u,92438581u,92625121u,92631001u,92645437u,92677273u,92829781u,92974921u,93431521u,93473953u,93541537u,93571633u,93591721u,93614521u,93643201u,93677761u,93839201u,93845161u,93926197u,94316401u,94502701u,94536001u,94790401u,95053249u,95200561u,95423329u,95451361u,95452781u,96135601u,96259681u,96271681u,96618397u,96791881u,96888641u,96895441u,96904081u,96916279u,96925921u,97047301u,97255801u,97420141u,97496449u,97655933u,97676723u,97796953u,97863529u,97924217u,98124481u,98523877u,98586541u,98602381u,98735209u,98756281u,99036001u,99115297u,99486889u,99551881u,99789673u,99830641u,99898801u,99945007u,99971821u,100017223u,100302301u,100359337u,100427041u,100462141u,100463443u,100618933u,100860997u,100907047u,100943201u,101141461u,101152133u,101158093u,101218921u,101270251u,101276579u,101592973u,101612641u,101649241u,101929681u,101954077u,101957401u,102004421u,102090781u,102134113u,102237721u,102443749u,102678031u,102690677u,102690901u,
	102696901u,102757501u,102922117u,102940993u,103022551u,103301633u,104078857u,104086801u,104233141u,104404861u,104484601u,104524421u,104569501u,104852881u,104857391u,104891761u,104988673u,105007549u,105085981u,105117481u,105305443u,105309289u,105517621u,105569101u,105869401u,105919633u,105941851u,106041937u,106169761u,106402801u,106485121u,106622353u,106743073u,106775761u,107085007u,107101513u,107264521u,107360641u,107543333u,107714881u,108145741u,108150661u,108596953u,108870961u,108927721u,108952411u,109052113u,109118791u,109231229u,109316593u,109322501u,109393201u,109437751u,109541461u,109577161u,109879837u,110135821u,110139499u,110301121u,110312773u,110413333u,110495083u,110717861u,110851741u,111202297u,111291181u,111370141u,111654401u,111837181u,112032001u,112402981u,112792519u,112828801u,112844131u,113352401u,113359321u,113589601u,113605201u,113730481u,113886361u,113892589u,114046381u,114305441u,114329881u,114362281u,114469073u,114507121u,114701341u,114712753u,114842677u,114910489u,115007581u,115039081u,115085701u,115174681u,115196033u,115497901u,115582741u,115595701u,115756201u,
	115804501u,115873801u,116090081u,116148649u,116151661u,116321617u,116463007u,116617289u,116682721u,116696161u,116964289u,116998669u,117246949u,117251641u,117445987u,117744901u,117959221u,117987841u,118216981u,118466401u,118634977u,118712881u,118886401u,118901521u,119051101u,119074501u,119092801u,119118121u,119204809u,119261113u,119273701u,119327041u,119378351u,119558011u,119743537u,119811601u,119940853u,120296677u,120517021u,120557053u,120570409u,120838609u,120981601u,121062001u,121128361u,121247281u,121374241u,121422301u,121472359u,121609489u,122166307u,122396737u,122401441u,122569993u,122649901u,122785741u,122941981u,123054841u,123191797u,123330371u,123481777u,123559837u,123671671u,123718861u,123877081u,123886003u,123987793u,124071977u,124116763u,124145473u,124593301u,124630273u,124793521u,124818601u,125284141u,125686241u,125707681u,125739361u,125848577u,126132553u,126619741u,126682753u,126886447u,127050067u,127306081u,127479097u,127664461u,127710563u,128027831u,128079409u,128124151u,128396921u,128468957u,128536561u,128622781u,128626219u,128665319u,128697361u,128987429u,129205781u,
	129255841u,129256273u,129357061u,129461617u,129524669u,129569881u,129604321u,129762001u,129812761u,129924451u,130497361u,130513429u,130556329u,130651753u,130693393u,130766239u,130922221u,130944133u,131023201u,131117941u,131421541u,131567929u,131701921u,131821747u,131922701u,131938561u,132239521u,132332201u,132338881u,132440521u,132511681u,132575071u,132841369u,132901561u,132915601u,133157701u,133205761u,133216381u,133302781u,133344793u,133427449u,133467517u,133496221u,133800661u,134188381u,134384069u,134564501u,134696801u,134767153u,134809921u,134857801u,134868029u,135263269u,135296053u,135308881u,135437129u,135866017u,135945853u,135969401u,136043641u,136217251u,136545067u,136578421u,136625941u,136645561u,136661201u,136722433u,137345221u,137415821u,137763037u,137897881u,137972561u,138012733u,138030721u,138223009u,138280381u,138336661u,138403981u,138511441u,138652921u,138736153u,138828821u,139133611u,139295701u,139319293u,139363927u,139487041u,139592101u,139952671u,140047921u,140167729u,140197051u,140201569u,140241361u,140249201u,140285377u,140296861u,140710421u,140723401u,140996401u,
	141165361u,141574219u,142525333u,142825033u,142922413u,143071601u,143106133u,143168581u,143258689u,144014833u,144093313u,144218341u,145082201u,145206361u,145334821u,145348529u,145856701u,146023351u,146156617u,146272901u,146659801u,146843929u,146884393u,147028001u,147269251u,147287141u,147868201u,148087801u,148109473u,148171769u,148392781u,148705481u,148910653u,149065489u,149069989u,149389633u,149762017u,149796001u,150017401u,150260893u,150379693u,150418801u,150846961u,150886681u,150960239u,150966901u,150988753u,151530401u,151533377u,151589881u,151813201u,152192041u,152255611u,152486551u,152716537u,152922001u,152991841u,153365521u,153369061u,153384661u,153393637u,153449101u,153589801u,153754873u,153927961u,153928133u,153958321u,154045801u,154195801u,154287451u,154364221u,154513633u,154910869u,154944533u,155094121u,155102221u,155156581u,155203361u,155255881u,155358529u,155840777u,156114061u,156226321u,156532799u,156538201u,157010389u,157069189u,157368661u,157405249u,157725829u,157731841u,157852201u,158068153u,158115721u,158192317u,158397247u,158404141u,158496911u,158544401u,158864833u,
	158895281u,159412051u,159420457u,159492061u,159874021u,160348189u,160378861u,160491329u,160587841u,160672201u,160730389u,161035057u,161109613u,161184013u,161216021u,161289649u,161293249u,161304001u,161341181u,161369101u,161423377u,161498681u,161913961u,162026869u,162067441u,162690481u,162771337u,162776041u,163021423u,163442551u,163759753u,163849393u,163954561u,164008321u,164111281u,164391481u,164916001u,165061909u,165224321u,165479113u,165538447u,165654721u,165886561u,165892501u,165938653u,166082309u,166099501u,166339057u,166406561u,166444181u,166724041u,166827943u,167188861u,167579497u,167582377u,167692141u,167881121u,167958961u,167979421u,168132601u,168566501u,168659569u,169004921u,169057801u,169570801u,169655641u,169930549u,170640961u,170782921u,170853211u,170856533u,171068893u,171149749u,171438001u,171454321u,171567481u,171679561u,171747577u,171804061u,171823693u,172028053u,172116181u,172272187u,172290241u,172295761u,172430401u,172436713u,172596601u,172947529u,173085121u,173401621u,173405233u,173928601u,174156481u,174479729u,174638419u,174769621u,175313401u,175484291u,175656601u,
	175747457u,176030977u,176571089u,176597821u,176609441u,176659201u,176977921u,177167233u,177254533u,177349147u,177693521u,177702241u,177927641u,177951973u,178451857u,178482151u,178837201u,178956971u,179083601u,179285137u,179695321u,179820257u,180115489u,180497633u,180703451u,180801253u,181111951u,181154701u,181234801u,181285001u,181285537u,181299811u,181382401u,181397161u,181449073u,181542601u,181647497u,181870591u,182082901u,182356993u,182383111u,182688661u,183349453u,183502369u,183554407u,183677341u,183739141u,183788161u,184353001u,184411567u,184527101u,185203921u,185206757u,185265809u,185365141u,185653333u,185697121u,186183469u,186393481u,186654241u,186739057u,186782401u,186846301u,186983521u,187050529u,187155383u,187188001u,187667969u,187761241u,187823141u,188082301u,188382487u,188481601u,188516329u,188519653u,188689501u,188719381u,188821951u,188985961u,189184661u,189714193u,189738361u,189941761u,190212181u,190382161u,190667851u,190824817u,190913297u,191191933u,191233813u,191326141u,191648161u,191752093u,191981609u,192070081u,192081097u,192112921u,192346153u,192857761u,192920161u,
	193004281u,193009861u,193330237u,193638337u,193708801u,193816351u,193910977u,193949641u,194120389u,194296369u,194523121u,194556451u,194675041u,195412621u,195475351u,195827521u,196035001u,196049701u,196231393u,196358977u,196958161u,197168401u,197200201u,197466361u,197526421u,197747377u,197781221u,198982759u,199674721u,200143351u,200453401u,200479301u,200753281u,201081001u,201246139u,201261061u,201646801u,201789127u,202006441u,202130197u,202156813u,202538857u,202900021u,203049721u,203215297u,203269177u,203331877u,203505697u,203789951u,203875001u,203886241u,204280501u,204582457u,204669829u,204766381u,204821401u,205057561u,205374961u,205534681u,205997401u,206028271u,206304961u,206453509u,206472961u,206504033u,206529737u,206623873u,206804701u,206955841u,207008569u,207030541u,207104041u,207132481u,207399917u,207477001u,207487561u,207618781u,207767281u,208051201u,208096681u,208474669u,208715221u,208965421u,208969201u,208969223u,209246701u,209404369u,209990881u,210431761u,210565981u,210592873u,210628657u,210636707u,210687841u,210842113u,211734271u,212007601u,212027401u,213035761u,213111163u,
	213228121u,213349039u,213350681u,213451549u,213545473u,213804191u,213835861u,214038533u,214110541u,214850881u,214852609u,214858717u,215036641u,215107201u,215436241u,215878531u,215957281u,215973001u,216431041u,216821881u,217123069u,217145881u,217472501u,217517653u,217875571u,218321797u,218603617u,218642029u,218698921u,218947121u,219621781u,219716641u,220261861u,220507561u,220531501u,220883521u,221368153u,221415781u,221669953u,221884001u,222010721u,222123889u,222354721u,222436501u,222524347u,222630193u,223435261u,223449463u,223625851u,223782263u,223846129u,224074369u,224080921u,224136013u,224460181u,224578081u,224582221u,224617541u,224769241u,224957893u,225000007u,225028117u,225651469u,225853633u,226359547u,226450297u,226509361u,227132641u,227319001u,227444101u,227475481u,227518271u,227752993u,228652201u,228842209u,228988033u,229330441u,229589413u,229906711u,230029021u,230357761u,230630401u,230879041u,231383461u,231405701u,231575761u,231638797u,231821659u,231927781u,232114433u,232265881u,232460821u,232674541u,232771501u,232794253u,233082331u,233110081u,233424841u,233812657u,234420481u,
	234564829u,234691381u,234743041u,234869009u,234870301u,235426913u,235476697u,235562041u,235742513u,235928071u,236530981u,237023281u,237570191u,237597361u,237791143u,237984241u,237994381u,238001653u,238199941u,238244041u,238432201u,238565713u,238568749u,238833421u,239194171u,239582401u,239604301u,239612297u,240068041u,240175321u,240371713u,240694513u,240785047u,240982561u,241031701u,241048081u,241180369u,241242001u,241477039u,241505377u,241533527u,241927861u,241955701u,242067841u,242131889u,242214721u,242239321u,242641153u,242650717u,242729401u,242819809u,242860069u,243583201u,243829261u,243955141u,244175779u,244306151u,244528561u,244883981u,244973431u,245006623u,245388781u,245950561u,245956501u,246099317u,246282511u,246434761u,246446929u,246658441u,246975481u,247095361u,247318957u,247321301u,247416101u,247536811u,247704757u,247800709u,248290927u,249582481u,249679501u,249993101u,250200721u,250385401u,250436033u,250612831u,250707073u,250958401u,250988173u,251528401u,251663837u,251737993u,251855893u,252141121u,252778681u,252853921u,253571291u,253610281u,253880641u,253893397u,255160621u,
	255318001u,255416897u,255955393u,256192861u,256828321u,256831433u,256901569u,256979609u,257059279u,257495641u,257590661u,257595457u,257725693u,258020473u,258043229u,258172333u,258234401u,258434749u,258634741u,258676741u,258910513u,258943741u,258944401u,259763093u,259765747u,259913501u,260005141u,260035741u,260156101u,260518801u,260736341u,260963389u,261186001u,261312481u,261430237u,261523801u,261598213u,261703417u,261870841u,261938561u,262700137u,262979501u,263428181u,264269449u,264350521u,264384469u,264724201u,265020001u,265584133u,265728101u,265735969u,265836161u,266003101u,266462701u,266790481u,266811169u,266925601u,266983501u,267559627u,268359001u,268505089u,268554313u,268787701u,269053681u,269081509u,269392201u,269470501u,269800741u,269953489u,270495841u,270525737u,270857521u,271272569u,271481329u,271682651u,271763467u,271794601u,271826629u,271950829u,272223557u,272263501u,272786737u,272946241u,272973517u,273361789u,273480637u,273769921u,273853801u,274455457u,274566721u,274569601u,274683421u,274701913u,274810241u,274919401u,275283401u,275529559u,275619961u,275933281u,276018913u,
	276131137u,276542401u,276638321u,276693121u,277241401u,277739767u,277787141u,277897813u,278152381u,278388881u,278943061u,279377281u,279729661u,280067761u,280761481u,280885153u,281719351u,282240721u,282253141u,282471853u,282599101u,282707461u,282721297u,282769771u,283900961u,283936001u,284166877u,284301751u,284631061u,284666111u,284736091u,284830273u,284834299u,285212689u,285600601u,285820501u,286065571u,286316801u,287160301u,287449091u,287715121u,288099001u,288117721u,288120421u,288728761u,288735277u,289109941u,289765981u,289766701u,289795837u,289860481u,290020061u,290344753u,290495101u,290643601u,290706781u,290743459u,290953921u,291088513u,291461633u,291848401u,292153681u,292236841u,292244833u,292290181u,292433321u,292776121u,292902481u,293346637u,293847721u,293938261u,294184801u,294911401u,295308433u,295419097u,295643089u,295665409u,295743017u,295826581u,295981687u,296023729u,296509357u,296559361u,296639869u,297411661u,297624961u,297798961u,297812677u,298212601u,298811017u,299041381u,299367877u,299671921u,299736181u,300614161u,301030801u,301068001u,301413001u,302214817u,302560501u,
	302635351u,303373801u,303532201u,303639337u,303817411u,303899149u,304080001u,304875553u,304929613u,305594101u,305897131u,306143647u,306703801u,306842401u,306859861u,306871201u,307367171u,307629401u,307694323u,307972801u,308119141u,308448649u,308483209u,308550061u,308980321u,309666361u,310302901u,310474249u,310585771u,310978027u,311157901u,311177213u,311295601u,311297701u,311388337u,311411629u,311655829u,311671361u,312408113u,312614021u,312703093u,312890689u,313338061u,313533529u,313748611u,313980913u,314184487u,314292889u,314721001u,314970121u,315034513u,315145477u,315351521u,316126081u,316349281u,317137969u,317365933u,317501401u,317520401u,317641171u,317657341u,317796119u,318266641u,319053281u,319161361u,319374577u,319440769u,319726177u,320326003u,320454751u,320819941u,320894687u,321324589u,321524281u,321602401u,321831181u,321850849u,322020973u,322469701u,322941881u,323192521u,323654041u,323901007u,324171541u,324477697u,325028089u,325352101u,325546873u,325645321u,325842481u,326266051u,326405713u,326469137u,326628721u,326694301u,326695141u,326695357u,326854981u,327073601u,327093409u,
	327201121u,327398009u,327642613u,328098601u,328135051u,328302901u,328375501u,328573477u,328719601u,329153653u,329326621u,329558329u,329725981u,329769721u,329788981u,330133321u,330198331u,330396701u,330759617u,331240001u,331458739u,331658081u,331934989u,332162521u,332448229u,332737121u,332981293u,333229141u,333619357u,333801937u,333874333u,334849321u,335429641u,335480759u,336098957u,336183697u,337135501u,337420679u,337665901u,337783981u,337799701u,338018617u,338125537u,338395681u,338455153u,338458807u,338740417u,338896369u,338914369u,339006841u,339023833u,339195097u,339396401u,339492169u,339780041u,339794641u,339858541u,341174821u,341505829u,341958121u,341994131u,342838313u,343017529u,343052833u,343915321u,344040061u,344201441u,344255551u,344776301u,346080391u,346808881u,347347087u,347540401u,348089281u,348140881u,348989101u,349369021u,349752913u,350015257u,350031973u,350244577u,350638597u,351058753u,351143101u,351177769u,351593899u,352418041u,352802803u,352932337u,353815801u,353932801u,354062809u,354099089u,354313441u,354815761u,354938221u,355033729u,355552561u,356037241u,356604421u,
	356836819u,357277921u,357348601u,357380101u,357872971u,358416577u,358554841u,358856641u,358940737u,359112857u,359394751u,359499781u,359727073u,360018361u,360067201u,360145633u,360375181u,360787771u,361223941u,361307521u,361312337u,361430161u,361536869u,362569201u,363170837u,363245581u,363430637u,364155793u,364209199u,364282381u,364550761u,364590721u,364637701u,364857751u,364992481u,365077373u,365231401u,365461741u,365932297u,366069601u,366333613u,366487201u,366532321u,366652201u,367328977u,367559501u,367632301u,367649569u,367804801u,368016949u,368113411u,368476501u,369269551u,369372193u,369667561u,369930133u,369985681u,370851481u,370988521u,371011801u,371611153u,372128149u,372167101u,373012777u,373241089u,373501441u,373533617u,373632673u,373647313u,373669453u,373906513u,374008321u,374296627u,374325841u,374346361u,374469847u,374636353u,374703127u,374988661u,375504791u,376261117u,376446889u,376957153u,377192353u,377199661u,377300557u,377334497u,377458849u,377616421u,377806687u,377869031u,378682537u,378792649u,379732501u,379843309u,380137633u,382304161u,382536001u,382837729u,383221033u,
	383425351u,383443201u,383813431u,383955481u,384046273u,384100001u,384824441u,385175113u,385319089u,385454161u,385606981u,385702681u,386563801u,387072661u,387082453u,387188561u,387833531u,388695301u,389064589u,389841661u,390357793u,390489121u,390507001u,390609941u,390612221u,390922741u,391014937u,391880497u,391938301u,392099401u,392534231u,392679737u,393122521u,393513121u,393611653u,393716701u,394255261u,394723177u,395044651u,395264101u,395900557u,396036877u,396539101u,396864469u,396899581u,397460911u,398661901u,398679121u,398724701u,399156661u,399302581u,399647221u,399906001u,400374253u,400385701u,400557109u,400748657u,400898233u,400943401u,401100881u,401518657u,402507769u,403043257u,403095421u,403095967u,403219201u,403293313u,403317421u,404496401u,404864701u,405042001u,405739681u,405782623u,406122361u,406544461u,407282851u,407737201u,407889161u,408344851u,408910133u,409090501u,409280851u,409302001u,409419811u,409458241u,409724569u,409927057u,410613809u,410680357u,411618241u,411782089u,411851389u,412659721u,412836689u,412919041u,413058601u,413138881u,413429801u,413778817u,414216461u,
	414368641u,414395281u,414395701u,415200361u,415204501u,415476343u,415770103u,415787041u,415878541u,416219161u,416964241u,416998207u,417027451u,417102767u,417275677u,417767201u,417779909u,417924361u,418019701u,418044563u,418226581u,418616161u,418617281u,418667401u,419065921u,419184481u,419394781u,419520241u,420468481u,420607441u,421121701u,421335721u,421942951u,422103133u,422372821u,422429041u,422594201u,422625391u,422928101u,423006781u,423122057u,423384001u,423465001u,423857701u,424165163u,424175761u,424411501u,424431541u,425671429u,425750689u,425794601u,425854241u,425967301u,426174101u,426219649u,426462997u,426770437u,426783811u,426821473u,426846577u,426876397u,427294141u,427750291u,428175073u,428180191u,428314531u,428373401u,428758201u,428965741u,429105041u,429135841u,429202541u,429509837u,430017701u,430046857u,430381921u,430646401u,430733701u,430733941u,430802101u,431230801u,431350561u,431975089u,432227449u,432347371u,434042801u,434330401u,434343601u,434932961u,435016187u,435267001u,435358657u,435648901u,435882901u,435993301u,436189051u,436465501u,437128861u,437247841u,437289029u,
	437462101u,437597101u,437866087u,438359041u,438740149u,439309261u,439331491u,439849181u,440219011u,440306461u,440359921u,441354497u,441650591u,441758461u,442050577u,442181291u,442543553u,442921609u,443708917u,444660421u,445102183u,445429693u,445448641u,446414621u,446619617u,447018521u,447191893u,447555361u,447884857u,447940141u,447949201u,448197589u,449372551u,449501761u,449881081u,450016901u,450612001u,450807481u,450866021u,450872573u,452178673u,452877841u,452990401u,453085381u,453366029u,453652381u,453967739u,454133953u,454302241u,454328281u,454607281u,454745773u,454934401u,455106601u,455193113u,455198563u,455398903u,455918257u,456082001u,457050361u,457274161u,457320533u,457376401u,457457617u,458140957u,458368201u,459127189u,459785089u,459817133u,460251733u,460336801u,460486921u,460585861u,461151121u,461272267u,461329601u,461502097u,461854261u,462199681u,462357739u,462587329u,462639409u,462701513u,462755521u,463161601u,463164451u,464012033u,464021641u,464169259u,464560921u,464790781u,464826781u,464955857u,465505633u,466290949u,466331041u,466679251u,466758181u,466998301u,467100937u,
	467430913u,467491753u,467832421u,468410113u,468663481u,468950021u,469832401u,470120257u,470122777u,470268137u,470579831u,470644021u,470896201u,471168253u,471275713u,471441001u,471535373u,471664513u,471812701u,471905281u,472443517u,472814413u,473581057u,473847121u,474892741u,474970501u,474983881u,475468927u,475723849u,476011901u,476301649u,476669557u,477006181u,478317601u,478614067u,479131969u,479317561u,479445613u,479489557u,479962009u,480668347u,481153501u,481239361u,481593997u,482082601u,482164597u,482417857u,482455717u,482488393u,482517217u,482823469u,482824669u,482921297u,483006889u,483029821u,483226741u,483351217u,483570301u,483786433u,483945601u,484200289u,484487641u,484662529u,484914001u,486063001u,486070213u,486902929u,486913681u,487855681u,487896601u,488062901u,488104681u,488169289u,488585521u,488656981u,488844301u,489994201u,490045501u,490057837u,490099681u,490378981u,490503601u,490518181u,490806121u,490950461u,491738801u,492291673u,492559141u,492805261u,492989281u,493108481u,493961737u,494288677u,494657791u,495062443u,495426541u,495909871u,496050841u,496109729u,496560349u,
	496803553u,497148599u,497163121u,497285713u,498059647u,498195961u,498434437u,498443401u,498662561u,498706651u,498905189u,499310197u,500117773u,500448821u,500747293u,501172241u,501423361u,501472333u,501497569u,502041931u,502541803u,502686713u,503080201u,503590573u,503758801u,504142381u,504187021u,504454879u,504870241u,505473263u,505532773u,505798213u,505955737u,506349421u,506852161u,507142567u,507206701u,507323521u,507726901u,507960001u,508606771u,509033161u,509302873u,509551201u,509776741u,509801183u,509822401u,510048529u,510825601u,510925609u,511009801u,511098521u,511215521u,511338241u,511611673u,512096761u,512330281u,512626201u,514044301u,514738981u,515199901u,515675161u,516045197u,516097051u,516259657u,516684961u,516764063u,517662001u,517937581u,518117041u,518216201u,518548801u,518706721u,521501473u,521972081u,521980201u,522390109u,522758233u,523551601u,523756711u,523842337u,524151253u,524810861u,525255197u,525565441u,526067821u,526359289u,526686889u,526698937u,527761081u,528013333u,528043753u,528220117u,528820501u,529456033u,529782121u,529984801u,530443201u,530630701u,530714887u,
	531095029u,531681281u,532126801u,532526401u,532688401u,532758241u,532800133u,533429881u,533860309u,534782293u,535252867u,535428577u,535517581u,536003333u,536114197u,536300821u,536342419u,536357053u,536484031u,536562937u,536682511u,536870911u,537209921u,537302701u,539019361u,539234821u,539443213u,539799481u,540066241u,540207097u,540621181u,540654409u,540680141u,541672957u,541935901u,542497201u,542536457u,542626201u,543226321u,543459961u,544101481u,544861633u,545220869u,545363281u,545550433u,545570641u,545622401u,546102481u,546117301u,546322201u,546649741u,546748931u,547652161u,548080513u,548205841u,548289001u,548409401u,548409473u,548871961u,548989561u,549308761u,549333121u,549538081u,549659521u,550122001u,550132741u,550230409u,550635373u,550853137u,551140129u,551313001u,551672221u,551686201u,551840221u,552022219u,552573793u,552894301u,552914071u,553027201u,553079179u,553125937u,554050861u,554104261u,554487121u,554599051u,554964001u,555046097u,555321007u,555465601u,555726421u,556001377u,556069849u,556095433u,556114609u,556199281u,556450777u,557160241u,557165209u,557437999u,557795161u,
	558235109u,558570961u,558900821u,558977761u,559043941u,559547773u,559702153u,561448487u,561481921u,561492181u,561777121u,562367821u,562854601u,563298061u,563947141u,564084841u,564276061u,564298489u,564535441u,564651361u,564689381u,565422001u,565566001u,565664761u,565707061u,566530849u,566620201u,566689927u,567094501u,567358513u,567468721u,567596401u,568227241u,568773361u,568902001u,568967221u,569332177u,569495809u,569708161u,570012121u,570326401u,570491461u,570699181u,570941881u,571389001u,571623583u,572116501u,572123521u,572189941u,572228929u,572430769u,572567353u,572936869u,573183451u,573817861u,573862021u,573896881u,573911857u,574998841u,575326033u,575574049u,576691741u,576724219u,577210181u,577240273u,577337761u,577352641u,577613261u,578595989u,579139561u,579342673u,579373873u,579606301u,579956653u,580087873u,580565233u,580660081u,581618143u,582389641u,582584941u,582799951u,583248601u,583527601u,584414041u,585261637u,586014067u,586019071u,586538503u,586706821u,586877351u,587336401u,587343541u,587422609u,588049001u,588450061u,588469649u,589196881u,590356357u,590473801u,591242653u,
	591717787u,591822001u,592170601u,592467451u,592468777u,593102881u,593198497u,593234929u,593420941u,593628481u,593682169u,593728489u,593970769u,594734257u,595405201u,595590841u,596642713u,597537361u,597717121u,599135767u,599945293u,600026131u,600240181u,600507277u,600892993u,600893921u,600926509u,601606487u,601830881u,602379181u,602426161u,602593441u,603255001u,604584221u,604596601u,604611019u,605221501u,605341837u,605454917u,605961049u,606662281u,606682081u,606872449u,607148653u,607263553u,607540033u,607600871u,607677181u,607706401u,607750681u,608421637u,608442121u,608761297u,608917753u,608961241u,609046957u,609361567u,609414961u,609813781u,609865201u,610000501u,611097401u,611146861u,611374453u,611770513u,611812321u,611817421u,612006253u,612185029u,612587521u,612816751u,613782601u,613849601u,614742241u,615020401u,615344227u,615361183u,615623737u,615760133u,615895897u,616280897u,616458961u,616463809u,616523701u,616544101u,617087701u,617984101u,618068881u,619239457u,619268401u,619365121u,619480601u,619656001u,620052301u,620072251u,620169409u,620544961u,620580961u,620755537u,620827621u,
	621078301u,621100741u,621338641u,621769669u,622047427u,622137601u,622905661u,623100457u,623613961u,623735953u,624303241u,624732421u,625060801u,625482001u,626717471u,627886657u,628832881u,628868467u,629134081u,629692801u,630022069u,630496621u,630622753u,630811513u,630888481u,631071001u,631677421u,631767943u,631974613u,632997001u,633289807u,633639097u,634399417u,635147101u,635155291u,635291077u,635319361u,635907581u,636021211u,636097177u,636111451u,636130769u,636287653u,636337073u,636936697u,637907581u,638502913u,638837761u,638959321u,639305921u,639807781u,639894421u,639925441u,640650931u,640804243u,640977373u,641468401u,641686081u,641835811u,642708001u,642795427u,643036321u,643316461u,643445881u,643552909u,643767931u,644004817u,644453633u,644457551u,644731357u,644900257u,645556481u,645589351u,645986401u,647065321u,647190253u,648056449u,648328801u,648408773u,648993961u,650028061u,650506321u,650663861u,651011329u,651064681u,651109537u,651151801u,651514753u,651830821u,652469641u,652684201u,652969351u,653235841u,653260633u,654000061u,654255467u,654947101u,655264369u,655503913u,656187001u,
	656189101u,656723161u,656958061u,657732349u,658126621u,658476001u,658831741u,659526601u,659846021u,660095641u,660754117u,661122881u,661207177u,662134201u,663699961u,663760681u,664384321u,664574653u,665096941u,665462081u,665498107u,665743429u,666455581u,666673261u,668498321u,668734753u,670976641u,670987021u,671024389u,671271581u,671716921u,672103001u,672108193u,672389641u,672533921u,672579671u,672687133u,673326013u,673352629u,673389601u,673725469u,673778561u,673778827u,674503921u,674666641u,675168193u,675260477u,675651469u,676243261u,676280221u,676359391u,676880821u,677451241u,678481693u,678981871u,679033681u,680863261u,680972909u,680983817u,681019921u,681124207u,681303241u,682528687u,683032801u,683316001u,683362681u,683379841u,684350833u,684979693u,685201141u,685374691u,686023537u,686043661u,686059921u,686071009u,686129221u,686551351u,687741401u,688431601u,688436893u,688607101u,688804399u,689235121u,689537341u,689537441u,690035713u,690562601u,691131349u,691395871u,691830811u,691847573u,691914161u,692535637u,692597647u,692895421u,692948029u,693456521u,694031221u,694116893u,694656901u,
	696042901u,696321949u,696447181u,696998251u,697821857u,697906561u,698192041u,698548201u,698784361u,698819711u,700932961u,701043421u,701247001u,702683101u,703995733u,704934361u,705101761u,705303457u,705351583u,705890219u,706368713u,706728377u,706979341u,707469841u,707691601u,707926801u,707941081u,708621217u,708843241u,709409993u,709436557u,710382401u,710408917u,710541481u,710617861u,710721001u,711374401u,711832061u,711981649u,713383171u,713588401u,713778661u,713917891u,714490481u,714663139u,715614901u,716406481u,716652001u,716923729u,717096641u,717164449u,717653129u,717831211u,718731001u,718902241u,719256497u,719605393u,719617249u,720767521u,720931121u,721244161u,722508229u,722923201u,722955773u,723645001u,724160251u,724274671u,724677797u,724947337u,724969087u,725508241u,725525137u,726242881u,726501601u,727083001u,728708401u,728816401u,728851507u,728931841u,729094843u,730144441u,730364137u,730925713u,731276521u,731894131u,732627401u,732736621u,732738097u,732805681u,732812353u,733098913u,733219201u,733995883u,734166217u,734590837u,734652901u,734770681u,734895721u,735432773u,736668013u,
	737261953u,738820351u,739036541u,739444021u,739576801u,740988151u,741182401u,741203281u,741214237u,741470549u,741795133u,741965821u,742017181u,742039441u,742550401u,743397733u,743404663u,744500641u,745493761u,745745461u,745823881u,745960501u,746110201u,746296993u,746331041u,746706961u,746793763u,747406801u,748419127u,748638001u,748959709u,749172821u,749640161u,750616739u,750632137u,750970801u,751226401u,751705597u,752102401u,752186593u,752780161u,753022201u,753233717u,753532781u,753574537u,753594001u,753776173u,754020361u,754722001u,754874257u,756205633u,756249901u,756271909u,756318751u,756980137u,757639387u,758581651u,758687581u,758875601u,758901701u,759085273u,759129229u,759252367u,759266621u,759472561u,759502081u,759622753u,759638881u,759691801u,760264009u,762278161u,762428071u,762645313u,762699649u,763488241u,763596709u,763907741u,764033999u,764240611u,764636569u,764923477u,765067321u,765245881u,765378241u,765946441u,766303693u,766503409u,766661221u,766823797u,766836481u,768440063u,768653281u,769006153u,769445561u,770056141u,770201221u,770909107u,770937931u,771043201u,771337891u,
	771350581u,771721949u,772495777u,773131927u,773807401u,774346981u,775006201u,775086481u,775135201u,775368901u,775866001u,775896181u,776176261u,776443769u,777218989u,777668401u,777778561u,778762501u,779708281u,779819587u,780417001u,781347841u,781420097u,781471001u,781517881u,782823281u,783170137u,784242901u,784450393u,784586881u,784777393u,784783477u,784966297u,784990921u,785901313u,786262321u,787085857u,787209277u,788046901u,788378701u,788931361u,789082001u,790020001u,790239241u,790453049u,790623289u,790799801u,791118043u,792049381u,792144161u,792145729u,793045561u,794201333u,794399041u,794910961u,794937601u,795064909u,796072003u,796200901u,796560703u,797418997u,797834017u,798695191u,798770161u,798823081u,799104721u,799146601u,799162561u,799275961u,799518721u,799630753u,799808401u,799865461u,799898833u,799916101u,800368261u,800712901u,801093011u,801227269u,801606401u,801866647u,803073601u,803264281u,803756449u,803823553u,804232261u,804801313u,804802777u,804978721u,805505957u,805589621u,805771501u,805797433u,807115753u,807218413u,807857389u,808107301u,808214161u,808857721u,809702401u,
	809790881u,809883361u,810023881u,810455101u,810514081u,810543301u,810915701u,811110301u,811374709u,811478533u,811607777u,811730923u,812070721u,814056001u,814832177u,815430533u,815737177u,815796413u,816024161u,816075457u,816215401u,816549121u,816588901u,816799369u,816890561u,817013401u,817832329u,818126311u,818391211u,818401321u,818742961u,818762569u,819019201u,819466201u,819743233u,819891679u,820009831u,821561203u,821652601u,822018961u,822531841u,823106593u,823286701u,823536781u,824389441u,824405041u,826004467u,826841641u,828380791u,828499393u,829450909u,829512001u,829678141u,829932601u,830295901u,830664451u,831933901u,832048447u,832060801u,832127489u,832169857u,832251421u,833079121u,833608321u,833610751u,834147721u,834244501u,834589801u,834720601u,835051057u,836154047u,836515681u,836683849u,836794843u,837311761u,837766217u,837843931u,838357141u,839268139u,839275921u,839280691u,839908217u,840749761u,841217653u,841340521u,841399021u,841402801u,841552921u,841660961u,841776001u,842202361u,842429809u,842785841u,842824981u,842960981u,843161887u,843463501u,843704401u,843983191u,844075051u,
	844523569u,844545271u,844788757u,845376533u,846063361u,846961321u,847178641u,847324843u,847491361u,848090377u,848755969u,849064321u,849245167u,849548671u,850141207u,850193401u,850885141u,851703301u,851778817u,851934601u,852081121u,852163157u,852432769u,852571951u,852645691u,852729121u,852888961u,854094781u,854172271u,854319269u,854868257u,855429841u,855674041u,855734401u,855762559u,856379057u,857009881u,857100421u,857902861u,858336529u,858687103u,858895921u,859096477u,859481921u,859889917u,859996477u,860334301u,860736889u,861533821u,861752809u,861993793u,862082677u,862678081u,863196181u,863471521u,863609113u,863744701u,863984881u,864014581u,865242841u,866008153u,867022747u,867110501u,867638201u,867800701u,867836593u,867965251u,868088341u,868111597u,868234081u,868691401u,868967401u,869019481u,870248821u,870985223u,871102441u,871157233u,871195561u,871840261u,871908481u,873503401u,873631027u,875828101u,876850801u,877542481u,878492941u,878529601u,878603701u,878940833u,879995689u,880105381u,880288049u,880870513u,880922657u,881186801u,881290609u,882185941u,882516991u,882796321u,882866161u,
	883276549u,883571401u,884304037u,884701441u,884952001u,885177541u,885336481u,885390017u,885501721u,885510239u,886002481u,886180429u,886526641u,886975441u,887526251u,887795221u,888700681u,888868441u,891706861u,892650067u,892740853u,893466301u,893601541u,893692819u,894264337u,894872071u,895893433u,895992721u,896231953u,896901461u,897063677u,897087361u,897283213u,897387481u,897395201u,897842401u,897880321u,898343713u,898384501u,898966801u,899019353u,899104501u,900736411u,901074259u,901848301u,902566501u,902645857u,903108821u,903390643u,905040953u,906008281u,906060169u,906187381u,907378669u,907670501u,907711561u,908005249u,910202509u,910651393u,910867481u,911040481u,911484421u,913394881u,914348737u,914688433u,914906539u,915641821u,915743251u,917041301u,917704081u,918023891u,918063343u,918661501u,919941841u,920375821u,920696653u,920834197u,920849761u,921858631u,922050481u,922845241u,923437213u,923810401u,924528061u,925265551u,925276681u,925972201u,926181361u,926586001u,926756881u,927106561u,927749203u,927877001u,928040131u,928152673u,928447861u,928482241u,929159941u,930530701u,930745621u,
	931050721u,931694401u,932148253u,933184801u,933243301u,933729421u,934168861u,934784929u,935794081u,936421141u,937477801u,937675393u,938376181u,939408601u,939706021u,939947009u,940123801u,941056273u,941116177u,941734657u,942088201u,942911461u,943271569u,943795201u,944157061u,944832533u,946033201u,946034057u,946378657u,946787377u,947105461u,947829889u,947878081u,947950501u,947993761u,949317217u,949697233u,949891321u,951204961u,951941161u,952282801u,952893881u,953022151u,954551431u,954608761u,954711241u,954732853u,954924013u,955134181u,955160701u,955237141u,955359901u,955527301u,956094193u,956422657u,957044881u,957600541u,957631249u,958131157u,958304893u,958588681u,958669921u,958735681u,958762729u,959080601u,960269377u,960946321u,962415721u,962442001u,962489557u,962491237u,962500561u,962523169u,962769553u,963163201u,963168193u,964245001u,964412837u,964435969u,965490677u,965501857u,967172909u,967266451u,967270129u,967287751u,967714021u,967790401u,968283247u,968324881u,968413217u,968553181u,968751241u,968915521u,969528337u,970299721u,970586713u,971032861u,971454661u,971515777u,971975071u,
	974113601u,974471243u,974774401u,974864521u,975303121u,975576281u,975705781u,975765787u,976396961u,976938061u,977392711u,977483449u,977737321u,977755351u,977892241u,979363153u,979408801u,979421111u,980056507u,980725201u,980957461u,981484561u,981789337u,981855281u,981914401u,982492561u,983456377u,983778269u,984133441u,984252001u,985052881u,985075681u,986088961u,986392021u,987465151u,987842101u,988008337u,989739829u,990061957u,990409421u,990893569u,993420289u,993905641u,994133479u,994964251u,995586373u,995650921u,996524101u,997170931u,997695661u,997753901u,997836841u,998489017u,998590601u,998596741u,998724481u,999828727u,1001152801u,1001723911u,1002261781u,1002381871u,1002439153u,1002781441u,1003062061u,1003509511u,1005402133u,1005654601u,1005833971u,1006207021u,1006679401u,1006800829u,1007246431u,1007608753u,1008032689u,1008777001u,1008839999u,1009025263u,1009140161u,1010324161u,1010394001u,1010979817u,1011319501u,1011333061u,1011570457u,1011754423u,1011909271u,1012438391u,1013833153u,1013996821u,1015339441u,1015626151u,1016597737u,1017517501u,1017748057u,1017817801u,1018878001u,1019089369u,1020220661u,1020515761u,1021026601u,
	1021281301u,1021314001u,1022336611u,1022616097u,1023267281u,1024041853u,1024123501u,1024249933u,1024605121u,1025035129u,1025067241u,1026583801u,1026738161u,1027334881u,1027354321u,1027744453u,1028360593u,1028494429u,1030401901u,1031750401u,1032101461u,1033161841u,1033449121u,1033480201u,1033501877u,1034252929u,1034261881u,1034283421u,1034736121u,1034958601u,1035608041u,1036335301u,1036816633u,1037560681u,1037935813u,1038148801u,1038165961u,1038880753u,1039224121u,1039579451u,1040234231u,1040449769u,1042855801u,1043938369u,1044959501u,1046080339u,1046508373u,1046656501u,1049116069u,1049584313u,1049790001u,1050102901u,1050506101u,1050535501u,1050721561u,1051080913u,1054999441u,1055009117u,1055384929u,1056121453u,1057367593u,1057426651u,1058575981u,1059193297u,1059282001u,1060291849u,1061013301u,1061075197u,1061084701u,1061413151u,1061609761u,1063212481u,1063801909u,1064926801u,1065508321u,1065602281u,1066415191u,1066603681u,1066938769u,1066972301u,1067494861u,1067917501u,1069388497u,1069731901u,1070011321u,1070639389u,1070659201u,1070941987u,1071512749u,1071643249u,1072096201u,1072570801u,1072581277u,1072812001u,1072898711u,1072915921u,1073159281u,1073288581u,1073299501u,1073356831u,
	1073484823u,1075100041u,1077133397u,1078014301u,1078467589u,1079237809u,1079556193u,1080124657u,1080255241u,1081176601u,1081798061u,1082472553u,1082553121u,1084187521u,1084241341u,1084444481u,1086444001u,1087190371u,1088953921u,1089174013u,1089682837u,1089991981u,1090858081u,1092518407u,1092558013u,1093150081u,1093297661u,1093352833u,1093526353u,1093552201u,1093690621u,1093916341u,1094042321u,1095414601u,1095474121u,1097416321u,1098027601u,1098743563u,1098895249u,1100624857u,1100674561u,1101270457u,1101623381u,1101673501u,1102573501u,1102750013u,1103102191u,1103145121u,1104194521u,1105024481u,1105038871u,1105322653u,1105779277u,1106410033u,1106529761u,1106580817u,1106595493u,1107011521u,1107138961u,1107263521u,1107717841u,1108135381u,1108485001u,1108706857u,1109052001u,1109304913u,1110293101u,1110582947u,1111205873u,1111939201u,1112103541u,1112247289u,1112495671u,1112502403u,1112671603u,1113541073u,1114277221u,1115910727u,1116379301u,1117202557u,1117610033u,1117785881u,1117828001u,1117890019u,1119075841u,1119412321u,1119793501u,1120068001u,1120076281u,1120258657u,1120981021u,1121176981u,1122922801u,1122947101u,1123406047u,1123625501u,1123727617u,1123942387u,1124396521u,1124841577u,
	1124925913u,1125038377u,1126587151u,1126602961u,1127040769u,1127596801u,1128169441u,1128950281u,1129146481u,1130473361u,1130933429u,1131092381u,1131222841u,1132002847u,1134010801u,1134044821u,1134367777u,1135263583u,1135679761u,1136410943u,1136940031u,1137966061u,1138049137u,1138289041u,1138607233u,1139109121u,1139137057u,1139434453u,1140441121u,1140573601u,1141269481u,1141781461u,1142092081u,1142466151u,1142770133u,1143378419u,1143427861u,1144088101u,1144357201u,1144839697u,1144998841u,1145170153u,1145677201u,1147357559u,1147434289u,1148063573u,1148237861u,1148578201u,1150229761u,1150270849u,1150534747u,1151670001u,1152161191u,1152793621u,1153049341u,1153164097u,1153366501u,1153440289u,1153642501u,1154343961u,1154691409u,1154987209u,1155939709u,1156761911u,1156993373u,1157585329u,1157627353u,1157839381u,1159421509u,1159712737u,1160527501u,1160737201u,1160804419u,1160844821u,1161306721u,1161531601u,1161907201u,1162202581u,1163044681u,1163098249u,1163168077u,1163227759u,1163523241u,1163659861u,1164033613u,1164218641u,1164340237u,1164430657u,1165552697u,1165717129u,1166016373u,1166475601u,1166598217u,1168108201u,1168221121u,1168256953u,1168492417u,1171525681u,1171811653u,1172155601u,
	1172449351u,1172800729u,1173229201u,1173545533u,1174300093u,1174476913u,1174614001u,1175799241u,1177195201u,1177773421u,1177800481u,1177874587u,1178196859u,1178717713u,1180398961u,1180817569u,1180954321u,1180970407u,1181566219u,1182047401u,1182875401u,1183104001u,1183338241u,1184291641u,1184554801u,1185589913u,1185605209u,1186325981u,1187082001u,1187235193u,1188180001u,1189238401u,1190524651u,1190790721u,1190895301u,1191153937u,1191216133u,1191545929u,1192314817u,1192412033u,1192903531u,1193043601u,1193229577u,1193557093u,1194508873u,1194866101u,1195400641u,1195524181u,1195556653u,1196189281u,1196364727u,1196852273u,1196969707u,1198387201u,1198650961u,1198880261u,1200456577u,1200778753u,1202050873u,1202142061u,1203090379u,1204119731u,1204176751u,1204205449u,1204218709u,1205606533u,1205772499u,1206057601u,1206843463u,1207193149u,1207252621u,1209112633u,1209998077u,1210393801u,1210562701u,1210653541u,1211686849u,1212081517u,1212725137u,1212919201u,1213153201u,1213245001u,1213619761u,1213981441u,1214027137u,1214341129u,1214703721u,1214770577u,1216631521u,1217181061u,1217823517u,1217924159u,1218132761u,1218501181u,1219493437u,1219816261u,1219858921u,1220114377u,1220491691u,1221044653u,
	1221127013u,1221367057u,1222568707u,1222861271u,1223475841u,1223531677u,1223884969u,1223941657u,1224437833u,1225128829u,1225506217u,1226230297u,1226855293u,1227133513u,1227220801u,1227280681u,1227474431u,1227493081u,1227743401u,1228425121u,1228962197u,1229491063u,1229536801u,1229746393u,1229751667u,1230330817u,1230393241u,1230446653u,1231002901u,1231153561u,1231362793u,1231637761u,1231726981u,1231869601u,1232445677u,1232469001u,1233715393u,1233759241u,1234125721u,1234646533u,1235188597u,1235864033u,1236313501u,1236442421u,1236640021u,1236691021u,1237091401u,1237712617u,1238018797u,1238825569u,1239010201u,1239896701u,1241120881u,1241293681u,1242171349u,1242227647u,1242858317u,1242958501u,1243518373u,1244576881u,1246588201u,1247375273u,1247688961u,1247750111u,1248314761u,1248453361u,1249166881u,1249785941u,1250656621u,1250896849u,1251295501u,1251748807u,1251992281u,1252236421u,1252489057u,1252957501u,1253314693u,1254277909u,1254318481u,1254580541u,1255135501u,1255665613u,1256276581u,1256780071u,1256855041u,1257102001u,1257216577u,1258094801u,1258903981u,1258962541u,1259851321u,1260332137u,1260404821u,1262641501u,1262991913u,1263293281u,1263344581u,1264145401u,1264264561u,1264505863u,
	1265477791u,1265740717u,1266003461u,1266025351u,1266273793u,1266425101u,1267154071u,1267345081u,1267643557u,1267834459u,1268104993u,1268604001u,1268946253u,1269171049u,1269211021u,1269295201u,1269835201u,1270193401u,1270489621u,1270667353u,1271325841u,1272558739u,1272866167u,1273165921u,1273691791u,1278280081u,1278649441u,1280045989u,1280074081u,1280727701u,1281100549u,1281344689u,1282369321u,1282371091u,1282447477u,1282568741u,1282637521u,1284321611u,1284796801u,1284816583u,1284997429u,1285636801u,1286298133u,1286298263u,1287250021u,1290156841u,1290927751u,1291773451u,1291933501u,1292581751u,1293180481u,1293866533u,1293971041u,1295577361u,1295948431u,1296613501u,1297090081u,1297092721u,1297443913u,1299072721u,1299656821u,1299784141u,1299963601u,1301509249u,1301715949u,1301926081u,1302607801u,1302745481u,1303352353u,1304553251u,1304780431u,1306742221u,1306836001u,1307004641u,1307520469u,1307823661u,1308196201u,1308758533u,1308998741u,1309440001u,1309531861u,1309723213u,1309983901u,1310329567u,1311070657u,1311255661u,1311616153u,1312332001u,1312573123u,1312845661u,1312944931u,1313396221u,1313428201u,1313596201u,1315858381u,1316052893u,1316169541u,1316417173u,1316958721u,1317828601u,
	1317912541u,1318126321u,1318717531u,1318755439u,1318853537u,1319136661u,1319182201u,1319695501u,1319736601u,1319978701u,1319992181u,1320793813u,1321058213u,1321545961u,1321983937u,1322437201u,1322916253u,1323378001u,1323668917u,1324857293u,1325172421u,1325329297u,1327551233u,1328081833u,1328256247u,1329174601u,1329431689u,1330202273u,1330655041u,1331226121u,1331419321u,1331973329u,1332123661u,1332313291u,1333144561u,1336210313u,1336288669u,1338317293u,1338579257u,1339223509u,1341010577u,1341026401u,1341042361u,1341672913u,1341871777u,1341926401u,1342185841u,1343575381u,1343851867u,1343872153u,1344039841u,1344597577u,1344975721u,1345227361u,1345514101u,1345523401u,1346316973u,1347387361u,1347914701u,1348114069u,1348964401u,1349536981u,1349671681u,1349855497u,1350371821u,1350685001u,1351126261u,1352453257u,1352531269u,1353051517u,1353976801u,1354991653u,1355042833u,1356241321u,1356328121u,1356661711u,1357459183u,1358288471u,1358662669u,1361195551u,1361355751u,1362132541u,1362463807u,1362515701u,1362742561u,1363178701u,1363519501u,1363563397u,1364828257u,1365662917u,1365746491u,1365783961u,1366587661u,1366608377u,1367490601u,1367732161u,1368769681u,1369166239u,1370776577u,1371090721u,
	1371908137u,1372681861u,1373410081u,1373986801u,1375322101u,1376550737u,1376781601u,1376799577u,1376844481u,1376906041u,1377829441u,1378231633u,1378483393u,1378646179u,1379464633u,1379937781u,1381243709u,1381568059u,1381581481u,1382114881u,1382453333u,1382557969u,1383283129u,1384157161u,1384701409u,1385142661u,1385656829u,1385804161u,1385920693u,1386705433u,1386734761u,1388232241u,1388400751u,1388972353u,1389353941u,1389975149u,1391564161u,1391890033u,1392189569u,1393253191u,1393851553u,1394640941u,1394746081u,1394942473u,1395564127u,1396134997u,1397357851u,1398883201u,1400575177u,1400859847u,1401840833u,1404008369u,1404111241u,1404228421u,1404253369u,1404403273u,1406826241u,1406851249u,1407060181u,1407548341u,1407818749u,1408352401u,1409372779u,1410521941u,1410833281u,1411728571u,1412193601u,1412437501u,1413067501u,1413803197u,1414154827u,1414529533u,1415969101u,1417292911u,1417986901u,1419339691u,1419459121u,1419575167u,1419706601u,1420093081u,1420613161u,1420784281u,1421475031u,1422477001u,1422713161u,1423668961u,1424503849u,1425860101u,1426319563u,1426534201u,1427771089u,1428432349u,1428966001u,1429093261u,1430262769u,1431677941u,1432227601u,1432354901u,1433624401u,1434177821u,
	1435091377u,1435146077u,1435768771u,1435921201u,1436131621u,1436452381u,1436976289u,1437330241u,1437717847u,1438648993u,1439328001u,1439492041u,1440231941u,1440922891u,1441139641u,1441316269u,1441678411u,1442374801u,1442761201u,1442945689u,1443388481u,1443742273u,1445084173u,1445581801u,1446247001u,1446298309u,1446434677u,1446818651u,1448221297u,1448921633u,1450115101u,1450478459u,1451635201u,1452201241u,1452759401u,1452767521u,1453391941u,1453645663u,1454282449u,1454445413u,1455726097u,1456527461u,1457137201u,1457378449u,1458756721u,1458995371u,1459654561u,1461026953u,1461307717u,1462393201u,1463030101u,1463065501u,1463178817u,1463992661u,1464568381u,1465290841u,1465307351u,1465454101u,1465749451u,1465908193u,1465945417u,1466169829u,1466414119u,1468540477u,1468824787u,1469059481u,1469074321u,1469768653u,1469960377u,1470080501u,1470650851u,1471628401u,1471883641u,1472221921u,1472772421u,1472970421u,1473580001u,1474936871u,1475200441u,1475841511u,1476304501u,1476648901u,1477289941u,1481619601u,1481626513u,1482152101u,1482274513u,1482876673u,1483199641u,1483873861u,1483918801u,1484080291u,1485061471u,1485162721u,1485880921u,1486285801u,1486564301u,1489587997u,1490046481u,1490056501u,
	1490247841u,1490564881u,1490621461u,1490636449u,1493114149u,1494352861u,1494595801u,1494714493u,1495190699u,1497221281u,1497262321u,1497965713u,1499971457u,1499989177u,1500142001u,1500884581u,1501011001u,1501165097u,1501194397u,1502171117u,1502403121u,1502570513u,1502770193u,1503240559u,1503705601u,1504139521u,1504604101u,1504651681u,1504728541u,1504832033u,1505010991u,1505432881u,1506674521u,1507243681u,1507746241u,1508119081u,1508953447u,1509156013u,1509600001u,1509677401u,1510474841u,1510860121u,1510870241u,1511558533u,1513888157u,1514608559u,1515175087u,1515664153u,1515785041u,1515874921u,1516071547u,1516962241u,1517039371u,1518014689u,1518066829u,1518290707u,1518521401u,1519801381u,1520190341u,1520467201u,1521221473u,1521835381u,1522302121u,1522573921u,1522669051u,1522918201u,1524137041u,1525345097u,1525655461u,1526732803u,1526967961u,1527236341u,1527578209u,1528936501u,1529298541u,1529544961u,1529648231u,1529819971u,1530159247u,1530275671u,1530495289u,1530757801u,1531436401u,1532383217u,1532419099u,1532569681u,1532586601u,1532755369u,1533343261u,1534063081u,1535020133u,1535505301u,1536112001u,1536251047u,1536883357u,1537433899u,1537641691u,1538012449u,1538753581u,1539068401u,
	1539171289u,1539219781u,1539583921u,1539804001u,1540454761u,1540550413u,1541004451u,1541047813u,1541755909u,1541818321u,1541849761u,1541955409u,1544145121u,1545019813u,1545177581u,1545387481u,1545914371u,1546106773u,1546340401u,1546508057u,1547140841u,1547543161u,1547712601u,1549308001u,1549477837u,1549698481u,1550256481u,1550643139u,1550924873u,1552778501u,1553233481u,1554270481u,1557118081u,1560312001u,1560620041u,1561800833u,1563607081u,1565074699u,1565683181u,1565893201u,1566001801u,1566594551u,1566654481u,1567830241u,1568101591u,1568471813u,1568916311u,1569488977u,1569663271u,1569843451u,1571111587u,1571503801u,1572279791u,1572932089u,1573132561u,1573895701u,1574362441u,1574601601u,1575340921u,1576187713u,1576826161u,1577983489u,1578009401u,1578114721u,1579869361u,1580201501u,1580449201u,1580591377u,1581576641u,1581714481u,1581943837u,1582212101u,1582783777u,1582886341u,1583230241u,1583582113u,1583658649u,1584405649u,1584443341u,1584462331u,1586436193u,1587483001u,1587650401u,1588246561u,1588247851u,1589307919u,1590394313u,1590564601u,1592109751u,1592668441u,1593706201u,1594212061u,1595120473u,1595622997u,1595647351u,1595887921u,1597009393u,1597330813u,1597821121u,1598197201u,
	1600952221u,1602517949u,1603188841u,1603765021u,1603810561u,1603994701u,1604440111u,1606734481u,1609913059u,1609916491u,1609935913u,1611716401u,1611842101u,1612121473u,1612702621u,1613347741u,1614290641u,1614400417u,1614508267u,1615204669u,1615565953u,1615744261u,1616387653u,1616873413u,1617795181u,1617921667u,1618070737u,1619447741u,1620646177u,1620653761u,1622134801u,1622809111u,1623368293u,1623794113u,1625667121u,1626167341u,1627103521u,1627151197u,1627636561u,1627733341u,1627898401u,1628059501u,1628692201u,1630062253u,1630307617u,1631314609u,1631394961u,1632082141u,1632286673u,1632513601u,1632785701u,1633044241u,1633771873u,1633931461u,1635241153u,1635548221u,1636046389u,1636185601u,1636572253u,1637176801u,1637434657u,1637436457u,1637930893u,1638294661u,1638983347u,1639256641u,1639351981u,1639846391u,1640144461u,1641086371u,1641971701u,1642207051u,1642814653u,1643317237u,1643962069u,1644637051u,1645228621u,1645253989u,1645413001u,1645677811u,1646426881u,1646645041u,1646923201u,1647225529u,1647290503u,1648076041u,1648130977u,1649422981u,1649430889u,1649684401u,1650117809u,1650265549u,1650581593u,1650682153u,1651154581u,1651880413u,1652420197u,1652932513u,1654940509u,1655660761u,
	1656229921u,1656280033u,1656812521u,1656917377u,1657700353u,1659009601u,1659935761u,1659965077u,1661202113u,1662320881u,1662684661u,1662784117u,1663998337u,1664852161u,1667600101u,1668037621u,1668926629u,1669843891u,1669893661u,1670044321u,1670388229u,1670729281u,1671033401u,1671603667u,1671714241u,1672125131u,1672719217u,1673480161u,1674091141u,1674256441u,1674658133u,1674944881u,1675348201u,1675352701u,1675978193u,1676203201u,1676641681u,1678274581u,1678305451u,1678569121u,1679130641u,1679881681u,1680187537u,1680901381u,1682056201u,1683174533u,1685266561u,1685433413u,1686001861u,1686495601u,1686886081u,1687248001u,1688214529u,1688639041u,1689411601u,1689957383u,1690230241u,1690380901u,1690914781u,1691249869u,1691745821u,1692605041u,1693101241u,1693715101u,1694128129u,1695158921u,1696572001u,1696574449u,1696893101u,1697213407u,1698623641u,1698707377u,1699279441u,1699471117u,1700250049u,1700978401u,1701016801u,1701516961u,1702210861u,1704682753u,1705231921u,1705470481u,1706302153u,1707704461u,1708102201u,1708179229u,1708549501u,1709127811u,1709157169u,1709909293u,1710375121u,1710616321u,1710753001u,1712127241u,1712392321u,1713319453u,1713600049u,1714322377u,1714721059u,1716160321u,
	1716714793u,1716774481u,1718013133u,1718088301u,1718341409u,1718769241u,1718951873u,1719020161u,1719122653u,1719197621u,1720434001u,1720630759u,1721061497u,1721290741u,1721986313u,1722007169u,1722685777u,1722703501u,1723305403u,1725547321u,1725675451u,1726007797u,1726372441u,1729884511u,1730231101u,1730281121u,1730549017u,1730644201u,1731048937u,1731995497u,1732625713u,1732924001u,1733027341u,1733474197u,1733628961u,1734059291u,1734285601u,1734795361u,1735071913u,1736188201u,1736392801u,1736481601u,1738687469u,1740214841u,1740420577u,1740980431u,1741490323u,1741920301u,1742288881u,1742815621u,1742969041u,1743166441u,1743275521u,1744315363u,1744576687u,1744605097u,1745114581u,1745441701u,1746692641u,1746721681u,1749124829u,1749580561u,1750412161u,1751246641u,1752710401u,1753588081u,1754818561u,1757055847u,1757148121u,1758731941u,1759540693u,1760014561u,1760460481u,1762570553u,1762742477u,1764717781u,1765554061u,1766984389u,1767200059u,1767234613u,1769031901u,1769091241u,1769267761u,1769846321u,1770236893u,1771044773u,1771303801u,1772267281u,1773486001u,1773582977u,1775611201u,1775668129u,1775919601u,1776439261u,1776723349u,1776820033u,1777380301u,1778373961u,1778382541u,1778644561u,
	1779649381u,1779892577u,1780945501u,1781537773u,1784291041u,1784306273u,1784323201u,1784638309u,1784975941u,1785500081u,1785507361u,1785843547u,1786005521u,1787127817u,1787934881u,1789167931u,1789656931u,1790023861u,1790101633u,1791157537u,1791426787u,1792442737u,1792588813u,1793417341u,1794814103u,1795163161u,1795216501u,1795674131u,1797382823u,1798502641u,1798706521u,1799674471u,1799805547u,1801369369u,1801558201u,1801774081u,1801818901u,1802099569u,1802510669u,1802778901u,1803278401u,1803308057u,1803768091u,1804906517u,1804954201u,1805947313u,1806597937u,1807352209u,1809888967u,1811514721u,1813073653u,1813474321u,1813625893u,1816408273u,1817067169u,1818108721u,1819829749u,1820306953u,1820514241u,1821514633u,1821689101u,1821792457u,1822160341u,1823541721u,1824612241u,1825017521u,1825140241u,1825428781u,1825568641u,1825794097u,1827554653u,1828377001u,1828682101u,1828887061u,1829375209u,1830949313u,1831048561u,1831258601u,1832689801u,1833166333u,1833179041u,1833328621u,1835112241u,1835114401u,1836304561u,1836484651u,1837156049u,1837599769u,1837837729u,1839568981u,1840920361u,1841034961u,1841099261u,1841479501u,1841683681u,1843610041u,1844028961u,1845128533u,1846171781u,1846817281u,
	1847811673u,1848681121u,1849811041u,1849964117u,1850233897u,1850598961u,1852496761u,1853926777u,1854001513u,1854084649u,1854583381u,1854940231u,1855100017u,1855139341u,1856689453u,1857221281u,1857695281u,1857893401u,1858098497u,1858197961u,1858395529u,1859554957u,1859668021u,1860373241u,1861026133u,1861039021u,1861880689u,1862880401u,1862883311u,1864009153u,1865227651u,1866409861u,1867165249u,1867485781u,1867906721u,1868076451u,1868682241u,1868951881u,1869875281u,1871987041u,1872538273u,1872937057u,1873177693u,1874634721u,1874849929u,1875796561u,1876652713u,1877555681u,1878156001u,1878691753u,1879088401u,1879111697u,1879480513u,1879623157u,1879775501u,1879965781u,1882301201u,1883377441u,1883509633u,1883699581u,1883785681u,1883814337u,1885915841u,1887933601u,1888407781u,1892333521u,1892911801u,1894344001u,1894909141u,1894955311u,1896789349u,1896961801u,1897700113u,1898107681u,1899081757u,1899525601u,1899768763u,1900687381u,1900823653u,1902297601u,1902303721u,1902900241u,1902938401u,1903447841u,1904558701u,1904658913u,1904833393u,1904842033u,1905958891u,1906709281u,1908088001u,1909566073u,1910134309u,1911197947u,1912950241u,1913016001u,1913258311u,1914303841u,1914413761u,1915391521u,
	1916729101u,1916987593u,1917363361u,1917397637u,1917525961u,1918534901u,1918699771u,1918820593u,1919767681u,1920301951u,1921295359u,1921309633u,1922092567u,1922687293u,1922906161u,1923224689u,1923311317u,1923845801u,1923932251u,1923972751u,1924201501u,1925042737u,1928482601u,1928903971u,1929862849u,1930403333u,1930447501u,1930534453u,1930915169u,1931085433u,1932025921u,1932608161u,1933798321u,1934350351u,1935121321u,1938264241u,1938285701u,1940048881u,1941454657u,1941746161u,1942183909u,1942608529u,1943951041u,1944125633u,1945042181u,1947867361u,1948642561u,1949584501u,1949646601u,1950483601u,1950987193u,1951469101u,1952298811u,1952513369u,1952968753u,1955324449u,1955898841u,1955950129u,1957009981u,1957073329u,1957283461u,1957705177u,1958102641u,1958468491u,1958613601u,1959519601u,1959659857u,1959880321u,1960708261u,1962810571u,1963149553u,1965007601u,1965258361u,1966146451u,1968002149u,1968134897u,1968237601u,1968661801u,1969734313u,1970065681u,1970097001u,1971139733u,1973398141u,1974420721u,1974474049u,1975591801u,1976295241u,1977147019u,1977257441u,1977619249u,1978602271u,1982123893u,1982259721u,1982826961u,1984089601u,1985348527u,1985652361u,1985754961u,1986262201u,1987020361u,
	1987464449u,1988071801u,1988713189u,1988835713u,1988965861u,1989192277u,1989985453u,1990822961u,1991063449u,1992023461u,1992841201u,1995211801u,1995784961u,1995830761u,1996231189u,1996339649u,1997844157u,1998780001u,1999053601u,1999111801u,1999743661u,2000241751u,2000436751u,2004299641u,2005360351u,2007646961u,2011080751u,2012581441u,2013208681u,2013554869u,2013757681u,2013834961u,2016058681u,2016481477u,2017021333u,2017509601u,2018268253u,2019530113u,2019564769u,2020813873u,2021392369u,2021884343u,2022021001u,2022794641u,2023073101u,2023351681u,2023528501u,2025055033u,2025223597u,2025677161u,2026001341u,2026574201u,2026654201u,2027675701u,2028279793u,2028631361u,2028685753u,2028812399u,2029554241u,2029651561u,2029830409u,2030600833u,2035858117u,2035948201u,2036224321u,2036326501u,2037732451u,2038957921u,2040131809u,2041025581u,2042467741u,2042940563u,2043173273u,2043400681u,2044366633u,2046320701u,2048443501u,2048751901u,2049191761u,2049204781u,2049293401u,2049842203u,2050617713u,2050864921u,2051270221u,2051369281u,2051790521u,2052149221u,2053128361u,2054711381u,2055634561u,2056892761u,2057188501u,2057267941u,2057516461u,2057835781u,2058072041u,2058874201u,2059585921u,2059739221u,
	2060732161u,2062612033u,2064236401u,2064373921u,2065230833u,2065240801u,2065503961u,2066334463u,2067887557u,2068399201u,2068867841u,2069032057u,2069151931u,2070739441u,2071397101u,2071597009u,2071904761u,2072285209u,2072624761u,2073312253u,2073560401u,2075827501u,2075853001u,2076192007u,2078133421u,2080442881u,2080680481u,2080995841u,2081039297u,2081551753u,2082146617u,2083034113u,2083997441u,2084800121u,2084833081u,2085453649u,2085882661u,2086645009u,2086737761u,2088286201u,2089977121u,2090066947u,2092981129u,2093300401u,2095181881u,2095627153u,2096046457u,2097317377u,2099613601u,2100043933u,2100292841u,2100522313u,2101078729u,2101170097u,2101470541u,2101590361u,2101744837u,2101873501u,2102670961u,2104994449u,2105594401u,2106147457u,2107148761u,2107535221u,2108275489u,2108353523u,2108761201u,2111416021u,2111488561u,2111732857u,2112030361u,2114643217u,2115769633u,2115986557u,2116483027u,2116541221u,2117031263u,2117555641u,2117725921u,2117955451u,2118621097u,2120096161u,2121791101u,2121877801u,2123334577u,2123601751u,2124078653u,2124691213u,2125053721u,2125367881u,2126689501u,2126800747u,2127197489u,2127768481u,2127818161u,2128104001u,2128392631u,2128719853u,2128925989u,2129304997u,
	2129331241u,2129796781u,2130134533u,2130804481u,2131004737u,2131811501u,2134906957u,2136546061u,2137052881u,2139155051u,2140082101u,2140483381u,2140538401u,2140699681u,2140771609u,2141236021u,2141340833u,2141576599u,2141744221u,2141843041u,2142324367u,2144961253u,2146582621u,2147022749u,2147291821u,2147418113u,2147429509u,2147744041u,2148929761u,2150812709u,2150849317u,2151835381u,2152627801u,2154446641u,2155046141u,2155416251u,2156100421u,2156151313u,2158577569u,2159003281u,2159678053u,2160272353u,2161342261u,2163951217u,2164282177u,2164862701u,2165183077u,2165266621u,2165571857u,2166133001u,2166414851u,2167577527u,2168431201u,2168869949u,2169158479u,2169278881u,2170186021u,2170282969u,2171029609u,2172155819u,2173227301u,2173499329u,2173540951u,2173579801u,2174349029u,2174837281u,2175126601u,2175406201u,2175646177u,2177374321u,2177537543u,2177645557u,2177694541u,2178082901u,2178672517u,2178939221u,2178944461u,2179389889u,2179515601u,2180221201u,2182200907u,2182281601u,2182693717u,2182802689u,2183385271u,2184384961u,2185223041u,2185362233u,2186762569u,2187717761u,2188955501u,2190477121u,2193980881u,2194205311u,2194228201u,2194363861u,2195201321u,2195607943u,2195768161u,2195936501u,
	2197126009u,2199617701u,2199700321u,2199931651u,2200115713u,2201169601u,2201474969u,2201924341u,2202101761u,2202205897u,2202735301u,2203226131u,2203539433u,2203649197u,2203856497u,2204338501u,2205160651u,2206095589u,2209642849u,2210578759u,2212405273u,2213431729u,2214761251u,2215407601u,2215782997u,2216255041u,2216430721u,2216960929u,2217299821u,2217708769u,2217879901u,2217951073u,2218767773u,2218852801u,2219069281u,2219072017u,2220067081u,2220157681u,2221010749u,2222229767u,2223876601u,2224252801u,2224278253u,2224519921u,2228123521u,2229468697u,2230305949u,2231332357u,2232701251u,2233031701u,2233186201u,2233511281u,2234003857u,2239622113u,2240507821u,2241880033u,2241982009u,2243686501u,2244048913u,2244356401u,2244932281u,2245519981u,2245804561u,2245921201u,2246762899u,2246796721u,2246916001u,2248354153u,2248929271u,2249509159u,2249681281u,2249831201u,2251732033u,2254314241u,2254757077u,2254796281u,2255172841u,2255274001u,2255678881u,2256197761u,2256653161u,2256748777u,2256751837u,2258118721u,2260569001u,2260734301u,2262315823u,2262861901u,2264617261u,2265650401u,2266645757u,2268655381u,2269307587u,2269348171u,2270483281u,2272748401u,2274584089u,2276463553u,2276530261u,2278091971u,
	2278677961u,2279223661u,2280284137u,2280630241u,2280656953u,2282310577u,2283289681u,2284148113u,2284416181u,2284569169u,2284660351u,2284775533u,2286701621u,2287788841u,2288451961u,2289251669u,2289624793u,2290316377u,2290910257u,2291205461u,2292068143u,2293939117u,2295209281u,2296894753u,2296995121u,2298291121u,2298727121u,2299190401u,2299876417u,2300628601u,2300795353u,2301292753u,2301745249u,2301828211u,2301931801u,2302024969u,2302419601u,2302695253u,2303036741u,2303523541u,2303611381u,2303681821u,2304120001u,2304344813u,2304710123u,2305087201u,2305360981u,2305787881u,2306909593u,2308955947u,2308966661u,2309027281u,2309241601u,2309405617u,2311558021u,2311575001u,2312542981u,2313774001u,2314756081u,2315137261u,2315727601u,2315820001u,2316123181u,2317802761u,2318497201u,2319724441u,2320224481u,2320527613u,2320690177u,2321591371u,2322648901u,2323147201u,2323329821u,2323952161u,2324742421u,2324799049u,2324867399u,2325338821u,2326319101u,2329267501u,2329584217u,2330569541u,2331181621u,2332301401u,2332627249u,2335341601u,2335379707u,2335640077u,2338157597u,2338728001u,2339165521u,2339464681u,2340460487u,2341131967u,2341590901u,2342644921u,2342993797u,2343710101u,2344310101u,2344578077u,
	2345651137u,2345907961u,2346273961u,2347597981u,2347910461u,2348226091u,2349467389u,2349644449u,2351311921u,2352371251u,2352960361u,2353309981u,2353548601u,2353639681u,2354453561u,2355230749u,2355320101u,2355622721u,2355649921u,2355735089u,2356272367u,2357292421u,2358534361u,2358622897u,2359020301u,2359147141u,2359686241u,2360261989u,2361232477u,2361669521u,2362421953u,2366169241u,2366765851u,2367379201u,2368671661u,2370163001u,2370275041u,2370771181u,2370928337u,2371036361u,2371350101u,2371681081u,2372122963u,2372976563u,2373987781u,2374232977u,2375415841u,2377166401u,2377871941u,2378309041u,2380339837u,2380603501u,2381782597u,2382364601u,2382678101u,2383164577u,2383939741u,2384234281u,2384804129u,2385370261u,2385574201u,2385911809u,2389072321u,2389544977u,2391137281u,2392008631u,2393708761u,2394311233u,2394852601u,2395190161u,2396357041u,2397946357u,2397958381u,2398393661u,2398798801u,2401060117u,2401166041u,2401992349u,2402976967u,2404912501u,2405599561u,2405665297u,2407276081u,2407532629u,2411128441u,2411277019u,2412172153u,2412675721u,2413973071u,2414167741u,2414829781u,2418525607u,2421244921u,2422296241u,2423011501u,2423401681u,2423727433u,2425053643u,2425249601u,2426927329u,
	2427180301u,2428648967u,2428870753u,2428986913u,2429123761u,2429407961u,2430556381u,2430697513u,2430813001u,2431136401u,2431144801u,2432761633u,2432860273u,2433791593u,2433943891u,2434785571u,2434964321u,2434974433u,2435091221u,2436691321u,2436927907u,2437711381u,2437907779u,2438403661u,2438778413u,2439162433u,2442050353u,2442115021u,2442454561u,2443205821u,2443708961u,2443829641u,2444950561u,2445236353u,2448039497u,2448374689u,2449037593u,2449452361u,2449575001u,2449637281u,2449856317u,2449977757u,2450701501u,2452396871u,2453212081u,2453473049u,2453931649u,2454285751u,2455563041u,2456536681u,2457846161u,2459217349u,2459637181u,2461902001u,2462440753u,2463491233u,2463713281u,2463980401u,2467813621u,2468730097u,2470348441u,2470397329u,2470869253u,2470894273u,2471205361u,2473120961u,2473189441u,2473616017u,2473823353u,2474308069u,2474676949u,2475994501u,2476283239u,2477771731u,2477814193u,2478643907u,2480097421u,2480147521u,2480343553u,2482435981u,2482682131u,2484033481u,2484408301u,2485414801u,2486017249u,2486988361u,2488420801u,2488507201u,2488591117u,2489462641u,2490707401u,2490994549u,2492474401u,2492480233u,2494465921u,2494660033u,2494984321u,2495834329u,2497638781u,2498508937u,
	2499327041u,2500452361u,2501012599u,2501091451u,2501748901u,2501771329u,2502274321u,2502317413u,2502525637u,2503908433u,2504008609u,2506087441u,2506529257u,2506733189u,2507121037u,2507909851u,2508178843u,2509198669u,2509698601u,2509860961u,2510085721u,2510363909u,2513230891u,2514105301u,2514589561u,2514929581u,2516684801u,2517516001u,2517572671u,2517767473u,2519297089u,2519621857u,2519819281u,2519926201u,2521333801u,2522790721u,2523182101u,2523193417u,2523476981u,2523947041u,2524474297u,2525070241u,2526525001u,2526566041u,2526678901u,2526866021u,2527395421u,2527812001u,2528291341u,2529410281u,2529827821u,2529854713u,2530351561u,2531277001u,2531449921u,2531575201u,2532226393u,2532559393u,2532630787u,2533465661u,2533797017u,2535516173u,2536014041u,2536808941u,2537105761u,2538074701u,2538376441u,2539024741u,2539406281u,2539736257u,2539967281u,2540469901u,2541660367u,2542479481u,2543019301u,2544020641u,2544590161u,2545660981u,2545934077u,2547492713u,2547621973u,2547796681u,2548051801u,2548840801u,2549080981u,2550139253u,2550780277u,2551365769u,2552418761u,2552575159u,2552782849u,2552791969u,2553272929u,2554610521u,2555219713u,2555391481u,2558637901u,2560104001u,2560600351u,2560891327u,
	2561945401u,2562814621u,2564272621u,2564536201u,2564889601u,2565186137u,2566731217u,2567761057u,2568157801u,2568646001u,2569316113u,2570087521u,2570239441u,2571180247u,2572432801u,2573073721u,2573686441u,2574243721u,2575060949u,2575260241u,2575737361u,2576686861u,2577345541u,2577477889u,2577770561u,2579288041u,2582092189u,2582246701u,2582863921u,2582952769u,2583322381u,2584115101u,2584460701u,2585810161u,2586927553u,2587287001u,2588054401u,2588218777u,2588333293u,2588582089u,2588653081u,2589001309u,2590120501u,2590323877u,2590663681u,2592565417u,2593065721u,2593070011u,2593182901u,2593490153u,2594603437u,2595276353u,2596671001u,2597289241u,2597294701u,2597928961u,2597953639u,2598933481u,2599972453u,2600611861u,2601144001u,2602343521u,2602378721u,2603237713u,2604465013u,2604803701u,2605557781u,2607056761u,2607162961u,2607237361u,2609525161u,2609814313u,2611122229u,2611461529u,2612738161u,2613382201u,2614688801u,2616180821u,2616662881u,2617181281u,2617563031u,2617882807u,2620066681u,2621080741u,2621977627u,2622124597u,2622993661u,2623338001u,2624513401u,2624549929u,2625665701u,2625903601u,2626783921u,2626987277u,2627169121u,2627284987u,2629554353u,2629564561u,2629870153u,2630374741u,
	2630643401u,2631365281u,2632022009u,2632605049u,2634284801u,2634804481u,2634820813u,2635031701u,2635245361u,2638067881u,2639099233u,2642025673u,2642159809u,2642582251u,2643735931u,2645695781u,2646751249u,2646790033u,2648662777u,2648671957u,2649907201u,2650820329u,2651412401u,2651507713u,2654176861u,2654716321u,2654851387u,2656296091u,2656494271u,2657502001u,2658630913u,2658696301u,2659265701u,2659745089u,2659939453u,2660336701u,2661150961u,2661744961u,2662372621u,2662524361u,2662700041u,2662709641u,2663616901u,2664020341u,2665141921u,2668095181u,2668469431u,2670187297u,2670972949u,2672236801u,2672605657u,2672651521u,2674021681u,2676053333u,2677147201u,2677821121u,2678785621u,2678867821u,2679197521u,2680883617u,2680980823u,2681041843u,2682823681u,2683078273u,2683256401u,2683742491u,2684284441u,2685422593u,2685856537u,2687655169u,2688124001u,2688238471u,2689248241u,2689427281u,2690408533u,2690867401u,2690902201u,2692079167u,2693302921u,2693739751u,2693939401u,2694515761u,2695064653u,2695115473u,2696970277u,2700582301u,2700818017u,2700891839u,2701479001u,2701878941u,2702470861u,2704546027u,2704957909u,2705647141u,2705912357u,2706863833u,2707661501u,2708811001u,2708826841u,2709611521u,
	2710638353u,2711314801u,2713095841u,2715586237u,2716157989u,2716275007u,2716368733u,2717428033u,2718074881u,2718920101u,2719319513u,2719940041u,2720754401u,2721666817u,2721721939u,2721727801u,2723194601u,2723330701u,2723753341u,2723859001u,2724552907u,2725357249u,2725818481u,2726438581u,2726640001u,2727014851u,2728102651u,2730680821u,2732475241u,2732995387u,2733156029u,2733329101u,2733494401u,2733504737u,2735309521u,2736316301u,2737916641u,2738184697u,2738645521u,2740336561u,2740575121u,2740576313u,2741749889u,2741814361u,2741937199u,2741992201u,2742234841u,2742823681u,2744329909u,2744634061u,2746021741u,2748148261u,2749015171u,2749138393u,2750055841u,2751708961u,2753333227u,2753538001u,2753722153u,2757680773u,2758158601u,2759392633u,2765323397u,2766006253u,2766065681u,2766172501u,2766901501u,2767672189u,2768304961u,2769080161u,2769602333u,2770560241u,2770687921u,2774295577u,2775563029u,2775827921u,2776874941u,2777887297u,2778304273u,2778517681u,2778813041u,2779302241u,2779477741u,2781117721u,2781226477u,2783289961u,2783626249u,2783647441u,2785732741u,2786028337u,2787998641u,2789218909u,2789540051u,2791053541u,2791678681u,2791893511u,2794689601u,2794946701u,2797002901u,2799557641u,
	2800048717u,2800352011u,2801124001u,2802534877u,2804551741u,2805762961u,2806205689u,2806759801u,2807723737u,2809635901u,2810596741u,2810679481u,2811315361u,2812672981u,2813372869u,2813594521u,2814748201u,2815304401u,2817814001u,2820490961u,2823570433u,2823624181u,2823851773u,2823996961u,2824256377u,2824804693u,2824854913u,2825939801u,2827031221u,2827131841u,2828205397u,2828435941u,2830122181u,2830242961u,2831134657u,2831510881u,2831801401u,2832384133u,2832480001u,2832743713u,2833704841u,2833846561u,2835565063u,2837373883u,2837697773u,2837917633u,2839343689u,2840634109u,2840871041u,2841190381u,2841474601u,2841642301u,2842912381u,2844725941u,2845651033u,2846470453u,2847894377u,2848466281u,2848621987u,2848722131u,2849718781u,2849949991u,2852595517u,2852990449u,2855046421u,2855071801u,2855512909u,2858298301u,2860516693u,2861403841u,2862066481u,2863069201u,2863404727u,2864884771u,2865483601u,2866005139u,2866028551u,2866527841u,2866872781u,2867755969u,2869121641u,2870377309u,2871536561u,2872327321u,2872527733u,2872948321u,2874382853u,2876065951u,2877769501u,2879499169u,2880052969u,2881429741u,2882370481u,2883582901u,2884418629u,2885594497u,2885966161u,2885972701u,2886284401u,2887282129u,
	2887955533u,2890316801u,2890414873u,2892426029u,2893195201u,2894667781u,2895004927u,2898058801u,2899294889u,2899527661u,2899835959u,2900730001u,2903776129u,2906853349u,2914350209u,2915953633u,2916247819u,2918295451u,2919550393u,2919669901u,2920085641u,2920691161u,2923042141u,2923286953u,2923693501u,2924158001u,2924317621u,2924708401u,2927264341u,2928676747u,2929062533u,2929106753u,2929239937u,2930420351u,2930570501u,2930831641u,2931690049u,2931708097u,2932327549u,2932664803u,2933809621u,2933894341u,2934429961u,2936227603u,2936491273u,2936958181u,2937299663u,2939179643u,2941174897u,2941343633u,2942328961u,2942952481u,2943030781u,2943556201u,2944266733u,2944555681u,2944677961u,2945208001u,2945517433u,2945549881u,2947521721u,2948225281u,2948673901u,2950375561u,2951104333u,2951136343u,2951771761u,2954187413u,2955113161u,2956210741u,2956724317u,2957155441u,2957320351u,2958697897u,2959558561u,2961709741u,2964816109u,2964930697u,2965085641u,2965700233u,2966800033u,2967053953u,2967689941u,2968206601u,2969736001u,2974312081u,2974506841u,2975096953u,2975377429u,2976929281u,2976930001u,2977476989u,2978096701u,2978196001u,2978766341u,2979186421u,2979367561u,2980689601u,2980916821u,2982028981u,
	2982072601u,2982283951u,2984619001u,2986025677u,2986570081u,2987230541u,2987414977u,2988134717u,2988607051u,2989297741u,2990152901u,2993462713u,2993495041u,2993666521u,2994098281u,2994415201u,2998202353u,2998467901u,2998850401u,2998919873u,2999691217u,3000688381u,3001561441u,3002281981u,3002647829u,3002823901u,3003310927u,3004007231u,3004443679u,3006704701u,3007909837u,3007991701u,3008110141u,3009628301u,3009756751u,3010328101u,3011304901u,3011421841u,3012089203u,3012624161u,3013708057u,3014101261u,3015502181u,3016350457u,3016957381u,3017144701u,3017444761u,3017887471u,3017920123u,3018147217u,3018576689u,3018720481u,3019333681u,3019916461u,3021190921u,3023095511u,3024108001u,3024774901u,3025350343u,3025708561u,3026575553u,3028586471u,3029349073u,3030393901u,3030469201u,3030758401u,3030994513u,3031135141u,3033332281u,3033332641u,3033369991u,3034203361u,3034402681u,3034817209u,3035175121u,3035375047u,3035837161u,3036079729u,3036809483u,3037203431u,3037295801u,3037781251u,3038190337u,3038880473u,3039463801u,3039681457u,3041984353u,3042630533u,3042763501u,3043917001u,3044238121u,3044430361u,3044970001u,3045287797u,3047241553u,3047896621u,3048014201u,3048159841u,3048432241u,3048467381u,
	3048928861u,3048937849u,3049386451u,3049610257u,3050190163u,3050401681u,3050533861u,3051569233u,3051985681u,3052082593u,3052234081u,3053183869u,3053783161u,3056100623u,3056160929u,3057111721u,3057886591u,3058670677u,3059251301u,3059397793u,3060527041u,3063685633u,3063875881u,3065023921u,3065998717u,3066077161u,3066671479u,3068106301u,3068534701u,3069196417u,3070465561u,3072080089u,3072094201u,3072578741u,3072942211u,3073935481u,3075075901u,3075098041u,3076070257u,3076505209u,3077122133u,3077220301u,3077802001u,3078386641u,3079496551u,3080954449u,3081886603u,3082054697u,3082068013u,3082246801u,3083053387u,3083537689u,3083884651u,3084642001u,3085326181u,3086414761u,3086434561u,3088134721u,3088408429u,3089013313u,3090578401u,3091019777u,3092164069u,3093256537u,3093959257u,3094763851u,3097001401u,3097527577u,3099670657u,3100791421u,3102175297u,3102234751u,3103800701u,3104207821u,3105567361u,3105710609u,3107812849u,3112471297u,3112955209u,3112974481u,3114125071u,3114343441u,3114896689u,3115667521u,3116438417u,3116456071u,3117899161u,3118328161u,3118762921u,3119101921u,3120445697u,3120891757u,3121279201u,3121418161u,3122001001u,3122287981u,3127462001u,3128623561u,3129914881u,3131816689u,
	3132198001u,3132209761u,3133899409u,3135040133u,3136813417u,3138302401u,3139995937u,3140524801u,3141144721u,3143282221u,3144216673u,3145410761u,3147032701u,3149633341u,3149833609u,3150972917u,3151546609u,3154371361u,3155747869u,3156599161u,3156643141u,3157579861u,3157741801u,3158553853u,3159919621u,3159939601u,3160342417u,3163106953u,3163296193u,3164060809u,3164207761u,3165237361u,3165594769u,3166504273u,3167442721u,3167795521u,3169830751u,3170262409u,3172658653u,3172880101u,3175204531u,3175255717u,3176208541u,3176257481u,3177449713u,3178375201u,3179632801u,3180632833u,3181356263u,3181391641u,3182606857u,3182655361u,3182891401u,3184139701u,3185472001u,3185571181u,3185704801u,3186499009u,3187035113u,3187421077u,3187939921u,3189888001u,3190894201u,3191212729u,3193382881u,3193414093u,3195176401u,3195867961u,3196344061u,3196397821u,3196431829u,3197565001u,3197632441u,3197911001u,3197911741u,3198074423u,3199081621u,3199164901u,3199264201u,3199915981u,3203380891u,3203895601u,3205663921u,3206193277u,3207297773u,3207744541u,3208806001u,3208902491u,3210950341u,3212465437u,3213007669u,3213538651u,3214169201u,3215031751u,3215441951u,3217412881u,3218502001u,3219767461u,3219808411u,3220158517u,
	3221580281u,3222693421u,3222889921u,3223077001u,3223878001u,3224143441u,3225020401u,3225049381u,3225081473u,3226002913u,3227082823u,3227209057u,3227618521u,3228648151u,3229131137u,3230915377u,3231743593u,3232060741u,3232475401u,3233558021u,3237992101u,3238307941u,3240392401u,3242533897u,3243596491u,3243805111u,3244517521u,3244709161u,3245204161u,3245477761u,3246206701u,3246238801u,3248236309u,3248313913u,3248891101u,3249258013u,3249266749u,3250348417u,3250552201u,3250700737u,3252148621u,3253665241u,3254927089u,3257334541u,3258647809u,3258892801u,3261114601u,3262284721u,3262472797u,3263097641u,3263568901u,3263626957u,3264628081u,3264820001u,3265122451u,3267417677u,3267776161u,3268506541u,3268841941u,3270933121u,3271076771u,3271999249u,3272030401u,3272256001u,3272702497u,3273449257u,3274264033u,3275218171u,3275671969u,3276075709u,3276102787u,3276238333u,3277047649u,3277653133u,3278640289u,3280067129u,3280593611u,3281736601u,3282974857u,3284228881u,3284630713u,3285566101u,3285725701u,3287174129u,3288757249u,3290428801u,3292356943u,3294029377u,3295362727u,3296403601u,3297427561u,3299246833u,3299956747u,3301164901u,3301335577u,3302322241u,3304307341u,3305644849u,3305829073u,3306393751u,
	3306686659u,3306957593u,3307124821u,3308325301u,3309632321u,3310858777u,3311484121u,3312489577u,3312536569u,3313196881u,3313744561u,3314111761u,3315139717u,3316525801u,3316579981u,3319323601u,3320669437u,3323308501u,3323590463u,3323829169u,3326617057u,3326665537u,3326971301u,3327488893u,3328354801u,3328437481u,3329284561u,3330883781u,3331422343u,3332101201u,3332184841u,3332800021u,3333016177u,3334350781u,3335190001u,3336236461u,3336384031u,3337776883u,3339299881u,3340160881u,3340214413u,3341638441u,3342005633u,3342769921u,3342871081u,3343770001u,3343969981u,3344191241u,3345585407u,3345878017u,3346172189u,3347570941u,3347908801u,3348140401u,3348271081u,3349218881u,3349275151u,3349340611u,3350342477u,3350993969u,3352091557u,3353166181u,3353809537u,3355382857u,3355610977u,3355953001u,3356529001u,3357417181u,3359737921u,3360511981u,3361897441u,3363360841u,3366862633u,3369139201u,3369251047u,3370514011u,3371024521u,3371452921u,3371693063u,3372667121u,3373086601u,3373454161u,3373684081u,3374598781u,3377265607u,3378014641u,3380740301u,3381052177u,3381901921u,3385842877u,3386603221u,3387014401u,3387487351u,3388007161u,3388350181u,3389030261u,3394104121u,3394456381u,3395091311u,3399205591u,
	3399890413u,3402234749u,3404705473u,3405294361u,3406329577u,3407609221u,3407652721u,3407772817u,3407952169u,3408135121u,3408200641u,3409339393u,3409497181u,3411250081u,3411338491u,3411574801u,3411829693u,3412575097u,3413440801u,3413656441u,3413894617u,3414918001u,3415025629u,3415379701u,3415832137u,3417522841u,3419588767u,3420143941u,3421044901u,3421562401u,3421845001u,3423222757u,3423580481u,3424593481u,3427038721u,3427050673u,3428133103u,3429457921u,3429982081u,3430422407u,3430804297u,3431460781u,3432192961u,3432695921u,3432997537u,3433439341u,3433458073u,3434575327u,3435973837u,3438709681u,3438721441u,3439583521u,3439633681u,3440195713u,3441837421u,3441861481u,3442631257u,3443019667u,3443704261u,3445230661u,3447654211u,3448817281u,3449768513u,3450717901u,3453354607u,3453900913u,3458257741u,3461605561u,3461861761u,3462426241u,3463737019u,3463907761u,3464236901u,3465389899u,3466026061u,3466158361u,3466560241u,3468903571u,3470006161u,3470716657u,3471001777u,3474335437u,3476582701u,3478650721u,3479004107u,3479711509u,3479716321u,3480174001u,3481937341u,3482161261u,3482948917u,3483556021u,3483871969u,3485027911u,3485607139u,3485747521u,3487441321u,3488303281u,3489958697u,3490107373u,
	3491763493u,3492178873u,3492883081u,3493262761u,3493832401u,3495447929u,3495943153u,3496558301u,3496727521u,3497607433u,3499095601u,3499146721u,3501194797u,3502404577u,3502454401u,3502917061u,3504132113u,3504570301u,3504722521u,3505631677u,3508507801u,3512030497u,3512291021u,3512369857u,3512949529u,3513604657u,3515023801u,3516565057u,3517415749u,3519318721u,3520175581u,3520491283u,3520909237u,3520934177u,3521945041u,3523954141u,3524086333u,3525088961u,3529119361u,3529669501u,3529864391u,3532687201u,3533662129u,3533856913u,3534510001u,3535036441u,3535644241u,3535849801u,3536476021u,3537280361u,3538213381u,3542303047u,3543203333u,3543220801u,3543755801u,3544181551u,3547777349u,3548378341u,3549286001u,3549988261u,3552158521u,3553567057u,3553728841u,3554383141u,3555636481u,3556116361u,3557646401u,3559062241u,3560114681u,3561826129u,3562963973u,3563021449u,3563340457u,3566428301u,3567688213u,3568781399u,3571146091u,3571451677u,3572979481u,3573005701u,3573842581u,3574532221u,3574891757u,3575706133u,3576237301u,3576804001u,3576818293u,3577228201u,3577288121u,3577354741u,3577836991u,3578189341u,3579028597u,3579288691u,3579940513u,3582711841u,3583249921u,3583604161u,3584800801u,3586143541u,
	3586803661u,3586833253u,3587553971u,3587802901u,3588111721u,3589937261u,3590409439u,3593259181u,3593276353u,3593464957u,3593541601u,3594110081u,3594300841u,3594968821u,3595600801u,3595874801u,3596491907u,3596815169u,3597270517u,3597761761u,3598497631u,3598772761u,3600918181u,3602006101u,3602171311u,3602890507u,3603030637u,3604182121u,3604213909u,3605151241u,3607369481u,3607806313u,3611571121u,3612298321u,3612825221u,3614770573u,3615565681u,3616574081u,3618244081u,3618918001u,3620631169u,3624041701u,3627991081u,3628512301u,3628526287u,3629431741u,3630291841u,3630596257u,3631828481u,3632452741u,3634571473u,3635771833u,3635993089u,3636657361u,3637718281u,3637831753u,3639975553u,3640175341u,3642747313u,3649116277u,3649180601u,3649965281u,3650158849u,3651572609u,3653803501u,3656355841u,3658678849u,3658730893u,3658741021u,3662387977u,3662503093u,3663084541u,3663549001u,3664146889u,3665242951u,3665439361u,3666600593u,3666709201u,3668926801u,3669587533u,3670572517u,3672754633u,3673078513u,3674917139u,3677108161u,3677180797u,3679657997u,3680455117u,3680845579u,3681257581u,3681626467u,3682471321u,3684201751u,3685480201u,3685647701u,3685775741u,3692307161u,3692934001u,3693853801u,3695628133u,
	3697278427u,3697673959u,3697952401u,3700801861u,3704091751u,3705582073u,3705623281u,3705947857u,3707230429u,3708123301u,3708905341u,3709626961u,3710144201u,3711367621u,3711456001u,3712280041u,3712887289u,3713287801u,3713332321u,3713448769u,3715938721u,3716344201u,3717981001u,3718226401u,3721486081u,3722793481u,3723410161u,3723699373u,3725016749u,3725696141u,3727589761u,3727828501u,3728463481u,3729097633u,3732347521u,3732472261u,3733761739u,3736293461u,3741762181u,3742120801u,3745081901u,3745192001u,3746082901u,3746101189u,3746734201u,3747356581u,3748606369u,3749383681u,3750270721u,3751005457u,3751554581u,3751782737u,3753602191u,3753861707u,3754483201u,3754680403u,3756668401u,3759781369u,3760622689u,3760896133u,3762110881u,3767640601u,3767865601u,3770496473u,3772354483u,3773061337u,3774337201u,3776698801u,3778341121u,3782157841u,3782625301u,3783166381u,3783722701u,3784123501u,3784755151u,3786262241u,3787491457u,3789787501u,3791614681u,3791707327u,3791867821u,3792666241u,3793746853u,3794104987u,3795206827u,3796118087u,3796666237u,3798040471u,3798626833u,3799111681u,3800084401u,3800513761u,3801823441u,3804261733u,3805181281u,3805699501u,3806669491u,3807044251u,3807112123u,3807308269u,
	3807749821u,3809018947u,3809423801u,3813919453u,3815417629u,3815910673u,3815945869u,3817422181u,3817561777u,3817706621u,3819502651u,3820084741u,3820463101u,3821233121u,3822308461u,3824601781u,3827035237u,3831667313u,3832413097u,3832559641u,3832646221u,3832807681u,3833208961u,3834444901u,3835537861u,3835591921u,3839716441u,3840473737u,3842941741u,3843146341u,3844074781u,3846174151u,3846532801u,3846762601u,3847106803u,3847985837u,3849809041u,3850058689u,3852800033u,3853584967u,3854657521u,3856609531u,3858853681u,3859004501u,3861601129u,3862404013u,3863326897u,3865604023u,3866389033u,3867183937u,3868602661u,3870481681u,3870667321u,3871693189u,3872902051u,3874471147u,3874523017u,3875096893u,3875965417u,3876602521u,3876859351u,3880251649u,3881445751u,3883439647u,3884747581u,3886515361u,3886643801u,3887423437u,3887635753u,3888441001u,3890243701u,3890462401u,3890619097u,3891209851u,3891338101u,3891892421u,3891919417u,3892244491u,3892863301u,3893670001u,3893979941u,3894053311u,3894518881u,3896079281u,3897197641u,3897241129u,3897869201u,3898857757u,3898906129u,3900327241u,3901632313u,3901730401u,3901871161u,3902738581u,3903543281u,3903711841u,3904576201u,3905533721u,3905876501u,3906869461u,
	3907357441u,3907577521u,3907752241u,3910414201u,3911700001u,3912174421u,3914015281u,3914864773u,3914880337u,3914923211u,3915467341u,3915604421u,3915826433u,3915921241u,3916203721u,3916342789u,3917319841u,3918227437u,3922321561u,3922752121u,3923817397u,3923905321u,3924718057u,3924721729u,3925683973u,3926200321u,3926912669u,3927284131u,3928256641u,3929293061u,3929584681u,3933485501u,3934940833u,3935864017u,3936123601u,3936927457u,3939817201u,3940139641u,3942955081u,3944712521u,3945165841u,3945322601u,3947233201u,3947383201u,3949249501u,3951382753u,3951813601u,3953408801u,3953949421u,3955572001u,3955764121u,3956938921u,3957731281u,3958597301u,3958930441u,3959157217u,3959578801u,3959921521u,3960728641u,3962037061u,3962786213u,3964790509u,3966350203u,3966877883u,3967343161u,3969787267u,3971095301u,3971294713u,3972960433u,3973396141u,3973548601u,3973556837u,3975414301u,3978028421u,3979485931u,3981047941u,3982017601u,3982156993u,3986835121u,3987528793u,3987960913u,3988551763u,3989570761u,3990268261u,3991124341u,3992697997u,3993956353u,3994051321u,3996987251u,3997536427u,3998275849u,3998554561u,4004179201u,4004332417u,4005660961u,4007365741u,4008224221u,4008263401u,4011996871u,4013467081u,
	4014932857u,4014986977u,4015029061u,4015548769u,4016302681u,4017684529u,4018283501u,4019646961u,4020144133u,4020441283u,4023179551u,4023453889u,4026597361u,4026684481u,4026822577u,4027012021u,4027518961u,4028465873u,4028771849u,4030864201u,4031223841u,4033380097u,4033491991u,4033542721u,4034969401u,4034993269u,4035028861u,4035498409u,4036395581u,4037628013u,4040676121u,4042538497u,4044408817u,4044601751u,4044884689u,4047749641u,4048493983u,4051598189u,4051907041u,4053249523u,4053267217u,4054039841u,4057195309u,4057697947u,4058114581u,4058433931u,4059151489u,4059776533u,4060942381u,4061009971u,4064633821u,4065120241u,4065133501u,4065714161u,4067039461u,4067887501u,4068671881u,4071644893u,4072037887u,4074585313u,4075241633u,4075721921u,4076009857u,4076107811u,4077957961u,4079665633u,4079682361u,4083376067u,4085074909u,4085426521u,4086968981u,4087390501u,4088147617u,4088656469u,4088838913u,4092929149u,4093143601u,4093466677u,4094183413u,4096122001u,4098208297u,4098254161u,4098258707u,4099180801u,4099303297u,4100934241u,4103745689u,4105691393u,4106693701u,4108970251u,4109400127u,4109461709u,4109711581u,4110320663u,4111149169u,4113013141u,4113333001u,4113586837u,4113742057u,4115677501u,
	4115891893u,4117058221u,4117447441u,4119741001u,4121286907u,4123265041u,4126312891u,4126424287u,4126852801u,4126884301u,4127050621u,4127696731u,4128469381u,4129891501u,4129914673u,4131665461u,4132720909u,4133048921u,4133928761u,4134204361u,4134273793u,4134696677u,4135847101u,4136916001u,4137262541u,4138747921u,4138838401u,4139015987u,4142256301u,4142655217u,4144596091u,4145196433u,4146685921u,4147919821u,4150174393u,4151084497u,4153142449u,4153689541u,4153758071u,4155375349u,4157008813u,4157652619u,4160472121u,4160523361u,4162880401u,4166032873u,4167025201u,4167038161u,4169092201u,4169867689u,4172804707u,4173482881u,4173838327u,4176142891u,4176385921u,4176538241u,4181350177u,4182995521u,4183664101u,4185636781u,4186561633u,4187360341u,4189357987u,4189909501u,4191864013u,4192060699u,4193496803u,4194144361u,4195300681u,4195843037u,4196323561u,4197177629u,4199202001u,4199246161u,4199529601u,4199612257u,4199932801u,4201014949u,4201794451u,4202009461u,4202842561u,4204344601u,4205237201u,4206006229u,4206295433u,4210095667u,4210922233u,4211044201u,4211640811u,4211747581u,4212105409u,4212413569u,4212665491u,4215885697u,4216799521u,4217128021u,4217502449u,4218773491u,4218900001u,4219849297u,
	4220122321u,4223060047u,4231459597u,4231512709u,4231653481u,4231678933u,4231686061u,4232966251u,4233223501u,4234009397u,4234223701u,4234224601u,4235818861u,4235887891u,4237212061u,4239462241u,4241343151u,4241478913u,4243744201u,4244022301u,4244657077u,4244663651u,4246462657u,4246910641u,4247772913u,4247990917u,4250920459u,4251904273u,4253605201u,4255288237u,4255695013u,4255766969u,4257003353u,4257672049u,4261352869u,4261932001u,4265537221u,4265864281u,4267277291u,4268877361u,4269382741u,4271267333u,4271582809u,4275011401u,4276933201u,4277526901u,4277982241u,4278067921u,4278305651u,4279658761u,4281766711u,4282867213u,4283998993u,4284050473u,4285148981u,4286383201u,4286813749u,4288664869u,4289470021u,4289641621u,4289884201u,4289906089u,4293088801u,4293329041u,4294868509u,4294901761u
	};

	/* Return the heuristic-estimated number of Mersenne primes M(p) = 2^p-1 with exponent p in [plo,phi].
	From Chris Caldwell's http://primes.utm.edu/notes/faq/NextMersenne.html page, citing
	the 1980 Lenstra and Pomerance heuristic analyses: "The probability that 2^p-1 is prime
	is about (e^gamma log ap )/(p log 2) where a=2 if p=3 (mod 4) and a=6 if p=1 (mod 4)."
	We sum this probability over the odd primes of each residue class in the given interval.

	Inputs : plo,phi = inclusive lower and upper bounds of the exponent interval.
	Returns: the summed estimate for both residue classes; 0.0 if (phi < 3) or (phi < plo).
	Side effects: prints the prime counts, the two per-class estimates and the largest prime used to stdout.

	Notes:
	o p = 2 is never counted - only odd primes enter the sums;
	o Primes above 0xffffffe3 are not examined, since the sieve loop must keep (curr_p + 29) < 2^32;
	o NOTE(review): fbase2psp_idx is advanced without a bounds check - confirm the fbase2psp[]
	  table cannot be overrun for phi beyond its largest entry.
	*/
	double est_num_mp_in_interval(const uint32 plo, const uint32 phi)
	{
		const double iln2 = 1.0/LOG2, eGammaIln2 = 1.78107241799019798523*iln2;	// exp(0.57721566490153286060...)/log2
		double expNumPeq1mod4 = 0.0, expNumPeq3mod4 = 0.0;
		// Small-primes-sieving code ripped off from factor.c:
		const uint32 pdsum_8[8] = { 0, 2, 6, 8,12,18,20,26};
		uint32 curr_p,i,ihi,itmp32,maxp,nprime,neq1mod4 = 0,neq3mod4 = 0;
		uint32 pmax_used = 0;	// Largest prime actually added to the sums; stays 0 if none was
		uint32 fbase2psp_idx = 0;	// Index to next-expected Fermat base-2 pseudoprime in the precomputed table
		if((phi < 3) || (phi < plo)) return 0.0;
		// Pre-process p < 11, so can start loop with curr_p = 11 == 1 (mod 10), as required by twopmodq32_x8();
		// Note we wait to apply the const-multiplier eGammaIln2 until the final 2 summed estimates:
		nprime = 0;	// #odd primes used
		if((plo < 4) && (phi > 2)) { ++nprime;	++neq3mod4;	curr_p = 3;	pmax_used = curr_p;	expNumPeq3mod4 += log(2.0*curr_p)/curr_p; }
		if((plo < 6) && (phi > 4)) { ++nprime;	++neq1mod4;	curr_p = 5;	pmax_used = curr_p;	expNumPeq1mod4 += log(6.0*curr_p)/curr_p; }
		if((plo < 8) && (phi > 6)) { ++nprime;	++neq3mod4;	curr_p = 7;	pmax_used = curr_p;	expNumPeq3mod4 += log(2.0*curr_p)/curr_p; }
		/* Process chunks of length 30, starting with curr_p == 11 (mod 30). Applying the obvious
		divide-by-3,5 mini-sieve, have 8 candidates in each block: curr_p + [ 0, 2, 6, 8,12,18,20,26].
		For example: curr_p = 11 gives the 8 candidates: 11,13,17,19,23,29,31,37.
		*/
		maxp = MIN(phi,0xffffffe3);	// Make sure (curr_p + 29) < 2^32 in our loop
		for(curr_p = 11; curr_p <= maxp; curr_p += 30) {
			/* Do a quick Fermat base-2 compositeness test before invoking the more expensive mod operations;
			bit i of the result is tested below as the 'is a base-2 PRP' flag of the i-th candidate: */
			itmp32 = twopmodq32_x8(curr_p, curr_p+ 2, curr_p+ 6, curr_p+ 8, curr_p+12, curr_p+18, curr_p+20, curr_p+26);
			for(i = 0; i < 8; ++i) {
				if(!((itmp32 >> i)&0x1))
					continue;	// Composite: nothing to do
				// It's a PRP: check vs table of known pseudoprimes and (if it's not a PSP) treat it as prime:
				ASSERT(HERE, curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve");
				ihi = (curr_p + pdsum_8[i]);
				if(ihi == fbase2psp[fbase2psp_idx]) {	// It's a base-2 pseudoprime: step to the next table entry
					++fbase2psp_idx;
					continue;
				}
				// It's prime - but only use it if it lies in the target interval:
				if(ihi < plo) continue;
				if(ihi > maxp) break;	// Rest of this block is also > maxp, and the outer loop ends after it
				++nprime;
				pmax_used = ihi;	// Record only primes which really contribute, so the report below is accurate
			//	printf("At prime = %u, (mod 4) = %u\n",ihi,ihi&3);
				if((ihi&3) == 1) { ++neq1mod4;	expNumPeq1mod4 += log(6.0*ihi)/ihi; }
				if((ihi&3) == 3) { ++neq3mod4;	expNumPeq3mod4 += log(2.0*ihi)/ihi; }
			}
		}
		expNumPeq1mod4 *= eGammaIln2;	expNumPeq3mod4 *= eGammaIln2;
		printf("Using %u odd primes in [%u,%u], of which (%u,%u) == 1,3 (mod 4); Expected #Mp with p == 1,3 (mod 4) = %8.3f, %8.3f\n",nprime,plo,phi,neq1mod4,neq3mod4,expNumPeq1mod4,expNumPeq3mod4);
		printf("Max prime used = %u\n",pmax_used);
		return expNumPeq1mod4 + expNumPeq3mod4;
	}

	/* Ordinary linear least-squares fit of the line y = a*x + b to the points (x,y) = (i+1, lgp[i]),
	i = skip,...,n-1, i.e. the x-values use unit-offset indexing and the first [skip] points are omitted.

	Inputs : lgp[] = y-values; skip = #leading points to omit; n = total #points in lgp[].
	Outputs: *xavg,*yavg = sample means of the x- and y-values used; *a = slope, *b = y-intercept.
	The caller must make sure (n - skip) >= 2, otherwise the slope denominator is zero.

	Derivation: Assume best-fit is to line a*x+b, slope a and y-intercept b TBD.
	Each datapoint has x = [index of Mersenne number], hence exact, under assumption of no
	as-yet-undiscovered primes with p less than max_p of our known-exponents dataset. Ith point
	has 'error' w.r.to best-fit line measured via y-offset (as opposed to, say, normal distance,
	i.e. a total-least-squares approach as opposed to the 'ordinary' one here, which would be more
	appropriate for data with non-exact x-values), di := yi - (a*xi+b). We
	seek a,b such that the sum of the squares of the di-values for our dataset is minimized.

		S = sum_i [yi - (a*xi+b)]^2 = sum_i [yi^2 - 2*yi*(a*xi+b) + (a^2*xi^2 + 2*a*b*xi + b^2)] .

	Take partial derivative of S w.r.to a: dS/da = sum_i [-2*xi*yi + 2*a*xi^2 + 2*b*xi] = 0. [1]

	Take partial derivative of S w.r.to b: dS/db = sum_i [-2*yi + 2*a*xi + 2*b] = 0. [2]

	Since b appears in [2] unmultiplied by xi or yi, can easily solve for it: b = y@ - a*x@, [**]
	where @ denotes the sample mean of the qty in question: x@ = [sum_i xi]/n, y@ = [sum_i yi]/n .
	Substituting the expression for b [**] into [1] we can solve for the slope parameter:

		sum_i [-2*xi*yi + 2*a*xi^2 + 2*b*xi] = 0, div by 2 and sub for b:
	->	sum_i [-xi*yi + a*xi^2 + (y@ - a*x@)*xi] = 0
	->	sum_i [-xi*yi + a*xi^2 + y@*xi - a*x@*xi] = 0
	->	sum_i [-(yi - y@)*xi + a*(xi - x@)*xi] = 0, separate into 2 sums, pull a out of 2nd one and solve for it:

	a = sum_i [(yi - y@)*xi] / sum_i [(xi - x@)*xi]. [*]
	*/
	static void mers_lsq_fit(const double *lgp, const int skip, const int n, double *xavg, double *yavg, double *a, double *b)
	{
		int i;
		double xi, ysum = 0.0, num = 0.0, den = 0.0;
		for(i = skip; i < n; i++) {
			ysum += lgp[i];
		}
		*xavg = (1.0 + skip + n)/2;	// Mean of the integers skip+1,...,n (x uses unit offset)
		*yavg = ysum/(n - skip);
		for(i = skip; i < n; i++) {
			xi = i+1;	num += (lgp[i] - *yavg)*xi;	den += (xi - *xavg)*xi;
		}
		*a = num/den;	*b = *yavg - (*a)*(*xavg);	// Eqs. [*] and [**] above
	}

	/* Linear least-squares applied to lg(p) for known-M(p) exponents, as described at http://primes.utm.edu/notes/faq/NextMersenne.html
	Jan 2016: See http://www.mersenneforum.org/showthread.php?p=423266#post423266 for results based on latest, M#49

	Takes no inputs and returns nothing: the exponent table is hardcoded below (49 entries, ending with
	p = 74207281) and all results are printed to stdout. The fit is done 5 times: once for the full dataset,
	then omitting the smallest 10, 20, 30 and 40 exponents.
	*/
	void compute_mers_best_fit()
	{
		const double iln2 = 1.0/LOG2;
		double xavg, yavg, a,b,
			y[] = {2,3,5,7,13,17,19,31,61,89,107,127,521,607,1279,2203,2281,3217,4253,4423,9689,9941
			,11213,19937,21701,23209,44497,86243,110503,132049,216091,756839,859433,1257787,1398269,2976221,3021377,6972593
			,13466917,20996011,24036583,25964951,30402457,32582657,37156667,42643801,43112609,57885161,74207281,0.0};	// 0-terminated
		int i,n,skip,eq1mod4 = 0;
		const int starts_with_peq2 = (y[0]==2);	// p = 2 is in neither odd residue class; must be excluded from the p == 3 (mod 4) count
		// Count exponents == 1 (mod 4), then overwrite each p (stored as double in y-array) with lg(p):
		for(i = 0; y[i] != 0; i++) {
			eq1mod4 += ((uint32)y[i]&3) == 1;
		//	printf("p = %8u, p%%4 = %u\n",(uint32)y[i],(uint32)y[i]&3);
			y[i] = log(y[i])*iln2;
		}
		n = i;	// #datapoints = #entries preceding the 0-terminator
		printf("#M-prime exponents = %d, #==1,3 (mod4) = %d,%d\n",n,eq1mod4,(n-eq1mod4-starts_with_peq2));

		// Fit to the full dataset:
		mers_lsq_fit(y, 0, n, &xavg,&yavg, &a,&b);
		printf("Sample size = %d, xavg = %8.4f, yavg = %8.4f\n",n,xavg,yavg);
		printf("Least-squares of full %d-point dataset gives slope = %8.4f, y-intercept = %8.4f\n",n,a,b);

		// Now redo the linear regression omitting the smallest 10,20,30,40 M-exponents;
		// the (skip < n-1) clause guarantees at least 2 points remain for each fit:
		for(skip = 10; (skip <= 40) && (skip < n-1); skip += 10) {
			mers_lsq_fit(y, skip, n, &xavg,&yavg, &a,&b);
			printf("Omitting %d smallest M(p): Sample size = %d, xavg = %8.4f, yavg = %8.4f\n",skip,n-skip,xavg,yavg);
			printf("Least-squares omitting %d smallest M(p) gives slope = %8.4f, y-intercept = %8.4f\n",skip,a,b);
		}
	}

	/* Compare the smoothness of p-1 for the input exponent p against that of q-1 for all primes q
	in [p - 10000, p + 10000], and print where p ranks within that cohort.

	Input: p = exponent to examine; must exceed 100000 and pass a base-2 Fermat PRP test, else we ASSERT.

	For each prime j in the interval, j-1 is trial-factored using a table of the first 1000 odd primes,
	and two measures are stored in a struct psmooth:
	o .b = the last prime found to divide j-1, or the remaining cofactor if that drops below curr_p^2.
	       If the prime table is exhausted first, .b is just the last trial divisor;
	o .r = 'L2 smoothness' = sqrt(sum_f [log(f)/log(j-1)]^2)/nfac, with the sum running over the
	       prime factors f found, counted with multiplicity, and nfac the number of such factors.
	The cohort is then sorted with psmooth_cmp_b and psmooth_cmp_r (defined elsewhere - presumably
	ordering by .b and .r, respectively) and the 1-based rank and percentile of p are printed for each.

	Side effects: prints the factorization of every j-1 plus the summary lines to stdout.
	NOTE(review): neither fbase2psp_idx nor (p + pm_gap) is range-checked, so p very close to 2^32
	is not handled - confirm callers only pass exponents well below that.
	*/
	void test_mp_pm1_smooth(uint32 p)
	{
		double u_so_smoove, lnfac, ilogn, dtmp;
		const double ln2 = log(2.0);
		uint32 nprime = 1000, pm_gap = 10000, thresh = 100000;
		uint32 curr_p,fbase2psp_idx,i,ihi,itmp32,j,jlo,jhi,k,m,nfac,np,pm1;
		uint32 max_diff = 0;	// Largest (gap/2) found between adjacent odd primes; must start at 0 for the running-max below
		const uint32 pdiff_8[8] = {2,1,2,1,2,3,1,3}, pdsum_8[8] = { 0, 2, 6, 8,12,18,20,26};
		// .../10 here is an approximation based on prime density for primes > 100000;
		// note the code uses an interval [p-pm_gap, p+pm_gap], i.e. of length 2*pm_gap, so the calloc needs to be twice pm_gap/10:
		const uint32 max_np = 2*pm_gap/10;
		// Compact table storing the (difference/2) between adjacent odd primes.
		unsigned char *pdiff = (unsigned char *)calloc(nprime, sizeof(unsigned char));	// 1000 primes is plenty for this task
		// Struct used for storing smoothness data ... make big enough to store all primes in [p - pm_gap, p + pm_gap] with a safety factor
		struct psmooth sdat;
		struct psmooth*psmooth_vec = (struct psmooth *)calloc(max_np, sizeof(struct psmooth));
		ASSERT(HERE, pdiff != NULL && psmooth_vec != NULL, "calloc failed in test_mp_pm1_smooth!");

		/* Init first few diffs between 3/5, 5/7, 7/11, so can start loop with curr_p = 11 == 1 (mod 10), as required by twopmodq32_x8(): */
		pdiff[1] = pdiff[2] = 1;
		ihi = curr_p = 11;
		/* Process chunks of length 30, starting with curr_p == 11 (mod 30). Applying the obvious divide-by-3,5 mini-sieve,
		we have 8 candidates in each interval: curr_p + [ 0, 2, 6, 8,12,18,20,26].
		For example: curr_p = 11 gives the 8 candidates: 11,13,17,19,23,29,31,37.
		pdiff_8[j] is the half-distance from the previous candidate to the j-th one; these are accumulated
		into pdiff[i] until the next prime is hit.
		*/
		fbase2psp_idx = 0;	// Index to next-expected Fermat base-2 pseudoprime in the precomputed table
		for(i = 3; i < nprime; curr_p += 30) {
			/* Make sure (curr_p + 29) < 2^32: */
			if(curr_p > 0xffffffe3) {
				fprintf(stderr,"curr_p overflows 32 bits!");
				nprime = i;
				break;
			}
			/* Do a quick Fermat base-2 compositeness test before invoking the more expensive mod operations: */
			itmp32 = twopmodq32_x8(curr_p, curr_p+ 2, curr_p+ 6, curr_p+ 8, curr_p+12, curr_p+18, curr_p+20, curr_p+26);
			for(j = 0; j < 8; ++j) {
				if((itmp32 >> j)&0x1)	// It's a PRP, so check against the table of known pseudoprimes and
				{						// (if it's not a PSP) init for the next gap
					ASSERT(HERE, curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve");
					if((curr_p + pdsum_8[j]) == fbase2psp[fbase2psp_idx]) {	/* It's a base-2 pseudoprime */
						++fbase2psp_idx;
						pdiff[i] += pdiff_8[j];
						continue;
					} else {	/* It's prime - add final increment to current pdiff[i] and then increment i: */
						ihi = (curr_p + pdsum_8[j]);
						pdiff[i] += pdiff_8[j];
						if(pdiff[i] > max_diff) {
							max_diff = pdiff[i];
						#if DBG_SIEVE
							printf("pdiff = %d at curr_p = %u\n", 2*max_diff,ihi);
						#endif
						}
						if(++i == nprime)
							break;
					}
				} else
					pdiff[i] += pdiff_8[j];
			}
		}
		printf("Using first %u odd primes; max gap = %u\n",nprime,2*max_diff);
		printf("max sieving prime = %u\n",ihi);

		ASSERT(HERE, p > thresh, "Mersenne prime exponent must be larger that allowable threshold!");
		ASSERT(HERE, twopmodq32(p-1, p) == 1, "p fails base-2 fprp test!");
		np = 0;	// #primes in the current p-centered cohort
		// find N primes < and > p, compute smoothness norm based on p-1 factorization for each, store each [p,snorm] pair
		fbase2psp_idx = 0;	// Index to next-expected Fermat base-2 pseudoprime in the precomputed table
		jlo = p-pm_gap; jhi = p+pm_gap;	// p is odd (it passed the PRP test) and pm_gap even, so jlo is odd and the j += 2 loop hits only odds
		// Find right starting slot in base-2 pseudoprime table:
		while(fbase2psp[fbase2psp_idx] < jlo)
			++fbase2psp_idx;
		for(j = jlo; j <= jhi; j+=2) {
			// Do base-2 fprp test of j:
			if(!twopmodq32(j-1,j))
				continue;
			if(j == fbase2psp[fbase2psp_idx]) {	// It's a base-2 pseudoprime
				++fbase2psp_idx;
				continue;
			}
			// j is prime - compute factorization of j-1:
			sdat.p = j;
			pm1 = j - 1;
			printf("%u is prime: factorization of p-1 = ",j);
			ilogn = 1/log(1.0*pm1);	// 1/log(n)
			// We know 2 is a factor; special-case for that:
			nfac = 0;
			u_so_smoove = 0.0;
			curr_p = 2;
			lnfac = ln2;	// log(factor)
			while((pm1 & 1) == 0) {
				nfac++;	pm1 >>= 1;	dtmp = lnfac*ilogn;	u_so_smoove += dtmp*dtmp;
			}
			if(nfac > 1) {
				printf("2^%u",nfac);
			} else {
				printf("2");
			}
			// Trial-divide by the tabulated odd primes. Since pdiff[0] = 0 the divisor 3 gets tried twice, which is harmless:
			curr_p = 3;
			for(m = 0; m < nprime; m++) {
				if(pm1 < curr_p*curr_p)	{	// Remaining cofactor must be prime
					sdat.b = pm1;
					printf(".%u",pm1);
					nfac++;
					lnfac = log(1.0*pm1);	// log(factor)
					dtmp = lnfac*ilogn;
					u_so_smoove += dtmp*dtmp;
					break;
				}
				k = 0;	// factor multiplicity counter
				while((pm1 % curr_p) == 0) {// curr_p divides (p-1)
					nfac++;	k++;
					pm1 /= curr_p;
					lnfac = log(1.0*curr_p);	// log(factor)
					dtmp = lnfac*ilogn;
					u_so_smoove += dtmp*dtmp;
				}
				sdat.b = curr_p;	// Overwritten each pass; final value is the largest prime factor if we exit via (pm1 == 1) below
				if(k > 1) {
					printf(".%u^%u",curr_p,k);
				} else if(k == 1) {
					printf(".%u",curr_p);
				}
				if(pm1 == 1) break;
				curr_p += (pdiff[m] << 1);
			}
			// L2 norm: divide by #factors (multiple-counting repeated factors):
			u_so_smoove = sqrt(u_so_smoove)/nfac;
			sdat.r = u_so_smoove;
			// The array size is only a density-based estimate, so make sure we never write past its end:
			ASSERT(HERE, np < max_np, "psmooth_vec capacity exceeded!");
			psmooth_vec[np++] = sdat;	// Write completed datum to array for later sorting
			printf("; %u factors, L2 smoothness = %15.13f\n",nfac,u_so_smoove);
		}	// for(j in [p +- pm_gap] loop
		printf("\n");

		// Using array of [p,snorm]-pair structs, sort resulting array-of-structs by B-smoothness value and locate p:
		qsort(psmooth_vec, np, sizeof(struct psmooth), psmooth_cmp_b);
		for(j = 0; j < np; j++) {
			sdat = psmooth_vec[j];
		//	printf("p = %u: B -smoothness = %u\n",sdat.p,sdat.b);
			if(sdat.p == p) {
				printf("B -smoothness: %u is %u of %u, percentile = %5.2f\n",p,j+1,np,100.0*((double)np-j)/np);
				break;
			}
		}
		// Same again, now sorted by L2-smoothness value:
		qsort(psmooth_vec, np, sizeof(struct psmooth), psmooth_cmp_r);
		for(j = 0; j < np; j++) {
			sdat = psmooth_vec[j];
		//	printf("p = %u: L2 smoothness = %15.13f\n",sdat.p,sdat.r);
			if(sdat.p == p) {
				printf("L2-smoothness: %u is %u of %u, percentile = %5.2f\n",p,j+1,np,100.0*((double)np-j)/np);
				break;
			}
		}
		// Release the work arrays:
		free(psmooth_vec);	psmooth_vec = NULL;
		free(pdiff);	pdiff = NULL;
	}	// test_mp_pm1_smooth()

#endif	// ENABLE_MPRIME_PM1_SMOOTH

#if defined(USE_GPU) && defined(__CUDACC__)

	// Simple vector-add test kernel: each thread computes one element C[i] = A[i] + B[i], where i is the
	// thread's global sequential index. A,B,C are arrays of at least N floats. Threads with i >= N
	// write nothing and instead print an out-of-range error message.
	__global__ void VecAdd(float* A, float* B, float* C, int N)
	{
		int i = blockDim.x * blockIdx.x + threadIdx.x;
		// Print basic info about threads ... keep I/O reasonable, only do so for the threads having
		// i (mod 0x3ffff) < 10, i.e. the first 10 of each batch of 2^18 - 1.
		// The literal '%' in the label must be written as "%%", otherwise printf parses "%0x" as a
		// conversion spec and the trailing %d is left without a matching argument:
		if(i%0x3ffff < 10)
			printf("GPU block %d[dim %d], thread %d ==> seq-thread %d [i%%0x3ffff = %d]... \n", blockIdx.x, blockDim.x, threadIdx.x, i,i%0x3ffff);
		if (i < N)
			C[i] = A[i] + B[i];
		else
			printf("GPU block %d[dim %d], thread %d: ERROR: I = %d out of range!\n", blockIdx.x, blockDim.x, threadIdx.x, i);
	}

	// Host code for the VecAdd test: fills two 2^20-element float vectors with A[i] = i and B[i] = 0.1*i,
	// adds them on the device via the VecAdd kernel, and prints the first and last 10 sums to stdout.
	// If a host or device allocation fails, an error message is printed to stderr and the test is skipped.
	void cudaVecAddTest()
	{
		int i, N = 1024*1024;
		size_t size = N * sizeof(float);
		// Allocate input vectors h_A and h_B and output vector h_C in host memory
		float *h_A = (float*)malloc(size), *h_B = (float*)malloc(size), *h_C = (float*)malloc(size);
		float *d_A = NULL, *d_B = NULL, *d_C = NULL;
		int threadsPerBlock = 256;
		int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
		if(!h_A || !h_B || !h_C) {
			fprintf(stderr,"cudaVecAddTest: host malloc failed!\n");
			free(h_A);	free(h_B);	free(h_C);	// free(NULL) is a no-op
			return;
		}
		// Initialize input vectors
		for(i = 0; i < N; ++i) {
			*(h_A+i) = i;
			*(h_B+i) = i*0.1;
		}
		// Allocate vectors in device memory
		if(cudaMalloc(&d_A, size) != cudaSuccess || cudaMalloc(&d_B, size) != cudaSuccess || cudaMalloc(&d_C, size) != cudaSuccess) {
			fprintf(stderr,"cudaVecAddTest: cudaMalloc failed!\n");
			cudaFree(d_A);	cudaFree(d_B);	cudaFree(d_C);	// Pointers not yet allocated are still NULL, for which cudaFree does nothing
			free(h_A);	free(h_B);	free(h_C);
			return;
		}
		// Copy vectors from host memory to device memory
		cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
		cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
		// Invoke kernel, using enough 256-thread blocks to cover all N elements
		VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
		// Copy result from device memory to host memory
		// h_C contains the result in host memory
		cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
		// Free device memory
		cudaFree(d_A);	cudaFree(d_B);	cudaFree(d_C);
		// Debug-print results sample: first 10 elements ...
		for(i = 0; i < 10; ++i) {
			printf("i = %d: Sum = %10.2f\n", i, *(h_C+i));
		}
		printf("...\n");
		// ... and last 10 elements:
		for(i = 10; i > 0; --i) {
			printf("i = %d: Sum = %10.2f\n", N-i, *(h_C+N-i));
		}
		// Free host memory
		free(h_A);	free(h_B);	free(h_C);
	}

  #ifdef USE_FMADD

	// Elementwise two-output product kernel: thread number tid stores the double-precision product
	// a[tid]*b[tid] in hi[tid], and fma(a[tid],b[tid], -hi[tid]) - the part of the product left over
	// after subtracting that stored value - in lo[tid]. There is no length argument and hence no
	// bounds check, so the launch grid must not contain more threads than the arrays have elements.
	__global__ void VecMul50x50_exact(double*a, double*b, double*lo, double*hi)
	{
		int tid = blockDim.x * blockIdx.x + threadIdx.x;
		double *x = a + tid, *y = b + tid, *prod_hi = hi + tid;
		// High part first, since the low part is defined relative to the stored high part:
		*prod_hi = (*x) * (*y);
		// Exact product x*y = lo + hi:
		*(lo + tid) = fma(*x, *y, -(*prod_hi));
	}

	// Host code for the VecMul50x50_exact test: for each operand size pow2 = 50,...,53 bits, fills two
	// 2^20-element vectors with random whole-number doubles in [-2^pow2, +2^pow2], forms the lo/hi
	// product pairs on the device, and checks each pair against the exact 128-bit integer product
	// of the operand magnitudes via cmp_fma_lohi_vs_exact(). ASSERTs on the first mismatch.
	void cudaVecMul50x50Test()
	{
		int i, N = 1024*1024, pow2;
		// The kernel has no bounds check, so the grid must cover exactly N threads; N = 2^20 is a multiple of 256:
		const int threadsPerBlock = 256;
		const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
		double pow2_dmult;
		uint64 iax,iay,ialo,iahi;
		size_t size = N * sizeof(double);
		// Allocate input vectors h_A and h_B and output vectors h_C (lo) and h_D (hi) in host memory.
		// The explicit casts are needed because CUDA sources are compiled as C++:
		double *h_A = (double*)malloc(size), *h_B = (double*)malloc(size), *h_C = (double*)malloc(size), *h_D = (double*)malloc(size);
		// Allocate vectors in device memory
		double *d_A, *d_B, *d_C, *d_D;
		ASSERT(HERE, h_A && h_B && h_C && h_D, "host malloc failed in cudaVecMul50x50Test!");
		cudaMalloc(&d_A, size), cudaMalloc(&d_B, size), cudaMalloc(&d_C, size), cudaMalloc(&d_D, size);
		// Assumes rng_isaac_init() has already been called on entry
		pow2_dmult = TWO50FLOAT;	// This must match the loop-starting value of pow2:
		for(pow2 = 50; pow2 < 54; ++pow2)	// Only makes sense to test up the #bits in an IEEE-double mantissa: Any larger and we start losing
		{									// LSBs (I.e. the test may 'succeed' for pow2 > 53, but is only testing the equivalent of pow2 = 53.)
		//	printf("Testing CUDA fma_dmult for %d bits, dmult = %f:\n",pow2,pow2_dmult);
			// Initialize input vectors
			for(i = 0; i < N; ++i) {
				// Input multiplicands in [-2^pow2, +2^pow2]:
				*(h_A+i) = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );
				*(h_B+i) = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );
			}
			// Copy vectors from host memory to device memory
			cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
			cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
			// Invoke kernel; note the output-argument order is (lo,hi) = (d_C,d_D):
			VecMul50x50_exact<<<blocksPerGrid, threadsPerBlock>>>(d_A,d_B, d_C,d_D);
			// Copy results from device memory to host memory:
			// h_C contains the lo parts and h_D the hi parts of the products
			cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
			cudaMemcpy(h_D, d_D, size, cudaMemcpyDeviceToHost);
			// Check each result against the exact integer product of the operand magnitudes:
			for(i = 0; i < N; ++i) {
				iax = ABS(h_A[i]);	iay = ABS(h_B[i]);
			#ifdef MUL_LOHI64_SUBROUTINE
				MUL_LOHI64(iax,iay,&ialo,&iahi);
			#else
				MUL_LOHI64(iax,iay, ialo, iahi);
			#endif
			//	printf("I = %d: x = %f; y = %f; hi,lo = %f,%f\n",i, h_A[i],h_B[i],h_D[i],h_C[i]);
				if(cmp_fma_lohi_vs_exact(h_A[i],h_B[i],h_D[i],h_C[i], iax,iay,iahi,ialo)) {
					printf("ERROR: pow2 = %d, I = %d, outputs differ!\n",pow2,i);
					ASSERT(HERE, 0, "fma_dmult tests failed!");
				}
			}	// i-loop
			pow2_dmult *= 2;
		}	// pow2-loop
		// Free device memory
		cudaFree(d_A);	cudaFree(d_B);	cudaFree(d_C);	cudaFree(d_D);
		// Free host memory
		free(h_A);	free(h_B);	free(h_C);	free(h_D);
		printf("CUDA fma_dmult_tests completed successfully!\n");
	}

  #endif

	/**********************************************************************************************/

	#include "factor.h"
	#include "fac_test_dat64.h"
	#include "fac_test_dat96.h"
	#include "twopmodq80.h"

	/*
	Host code for the 64-bit VecModpow test.

	Builds N = 2^10 known-factor test cases (p, q): first the entries of the fac64[] table
	(up to its p == 0 terminator), then entries of fac63[] for whatever slots remain.
	For each case the auxiliary trial-factoring data (pshift, zshift, start_index) and the
	multiplier k with q = 2.k.p + 1 are computed on the host. The five input vectors are
	copied to the device and the VecModpow64 kernel is launched with 256 threads per block.
	The byte-per-case result vector is copied back, and for every case both the host
	reference result twopmodq64(p,q) and the GPU result byte are required to equal 1;
	any mismatch prints diagnostics and triggers an ASSERT.

	Takes no arguments and returns nothing. All host and device buffers allocated here
	are released before returning.
	*/
	void cudaVecModpowTest64()
	{
		int i, nelt64;
		uint64 p, pshift, k,q;
		uint32 start_index, zshift, j,jshift, leadb;
		const uint32 N = 1<<10;
		double dbl, rnd;
		// Allocate input vectors (which take the TF p/pshift/zshift/start_index/k data on input) in host memory.
		// The explicit casts mirror the h_B allocation below and keep this valid when built as C++ by the CUDA compiler:
		uint64 *h_p = (uint64 *)malloc(N<<3), *h_pshft = (uint64 *)malloc(N<<3), *h_k = (uint64 *)malloc(N<<3);
		uint32 *h_zshft = (uint32 *)malloc(N<<2), *h_stidx = (uint32 *)malloc(N<<2);
		ASSERT(HERE, h_p != NULL && h_pshft != NULL && h_k != NULL && h_zshft != NULL && h_stidx != NULL, "cudaVecModpowTest64: host input-vector alloc failed!");

		// Do counting pass to set nelt64, the number of 64-bit test data available:
		for(i = 0; i < N; ++i) {
			if(0 == fac64[i].p) break;
		}
		nelt64 = i;
		for(i = 0; i < N; ++i) {
			if(i < nelt64) {
				p = fac64[i].p;
				q = fac64[i].q;
			} else {	// Fill in any remaining slots with 63-bit test data, of which we know we have > (1<<10):
				p = fac63[i-nelt64].p;
				q = fac63[i-nelt64].q;
	//if((i-nelt64) < 10)printf("p[%3d] = %u: q = %llu ... ",i, p, q);
			}
			ASSERT(HERE, p != 0, "p must be nonzero!");
			// Compute auxiliary TF data:
			pshift = p + 64;
			jshift = leadz64(pshift);
			/* Extract leftmost 6 bits of pshift and subtract from 64: */
			leadb = ((pshift<<jshift) >> 58);
			start_index = 64-jshift-6;
			zshift = 63 - leadb;
			zshift <<= 1;				/* Doubling the shift count here takes cares of the first SQR_LOHI */
			pshift = ~pshift;

			// Compute factor k using fast DP math. Integer-truncation-on-store should obviate the need
			// to subtract 1 from q, and (double)q is only accurate to 53 bits to begin with):
			dbl = (double)q;
			dbl /= (2.0*p);
			rnd = DNINT(dbl);
			k = (uint64)rnd;
			// Exact integer check guards against any roundoff in the floating-point quotient above:
			ASSERT(HERE, k*(p<<1)+1 == q, "k computed incorrectly!");
			*(h_p     + i) = p          ;	*(h_pshft + i) = pshift     ;	*(h_k + i) = k;
			*(h_zshft + i) = zshift     ;	*(h_stidx + i) = start_index;
		//	printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %llu\n",i, p, pshift, zshift, start_index, k);
		}
		// Counts are cast to int so the arguments match the %d conversions:
		printf("Testing %d = %d 64-bit and %d 63-bit known-factors...",(int)N,nelt64,(int)N-nelt64);

		// Initialize output vector (resulting 2^p mod q, in binary "is factor?" form) in host memory:
		uint8*  h_B = (uint8 *)malloc(N);	// Until impl packed-bitmap scheme for device code return values, use byte array for return values
		ASSERT(HERE, h_B != NULL, "cudaVecModpowTest64: host output-vector alloc failed!");
		for(i = 0; i < N; ++i) {
			*(h_B+i) = 0;
		}
	//	printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
		// Allocate vectors in device memory
		uint64 *d_p,*d_pshft,*d_k;
		uint32 *d_zshft,*d_stidx;
		cudaMalloc(&d_p    , N<<3);	cudaMalloc(&d_pshft, N<<3);	cudaMalloc(&d_k    , N<<3);
		cudaMalloc(&d_zshft, N<<2);	cudaMalloc(&d_stidx, N<<2);
		uint8 * d_B;
		cudaMalloc(&d_B, N);
		// Copy vectors from host memory to device memory
		cudaMemcpy(d_p    , h_p    , N<<3, cudaMemcpyHostToDevice);
		cudaMemcpy(d_pshft, h_pshft, N<<3, cudaMemcpyHostToDevice);
		cudaMemcpy(d_zshft, h_zshft, N<<2, cudaMemcpyHostToDevice);
		cudaMemcpy(d_stidx, h_stidx, N<<2, cudaMemcpyHostToDevice);
		cudaMemcpy(d_k    , h_k    , N<<3, cudaMemcpyHostToDevice);
		// Do we need to copy the as-yet-uninited output vector to (or just from) the device?
	//	cudaMemcpy(d_B, h_B, N   , cudaMemcpyHostToDevice);

		// Invoke kernel; block count is rounded up so that every one of the N cases gets a thread:
		int threadsPerBlock = 256;
		int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	//	printf("VecModpow64<<< %d, %d >>> with %d 64-bit known-factors:\n", blocksPerGrid, threadsPerBlock, N);
		VecModpow64<<<blocksPerGrid, threadsPerBlock>>>(d_p,d_pshft,d_zshft,d_stidx,d_k, d_B, N);

		// Copy result from device memory to host memory
		// h_B contains the result in host memory
		cudaMemcpy(h_B, d_B, N, cudaMemcpyDeviceToHost);
		// Free device memory
		cudaFree(d_p);	cudaFree(d_pshft);	cudaFree(d_zshft);	cudaFree(d_stidx);	cudaFree(d_k);	cudaFree(d_B);

		// Reference computation to Test GPU results:
		for(i = 0; i < N; ++i) {
			p = *(h_p + i);
			k = *(h_k + i);	q = k*(p<<1)+1;
			j = (uint32)twopmodq64((uint64)p, q);
			if((j != 1) || (*(h_B + i) != 1)) {
				printf("cudaVecModpowTest64: Mismatch between Ref and GPU result:\n");
				// p and q are 64-bit, so both are printed via %llu with matching casts:
				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,q) = %llu, %llu\n", i,(int)*(h_B + i), (int)j,(unsigned long long)p,(unsigned long long)q);
				ASSERT(HERE, 0, "cudaVecModpowTest64 failed!");
			}
		}
		printf("cudaVecModpowTest64 with %d test (p,q) pairs succeeded!\n",(int)N);

		// Free host memory:
		free(h_p);	free(h_pshft);	free(h_k);	free(h_zshft);	free(h_stidx);	free(h_B);
	}

	/*
	Host code for the simpler VecModpow test, same 78-bit [p,q] pair for each thread.

	A single known-factor datum (p = 16727479, k = 7946076362870052, q = 2.k.p + 1) and its
	auxiliary trial-factoring data (pshift, zshift, start_index) are replicated into all
	N = 2^10 input slots, copied to the device, and run through the VecModpow78 kernel with
	256 threads per block. The host reference twopmodq78_3WORD_DOUBLE(p,k) must return 1,
	and every byte of the GPU result vector must equal 1; otherwise diagnostics are printed
	and an ASSERT fires.

	Takes no arguments and returns nothing. All host and device buffers allocated here
	are released before returning.
	*/
	void cudaVecModpowTest78_0()
	{
		int i;
		uint64 p, pshift, k;
		uint32 start_index, zshift, j, leadb;
		uint32 N = 1<<10;
		// Allocate input vectors (which take the TF p/pshift/zshift/start_index/k data on input) in host memory.
		// The explicit casts mirror the h_B allocation below and keep this valid when built as C++ by the CUDA compiler:
		uint64 *h_p = (uint64 *)malloc(N<<3), *h_pshft = (uint64 *)malloc(N<<3), *h_k = (uint64 *)malloc(N<<3);
		uint32 *h_zshft = (uint32 *)malloc(N<<2), *h_stidx = (uint32 *)malloc(N<<2);
		ASSERT(HERE, h_p != NULL && h_pshft != NULL && h_k != NULL && h_zshft != NULL && h_stidx != NULL, "cudaVecModpowTest78_0: host input-vector alloc failed!");

		p = 16727479;
		k = 7946076362870052ull;
		// Compute auxiliary TF data:
		pshift = p + 78;
		j = leadz64(pshift);
		/* Extract leftmost 7 bits of pshift (if > 77, use the leftmost 6) and subtract from 96: */
		leadb = ((pshift<<j) >> 57);
		if(leadb > 77) {
			leadb >>= 1;
			start_index =  64-j-6;	/* Use only the leftmost 6 bits */
		} else {
			start_index =  64-j-7;
		}
		zshift = 77 - leadb;
		zshift <<= 1;				/* Doubling the shift count here takes cares of the first SQR_LOHI */
		pshift = ~pshift;

		// Copy to all N vector-input-data:
		for(i = 0; i < N; ++i) {
			*(h_p     + i) = p          ;	*(h_pshft + i) = pshift     ;	*(h_k + i) = k;
			*(h_zshft + i) = zshift     ;	*(h_stidx + i) = start_index;
		}
		// Count is cast to int so the argument matches the %d conversion:
		printf("Testing %d 78-bit known-factors...",(int)N);

		// Initialize output vector (resulting 2^p mod q, in binary "is factor?" form) in host memory:
		uint8*  h_B = (uint8 *)malloc(N);	// Until impl packed-bitmap scheme for device code return values, use byte array for return values
		ASSERT(HERE, h_B != NULL, "cudaVecModpowTest78_0: host output-vector alloc failed!");
		for(i = 0; i < N; ++i) {
			*(h_B+i) = 0;
		}
	//	printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
		// Allocate vectors in device memory
		uint64 *d_p,*d_pshft,*d_k;
		uint32 *d_zshft,*d_stidx;
		cudaMalloc(&d_p    , N<<3);	cudaMalloc(&d_pshft, N<<3);	cudaMalloc(&d_k    , N<<3);
		cudaMalloc(&d_zshft, N<<2);	cudaMalloc(&d_stidx, N<<2);
		uint8 * d_B;
		cudaMalloc(&d_B, N);
		// Copy vectors from host memory to device memory
		cudaMemcpy(d_p    , h_p    , N<<3, cudaMemcpyHostToDevice);
		cudaMemcpy(d_pshft, h_pshft, N<<3, cudaMemcpyHostToDevice);
		cudaMemcpy(d_zshft, h_zshft, N<<2, cudaMemcpyHostToDevice);
		cudaMemcpy(d_stidx, h_stidx, N<<2, cudaMemcpyHostToDevice);
		cudaMemcpy(d_k    , h_k    , N<<3, cudaMemcpyHostToDevice);
		// Do we need to copy the as-yet-uninited output vector to (or just from) the device?
	//	cudaMemcpy(d_B, h_B, N   , cudaMemcpyHostToDevice);

		// Invoke kernel; block count is rounded up so that every one of the N cases gets a thread:
		int threadsPerBlock = 256;
		int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	//	printf("VecModpow78<<< %d, %d >>> with %d copies of same 78-bit known-factor [blocksPerGrid = %d, threadsPerBlock = %d]:\n", blocksPerGrid, threadsPerBlock, N);
		VecModpow78<<<blocksPerGrid, threadsPerBlock>>>(d_p,d_pshft,d_zshft,d_stidx,d_k, d_B, N);
	//	printf("GPU_TF78<<< %d, %d >>> with %d copies of same 78-bit known-factor:\n", blocksPerGrid, threadsPerBlock, N);
	//	GPU_TF78<<<blocksPerGrid, threadsPerBlock>>>(p,pshift,zshift,start_index, d_k, d_B, N);

		// Copy result from device memory to host memory
		// h_B contains the result in host memory
		cudaMemcpy(h_B, d_B, N, cudaMemcpyDeviceToHost);
		// Free device memory
		cudaFree(d_p);	cudaFree(d_pshft);	cudaFree(d_zshft);	cudaFree(d_stidx);	cudaFree(d_k);	cudaFree(d_B);

		// Reference computation (done once, since every slot holds the same datum):
		j = (uint32)twopmodq78_3WORD_DOUBLE((uint64)p, k);
		ASSERT(HERE, (j == 1), "cudaVecModpowTest78_0 ref-comp failed!");
		// Test GPU results:
		for(i = 0; i < N; ++i) {
			if(*(h_B + i) != 1) {
				printf("cudaVecModpowTest78_0: Mismatch between Ref and GPU result:\n");
				// p and k are 64-bit, so both are printed via %llu with matching casts:
				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %llu, %llu\n", i,(int)*(h_B + i), (int)j,(unsigned long long)p,(unsigned long long)k);
				ASSERT(HERE, *(h_B + i) == 1, "cudaVecModpowTest78_0 failed!");
			}
		}
		printf("cudaVecModpowTest78_0 with %d test (p,q) pairs succeeded!\n",(int)N);

		// Free host memory:
		free(h_p);	free(h_pshft);	free(h_k);	free(h_zshft);	free(h_stidx);	free(h_B);
	}

	/*
	Host code for the 78-bit VecModpow test.

	Walks the fac96[] known-factor table (at most N = 2^10 entries, stopping at the p == 0
	terminator) and keeps only those factors q whose high word satisfies (d1 >> 14) == 0,
	i.e. q < 2^78. For each retained entry the auxiliary trial-factoring data (pshift,
	zshift, start_index) and the multiplier k ~= q/(2p) are computed; nelts counts the
	retained entries. The remaining slots [nelts, N) are padded with the single datum used
	by cudaVecModpowTest78_0. The input vectors are copied to the device and the VecModpow78
	kernel is launched over N threads' worth of blocks, but with nelts passed as its
	element-count argument. Only the first nelts results are verified: both the host
	reference twopmodq78_3WORD_DOUBLE(p,k) and the GPU result byte must equal 1, else
	diagnostics are printed and an ASSERT fires.

	Takes no arguments and returns nothing. All host and device buffers allocated here
	are released before returning.
	*/
	void cudaVecModpowTest78()
	{
		int i;
		uint64 p, pshift, k;
		uint32 start_index, zshift, j, leadb;
		uint32 N = 1<<10,nelts;
		uint96 q96;
		double dbl, rnd;
		// Allocate input vectors (which take the TF p/pshift/zshift/start_index/k data on input) in host memory.
		// The explicit casts mirror the h_B allocation below and keep this valid when built as C++ by the CUDA compiler:
		uint64 *h_p = (uint64 *)malloc(N<<3), *h_pshft = (uint64 *)malloc(N<<3), *h_k = (uint64 *)malloc(N<<3);
		uint32 *h_zshft = (uint32 *)malloc(N<<2), *h_stidx = (uint32 *)malloc(N<<2);
		ASSERT(HERE, h_p != NULL && h_pshft != NULL && h_k != NULL && h_zshft != NULL && h_stidx != NULL, "cudaVecModpowTest78: host input-vector alloc failed!");

		for(i = 0, nelts = 0; i < N; ++i) {
			p = fac96[i].p;
			if(p == 0) {
				break;
			}
			q96.d1 = fac96[i].d1; q96.d0 = fac96[i].d0;
			// Skip any factor needing more than 78 bits, i.e. more than 14 bits in the high word:
			if((q96.d1 >> 14) != 0) {
				continue;
			}
			// Good to go - compute auxiliary TF data:
			pshift = p + 78;
			j = leadz64(pshift);
			/* Extract leftmost 7 bits of pshift (if > 77, use the leftmost 6) and subtract from 96: */
			leadb = ((pshift<<j) >> 57);
			if(leadb > 77) {
				leadb >>= 1;
				start_index =  64-j-6;	/* Use only the leftmost 6 bits */
			} else {
				start_index =  64-j-7;
			}
			zshift = 77 - leadb;
			zshift <<= 1;				/* Doubling the shift count here takes cares of the first SQR_LOHI */
			pshift = ~pshift;

			// Compute factor k using fast DP math. Integer-truncation-on-store should obviate the need
			// to subtract 1 from q, and (double)q is only accurate to 53 bits to begin with):
			dbl = (double)q96.d0 + (double)q96.d1*TWO64FLOAT;
			dbl /= (2.0*p);
			rnd = DNINT(dbl);
			k = (uint64)rnd;
			*(h_p     + nelts) = p          ;	*(h_pshft + nelts) = pshift     ;	*(h_k + nelts) = k;
			*(h_zshft + nelts) = zshift     ;	*(h_stidx + nelts) = start_index;
	//	printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %llu\n",nelts, p, pshift, zshift, start_index, k);
			++nelts;
		}
		// Count is cast to int so the argument matches the %d conversion:
		printf("Testing %d 78-bit known-factors...",(int)nelts);
		// "Fill in" remaining slots with copy of same datum used in cudaVecModpowTest78_0:
		p = 16727479;
		k = 7946076362870052ull;
		// Compute auxiliary TF data:
		pshift = p + 78;
		j = leadz64(pshift);
		/* Extract leftmost 7 bits of pshift (if > 77, use the leftmost 6) and subtract from 96: */
		leadb = ((pshift<<j) >> 57);
		if(leadb > 77) {
			leadb >>= 1;
			start_index =  64-j-6;	/* Use only the leftmost 6 bits */
		} else {
			start_index =  64-j-7;
		}
		zshift = 77 - leadb;
		zshift <<= 1;				/* Doubling the shift count here takes cares of the first SQR_LOHI */
		pshift = ~pshift;
		// Copy to all still-uninited vector-input-data:
		for(i = nelts; i < N; ++i) {
			*(h_p     + i) = p          ;	*(h_pshft + i) = pshift     ;	*(h_k + i) = k;
			*(h_zshft + i) = zshift     ;	*(h_stidx + i) = start_index;
		}

		// Initialize output vector (resulting 2^p mod q, in binary "is factor?" form) in host memory:
		uint8*  h_B = (uint8 *)malloc(N);	// Until impl packed-bitmap scheme for device code return values, use byte array for return values
		ASSERT(HERE, h_B != NULL, "cudaVecModpowTest78: host output-vector alloc failed!");
		for(i = 0; i < N; ++i) {
			*(h_B+i) = 0;
		}
	//	printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
		// Allocate vectors in device memory
		uint64 *d_p,*d_pshft,*d_k;
		uint32 *d_zshft,*d_stidx;
		cudaMalloc(&d_p    , N<<3);	cudaMalloc(&d_pshft, N<<3);	cudaMalloc(&d_k    , N<<3);
		cudaMalloc(&d_zshft, N<<2);	cudaMalloc(&d_stidx, N<<2);
		uint8 * d_B;
		cudaMalloc(&d_B, N);
		// Copy vectors from host memory to device memory
		cudaMemcpy(d_p    , h_p    , N<<3, cudaMemcpyHostToDevice);
		cudaMemcpy(d_pshft, h_pshft, N<<3, cudaMemcpyHostToDevice);
		cudaMemcpy(d_zshft, h_zshft, N<<2, cudaMemcpyHostToDevice);
		cudaMemcpy(d_stidx, h_stidx, N<<2, cudaMemcpyHostToDevice);
		cudaMemcpy(d_k    , h_k    , N<<3, cudaMemcpyHostToDevice);
		// Do we need to copy the as-yet-uninited output vector to (or just from) the device?
	//	cudaMemcpy(d_B, h_B, N   , cudaMemcpyHostToDevice);

		// Invoke kernel. The grid is sized for all N slots, but nelts is what gets passed as the element count:
		int threadsPerBlock = 256;
		int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	//	printf("VecModpow78 with %d 78-bit known-factors [blocksPerGrid = %d, threadsPerBlock = %d]:\n",nelts, blocksPerGrid, threadsPerBlock);
		VecModpow78<<<blocksPerGrid, threadsPerBlock>>>(d_p,d_pshft,d_zshft,d_stidx,d_k, d_B, nelts);

		// Copy result from device memory to host memory
		// h_B contains the result in host memory
		cudaMemcpy(h_B, d_B, N, cudaMemcpyDeviceToHost);
		// Free device memory
		cudaFree(d_p);	cudaFree(d_pshft);	cudaFree(d_zshft);	cudaFree(d_stidx);	cudaFree(d_k);	cudaFree(d_B);

		// Reference computation:
		// Test GPU results (only the nelts table-derived cases are checked, not the padding slots):
		for(i = 0; i < nelts; ++i) {
			p = *(h_p + i);
			k = *(h_k + i);
			j = (uint32)twopmodq78_3WORD_DOUBLE((uint64)p, k);
			if((j != 1) || (*(h_B + i) != 1)) {
				printf("cudaVecModpowTest78: Mismatch between Ref and GPU result:\n");
				// p and k are 64-bit, so both are printed via %llu with matching casts:
				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %llu, %llu\n", i,(int)*(h_B + i), (int)j,(unsigned long long)p,(unsigned long long)k);
				ASSERT(HERE, 0, "cudaVecModpowTest78 failed!");
			}
		}
		printf("cudaVecModpowTest78 with %d test (p,q) pairs succeeded!\n",(int)nelts);

		// Free host memory:
		free(h_p);	free(h_pshft);	free(h_k);	free(h_zshft);	free(h_stidx);	free(h_B);
	}

	/*
	Host code for the 96-bit VecModpow test.

	Walks the fac96[] known-factor table (at most N = 2^10 entries, stopping at the p == 0
	terminator). For each entry the auxiliary trial-factoring data (pshift, zshift,
	start_index) are computed, and the multiplier k = ((q-1)/2)/p is obtained exactly via
	a mod-2^96 inverse of p; entries whose k does not fit in 64 bits are skipped. nelts
	counts the retained entries. The remaining slots [nelts, N) are padded with the same
	(p,k) datum used by cudaVecModpowTest78_0. The input vectors are copied to the device
	and the VecModpow96 kernel is launched over N threads' worth of blocks, but with nelts
	passed as its element-count argument. Only the first nelts results are verified: the
	host reference twopmodq96(p,k) must return exactly 1 and the GPU result byte must
	equal 1, else diagnostics are printed and an ASSERT fires.

	Takes no arguments and returns nothing. All host and device buffers allocated here
	are released before returning.
	*/
	void cudaVecModpowTest96()
	{
		int i;
		uint64 p, pinv, pshift, k, hi64;
		uint32 tmp32, start_index, zshift, j, leadb;
		uint32 N = 1<<10,nelts;
		uint96 q96,x96,pinv96;
		// Allocate input vectors (which take the TF p/pshift/zshift/start_index/k data on input) in host memory.
		// The explicit casts mirror the h_B allocation below and keep this valid when built as C++ by the CUDA compiler:
		uint64 *h_p = (uint64 *)malloc(N<<3), *h_pshft = (uint64 *)malloc(N<<3), *h_k = (uint64 *)malloc(N<<3);
		uint32 *h_zshft = (uint32 *)malloc(N<<2), *h_stidx = (uint32 *)malloc(N<<2);
		ASSERT(HERE, h_p != NULL && h_pshft != NULL && h_k != NULL && h_zshft != NULL && h_stidx != NULL, "cudaVecModpowTest96: host input-vector alloc failed!");

		for(i = 0, nelts = 0; i < N; ++i) {
			p = fac96[i].p;
			if(p == 0) {
				break;
			}
			q96.d1 = fac96[i].d1; q96.d0 = fac96[i].d0;
			// Good to go - compute auxiliary TF data:
			pshift = p + 96;
			j = leadz64(pshift);
			// Extract leftmost 7 bits of pshift (if > 95, use the leftmost 6) and subtract from 96:
			leadb = ((pshift<<j) >> 57);
			if(leadb > 95) {
				leadb >>= 1;
				start_index =  64-j-6;	// Use only the leftmost 6 bits
			} else {
				start_index =  64-j-7;
			}
			zshift = 95 - leadb;
			zshift <<= 1;				// Doubling the shift count here takes cares of the first SQR_LOHI
			pshift = ~pshift;

			/* To find the quotient k = (q-1)/(2*p), which may be > 64 bits, use mod-inverse with base 2^96 arithmetic.
			Since the Newtonian mod-inverse algorithm only works for odd inputs, instead of finding (q-1)/(2*p), we find ((q-1)/2)/p.
			First, find inverse (mod 2^96) of p in preparation for modular multiply. See twopmodq96 for an explanation of this:
			*/
			// Note that j is reused as the iteration counter here; start_index above has already consumed its leadz64 value:
			pinv = (p +p +p) ^ 2;
			for(j = 0; j < 3; j++) {
				tmp32 = p * pinv;
				pinv = pinv*(2 - tmp32);
			}
			// One more iteration using uint64 math to get 64-bit inverse:
			pinv96.d0 = (uint64)pinv;	pinv96.d1 = (uint64)0;
			hi64 = (uint64)p * pinv96.d0;
			pinv96.d0 = pinv96.d0*((uint64)2 - hi64);
			// pinv96 has 96 bits, but only the upper 64 get modified here:
		#ifdef MUL_LOHI64_SUBROUTINE
			pinv96.d1 = -pinv96.d0*__MULH64((uint64)p, pinv96.d0);
		#else
			MULH64((uint64)p, pinv96.d0, hi64);
			pinv96.d1 = -pinv96.d0*hi64;
		#endif
			// k is simply the bottom 96 bits of ((q-1)/2)*pinv96:
			x96.d0	= ((q96.d0-1) >> 1) + ((uint64)q96.d1 << 63);	x96.d1	= (q96.d1 >> 1);	// (q-1)/2
			MULL96(x96, pinv96, x96);
			k = x96.d0;
			// Skip any (p,q) pair for which the k > 2^64:
			if(x96.d1 != 0) {	// x96 holds k
			//	printf("Warning: k > 2^64 detected for (p,q) = %u,[%u*2^64 + %llu] ... skipping this datum.\n",p,q96.d1,q96.d0);
				continue;
			}
			*(h_p     + nelts) = p          ;	*(h_pshft + nelts) = pshift     ;	*(h_k + nelts) = k;
			*(h_zshft + nelts) = zshift     ;	*(h_stidx + nelts) = start_index;
	//	printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %llu\n",nelts, p, pshift, zshift, start_index, k);
			++nelts;
		}
		// Count is cast to int so the argument matches the %d conversion:
		printf("Testing %d 96-bit known-factors...",(int)nelts);
		// "Fill in" remaining slots with copy of same (p,k) datum used in cudaVecModpowTest78_0:
		p = 16727479;
		k = 7946076362870052ull;
		// Compute auxiliary TF data:
		pshift = p + 96;
		// Must be the 64-bit leading-zero count, as in the table loop above: pshift is a uint64 and the
		// shift below is done in 64 bits, so a 32-bit count would leave leadb = 0 and a bogus start_index.
		j = leadz64(pshift);
		/* Extract leftmost 7 bits of pshift (if > 95, use the leftmost 6) and subtract from 96: */
		leadb = ((pshift<<j) >> 57);
		if(leadb > 95) {
			leadb >>= 1;
			start_index =  64-j-6;	/* Use only the leftmost 6 bits */
		} else {
			start_index =  64-j-7;
		}
		zshift = 95 - leadb;
		zshift <<= 1;				/* Doubling the shift count here takes cares of the first SQR_LOHI */
		pshift = ~pshift;
		// Copy to all still-uninited vector-input-data:
		for(i = nelts; i < N; ++i) {
			*(h_p     + i) = p          ;	*(h_pshft + i) = pshift     ;	*(h_k + i) = k;
			*(h_zshft + i) = zshift     ;	*(h_stidx + i) = start_index;
		}

		// Initialize output vector (resulting 2^p mod q, in binary "is factor?" form) in host memory:
		uint8*  h_B = (uint8 *)malloc(N);	// Until impl packed-bitmap scheme for device code return values, use byte array for return values
		ASSERT(HERE, h_B != NULL, "cudaVecModpowTest96: host output-vector alloc failed!");
		for(i = 0; i < N; ++i) {
			*(h_B+i) = 0;
		}
	//	printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
		// Allocate vectors in device memory
		uint64 *d_p,*d_pshft,*d_k;
		uint32 *d_zshft,*d_stidx;
		cudaMalloc(&d_p    , N<<3);	cudaMalloc(&d_pshft, N<<3);	cudaMalloc(&d_k    , N<<3);
		cudaMalloc(&d_zshft, N<<2);	cudaMalloc(&d_stidx, N<<2);
		uint8 * d_B;
		cudaMalloc(&d_B, N);
		// Copy vectors from host memory to device memory
		cudaMemcpy(d_p    , h_p    , N<<3, cudaMemcpyHostToDevice);
		cudaMemcpy(d_pshft, h_pshft, N<<3, cudaMemcpyHostToDevice);
		cudaMemcpy(d_zshft, h_zshft, N<<2, cudaMemcpyHostToDevice);
		cudaMemcpy(d_stidx, h_stidx, N<<2, cudaMemcpyHostToDevice);
		cudaMemcpy(d_k    , h_k    , N<<3, cudaMemcpyHostToDevice);
		// Do we need to copy the as-yet-uninited output vector to (or just from) the device?
	//	cudaMemcpy(d_B, h_B, N   , cudaMemcpyHostToDevice);

		// Invoke kernel. The grid is sized for all N slots, but nelts is what gets passed as the element count:
		int threadsPerBlock = 256;
		int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	//	printf("VecModpow96 with %d 96-bit known-factors [blocksPerGrid = %d, threadsPerBlock = %d]:\n",nelts, blocksPerGrid, threadsPerBlock);
		VecModpow96<<<blocksPerGrid, threadsPerBlock>>>(d_p,d_pshft,d_zshft,d_stidx,d_k, d_B, nelts);

		// Copy result from device memory to host memory
		// h_B contains the result in host memory
		cudaMemcpy(h_B, d_B, N, cudaMemcpyDeviceToHost);
		// Free device memory
		cudaFree(d_p);	cudaFree(d_pshft);	cudaFree(d_zshft);	cudaFree(d_stidx);	cudaFree(d_k);	cudaFree(d_B);

		// Reference computation:
		// Test GPU results (only the nelts table-derived cases are checked, not the padding slots):
		for(i = 0; i < nelts; ++i) {
			p = *(h_p + i);
			k = *(h_k + i);
			q96 = twopmodq96((uint64)p, k);
			// Reference passes iff the 96-bit result is exactly 1:
			j = (q96.d1 == 0) && (q96.d0 == 1);
			if((j != 1) || (*(h_B + i) != 1)) {
				printf("cudaVecModpowTest96: Mismatch between Ref and GPU result:\n");
				// p and k are 64-bit, so both are printed via %llu with matching casts:
				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %llu, %llu\n", i,(int)*(h_B + i), (int)j,(unsigned long long)p,(unsigned long long)k);
				ASSERT(HERE, 0, "cudaVecModpowTest96 failed!");
			}
		}
		printf("cudaVecModpowTest96 with %d test (p,q) pairs succeeded!\n",(int)nelts);

		// Free host memory:
		free(h_p);	free(h_pshft);	free(h_k);	free(h_zshft);	free(h_stidx);	free(h_B);
	}

#endif	// #if defined(USE_GPU) && defined(__CUDACC__)

#undef PLATFORM_SKIP_RND_CONST_ENFORCEMENT

/*********************************************************************************/
/* Globals. Unless specified otherwise, these are declared in Mdata.h:           */
/*********************************************************************************/

/* x87 control-word settings, used in x86 builds by util.c:set_x87_fpu_params().
Both select the 64-mantissa-bit floating-point register mode (bits <9:8> = 3); they
differ only in the rounding-mode field, bits <11:10>, which is 0 (IEEE rounding) for
FPU_64RND and 3 (truncating) for FPU_64CHOP. The other 12 bits are identical to the
MSVC defaults:
*/
unsigned short FPU_64RND  = 0x037f;
unsigned short FPU_64CHOP = 0x0f7f;

const int CHAROFFSET = '0';

/* Used for fast NINT emulation; set in util.c. */
double RND_A;
double RND_B;

/* Powers of 2 (and, where present, their inverses) in double form: */
double TWO13FLINV;	/* (double)2^13 inverse */
double TWO25FLOAT;	/* (double)2^25 */
double TWO25FLINV;	/* (double)2^25 inverse */
double TWO26FLOAT;	/* (double)2^26 */
double TWO26FLINV;	/* (double)2^26 inverse */
double TWO32FLOAT;	/* (double)2^32 */
double TWO32FLINV;	/* (double)2^32 inverse */
double TWO48FLOAT;	/* (double)2^48 */
double TWO48FLINV;	/* (double)2^48 inverse */
double TWO50FLOAT;	/* (double)2^50 */
double TWO50FLINV;	/* (double)2^50 inverse */
double TWO51FLOAT;	/* (double)2^51 */
double TWO51FLINV;	/* (double)2^51 inverse */
double TWO52FLOAT;	/* (double)2^52 */
double TWO52FLINV;	/* (double)2^52 inverse */
double TWO53FLOAT;	/* (double)2^53 */
double TWO53FLINV;	/* (double)2^53 inverse */
double TWO54FLOAT;	/* (double)2^54 */
double TWO63FLOAT;	/* (double)2^63 */
double TWO64FLOAT;	/* (double)2^64 */
double TWO64FLINV;	/* (double)2^64 inverse */

/* Array padding parameters: */
int32 DAT_BITS;
int32 PAD_BITS;

/* Base for the generic FFT-based mul. It is of fixed size, but is only
   necessarily constant during a given FFT-based MUL:

	FFT_MUL_BASE = 2^(FFT_MUL_BITS), where FFT_MUL_BITS #def'ed in Mdata.h

   FFT_MUL_BASE_INV is, going by its name, the inverse of that base.
*/
double FFT_MUL_BASE;
double FFT_MUL_BASE_INV;

/***********************/

/*** 11/23/05: MSVC/.NET buggered things up with the second of these tables
     when each table was local to its respective calling function, so moved 'em here: ***/

	/* Table of approximate byte-inverses of 1.{byteval} is here: 256 entries of 8 bits
	each, decreasing from 0xff down to 0x00. Since we know
	the input is in [1, 2), we know the multiplicative inverse is in (0.5, 1],
	i.e. we know that the MSB of the inverse (the one immediately right of the
	binary point) is 1. Thus we can use the hidden-bit-is-1 property of inputs
	to also gain a bit of precision in the bytewise approximate inverses, by
	neglecting the leading-order bit - since that one would get stored in the
	hidden-bit slot of the output anyway, this also makes our work easier.
	NOTE(review): presumably indexed by the leading 8 non-hidden mantissa bits of
	the input - confirm against the code which reads this table. */
	/* Unix bc code used to generate the entries:
	bc -l
	ibase=2
	obase=2
	d=0.00000001;
	x=1.000000001-d;
	x+=d;1/x
	{256 of these, round 10th bit into MS9, replace MSB by '0x', convert rest to hex}
	*/
	static const uint8 byte_lookup_finvest[256] = {
	0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf1,0xf0,0xee,0xec,0xea,0xe8,0xe6,0xe5,0xe3,
	0xe1,0xdf,0xdd,0xdc,0xda,0xd8,0xd7,0xd5,0xd3,0xd2,0xd0,0xce,0xcd,0xcb,0xc9,0xc8,
	0xc6,0xc5,0xc3,0xc2,0xc0,0xbf,0xbd,0xbc,0xba,0xb9,0xb7,0xb6,0xb4,0xb3,0xb1,0xb0,
	0xae,0xad,0xac,0xaa,0xa9,0xa7,0xa6,0xa5,0xa3,0xa2,0xa1,0x9f,0x9e,0x9d,0x9c,0x9a,
	0x99,0x98,0x96,0x95,0x94,0x93,0x91,0x90,0x8f,0x8e,0x8d,0x8b,0x8a,0x89,0x88,0x87,
	0x86,0x84,0x83,0x82,0x81,0x80,0x7f,0x7e,0x7c,0x7b,0x7a,0x79,0x78,0x77,0x76,0x75,
	0x74,0x73,0x72,0x71,0x70,0x6f,0x6e,0x6d,0x6c,0x6b,0x6a,0x69,0x68,0x67,0x66,0x65,
	0x64,0x63,0x62,0x61,0x60,0x5f,0x5e,0x5d,0x5c,0x5b,0x5a,0x59,0x58,0x58,0x57,0x56,
	0x55,0x54,0x53,0x52,0x51,0x51,0x50,0x4f,0x4e,0x4d,0x4c,0x4b,0x4b,0x4a,0x49,0x48,
	0x47,0x46,0x46,0x45,0x44,0x43,0x42,0x42,0x41,0x40,0x3f,0x3f,0x3e,0x3d,0x3c,0x3b,
	0x3b,0x3a,0x39,0x38,0x38,0x37,0x36,0x35,0x35,0x34,0x33,0x33,0x32,0x31,0x30,0x30,
	0x2f,0x2e,0x2e,0x2d,0x2c,0x2c,0x2b,0x2a,0x2a,0x29,0x28,0x28,0x27,0x26,0x26,0x25,
	0x24,0x24,0x23,0x22,0x22,0x21,0x20,0x20,0x1f,0x1e,0x1e,0x1d,0x1d,0x1c,0x1b,0x1b,
	0x1a,0x1a,0x19,0x18,0x18,0x17,0x17,0x16,0x15,0x15,0x14,0x14,0x13,0x12,0x12,0x11,
	0x11,0x10,0x10,0x0f,0x0f,0x0e,0x0d,0x0d,0x0c,0x0c,0x0b,0x0b,0x0a,0x0a,0x09,0x09,
	0x08,0x07,0x07,0x06,0x06,0x05,0x05,0x04,0x04,0x03,0x03,0x02,0x02,0x01,0x01,0x00
	};

/***********************/

	/* Table of bytewise approximate inverse square roots (described in the original
	note as "byte-inverses of 2 * 1.{byteval}") is here: 768 entries of 8 bits each,
	decreasing from 0xff down to 0x00. Since we know
	the input is in [1, 4), we know the inverse square root is in (0.5, 1],
	i.e. we know that the MSB of the ISQRT (the one immediately right of the
	binary point) is 1.	We cheat a little on the 0 element of the byte table,
	since sqrt(1.000000001) really should give 0x100, not 0xff. But the
	alternative is using uint16s, which doubles the size of the table.
	NOTE(review): the exact mapping from input mantissa/exponent bits to the 768
	table indices is not visible here - confirm against the code which reads this table. */
	/* Unix bc code used to generate the entries:
	bc -l
	ibase=2
	obase=2
	d=0.00000001;
	x=1.000000001-d;
	x+=d;1/sqrt(x)
	{768 of these, round 10th bit into MS9, replace MSB by '0x', convert rest to hex}
	*/
	/* Used to store MS 8 non-hidden mantissa bits (NHBs). Since we'd need to use a 16-bit
	int to allow for the possibility of a carryout (i.e. result = 256) from rounding
	the 9th-most-significant NHB into the upper 8 (which would involve
	additional logic to handle), we instead deal with the issue of rounding
	by assuming the midpoint - e.g. if truncating to the MS 8 NHBs yields
	a certain integer in [0,255], we assume the resulting roundoff error
	is always 0.5, i.e. our precomputed 1/x values are approximations to
	the resulting midpoints. This also avoids our having to treat an input
	of 1.00000000 as a special case, since we munge that to 1.000000001,
	whose inverse is < 1.0: */
	static const uint8 byte_lookup_fisqrtest[768] = {
	0xff,0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9,0xf8,0xf7,0xf6,0xf5,0xf4,0xf3,0xf2,0xf1,
	0xf0,0xef,0xee,0xee,0xed,0xec,0xeb,0xea,0xe9,0xe8,0xe7,0xe7,0xe6,0xe5,0xe4,0xe3,
	0xe2,0xe1,0xe1,0xe0,0xdf,0xde,0xdd,0xdd,0xdc,0xdb,0xda,0xd9,0xd9,0xd8,0xd7,0xd6,
	0xd5,0xd5,0xd4,0xd3,0xd2,0xd2,0xd1,0xd0,0xcf,0xcf,0xce,0xcd,0xcc,0xcc,0xcb,0xca,
	0xca,0xc9,0xc8,0xc7,0xc7,0xc6,0xc5,0xc5,0xc4,0xc3,0xc3,0xc2,0xc1,0xc1,0xc0,0xbf,
	0xbf,0xbe,0xbd,0xbd,0xbc,0xbb,0xbb,0xba,0xb9,0xb9,0xb8,0xb7,0xb7,0xb6,0xb6,0xb5,
	0xb4,0xb4,0xb3,0xb2,0xb2,0xb1,0xb1,0xb0,0xaf,0xaf,0xae,0xae,0xad,0xac,0xac,0xab,
	0xab,0xaa,0xaa,0xa9,0xa8,0xa8,0xa7,0xa7,0xa6,0xa6,0xa5,0xa5,0xa4,0xa3,0xa3,0xa2,
	0xa2,0xa1,0xa1,0xa0,0xa0,0x9f,0x9f,0x9e,0x9d,0x9d,0x9c,0x9c,0x9b,0x9b,0x9a,0x9a,
	0x99,0x99,0x98,0x98,0x97,0x97,0x96,0x96,0x95,0x95,0x94,0x94,0x93,0x93,0x92,0x92,
	0x91,0x91,0x90,0x90,0x8f,0x8f,0x8f,0x8e,0x8e,0x8d,0x8d,0x8c,0x8c,0x8b,0x8b,0x8a,
	0x8a,0x89,0x89,0x89,0x88,0x88,0x87,0x87,0x86,0x86,0x85,0x85,0x85,0x84,0x84,0x83,
	0x83,0x82,0x82,0x82,0x81,0x81,0x80,0x80,0x7f,0x7f,0x7f,0x7e,0x7e,0x7d,0x7d,0x7d,
	0x7c,0x7c,0x7b,0x7b,0x7a,0x7a,0x7a,0x79,0x79,0x78,0x78,0x78,0x77,0x77,0x76,0x76,
	0x76,0x75,0x75,0x75,0x74,0x74,0x73,0x73,0x73,0x72,0x72,0x72,0x71,0x71,0x70,0x70,
	0x70,0x6f,0x6f,0x6f,0x6e,0x6e,0x6d,0x6d,0x6d,0x6c,0x6c,0x6c,0x6b,0x6b,0x6b,0x6a,
	0x6a,0x6a,0x69,0x69,0x68,0x68,0x68,0x67,0x67,0x67,0x66,0x66,0x66,0x65,0x65,0x65,
	0x64,0x64,0x64,0x63,0x63,0x63,0x62,0x62,0x62,0x61,0x61,0x61,0x60,0x60,0x60,0x5f,
	0x5f,0x5f,0x5e,0x5e,0x5e,0x5d,0x5d,0x5d,0x5d,0x5c,0x5c,0x5c,0x5b,0x5b,0x5b,0x5a,
	0x5a,0x5a,0x59,0x59,0x59,0x58,0x58,0x58,0x58,0x57,0x57,0x57,0x56,0x56,0x56,0x55,
	0x55,0x55,0x55,0x54,0x54,0x54,0x53,0x53,0x53,0x53,0x52,0x52,0x52,0x51,0x51,0x51,
	0x51,0x50,0x50,0x50,0x4f,0x4f,0x4f,0x4f,0x4e,0x4e,0x4e,0x4d,0x4d,0x4d,0x4d,0x4c,
	0x4c,0x4c,0x4c,0x4b,0x4b,0x4b,0x4a,0x4a,0x4a,0x4a,0x49,0x49,0x49,0x49,0x48,0x48,
	0x48,0x48,0x47,0x47,0x47,0x47,0x46,0x46,0x46,0x45,0x45,0x45,0x45,0x44,0x44,0x44,
	0x44,0x43,0x43,0x43,0x43,0x42,0x42,0x42,0x42,0x41,0x41,0x41,0x41,0x40,0x40,0x40,
	0x40,0x3f,0x3f,0x3f,0x3f,0x3f,0x3e,0x3e,0x3e,0x3e,0x3d,0x3d,0x3d,0x3d,0x3c,0x3c,
	0x3c,0x3c,0x3b,0x3b,0x3b,0x3b,0x3a,0x3a,0x3a,0x3a,0x3a,0x39,0x39,0x39,0x39,0x38,
	0x38,0x38,0x38,0x38,0x37,0x37,0x37,0x37,0x36,0x36,0x36,0x36,0x36,0x35,0x35,0x35,
	0x35,0x34,0x34,0x34,0x34,0x34,0x33,0x33,0x33,0x33,0x32,0x32,0x32,0x32,0x32,0x31,
	0x31,0x31,0x31,0x31,0x30,0x30,0x30,0x30,0x30,0x2f,0x2f,0x2f,0x2f,0x2e,0x2e,0x2e,
	0x2e,0x2e,0x2d,0x2d,0x2d,0x2d,0x2d,0x2c,0x2c,0x2c,0x2c,0x2c,0x2b,0x2b,0x2b,0x2b,
	0x2b,0x2a,0x2a,0x2a,0x2a,0x2a,0x29,0x29,0x29,0x29,0x29,0x28,0x28,0x28,0x28,0x28,
	0x28,0x27,0x27,0x27,0x27,0x27,0x26,0x26,0x26,0x26,0x26,0x25,0x25,0x25,0x25,0x25,
	0x24,0x24,0x24,0x24,0x24,0x24,0x23,0x23,0x23,0x23,0x23,0x22,0x22,0x22,0x22,0x22,
	0x22,0x21,0x21,0x21,0x21,0x21,0x20,0x20,0x20,0x20,0x20,0x20,0x1f,0x1f,0x1f,0x1f,
	0x1f,0x1f,0x1e,0x1e,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1d,0x1d,0x1d,0x1c,0x1c,0x1c,
	0x1c,0x1c,0x1c,0x1b,0x1b,0x1b,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x19,
	0x19,0x19,0x19,0x19,0x19,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x17,0x17,0x17,0x17,
	0x17,0x16,0x16,0x16,0x16,0x16,0x16,0x15,0x15,0x15,0x15,0x15,0x15,0x15,0x14,0x14,
	0x14,0x14,0x14,0x14,0x13,0x13,0x13,0x13,0x13,0x13,0x13,0x12,0x12,0x12,0x12,0x12,
	0x12,0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x10,0x10,0x10,0x10,0x10,0x10,0x0f,0x0f,
	0x0f,0x0f,0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0e,0x0e,0x0e,0x0e,0x0d,0x0d,0x0d,0x0d,
	0x0d,0x0d,0x0d,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b,0x0b,0x0b,0x0b,
	0x0b,0x0a,0x0a,0x0a,0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x08,
	0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x06,0x06,
	0x06,0x06,0x06,0x06,0x06,0x05,0x05,0x05,0x05,0x05,0x05,0x05,0x05,0x04,0x04,0x04,
	0x04,0x04,0x04,0x04,0x04,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x02,0x02,0x02,0x02,
	0x02,0x02,0x02,0x02,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x00,0x00,0x00,0x00
	};


/********FILE-ACCESS-RELATED:**********/

/* Returns 1 if fp is a non-null stream with neither its error nor its
end-of-file indicator set, and 0 otherwise. */
int	file_valid(FILE*fp)
{
	if(fp == NULL)
		return 0;
	if(ferror(fp) || feof(fp))
		return 0;
	return 1;
}

/* Apparently each implementation uses its own names for these file-internals flags -
TODO: need a portable way of checking file-access mode here.
int		file_valid_for_read (FILE*fp)
{
	blah... (similar to below)
}

int		file_valid_for_write(FILE*fp)
{
	if(fp && (fp->flag & (_EOF | _ERR) == 0) && (fp->flag & _WRITE != 0) )
		return TRUE;
	else
		return FALSE;
}
*/
/******************/

/* Print key platform info, (on x86) set FPU mode, do some basic self-tests: */
void host_init(void)
{
#ifdef MULTITHREAD
	int ncpu, nthr;
#endif
	double dbl;
	/*...time-related stuff	*/
	clock_t clock1, clock2;
	double tdiff;

	/* Various useful precomputed powers of 2 in floating-double form: */
	TWO25FLOAT = (double)0x02000000;				TWO25FLINV = 1.0/TWO25FLOAT;
	TWO26FLOAT = (double)0x04000000;				TWO26FLINV = 1.0/TWO26FLOAT;
	dbl = qfdbl(qfmul_pow2(QONE, -26));
	ASSERT(HERE, TWO26FLINV == dbl, "TWO26FLINV!");

	TWO13FLINV = qfdbl(qfmul_pow2(QONE, -13));

	TWO32FLOAT = (double)2.0*0x80000000;			TWO32FLINV = 1.0/TWO32FLOAT;
	TWO48FLOAT = (double)1.0*0x01000000*0x01000000;	TWO48FLINV = 1.0/TWO48FLOAT;
	TWO50FLOAT = (double)1.0*0x01000000*0x04000000;	TWO50FLINV = 1.0/TWO50FLOAT;
	TWO51FLOAT = (double)1.0*0x02000000*0x04000000;	TWO51FLINV = 1.0/TWO51FLOAT;
	TWO52FLOAT = (double)1.0*0x04000000*0x04000000;	TWO52FLINV = 1.0/TWO52FLOAT;
	TWO53FLOAT = (double)1.0*0x08000000*0x04000000;	TWO53FLINV = 1.0/TWO53FLOAT;
	TWO54FLOAT = (double)1.0*0x08000000*0x08000000;
	TWO63FLOAT = (double)2.0*0x80000000*0x80000000;
	TWO64FLOAT = (double)4.0*0x80000000*0x80000000;	TWO64FLINV = 1.0/TWO64FLOAT;

	// Check hashing routines in mi64 library (TO-DO: elaborate this into a full-blown mi64_test suite):
/*	printf("INFO: mi64 hashing routines...\n");
	uint64 md5[2], *x= (uint64 *)calloc(1000, sizeof(uint64));
	uint32 len = 10; // Only need 8 for md5-hash of 0
	mi64_md5(x, len, md5, cbuf);	sprintf(stderr,"MD5 = %s\n",cbuf);
*/
	/* Check qfloat routines (this call is also needed to init various qfloat global constants): */
	printf("INFO: testing qfloat routines...\n");
	qtest();	// 09/23/2012: Move to after above float-consts-inits because of the qfloat/mi64 routines which use those consts.

	/* Use qfloat routines to set the global floating-point constant 1/sqrt(2): */
	ASSERT(HERE, ISRT2 == qfdbl(QISRT2), "1/sqrt2 precision check failed!");
	ASSERT(HERE, SQRT2 == qfdbl(QSQRT2), "  sqrt2 precision check failed!");

#ifdef CPU_IS_X86	// May 2018: It seems I only found need to call this runtime CPU-mode setting in 32-bit x86 mode, not 64-bit. But had occasion
					// to fiddle w/rnd-mode in some x86_64 tests, so changed things so that the function is *defined* in both 32 and 64-bit modes.
	set_x87_fpu_params(FPU_64RND);
#endif
	// ewm [4. Aug 2014] - move below set_x87_fpu_params(), since need rnd-const set for any DNINT-using ref-computations in the GPU self-tests:
	print_host_info();
	check_nbits_in_types();

	// Test wide-mul routines:
	printf("INFO: testing IMUL routines...\n");
	ASSERT(HERE, test_mul() == 0, "test_mul() returns nonzero!");

	// Test the 64-bit 2^[+|-]p (mod q) functions:
	uint32 imax = 100000;
	fprintf(stderr,"INFO: Testing 64-bit 2^p (mod q) functions with %u random (p, q odd) pairs...\n",imax);
	clock1 = clock();
	ASSERT(HERE, test_twopmodq64(imax) == 0, "test_twopmodq64() returns nonzero!");
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
//	printf("Time for %u 2^[+|-]p (mod q) call pairs =%s\n",imax, get_time_str(tdiff));

	// Test certain aspects of SIMD functionality (aim is to expand this into a decently comprehensive
	// timing-test-of-key-SIMD-code-constructs suite):
#ifdef TEST_SIMD
	printf("INFO: Timing-testing selected FFT macros...\n");

  #if defined(USE_SSE2) && !defined(USE_AVX)	// 4-DFT is SSE2-only
//	ASSERT(HERE, test_radix4_dft() == 0, "test_radix4_dft() returns nonzero!");
  #endif

//	ASSERT(HERE, test_radix16_dft() == 0, "test_radix16_dft() returns nonzero!");

	#include "radix32_dif_dit_pass_asm.h"	// Commenting this out gives compile error
//	ASSERT(HERE, test_radix32_dft() == 0, "test_radix32_dft() returns nonzero!");

  #ifdef USE_AVX
//	ASSERT(HERE, test_simd_transpose_4x4() == 0, "test_simd_transpose_4x4() returns nonzero!");
  #endif
  #ifdef USE_AVX512
	ASSERT(HERE, test_simd_transpose_8x8() == 0, "test_simd_transpose_8x8() returns nonzero!");
exit(0);
  #endif
#endif

// Quick timings of various mi64 stuff:
#if INCLUDE_GMP && 0
	uint32 m = 33;	// 7 Sep 2021: GMP gcd on Haswell quad needs 24|54 min for F31|32-sized inputs; insufficient RAM (8 GB) for F33
					// On KNL with 16GB MCDRAM, need ??|??|?? min for F31|32|33, with F30 running on cores 0-63 and GIMPS-DC on 64-67.
	ASSERT(HERE, m < 64, "Fermat-number index must be < 64!");
	printf("INFO: testing GCD routines on F%u-sized inputs\n",m);
	// Apr 2021: check known factor of F31 using both mi64_div and GMP gcd, to get timing on the latter:
	rng_isaac_init(TRUE);
	uint64 rem[2] = {0ull,0ull}, q[2] = {3118754346955702273ull,2544ull};	// Known factor of F31: k = 3.13.140091319777; q = k.2^(m+2) + 1
	int i,isfact,nlimb = (1<<(m-6)) + 1;	// # of 64-bit limbs in Fm, which has 2^m+1 bits, thus needs one extra limb for the high 1-bit
	// vec0 is used for scratch storage, since mi64_mul_vector() does not permit in-place operation:
	uint64*vec0 = calloc(nlimb,sizeof(uint64));	ASSERT(HERE, vec0 != NULL, "vec0[]-array alloc failed!");
	uint64*vec1 = calloc(nlimb,sizeof(uint64));	ASSERT(HERE, vec1 != NULL, "vec1[]-array alloc failed!");
	uint64*vec2 = calloc(nlimb,sizeof(uint64));	ASSERT(HERE, vec2 != NULL, "vec2[]-array alloc failed!");
	// Init 2 random (mlimb-1)-length multiples of q:
	for(i = 0; i < nlimb-2; i++) { vec0[i] = rng_isaac_rand(); vec1[i] = rng_isaac_rand(); }
	// i holds product length on return:
	mi64_mul_vector(vec1,nlimb-2, q,2, vec2,&i);	ASSERT(HERE, i == nlimb, "Bad product length in gcd-test init!");
	mi64_mul_vector(vec0,nlimb-2, q,2, vec1,&i);	ASSERT(HERE, i == nlimb, "Bad product length in gcd-test init!");
	isfact = mi64_div(vec1,q, nlimb,2, 0x0, rem);	ASSERT(HERE, isfact != 0, "mi64_div failed to find target factor!");
	isfact = mi64_div(vec2,q, nlimb,2, 0x0, rem);	ASSERT(HERE, isfact != 0, "mi64_div failed to find target factor!");
	// Now feed our two random-multiple vectors to GMP gcd:
	char gcd_str[STR_MAX_LEN];
	isfact = gcd(0,0ull,vec1,vec2,nlimb,gcd_str);	// 1st arg = stage just completed
exit(0);
#endif

#if 0
	rng_isaac_init(TRUE);
	uint32 i32,x32,bit, i,imax;
	uint64 i64;
	imax = 100000000;

	clock1 = clock();
	i64 = 0;
	for(i = 0; i < imax; i++) {
		i64 += rng_isaac_rand();
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u rng64 calls =%s\n",imax, get_time_str(tdiff));
	ASSERT(HERE, i64 != 0ull,"rng64 sum = 0!");

	clock1 = clock();
	i32 = 0;
	for(i = 0; i < imax; i++) {
		i64 = rng_isaac_rand();
		x32 = (uint32)i64;
		i32 += popcount32(x32);
		i32 += popcount32((uint32)(i64>>16));
		i32 += popcount32((uint32)(i64>>24));
		i32 += popcount32((uint32)(i64>>32));
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u [rng64 + 4*popcount32()] calls =%s\n",imax, get_time_str(tdiff));
	ASSERT(HERE, i32,"popcount32 sum = 0!");

	clock1 = clock();
	i32 = 0;
	for(i = 0; i < imax; i++) {
		i64 = rng_isaac_rand();
		i32 += popcount64(i64);
		i32 += popcount64(i64>>16);
		i32 += popcount64(i64>>24);
		i32 += popcount64(i64>>32);
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u [rng64 + 4*popcount64()] calls =%s\n",imax, get_time_str(tdiff));
	ASSERT(HERE, i32,"popcount64 sum = 0!");

	clock1 = clock();
	i32 = 0;
	for(i = 0; i < imax; i++) {
		i64 = rng_isaac_rand();
		x32 = (uint32)i64;
		i32 += leadz32(x32);
		i32 += leadz32((uint32)(i64>>16));
		i32 += leadz32((uint32)(i64>>24));
		i32 += leadz32((uint32)(i64>>32));
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u [rng64 + 4*leadz32()] calls =%s\n",imax, get_time_str(tdiff));
	ASSERT(HERE, i32,"leadz32 sum = 0!");

	clock1 = clock();
	i32 = 0;
	for(i = 0; i < imax; i++) {
		i64 = rng_isaac_rand();
		i32 += leadz64(i64);
		i32 += leadz64(i64>>16);
		i32 += leadz64(i64>>24);
		i32 += leadz64(i64>>32);
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u [rng64 + 4*leadz64()] calls =%s\n",imax, get_time_str(tdiff));
	ASSERT(HERE, i32,"leadz64 sum = 0!");

	clock1 = clock();
	i32 = 0;
	for(i = 0; i < imax; i++) {
		i64 = rng_isaac_rand();
		x32 = (uint32)i64;
		i32 += trailz32(x32);
		i32 += trailz32(x32);
		i32 += trailz32((uint32)(i64>>16));
		i32 += trailz32((uint32)(i64>>24));
		i32 += trailz32((uint32)(i64>>32));
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u [rng64 + 4*trailz32()] calls =%s\n",imax, get_time_str(tdiff));
	ASSERT(HERE, i32,"trailz32 sum = 0!");

	clock1 = clock();
	i32 = 0;
	for(i = 0; i < imax; i++) {
		i64 = rng_isaac_rand();
		i32 += trailz64(i64);
		i32 += trailz64(i64>>16);
		i32 += trailz64(i64>>24);
		i32 += trailz64(i64>>32);
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u [rng64 + 4*trailz64()] calls =%s\n",imax, get_time_str(tdiff));
	ASSERT(HERE, i32,"trailz64 sum = 0!");
exit(0);
	clock1 = clock();
	for(i = 0; i < imax; i++) {
		uint64 i64 = rng_isaac_rand();
		bit = (i64>>32) & 0x1f;	if(!bit) continue;
		x32 = (uint32)i64;
		int ii = ith_set_bit32(x32,bit);
		if(popcount32(x32) < bit)
			ASSERT(HERE, ii == -1, "[bit]th-bit specifier out of range!");
		else {
			uint32 tmp32 = x32 << (31-ii);
			ASSERT(HERE, tmp32 & 0x80000000,"ith_set_bit64 retval not actually set!");
			ASSERT(HERE, popcount32(tmp32) == bit, "ith_set_bit32 checksum fail!");
		}
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u ith_set_bit32() calls =%s\n",imax, get_time_str(tdiff));

	clock1 = clock2;
	for(i = 0; i < imax; i++) {
		uint64 i64 = rng_isaac_rand();
		bit = (i64>>32) & 0x3f;	if(!bit) continue;
		int ii = ith_set_bit64(i64,bit);
		if(popcount64(i64) < bit)
			ASSERT(HERE, ii == -1, "[bit]th-bit specifier out of range!");
		else {
			uint64 tmp64 = i64 << (63-ii);
			// Must cast result of AND to 32-bit here (via compare-vs-0) since ASSERT (expr) is 32-bit:
			ASSERT(HERE, (tmp64 & 0x8000000000000000ull) != 0,"ith_set_bit64 retval not actually set!");
			ASSERT(HERE, popcount64(tmp64) == bit, "ith_set_bit64 checksum fail!");
		}
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u ith_set_bit64() calls =%s\n",imax, get_time_str(tdiff));

	clock1 = clock2;
	uint64 iarr[4];
	for(i = 0; i < imax; i++) {
		iarr[0] = rng_isaac_rand();
		iarr[1] = rng_isaac_rand();
		iarr[2] = rng_isaac_rand();
		iarr[3] = rng_isaac_rand();
		bit = (iarr[0]>>32) & 0xff;	if(!bit) continue;
		int ii = mi64_ith_set_bit(iarr,bit,4);
		if(mi64_popcount(iarr,4) < bit)
			ASSERT(HERE, ii == -1, "[bit]th-bit specifier out of range!");
		else {
			mi64_shl(iarr,iarr,(255-ii),4);
			// Must cast result of AND to 32-bit here (via compare-vs-0) since ASSERT (expr) is 32-bit:
			ASSERT(HERE, (iarr[3] & 0x8000000000000000ull) != 0,"mi64_ith_set_bit64 retval not actually set!");
			ASSERT(HERE, mi64_popcount(iarr,4) == bit, "mi64_ith_set_bit64 checksum fail!");
		}
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf("Time for %u mi64_ith_set_bit() calls =%s\n",imax, get_time_str(tdiff));
exit(0);

#elif 0
	printf("INFO: Testing mi64_add speed...\n");
	int i;
	const int n = 1000, iters = 1000000;
	// Allocate the main data arrays, require these to be on 16-byte boundaries to enable SSE2-based addsub:
	uint64 *u = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(HERE, ((uint32)u & 0xf) == 0, "u not 16-byte aligned!");
	uint64 *v = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(HERE, ((uint32)v & 0xf) == 0, "u not 16-byte aligned!");
	uint64 *x = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(HERE, ((uint32)x & 0xf) == 0, "u not 16-byte aligned!");
	uint64 *y = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(HERE, ((uint32)y & 0xf) == 0, "u not 16-byte aligned!");

	/* Init the RNG and the inputs: */
	rng_isaac_init(TRUE);
	for(i = 0; i < n; i++)
	{
		u[i] = rng_isaac_rand();
		v[i] = rng_isaac_rand();
	}

	// First test correctness:
	uint64 cy1 = mi64_add(u,v,x,n);
	uint64 cy2 = mi64_add_ref(u,v,y,n);
	if(cy1 != cy2) {
		printf("Carryout mismatch: cy1 = %llu, cy2 = %llu\n",cy1,cy2);
	//	ASSERT(HERE, 0, "Incorrect mi64_add carryout");	// GCC 4.4.5 builds on my SB give carry-mismatch here ... wtf?
	}
	for(i = 0; i < n; i++)
	{
		if(x[i] != y[i]) {
			printf("Output mismatch: x[%d] = %llu, y[%d] = %llu\n",i,x[i],i,y[i]);
			ASSERT(HERE, 0, "Incorrect mi64_add output element");
		}
	}

	// Now do timing:
	clock1 = clock();
	for(i = 0; i < iters; i++)
	{
		mi64_add(u,v,x,n);
	}
	clock2 = clock();
	tdiff = (double)(clock2 - clock1);
	printf	("mi64_add: Time for %llu limbs =%s\n",(uint64)iters*n, get_time_str(tdiff));
	exit(0);
#endif
	/************************************************************/
	/* Activate these in turn when a new M-prime is discovered: */
	/************************************************************/
#ifdef ENABLE_MPRIME_PM1_SMOOTH
	uint32 p = 82589933;
	// Uncomment the specific subfunctions you desire:
//	est_num_mp_in_interval(0,p);
//	compute_mers_best_fit();
	test_mp_pm1_smooth(p);
//	print_mp_dec(p);
	exit(0);
#endif

#ifdef MULTITHREAD

  #ifndef USE_PTHREAD
	#error PTHREAD define barfed - Did you include e.g. '-pthread' in your compile flags?
  #endif

  #ifdef USE_OMP
	#error Phreads is the only supported multithreading API!
//	MAX_THREADS = omp_get_num_procs();
  #elif(defined(USE_PTHREAD))
	MAX_THREADS = get_num_cores();
  #else
	#error Unrecognized multithreading model!
  #endif
	// MAX_THREADS based on number of processing cores will most often be a power of 2, but don't assume that.
	ASSERT(HERE, MAX_THREADS > 0,"Mlucas.c: MAX_THREADS must be > 0");

	printf("INFO: System has %d available processor cores.\n", MAX_THREADS);

	/* Test Multithreading via simple pthreading self-test: */
  #if 0
	ncpu = MAX_THREADS;
	printf("INFO: Testing Multithreading support with %d threads...\n", ncpu);
	// Toggle boolean 2nd arg here to enable verbose mode:
	ASSERT(HERE, test_pthreads(nthr,FALSE) == 0, "test_pthreads() returns nonzero!");
  #endif
#endif

// Define TEST_FFT_RADIX at compile time to activate short-length DFT self-test [Must select params in test_fft_radix.c]
#ifdef TEST_FFT_RADIX
  #ifdef USE_SSE2
	#error TEST_FFT_RADIX requires non-SIMD build!
  #endif
	test_fft_radix();
	exit(0);
#endif
}

// Jun 2015: Thanks to Alex Vong for the detective work here - In 32-bit Linux, may need
// to up the stacklimit from the defaults to avoid SIGSEGV faults during running of the
// alloc-heavy self-test suites.
// Since we see such crashes under at least one major distro (Debian),
// invoke the code below for all Linuxes, since at worst it will amount to a no-op.
// OS X shows no issues, but since the implementation here uses only the Posix-compliant
// [get,set]rlimit() utilities (as opposed to the Linux-only prlimit()), could easily extend
// to OS X or, better, "All Posix-compliant OSes", if needed, by changing defined(OS_TYPE_LINUX)
// in the #if function-body wrapper to defined(OS_POSIX_COMPLIANT).

/* If needed, set stack soft limit to stack hard limit and resume execution after registering
the changed setting with the shell, using execvp().

From http://linux.die.net/man/2/setrlimit, here is form of struct returned by getrlimit():
	struct rlimit {
		rlim_t rlim_cur;  // Soft limit
		rlim_t rlim_max;  // Hard limit (ceiling for rlim_cur)
	};
(I use C++ // inside the struct, so as to play nice with my own C-style comment wrapper here).
*/
/*
 * On 32-bit x86 Linux builds: if the stack soft limit is below the hard limit, raise the
 * soft limit to the hard limit and re-execute the program via execvp(argv[0], argv) so the
 * new limit takes effect. Returns normally (without re-exec) when the two limits already
 * match. On all other builds this function is a no-op.
 *
 * argv: the program's argument vector, argv[0] being the name used for the re-exec.
 */
void set_stacklimit_restart(char *argv[])
{
#if defined(OS_TYPE_LINUX) && defined(CPU_IS_X86)	// CPU_IS_X86 (platform.h) == '32-bit x86'.
	struct rlimit stack_limits;

	if (getrlimit(RLIMIT_STACK, &stack_limits)) {
		fprintf(stderr, "Call to getrlimit() failed.\n");
		ASSERT(HERE, 0, "Exiting.");
	}
	// rlim_t is not size_t (and may be 64-bit even in a 32-bit build), so widen explicitly
	// to unsigned long long rather than relying on %zu matching the argument width:
	printf("Old stack_limits: cur = %llu, max = %llu, [RLIM_INFINITY = %llu]\n",
	       (unsigned long long)stack_limits.rlim_cur,
	       (unsigned long long)stack_limits.rlim_max,
	       (unsigned long long)RLIM_INFINITY);

	// Nothing to do if the soft limit is already at its ceiling:
	if (stack_limits.rlim_cur == stack_limits.rlim_max)
		return;
	stack_limits.rlim_cur = stack_limits.rlim_max;

	if (setrlimit(RLIMIT_STACK, &stack_limits)) {
		fprintf(stderr, "Call to setrlimit() failed.\n");
		ASSERT(HERE, 0, "Exiting.");
	}
	printf("New stack_limits: cur = %llu, max = %llu\n",
	       (unsigned long long)stack_limits.rlim_cur,
	       (unsigned long long)stack_limits.rlim_max);

	// execvp() replaces the process image on success, so control only comes back on failure:
	if(execvp(argv[0], argv)) {
		fprintf(stderr, "Call to execvp() failed.\n");
		ASSERT(HERE, 0, "Exiting.");
	}
#else
	(void)argv;	// Unused on platforms where no restart is needed
#endif /* CPU_IS_X86  */
}

// Return system RAM in MB: on Linux this is the currently *free* RAM reported by sysinfo(),
// on OS X the total physical memory reported by the "hw.memsize" sysctl. Returns 0 if the
// amount cannot be determined (query failure, or an OS for which no query is implemented).
uint32 get_system_ram(void) {
#ifdef OS_TYPE_LINUX

	#include <sys/sysinfo.h>
	/* Here the syntax of the sysinfo() call, and the components of the sysinfo struct:

		int sysinfo(struct sysinfo *info);

		struct sysinfo {
			long uptime;				// Seconds since boot
			unsigned long loads[3];		// 1, 5, and 15 minute load averages
			unsigned long totalram;		// Total usable main memory size
			unsigned long freeram;		// Available memory size
			unsigned long sharedram;	// Amount of shared memory
			unsigned long bufferram;	// Memory used by buffers
			unsigned long totalswap;	// Total swap space size
			unsigned long freeswap;		// swap space still available
			unsigned short procs;		// Number of current processes
			unsigned long totalhigh;	// Total high memory size
			unsigned long freehigh;		// Available high memory size
			unsigned int mem_unit;		// Memory unit size in bytes
			char _f[20-2*sizeof(long)-sizeof(int)]; // Padding for libc5
		};
	*/
	struct sysinfo info;
	if (sysinfo(&info) != 0) {
		fprintf(stderr,"WARNING: sysinfo() call failed; unable to determine system RAM.\n");
		return 0;
	}
	// The memory-size fields are given in multiples of mem_unit bytes, which need not be 1
	// (e.g. 32-bit kernels with more RAM than fits in an unsigned long), so scale by it
	// in 64-bit arithmetic before converting to MB:
	unsigned long long unit     = info.mem_unit ? info.mem_unit : 1;
	unsigned long long total_mb = ((unsigned long long)info.totalram * unit) >> 20;
	unsigned long long free_mb  = ((unsigned long long)info.freeram  * unit) >> 20;
	fprintf(stderr,"System total RAM = %llu, free RAM = %llu\n",total_mb,free_mb);
	return (uint32)free_mb;

#elif defined(OS_TYPE_MACOSX)

	/* Get the physical memory size from the system; For details, 'man sysctl', and/or bookmark this Apple Developer page:
	https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/sysctlbyname.3.html. */
	uint64 totalram = 0;	// Under OS X, this needs to be a 64-bit int (size_t gave garbage results)
	size_t len = sizeof(totalram);
	if (sysctlbyname("hw.memsize", &totalram, &len, NULL, 0) != 0) {
		fprintf(stderr,"WARNING: sysctlbyname(\"hw.memsize\") call failed; unable to determine system RAM.\n");
		return 0;
	}
	return (uint32)(totalram >> 20);

#else

	// No RAM query implemented for this OS; return a defined value rather than falling
	// off the end of a non-void function:
	return 0;

#endif
}

// Apr 2018: Due to portability issues, replace the system-headers-based version of the "has advanced SIMD?"
// check with one based on what amounts to "is the result of 'grep asimd /proc/cpuinfo' empty or not?".
// Dec 2020: Apple M1 needs special handling, use the Clang/GCC-shared __ARM_NEON__ predefine to detect SIMD support:
#ifdef CPU_IS_ARM_EABI

  #ifdef OS_TYPE_MACOSX

	// Compile-time check: report the value of the compiler's __ARM_NEON predefine if it
	// exists, and 0 (no advanced-SIMD support) otherwise.
	int has_asimd(void)
	{
		int neon_flag = 0;
	#ifdef __ARM_NEON
		neon_flag = (int)__ARM_NEON;
	#endif
		return neon_flag;
	}

  #elif 1

	// Runtime check: scan /proc/cpuinfo line by line and return 1 if any line contains the
	// substring "asimd", 0 otherwise. Asserts if /proc/cpuinfo cannot be opened.
	int has_asimd(void)
	{
		char in_line[STR_MAX_LEN];
		int found = 0;
		FILE*fp = mlucas_fopen("/proc/cpuinfo", "r");
		ASSERT(HERE, fp != 0x0, "/proc/cpuinfo file not found!");
		// Stop at the first match, but always fall through to the fclose() below so the
		// file handle is released on the found-it path as well:
		while(!found && fgets(in_line, STR_MAX_LEN, fp) != 0x0) {
			if(strstr(in_line, "asimd") != 0)
				found = 1;
		}
		fclose(fp);	fp = 0x0;
		return found;
	}

  #elif __ARM_ARCH >= 8 // Rest of the preprocessor-conditional is the old version:

	#error This system-header-based ARM-has-ASIMD code should not be used!
	// Thanks to Laurent Desnogues for this:
	#include <sys/auxv.h>
	#include <asm/hwcap.h>
	// Check for flag indicating CPU supports advanced SIMD instruction set.
	// NB: For reasons unknown, when I tried putting this function def into get_cpuid.c as I do with the
	// x86 has_sse2() and similar functions, everything compiled fine but got linker errors on the ARM,
	// linker was unable to find the has_asimd() function. After dicking around with that problem for
	// several hours (and hence, several hours too many), tried moving the def here, and it worked:
	// Query the AT_HWCAP auxiliary-vector entry and return 1 if the HWCAP_ASIMD bit is set
	// in it, 0 otherwise.
	int has_asimd(void)
	{
	#ifndef HWCAP_ASIMD	// This is not def'd on pre-ASIMD platforms
		const unsigned long HWCAP_ASIMD = 0;	// Zero mask ==> the test below is always false
	#endif
		const unsigned long caps = getauxval(AT_HWCAP);
		return (caps & HWCAP_ASIMD) ? 1 : 0;
	}

  #else	// Pre-v8 ARM does not have above asimd-headers, no point even looking for them

	// Fallback stub for this preprocessor branch: unconditionally reports no advanced-SIMD support.
	int has_asimd(void)
	{
		return 0;
	}

  #endif	// #ifdef OS_TYPE_MACOSX

#endif	// #ifdef CPU_IS_ARM_EABI

/***The following 3 routines MUST BE CALLED IN THE SAME ORDER AS IN host_init()!***/

/*
 * Print a summary of the host and build configuration to stdout, and perform the
 * associated sanity checks. In order:
 *   1. Query system RAM via get_system_ram() and store the result in the SYSTEM_RAM global;
 *   2. In CUDA builds, enumerate the GPUs, print their properties and run the GPU self-tests
 *      (exits with status -1 if no CUDA-enabled GPU is found);
 *   3. Print CPU family, OS, bitness and compiler info;
 *   4. Compare the SIMD instruction set selected at build time (USE_* macros) against what
 *      the has_*() runtime checks report, asserting if the build needs an instruction set
 *      which the checks say is absent;
 *   5. Report various build options (prefetch, MUL_LOHI64 form, factoring modmul flavor);
 *   6. Call set_mlucas_path() and, if MLUCAS_PATH is non-empty, create that directory via
 *      mkdir_p(), asserting on failure.
 * Takes no arguments and returns nothing.
 */
void print_host_info(void)
{
	// Get available system RAM in MB, store in SYSTEM_RAM global:
	SYSTEM_RAM = get_system_ram();
	// Wording differs by OS: the Linux branch of get_system_ram() returns the free-RAM figure.
#ifdef OS_TYPE_LINUX
	printf("INFO: %u MB of free system RAM detected.\n",SYSTEM_RAM);
#else
	printf("INFO: %u MB of available system RAM detected.\n",SYSTEM_RAM);
#endif

#if defined(USE_GPU) && defined(__CUDACC__)

	gpu_config_t gpu_config;
	gpu_info_t ginfo;
	int32 igpu;

	// Populate gpu_config, then dump the properties of each detected device:
	gpu_init(&gpu_config);
	if (gpu_config.num_gpu > 0) {
		printf("Detected %u CUDA-enabled GPU devices.\n", gpu_config.num_gpu);
		for(igpu = 0; igpu < gpu_config.num_gpu; ++igpu) {
			ginfo = gpu_config.gpu_info[igpu];
			printf("GPU #%u: %s v%u.%u\n", igpu, ginfo.name, ginfo.major, ginfo.minor);
			printf("clockRate = %u MHz\n", ginfo.clockRate/1000);
			printf("multiProcessorCount = %u\n", ginfo.multiProcessorCount);
			// NOTE(review): the memory-size fields below are printed with %u; confirm that
			// matches the field types declared in gpu_info_t (not visible here).
			printf("totalConstMem = %u\n", ginfo.totalConstMem);
			printf("sharedMemPerBlock = %u\n", ginfo.sharedMemPerBlock);
			printf("totalGlobalMem = %u\n", ginfo.totalGlobalMem);
			printf("reg[ister]sPerBlock = %u\n", ginfo.regsPerBlock);
			printf("maxThreadsPerBlock = %u\n", ginfo.maxThreadsPerBlock);
			printf("deviceOverlap = %u\n", ginfo.deviceOverlap);
			printf("concurrentKernels = %u\n", ginfo.concurrentKernels);
			printf("warpSize = %u\n", ginfo.warpSize);
			printf("max_thread_dim[3] = [%u,%u,%u]\n", ginfo.maxThreadsDim[0], ginfo.maxThreadsDim[1], ginfo.maxThreadsDim[2]);
			printf("max_grid_size[3] = [%u,%u,%u]\n", ginfo.maxGridSize[0], ginfo.maxGridSize[1], ginfo.maxGridSize[2]);
		}
	} else {
		// A GPU build with no usable GPU cannot proceed:
		printf("ERROR: No CUDA-enabled GPUs found\n");
		exit(-1);
	}

	// Disable default spin-loop-wait-for-GPU:
	cudaSetDeviceFlags(cudaDeviceBlockingSync);

	// Pick up any error left pending by the CUDA calls above:
	cudaError_t cudaError = cudaGetLastError();
	if(cudaError != cudaSuccess)
	{
		printf("ERROR: cudaGetLastError() returned %d: %s\n", cudaError, cudaGetErrorString(cudaError));
		ASSERT(HERE, 0, "gpu_sieve: GPU-side error detected!");
	}

	// GPU self-tests:
//	cudaVecAddTest();
	cudaVecModpowTest64();
	cudaVecModpowTest78_0();
	cudaVecModpowTest78();
	cudaVecModpowTest96();
  #ifdef USE_FMADD
	cudaVecMul50x50Test();
  #endif
//exit(0);

#endif

#if EWM_DEBUG
	printf("INFO: Program compiled with debugging diagnostics ON.\n");
#endif

	printf("CPU Family = %s, OS = %s, %2d-bit Version, compiled with %s, Version %s.\n", CPU_NAME, OS_NAME, OS_BITS, COMPILER_NAME, COMPILER_VERSION);

	// Cross-check build-time SIMD selection against the runtime has_*() capability checks.
	// Exactly one of the branches below is compiled, according to CPU family and USE_* macro.
#ifdef CPU_IS_ARM_EABI

	if(has_asimd()) {
	#ifdef USE_ARM_V8_SIMD
		printf("INFO: Build uses ARMv8 advanced-SIMD instruction set.\n");
	#else
		printf("INFO: CPU supports ARMv8 advanced-SIMD instruction set, but using scalar floating-point build.\n");
	#endif
	} else {
	#ifdef USE_ARM_V8_SIMD
		ASSERT(HERE, 0, "#define USE_ARM_V8_SIMD invoked but no advanced-SIMD support detected on this CPU!\n");
	#endif
	}

#elif(defined(CPU_IS_X86) || defined(CPU_IS_IA64) || defined(CPU_IS_X86_64) || defined(CPU_IS_K1OM))

//	get_cpu();

/* Enable this call to get gory details: */
	#if(1)
		/* if(1) --> if(0) Enables section below */
	#elif(defined(COMPILER_TYPE_MSVC) || defined(COMPILER_TYPE_ICC))
		cpu_details();
	#endif

  #ifdef USE_IMCI512	// 1st-gen Xeon Phi (KNF,KNC)

	// IMCI-512 build: an AVX-512 CPU is treated as a mismatch, an IMCI-512 CPU is OK;
	// anything else dumps the raw CPUID leaf-1 registers for diagnosis and asserts.
	if(has_avx512()) {
		ASSERT(HERE, 0, "Build uses AVX-512 instruction set, but only IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n");
	} else if(has_imci512()) {
		printf("INFO: Build uses IMCI-512 instruction set.\n");
	} else {
		#define CPUID(arg1,arg2,ax,bx,cx,dx)\
			__asm__ __volatile__ ("cpuid":\
		"=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (arg1), "c" (arg2));
		uint32 a,b,c,d;
		CPUID(1,0,a,b,c,d);
		printf("has_imci512: CPUID returns [a,b,c,d] = [%8X,%8X,%8X,%8X]\n",a,b,c,d);
		// NOTE(review): message text refers to FMA, although the failed checks are has_avx512()/has_imci512().
		printf("#define USE_IMCI512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
		ASSERT(HERE, 0, "#define USE_IMCI512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
	}

  #elif(defined(USE_AVX512))

	// AVX-512 build: an IMCI-512 CPU is treated as a mismatch, an AVX-512 CPU is OK;
	// anything else dumps the raw CPUID leaf-1 registers for diagnosis and asserts.
	if(has_imci512()) {
		ASSERT(HERE, 0, "Build uses AVX512 instruction set, but only IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n");
	} else 	if(has_avx512()) {
		printf("INFO: Build uses AVX512 instruction set.\n");
	} else {
		#define CPUID(arg1,arg2,ax,bx,cx,dx)\
			__asm__ __volatile__ ("cpuid":\
		"=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (arg1), "c" (arg2));
		uint32 a,b,c,d;
		CPUID(1,0,a,b,c,d);
		printf("has_avx512: CPUID returns [a,b,c,d] = [%8X,%8X,%8X,%8X]\n",a,b,c,d);
		// NOTE(review): message text refers to FMA, although the failed check is has_avx512().
		printf("#define USE_AVX512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
		ASSERT(HERE, 0, "#define USE_AVX512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
	}

  #elif(defined(USE_AVX2))

	// AVX2 build: OK on AVX-512 or AVX2 CPUs (the former just gets an informational note);
	// anything else dumps the raw CPUID leaf-1 registers for diagnosis and asserts.
	if(has_avx512()) {
		printf("INFO: CPU supports AVX-512 instruction set, but using AVX2-enabled build.\n");
	} else 	if(has_avx2()) {
		printf("INFO: Build uses AVX2 instruction set.\n");
	} else {
		#define CPUID(arg1,arg2,ax,bx,cx,dx)\
			__asm__ __volatile__ ("cpuid":\
		"=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (arg1), "c" (arg2));
		uint32 a,b,c,d;
		CPUID(1,0,a,b,c,d);
		printf("has_avx2: CPUID returns [a,b,c,d] = [%8X,%8X,%8X,%8X]\n",a,b,c,d);
		printf("#define USE_AVX2 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
		ASSERT(HERE, 0, "#define USE_AVX2 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
	}

  #elif(defined(USE_AVX))

	// AVX build: OK on AVX2 or AVX CPUs; assert otherwise.
	if(has_avx2()) {
		printf("INFO: CPU supports AVX2 instruction set, but using AVX-enabled build.\n");
	} else if(has_avx()) {
		printf("INFO: Build uses AVX instruction set.\n");
	} else {
		ASSERT(HERE, 0, "#define USE_AVX invoked but no AVX support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
	}

  #elif(defined(USE_SSE2))
	/* This doesn't work on non-AVX platforms, since XGETBV (needed by has_avx*() functions) does not exist
	if(has_avx2()) {
		printf("INFO: CPU supports AVX2 instruction set, but using SSE2-enabled build.\n");
	} else if(has_avx()) {
		printf("INFO: CPU supports AVX instruction set, but using SSE2-enabled build.\n");
	} else */
	// SSE2 build: require SSE2, and also report which of the later SSE extensions the CPU has.
	if(has_sse2()) {
		printf("INFO: Build uses SSE2 ... 'enhanced SSE2' supported by CPU: SSE[3,3e,4.1,4.2] = [%u,%u,%u,%u]\n",has_sse3(),has_sse3e(),has_sse41(),has_sse42());
	} else {
		ASSERT(HERE, 0, "#define USE_SSE2 invoked but no SSE2 support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
	}

  #else

	// Scalar build: purely informational note if the CPU could have run an SSE2 build.
	if(has_sse2()) {
		printf("INFO: CPU supports SSE2 instruction set, but using scalar floating-point build.\n");
	}

  #endif

#endif

	// Report remaining compile-time options:
#if PFETCH
	printf("INFO: Using prefetch.\n");
#endif

#ifdef MUL_LOHI64_SUBROUTINE
	printf("INFO: Using subroutine form of MUL_LOHI64.\n");
#else
	printf("INFO: Using inline-macro form of MUL_LOHI64.\n");
#endif

#ifdef USE_FMADD
	printf("INFO: Using FMADD-based 100-bit modmul routines for factoring.\n");
#elif(defined(USE_FLOAT))
	printf("INFO: Using floating-double-based modmul routines for factoring.\n");
#endif

	// Establish MLUCAS_PATH and make sure the directory it names exists:
	set_mlucas_path();
	printf("INFO: MLUCAS_PATH is set to \"%s\"\n", MLUCAS_PATH);

	// An empty MLUCAS_PATH needs no directory creation:
	if (MLUCAS_PATH[0] != '\0') {
		// A zero return from mkdir_p() is treated as success here:
		if (!mkdir_p(MLUCAS_PATH)) {
			printf("INFO: mkdir -p \"%s\" succeeded\n", MLUCAS_PATH);
		} else {
			fprintf(stderr, "ERROR: mkdir -p \"%s\" failed\n", MLUCAS_PATH);
			ASSERT(HERE, 0, "Exiting.");
		}
	}
}

/*
80x87 FPU Control Register stuff: FPUCTRL is a 16-bit register. Counting from 0-15,
the 4 bits of interest in the present application are bits 8 and 9 which control the
FPU precision, and bits 10 and 11, which control the FPU rounding mode:

	Bits <9:8>:	Value	Precision		<11:10>:	Value	Rounding Mode
				-----	---------					-----	---------------
				00		24 bits						00		==> nearest (i.e. IEEE)
				01		Reserved					01		==> -oo
				10		53 bits						10		==> +oo
				11		64 bits						11		==> 0 (i.e. truncate)

For the purpose of completeness, the other FPU control bits are as follows
( Adapted from http://maven.smith.edu/~thiebaut/ArtOfAssembly/CH14/CH14-3.html ):

	Bits zero through five are the exception masks. These are similar to the
	interrupt enable bit in the 80x86's flags register. If these bits contain
	a one, the corresponding condition is ignored by the 80x87 FPU. However,
	if any bit contains zero, and the corresponding condition occurs, then
	the FPU immediately generates an interrupt so the program can handle the
	degenerate condition.

	Bit zero corresponds to an invalid operation error. This generally occurs
	as the result of a programming error. Problems which raise the invalid
	operation exception include pushing more than eight items onto the stack
	or attempting to pop an item off an empty stack, taking the square root
	of a negative number, or loading a non-empty register.

	Bit one masks the denormalized interrupt which occurs whenever you try to
	manipulate denormalized values. Denormalized values generally occur when
	you load arbitrary extended precision values into the FPU or work with
	very small numbers just beyond the range of the FPU's capabilities.
	Normally, you would probably not enable this exception.

	Bit two masks the zero divide exception. If this bit contains zero, the FPU
	will generate an interrupt if you attempt to divide a nonzero value by zero.
	If you do not enable the zero division exception, the FPU will produce NaN
	(not a number) whenever you perform a zero division.

	Bit three masks the overflow exception. The FPU will raise the overflow
	exception if a calculation overflows or if you attempt to store a value
	which is too large to fit into a destination operand (e.g., storing a large
	extended precision value into a single precision variable).

	Bit four, if set, masks the underflow exception. Underflow occurs when the
	result is too small to fit in the destination operand. Like overflow, this
	exception can occur whenever you store a small extended precision value into
	a smaller variable (single or double precision) or when the result of
	computation is too small for extended precision.

	Bit five controls whether the precision exception can occur. A precision
	exception occurs whenever the FPU produces an imprecise result, generally
	the result of an internal rounding operation. Although many operations will
	produce an exact result, many more will not. For example, dividing one by
	ten will produce an inexact result. Therefore, this bit is usually one since
	inexact results are very common.

	Bits six and thirteen through fifteen in the control register are currently
	undefined and reserved for future use. Bit seven is the interrupt enable mask,
	but it is only active on the 8087 FPU; a zero in this bit enables 8087
	interrupts and a one disables FPU interrupts.

	The 80x87 provides two instructions, FLDCW (load control word) and FSTCW (store
	control word), that let you load and store the contents of the control register.
	The single operand to these instructions must be a 16 bit memory location. The
	FLDCW instruction loads the control register from the specified memory location,
	FSTCW stores the control register into the specified memory location.
*/

/* Re.the SIMD control word, from http://softpixel.com/~cwright/programming/simd/sse.php :

	The MXCSR register is a 32-bit register containing flags for control and status information regarding SSE instructions.
	As of SSE3, only bits 0-15 have been defined.
														[EWM: Default value on MSVC/ia32 = 0x1FA0, so the bits marked with [x] are set:]
	Mnemonic	Bit Location	Description				[EWM: Default value on GCC/Core2-ia64 = 0x1FA2 = 1111110100010, bits [y] are set:]
	--------	------------	---------------------	[EWM: Default value on GCC/Haswell-ia64 = 0x9FE2 = 1001111111100010, bits [z] are set:]
		FZ		bit 15			Flush To Zero							[z]
		R+		bit<14:13> = 10	Round Positive
		R-		bit<14:13> = 01	Round Negative
		RZ		bit<14:13> = 11	Round To Zero
		RN		bit<14:13> = 00	Round To Nearest		[x]		[y]		[z]
		PM		bit 12			Precision Mask			[x]		[y]		[z]
		UM		bit 11			Underflow Mask			[x]		[y]		[z]
		OM		bit 10			Overflow Mask			[x]		[y]		[z]
		ZM		bit 9			Divide By Zero Mask		[x]		[y]		[z]
		DM		bit 8			Denormal Mask			[x]		[y]		[z]
		IM		bit 7			Invalid Operation Mask	[x]		[y]		[z]
		DAZ		bit 6			Denormals Are Zero						[z]
		PE		bit 5			Precision Flag			[x]		[y]		[z]
		UE		bit 4			Underflow Flag
		OE		bit 3			Overflow Flag
		ZE		bit 2			Divide By Zero Flag
		DE		bit 1			Denormal Flag					[y]		[z]
		IE		bit 0			Invalid Operation Flag
*/
// May 2018: changed things so that the function is *defined* in both 32 and 64-bit modes:
#if defined(CPU_IS_X86) || defined(CPU_IS_X86_64)

	/* Example: To flip bits 13:14 in MXCSR from their default value 00 (round-to-nearest) to 11 (round-toward-0):
		uint32 i = 0x00006000; i = x86_simd_mxcsr_toggle(i);
	*/
	// Read the MXCSR control/status register and hand back its current contents.
	// In builds without USE_SSE2 no register access is attempted and 0 is returned.
	uint32 x86_simd_mxcsr_getval(void) {
		uint32 csr = 0;
	#ifdef USE_SSE2	// Only supported if SSE is
		__asm__ volatile ("stmxcsr %0" : "=m" (csr) );
	#endif
		return csr;
	}
	// Overwrite the MXCSR with MXCSR_NEWVAL. The value the register held beforehand is returned
	// so that the caller can later restore it. In builds without USE_SSE2 this does nothing
	// and returns 0.
	uint32 x86_simd_mxcsr_setval(uint32 MXCSR_NEWVAL) {
		uint32 prev = 0;
	#ifdef USE_SSE2	// Only supported if SSE is
		__asm__ volatile ("stmxcsr %0" : "=m" (prev) );			// Save the current setting...
		__asm__ volatile ("ldmxcsr %0" :: "m" (MXCSR_NEWVAL) );	// ...then install the new one
	#else
		(void)MXCSR_NEWVAL;
	#endif
		return prev;
	}
	// XOR the MXCSR with MXCSR_MASK, i.e. flip each register bit whose mask bit is 1 and leave
	// the others alone. Returns the pre-toggle register value. In builds without USE_SSE2 this
	// does nothing and returns 0.
	uint32 x86_simd_mxcsr_toggle(uint32 MXCSR_MASK) {
		uint32 before = 0;
	#ifdef USE_SSE2	// Only supported if SSE is
		uint32 after;
		__asm__ volatile ("stmxcsr %0" : "=m" (before) );
		after = before ^ MXCSR_MASK;
		__asm__ volatile ("ldmxcsr %0" :: "m" (after) );
	#else
		(void)MXCSR_MASK;
	#endif
		return before;
	}

	/*
	Configure the floating-point control state for the run:
	  - Asserts that FPU_MODE is one of FPU_64RND or FPU_64CHOP;
	  - In USE_SSE2 builds (MSVC, GCC or SunC compilers), ORs 0x8040 into the MXCSR, i.e. sets
	    bit 15 (FZ) and bit 6 (DAZ);
	  - In builds where FP_MANTISSA_BITS_DOUBLE == 64, reads the current x87 (or, on IA64, FPSR)
	    control word, passes it to info_x87_fpu_ctrl() for reporting, then overwrites the control
	    word with FPU_MODE and reloads it.
	FPU_MODE: the full control-word value to install.
	*/
	void set_x87_fpu_params(unsigned short FPU_MODE)
	{
		/* SSE FPU control word support: */
	#ifdef USE_SSE2
		int oldMXCSR, newMXCSR;
	#endif
		// Storage for the control word; wider on IA64, where it is read via _mm_getfpsr():
	#ifdef CPU_IS_IA64
		int64 FPUCTRL;
	#else
		unsigned short FPUCTRL;
	#endif
		ASSERT(HERE, (FPU_MODE == FPU_64RND) || (FPU_MODE == FPU_64CHOP), "Illegal value of FPU_MODE");

		// Check the SIMD control word:
	#ifdef USE_SSE2
		#ifdef COMPILER_TYPE_MSVC

			__asm	stmxcsr oldMXCSR
			newMXCSR = oldMXCSR | 0x8040; // set DAZ and FZ bits
			__asm ldmxcsr newMXCSR

		#elif(defined(COMPILER_TYPE_GCC) || defined(COMPILER_TYPE_SUNC))

			__asm__ volatile ("stmxcsr %0" : "=m" (oldMXCSR) );
			newMXCSR = oldMXCSR | 0x8040; // set DAZ and FZ bits
			__asm__ volatile ("ldmxcsr %0" :: "m" (newMXCSR) );

		#endif
		// NOTE: for any other compiler type the MXCSR is left untouched.
	#endif

		/* Copy the FPU control word set by the compiler to a local variable
		(mainly so can see what the compiler sets), then overwrite with one of the above:
		*/
	#if(defined(COMPILER_TYPE_ICC) && defined(CPU_IS_X86))

		#error Intel C compiler currently unsupported for x86!

	#elif(FP_MANTISSA_BITS_DOUBLE == 64)/* (defined(CPU_IS_X86) || defined(CPU_IS_IA64))) */

	  #ifdef COMPILER_TYPE_GCC
		#warning INFO: Setting rnd_const-emulated DNINT for 64-bit x86 register-double significand
	  #endif

		#ifdef CPU_IS_IA64

			#ifndef COMPILER_TYPE_ICC
				#error unsupported compiler type for ia64!
			#endif
			FPUCTRL = _mm_getfpsr();
			info_x87_fpu_ctrl(FPUCTRL);
			/* Just use the same full-16-bit constant on all platforms, to ensure that there
			are no compiler-based differences in the other 12 bits, either: */
			// (FPUCTRL & 0x0000) is always 0, so this simply assigns FPU_MODE:
			FPUCTRL = (FPUCTRL & 0x0000) + FPU_MODE;
			_mm_setfpsr(FPUCTRL);

		#else

			// Store the current x87 control word into FPUCTRL, using compiler-specific asm syntax:
			#if(defined(COMPILER_TYPE_MWERKS) || defined(COMPILER_TYPE_MSVC))

				__asm	fstcw	FPUCTRL
				/*_controlfp(_PC_64, _MCW_PC);*/

			#elif(defined(COMPILER_TYPE_GCC) || defined(COMPILER_TYPE_SUNC))

				__asm__ volatile ("fstcw %0" : "=m" (FPUCTRL) );

			#else
				#error unsupported compiler type for x87!
			#endif
			// Report (via printf) any non-default precision/rounding settings found:
			info_x87_fpu_ctrl((uint64)FPUCTRL);

			/* Irrespective of what values the compiler set for bitfields <9:8> and <11:10>
			of FPUCTRL, set <9:8> = 11 and <11:10> = 00 to get the full 64-mantissa-bit
			precision available on the x87 and to ensure IEEE rounding mode, respectively:
			*/
			// NOTE(review): the control word is replaced wholesale by FPU_MODE, so the resulting
			// precision/rounding bits are whatever FPU_64RND or FPU_64CHOP encode; those
			// constants are defined outside this view - confirm against their definitions.
		  #if 1
			/* Just use the same full-16-bit constant on all platforms, to ensure that there
			are no compiler-based differences in the other 12 bits, either: */
			FPUCTRL = (FPUCTRL & 0x0000) + FPU_MODE;
		  #else
			***obsolete:***
			FPUCTRL &= 0xf0ff;	/* Clear bits 8:11... */
			FPUCTRL |= 0x0300;	/* And set them to the desired value. */
		  #endif

			/* ...and then reload the FPU control word for the changes to take effect. */
			#if(defined(COMPILER_TYPE_MWERKS) || defined(COMPILER_TYPE_MSVC))
				__asm	fldcw	FPUCTRL
			#elif(defined(COMPILER_TYPE_GCC) || defined(COMPILER_TYPE_SUNC))
				__asm__ volatile ("fldcw %0" :: "m" (FPUCTRL) );
			#endif

		#endif	/* endif(CPU_IS_X86...) */

	#endif	/* endif(FP_MANTISSA_BITS_DOUBLE) */
	}

	/*
	Inspect an x87 FPU control-word value and print an INFO message for each field whose
	setting differs from the one this program wants: bits <9:8> (precision) should be 11
	[64-bit mantissa] and bits <11:10> (rounding mode) should be 00 [round-to-nearest].
	This routine only reports; it does not modify the FPU state.
	FPUCTRL: the control-word value to examine (only bits 8-11 are looked at).
	*/
	void info_x87_fpu_ctrl(uint64 FPUCTRL)
	{
	#if EWM_DEBUG
		// The argument is 64 bits wide, so the conversion spec needs the ll length modifier:
		printf("INFO: x87 FPU Control Word = %16llX.\n", (unsigned long long)FPUCTRL);
	#endif

		/* Check bits <9:8>, and warn if the compiler isn't specifying 64-bit precision: */
		switch ((FPUCTRL >> 8) & 0x3) {
		case 0x3:
			break;
		case 0x2:
			printf("INFO: compiler sets x87 FPU to 53-bit mantissa mode. Overriding...Setting to 64-bit mode.\n");
			break;
		case 0x0:
			printf("INFO: compiler sets x87 FPU to 24-bit mantissa mode. Overriding...Setting to 64-bit mode.\n");
			break;
		default:	/* 0x1 = reserved encoding */
			printf("INFO: compiler sets x87 FPU to unknown precision. Overriding...Setting to 64-bit mode.\n");
		}

		/* Check bits <11:10>, and warn if the compiler isn't specifying round-to-nearest mode: */
		switch ((FPUCTRL >> 10) & 0x3) {
		case 0x0:
			break;
		case 0x1:
			printf("INFO: compiler sets x87 FPU to [round ==> -oo] rounding mode. Overriding...Setting to [round ==> nearest].\n");
			break;
		case 0x2:
			printf("INFO: compiler sets x87 FPU to [round ==> +oo] rounding mode. Overriding...Setting to [round ==> nearest].\n");
			break;
		case 0x3:
			printf("INFO: compiler sets x87 FPU to [round ==> 0] (truncate) rounding mode. Overriding...Setting to [round ==> nearest].\n");
			break;
		default:	/* Unreachable: the 2-bit mask admits only the 4 cases above */
			ASSERT(HERE, 0,"0");
		}
	}

#endif	// CPU_IS_X86 ?

/******* DEFINE GLOBALS, CHECK TYPE-LENGTHS AND ENDIANNESS, AND TEST RND-CONST FAST-NINT: *******/
// Compile-time switch for the optional fast-uint32-mod self-test section of check_nbits_in_types():
#define FAST_UINT32_MOD	0	// Set = 1 to enable this test

/*
Start-of-run sanity checks plus initialization of several globals. Takes no arguments and
returns nothing; any failed check is reported via ASSERT. In order, this routine:

	- Checks that TRUE/FALSE behave as expected and that the fixed-width integer typedefs
	  (and, on AltiVec/Cell builds, the vector typedefs) have the expected byte sizes;
	- Compares the byte order observed at runtime against the USE_BIG_ENDIAN build setting;
	- Initializes the RNG via rng_isaac_init(TRUE);
	- Obtains the rounding constants RND_A, RND_B via get_fp_rnd_const() and checks that DNINT
	  behaves as expected on pi and ln2;
	- Sets DAT_BITS, PAD_BITS, FFT_MUL_BASE and FFT_MUL_BASE_INV;
	- If FAST_UINT32_MOD is nonzero, runs a randomized test of a fast 32-bit mod algorithm;
	- Tests finvest() and fisqrtest() on fixed and random inputs, and range-checks the outputs
	  of rng_isaac_rand_double_norm_pos() and rng_isaac_rand_double_norm_pm1();
	- If USE_FGT61 is defined, tests prim_root_q() for power-of-2 and odd orders.

The order of the RNG-consuming steps is left exactly as-is, since each consumes values from
the same generator.
*/
void check_nbits_in_types(void)
{
	uint32 i,j;
	double ftmp, fran, ferr, finv, fsrt;
	double tpi = 3.1415926535897932384;
	double ln2 = LOG2;

	/* Make sure TRUE and FALSE behave as required: */
	ASSERT(HERE, !FALSE && TRUE, "TRUE and FALSE do not behave as required in check_nbits_in_types");

	/* Check lengths of basic data types: */
	ASSERT(HERE, sizeof( int8 ) == 1, "sizeof( int8 ) != 1");
	ASSERT(HERE, sizeof(uint8 ) == 1, "sizeof(uint8 ) != 1");
	ASSERT(HERE, sizeof( int16) == 2, "sizeof( int16) != 2");
	ASSERT(HERE, sizeof(uint16) == 2, "sizeof(uint16) != 2");
	ASSERT(HERE, sizeof( int32) == 4, "sizeof( int32) != 4");
	ASSERT(HERE, sizeof(uint32) == 4, "sizeof(uint32) != 4");
	ASSERT(HERE, sizeof( int64) == 8, "sizeof( int64) != 8");
	ASSERT(HERE, sizeof(uint64) == 8, "sizeof(uint64) != 8");
	ASSERT(HERE, sizeof(uint64) >= sizeof(void*), "sizeof(long long) != sizeof(void*)");    /* ALIGN_DOUBLES assumes this. */

	/* AltiVec vector types: */
#if(CPU_HAS_ALTIVEC || CPU_IS_CELL)
	ASSERT(HERE, sizeof(vec_uint8X16) == 16 , "sizeof(vec_uint8X16) != 16 ");
	ASSERT(HERE, sizeof(vec_uint16X8) == 16 , "sizeof(vec_uint16x8) != 16 ");
	ASSERT(HERE, sizeof(vec_uint32X4) == 16 , "sizeof(vec_uint32x4) != 16 ");
#endif

	// Endianness check: store a 64-bit value whose k-th least-significant byte equals k, then
	// examine the order in which those bytes actually sit in memory:
	uint64 x = 0x0706050403020100ull;
	uint8 *byte_arr = (uint8*)&x;
	// Runtime ordering is little-endian:
	if(byte_arr[0] == 0 && byte_arr[1] == 1 && byte_arr[2] == 2 && byte_arr[3] == 3 && byte_arr[4] == 4 && byte_arr[5] == 5 && byte_arr[6] == 6 && byte_arr[7] == 7) {
	  #ifdef USE_BIG_ENDIAN
		ASSERT(HERE, 0, "USE_BIG_ENDIAN set in platform.h but little-endian detected at runtime!");
	  #endif
	// Runtime ordering is big-endian:
	} else if(byte_arr[0] == 7 && byte_arr[1] == 6 && byte_arr[2] == 5 && byte_arr[3] == 4 && byte_arr[4] == 3 && byte_arr[5] == 2 && byte_arr[6] == 1 && byte_arr[7] == 0) {
	  #ifndef USE_BIG_ENDIAN
		ASSERT(HERE, 0, "USE_BIG_ENDIAN not set in platform.h but big-endian detected at runtime!");
	  #endif
	} else {
		ASSERT(HERE, 0, "Endianness detected as neither big nor little-endian at runtime!");
	}

	// Init RNG:
	rng_isaac_init(TRUE);

	/* We move this into a function defined in a separate file in an
	attempt to avoid too-clever compilers realizing that RND_A and RND_B
	have the same value and optimizing the +RND_A-RND_B sequences below away:
	*/
	get_fp_rnd_const(&RND_A, &RND_B);

	/* Attempted workaround for the ICC v9.0 weirdness here.
	NOTE: The #define directives inside the two branches below are handled by the preprocessor
	irrespective of the surrounding run-time if(), and both arms of each #if/#else define the
	same macro - thus PLATFORM_SKIP_RND_CONST_ENFORCEMENT ends up defined in every case, and it
	is the warn-only variant of the DNINT checks further down which gets compiled.
	*/
	if(DNINT(tpi) != 3.0 || DNINT(ln2) != 1.0)
	{
		if(FP_MANTISSA_BITS_DOUBLE == 64)
		{
			sprintf(cbuf, "WARN: 64-bit rounding constant not behaving as expected - trying 53-bit version.\n");
			DBG_WARN(HERE, cbuf, "", 0);
			/*
			If there are any platforms which fail here not because they need
			the 53-bit form of the rounding constant but rather because they inevitably
			optimize away the rounding add/sub sequence, but which appear to do the
			right thing in the actual ditN_cy_dif1 routines, indicate in the #if here:
			*/
		#if(1)
		  #define PLATFORM_SKIP_RND_CONST_ENFORCEMENT
		#else
		  #define PLATFORM_SKIP_RND_CONST_ENFORCEMENT
			RND_A = 3.0*0x4000000*0x2000000;
			RND_B =12.0*0x2000000*0x1000000;
		#endif
		}
		else
		{
			sprintf(cbuf, "WARN: 53-bit rounding constant not behaving as expected - trying 64-bit version.\n");
			DBG_WARN(HERE, cbuf, "", 0);
			/*
			If there are any platforms which fail here not because they need
			the 64-bit form of the rounding constant but rather because they inevitably
			optimize away the rounding add/sub sequence, but which appear to do the
			right thing in the actual ditN_cy_dif1 routines, indicate in the #if here:
			*/
		#if(1)
		  #define PLATFORM_SKIP_RND_CONST_ENFORCEMENT
		#else
		  #define PLATFORM_SKIP_RND_CONST_ENFORCEMENT
			RND_A = 3.0*0x4000000*0x2000000*0x800;
			RND_B =12.0*0x2000000*0x1000000*0x800;
		#endif
		}
	}

#ifdef PLATFORM_SKIP_RND_CONST_ENFORCEMENT

	// Warn-only variant: a misbehaving DNINT yields a warning rather than an assertion failure:
	sprintf(cbuf,"in check_nbits_in_types: RND_A = %20.3f, pi  = %20.3f,  DNINT(pi ) = %20.3f\n", RND_A, tpi, (double)DNINT(tpi));
	if((double)DNINT(tpi) != 3.0) {
		DBG_WARN(HERE, cbuf, "", TRUE);
	}
	sprintf(cbuf,"in check_nbits_in_types: RND_A = %20.3f, ln2 = %20.3f,  DNINT(ln2) = %20.3f\n", RND_A, ln2, (double)DNINT(ln2));
	if((double)DNINT(ln2) != 1.0) {
		DBG_WARN(HERE, cbuf, "", TRUE);
	}

#else

	// Strict variant: a misbehaving DNINT is an assertion failure:
	sprintf(cbuf,"in check_nbits_in_types: RND_A = %20.3f, pi  = %20.3f,  DNINT(pi ) = %20.3f\n", RND_A, tpi, (double)DNINT(tpi));
	ASSERT(HERE, (double)DNINT(tpi) == 3.0, cbuf);

	sprintf(cbuf,"in check_nbits_in_types: RND_A = %20.3f, ln2 = %20.3f,  DNINT(ln2) = %20.3f\n", RND_A, ln2, (double)DNINT(ln2));
	ASSERT(HERE, (double)DNINT(ln2) == 1.0, cbuf);

#endif

#if 0
#error Code obsolete as of Dec 2015!
	/* We typically need more information re. the FFT-mul params before being
	able to intelligently set the anti-thrashing array-padding params, so set = -1
	(which is taken to mean uninitialized) here:
	*/
	DAT_BITS = PAD_BITS = (int32)0xffffffff;
#else
	// Dec 2015: Subquadratic GCD needs FFT-mul with variable array lengths, thus no longer
	// have the "one FFT length at a time" simplicity of LL-testing where we can fiddle these
	// depending on exponent being tested, so now just set FFT array-padding params right here:
	DAT_BITS = DAT_BITS_DEF;	PAD_BITS = PAD_BITS_DEF;
	printf("Setting DAT_BITS = %d, PAD_BITS = %d\n",DAT_BITS,PAD_BITS);
#endif

	FFT_MUL_BASE = (double)((uint64)1 << FFT_MUL_BITS);
/* Intend to relax this later to allow powers of 2 as large as 2^54: */
ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");

	ASSERT(HERE, trailz64((uint64)FFT_MUL_BASE) == FFT_MUL_BITS, "mi64_cvt_double_uint64: trailz64((uint64)FFT_MUL_BASE) != FFT_MUL_BITS");
	ASSERT(HERE, DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "mi64_cvt_double_uint64: FFT_MUL_BASE not pure-integer!");
	ASSERT(HERE, FFT_MUL_BASE < 1.0*0x8000000*0x8000000, "mi64_cvt_double_uint64: FFT_MUL_BASE >= maximum allowed value of 2^54!");
	FFT_MUL_BASE_INV = 1.0/FFT_MUL_BASE;

  #if FAST_UINT32_MOD
  {	// Test gets its own scope, since its uint32 x would otherwise clash with the uint64 x
	// declared at function scope for the endianness check above.
	// Test fast 32-bit mod algo:
	#define gen_pinv(p)	(0xffffffff / (p) )

	const uint32 BITS1 = 24, mask1 = 0xffffffff >> (32 - BITS1);	// mask for x-input
	const uint32 BITS2 = 17, mask2 = 0xffffffff >> (32 - BITS2);	// mask for p-input
	uint32 x,p,pinv, r,y, nfail = 0;

	/* Using 10^9 random (x,p) pairs shows no failures up to the full 32 bits, when we fail in cases where pinv = 1.
	Here is a small sampling of the resulting #fails:

		I = 534397: Incorrect a (mod p) result! a = 1698375046, p = 3399405424, pinv = 1: r = 2593936918 [exact = 1698375046]
		I = 534400: Incorrect a (mod p) result! a = 471952975, p = 3563494799, pinv = 1: r = 1203425472 [exact = 471952975]
		I = 534401: Incorrect a (mod p) result! a = 1844700405, p = 3680268453, pinv = 1: r = 2459399248 [exact = 1844700405]
		I = 534409: Incorrect a (mod p) result! a = 190672429, p = 2680969504, pinv = 1: r = 1804670221 [exact = 190672429]
		I = 534410: Incorrect a (mod p) result! a = 2507724634, p = 4081983633, pinv = 1: r = 2720708297 [exact = 2507724634]
		I = 534411: Incorrect a (mod p) result! a = 1772269933, p = 3052535324, pinv = 1: r = 3014701905 [exact = 1772269933]
		I = 534418: Incorrect a (mod p) result! a = 1791901102, p = 4244017845, pinv = 1: r = 1842850553 [exact = 1791901102]

	In each of these cases the correct result is simply the unmodified x-input value, but our algo gives MUL-pair
	result r = 0, thus
		x = x - 0;
		r = x - p;
	and since p > x, the subtract has a borrow, i.e. yields 2^32 + x - p, which is only correct if interpreted as signed.
	To guard against this we simply add logic that checks if the MUL-pair result is 0 and returns x if true. That solves
	part of our problem, but we still must compute r = x - p, now guarding against underflow-on-subtract, as shown below.

	For inputs < 2^31, though, we can just use the simpler 4-step algorithm sans any conditionals except for the final
		r = (r >= p) ? x : r ;
	select step.
	*/
	uint32 ntry = 10000000, nneg = 0;
	printf ("INFO: Performing Fast-uint32-mod test with %u [%u,%u]-bit input pairs ... ",ntry,BITS1,BITS2);
	for(i = 0; i < ntry; i++)	// Cut #test down to 10^7 for production version
	{
		uint64 i64 = rng_isaac_rand();
		x = (uint32)(i64 >> 32);
		p = (uint32)i64;
		x &= mask1;	// Only support inputs of 24-bits or less
		p &= mask2;
		// Must guard against 0 modulus:
		while(p == 0) {
			i64 = rng_isaac_rand();
			p = (uint32)i64 & mask2;
		//	printf ("I = %d: p = 0! Replacing with %u\n",i, p);
		}
		y = x;	// Use x-copy in place of x below, since must leave inputs as-is
		pinv = gen_pinv(p);
		// Compute x % p (we hope) and check:
		r = __MULH32(y, pinv);	//	r = __mulhi (y, pinv);
		r = __MULL32(r, p   );	//	r = r * p;
		if(r != 0) {
			y = y - r;
			r = y - p;
			nneg += (r < p);	// Collect stats on how many of these cases - i.e. where y did need downward adjustment - occur
		//	if(r < p)printf ("I = %d Needed extra sub: a = %u; p = %u; pinv = %u [a/p = %f]: y = %u, r = %u]\n",i, x, p, pinv,(float)x/p, y,r);
			r = (r >= p) ? y : r ;
		} else {
			r = y - p;
			r = (r > y) ? y : r ;	// Must guard vs underflow-on-subtract
		}
		if (r != x%p) {
			++nfail;
			// i is a uint32, hence the %u conversion:
			printf ("I = %u: Incorrect a (mod p) result! a = %u, p = %u, pinv = %u: r = %u [exact = %u]\n",i, x, p, pinv, r, x%p);
		}
	}
	printf ("%u cases of %u [%6.2f%%] needed adjustment.\n",nneg,ntry,100.*nneg/(float)ntry);
	ASSERT(HERE, nfail == 0, "Fast-uint32-mod test failed for 1 or more inputs!");
  }
  #endif	// #if FAST_UINT32_MOD ?

	/* Test approximate 1/x and 1/sqrt(x) routines: */
	ftmp = finvest(1.5,  8);	/*fprintf(stderr, "finvest(1.5,  8) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/	ASSERT(HERE, fabs(ftmp - 0.666666666666667) < 4e-03, "Unacceptable level of error in finvest() call!");
	ftmp = finvest(1.5, 53);	/*fprintf(stderr, "finvest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/	ASSERT(HERE, fabs(ftmp - 0.666666666666667) < 1e-14, "Unacceptable level of error in finvest() call!");
	ftmp = finvest(1.0, 53);	/*fprintf(stderr, "finvest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/	ASSERT(HERE, fabs(ftmp - 1.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!");
	ftmp = finvest(2.0, 53);	/*fprintf(stderr, "finvest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.500000000000000));*/	ASSERT(HERE, fabs(ftmp - 0.500000000000000) < 1e-14, "Unacceptable level of error in finvest() call!");
	ftmp = finvest(0.5, 53);	/*fprintf(stderr, "finvest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/	ASSERT(HERE, fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!");
	ftmp = finvest(.75, 53);	/*fprintf(stderr, "finvest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.333333333333333));*/	ASSERT(HERE, fabs(ftmp - 1.333333333333333) < 1e-14, "Unacceptable level of error in finvest() call!");
	/* Try some large and small inputs: */
	ftmp = finvest(3.141592653589793e+15, 53);	/*fprintf(stderr, "finvest(3.141592653589793e+15, 53) gives err = %20.10e\n", fabs(ftmp - 3.183098861837907e-16));*/	ASSERT(HERE, fabs(ftmp - 3.183098861837907e-16) < 1e-14, "Unacceptable level of error in finvest() call!");
	ftmp = finvest(3.183098861837907e-16, 53);	/*fprintf(stderr, "finvest(3.183098861837907e-16, 53) gives err = %20.10e\n", fabs(ftmp - 3.141592653589793e+15));*/	ASSERT(HERE, fabs(ftmp - 3.141592653589793e+15) < 1e+00, "Unacceptable level of error in finvest() call!");

	ftmp = fisqrtest(1.5,  8);	/*fprintf(stderr, "fisqrtest(1.5,  8) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/	ASSERT(HERE, fabs(ftmp - 0.816496580927726) < 1e-3 , "Unacceptable level of error in fisqrtest() call!");
	ftmp = fisqrtest(1.5, 53);	/*fprintf(stderr, "fisqrtest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/	ASSERT(HERE, fabs(ftmp - 0.816496580927726) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
	ftmp = fisqrtest(1.0, 53);	/*fprintf(stderr, "fisqrtest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/	ASSERT(HERE, fabs(ftmp - 1.000000000000000) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
	ftmp = fisqrtest(2.0, 53);	/*fprintf(stderr, "fisqrtest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.707106781186548));*/	ASSERT(HERE, fabs(ftmp - 0.707106781186548) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
	ftmp = fisqrtest(0.5, 53);	/*fprintf(stderr, "fisqrtest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 1.414213562373095));*/	ASSERT(HERE, fabs(ftmp - 1.414213562373095) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
	ftmp = fisqrtest(0.3, 53);	/*fprintf(stderr, "fisqrtest(0.3, 53) gives err = %20.10e\n", fabs(ftmp - 1.825741858350554));*/	ASSERT(HERE, fabs(ftmp - 1.825741858350554) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
	ftmp = fisqrtest(.25, 53);	/*fprintf(stderr, "fisqrtest(.25, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/	ASSERT(HERE, fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
	ftmp = fisqrtest(.75, 53);	/*fprintf(stderr, "fisqrtest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.154700538379251));*/	ASSERT(HERE, fabs(ftmp - 1.154700538379251) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
	ftmp = fisqrtest(3.0, 53);	/*fprintf(stderr, "fisqrtest(3.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.577350269189626));*/	ASSERT(HERE, fabs(ftmp - 0.577350269189626) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
	/* Try some large and small inputs: */
	ftmp = fisqrtest(3.141592653589793e+15, 53);	/*fprintf(stderr, "fisqrtest(3.141592653589793e+15, 53); gives err = %20.10e\n", fabs(ftmp - 1.784124116152771e-08));*/	ASSERT(HERE, fabs(ftmp - 1.784124116152771e-08) < 1e-22, "Unacceptable level of error in fisqrtest() call!");
	ftmp = fisqrtest(3.183098861837907e-16, 53);	/*fprintf(stderr, "fisqrtest(3.183098861837907e-16, 53); gives err = %20.10e\n", fabs(ftmp - 5.604991216397928e+07));*/	ASSERT(HERE, fabs(ftmp - 5.604991216397928e+07) < 1e-07, "Unacceptable level of error in fisqrtest() call!");

	/* Now do a whole mess of 'em: */
	for(i = 0; i < 100000; i++)
	{
		fran = rng_isaac_rand_double();
		fran = fabs(fran);
		if(fran > 0.0) {
			// Compare against the directly-computed reciprocal, using the relative measure (a-b)/(a+b):
			ftmp = finvest  (fran, 53);
			finv = 1.0/fran;
			ferr = (ftmp - finv)/(ftmp + finv);
			ASSERT(HERE, fabs(ferr) < 1e-14, "Unacceptable level of error in finvest  () call!");

			// Ditto for the inverse square root:
			ftmp = fisqrtest(fran, 53);
			fsrt = 1.0/sqrt(fran);
			ferr = (ftmp - fsrt)/(ftmp + fsrt);
			ASSERT(HERE, fabs(ferr) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
		}

		// Loop index i is a uint32, hence the %u conversion in the two error messages below:
		fran = rng_isaac_rand_double_norm_pos();
		if(fran < 0.0 || fran >= 1.0) {
			sprintf(cbuf, "check_nbits_in_types: rng_isaac_rand_double_norm_pos returns illegal value outside [0, 1): i = %u, %e\n", i,fran);
			ASSERT(HERE, 0, cbuf);
		}

		fran = rng_isaac_rand_double_norm_pm1();
		if(fabs(fran) >= 1.0) {
			sprintf(cbuf, "check_nbits_in_types: rng_isaac_rand_double_norm_pm1 returns illegal value outside (-1,+1): i = %u, %e\n", i, fran);
			ASSERT(HERE, 0, cbuf);
		}
	}

#ifdef USE_FMADD
#warning USE_FMADD enabled in util.c!
	/* Test any FMADD-based routines, if def'd: //
	printf("INFO: Testing MUL50x50 routines ... ");
	uint32 nerr = test_mul50x50();
	if(!nerr)
		printf("fma_dmult_tests completed successfully!\n");
	else
		ASSERT(HERE, 0, "fma_dmult_tests failed!\n");
	*/
#endif

#ifdef USE_FGT61
	printf("INFO: Testing FGT (mod M61) arithmetic ... \n");
	const uint64 q = 0x1FFFFFFFFFFFFFFFull;
	uint64 order, root_re,root_im, re,im;

	// Test various functions defined in fgt_m61.c - modular powering (both that
	// used in root-taking and of the results) is a good 'whole code' test.

	// [1] Test out power-of-2 roots in GF(q^2) -
	//	Maximal order (q^2-1) = 2^62 * (2^60-1), allowing power-of-2 roots up to 2^62:
	/* Here are the roots found in the loop below:
		FGT: prim-root of order 2^ 0 = 1 + I*0
		FGT: prim-root of order 2^ 1 = 2305843009213693950 + I*0 == -1 (mod q)
		FGT: prim-root of order 2^ 2 = 0 + I*1
		FGT: prim-root of order 2^ 3 =          1073741824 + I*         1073741824 = 2^30 * (1 + I)
		FGT: prim-root of order 2^ 4 = 1693317751237720973 + I*2283815672160731785
		FGT: prim-root of order 2^ 5 =  697323983679957246 + I*  83304533336094567
		...
		FGT: prim-root of order 2^62 = 1895584235299698857 + I*2150133374943417338
	*/
	for(i = 0; i < 63; i++)
	{
		order = 1ull << i;
		prim_root_q(order, &root_re,&root_im);
	//	printf("FGT: prim-root of order 2^%2u = %llu + I*%llu\n",i, root_re,root_im);
		// Check order-primitivity of roots of order > 1 by powering result up to 2nd order; result must == -1 (mod q):
		if(i > 0) {
			for(j = 1; j < i; j++) {
				csqr_modq(root_re,root_im, &root_re,&root_im);
				root_re = qreduce(root_re);	root_im = qreduce(root_im);	// Only partially reduce intermediates...
			}
			root_re = qreduce_finish(root_re);	root_im = qreduce_finish(root_im);	// ...and then finish reducing here.
			ASSERT(HERE, root_re ==  q-1 && root_im == 0ull, "Bad prim_root_q result!");
		} else {
			ASSERT(HERE, root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!");
		}
	}

#if 0
	// Play with conjugates of both power-of-2 and non-power-of-2 (but still even-order) roots:
	// Power-of-2 roots satisfy simple conjugate rule, modular analog of complex conj(Re,Im) = (Re,-Im):
	order = 16;	prim_root_q(order, &root_re,&root_im);
	pow_modq(order-1, root_re,root_im, &re,&im);
	printf("FGT: prim-root of order %u = %llu + I*%llu, Conjugate = %llu + I*%llu [q-Im = %llu]\n",(uint32)order, root_re,root_im, re,im,q-im);
//	FGT: prim-root of order 16 = 1693317751237720973 + I*2283815672160731785,
//					Conjugate =  1693317751237720973 + I*  22027337052962166 [q-Im = 2283815672160731785]
	ASSERT(HERE, root_re == re && root_im == (q-im), "Bad power-of-2 conjugate!");

	// Non-power-of-2 roots satisfy no simple conjugate rules, so multiply root and its conjugate together as sanity check:
	order = 24;	prim_root_q(order, &root_re,&root_im);
	pow_modq(order-1, root_re,root_im, &re,&im);
	printf("FGT: prim-root of order %u = %llu + I*%llu, Conjugate = %llu + I*%llu [q-Im = %llu]\n",(uint32)order, root_re,root_im, re,im,q-im);
	cmul_modq(root_re,root_im, re,im, &re,&im);
	re = qreduce_full(re);	im = qreduce_full(im);
	ASSERT(HERE, re == 1ull && im == 0ull, "Bad non-power-of-2 conjugate!");
/*
	24th root:
	FGT: prim-root of order 24 = 244692701471512749 + I*2061150307742181202,
					Conjugate = 2061150308815923026 + I*2061150308815923026 [q-Im = 244692700397770925]

	*NOTE* Conjugate has Re == Im! Thus reminiscent of 8th root:
	FGT: prim-root of order 8 =          1073741824 + I*         1073741824 = 2^30 * (1 + I)

	FGT: prim-root of order 3 = 1669582390241348315 + I*0 = R3

	printf("Powers of prim-root:\n");
	re = root_re;	im = root_im;
	for(i = 0; i < order; i++) {
		printf("%2u: %20llu[-= %20llu] + I*%20llu[-= %20llu]\n",i+1, re,q-re,im,q-im);
		cmul_modq(root_re,root_im, re,im, &re,&im);
		re = qreduce_full(re);	im = qreduce_full(im);
	}
Gives
	Powers of prim-root:
	 1:   244692701471512749[-=  2061150307742181202] + I* 2061150307742181202[-=   244692701471512749]	<*** Call thib [a,b]
	 2:                    0[-=  2305843009213693951] + I*  636260618972345636[-=  1669582390241348315]	[0,c]
	 3:           1073741824[-=  2305843008139952127] + I*          1073741824[-=  2305843008139952127]	[d,d]
	 4:  1669582390241348316[-=   636260618972345635] + I*                   0[-=  2305843009213693951]	[e,0]
	 5:   244692700397770925[-=  2061150308815923026] + I* 2061150308815923026[-=   244692700397770925]	[f,-f]
	 6:                    0[-=  2305843009213693951] + I*                   1[-=  2305843009213693950]	[0,1]
	 7:   244692701471512749[-=  2061150307742181202] + I*  244692701471512749[-=  2061150307742181202]	[a,-b]
	 8:  1669582390241348315[-=   636260618972345636] + I*                   0[-=  2305843009213693951]	[-c,0]
	 9:  2305843008139952127[-=           1073741824] + I*          1073741824[-=  2305843008139952127]	[-d,d]
	10:                    0[-=  2305843009213693951] + I* 1669582390241348316[-=   636260618972345635]	[0,e]
	11:   244692700397770925[-=  2061150308815923026] + I*  244692700397770925[-=  2061150308815923026]	[f,f]
	12:  2305843009213693950[-=                    1] + I*                   0[-=  2305843009213693951]	[-1,0]
	13:  2061150307742181202[-=   244692701471512749] + I*  244692701471512749[-=  2061150307742181202]	[-a,b]
	14:                    0[-=  2305843009213693951] + I* 1669582390241348315[-=   636260618972345636]	[0,-c]
	15:  2305843008139952127[-=           1073741824] + I* 2305843008139952127[-=           1073741824]	[-d,-d]
	16:   636260618972345635[-=  1669582390241348316] + I*                   0[-=  2305843009213693951]	[-e,0]
	17:  2061150308815923026[-=   244692700397770925] + I*  244692700397770925[-=  2061150308815923026]	[-f,f]
	18:                    0[-=  2305843009213693951] + I* 2305843009213693950[-=                    1]	[0,-1]
	19:  2061150307742181202[-=   244692701471512749] + I* 2061150307742181202[-=   244692701471512749]	[-a,-b]
	20:   636260618972345636[-=  1669582390241348315] + I*                   0[-=  2305843009213693951]	[c,0]
	21:           1073741824[-=  2305843008139952127] + I* 2305843008139952127[-=           1073741824]	[d,-d]
	22:                    0[-=  2305843009213693951] + I*  636260618972345635[-=  1669582390241348316]	[0,-e]
	23:  2061150308815923026[-=   244692700397770925] + I* 2061150308815923026[-=   244692700397770925]	[-f,-f]
	24:                    1[-=  2305843009213693950] + I*                   0[-=  2305843009213693951]	[1,0]
The four [+-d,+-d] and four powers of I are just the eight 8th roots of unity which are hit on multiple-of-3 index values.
*/
#endif

	// [2] Test out odd-order roots - that means any order dividing (2^60-1) = 3^2.5^2.7.11.13.31.41.61.151.331.1321:
	/* Here are the roots found in the loop below:
		FGT: prim-root of order 3 = 1669582390241348315 + I*0
		FGT: prim-root of order 9 = 1102844585000305877 + I*0
		FGT: prim-root of order 45 = 295230898440480023 + I*0
		...
		FGT: prim-root of order 1152921504606846975 = 1754029865706415802 + I*0
	*/
	const uint32 odd_ord_facs[13] = {3,3,5,5,7,11,13,31,41,61,151,331,1321};
	order = 1ull;
	for(i = 0; i < 13; i++)
	{
		order *= odd_ord_facs[i];	// Running product of the odd prime factors, so each pass tests a larger order
		prim_root_q(order, &root_re,&root_im);
	//	printf("FGT: prim-root of order %llu = %llu + I*%llu\n",order, root_re,root_im);
		ASSERT(HERE, root_im == 0ull, "Odd roots must be strictly real!!");
		// Check each root by raising result to (order)th power; result must == +1 (mod q):
		pow_modq(order, root_re,root_im, &root_re,&root_im);
		ASSERT(HERE, root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!");
	}
	printf("fgt_m61 tests completed successfully!\n");
#endif

	return;
}

/*
Fast-uint32-mod test:
Is there any obvious pattern in the need-adjustment cases? The a/p values listed below show that these are the cases where a/p has a small fractional part:
I = 19 Needed extra sub: a = 1024576945; p = 12037616; pinv = 356 [a/p = 85.114609]: y = 13417201, r = 1379585]
I = 25 Needed extra sub: a = 230529187; p = 11523083; pinv = 372 [a/p = 20.005859]: y = 11590610, r = 67527]
I = 32 Needed extra sub: a = 211760194; p = 111687; pinv = 38455 [a/p = 1896.014648]: y = 113329, r = 1642]
I = 43 Needed extra sub: a = 1066942432; p = 6128196; pinv = 700 [a/p = 174.103836]: y = 6764524, r = 636328]
I = 50 Needed extra sub: a = 968649863; p = 8139751; pinv = 527 [a/p = 119.002396]: y = 8159245, r = 19494]
I = 87 Needed extra sub: a = 968400190; p = 10515906; pinv = 408 [a/p = 92.089088]: y = 11452744, r = 936838]
I = 94 Needed extra sub: a = 1057486414; p = 15537360; pinv = 276 [a/p = 68.060883]: y = 16483294, r = 945934]
I = 109 Needed extra sub: a = 533959987; p = 14415015; pinv = 297 [a/p = 37.041931]: y = 15019447, r = 604432]
I = 111 Needed extra sub: a = 993195985; p = 3571948; pinv = 1202 [a/p = 278.054443]: y = 3766389, r = 194441]
I = 130 Needed extra sub: a = 992040856; p = 6158255; pinv = 697 [a/p = 161.091217]: y = 6720056, r = 561801]
I = 173 Needed extra sub: a = 770797430; p = 10005071; pinv = 429 [a/p = 77.040680]: y = 10412034, r = 406963]
I = 182 Needed extra sub: a = 811377653; p = 2526617; pinv = 1699 [a/p = 321.132050]: y = 2860213, r = 333596]
I = 188 Needed extra sub: a = 1013613270; p = 8803835; pinv = 487 [a/p = 115.133148]: y = 9976080, r = 1172245]
I = 204 Needed extra sub: a = 862933752; p = 4663527; pinv = 920 [a/p = 185.038864]: y = 4844784, r = 181257]
I = 209 Needed extra sub: a = 381286368; p = 4536865; pinv = 946 [a/p = 84.041817]: y = 4726573, r = 189708]
I = 229 Needed extra sub: a = 311393414; p = 12943483; pinv = 331 [a/p = 24.057930]: y = 13693305, r = 749822]
I = 241 Needed extra sub: a = 875437669; p = 6339203; pinv = 677 [a/p = 138.099014]: y = 6966858, r = 627655]
I = 279 Needed extra sub: a = 860144752; p = 8108626; pinv = 529 [a/p = 106.077744]: y = 8739022, r = 630396]
I = 282 Needed extra sub: a = 874645009; p = 14087847; pinv = 304 [a/p = 62.085072]: y = 15286342, r = 1198495]
I = 295 Needed extra sub: a = 131723624; p = 10972820; pinv = 391 [a/p = 12.004537]: y = 11022604, r = 49784]
I = 312 Needed extra sub: a = 478423934; p = 1696304; pinv = 2531 [a/p = 282.039032]: y = 1762510, r = 66206]
I = 346 Needed extra sub: a = 773372954; p = 7359080; pinv = 583 [a/p = 105.090981]: y = 8028634, r = 669554]
I = 347 Needed extra sub: a = 769693293; p = 3115804; pinv = 1378 [a/p = 247.028793]: y = 3205509, r = 89705]
I = 354 Needed extra sub: a = 753736067; p = 7317291; pinv = 586 [a/p = 103.007530]: y = 7372385, r = 55094]
I = 355 Needed extra sub: a = 402434763; p = 3193892; pinv = 1344 [a/p = 126.001366]: y = 3198263, r = 4371]
I = 373 Needed extra sub: a = 303380021; p = 12635991; pinv = 339 [a/p = 24.009199]: y = 12752228, r = 116237]
I = 375 Needed extra sub: a = 980850028; p = 11514899; pinv = 372 [a/p = 85.180954]: y = 13598512, r = 2083613]
I = 385 Needed extra sub: a = 484277536; p = 4000805; pinv = 1073 [a/p = 121.045021]: y = 4180936, r = 180131]
I = 400 Needed extra sub: a = 453815187; p = 6670148; pinv = 643 [a/p = 68.036751]: y = 6915271, r = 245123]
I = 404 Needed extra sub: a = 867743335; p = 6621183; pinv = 648 [a/p = 131.055634]: y = 6989545, r = 368362]
I = 433 Needed extra sub: a = 866814550; p = 12037278; pinv = 356 [a/p = 72.010841]: y = 12167812, r = 130534]
I = 452 Needed extra sub: a = 914104228; p = 6818552; pinv = 629 [a/p = 134.061340]: y = 7236812, r = 418260]
I = 458 Needed extra sub: a = 392296762; p = 14003966; pinv = 306 [a/p = 28.013262]: y = 14189680, r = 185714]
I = 465 Needed extra sub: a = 504882326; p = 10740929; pinv = 399 [a/p = 47.005463]: y = 10799592, r = 58663]
I = 476 Needed extra sub: a = 255805064; p = 2013610; pinv = 2132 [a/p = 127.038033]: y = 2090204, r = 76594]
I = 540 Needed extra sub: a = 699026720; p = 544411; pinv = 7889 [a/p = 1284.005493]: y = 547407, r = 2996]
I = 549 Needed extra sub: a = 1066225928; p = 8260936; pinv = 519 [a/p = 129.068420]: y = 8826120, r = 565184]
I = 561 Needed extra sub: a = 811854178; p = 4584781; pinv = 936 [a/p = 177.075897]: y = 4932722, r = 347941]
I = 571 Needed extra sub: a = 505541881; p = 11747289; pinv = 365 [a/p = 43.034771]: y = 12155743, r = 408454]
I = 590 Needed extra sub: a = 675714616; p = 6756887; pinv = 635 [a/p = 100.003838]: y = 6782803, r = 25916]
I = 605 Needed extra sub: a = 718292932; p = 10559748; pinv = 406 [a/p = 68.021790]: y = 10789816, r = 230068]
I = 616 Needed extra sub: a = 470903290; p = 7846004; pinv = 547 [a/p = 60.018234]: y = 7989054, r = 143050]
I = 639 Needed extra sub: a = 900990749; p = 13056614; pinv = 328 [a/p = 69.006462]: y = 13140997, r = 84383]
I = 654 Needed extra sub: a = 518909096; p = 7518515; pinv = 571 [a/p = 69.017494]: y = 7650076, r = 131561]
I = 658 Needed extra sub: a = 746682360; p = 9439608; pinv = 454 [a/p = 79.100990]: y = 10392936, r = 953328]
I = 693 Needed extra sub: a = 891783098; p = 14139951; pinv = 303 [a/p = 63.068329]: y = 15106136, r = 966185]
I = 731 Needed extra sub: a = 618218131; p = 13436506; pinv = 319 [a/p = 46.010334]: y = 13575361, r = 138855]
I = 732 Needed extra sub: a = 497047750; p = 4518412; pinv = 950 [a/p = 110.004959]: y = 4540842, r = 22430]
I = 740 Needed extra sub: a = 508301929; p = 10796855; pinv = 397 [a/p = 47.078701]: y = 11646599, r = 849744]
I = 857 Needed extra sub: a = 864091071; p = 8538861; pinv = 502 [a/p = 101.195122]: y = 10204971, r = 1666110]
I = 858 Needed extra sub: a = 131898711; p = 13183606; pinv = 325 [a/p = 10.004752]: y = 13246257, r = 62651]
I = 861 Needed extra sub: a = 887750200; p = 4247290; pinv = 1011 [a/p = 209.015686]: y = 4313880, r = 66590]
I = 864 Needed extra sub: a = 435680256; p = 350783; pinv = 12243 [a/p = 1242.022095]: y = 358553, r = 7770]
I = 891 Needed extra sub: a = 860496863; p = 10227272; pinv = 419 [a/p = 84.137474]: y = 11633287, r = 1406015]
I = 897 Needed extra sub: a = 940041261; p = 10427428; pinv = 411 [a/p = 90.150826]: y = 12000169, r = 1572741]
I = 913 Needed extra sub: a = 1059585665; p = 13749374; pinv = 312 [a/p = 77.064285]: y = 14633241, r = 883867]
I = 929 Needed extra sub: a = 804154824; p = 5824066; pinv = 737 [a/p = 138.074463]: y = 6257782, r = 433716]
I = 964 Needed extra sub: a = 1022970393; p = 5911156; pinv = 726 [a/p = 173.057587]: y = 6251561, r = 340405]
I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.156265]: y = 13391342, r = 1809773]
*/

/********************/

#if defined(USE_AVX2) && !defined(USE_IMCI512)

	// Self-test of fmadd-based 50x50==>100-bit exact integer product algorithms:
	uint32	test_mul50x50()
	{
		int i,j,k, pow2;
		double pow2_dmult,pow2_imult;
		uint32 nerr = 0, itmp32;
		const double crnd = 3.0*0x4000000*0x2000000, crnd50 = crnd*TWO50FLOAT;	// Consts used to emulate DNINT(x) and 2^50 * DNINT(x*2^-50)
										// (i.e. round-to-nearest-multiple-of-2^50 ... alas the AVX-512 VRNDSCALEPD instruction only supports
										// round-to-nearest-multiple-of-negative-power-of-2, and said power is further restricted to pow < 16.
		static vec_dbl *sc_arr = 0x0;
		static double *sc_ptr;
		double *tmp, *dptr1,*dptr2,*dptr3,*dptr4, l2lo,l2hi, dblo,dbhi, sqr100lo[4],sqr100hi[4], dtmp,cy_max;
		static double *ax,*bx,*cx,*dx, *ay,*by,*cy,*dy, *alo,*blo,*clo,*dlo, *ahi,*bhi,*chi,*dhi, *acy,*alo_norm,*ahi_norm;
		uint64 itmp64, iax,ibx,icx,idx, iay,iby,icy,idy, ialo,iblo,iclo,idlo, iahi,ibhi,ichi,idhi;
		const double prod1_adj = 3.0;	// Const to multiply by base and add to prod[1] to ensure latter >= 0
		if(!sc_arr) {
			sc_arr = ALLOC_VEC_DBL(sc_arr, 8);
			if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
			sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr);
			ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
			/* Remember, rhese are POINTERS-TO-DOUBLES, so need an increment of 4 to span an AVX register: */
			tmp = (double *)sc_ptr;
			ax  = tmp + 0;	bx  = tmp + 1;	cx  = tmp + 2;	dx  = tmp + 3;	tmp += 4;
			ay  = tmp + 0;	by  = tmp + 1;	cy  = tmp + 2;	dy  = tmp + 3;	tmp += 4;
			alo = tmp + 0;	blo = tmp + 1;	clo = tmp + 2;	dlo = tmp + 3;	tmp += 4;
			ahi = tmp + 0;	bhi = tmp + 1;	chi = tmp + 2;	dhi = tmp + 3;	tmp += 4;
			acy = tmp + 0;	tmp += 4;
			alo_norm = tmp + 0;	tmp += 4;
			ahi_norm = tmp + 0;	tmp += 4;
		}

		// Assumes rng_isaac_init() has already been called on entry
		pow2_dmult = TWO48FLOAT;	// This must match the loop-starting value of 2^pow2:
		pow2_imult = TWO48FLINV;
		dptr1 = &pow2_dmult;	dptr2 = &pow2_imult;
		dptr3 = &crnd50;		dptr4 = &prod1_adj;
		for(pow2 = 48; pow2 < 54; ++pow2) {
			// Only makes sense to test up the #bits in an IEEE-double mantissa: Any larger and we start losing
			// LSBs (I.e. the test may 'succeed' for pow2 > 53, but is only testing the equivalent of pow2 = 53):
			ASSERT(HERE, pow2 < 54, "No point testing > 53-bit inputs due to loss of LSBs!");
			printf("Testing fma_dmult for %d bits, dmult = %f:\n",pow2,pow2_dmult);
			l2lo = l2hi = cy_max = 0.;	// Init log2-range-bounds-storing vars
			for(j = 0; j < 4; j++) {
				sqr100lo[j] = sqr100hi[j] = 0;
			}

			for(i = 0; i < 100000000; i++)
			{
				// Input multiplicands in [-2^pow2, +2^pow2]:
				*ax = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );	iax = ABS(*ax);	// Will use positive-signed version of
				*bx = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );	ibx = ABS(*bx);	// float-double FMA-pair result to ease
				*cx = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );	icx = ABS(*cx);	// comparison with exact integer result
				*dx = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );	idx = ABS(*dx);	// computed via unsigned MUL_LOHI64.
				*ay = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );	iay = ABS(*ay);
				*by = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );	iby = ABS(*by);
				*cy = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );	icy = ABS(*cy);
				*dy = DNINT( rng_isaac_rand_double_norm_pm1() * pow2_dmult );	idy = ABS(*dy);
				__asm__ volatile (\
					/* Use ymm[< 16] for broadcast-from-scalar consts to avoid illegal-instruction exceptions (AVX-512F supports b'cast-to-full-width-zmm) */\
					"movq	%[__base],%%rax		\n\t	vbroadcastsd	(%%rax),%%ymm15	\n\t"/* BASE */\
					"movq	%[__binv],%%rbx		\n\t	vbroadcastsd	(%%rbx),%%ymm14	\n\t"/* BINV */\
					"movq	%[__ax] ,%%rax	\n\t"\
					"movq	%[__ay] ,%%rbx	\n\t"\
					"vmovaps	(%%rax),%%ymm2	\n\t"\
					"vmovaps	(%%rbx),%%ymm3	\n\t"\
					"     vmulpd	%%ymm2,%%ymm3,%%ymm1	\n\t"/* hi = fma(a,b, 0  ) - use VMULPD instead of VFMADD231PD, since addend = 0. */
					"vmovaps	%%ymm1,%%ymm0				\n\t"/* cpy hi into lo-reg */\
					"vfmsub231pd	%%ymm2,%%ymm3,%%ymm0	\n\t"/* lo = fma(a,b, -hi) */
					"movq	%[__alo],%%rax	\n\t"\
					"movq	%[__ahi],%%rbx	\n\t"\
					"vmovaps	%%ymm0,(%%rax)	\n\t"/* lo */\
					"vmovaps	%%ymm1,(%%rbx)	\n\t"/* hi */\
					"movq	%[__alo_norm],%%rax	\n\t"\
					"movq	%[__ahi_norm],%%rbx	\n\t"\
					"movq	%[__acy],%%rcx	\n\t"\
					"     vmulpd	%%ymm1,%%ymm14,%%ymm2	\n\t"/* tmp = hi*BINV */\
					"vroundpd	$0,%%ymm2,%%ymm2			\n\t"/* hh = DNINT(hi*BINV) */\
					"vmovaps	%%ymm2,%%ymm3				\n\t"/* cpy hh into cy-reg */\
					"vfnmadd213pd	%%ymm1,%%ymm15,%%ymm3	\n\t"/* cy = FMA(hh,-BASE, hi)	hi - hh*BASE = 'backward carry' from hi into lo, needed for proper base-normalization */\
					"vaddpd			%%ymm3,%%ymm0,%%ymm1	\n\t"/* lo += cy */\
					"vmovaps	%%ymm1,(%%rax)	\n\t"/* lo, base-normalized */\
					"vmovaps	%%ymm2,(%%rbx)	\n\t"/* hh = hi, base-normalized */\
					"vmovaps	%%ymm3,(%%rcx)	\n\t"/* cy */\
					:					/* outputs: none */\
					: [__ax] "m" (ax)	/* All inputs from memory addresses here */\
					 ,[__ay] "m" (ay)\
					 ,[__alo] "m" (alo)\
					 ,[__ahi] "m" (ahi)\
					 ,[__acy] "m" (acy)\
					 ,[__alo_norm] "m" (alo_norm)\
					 ,[__ahi_norm] "m" (ahi_norm)\
					 ,[__base] "m" (dptr1)\
					 ,[__binv] "m" (dptr2)\
					: "cc","memory","rax","rbx","rcx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm14","xmm15"		/* Clobbered registers */\
				);
			//	printf("i = %u: x = %1.0f; %1.0f; lo = %1.0f; hi = %1.0f\n",i,*ax,*ay,*alo,*ahi);
				for(tmp = acy; tmp < acy+4; tmp++) {
					dtmp = *tmp;	// preserve signs
					if(fabs(dtmp) > fabs(cy_max)) cy_max = dtmp;
				}
				// Update log2-range-bounds-storing vars:
				for(tmp = alo; tmp < ahi; tmp++) {
					dblo = fabs(*tmp); dbhi = fabs(*(tmp+4));
					// Odds of a 0 operand are slim, but if() around it anyway:
					if(dblo) { dblo = log(dblo)*ILG2;	if(dblo > l2lo) l2lo = dblo; }
					if(dbhi) { dbhi = log(dbhi)*ILG2;	if(dbhi > l2hi) l2hi = dbhi; }
				}
			  #ifdef MUL_LOHI64_SUBROUTINE
				MUL_LOHI64(iax,iay,&ialo,&iahi);
				MUL_LOHI64(ibx,iby,&iblo,&ibhi);
				MUL_LOHI64(icx,icy,&iclo,&ichi);
				MUL_LOHI64(idx,idy,&idlo,&idhi);
			  #else
				MUL_LOHI64(iax,iay, ialo, iahi);
				MUL_LOHI64(ibx,iby, iblo, ibhi);
				MUL_LOHI64(icx,icy, iclo, ichi);
				MUL_LOHI64(idx,idy, idlo, idhi);
			  #endif
			  /*
				if(pow2 == 53 && i < 100) {
					printf("I = %d: ax = %llu ay = %llu ahi,alo = %f,%f\n",i, *ax,*ay, *ahi,*alo);
					printf("I = %d: bx = %llu by = %llu bhi,blo = %f,%f\n",i, *bx,*by, *bhi,*blo);
					printf("I = %d: cx = %llu cy = %llu chi,clo = %f,%f\n",i, *cx,*cy, *chi,*clo);
					printf("I = %d: dx = %llu dy = %llu dhi,dlo = %f,%f\n",i, *dx,*dy, *dhi,*dlo);
				}
			  */
				if(cmp_fma_lohi_vs_exact(*ax,*ay,*ahi,*alo, iax,iay,iahi,ialo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, A-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); }
				if(cmp_fma_lohi_vs_exact(*bx,*by,*bhi,*blo, ibx,iby,ibhi,iblo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, B-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); }
				if(cmp_fma_lohi_vs_exact(*cx,*cy,*chi,*clo, icx,icy,ichi,iclo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, C-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); }
				if(cmp_fma_lohi_vs_exact(*dx,*dy,*dhi,*dlo, idx,idy,idhi,idlo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, D-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); }
			  #if 0
				#error to-do!
				double r1,r2, lo,hi;
				r1 = rng_isaac_rand_double_norm_pm1() * pow2_dmult;	// in [-2^50, +2^50]
				r2 = rng_isaac_rand_double_norm_pm1() * pow2_dmult;	// in [-2^50, +2^50]
				mul50x50_debug(r1,r2, &lo,&hi);
				printf("mul50x50_: a,b = %llu, %llu\n",*(uint64*)&r1,*(uint64*)&r2);
				printf("mul50x50_: lo = %16llu\n",*(uint64*)alo);
				printf("mul50x50_: hi = %16llu\n",*(uint64*)ahi);
			  #endif

			/******************** experimental code: Try squaring [lo,hi] (in ymm1,2), sans intermediate base-normalizations: *******************/
				__asm__ volatile (\
					/* Use ymm[< 16] for broadcast-from-scalar consts to avoid illegal-instruction exceptions (AVX-512F supports b'cast-to-full-width-zmm) */\
					"movq	%[__base],%%rax		\n\t	vbroadcastsd	(%%rax),%%ymm15	\n\t"/* BASE */\
					"movq	%[__binv],%%rbx		\n\t	vbroadcastsd	(%%rbx),%%ymm14	\n\t"/* BINV */\
					"movq	%[__crnd50],%%rcx	\n\t	vbroadcastsd	(%%rcx),%%ymm13	\n\t"/* CRND*2^50 */\
					"movq	%[__prod1_adj],%%rdx\n\t	vbroadcastsd	(%%rdx),%%ymm12	\n\t"/* Const to multiply by base and add to prod[1] to ensure latter >= 0 */\
					"movq	%[__ax] ,%%rax	\n\t	vmovaps	(%%rax),%%ymm1			\n\t"/* x.lo */\
					"movq	%[__ay] ,%%rbx	\n\t"/* x.hi */\
					"movq	%[__alo],%%rcx	\n\t"\
					"vaddpd			%%ymm1,%%ymm1,%%ymm2	\n\t"/* 2*lo */\
				/* lo*lo: */
					"     vmulpd	%%ymm1,%%ymm1,%%ymm3	\n\t"/* hi = fma(a,b, 0  ) - use VMULPD instead of VFMADD231PD, since addend = 0. */
					"vmovaps	%%ymm3,%%ymm0				\n\t"/* cpy hi into lo-reg */\
					"vfmsub231pd	%%ymm1,%%ymm1,%%ymm0	\n\t"/* lo = fma(a,b, -hi) */
					"vaddpd			%%ymm13,%%ymm3,%%ymm1	\n\t"\
					"vsubpd			%%ymm13,%%ymm1,%%ymm1	\n\t"/* hh = hi +- crnd50 to round-to-nearest-multiple-of-2^50 */\
					"vsubpd			%%ymm1,%%ymm3,%%ymm3	\n\t"/* hi - hh gives backward-carry... */\
					"vaddpd			%%ymm3,%%ymm0,%%ymm0	\n\t"/* ...which we add to lo (prod[0]). */\
					"vmovaps	%%ymm0,    (%%rcx)	\n\t"/* write prod[0] */\
														/*** prod[0] in ymm0, hi*2^50 in ymm1. ***/\
				/* 2*lo*hi: */
					"     vmulpd	(%%rbx),%%ymm2,%%ymm3	\n\t"/* hi = fma(a,b, 0  ) - use VMULPD instead of VFMADD231PD, since addend = 0. */
					"vmovaps	%%ymm3,%%ymm0				\n\t"/* cpy hi into lo-reg */\
					"vfmsub231pd	(%%rbx),%%ymm2,%%ymm0	\n\t"/* lo = fma(a,b, -hi) */
					"vaddpd			%%ymm13,%%ymm3,%%ymm2	\n\t"\
					"vsubpd			%%ymm13,%%ymm2,%%ymm2	\n\t"/* hh = hi +- crnd50 to round-to-nearest-multiple-of-2^50 */\
					"vfmadd231pd	%%ymm14,%%ymm1,%%ymm0	\n\t"/* Add lo to hi-output of previous lo:hi, pair, which also needs *= binv */
					"vsubpd			%%ymm2,%%ymm3,%%ymm3	\n\t"/* hi - hh gives backward-carry... */\
					"vaddpd			%%ymm3,%%ymm0,%%ymm1	\n\t"/* ...which we add to lo (prod[1]). */\
				"vfmadd231pd	%%ymm12,%%ymm15,%%ymm1	\n\t"/* Add const*base and add to prod[1] to ensure latter >= 0 */
				"vfnmadd231pd	%%ymm12,%%ymm15,%%ymm2	\n\t"/* Must sub same const from prod[2] by way of a carry - but note hh still scaled *= base. */\
					"vmovaps	%%ymm1,0x20(%%rcx)	\n\t"/* write prod[1] */\
														/*** prod[1] in ymm1, hi*2^50 in ymm2. ***/\
				/* hi*hi: */
					"vmovaps	(%%rbx),%%ymm3				\n\t"/* reload hi into ymm3 */\
					"     vmulpd	%%ymm3,%%ymm3,%%ymm0	\n\t"/* hi = fma(a,b, 0  ) - use VMULPD instead of VFMADD231PD, since addend = 0. */
					"vmovaps	%%ymm0,%%ymm1				\n\t"/* cpy hi into lo-reg */\
					"vfmsub231pd	%%ymm3,%%ymm3,%%ymm1	\n\t"/* lo = fma(a,b, -hi) */
					"vaddpd			%%ymm13,%%ymm0,%%ymm3	\n\t"\
					"vsubpd			%%ymm13,%%ymm3,%%ymm3	\n\t"/* hh = hi +- crnd50 to round-to-nearest-multiple-of-2^50 */\
					"vfmadd231pd	%%ymm14,%%ymm2,%%ymm1	\n\t"/* Add lo to hi-output of previous lo:hi, pair, which also needs *= binv */
					"vsubpd			%%ymm3,%%ymm0,%%ymm0	\n\t"/* hi - hh gives backward-carry... */\
					"vaddpd			%%ymm0,%%ymm1,%%ymm2	\n\t"/* ...which we add to lo (prod[2]). */\
					"     vmulpd	%%ymm3,%%ymm14,%%ymm3	\n\t"/* prod[3] = hh*binv */\
					"vmovaps	%%ymm2,0x40(%%rcx)	\n\t"/* write prod[2] */\
					"vmovaps	%%ymm3,0x60(%%rcx)	\n\t"/* write prod[3] */\
														/*** prod[2,3] in ymm2,3. ***/\
					:					/* outputs: none */\
					: [__ax] "m" (ax)	/* All inputs from memory addresses here */\
					 ,[__ay] "m" (ay)\
					 ,[__alo] "m" (alo)\
					 ,[__base] "m" (dptr1)\
					 ,[__binv] "m" (dptr2)\
					 ,[__crnd50] "m" (dptr3)\
					 ,[__prod1_adj] "m" (dptr4)\
					: "cc","memory","rax","rbx","rcx","xmm0","xmm1","xmm2","xmm3","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
				);
			//	printf("i = %u: x0 = %1.0f; x1 = %1.0f; p0-3 = %1.0f,%1.0f,%1.0f,%1.0f\n",i,*ax,*ay,*alo,*ahi);
				// Update log2-range-bounds-storing vars:
				tmp = alo;
				for(j = 0; j < 4; j++) {
					for(k = 0; k < 4; k++,tmp++) {
						dtmp = *tmp;
						if(dtmp < sqr100lo[j]) { sqr100lo[j] = dtmp; }
						if(dtmp > sqr100hi[j]) { sqr100hi[j] = dtmp; }
					}
				}

			}	// i-loop

			printf("\t%u |Outputs|.lo for %d x %d-bit fma_dmult have l2max = %10.7f:\n",i,pow2,pow2,l2lo);
			printf("\t%u |Outputs|.hi for %d x %d-bit fma_dmult have l2max = %10.7f:\n",i,pow2,pow2,l2hi);
			// Use 1.0f as format - .0 means no fractional part, and i/o routines will override the length-1 with actual length:
			if(cy_max > 0) {
				itmp64 = cy_max; itmp32 = trailz64(itmp64); itmp64 >>= itmp32;
				printf("\tcy_max = %1.0f =  %llu * 2^%u\n",cy_max,itmp64,itmp32);
			} else if(cy_max < 0) {
				itmp64 =-cy_max; itmp32 = trailz64(itmp64); itmp64 >>= itmp32;
				printf("\tcy_max = %1.0f = -%llu * 2^%u\n",cy_max,itmp64,itmp32);
			} else {
				printf("\tcy_max =  0\n");
			}

			printf("SQR outputs have the following ranges:\n");
			dtmp = pow2_imult;
			for(j = 0; j < 4; j++) {
				printf("\tprod[%u]: [%1.2f, %1.2f] = [%5.2f, %5.2f]*base\n",j,sqr100lo[j],sqr100hi[j],sqr100lo[j]*dtmp,sqr100hi[j]*dtmp);
			}
			printf("\n");

			pow2_dmult *= 2.0;
			pow2_imult *= 0.5;
		}	// pow2-loop
exit(0);
		return nerr;
	}

/* Generic version for non-CUDA-only FMA:

void	mul50x50_debug(double a, double b, double *lo, double *hi)
	{
		// Exact product a*b = lo + hi:
		*hi = fma(a,b, 0   );
		*lo = fma(a,b, -*hi);
	}
*/

	/*
	Example: FMA in-and-outputs are
	x = -163183843911180; y = 1039312675530994; hi,lo = -169599037418760590289388175360, 69115062440,
                          sum to yield exact result:  = -169599037418760590220273112920
                        now write in base-2^64 form:  = -(9193982241*2^64 + 802977670696261464) ,
	We then simply compare result inside () to MUL_LOHI64 result. In practice the assembly requires a bit more work,
	but the principle is simple.
	*/
	/*
	Compare the double-pair FMA product dx*dy = dhi + dlo against the exact unsigned-integer product [ihi:ilo]
	of the operand magnitudes ix = |dx|, iy = |dy|.

	The two doubles are converted to sign-and-magnitude form, their mantissas (with hidden bit restored) are shifted
	into place according to their exponent fields, and the lo term is added to or subtracted from the hi term
	depending on the relative signs of the two FMA outputs. The resulting 128-bit integer is compared with [ihi:ilo].

	Returns 0 if the results agree - or if either input is zero, in which case no comparison is done - and nonzero
	if they differ, after printing diagnostics.
	NOTE(review): the hi-term shift count nsh1 is passed straight to LSHIFT128; this presumes |dhi| >= 2^52,
	which holds for the ~2^48-and-up operands used by the caller - confirm before reusing with small inputs.
	*/
	int cmp_fma_lohi_vs_exact(double dx, double dy, double dhi, double dlo, uint64 ix, uint64 iy, uint64 ihi, uint64 ilo)
	{
		int retval;
		uint64 i64, e1,e0, m1,m0;
		uint32 s1,s0;
		const uint64 two52 = 0x0010000000000000ull, mmask = 0x000FFFFFFFFFFFFFull;
		uint128 exact;
		const char char_sgn[2] = {' ','-'};
		// Keep the as-passed FMA outputs for the diagnostic printout, since dhi,dlo get sign-flipped below:
		const double dhi_in = dhi, dlo_in = dlo;
		// Union-based access to the IEEE bit patterns, in place of pointer-cast type punning:
		union { double d; uint64 u; } bits_hi, bits_lo;
	//printf("I = %d: x = %f; y = %f; hi,lo = %f,%f\n",i, dx,dy, dhi,dlo);
		if(dx == 0. || dy == 0.)	// Comparison algo needs further tweaks to handle 0-result ... not worth coding time.
			return 0;
		s1 = (dhi < 0);	// Sign of product = sign of hi output
		s0 = (dlo < 0);
		if(s1) {	// If product < 0, negate both FMA outputs prior to comparing vs the (unsigned-int) MUL_LOHI64 result
			dhi = -dhi;
			if(dlo != 0.)	// Need to preserve sign of dlo, if = 0
				dlo = -dlo;
		}
		// Extract exp & mantissa fields of the double outputs and restore hidden bits:
		bits_hi.d = dhi;	bits_lo.d = dlo;
		i64 = bits_hi.u; e1 = (i64>>52)&0x7ff; m1 = (i64&mmask) + two52;
		i64 = bits_lo.u; e0 = (i64>>52)&0x7ff; m0 = (i64&mmask) +(two52 & (-(dlo != 0.)));	// No hidden bit if lo = 0
		int nsh1 = e1 - 0x433;	// Shift count of hi-double result = exp - 0x3ff - 52
		int nsh0 = e0 - 0x433;	// Shift count of lo-double result
		exact.d0 = m1; exact.d1 = 0;	LSHIFT128(exact,nsh1, exact);
		// Shifting a 64-bit int by >= 64 places is undefined in C, and dlo == 0 gives e0 = 0, i.e. nsh0 = -0x433,
		// so handle the everything-shifted-off case explicitly:
		if(nsh0 <= -64)
			m0 = 0;
		else if(nsh0 < 0)
			m0 >>= -nsh0;
		else if(nsh0 > 0)
			m0 <<=  nsh0;
		if(s1 ^ s0) {	// Whether to add or sub the lo term depends on the *relative* signs of hi,lo outputs
			i64 = exact.d0; exact.d0 -= m0; exact.d1 -= (exact.d0 > i64);	// borrow out of low word
		} else {
							exact.d0 += m0; exact.d1 += (exact.d0 < m0 );	// carry out of low word
		}
		retval = ( (ihi != exact.d1) || (ilo != exact.d0) );
		if(retval) {
			printf("In cmp_fma_lohi_vs_exact: FMA-double and pure-int DMUL results differ!\n");
			// Print the FMA outputs as passed in; re-deriving the lo sign from s0 after the negation above
			// gave the wrong sign whenever hi and lo had opposite signs:
			printf("dx = %f; dy = %f; hi,lo = %f,%f\n",dx,dy, dhi_in, dlo_in);
			printf("ix = %lld; iy = %lld; ihi,lo = %lld,%llu\n",ix,iy, ihi,ilo);
			printf("Unsigned FMA result: ihi = %llX; ilo = %llX\n",bits_hi.u,bits_lo.u);
			printf("nsh1,0 = %d,%d: ehi = %llu; elo = %llu [mlo = %c%llu]\n",nsh1,nsh0,exact.d1,exact.d0, char_sgn[s1 ^ s0],m0);
		}
		return retval;
	}

#endif

/*
Originally wrote this as part of a workaround for the DEC Unix V4.0 real*16 sincos bug,
which caused incorrect real*16 sincos results when the argument theta = m*pi/4 +- pi/512, m integer.
The more general purpose is to compare a pair of computed sin(theta), cos(theta) values
(either truncated-to-double extended-real ones, or fast-math-library doubles, or whatever)
to the results returned by calls to the standard math library sincos on the host system.
If a discrepancy between the input sincos data and the standard-library ones is detected
which exceeds some threshold (we use 1e-10 at present), print a warning and replace the
inputs with the latter. (If you instead suspect the latter of being the problematic ones,
you'll need to make the appropriate modifications).

Returns the largest absolute difference between real*8 and real*16 sin(theta) and cos(theta)
for the calling values, e.g. if the user wants to save the maximum discrepancy over a series of calls.
*/
#define	USE_DOUBLE_DEFAULT	0

double 	errprint_sincos(double *x, double *y, double theta)
{
	double tmp, adiff, maxdiff = 0.0;

	tmp = cos(theta);
	adiff = fabs(*x - tmp);
	if(adiff > maxdiff) maxdiff = adiff;
	if(adiff > 1e-10)
	{
		fprintf(stderr, "WARNING: real*16 sine error : theta = %20.15f, long double = %20.15f, double = %20.15f", theta,*x,tmp);
	#if USE_DOUBLE_DEFAULT
		fprintf(stderr, " ... using double-precision values instead.\n");
		*x = tmp;
	#else
		fprintf(stderr, "\n");
	#endif
	}

	tmp = sin(theta);
	adiff = fabs(*y - tmp);
	if(adiff > maxdiff) maxdiff = adiff;
	if(adiff > 1e-10)
	{
		fprintf(stderr, "WARNING: real*16 sine error : theta = %20.15f, long double = %20.15f, double = %20.15f", theta,*y,tmp);
	#if USE_DOUBLE_DEFAULT
		fprintf(stderr, " ... using double-precision values instead.\n");
		*y = tmp;
	#else
		fprintf(stderr, "\n");
	#endif
	}
	return adiff;
}

/****************/

/* Return bit-reversed complement of an [nbits] input integer i.
Since Mlucas isn't restricted to power-of-2 FFT lengths and further uses a forward-DIF/inverse-DIT
FFT to avoid the need for bit-reversal reordering of the data, we don't actually use this function
much except as an auxiliary utility.
*/
uint32 reverse(uint32 i, uint32 nbits)
{
	uint32 rev = 0, remaining;
	ASSERT(HERE,nbits <= 32,"ERROR: bitlength limit 32 exceeded in call to REVERSE.\n");
	// Peel bits off the low end of i and push them onto the low end of rev, nbits times:
	for(remaining = nbits; remaining != 0; --remaining) {
		rev = (rev << 1) | (i & 1);
		i >>= 1;
	}
	return rev;
}

/* 64-bit bytewise-lookup-based version of above: returns the bit-reversal of the low [nbits] bits of i,
right-justified. Input is passed by value and so is not overwritten, unless caller also uses it to store
the result. For a version which also works with multiword inputs, cf. mi64.c:brev64().
nbits must be <= 64; nbits = 0 returns 0.
*/
uint64 reverse64(uint64 i, uint32 nbits)
{
	uint64 rev = 0;
	uint32 sh;
	ASSERT(HERE,nbits <= 64,"ERROR: bitlength limit 64 exceeded in call to REVERSE64.\n");
	// A zero-length field has no bits to return; this also avoids an (undefined) shift by 64 below:
	if(nbits == 0)
		return 0;
	// Full 64-bit reversal: walk the input bytes from least to most significant, bit-reversing each via
	// the brev8[] table and shifting it in at the bottom, so the lowest input byte ends up on top.
	// Done with shifts rather than byte-pointer overlays, so no alignment or byte-order dependence:
	for(sh = 0; sh < 64; sh += 8) {
		rev = (rev << 8) | brev8[(uint8)(i >> sh)];
	}
	// Right-justify the reversed nbits-wide field:
	return rev >> (64-nbits);
}

/******* Bit-level utilities: ********/

// 32 and 64-bit leftward circular shift, shift count n assumed unsigned < #bits-in-type:
DEV uint32 cshft32(uint32 x, uint32 n)
{
	// n = 0 is special-cased because the complementary right-shift count would then equal the type width:
	return n ? ((x << n) | (x >> (32-n))) : x;
}

DEV uint64 cshft64(uint64 x, uint64 n)
{
	// n = 0 is special-cased because the complementary right-shift count would then equal the type width:
	return n ? ((x << n) | (x >> (64-n))) : x;
}

// 32 and 64-bit analogs of the F90 intrinsic ISHFT function:
DEV uint32 ishft32(uint32 x, int shift)
{
	// Shifting by the full width or more, in either direction, leaves nothing:
	if(shift >= 32 || shift <= -32)
		return 0;
	// Positive count = left shift, zero or negative count = right shift by |shift|:
	return (shift > 0) ? (x << shift) : (x >> (-shift));
}

// 64-bit ISHFT: shift > 0 shifts x left, shift < 0 shifts x right by |shift| places, shift = 0 returns x.
// Any |shift| >= 64 returns 0.
DEV uint64 ishft64(uint64 x, int shift)
{
	uint64 r;
	if(shift >= 64)	// Must include shift == 64: a 64-bit left-shift of a 64-bit int is undefined in C
		r  = 0ull;
	else if(shift > 0)
		r  = x << shift;
	else if(shift > -64)
		r  = x >> (-shift);
	else
		r  = 0ull;
	return r;
}

// Clears the [bit]th bit of the 32-bit-word array [arr], where bit 0 is the low bit of arr[0].
// No bounds checking is performed.
void bit_clr32(uint32*arr, uint32 bit)
{
#ifdef X32_ASM
	// Single BTR instruction with the bit index in a register and the array base as memory operand:
	__asm__ volatile (\
		"btrl  %1, %0		\n\t"\
		:	/* outputs: none */\
		:  "m" (*arr), "r" (bit)\
		: "cc","memory"	/* Clobbered registers */\
		);
#else
	// Portable version: word index = bit/32, position within that word = bit%32:
	uint32 n = bit>>5;
	uint32 mask = ~((uint32)1 << (bit&31));
	arr[n] &= mask;
#endif
}

// 4-way version of bit_clr32: clears bits [bit1],[bit2],[bit3],[bit4] of the 32-bit-word array [arr].
// No bounds checking is performed.
void bit_clr32_x4(uint32*arr, uint32 bit1, uint32 bit2, uint32 bit3, uint32 bit4)
{
#ifdef X32_ASM
	// Apply BTR directly to the array in memory, as bit_clr32 and bit_clr64_x4 do. The previously-active
	// variant copied arr[0] into a scratch register, cleared the bits there and never stored the result
	// back, so it left the array unmodified:
	__asm__ volatile (\
		"btrl  %1, %0		\n\t"\
		"btrl  %2, %0		\n\t"\
		"btrl  %3, %0		\n\t"\
		"btrl  %4, %0		\n\t"\
		:	/* outputs: none */\
		:  "m" (*arr), "r" (bit1), "r" (bit2), "r" (bit3), "r" (bit4)\
		: "cc","memory"	/* Clobbered registers */\
		);
#else
	// Portable version: word index = bit/32, position within that word = bit%32:
	uint32 n1 = bit1>>5, n2 = bit2>>5, n3 = bit3>>5, n4 = bit4>>5;
	uint32 mask1 = ~((uint32)1 << (bit1&31)),mask2 = ~((uint32)1 << (bit2&31)), mask3 = ~((uint32)1 << (bit3&31)),mask4 = ~((uint32)1 << (bit4&31));
	arr[n1] &= mask1;
	arr[n2] &= mask2;
	arr[n3] &= mask3;
	arr[n4] &= mask4;
#endif
}

// Clears the [bit]th bit of the 64-bit-word array [arr], where bit 0 is the low bit of arr[0].
// No bounds checking is performed.
void bit_clr64(uint64*arr, uint64 bit)
{
#ifdef X64_ASM
	// Single BTR instruction with the bit index in a register and the array base as memory operand:
	__asm__ volatile (\
		"btrq  %0, %1		\n\t"\
		:	/* outputs: none */\
		: "r" (bit), "m" (*arr)\
		: "cc","memory"	/* Clobbered registers */\
		);
#else
	// Portable version: word index = bit/64, position within that word = bit%64:
	uint64 n = bit>>6;
	uint64 mask = ~((uint64)1 << (bit&63));
	arr[n] &= mask;
#endif
}

// 4-way version of bit_clr64: clears bits [bit1],[bit2],[bit3],[bit4] of the 64-bit-word array [arr].
// No bounds checking is performed.
void bit_clr64_x4(uint64*arr, uint64 bit1, uint64 bit2, uint64 bit3, uint64 bit4)
{
#ifdef X64_ASM
	// Four BTR instructions, each with its bit index in a register and the array base as memory operand:
	__asm__ volatile (\
		"btrq  %1, %0		\n\t"\
		"btrq  %2, %0		\n\t"\
		"btrq  %3, %0		\n\t"\
		"btrq  %4, %0		\n\t"\
		:	/* outputs: none */\
		:  "m" (*arr), "r" (bit1), "r" (bit2), "r" (bit3), "r" (bit4)\
		: "cc","memory"	/* Clobbered registers */\
		);
#else
	// Portable version: word index = bit/64, position within that word = bit%64:
	uint64 n1 = bit1>>6, n2 = bit2>>6, n3 = bit3>>6, n4 = bit4>>6;
	uint64 mask1 = ~((uint64)1 << (bit1&63)),mask2 = ~((uint64)1 << (bit2&63)), mask3 = ~((uint64)1 << (bit3&63)),mask4 = ~((uint64)1 << (bit4&63));
	arr[n1] &= mask1;
	arr[n2] &= mask2;
	arr[n3] &= mask3;
	arr[n4] &= mask4;
#endif
}

// Returns the number of set bits in 32-bit int x, by summing the pop8[] table entries of its 4 bytes.
// Only tiny speedup on Haswell from using 32-bit POPCNT, so no ASM here:
DEV uint32 popcount32(uint32 x)
{
	const uint8 *bytes = (const uint8*)&x;
	uint32 total = pop8[bytes[0]];
	total += pop8[bytes[1]];
	total += pop8[bytes[2]];
	total += pop8[bytes[3]];
	return total;
}

// Returns the number of set bits in 64-bit int x - ~30% speedup on Haswell from using 64-bit POPCNT:
DEV uint32 popcount64(uint64 x)
{
// POPCNT has its own CPUID feature bit, separate from SSE4.2 alongside which Intel introduced it;
// rather than test for that, only use the instruction in AVX-and-beyond builds:
#ifdef USE_AVX
	uint64 i64 = 0;
	__asm__ volatile ("popcntq %1,%0" : "=r" (i64) : "r" (x));
	return (uint32)i64;
#else
	// Table-lookup version, one pop8[] entry per byte of x. Kept as a fully unrolled sum, since
	// (May 2018) that gave a nice speedup over the equivalent 8-pass loop:
	const uint8 *bytes = (const uint8*)&x;
	uint32 lo_half = pop8[bytes[0]] + pop8[bytes[1]] + pop8[bytes[2]] + pop8[bytes[3]];
	uint32 hi_half = pop8[bytes[4]] + pop8[bytes[5]] + pop8[bytes[6]] + pop8[bytes[7]];
	return lo_half + hi_half;
#endif
}

// Return the bit position [0:31] of the [bit]th set bit (bit = 1:32, counting up from the low end) of
// 32-bit int x. Returns -1 if x has fewer than [bit] set bits, or if bit == 0 (i.e. user requests the
// position of the nonexistent "0th set bit"):
int ith_set_bit32(uint32 x, uint32 bit)
{
	uint8 curr_byte;
	int byte_pop, shift, j, k, pop_below = 0;
	if(!x || !bit) return -1;
	ASSERT(HERE, bit <= 32, "[bit]th-bit specifier out of range!");
	// Walk the bytes from low to high; pop_below holds the popcount of all bytes below the current one:
	for(shift = 0; shift < 32; shift += 8) {
		curr_byte = (uint8)(x >> shift);
		byte_pop = pop8[curr_byte];
		// Once the running popcount reaches [bit], the sought bit lies within the current byte:
		if(pop_below + byte_pop >= bit) {
			// ith_set_bit8[] packs the set-bit positions of a byte as hex digits, first set bit in digit 0,
			// hence the -1 in converting the unit-offset in-byte bit count to a digit index:
			k = (bit-pop_below-1)*4;
			j = (ith_set_bit8[curr_byte] >> k) & 0xf;
			return shift+j;
		}
		pop_below += byte_pop;
	}
	return -1;
}

// 64-bit version of ith_set_bit32: returns the bit position [0:63] of the [bit]th set bit of x, or -1
// if there is no such bit. Remember, bit-index in arglist is *unit* offset, i.e. must be in [1:64]!
int ith_set_bit64(uint64 x, uint32 bit)
{
	uint8 curr_byte;
	int byte_pop, shift, j, k, pop_below = 0;
	if(!x || !bit) return -1;
	ASSERT(HERE, bit <= 64, "[bit]th-bit specifier out of range!");
	// Walk the bytes from low to high; pop_below holds the popcount of all bytes below the current one:
	for(shift = 0; shift < 64; shift += 8) {
		curr_byte = (uint8)(x >> shift);
		byte_pop = pop8[curr_byte];
		// Once the running popcount reaches [bit], the sought bit lies within the current byte:
		if(pop_below + byte_pop >= bit) {
			// Look the in-byte position up in ith_set_bit8[] rather than stepping through the byte one bit
			// at a time - on Core2 the table version cut 40% off the function runtime.
			// The table packs set-bit positions as hex digits, first set bit in digit 0, hence the -1:
			k = (bit-pop_below-1)*4;
			j = (ith_set_bit8[curr_byte] >> k) & 0xf;
			return shift+j;
		}
		pop_below += byte_pop;
	}
	return -1;
}

/*** leading and trailing-zero-counting algorithms: ***/
// Returns the number of trailing (rightmost) zero bits of x, accumulated bytewise via the tz8[] table.
DEV uint32 trailz32(uint32 x)
{
	uint8 curr_byte;
	uint32 shift, count = 0;
	// Visit the bytes from least to most significant, stopping at the first nonzero one. Extracting each
	// byte by shifting gives the same byte order on big- and little-endian hosts, so no endian #ifdef needed:
	for(shift = 0; shift < 32; shift += 8) {
		curr_byte = (uint8)(x >> shift);
		count += tz8[curr_byte];
		if(curr_byte) break;
	}
	return count;
}

// Returns the number of trailing (rightmost) zero bits of x; returns 64 for x = 0 in the asm build.
DEV uint32 trailz64(uint64 x)
{
#ifdef X64_ASM
	uint64 bpos;
	if(x == 0) return 64;	// BSF result is undefined for a zero source, so special-case it
	// BSF yields the index of the lowest set bit = number of trailing zeros. The result is declared as a
	// proper output operand, rather than being stored through an operand declared as an input:
	__asm__ volatile ("bsfq %1,%0" : "=r" (bpos) : "rm" (x) : "cc");
	return (uint32)bpos;
#else
	// Table-lookup version: sum tz8[] over the bytes from least to most significant, stopping at the
	// first nonzero byte:
	uint8 *byte_arr = (uint8*)&x, curr_byte;
	uint32 i,retval = 0;
  #ifdef USE_BIG_ENDIAN
	for(i = 0; i < 8; i++) {
		curr_byte = byte_arr[7-i];
		retval += tz8[curr_byte];
		if(curr_byte) break;
	}
  #else
	for(i = 0; i < 8; i++) {
		curr_byte = byte_arr[i];
		retval += tz8[curr_byte];
		if(curr_byte) break;
	}
  #endif
	return retval;
#endif
}

/***************/
// Return number of leading (leftmost) zeros of input; returns 32 for x = 0 in the asm build:
DEV uint32 leadz32(uint32 x)
{
#ifdef X32_ASM
	uint32 bpos;
	if(x == 0) return 32;	// BSR result is undefined for a zero source, so special-case it
	// The result is declared as a proper output operand, rather than being stored through an operand
	// declared as an input:
	__asm__ volatile ("bsrl %1,%0" : "=r" (bpos) : "rm" (x) : "cc");
	return 31 - bpos;	// BSR returns *index* of leftmost set bit, must subtract from (#bits - 1) to get #lz.
#else
	// Table-lookup version: sum lz8[] over the bytes from most to least significant, stopping at the
	// first nonzero byte:
	uint8 *byte_arr = (uint8*)&x, curr_byte;
	uint32 i,retval = 0;
  #ifdef USE_BIG_ENDIAN
	for(i = 0; i < 4; i++) {
		curr_byte = byte_arr[i];
		retval += lz8[curr_byte];
		if(curr_byte) break;
	}
  #else
	for(i = 0; i < 4; i++) {
		curr_byte = byte_arr[3-i];
		retval += lz8[curr_byte];
		if(curr_byte) break;
	}
  #endif
	return retval;
#endif
}

// 64-bit version of leadz32: return number of leading (leftmost) zeros of input; returns 64 for x = 0
// in the asm build:
DEV uint32 leadz64(uint64 x)
{
#ifdef X64_ASM
	uint64 bpos;
	if(x == 0) return 64;	// BSR result is undefined for a zero source, so special-case it
	// The result is declared as a proper output operand, rather than being stored through an operand
	// declared as an input:
	__asm__ volatile ("bsrq %1,%0" : "=r" (bpos) : "rm" (x) : "cc");
	return (uint32)(63 - bpos);	// BSR returns *index* of leftmost set bit, must subtract from (#bits - 1) to get #lz.
#else
	// Table-lookup version: sum lz8[] over the bytes from most to least significant, stopping at the
	// first nonzero byte:
	uint8 *byte_arr = (uint8*)&x, curr_byte;
	uint32 i,retval = 0;
  #ifdef USE_BIG_ENDIAN
	for(i = 0; i < 8; i++) {
		curr_byte = byte_arr[i];
		retval += lz8[curr_byte];
		if(curr_byte) break;
	}
  #else
	for(i = 0; i < 8; i++) {
		curr_byte = byte_arr[7-i];
		retval += lz8[curr_byte];
		if(curr_byte) break;
	}
  #endif
	return retval;
#endif
}

// Leading-zero count of a 128-bit int: that of the high word d1 if nonzero, else 64 + that of the low word d0.
DEV uint32	leadz128(uint128 i)
{
	return i.d1 ? leadz64(i.d1) : (leadz64(i.d0) + 64);
}

// Leading-zero count of a 192-bit int: scan the words from high (d2) to low (d0), adding 64 for each
// all-zero word skipped.
DEV uint32	leadz192(uint192 i)
{
	if(i.d2) return leadz64(i.d2);
	if(i.d1) return leadz64(i.d1) + 64;
	return leadz64(i.d0) + 128;
}

// Leading-zero count of a 256-bit int: scan the words from high (d3) to low (d0), adding 64 for each
// all-zero word skipped.
DEV uint32	leadz256(uint256 i)
{
	if(i.d3) return leadz64(i.d3);
	if(i.d2) return leadz64(i.d2) + 64;
	if(i.d1) return leadz64(i.d1) + 128;
	return leadz64(i.d0) + 192;
}

/***************/

/* If the input == 2^p, returns 1; otherwise returns 0.
Note: We do not consider 0 as being a power of 2, since there is no number x such that 2^x = 0.
Algo: for unsigned twos-comp int n,
  n&(n-1)  ('is n&(n-1) nonzero?' in boolean terms) is only 0 (false) for n a power-of-2, with exception of n = 0.
Conversely,
!(n&(n-1)) ('is n&(n-1) zero?' in boolean terms) is only !0 (true) for n a power-of-2, with exception of n = 0,
which latter instance is easily special-cased, via n && !(n&(n-1)).

On hardware with a fast pop-count instruction can also consider using (popcount == 1)?, but not worth the tiny cycle savings.
*/
DEV uint32 isPow2(uint32 i32)
{
	// 0 is not a power of 2, but would pass the n&(n-1) test, so weed it out first:
	if(i32 == 0)
		return 0;
	// Clearing the lowest set bit leaves 0 iff exactly one bit was set:
	return (i32 & (i32-1)) == 0;
}

DEV uint64 isPow2_64(uint64 i64)
{
	// 0 is not a power of 2, but would pass the n&(n-1) test, so weed it out first:
	if(i64 == 0)
		return 0;
	// Clearing the lowest set bit leaves 0 iff exactly one bit was set:
	return (i64 & (i64-1)) == 0;
}

/***************/

/* If the input == 4^p, returns 1; otherwise returns 0.
A power of 4 is a power of 2 whose lone set bit sits at an even bit position, i.e. under the 0x55...5 mask. */
DEV uint32 isPow4(uint32 i32)
{
	if(!isPow2(i32))
		return 0;
	return (i32 & 0x55555555) != 0;
}

// 64-bit version of isPow4: a power of 2 whose lone set bit sits at an even bit position.
DEV uint64 isPow4_64(uint64 i64)
{
	if(!isPow2_64(i64))
		return 0;
	return (i64 & 0x5555555555555555ull) != 0;
}

/***************/
// Return bitlength of input, based on position of leftmost set bit:
DEV uint32 nbits32(uint32 i)
{
	const uint32 lz = leadz32(i);	// Bitlength = type width less the number of leading zeros
	return 32-lz;
}
DEV uint64 nbits64(uint64 i)
{
	const uint32 lz = leadz64(i);	// Bitlength = type width less the number of leading zeros
	return 64-lz;
}

/***************/
// Extract (nbits) bits beginning at position (beg), returned right-justified.
// A field running off the top of the word (beg + nbits > 32) is zero-padded; beg >= 32 or nbits = 0 returns 0.
DEV uint32 ibits32(uint32 i, uint32 beg, uint32 nbits)
{
	const uint32 ones_mask = 0xFFFFFFFF;
	uint32 ib;
	// Shifts by >= the type width are undefined in C, so handle the boundary cases explicitly:
	if(beg >= 32 || nbits == 0)
		return 0;
	ib = i >> beg;
	if(nbits < 32)	// nbits >= 32 means keep everything at and above bit [beg]
		ib &= ~(ones_mask << nbits);
	return ib;
}

// 64-bit version of ibits32: extract (nbits) bits beginning at position (beg), returned right-justified.
// A field running off the top of the word (beg + nbits > 64) is zero-padded; beg >= 64 or nbits = 0 returns 0.
DEV uint64 ibits64(uint64 i, uint32 beg, uint32 nbits)
{
	const uint64 ones_mask = 0xFFFFFFFFFFFFFFFFull;
	uint64 ib;
	// Shifts by >= the type width are undefined in C, so handle the boundary cases explicitly:
	if(beg >= 64 || nbits == 0)
		return 0;
	ib = i >> beg;
	if(nbits < 64)	// nbits >= 64 means keep everything at and above bit [beg]
		ib &= ~(ones_mask << nbits);
	return ( ib );
}

/***************/

/* Return (nbits) bits of a 64-bit integer x, starting at bit
(src_bit_start) in a target 64-bit integer y (the return value), starting at bit (tgt_bit_start).
Entire bit-copy range must lie within bits <0:63> of source operand; any bits which
'overhang' the end of the destination operand are discarded.
If bit-index parameters are illegal, asserts.
*/
DEV uint64	getbits64(uint64 x, uint32 src_bit_start, uint32 nbits, uint32 tgt_bit_start)
{
	uint64 field;
	ASSERT(HERE, (nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!");
	// Empty field: nothing to copy (and the mask-generating shift below would be by the full 64 bits):
	if(!nbits)
		return 0;
	// Right-justify the source field and mask off everything above its top bit...
	field = (x >> src_bit_start) & (0xFFFFFFFFFFFFFFFFull >> (64-nbits));
	// ...then move it to the target position; any bits pushed past bit 63 are discarded:
	return field << tgt_bit_start;
}

/* Alternate version of getbits64, here splicing the requested bit into an argument, leaving the surrounding bits unchanged.
The syntax of this version mirrors that of the Fortran-90 MVBITS library function.
*/
DEV void	mvbits64(uint64 x, uint32 src_bit_start, uint32 nbits, uint64*y, uint32 tgt_bit_start)
{
	uint64 field_mask, field;
	ASSERT(HERE, (nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!");
	// Empty field: leave the target untouched
	if(!nbits)
		return;
	field_mask = 0xFFFFFFFFFFFFFFFFull >> (64-nbits);
	// Right-justified source field, moved to its slot in the target:
	field = ((x >> src_bit_start) & field_mask) << tgt_bit_start;
	// Clear the slot in the target and drop the field in, leaving the surrounding bits as they were:
	*y = (*y & ~(field_mask << tgt_bit_start)) | field;
}

/***************/

/* Fermat probable-prime test of p to the given base: returns 1 if base^(p-1) == 1 (mod p), 0 otherwise.
p <= 2 is special-cased (only p = 2 returns 1), and base 2 is handed off to twompmodq32(). */
DEV int pprimeF(uint32 p, uint32 base)
{
	/* 64-bit accumulators, so that products of two 32-bit residues cannot overflow: */
	uint64 result = 1, power, expo;
	if(p <= 2)
		return(p == 2);
	if(base == 2)
		return(twompmodq32(p-1,p) == 1);
	power = base;
	// Compute base^(p-1) (mod p) via right-to-left modular binary powering:
	for(expo = p-1; expo != 0; expo >>= 1) {
		if(expo & 1)
			result = (result*power)%p;
		power = (power*power)%p;
		// Once the running square hits 0 (mod p), bail out with a 'not a probable prime' result:
		if(!power) return 0;
	}
	return (result == 1) ? 1 : 0;
}

// Uses binary search to determine if a given integer n < 2^32 is listed in f2psp[], the table of
// Fermat-base-2 pseudoprimes not divisible by 3 or 5 (1 if yes, 0 if no).
// The optional arg idx_next_psp, if non-null, receives the index in the f2psp[]-array of the smallest table
// entry >= n - this includes the case where n itself is in the table. If n exceeds every table entry,
// the index of the last entry is stored instead.
#include "f2psp_3_5.h"	// f2psp[] is sorted table of the 9366 Fermat-base-2 pseudoprimes < 2^32 not divisible by 3 or 5
DEV uint32 is_f2psp(uint32 n, uint32*idx_next_psp) {
	uint32 lo,hi,mid;
	// Below the smallest pseudoprime, 341, the answer is trivially no and the next entry is the first one:
	if(n < 341) {
		if(idx_next_psp)
			*idx_next_psp = 0;
		return 0;
	}
	// Even n are never reported as pseudoprimes; only run the search for them if the caller wants the index:
	if(IS_EVEN(n) && !idx_next_psp)
		return 0;
	lo = 0; hi = 9366-1;
	while(lo < hi) {
		// if hi-lo odd (one even, one odd), mid on low side of 1/2-midpoint, i.e. for hi-lo = 1, mid = lo:
		mid = (lo + hi)>>1;
		if(f2psp[mid] < n) {	// n strictly in upper part of interval
			lo = mid+1;
		} else {	// n in lower part of interval, possibly at midpoint
			hi = mid;
		}
	}
	// On exit lo == hi; report the index on every path, hit or miss, so that callers which walk
	// the table from here (e.g. nprimes_in_range) start at the right entry:
	if(idx_next_psp)
		*idx_next_psp = hi;
	return (!IS_EVEN(n) && f2psp[hi] == n);
}

// Rigorous is-prime for n < 2^32: returns 1 if n is prime, 0 otherwise.
// Method: trial division by 2, 3 and 5, then a Fermat base-2 test, with the surviving base-2
// pseudoprimes weeded out by lookup in the f2psp[] table.
DEV uint32 is_prime(uint32 n) {
	if(n < 2)
		return 0;
	if(IS_EVEN(n))
		return(n == 2);
	if(n < 7)	// n = 3 or 5
		return 1;
	// The f2psp[] table only lists pseudoprimes NOT divisible by 3 or 5, so base-2 pseudoprimes such as
	// 561 = 3.11.17 and 645 = 3.5.43 would slip through the lookup; reject multiples of 3 and 5 up front:
	if(n%3 == 0 || n%5 == 0)
		return 0;
	return(pprimeF(n,2) && !is_f2psp(n,0x0));
}

// Get nearest prime to n in the specified search direction, dir = +1 (up) or -1 (down), not counting n itself.
// Algorithm is slow try-next-odd: each odd candidate is filtered by trial division by 3 and 5, a Fermat base-2
// PRP test, and a lookup in the table of base-2 pseudoprimes.
// Returns 0 if there is no such prime: searching down from n <= 2, or up past the largest prime < 2^32.
DEV uint32 next_prime(uint32 n, int dir) {
	// direction properly specified?
	ASSERT(HERE, ABS(dir) == 1,"next_prime(): Direction of search not properly specified, must = +1 (up) or -1 (down).");
	// Some special-casing for small n:
	if(n <= 3 && dir == -1) {
		return(2*(n == 3));
	}
	if(n <  2 && dir == 1) {	// Ensuing oddifying step handles n == 2
		return(2);
	}
	// startval of search must be odd; stepping an even n one place AGAINST the search direction
	// makes the first candidate below the first odd on the far side of n:
	if(!IS_ODD(n)) {
		n -= dir;
	}
	dir += dir;
	while(1) {
		// Upward search has run off the top of the 32-bit range: stop rather than wrap around to small n.
		if(dir > 0 && n > 0xFFFFFFFFu - 2) {
			break;
		}
		n += dir;
		// n is odd and >= 3 here. 3 and 5 are prime, but must be accepted before the divisibility filter below:
		if(n == 3 || n == 5) {
			return(n);
		}
		// The f2psp[] table omits pseudoprimes divisible by 3 or 5 (e.g. 561, 645), so such n must be excluded here:
		if(n%3 == 0 || n%5 == 0) {
			continue;
		}
		// N which pass Fermat base-2 PRP test further checked for Fermat base-PSP-ness to weed out pseudoprimes:
		if(pprimeF(n,2) && !is_f2psp(n,0x0)) {
			return(n);
		}
	}
	return 0;
}

// Returns the number of primes p with b1 <= p < b2. Requires b1 < b2, else prints an error and exits.
// On Core2 gives 1779361 primes in [1000000,30000000] in 2.5 sec; 8095883 primes in [5000000,150000000] in 14 sec.
// If for some reason need to speed things up further, next thing to try is to use 4-way or 8-way modexp in pprimeF.
DEV uint32 nprimes_in_range(uint32 b1, uint32 b2) {
	const uint32 npsp = 9366;	// Number of entries in the f2psp[] table
	uint32 i=0,idx=0,n,np = 0,nmod30,step;
	if(b1 >= b2) {
		fprintf(stderr,"Error: nprimes_in_range() inputs must be b1 < b2; user input %u,%u\n",b1,b2);
		exit(1);
	}
	if(b1 < 7) {
		// Count the small primes 2,3,5 by hand, but only those which actually lie in [b1,b2):
		np = (b1 <= 2 && 2 < b2) + (b1 <= 3 && 3 < b2) + (b1 <= 5 && 5 < b2);	b1 = 7;
	}
	n = b1;
	// n must be odd:
	if(!(n&1)) {
		n += 1;
	}
	// Bounding by b2 also keeps n += 2 from wrapping past 2^32-1 (n odd and < b2 implies n+2 <= 2^32-1):
	while(n < b2 && (n%3 == 0 || n%5 == 0)) {
		n += 2;
	}
	if(n >= b2) {
		return np;
	}
	/* Now we have n odd >= 7 and not divisible by 3 or 5 on loop entry.
	Use of f2psp, the table of 9366 base-2 Fermat pseudoprimes < 2^32 not divisible by 3 or 5,
	requires we filter out such (n%3 == 0 || n%5 == 0) using a simple mod-30 sieve. For a given
	n%30 in the 'keep' list, 'incr' means the increment needed to get to the next 'keep' value:
		n%30:	01 03 05 07 09 11 13 15 17 19 21 23 25 27 29
		keep:	 x        x     x  x     x  x     x        x
		incr:	 6        4     2  4     2  4     6        2
	Store the 8 increments in a array, cycle through them as many times as needed.
	*/
	const uint32 keep[8] = {1,7,11,13,17,19,23,29};
	const uint32 incr[8] = {6,4,2,4,2,4,6,2};
	nmod30 = n%30;
	for(idx = 0; idx < 8; idx++) {
		if(keep[idx] == nmod30)
			break;
	}
	idx &= 7;	// Always a no-op for n coprime to 30; mirrors the old default of idx = 0
	// Position i at the first table entry >= n. The is_f2psp() call gives a starting guess; the ensuing
	// scan makes the result correct even if that call left i untouched, e.g. when n itself is in the table:
	is_f2psp(n,&i);
	while(i < npsp && f2psp[i] < n) {
		i++;
	}
	while(n < b2) {
		if(pprimeF(n,2)) {
			// If n is 2-psp, increment 2-psp table index - cheaper than doing full-blown binary search each time.
			// The i < npsp guard prevents reading past the end of the table once its last entry has been passed:
			if(i < npsp && f2psp[i] == n) {
				i++;
			} else {
				np++;
			}
		}
		step = incr[idx]; idx = (idx + 1)&7;
		// Next candidate n+step would be >= b2: done. Testing the gap b2-n, rather than adding first,
		// avoids 32-bit wraparound of n when b2 is near 2^32:
		if(b2 - n <= step) {
			break;
		}
		n += step;
	}
	return np;
}

// 64-bit analog of pprimeF: returns 1 if base^(p-1) == 1 (mod p), 0 otherwise; p <= 2 returns (p == 2).
// Inputs which both fit in 32 bits are handed to pprimeF; otherwise base 2 goes to twopmodq64 and
// other bases use binary powering built on mi64_modmul64.
DEV int pprimeF64(uint64 p, uint64 base)
{
	if(p <= 2ull)
		return(p == 2ull);
	// For even p > 2, 2^(p-1) mod p is even and hence never 1. Answer directly: the 32-bit base-2 path
	// ends up in twompmodq32(), which asserts on an even modulus.
	// NOTE(review): twopmodq64 presumably has the same odd-modulus requirement (cf. comment in isPRP64).
	if(base == 2ull && !(p & 1ull))
		return 0;
	if(p < 0xFFFFFFFFull && base < 0xFFFFFFFFull)
		return pprimeF((uint32)p, (uint32)base);
	if(base == 2ull)
		return twopmodq64(p-1,p) == 1ull;
	uint64 y = 1ull, n = p-1ull, flag;
	uint64 z = base;
	// Right-to-left binary powering: y accumulates the product, z holds successive squarings of base:
	while(n) {
		flag = n & 1ull;
		n >>= 1;
		if(flag)
			y = mi64_modmul64(y,z,p);
		z = mi64_modmul64(z,z,p);
		if(!z) return 0;
	}
	return((int)(y == 1ull));
}

/***************/

// Base-2 Fermat probable-prime test for 64-bit p: returns 1 if p == 2, or if p is odd and
// twopmodq64(p-1,p) == 1; returns 0 otherwise.
// (Nov 2020: For 32-bit args, replaced old isPRP with above rigorous is_prime.)
DEV int isPRP64(uint64 p)
{
	// Odd arguments go to the powmod routine:
	if(p & 1ull)
		return twopmodq64(p-1,p) == 1ull;
	// Even-argument case is handled separately, since the powmod routines may not accept even moduli:
	return (p == 2ull);
//	Disabled multi-base variant: return(pprimeF64(p,2ull) && pprimeF64(p,3ull) && pprimeF64(p,5ull) && pprimeF64(p,7ull) && pprimeF64(p,11ull) && pprimeF64(p,13ull));
}

/*******************/

/* Calculate 2^-p mod q for p, q 32-bit unsigned ints, with q odd (asserted below). This can be used (among
other things) to effect a fast Fermat base-2 pseudoprime test of q, by calling with p = q-1 and comparing
the result to 1 - this is how pprimeF() in this file uses it.
The powering is a left-to-right binary scheme on the bits of (p + 32), using multiply-high based
modular squarings with qinv = the inverse of q modulo 2^32.
*/
// V1 returns the full powmod result:
DEV uint32 twompmodq32(uint32 p, uint32 q)	// 2^-p % q
{
	 int32 j;
	uint32 lead5, pshift, qhalf, qinv, zshift, start_index, x, lo, hi;

	ASSERT(HERE, (q&1) == 1, "twompmodq32: even modulus!");
	qhalf = q >> 1;	/* = (q-1)/2, since q odd. */

	pshift = p + 32;
	if(pshift < p)	/* 32-bit sum wrapped, i.e. the true p+32 needs 33 bits. Need special-casing for p just below 2^32  - the primes 2^32-(5,17) are good testcases here. */
	{
		j = -1;	/* leadz32(pshift) for 33-bit pshift goes negative */
		/* Extract leftmost 5 bits of the 33-bit pshift: the implied leading 1 (weight 16) plus the top 4 bits of the wrapped value: */
		lead5 = 16 + (pshift >> 28);
	}
	else
	{
		/* Find number of leading zeros in pshift, use it to find the position of the leftmost ones bit: */
		j = leadz32(pshift);
		/* Extract leftmost 5 bits of pshift: */
		lead5 = ((pshift<<j) >> 27);
	}

	start_index = 32-j-5;	/* Leftward bit at which to start the l-r binary powering, assuming
							the leftmost 5 bits have already been processed via a shift (see next). */

	zshift = 31 - lead5;
	zshift <<= 1;		/* Doubling the shift count here takes cares of the first SQR_LOHI */
	pshift = ~pshift;	/* Overflow doesn't matter here, as long as we got the leading 5 bits of pshift right. */

	/* Find modular inverse (mod 2^32) of q via Newton-style iteration qinv = qinv*(2 - q*qinv), all arithmetic
	modulo 2^32. The starting value XOR(3*q, 2) is used so that just the 3 iterations below are needed
	(cf. the longer explanation in twopmodq32): */
	qinv = (q+q+q) ^ (uint32)2;	/* Overflow doesn't matter here, since we only care about the low 2 bits of 3*q. */

	qinv = qinv*((uint32)2 - q*qinv);
	qinv = qinv*((uint32)2 - q*qinv);
	qinv = qinv*((uint32)2 - q*qinv);

	/* Since zstart is a power of two < 2^32, use a streamlined code sequence for the first iteration: */
	j = start_index-1;

	/* For 64-bit hardware, Make sure we get a 32-bit shift result here by ANDing with 2^32-1: */
	lo = (qinv << zshift) & (uint32)0xffffffff;
	/* Emulate a multiply-high here by getting full 64-bit product and right-shifting: */
	lo = (uint32)(((uint64)q * (uint64)lo) >> 32);
	x  = q - lo;

	/* If the current bit of the (complemented) exponent is set, double x modulo q: */
	if((pshift >> j) & (uint32)1)
	{
		DBG_ASSERT(HERE, x < q,"util.c: x < q");
		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
		if(x > qhalf) {
			x += x;
			x -= q;
		} else {
			x += x;
		}
	}

	/* Remaining exponent bits, one modular squaring (plus optional doubling) per bit: */
	for(j = start_index-2; j >= 0; j--)
	{
		/* SQR_LOHI32(x,lo,hi): */
		MUL_LOHI32(x,x, lo,hi);
		lo *= qinv;
		MULH32(q,lo, lo);

		/* Branchless version is much faster: the mask -(hi < lo) is all-ones exactly when the
		subtraction hi - lo borrows, in which case q is added back: */
		x = hi - lo + ((-(hi < lo)) & q);

		if((pshift >> j) & (uint32)1)
		{
			/* Branchless modular doubling: subtract q exactly when x > (q-1)/2: */
			x = x + x - ((-(x > qhalf)) & q);
		}
	}
	/*...Double and return.	These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2. */
	return(x + x - ((-(x > qhalf)) & q));
}

// V2 returns a binary result: 1 if the final doubled-and-reduced residue equals 1, 0 otherwise.
// The powering sequence is the same as in twompmodq32 (V1); only the final step differs, so this
// appears to be equivalent to (twompmodq32(p,q) == 1). twopmodq32_x8 uses it with p = q-1 as a base-2 PRP test of q.
DEV int twopmodq32(uint32 p, uint32 q)	// (2^-p % q) == 1
{
	 int32 j;
	uint32 lead5, pshift, qhalf, qinv, zshift, start_index, x, lo, hi;

	ASSERT(HERE, (q&1) == 1, "twopmodq32: even modulus!");
	qhalf = q >> 1;	/* = (q-1)/2, since q odd. */
	pshift = p + 32;
	if(pshift < p)	/* 32-bit sum wrapped, i.e. the true p+32 needs 33 bits. Need special-casing for p just below 2^32  - the primes 2^32-(5,17) are good testcases here. */
	{
		j = -1;	/* leadz32(pshift) for 33-bit pshift goes negative */
		/* Extract leftmost 5 bits of the 33-bit pshift: the implied leading 1 (weight 16) plus the top 4 bits of the wrapped value: */
		lead5 = 16 + (pshift >> 28);
	} else {
		/* Find number of leading zeros in pshift, use it to find the position of the leftmost ones bit: */
		j = leadz32(pshift);
		/* Extract leftmost 5 bits of pshift: */
		lead5 = ((pshift<<j) >> 27);
	}

	start_index = 32-j-5;	/* Leftward bit at which to start the l-r binary powering, assuming
							the leftmost 5 bits have already been processed via a shift (see next). */
	zshift = 31 - lead5;
	zshift <<= 1;		/* Doubling the shift count here takes cares of the first SQR_LOHI */
	pshift = ~pshift;	/* Overflow doesn't matter here, as long as we got the leading 5 bits of pshift right. */

	/*
	!    Find modular inverse (mod 2^32) of q in preparation for modular multiply.
	!    We use the simple and elegant iterative inversion method of Montgomery,
	!    which amounts to a modular analogue of Newton's method for iterative inversion:
	!
	!    0)   Zinv = Z                   ! Z*Zinv == 1 (mod 2^3)
	!    1)   Zinv = Zinv*(2 - Z*Zinv)   ! Z*Zinv == 1 (mod 2^6), etc.
	!
	!    where the number of correct bits (at the low end) doubles at each step,
	!    all arithmetic is modulo 2^32 and we repeat step (1) until we have the needed 32 bits.
	!
	!    We choose a different starting value of Zinv, XOR(3*Z, 2),
	!    so the first congruence holds modulo 2^4, thus requiring just 3 iterations.
	*/
	qinv = (q+q+q) ^ (uint32)2;	/* Overflow doesn't matter here, since we only care about the low 2 bits of 3*q. */

	qinv = qinv*((uint32)2 - q*qinv);
	qinv = qinv*((uint32)2 - q*qinv);
	qinv = qinv*((uint32)2 - q*qinv);

	/* Since zstart is a power of two < 2^32, use a streamlined code sequence for the first iteration: */
	j = start_index-1;

	/* For 64-bit hardware, Make sure we get a 32-bit shift result here by ANDing with 2^32-1: */
	lo = (qinv << zshift) & (uint32)0xffffffff;
	/* Emulate a multiply-high here by getting full 64-bit product and right-shifting: */
	lo = (uint32)(((uint64)q * (uint64)lo) >> 32);
	x  = q - lo;

	/* If the current bit of the (complemented) exponent is set, double x modulo q: */
	if((pshift >> j) & (uint32)1)
	{
		DBG_ASSERT(HERE, x < q,"util.c: x < q");
		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
		if(x > qhalf) {
			x += x;
			x -= q;
		} else {
			x += x;
		}
	}

	/* Remaining exponent bits, one modular squaring (plus optional doubling) per bit: */
	for(j = start_index-2; j >= 0; j--)
	{
		/* SQR_LOHI32(x,lo,hi): */
		MUL_LOHI32(x,x, lo,hi);
		lo *= qinv;
		MULH32(q,lo, lo);

		/* Branchless version (per the author, much faster but less readable) is compiled when NOBRANCH is defined;
		the more readable branched version is the default: */
	#ifdef NOBRANCH
		x = hi - lo + ((-(hi < lo)) & q);
	#else
		x = hi - lo;
		if(x > hi)
			x += q;	/* had a borrow */
	#endif

		if((pshift >> j) & (uint32)1)
		{
		/* Modular doubling of x; again the branchless form is selected by defining NOBRANCH: */
		#ifdef NOBRANCH
			x = x + x - ((-(x > qhalf)) & q);
		#else
			if(x > qhalf) {	/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
				x = x + x;
				x -= q;
			} else {
				x = x + x;
			}
		#endif
		}
	}

	/*...Double and return.	These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2.
	The comparison is done in 32-bit wraparound arithmetic, so no explicit x > qhalf test is needed here. */
	return((int)((x + x - q) == 1));
}

/* Does an 8-fold base-2 PRP test on the prime candidates q0-7.
Returns an 8-bit mask: bit i of the result is set iff candidate q_i passes, i.e. iff the same
test as twopmodq32(q_i - 1, q_i) succeeds.
The fused 8-way path needs all eight shifted exponents (q_i + 31) to have the same bit length; this is
checked only for q0 and q7, which suffices when the inputs are in increasing order (checked via
DBG_ASSERT). If the lengths differ, the routine falls back to 8 separate twopmodq32 calls.
NOTE(review): unlike twopmodq32, the fused path contains no assertion that the q_i are odd - callers
presumably guarantee this; confirm.
*/
DEV int twopmodq32_x8(uint32 q0, uint32 q1, uint32 q2, uint32 q3, uint32 q4, uint32 q5, uint32 q6, uint32 q7)
{
	int retval = 0;
	 int32 j;
	uint32 start_index;
	uint32 lead0, pshift0, qinv0, zshift0, x0, lo0, hi0, qhalf0;
	uint32 lead1, pshift1, qinv1, zshift1, x1, lo1, hi1, qhalf1;
	uint32 lead2, pshift2, qinv2, zshift2, x2, lo2, hi2, qhalf2;
	uint32 lead3, pshift3, qinv3, zshift3, x3, lo3, hi3, qhalf3;
	uint32 lead4, pshift4, qinv4, zshift4, x4, lo4, hi4, qhalf4;
	uint32 lead5, pshift5, qinv5, zshift5, x5, lo5, hi5, qhalf5;
	uint32 lead6, pshift6, qinv6, zshift6, x6, lo6, hi6, qhalf6;
	uint32 lead7, pshift7, qinv7, zshift7, x7, lo7, hi7, qhalf7;

	DBG_ASSERT(HERE, (q0 < q1) && (q1 < q2) && (q2 < q3) && (q3 < q4) && (q4 < q5) && (q5 < q6) && (q6 < q7), "twopmodq32_x8: Inputs nonmonotone!");

	qhalf0 = q0 >> 1;	/* = (q-1)/2, since q odd. */
	qhalf1 = q1 >> 1;
	qhalf2 = q2 >> 1;
	qhalf3 = q3 >> 1;
	qhalf4 = q4 >> 1;
	qhalf5 = q5 >> 1;
	qhalf6 = q6 >> 1;
	qhalf7 = q7 >> 1;

	/* Shifted exponents: the PRP-test exponent is p = q-1, so pshift = p + 32 = q + 31: */
	pshift0 = q0 + 31;
	pshift1 = q1 + 31;
	pshift2 = q2 + 31;
	pshift3 = q3 + 31;
	pshift4 = q4 + 31;
	pshift5 = q5 + 31;
	pshift6 = q6 + 31;
	pshift7 = q7 + 31;

	/* Find number of leading zeros in pshift0, use it to find the position of the leftmost ones bit: */
	j = leadz32(pshift0);
	if( leadz32(pshift7) != j )	/* Fused 8-fold algo needs all p's to have same bitlength */
	{
		/* Fallback: test each candidate separately and assemble the same bitmask: */
		retval  = (uint32)twopmodq32(q0-1, q0);
		retval += (uint32)twopmodq32(q1-1, q1) << 1;
		retval += (uint32)twopmodq32(q2-1, q2) << 2;
		retval += (uint32)twopmodq32(q3-1, q3) << 3;
		retval += (uint32)twopmodq32(q4-1, q4) << 4;
		retval += (uint32)twopmodq32(q5-1, q5) << 5;
		retval += (uint32)twopmodq32(q6-1, q6) << 6;
		retval += (uint32)twopmodq32(q7-1, q7) << 7;
		return retval;
	}

	if(pshift0 < q0)	/* 32-bit sum wrapped, i.e. the true q+31 needs 33 bits. Need special-casing for p just below 2^32  - the primes 2^32-(5,17) are good testcases here. */
	{
		j = -1;	/* leadz32(pshift) for 33-bit pshift goes negative */
		/* Extract leftmost 5 bits of the 33-bit pshift: the implied leading 1 (weight 16) plus the top 4 bits of the wrapped value: */
		lead0 = 16 + (pshift0 >> 28);
		lead1 = 16 + (pshift1 >> 28);
		lead2 = 16 + (pshift2 >> 28);
		lead3 = 16 + (pshift3 >> 28);
		lead4 = 16 + (pshift4 >> 28);
		lead5 = 16 + (pshift5 >> 28);
		lead6 = 16 + (pshift6 >> 28);
		lead7 = 16 + (pshift7 >> 28);
	}
	else
	{
		/* Extract leftmost 5 bits of pshift (the subtraction from 31 is done below, when zshift is formed): */
		lead0 = ((pshift0<<j) >> 27);
		lead1 = ((pshift1<<j) >> 27);
		lead2 = ((pshift2<<j) >> 27);
		lead3 = ((pshift3<<j) >> 27);
		lead4 = ((pshift4<<j) >> 27);
		lead5 = ((pshift5<<j) >> 27);
		lead6 = ((pshift6<<j) >> 27);
		lead7 = ((pshift7<<j) >> 27);
	}

	start_index = 32-j-5;	/* Leftward bit at which to start the l-r binary powering, assuming
				 the leftmost 5 bits have already been processed via a shift (see next). */

	/* Doubling the shift count here takes cares of the first SQR_LOHI; the exponent bits are then complemented in place: */
	zshift0 = 31 - lead0;	zshift0 <<= 1;	pshift0 = ~pshift0;
	zshift1 = 31 - lead1;	zshift1 <<= 1;	pshift1 = ~pshift1;
	zshift2 = 31 - lead2;	zshift2 <<= 1;	pshift2 = ~pshift2;
	zshift3 = 31 - lead3;	zshift3 <<= 1;	pshift3 = ~pshift3;
	zshift4 = 31 - lead4;	zshift4 <<= 1;	pshift4 = ~pshift4;
	zshift5 = 31 - lead5;	zshift5 <<= 1;	pshift5 = ~pshift5;
	zshift6 = 31 - lead6;	zshift6 <<= 1;	pshift6 = ~pshift6;
	zshift7 = 31 - lead7;	zshift7 <<= 1;	pshift7 = ~pshift7;

	/*
	Find modular inverse (mod 2^32) of q in preparation for modular multiply:
	start from XOR(3*q, 2), then apply 3 iterations of qinv = qinv*(2 - q*qinv), all modulo 2^32
	(cf. the longer explanation in twopmodq32).
	*/
	qinv0 = (q0+q0+q0) ^ (uint32)2;
	qinv1 = (q1+q1+q1) ^ (uint32)2;
	qinv2 = (q2+q2+q2) ^ (uint32)2;
	qinv3 = (q3+q3+q3) ^ (uint32)2;
	qinv4 = (q4+q4+q4) ^ (uint32)2;
	qinv5 = (q5+q5+q5) ^ (uint32)2;
	qinv6 = (q6+q6+q6) ^ (uint32)2;
	qinv7 = (q7+q7+q7) ^ (uint32)2;
	for(j = 0; j < 3; ++j)
	{
		qinv0 = qinv0*((uint32)2 - q0*qinv0);
		qinv1 = qinv1*((uint32)2 - q1*qinv1);
		qinv2 = qinv2*((uint32)2 - q2*qinv2);
		qinv3 = qinv3*((uint32)2 - q3*qinv3);
		qinv4 = qinv4*((uint32)2 - q4*qinv4);
		qinv5 = qinv5*((uint32)2 - q5*qinv5);
		qinv6 = qinv6*((uint32)2 - q6*qinv6);
		qinv7 = qinv7*((uint32)2 - q7*qinv7);
	}

	/* Since zstart is a power of two < 2^32, use a streamlined code sequence for the first iteration: */
	j = start_index-1;

	/* For 64-bit hardware, Make sure we get a 32-bit shift result here by ANDing with 2^32-1: */
	lo0 = (qinv0 << zshift0) & (uint32)0xffffffff;
	lo1 = (qinv1 << zshift1) & (uint32)0xffffffff;
	lo2 = (qinv2 << zshift2) & (uint32)0xffffffff;
	lo3 = (qinv3 << zshift3) & (uint32)0xffffffff;
	lo4 = (qinv4 << zshift4) & (uint32)0xffffffff;
	lo5 = (qinv5 << zshift5) & (uint32)0xffffffff;
	lo6 = (qinv6 << zshift6) & (uint32)0xffffffff;
	lo7 = (qinv7 << zshift7) & (uint32)0xffffffff;

	/* lo = MULH32(q, lo): */
	MULH32(q0,lo0, lo0);
	MULH32(q1,lo1, lo1);
	MULH32(q2,lo2, lo2);
	MULH32(q3,lo3, lo3);
	MULH32(q4,lo4, lo4);
	MULH32(q5,lo5, lo5);
	MULH32(q6,lo6, lo6);
	MULH32(q7,lo7, lo7);

	x0  = q0 - lo0;
	x1  = q1 - lo1;
	x2  = q2 - lo2;
	x3  = q3 - lo3;
	x4  = q4 - lo4;
	x5  = q5 - lo5;
	x6  = q6 - lo6;
	x7  = q7 - lo7;

	/* Modular doubling where the current exponent bit is set. Combines overflow-on-add and need-to-subtract-q-from-sum checks */
	if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(HERE, x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); }
	if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(HERE, x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); }
	if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(HERE, x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); }
	if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(HERE, x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); }
	if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(HERE, x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); }
	if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(HERE, x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); }
	if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(HERE, x6 < q6,"util.c: x6 < q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); }
	if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(HERE, x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - ((-(x7 > qhalf7)) & q7); }

	/* Remaining exponent bits, one modular squaring (plus optional doubling) per bit, 8 candidates side by side: */
	for(j = start_index-2; j >= 0; j--)
	{
		/* SQR_LOHI32(x,lo,hi): */
		MUL_LOHI32(x0,x0, lo0,hi0);
		MUL_LOHI32(x1,x1, lo1,hi1);
		MUL_LOHI32(x2,x2, lo2,hi2);
		MUL_LOHI32(x3,x3, lo3,hi3);
		MUL_LOHI32(x4,x4, lo4,hi4);
		MUL_LOHI32(x5,x5, lo5,hi5);
		MUL_LOHI32(x6,x6, lo6,hi6);
		MUL_LOHI32(x7,x7, lo7,hi7);
		lo0 *= qinv0;
		lo1 *= qinv1;
		lo2 *= qinv2;
		lo3 *= qinv3;
		lo4 *= qinv4;
		lo5 *= qinv5;
		lo6 *= qinv6;
		lo7 *= qinv7;
		MULH32(q0,lo0, lo0);
		MULH32(q1,lo1, lo1);
		MULH32(q2,lo2, lo2);
		MULH32(q3,lo3, lo3);
		MULH32(q4,lo4, lo4);
		MULH32(q5,lo5, lo5);
		MULH32(q6,lo6, lo6);
		MULH32(q7,lo7, lo7);

		/* if(x < 0) x += q; done branchlessly: the mask -(hi < lo) is all-ones exactly when hi - lo borrows: */
		x0 = hi0 - lo0 + ((-(hi0 < lo0)) & q0);
		x1 = hi1 - lo1 + ((-(hi1 < lo1)) & q1);
		x2 = hi2 - lo2 + ((-(hi2 < lo2)) & q2);
		x3 = hi3 - lo3 + ((-(hi3 < lo3)) & q3);
		x4 = hi4 - lo4 + ((-(hi4 < lo4)) & q4);
		x5 = hi5 - lo5 + ((-(hi5 < lo5)) & q5);
		x6 = hi6 - lo6 + ((-(hi6 < lo6)) & q6);
		x7 = hi7 - lo7 + ((-(hi7 < lo7)) & q7);

		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
		if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(HERE, x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); }
		if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(HERE, x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); }
		if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(HERE, x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); }
		if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(HERE, x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); }
		if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(HERE, x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); }
		if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(HERE, x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); }
		if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(HERE, x6 < q6,"util.c: x6 < q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); }
		if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(HERE, x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - ((-(x7 > qhalf7)) & q7); }
	}

	/*...Double and return.	These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2.
	Each pass/fail bit is placed at the bit position matching the candidate's index: */
	retval += ((x0 + x0 - q0) == 1)     ;
	retval += ((x1 + x1 - q1) == 1) << 1;
	retval += ((x2 + x2 - q2) == 1) << 2;
	retval += ((x3 + x3 - q3) == 1) << 3;
	retval += ((x4 + x4 - q4) == 1) << 4;
	retval += ((x5 + x5 - q5) == 1) << 5;
	retval += ((x6 + x6 - q6) == 1) << 6;
	retval += ((x7 + x7 - q7) == 1) << 7;
	return retval;
}

/*******************/

/* Classical Euclidean GCD of two 32-bit unsigned ints. (Cf. Algorithm 2.1.4 in Crandall/Pomerance.)
Returns GCD(x,y) for x, y > 0; if either input is 0 the other one is returned, i.e. max(x,y).
*/
DEV uint32 gcd32(uint32 x, uint32 y)
{
	uint32 rem;
	/* Replace (x,y) by (y, x mod y) until the second member vanishes: */
	for( ; y != 0; x = y, y = rem) {
		rem = x % y;
	}
	return x;
}

/* 64-bit version of gcd32: classical Euclidean GCD. If either input is 0, returns the other. */
DEV uint64 gcd64(uint64 x, uint64 y)
{
	uint64 rem;
	/* Replace (x,y) by (y, x mod y) until the second member vanishes: */
	for( ; y != 0; x = y, y = rem) {
		rem = x % y;
	}
	return x;
}

/*******************/

/* Extended Euclidean GCD, signed-32-bit-arithmetic version. (Cf. Algorithm 2.1.4 in Crandall/Pomerance.)
For integer inputs X = *x, Y = *y with X, Y > 0, finds integers {a,b,g} such that a*X + b*Y = g = GCD(X,Y).

When g = 1 and Y > 0, a and b are inverses of X (mod Y) and Y (mod X), respectively - possibly
as negative representatives, since no sign normalization is done here.

The GCD g is the return value of the function.
***NOTE***: The multipliers a and b overwrite the inputs *x and *y, so if the original inputs are
			needed subsequently, they must be copied prior to calling the function.
Identical inputs, or a zero input, trigger an error message and an assertion.
*/
DEV uint32 egcd32_B(int32 *x, int32 *y)
{
	/* Two rows (a,b,g) and (u,v,w), each maintaining  a*X + b*Y = g  resp.  u*X + v*Y = w : */
	int32 g = *x, w = *y;
	int32 a = 1, b = 0;
	int32 u = 0, v = 1;

	if(*x == *y) {
		printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x);	ASSERT(HERE, 0,"0");
	} else if(!*x || !*y) {
		printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y);		ASSERT(HERE, 0,"0");
	}

	while(w) {
		/* Quotient of the current remainder pair, rounded toward zero: */
		const int32 q = g/w;
		/* Next second row = (first row) - q*(second row): */
		const int32 u_next = a - q*u;
		const int32 v_next = b - q*v;
		const int32 w_next = g - q*w;
		/* Old second row becomes the first row: */
		a = u;	b = v;	g = w;
		u = u_next;	v = v_next;	w = w_next;
	}
	*x = a;
	*y = b;
	return(g);
}

/* Extended Euclidean GCD, unsigned-32-bit-arithmetic version of egcd32_B.
For inputs X = *x, Y = *y with X, Y > 0 and X != Y, returns g = GCD(X,Y) and overwrites *x, *y with
multipliers a, b satisfying a*X + b*Y = g (mod 2^32).
On return *x is normalized to be nonnegative, by adding Y if a is negative - so for g = 1 it is the
inverse of X (mod Y). *y holds b in two's-complement form, with NO sign normalization.
Identical inputs, or a zero input, trigger an error message and an assertion.
*/
DEV uint32 egcd32(uint32 *x, uint32 *y)
{
	uint32 g = *x, w = *y, q;
	uint32 a = 1, b = 0, u = 0, v = 1;
	/* Sign of these 3 doesn't matter since they're just temporaries: */
	uint32 d, e, f;

	if(*x == *y) {
		printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x);	ASSERT(HERE, 0,"0");
	} else if((*x == 0) || (*y == 0)) {
		printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y);		ASSERT(HERE, 0,"0");
	}

	while(w)
	{
		// Find quotient of current x/y and round toward zero - makes sense to try to take advantage of the fact
		// that most q's are small (~80% of q's < 4), but in practice I've found that adding even simple logic to
		// special-case for q = 0 (e.g. if(g < w) {d = a; e = b; f = g; } else ...) slows things down:
		q = g/w;
		/* Find (u', v', w') and store in 3 temporaries; negative values wrap modulo 2^32: */
		d = a - q*u;
		e = b - q*v;
		f = g - q*w;
		// Find (a', b', g'), i.e. move the old values of (u,v,w) into the slots for (a,b,g),
		// then recover new values of (u, v, w) from the temporaries:
		a = u;	u = d;
		b = v;	v = e;
		g = w;	w = f;
	}
	// a is the X-multiplier in two's-complement form. The extended-Euclid multipliers satisfy |a| <= Y/2 < 2^31,
	// so a is negative iff its top bit is set. The former test (*y < a) misfired for large moduli: e.g. inputs
	// 2, 2^32-5 give a = -2147483645 = 2147483651 (unsigned), which is less than Y and so was returned
	// un-normalized. For a negative a, adding the modulus (*y) - with 32-bit wraparound - yields the proper
	// mod-inv: e.g. inputs 2,2^31-1 give a = 3221225473 = (int)-1073741823 and thus *x = 1073741824.
	if(a >> 31)
		*x = *y + a;
	else
		*x = a;
	*y = b;
	return(g);
}

/* 64-bit version of egcd32: extended Euclidean GCD in unsigned 64-bit arithmetic.
For inputs X = *x, Y = *y with X, Y > 0 and X != Y, returns g = GCD(X,Y) and overwrites *x, *y with
multipliers a, b satisfying a*X + b*Y = g (mod 2^64).
On return *x is normalized to be nonnegative, by adding Y if a is negative - so for g = 1 it is the
inverse of X (mod Y). *y holds b in two's-complement form, with NO sign normalization.
*/
DEV uint64 egcd64(uint64 *x, uint64 *y)
{
	uint64 g = *x, w = *y, q;
	uint64 a = 1ull, b = 0ull, u = 0ull, v = 1ull;
	/* Sign of these 3 doesn't matter since they're just temporaries: */
	uint64 d, e, f;
	// NOTE(review): unlike egcd32, the second test only fires if BOTH inputs are zero, and that case is
	// already caught by the equality test - so a single zero input is not rejected here. Left as-is, since
	// callers may rely on it; confirm intent.
	if(*x == *y) {
		printf("ERROR: eGCD of identical arguments x = y = %llu is illegal!\n", *x);	ASSERT(HERE, 0,"0");
	} else if((*x | *y) == 0ull) {
		printf("ERROR: eGCD called with zero input: x = %llu, y = %llu\n", *x, *y);		ASSERT(HERE, 0,"0");
	}
	while(w) {
		q = g/w;
		/* New (u,v,w) = (a,b,g) - q*(u,v,w); negative values wrap modulo 2^64: */
		d = a - q*u;
		e = b - q*v;
		f = g - q*w;
		a = u;	u = d;
		b = v;	v = e;
		g = w;	w = f;
	}
	// a is the X-multiplier in two's-complement form. The extended-Euclid multipliers satisfy |a| <= Y/2 < 2^63,
	// so a is negative iff its top bit is set. The former test (*y < a) misclassified negative a for
	// moduli above roughly (2/3)*2^64. For a negative a, adding the modulus (*y) - with 64-bit wraparound -
	// yields the nonnegative representative:
	if(a >> 63)
		*x = *y + a;
	else
		*x = a;
	*y = b;
	return(g);
}

/*********************/
/*
Finds multiplicative inverse of z (mod n), asserting that GCD(z,n) = 1.
The inverse is computed twice - via the unsigned egcd32() and via the signed egcd32_B() -
and the two sets of outputs are cross-checked against each other.
*/
DEV int modinv32(uint32 z, uint32 n)
{
	uint32 inv_u = z, mult_u = n;	/* in/out args of the unsigned eGCD */
	int32  inv_s = z, mult_s = n;	/* in/out args of the signed eGCD */
	const uint32 gcd_u = egcd32(&inv_u, &mult_u);
	const int32  gcd_s = egcd32_B(&inv_s, &mult_s);
	// egcd32() only does positive-result normalization on its x-output, so apply the same
	// normalization to the egcd32_B x-output (and only that one) before comparing:
	if(inv_s < 0)
		inv_s += n;
	ASSERT(HERE, (gcd_u == gcd_s) && (inv_u == inv_s) && (mult_u == mult_s),"2 gcd results in modinv32 differ!");
	ASSERT(HERE, gcd_u == 1,"gcd in modinv32 is non-unity!");
	return inv_u;
}

/* 64-bit version of modinv32: returns the multiplicative inverse of z (mod n) as computed by egcd64(),
asserting that GCD(z,n) = 1. */
DEV int64 modinv64(uint64 z, uint64 n)
{
	uint64 inv = z, mult = n;	/* in/out args of egcd64: overwritten by the two multipliers */
	const uint64 g = egcd64(&inv, &mult);
	ASSERT(HERE, g == 1ull,"gcd in modinv64 is non-unity!");
	return inv;
}

/********************/

/* Complex multiplication */
struct complex cmul(struct complex *a, struct complex *b)
{
	struct complex cout;
	cout.re = (*a).re*(*b).re - (*a).im*(*b).im;
	cout.im = (*a).re*(*b).im + (*a).im*(*b).re;
	return cout;
}

/***********************************************************************************/
/*
Function to reduce x modulo y, where x and y are both 128-bit unsigned integers.
Algorithm is simple-but-slow bitwise shift-and-subtract scheme: y is aligned with the leading bit of x,
then subtracted from the running remainder at each successively smaller shift at which it fits.
Returns x mod y, in [0, y-1].
NOTE(review): y = 0 is not checked for; callers presumably guarantee y != 0.
*/
uint128 xmody128(uint128 x, uint128 y)
{
	uint32 lzx, lzy, nshiftl;
	uint128 t;

	/* In preparation for x%y, Find the # of leading zeros in x and y. */
	     if(x.d1)
		lzx = leadz64(x.d1);
	else
		lzx = leadz64(x.d0) + 64;

	     if(y.d1)
		lzy = leadz64(y.d1);
	else
		lzy = leadz64(y.d0) + 64;

	/* X has fewer significant bits than Y, hence X < Y: return unmodified X. */
	if(lzx > lzy)
		return x;

	nshiftl = lzy - lzx;

	while(nshiftl)
	{
		/* Use t to store the left-shifted versions of y: */
		LSHIFT128(y, nshiftl, t);

		/* Subtract whenever t <= x, expressed as !(x < t). The test must be non-strict: with a strict
		t < x an exact match is skipped, and an x which is an exact multiple of y ends up with remainder y rather than 0. */
		if(!(CMPULT128(x, t))) {
			SUB128(x, t, x);
		}

		/* Move on to the next-smaller shift count: */
		--nshiftl;
	}
	/* Must ensure that this gets done once even if lzx == lzy; again non-strict, so that x == y reduces to 0: */
	if(!(CMPULT128(x, y))) {
		SUB128(x, y, x);
	}

	return x;
}

/***********************************************************************************/
/*
Function to reduce x modulo y, where x and y are both 192-bit unsigned integers.
Algorithm is simple-but-slow bitwise shift-and-subtract scheme.
Returns remainder x mod y, in [0, y-1]; if the optional pointer argument quot is non-null,
the quotient floor(x/y) is written to *quot.
NOTE(review): y = 0 is not checked for; callers presumably guarantee y != 0.
*/

uint192 xmody192(const uint192 x, const uint192 y, uint192*quot)
{
	uint32 lzx, lzy, nshiftl;
	uint192 r = x, qsh = ONE192, t;

	/* Quotient starts at 0 on every path - including the early x < y return below, which
	previously left *quot unwritten: */
	if(quot) {
		quot->d0 = quot->d1 = quot->d2 = 0ull;
	}

	/* In preparation for x%y, Find the # of leading zeros in x and y. */
	lzx = leadz192(x);
	lzy = leadz192(y);

	/* X has fewer significant bits than Y, hence X < Y: return unmodified X. */
	if(lzx > lzy)
		return r;

	nshiftl = lzy - lzx;
	if(quot) {
		LSHIFT192(qsh, nshiftl, qsh);	// quotient gets built up from sum of left-shifted binary ones.
	}

	while(nshiftl)
	{
		/* Use t to store the left-shifted versions of y: */
		LSHIFT192(y, nshiftl, t);

		/* Subtract whenever t <= r, expressed as !(r < t). The test must be non-strict: with a strict
		t < r an exact match is skipped, and an x which is an exact multiple of y ends up with remainder y rather than 0. */
		if(!(CMPULT192(r, t)))
		{
			SUB192(r, t, r);
			if(quot) {
				ADD192_PTR(quot, (&qsh), quot);
			}
		}

		/* Move on to the next-smaller shift count, keeping the quotient bit qsh in step: */
		--nshiftl;
		if(quot) {
			RSHIFT_FAST192(qsh, 1, qsh);
		}
	}
	/* Must ensure that this gets done once even if lzx == lzy; again non-strict, so that r == y reduces to 0: */
	if(!(CMPULT192(r, y))) {
		SUB192(r, y, r);
		if(quot) {
			ADD192_PTR(quot, (&qsh), quot);
		}
	}
	return r;
}

/***********************************************************************************/
/*
Function to reduce x modulo y, where x and y are both 256-bit unsigned integers.
Algorithm is simple-but-slow bitwise shift-and-subtract scheme.
Returns remainder x mod y, in [0, y-1]; if the optional pointer argument quot is non-null,
the quotient floor(x/y) is written to *quot.
NOTE(review): y = 0 is not checked for; callers presumably guarantee y != 0.
*/

uint256 xmody256(const uint256 x, const uint256 y, uint256*quot)
{
	uint32 lzx, lzy, nshiftl;
	uint256 r = x, qsh = ONE256, t;

	/* Quotient starts at 0 on every path - including the early x < y return below, which
	previously left *quot unwritten: */
	if(quot) {
		quot->d0 = quot->d1 = quot->d2 = quot->d3 = 0ull;
	}

	/* In preparation for x%y, Find the # of leading zeros in x and y. */
	lzx = leadz256(x);
	lzy = leadz256(y);

	/* X has fewer significant bits than Y, hence X < Y: return unmodified X. */
	if(lzx > lzy)
		return r;

	nshiftl = lzy - lzx;
	if(quot) {
		LSHIFT256(qsh, nshiftl, qsh);	// quotient gets built up from sum of left-shifted binary ones.
	}

	while(nshiftl)
	{
		/* Use t to store the left-shifted versions of y: */
		LSHIFT256(y, nshiftl, t);

		/* Subtract whenever t <= r, expressed as !(r < t). The test must be non-strict: with a strict
		t < r an exact match is skipped, and an x which is an exact multiple of y ends up with remainder y rather than 0. */
		if(!(CMPULT256(r, t)))
		{
			SUB256(r, t, r);
			if(quot) {
				ADD256_PTR(quot, (&qsh), quot);
			}
		}

		/* Move on to the next-smaller shift count, keeping the quotient bit qsh in step: */
		--nshiftl;
		if(quot) {
			RSHIFT_FAST256(qsh, 1, qsh);
		}
	}
	/* Must ensure that this gets done once even if lzx == lzy; again non-strict, so that r == y reduces to 0: */
	if(!(CMPULT256(r, y))) {
		SUB256(r, y, r);
		if(quot) {
			ADD256_PTR(quot, (&qsh), quot);
		}
	}
	return r;
}


/***********************************************************************************/
/*
Divide-with-Remainder of x by y, where x is a 128-bit unsigned (vector) integer and y a 32-bit unsigned scalar.
The quotient (x - x%y)/y overwrites *x; the 32-bit remainder x%y is the function result.
The values 2^64/y and 2^64%y are cached in static storage and recomputed only when y differs
from the divisor of the preceding call.

If you only want the remainder, not to perform the divide, call x128_mod_y32 instead.
*/
uint32 x128_div_y32(uint128 *x, uint32 y)
{
	static uint32 ysave = 0;
	static uint64 two64divy, two64mody;
	uint64 hi_rem, lo_rem, acc, wrapped;

	if(ysave != y)
	{
		const uint64 two63 = 0x8000000000000000ull;
		ysave = y;
		/* 2^64 does not fit in 64 bits, so start from 2^63 and double both quotient and remainder: */
		two64divy = two63/y;
		two64divy += two64divy;
		two64mody = two63%y;
		/* Doubled remainder lies in [0, 2y-2]: subtract y, and if that underflowed (top bit set)
		add y back; otherwise the quotient picks up one more unit. Saves a second library mod call: */
		two64mody = (two64mody + two64mody) - y;
		wrapped = (two64mody >> 63);
		two64mody += (-wrapped) & y;
		two64divy += (wrapped == 0);
	}

	/* High digit: quotient stays in place, remainder carries down into the low digit: */
	hi_rem = (x->d1)%y;
	x->d1 /= y;

	/* Carry-in contribution hi_rem*(2^64 mod y) plus the low digit's own remainder; both remainders
	are taken BEFORE the low digit gets overwritten: */
	lo_rem = (x->d0)%y;
	acc = hi_rem*two64mody + lo_rem;

	/* Low digit of result: (x->d0)/y is formed separately (only its remainder was folded into acc),
	because x->d0 may be as large as 2^64-1 and adding hi_rem*two64mody to it prior to dividing
	risks unsigned integer overflow: */
	x->d0 = hi_rem*two64divy + acc/y + (x->d0)/y;
	return (uint32)(acc%y);
}

/* Remainder-only companion of x128_div_y32: returns x%y for 128-bit x and 32-bit y; x is passed by value.
Keeps its own static cache of 2^64/y and 2^64%y, refreshed whenever y changes between calls. */
uint32 x128_mod_y32(uint128 x, uint32 y)
{
	static uint32 ysave = 0;
	static uint64 two64divy, two64mody;
	uint64 hi_rem, lo_rem, acc, wrapped;

	if(ysave != y)
	{
		const uint64 two63 = 0x8000000000000000ull;
		ysave = y;
		/* 2^64 does not fit in 64 bits, so start from 2^63 and double both quotient and remainder: */
		two64divy = two63/y;
		two64divy += two64divy;
		two64mody = two63%y;
		/* Doubled remainder: subtract y, and if that underflowed (top bit set) add y back;
		otherwise the quotient picks up one more unit: */
		two64mody = (two64mody + two64mody) - y;
		wrapped = (two64mody >> 63);
		two64mody += (-wrapped) & y;
		two64divy += (wrapped == 0);
	}

	/* Remainder of the high digit acts as the carry into the low digit: */
	hi_rem = (x.d1)%y;
	/* Fold the carry, weighted by 2^64 mod y, together with the low digit's own remainder: */
	lo_rem = (x.d0)%y;
	acc = hi_rem*two64mody + lo_rem;

	return (uint32)(acc%y);
}

/***********************************************************************************/
/*
Divide-with-Remainder of x by y, where x is a 192-bit unsigned (vector) integer and y a 32-bit unsigned scalar.
The quotient (x - x%y)/y overwrites *x; the 32-bit remainder x%y is the function result.
The upper two digits are handled by x128_div_y32, whose remainder is then carried into the low digit.

If you only want the remainder, not to perform the divide, call x192_mod_y32 instead.
*/
uint32 x192_div_y32(uint192 *x, uint32 y)
{
	static uint32 ysave = 0;
	static uint64 two64divy, two64mody;
	uint64 carry_in, lo_rem, acc, wrapped;
	uint128 upper;

	if(ysave != y)
	{
		const uint64 two63 = 0x8000000000000000ull;
		ysave = y;
		/* 2^64 does not fit in 64 bits, so start from 2^63 and double both quotient and remainder: */
		two64divy = two63/y;
		two64divy += two64divy;
		two64mody = two63%y;
		/* Doubled remainder: subtract y, and if that underflowed (top bit set) add y back;
		otherwise the quotient picks up one more unit: */
		two64mody = (two64mody + two64mody) - y;
		wrapped = (two64mody >> 63);
		two64mody += (-wrapped) & y;
		two64divy += (wrapped == 0);
	}

	/* Divide the upper 2 digits via a local uint128; the 128-bit divide's return value
	is the carry into the low digit: */
	upper.d1 = x->d2;
	upper.d0 = x->d1;
	carry_in = x128_div_y32(&upper, y);
	x->d2 = upper.d1;
	x->d1 = upper.d0;

	/* Low digit, same scheme as the low digit in x128_div_y32: */
	lo_rem = (x->d0)%y;
	acc = carry_in*two64mody + lo_rem;
	x->d0 = carry_in*two64divy + acc/y + (x->d0)/y;

	return (uint32)(acc%y);
}

/*
Remainder of x modulo y, where x is a 192-bit unsigned integer (64-bit words d2:d1:d0) and
y a 32-bit unsigned scalar. x is passed by value and is left untouched; the 32-bit
remainder is the function result.

The residue of the upper two words is obtained from x128_mod_y32 (no quotient is needed,
so no divide is performed on them) and then folded together with the low word using the
cached value of 2^64 mod y. All residues are < y < 2^32, so the intermediate sum is < 2^64.

The cache of 2^64 mod y lives in function-local statics keyed on y.
y must be nonzero (the code divides by it).
*/
uint32 x192_mod_y32(uint192 x, uint32 y)
{
	uint64 cy, rem;
	static uint32 ysave = 0;
	static uint64 two64mody;	/* Only the residue of 2^64 is needed here, not the quotient */
	uint128 t128;

	if(y != ysave)
	{
		ysave = y;
		/* 2^64 does not fit in a uint64, so start from 2^63 mod y: */
		two64mody = 0x8000000000000000ull%y;
		/* To save a second (expensive) standard library mod call,
		double and subtract y, then re-add y if the result underflows: */
		two64mody = (two64mody + two64mody) - y;
		cy = (two64mody >> 63);
		two64mody += (-cy) & y;
	}

	/* Copy the upper 2 digits into a local uint128 and take its residue mod y;
	that residue is the carry into the low digit: */
	t128.d1 = (x.d2);
	t128.d0 = (x.d1);
	cy = x128_mod_y32(t128, y);

	/* Low digit: */
	rem = (cy*two64mody + ((x.d0))%y)%y;

	return (uint32)rem;
}

/***********************************************************************************/
/*
In-place divide-with-remainder of a 256-bit unsigned integer (64-bit words d3:d2:d1:d0) by
a 32-bit unsigned scalar y. On return *x holds the quotient floor(x/y) and the function
result is the 32-bit remainder x mod y.

The top three words are divided via x192_div_y32; the remainder of that step is the carry
into the low word, which is then processed using the cached values floor(2^64/y) and
2^64 mod y.

Callers needing only the remainder should use x256_mod_y32, which skips the divide.
*/
uint32 x256_div_y32(uint256 *x, uint32 y)
{
	static uint32 ysave = 0;
	static uint64 two64divy, two64mody;
	uint64 carry_in, lo_quot, lo_rem, partial, borrow;
	uint192 upper;

	/* (Re)build the cached quotient and remainder of 2^64 by y whenever the modulus changes: */
	if(ysave != y)
	{
		/* 2^64 is not representable, so work from 2^63 and double: */
		uint64 half_quot = 0x8000000000000000ull/y;
		uint64 half_rem  = 0x8000000000000000ull%y;

		ysave = y;
		two64divy = half_quot + half_quot;
		/* Doubled remainder may be >= y: subtract y and undo the subtraction on underflow */
		two64mody = (half_rem + half_rem) - y;
		borrow = (two64mody >> 63);
		two64mody += (-borrow) & y;
		two64divy += (borrow == 0);	/* No underflow means the quotient needs one more */
	}

	/* Divide the upper three words as a 192-bit integer; its remainder carries into the low word: */
	upper.d2 = x->d3;
	upper.d1 = x->d2;
	upper.d0 = x->d1;
	carry_in = x192_div_y32(&upper, y);
	x->d3 = upper.d2;
	x->d2 = upper.d1;
	x->d1 = upper.d0;

	/* Low word: take quotient and remainder of d0 before it gets overwritten */
	lo_quot = x->d0 / y;
	lo_rem  = x->d0 % y;
	partial = carry_in*two64mody + lo_rem;
	x->d0 = carry_in*two64divy + partial/y + lo_quot;

	return (uint32)(partial%y);
}

/*
Remainder of x modulo y, where x is a 256-bit unsigned integer (64-bit words d3:d2:d1:d0)
and y a 32-bit unsigned scalar. x is passed by value and is left untouched; the 32-bit
remainder is the function result.

The residue of the upper three words is obtained from x192_mod_y32 (no quotient is needed,
so no divide is performed on them) and then folded together with the low word using the
cached value of 2^64 mod y. All residues are < y < 2^32, so the intermediate sum is < 2^64.

The cache of 2^64 mod y lives in function-local statics keyed on y.
y must be nonzero (the code divides by it).
*/
uint32 x256_mod_y32(uint256 x, uint32 y)
{
	uint64 cy, rem;
	static uint32 ysave = 0;
	static uint64 two64mody;	/* Only the residue of 2^64 is needed here, not the quotient */
	uint192 t192;

	if(y != ysave)
	{
		ysave = y;
		/* 2^64 does not fit in a uint64, so start from 2^63 mod y: */
		two64mody = 0x8000000000000000ull%y;
		/* To save a second (expensive) standard library mod call,
		double and subtract y, then re-add y if the result underflows: */
		two64mody = (two64mody + two64mody) - y;
		cy = (two64mody >> 63);
		two64mody += (-cy) & y;
	}

	/* Copy the upper 3 digits into a local uint192 and take its residue mod y;
	that residue is the carry into the low digit: */
	t192.d2 = (x.d3);
	t192.d1 = (x.d2);
	t192.d0 = (x.d1);
	cy = x192_mod_y32(t192, y);

	/* Low digit: */
	rem = (cy*two64mody + ((x.d0))%y)%y;

	return (uint32)rem;
}

/***********************************************************************************/

/* Need the uint64 ones of these because some compilers (e.g. MSVC, a.k.a .NET)
don't properly print 64-bit ints. */

/*
Writes q in decimal into char_buf as a right-justified, blank-padded field of exactly 20
characters followed by a terminating '\0' (so char_buf must hold at least 21 chars).
q = 0 is rendered as a single right-justified '0'.

Returns the index of the leftmost digit within char_buf, which lets the caller print the
number left-justified by starting at char_buf + (return value).
*/
int	convert_uint64_base10_char(char char_buf[], uint64 q)
{
	/* 2^64 has 20 decimal digits: */
	const uint32 field_width = 20;
	uint32 pos, ndigits = 0;

	char_buf[field_width] = '\0';

	/* Emit digits least-significant first, walking the field from right to left,
	so that no reversal pass is needed afterwards: */
	for(pos = field_width; pos-- > 0; )
	{
		/* Once q is exhausted pad with blanks - except that at least one digit is always written: */
		if(ndigits == 0 || q != 0)
		{
			char_buf[pos] = (char)(q%(uint32)10 + CHAROFFSET);
			ndigits++;
		}
		else
		{
			char_buf[pos] = ' ';
		}
		q /= 10;
	}

	return (int)(field_width - ndigits);
}

/********************/

/*
Writes q into char_buf as exactly 16 upper-case hexadecimal characters (leading zeros
included) followed by a terminating '\0'; char_buf must hold at least 17 chars.
Always returns 0.
*/
int	convert_uint64_base16_char(char char_buf[], uint64 q)
{
	static const char digit_of[] = "0123456789ABCDEF";
	int pos = 16;

	char_buf[pos] = '\0';
	/* Peel off one nibble at a time, filling the field from the right: */
	while(pos > 0)
	{
		char_buf[--pos] = digit_of[q & 15];
		q >>= 4;
	}
	return 0;
}

// Writes q into char_buf as exactly 64 binary characters ('0'/'1', most-significant bit
// first, leading zeros included) followed by a terminating '\0'.
// char_buf must hold at least 65 bytes. Always returns 0.
int	convert_uint64_base2_char(char char_buf[], uint64 q)
{
	int pos = 64;

	char_buf[pos] = '\0';
	// Peel off one bit at a time, filling the field from the right:
	while(pos > 0)
	{
		char_buf[--pos] = (q & 1) ? '1' : '0';
		q >>= 1;
	}
	return 0;
}

/*
For really large inputs we'll want to use base-10^19 for our mod, thus processing nearly one 64-bit
chunk at a time and cutting the number of expensive % operations by a factor of 19. But this will also
require us to count leading zeros in the leading (leftmost) base-10^19 word, which isn't worth it for
small inputs.
*/

/*
Writes the 2-word (base-2^64) unsigned integer q128 in decimal into char_buf as a
right-justified, blank-padded field of exactly 39 characters followed by a terminating
'\0' (so char_buf must hold at least 40 chars). Zero is rendered as a single '0'.

Returns the index of the leftmost digit within char_buf, which lets the caller print the
number left-justified by starting at char_buf + (return value).
*/
int	convert_uint128_base10_char(char char_buf[], uint128 q128)
{
	/* 2^128 has 39 decimal digits: */
	const uint32 field_width = 39;
	uint32 pos, ndigits = 0;

	char_buf[field_width] = '\0';

	/* Emit digits least-significant first, walking the field from right to left,
	so that no reversal pass is needed afterwards: */
	for(pos = field_width; pos-- > 0; )
	{
		/* x128_div_y32 both returns the digit and divides q128 in place, so it must only
		be invoked after the is-there-anything-left test. At least one digit is always written: */
		if(ndigits == 0 || q128.d0 != 0 || q128.d1 != 0)
		{
			char_buf[pos] = (char)(x128_div_y32(&q128, (uint32)10) + CHAROFFSET);
			ndigits++;
		}
		else
		{
			char_buf[pos] = ' ';
		}
	}

	return (int)(field_width - ndigits);
}

/*
Decimal rendering of a 96-bit unsigned integer: zero-extends q96 to a uint128 and hands
off to convert_uint128_base10_char, so the output format, required buffer size and return
value are those of that routine.
*/
int	convert_uint96_base10_char(char char_buf[], uint96 q96)
{
	uint128 widened;

	widened.d1 = (uint64)q96.d1;
	widened.d0 = q96.d0;
	return convert_uint128_base10_char(char_buf, widened);
}

/*
Pointer-argument variant of convert_uint96_base10_char: zero-extends *q96 to a uint128
and hands off to convert_uint128_base10_char. *q96 is only read.
*/
int	convert_uint96ptr_base10_char(char char_buf[], uint96*q96)
{
	uint128 widened;

	widened.d1 = (uint64)q96->d1;
	widened.d0 = q96->d0;
	return convert_uint128_base10_char(char_buf, widened);
}

/*
Writes the 3-word (base-2^64) unsigned integer q192 in decimal into char_buf as a
right-justified, blank-padded field of exactly 58 characters followed by a terminating
'\0' (so char_buf must hold at least 59 chars). Zero is rendered as a single '0'.

Returns the index of the leftmost digit within char_buf, which lets the caller print the
number left-justified by starting at char_buf + (return value).
*/
int	convert_uint192_base10_char(char char_buf[], uint192 q192)
{
	/* 2^192 has 58 decimal digits: */
	const uint32 field_width = 58;
	uint32 pos, ndigits = 0;

	char_buf[field_width] = '\0';

	/* Emit digits least-significant first, walking the field from right to left,
	so that no reversal pass is needed afterwards: */
	for(pos = field_width; pos-- > 0; )
	{
		/* x192_div_y32 both returns the digit and divides q192 in place, so it must only
		be invoked after the is-there-anything-left test. At least one digit is always written: */
		if(ndigits == 0 || q192.d0 != 0 || q192.d1 != 0 || q192.d2 != 0)
		{
			char_buf[pos] = (char)(x192_div_y32(&q192, (uint32)10) + CHAROFFSET);
			ndigits++;
		}
		else
		{
			char_buf[pos] = ' ';
		}
	}

	return (int)(field_width - ndigits);
}

/*
Writes the 4-word (base-2^64) unsigned integer q256 in decimal into char_buf as a
right-justified, blank-padded field of exactly 78 characters followed by a terminating
'\0' (so char_buf must hold at least 79 chars). Zero is rendered as a single '0'.

Returns the index of the leftmost digit within char_buf, which lets the caller print the
number left-justified by starting at char_buf + (return value).
*/
int	convert_uint256_base10_char(char char_buf[], uint256 q256)
{
	/* 2^256 has 78 decimal digits: */
	const uint32 field_width = 78;
	uint32 pos, ndigits = 0;

	char_buf[field_width] = '\0';

	/* Emit digits least-significant first, walking the field from right to left,
	so that no reversal pass is needed afterwards: */
	for(pos = field_width; pos-- > 0; )
	{
		/* x256_div_y32 both returns the digit and divides q256 in place, so it must only
		be invoked after the is-there-anything-left test. At least one digit is always written: */
		if(ndigits == 0 || q256.d0 != 0 || q256.d1 != 0 || q256.d2 != 0 || q256.d3 != 0)
		{
			char_buf[pos] = (char)(x256_div_y32(&q256, (uint32)10) + CHAROFFSET);
			ndigits++;
		}
		else
		{
			char_buf[pos] = ' ';
		}
	}

	return (int)(field_width - ndigits);
}

/********************/
/* Basically a specialized version of the <stdlib.h> strtod function:
Parses an unsigned decimal number of the form [whitespace]digits[.digits] from char_buf
and returns it as a double. No sign, exponent or other notation is accepted.

All digits (integer and fractional part together) are accumulated into a single uint64;
if that accumulator overflows, or an illegal character or a second decimal point is met,
an error is reported through fprintf(stderr)/ASSERT.
Parsing stops at '\0' or at the first whitespace character following the number.
*/
double	convert_base10_char_double (const char*char_buf)
{
	uint64 curr_sum = (uint64)0;
	double curr_mul = 0.0;
	uint32 i;
	int done_with_leading_whitespace = FALSE;
	char c;
	uint64 curr_digit, hi;

	/* Read the decimal digits from the string from left to right,
	skipping any leading whitespace, and stopping if either non-leading
	whitespace or '\0' is encountered. If we encounter a decimal point,
	the curr_mul multiplier is set = 1.0 and multiplied by 0.1 for every
	numeric digit found to the right of the DP.
	*/
	for(i=0; i != 0xffffffff; i++)
	{
		c = char_buf[i];
		/* The <ctype.h> classifiers require an argument representable as unsigned char: */
		if(!isdigit((unsigned char)c))
		{
			if(isspace((unsigned char)c))
			{
				if(done_with_leading_whitespace)
					break;
				else
					continue;
			}

			done_with_leading_whitespace = TRUE;

			if(c == '.')	/* Found a decimal point */
			{
				ASSERT(HERE, curr_mul == 0.0,"curr_mul == 0.0");	/* Make sure this is the first . we've encountered */
				curr_mul = 1.0;
				continue;
			}
			else if(c == '\n' || c == '\0')
			{
				break;
			}
			else
			{
				/* Illegal character: fail unconditionally, as the integer converters do */
				fprintf(stderr,"convert_base10_char_double: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
				ASSERT(HERE, 0,"0");
			}
		}
		/* The first digit ends the leading-whitespace run, so that whitespace after the number terminates the parse: */
		done_with_leading_whitespace = TRUE;

		curr_mul *= 0.1;	/* Only has an effect if we're to the right of the DP */
		curr_digit = (uint64)(c - CHAROFFSET);
		ASSERT(HERE, curr_digit < 10,"convert_base10_char_double: curr_digit < 10");
		/* Store 10*currsum in a 128-bit product, so can check for overflow: */
	#ifdef MUL_LOHI64_SUBROUTINE
		MUL_LOHI64((uint64)10,curr_sum,&curr_sum,&hi);
	#else
		MUL_LOHI64((uint64)10,curr_sum, curr_sum, hi);
	#endif
		if(hi != 0)
		{
			fprintf(stderr, "ERROR: Mul-by-10 overflows in convert_base10_char_double: Offending input string = %s\n", char_buf);
			ASSERT(HERE, 0,"0");
		}
		/* The largest multiple of 10 below 2^64 is 2^64 - 6, so adding a digit in 6..9 CAN wrap - check for it: */
		curr_sum += curr_digit;
		if(curr_sum < curr_digit)
		{
			fprintf(stderr, "ERROR: Add-of-digit overflows in convert_base10_char_double: Offending input string = %s\n", char_buf);
			ASSERT(HERE, 0,"0");
		}
	}

	/* If we encountered no DP we simply convert the pure-integer curr_sum to double
	and return that; otherwise we return (double)curr_sum*curr_mul .
	*/
	if(curr_mul == 0.0)
	{
		curr_mul = (double)curr_sum;
	}
	else
	{
		curr_mul *= (double)curr_sum;
	}

	return curr_mul;
}

/********************/
/* Basically a 64-bit version of the <stdlib.h> strtoul function:
Parses an unsigned decimal integer of the form [whitespace]digits from char_buf and
returns its value. No sign or other notation is accepted.

If the value does not fit in 64 bits, or an illegal character is met, an error is
reported through fprintf(stderr)/ASSERT.
Parsing stops at '\0' or at the first whitespace character following the number.
*/
uint64 convert_base10_char_uint64 (const char*char_buf)
{
	uint64 curr_sum = (uint64)0;
	uint32 i;
	int done_with_leading_whitespace = FALSE;
	char c;
	uint64 curr_digit, hi;

	/* Read the decimal digits from the string from left to right,
	skipping any leading whitespace, and stopping if either non-leading
	whitespace or '\0' is encountered:
	*/
	for(i=0; i != 0xffffffff; i++)
	{
		c = char_buf[i];
		/* The <ctype.h> classifiers require an argument representable as unsigned char: */
		if(!isdigit((unsigned char)c))
		{
			if(isspace((unsigned char)c))
			{
				if(done_with_leading_whitespace)
					break;
				else
					continue;
			}

			done_with_leading_whitespace = TRUE;

			if(c == '\n' || c == '\0')
			{
				break;
			}
			else
			{
				fprintf(stderr,"convert_base10_char_uint64: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
				ASSERT(HERE, 0,"0");
			}
		}
		/* The first digit ends the leading-whitespace run, so that whitespace after the number terminates the parse: */
		done_with_leading_whitespace = TRUE;

		curr_digit = (uint64)(c - CHAROFFSET);
		ASSERT(HERE, curr_digit < 10,"convert_base10_char_uint64: curr_digit < 10");
		/* Store 10*currsum in a 128-bit product, so can check for overflow: */
	#ifdef MUL_LOHI64_SUBROUTINE
		MUL_LOHI64((uint64)10,curr_sum,&curr_sum,&hi);
	#else
		MUL_LOHI64((uint64)10,curr_sum, curr_sum, hi);
	#endif
		if(hi != 0)
		{
			fprintf(stderr, "ERROR: Mul-by-10 overflows in convert_base10_char_uint64: Offending input string = %s\n", char_buf);
			ASSERT(HERE, 0,"0");
		}
		/* The largest multiple of 10 below 2^64 is 2^64 - 6, so adding a digit in 6..9 CAN wrap
		(e.g. for the input 18446744073709551619) - check for it: */
		curr_sum += curr_digit;
		if(curr_sum < curr_digit)
		{
			fprintf(stderr, "ERROR: Add-of-digit overflows in convert_base10_char_uint64: Offending input string = %s\n", char_buf);
			ASSERT(HERE, 0,"0");
		}
	}

	return curr_sum;
}

uint96	convert_base10_char_uint96 (const char*char_buf)
{
	uint96 rslt;
	uint128 t128 = convert_base10_char_uint128(char_buf);
	rslt.d0 = t128.d0;
	rslt.d1 = (uint32)t128.d1;
	return rslt;
}

/*
Parses an unsigned decimal integer of the form [whitespace]digits from char_buf and
returns its value as a uint128. No sign or other notation is accepted.

The running value is kept as a little-endian array of up to LEN_MAX 64-bit words, of which
the low (len) are currently in use; each input digit triggers a multiply-by-10 and an
add-of-digit via the mi64 scalar routines, with any carry-out becoming a new high word.
If the value does not fit in 128 bits, or an illegal character is met, an error is
reported through fprintf(stderr)/ASSERT.
Parsing stops at '\0' or at the first whitespace character following the number.
*/
uint128	convert_base10_char_uint128(const char*char_buf)
{
	const uint32 LEN_MAX = 2;
	uint64 curr_sum[2] = {(uint64)0,(uint64)0};
	uint64 tmp = 0;
	uint128 x128;
	uint32 i, len = 1;
	int done_with_leading_whitespace = FALSE;
	char c;
	uint64 curr_digit;

	/* Read the decimal digits from the string from left to right,
	skipping any leading whitespace, and stopping if either non-leading
	whitespace or '\0' is encountered:
	*/
	for(i=0; i != 0xffffffff; i++)
	{
		c = char_buf[i];
		/* The <ctype.h> classifiers require an argument representable as unsigned char: */
		if(!isdigit((unsigned char)c))
		{
			if(isspace((unsigned char)c))
			{
				if(done_with_leading_whitespace)
					break;
				else
					continue;
			}

			done_with_leading_whitespace = TRUE;

			if(c == '\n' || c == '\0')
			{
				break;
			}
			else
			{
				fprintf(stderr,"convert_base10_char_uint128: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
				ASSERT(HERE, 0,"0");
			}
		}
		/* The first digit ends the leading-whitespace run, so that whitespace after the number terminates the parse: */
		done_with_leading_whitespace = TRUE;

		curr_digit = (uint64)(c - CHAROFFSET);
		ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10");
		/* currsum *= 10, and check for overflow: */
		tmp = mi64_mul_scalar(curr_sum, (uint64)10, curr_sum, len);
		if(tmp != 0)
		{
			if(len == LEN_MAX)
			{
				/* No room for another word: fail here, and never write past the end of curr_sum */
				fprintf(stderr, "ERROR: Mul-by-10 overflows in CONVERT_BASE10_CHAR_UINT128: Offending input string = %s\n", char_buf);
				ASSERT(HERE, 0,"len < LEN_MAX");
			}
			else
				curr_sum[len++] = tmp;
		}

		/* currsum += digit. Presumably mi64_add_scalar returns the carry out of the low (len) words
		without storing it (TODO confirm against its definition), so append any carry as a new high
		word ourselves. A carry really can occur: 2^64 - 6 is a multiple of 10: */
		tmp = mi64_add_scalar(curr_sum, curr_digit, curr_sum, len);
		if(tmp != 0)
		{
			if(len == LEN_MAX)
			{
				fprintf(stderr, "ERROR: Add-of-digit overflows in CONVERT_BASE10_CHAR_UINT128: Offending input string = %s\n", char_buf);
				ASSERT(HERE, 0,"len < LEN_MAX");
			}
			else
				curr_sum[len++] = tmp;
		}
	}

	x128.d0 = curr_sum[0];
	x128.d1 = curr_sum[1];
	return x128;
}

/*
Parses an unsigned decimal integer of the form [whitespace]digits from char_buf and
returns its value as a uint192. No sign or other notation is accepted.

The running value is kept as a little-endian array of up to LEN_MAX 64-bit words, of which
the low (len) are currently in use; each input digit triggers a multiply-by-10 and an
add-of-digit via the mi64 scalar routines, with any carry-out becoming a new high word.
If the value does not fit in 192 bits, or an illegal character is met, an error is
reported through fprintf(stderr)/ASSERT.
Parsing stops at '\0' or at the first whitespace character following the number.
*/
uint192	convert_base10_char_uint192(const char*char_buf)
{
	const uint32 LEN_MAX = 3;
	uint64 curr_sum[3] = {(uint64)0,(uint64)0,(uint64)0};
	uint64 tmp = 0;
	uint192 x192;
	uint32 i, len = 1;
	int done_with_leading_whitespace = FALSE;
	char c;
	uint64 curr_digit;

	/* Read the decimal digits from the string from left to right,
	skipping any leading whitespace, and stopping if either non-leading
	whitespace or '\0' is encountered:
	*/
	for(i=0; i != 0xffffffff; i++)
	{
		c = char_buf[i];
		/* The <ctype.h> classifiers require an argument representable as unsigned char: */
		if(!isdigit((unsigned char)c))
		{
			if(isspace((unsigned char)c))
			{
				if(done_with_leading_whitespace)
					break;
				else
					continue;
			}

			done_with_leading_whitespace = TRUE;

			if(c == '\n' || c == '\0')
			{
				break;
			}
			else
			{
				fprintf(stderr,"convert_base10_char_uint192: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
				ASSERT(HERE, 0,"0");
			}
		}
		/* The first digit ends the leading-whitespace run, so that whitespace after the number terminates the parse: */
		done_with_leading_whitespace = TRUE;

		curr_digit = (uint64)(c - CHAROFFSET);
		ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10");
		/* currsum *= 10, and check for overflow: */
		tmp = mi64_mul_scalar(curr_sum, (uint64)10, curr_sum, len);
		if(tmp != 0)
		{
			if(len == LEN_MAX)
			{
				/* No room for another word: fail here, and never write past the end of curr_sum */
				fprintf(stderr, "ERROR: Mul-by-10 overflows in CONVERT_BASE10_CHAR_UINT192: Offending input string = %s\n", char_buf);
				ASSERT(HERE, 0,"len < LEN_MAX");
			}
			else
				curr_sum[len++] = tmp;
		}

		/* currsum += digit. Presumably mi64_add_scalar returns the carry out of the low (len) words
		without storing it (TODO confirm against its definition), so append any carry as a new high
		word ourselves. A carry really can occur: 2^64 - 6 is a multiple of 10: */
		tmp = mi64_add_scalar(curr_sum, curr_digit, curr_sum, len);
		if(tmp != 0)
		{
			if(len == LEN_MAX)
			{
				fprintf(stderr, "ERROR: Add-of-digit overflows in CONVERT_BASE10_CHAR_UINT192: Offending input string = %s\n", char_buf);
				ASSERT(HERE, 0,"len < LEN_MAX");
			}
			else
				curr_sum[len++] = tmp;
		}
	}

	x192.d0 = curr_sum[0];
	x192.d1 = curr_sum[1];
	x192.d2 = curr_sum[2];
	return x192;
}

/*
Parses an unsigned decimal integer of the form [whitespace]digits from char_buf and
returns its value as a uint256. No sign or other notation is accepted.

The running value is kept as a little-endian array of up to LEN_MAX 64-bit words, of which
the low (len) are currently in use; each input digit triggers a multiply-by-10 and an
add-of-digit via the mi64 scalar routines, with any carry-out becoming a new high word.
If the value does not fit in 256 bits, or an illegal character is met, an error is
reported through fprintf(stderr)/ASSERT.
Parsing stops at '\0' or at the first whitespace character following the number.
*/
uint256	convert_base10_char_uint256(const char*char_buf)
{
	const uint32 LEN_MAX = 4;
	uint64 curr_sum[4] = {(uint64)0,(uint64)0,(uint64)0,(uint64)0};
	uint64 tmp = 0;
	uint256 x256;
	uint32 i, len = 1;
	int done_with_leading_whitespace = FALSE;
	char c;
	uint64 curr_digit;

	/* Read the decimal digits from the string from left to right,
	skipping any leading whitespace, and stopping if either non-leading
	whitespace or '\0' is encountered:
	*/
	for(i=0; i != 0xffffffff; i++)
	{
		c = char_buf[i];
		/* The <ctype.h> classifiers require an argument representable as unsigned char: */
		if(!isdigit((unsigned char)c))
		{
			if(isspace((unsigned char)c))
			{
				if(done_with_leading_whitespace)
					break;
				else
					continue;
			}

			done_with_leading_whitespace = TRUE;

			if(c == '\n' || c == '\0')
			{
				break;
			}
			else
			{
				fprintf(stderr,"convert_base10_char_uint256: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
				ASSERT(HERE, 0,"0");
			}
		}
		/* The first digit ends the leading-whitespace run, so that whitespace after the number terminates the parse: */
		done_with_leading_whitespace = TRUE;

		curr_digit = (uint64)(c - CHAROFFSET);
		ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10");
		/* currsum *= 10, and check for overflow: */
		tmp = mi64_mul_scalar(curr_sum, (uint64)10, curr_sum, len);
		if(tmp != 0)
		{
			if(len == LEN_MAX)
			{
				/* No room for another word: fail here, and never write past the end of curr_sum */
				fprintf(stderr, "ERROR: Mul-by-10 overflows in CONVERT_BASE10_CHAR_UINT256: Offending input string = %s\n", char_buf);
				ASSERT(HERE, 0,"len < LEN_MAX");
			}
			else
				curr_sum[len++] = tmp;
		}

		/* currsum += digit. Presumably mi64_add_scalar returns the carry out of the low (len) words
		without storing it (TODO confirm against its definition), so append any carry as a new high
		word ourselves. A carry really can occur: 2^64 - 6 is a multiple of 10: */
		tmp = mi64_add_scalar(curr_sum, curr_digit, curr_sum, len);
		if(tmp != 0)
		{
			if(len == LEN_MAX)
			{
				fprintf(stderr, "ERROR: Add-of-digit overflows in CONVERT_BASE10_CHAR_UINT256: Offending input string = %s\n", char_buf);
				ASSERT(HERE, 0,"len < LEN_MAX");
			}
			else
				curr_sum[len++] = tmp;
		}
	}

	x256.d0 = curr_sum[0];
	x256.d1 = curr_sum[1];
	x256.d2 = curr_sum[2];
	x256.d3 = curr_sum[3];
	return x256;
}

/***********************/

/* Functions for 96/128/160/192/256-bit unsigned integer selected-bit testing: */
/* Applies RSHIFT96 to a by-value copy of __x with shift count __bit, then
returns the lowest bit of the resulting low word d0 (0 or 1) - presumably bit __bit of __x. */
uint64	TEST_BIT96 (uint96 __x, uint32 __bit)
{
	uint64 low_bit;

	/* __x is our own copy, so shifting it in place leaves the caller's operand untouched: */
	RSHIFT96(__x, __bit, __x);
	low_bit = __x.d0 & 1;
	return low_bit;
}

/* Applies RSHIFT128 to a by-value copy of __x with shift count __bit, then
returns the lowest bit of the resulting low word d0 (0 or 1) - presumably bit __bit of __x. */
uint64	TEST_BIT128(uint128 __x, uint32 __bit)
{
	uint64 low_bit;

	/* __x is our own copy, so shifting it in place leaves the caller's operand untouched: */
	RSHIFT128(__x, __bit, __x);
	low_bit = __x.d0 & 1;
	return low_bit;
}

/* Applies RSHIFT160 to a by-value copy of __x with shift count __bit, then
returns the lowest bit of the resulting low word d0 (0 or 1) - presumably bit __bit of __x. */
uint64	TEST_BIT160(uint160 __x, uint32 __bit)
{
	uint64 low_bit;

	/* __x is our own copy, so shifting it in place leaves the caller's operand untouched: */
	RSHIFT160(__x, __bit, __x);
	low_bit = __x.d0 & 1;
	return low_bit;
}

/* Applies RSHIFT192 to a by-value copy of __x with shift count __bit, then
returns the lowest bit of the resulting low word d0 (0 or 1) - presumably bit __bit of __x. */
uint64	TEST_BIT192(uint192 __x, uint32 __bit)
{
	uint64 low_bit;

	/* __x is our own copy, so shifting it in place leaves the caller's operand untouched: */
	RSHIFT192(__x, __bit, __x);
	low_bit = __x.d0 & 1;
	return low_bit;
}

/* Applies RSHIFT256 to a by-value copy of __x with shift count __bit, then
returns the lowest bit of the resulting low word d0 (0 or 1) - presumably bit __bit of __x. */
uint64	TEST_BIT256(uint256 __x, uint32 __bit)
{
	uint64 low_bit;

	/* __x is our own copy, so shifting it in place leaves the caller's operand untouched: */
	RSHIFT256(__x, __bit, __x);
	low_bit = __x.d0 & 1;
	return low_bit;
}

/***********************/

/* Given an IEEE-compliant normalized 64-bit float x, generates an approximate
floating-point inverse accurate to at least (numbits) bits of precision.

An initial guess is assembled directly in the bit pattern of a double - sign copied from
x, exponent derived from that of x, leading mantissa bits taken from the
byte_lookup_finvest table - and is then refined by Newton iterations. As a sanity check
the refined result is compared with the initial guess, and a relative discrepancy of
2e-3 or more triggers an ASSERT.
numbits values above 53 are clamped to 53. Inputs with a zero exponent field (zero and
denormals) trigger an ASSERT. */
double	finvest(double x, uint32 numbits)
{
	/* Used to store MS 8 non-hidden mantissa bits. We'd need to use a 16-bit int
	to allow for the possibility of a carryout (i.e. result = 256) from rounding
	the 9th-most-significant NHB into the upper 8 (which would involve
	additional logic to handle), we instead deal with the issue of rounding
	by assuming the midpoint - e.g. if truncating to the MS 8 NHBs yields
	a certain integer in [0,255], we assume the resulting roundoff error
	is always 0.5, i.e. our precomputed 1/x values are approximations to
	the resulting midpoints. This also avoids our having to treat an input
	of 1.00000000 as a special case, since we munge that to 1.000000001,
	whose inverse is < 1.0: */
	uint32 byteval;
	int ediff;
	uint32 nacc;
	uint64 itmp, mant, exp;
	double ftmp0, ftmp, err_num, err_den;
	/* Reinterpreting the bits through a union is well-defined in C, whereas
	dereferencing a (uint64 *) cast of a double* violates the aliasing rules: */
	union { double d; uint64 u; } pun;

	/* Max. precision is 53 bits: */
	if(numbits > 53)
	{
		numbits = 53;
	}

	/* Unpack double into a uint64: */
	pun.d = x;
	itmp = pun.u;
	/* Separate upper part of the significand from the sign/exponent fields: */
	exp  = (itmp >> 52) & MASK_EXP;
	mant =  itmp        & MASK_MANT;
	/* Make sure number is normalized: */
	ASSERT(HERE, exp != 0,"finvest: denormalized inputs illegal!");

	/* Store most-significant 8 non-hidden bits: */
	byteval = (mant >> 44) & 0x000000ff;

	/* Munge the exponent to get the inverse's exponent: double-precision
	1.0 has exponent 1023 and is its own inverse, so that is the corner case:
	Numbers in (1.0, 2.0) have exp = 1023 and inverses with exp = 1022, but
	1.0 *exactly* has inverse with exp = 1023. However, our approximate-midpoint
	scheme obviates the need for extra logic to handle this case - 1.0 gets
	approximated as 1.0 + {small}: */
	ediff = (int)exp - 1023;
	exp = (uint64)(1022 - ediff);

	/* Now get the approx-inverse byte and stick it into the most-significant
	8 non-hidden bits of the mantissa field: */
	mant = (uint64)byte_lookup_finvest[byteval] << 44;

	/* Reassemble sign, exponent and mantissa fields and reinterpret as a double: */
	itmp = (itmp & MASK_SIGN) + (exp << 52) + mant;
	pun.u = itmp;
	ftmp = pun.d;

	/* Do as many Newton iterations as required - number of correct
	bits approximately doubles each iteration. The iteration we use
	for y = 1/x is
					y_{n+1} = y_n*[2 - x*y_n] ,
	which is nice, as it involves no divisions.
	*/
	/* Starting # of correct bits from table lookup = 8: */
	nacc = 8;
	ftmp0 = ftmp;	/* Save the table-based guess for the sanity check below */
	while(nacc < numbits)
	{
		ftmp = ftmp*(2.0 - x*ftmp);
		nacc += nacc;
	}
	/* Relative difference between refined result and initial guess, as |a-b|/|a+b|: */
	err_num = ftmp - ftmp0;
	err_den = ftmp + ftmp0;
	if(fabs(err_num)/fabs(err_den) >= 2e-3)
	{
		sprintf(cbuf, "finvest: ftmp0 too inaccurate! ftmp = %e, ftmp0 = %e, relerr = %e\n", ftmp, ftmp0,fabs(err_num)/fabs(err_den));
		ASSERT(HERE, 0, cbuf);
	}

	return ftmp;
}

/* Given an IEEE-compliant normalized 64-bit float x, generates an approximate
floating-point inverse square root accurate to at least (numbits) bits of precision.

An initial guess is assembled directly in the bit pattern of a double - sign copied from
x, exponent derived from that of x, leading mantissa bits taken from the
byte_lookup_fisqrtest table - and is then refined by Newton iterations. As a sanity check
the refined result is compared with the initial guess, and a relative discrepancy of
2e-3 or more triggers an ASSERT.
numbits values above 53 are clamped to 53. Inputs with a zero exponent field (zero and
denormals) trigger an ASSERT. */
double	fisqrtest(double x, uint32 numbits)
{
	uint32 byteval;
	int ediff;
	uint32 nacc;
	uint64 itmp, mant, exp;
	double ftmp0, ftmp, err_num, err_den;
	/* Reinterpreting the bits through a union is well-defined in C, whereas
	dereferencing a (uint64 *) cast of a double* violates the aliasing rules: */
	union { double d; uint64 u; } pun;

	/* Max. precision is 53 bits: */
	if(numbits > 53)
	{
		numbits = 53;
	}

	/* Unpack double into a uint64: */
	pun.d = x;
	itmp = pun.u;
	/* Separate upper part of the significand from the sign/exponent fields: */
	exp  = (itmp >> 52) & MASK_EXP;
	mant =  itmp        & MASK_MANT;
	/* Make sure number is normalized: */
	ASSERT(HERE, exp != 0,"fisqrtest: denormalized inputs illegal!");

	/* Store most-significant 9 non-hidden bits - we'll use either all
	or the high 8 of these, depending on the parity of the exponent: */
	byteval = (mant >> 43) & 0x000001ff;

	/* Munge the exponent to get the inverse square root's exponent: double-precision
	1.0 has exponent 1023 and is its own inverse, so that is the corner case:
	Numbers in (1.0, 4.0) have exp in [1023,1024] and ISQRTs with exp = 1022, but
	1.0 *exactly* has ISQRT with exp = 1023. However, our approximate-midpoint
	scheme obviates the need for extra logic to handle this case - 1.0 gets
	approximated as 1.0 + {small}. However, one additional twist in the 1/sqrt
	case is the asymmetry in the handling of ediff: e.g. 2.0 has ediff = +1 but
	maps to 0.707... with exp = 1022 (i.e. we want 1022 - ediff/2 for inputs > 1),
	but e.g. 0.5 has ediff = -1 but maps to 1/sqrt(0.5) = 1.414... with exp = 1023,
	and 0.3 has ediff = -2 and maps to 1/sqrt(0.3) = 1.825... also with exp = 1023,
	and .25 has ediff = -2 and maps to 1/sqrt(.25) = 2.000..., with exp = 1024,
	i.e. we want 1022 - (ediff-1)/2 for inputs < 1.
	*/
	ediff = (int)exp - 1023;	/* 1023 = 0x3ff */
	if(ediff >= 0)
	{
		exp = (uint64)(1022 - ediff/2);
	}
	else
	{
		exp = (uint64)(1022 - (ediff-1)/2);
	}

	/* Since we need to handle mantissas in [1, 4), we differentiate via
	inputs in [1,2) and in [2,4) by examining ediff - if it's even it's the
	former interval and only the high 8 of our 9 bits are used; if odd it's the
	latter and we need to add 2 to the floating version of the mantissa, i.e.
	0x100 to byteval. The table-index rule is the same for either sign of ediff: */
	if(ediff & 0x1)
	{
		byteval += 0x100;	/* I realize "byteval" is a misnomer in this case... */
	}
	else
	{
		byteval >>= 1;
	}

	/* Now get the approx-inverse byte and stick it into the most-significant
	8 non-hidden bits of the mantissa field: */
	mant = (uint64)byte_lookup_fisqrtest[byteval] << 44;

	/* Reassemble sign, exponent and mantissa fields and reinterpret as a double: */
	itmp = (itmp & MASK_SIGN) + (exp << 52) + mant;
	pun.u = itmp;
	ftmp = pun.d;

	/* Do as many Newton iterations as required - number of correct
	bits approximately doubles each iteration. The iteration we use
	for y = 1/sqrt(x) is
					y_{n+1} = y_n*[3 - x*(y_n)^2]/2 ,
	which is nice, as it involves no divisions.
	*/
	/* Starting # of correct bits from table lookup = 8: */
	nacc = 8;
	ftmp0 = ftmp;	/* Save the table-based guess for the sanity check below */
	while(nacc < numbits)
	{
		ftmp = 0.5*ftmp*(3.0 - x*ftmp*ftmp);
		nacc += nacc;
	}
	/* Relative difference between refined result and initial guess, as |a-b|/|a+b|: */
	err_num = ftmp - ftmp0;
	err_den = ftmp + ftmp0;
	if(fabs(err_num)/fabs(err_den) >= 2e-3)
	{
		sprintf(cbuf, "fisqrtest: ftmp0 too inaccurate! ftmp = %e, ftmp0 = %e, relerr = %e\n", ftmp, ftmp0,fabs(err_num)/fabs(err_den));
		ASSERT(HERE, 0, cbuf);
	}

	return ftmp;
}

/*********************** SIMD functionality/cycle-count tests: **********************************/
#ifdef TEST_SIMD

	/* Fixed "random" input data for the TEST_SIMD code: the leading decimal digits of Pi (starting with
	the 3), one digit per array element, so every entry is a small integer in 0-9 stored as a char.
	The table is declared with 1024 entries, which is sufficient for 64 AVX1024-sized vec_dbl elements
	of 16 doubles each (64*16 = 1024). Being const and digit-derived, the data are identical on every run.
	NOTE(review): the routines consuming this table are not visible in this part of the file. */
	const char ran[1024] = {
	3,1,4,1,5,9,2,6,5,3,5,8,9,7,9,3,2,3,8,4,6,2,6,4,3,3,8,3,2,7,9,5,0,2,8,8,4,1,9,7,1,6,9,3,9,9,3,7,5,1,0,5,8,2,0,9,7,4,9,4,4,5,9,2,
	3,0,7,8,1,6,4,0,6,2,8,6,2,0,8,9,9,8,6,2,8,0,3,4,8,2,5,3,4,2,1,1,7,0,6,7,9,8,2,1,4,8,0,8,6,5,1,3,2,8,2,3,0,6,6,4,7,0,9,3,8,4,4,6,
	0,9,5,5,0,5,8,2,2,3,1,7,2,5,3,5,9,4,0,8,1,2,8,4,8,1,1,1,7,4,5,0,2,8,4,1,0,2,7,0,1,9,3,8,5,2,1,1,0,5,5,5,9,6,4,4,6,2,2,9,4,8,9,5,
	4,9,3,0,3,8,1,9,6,4,4,2,8,8,1,0,9,7,5,6,6,5,9,3,3,4,4,6,1,2,8,4,7,5,6,4,8,2,3,3,7,8,6,7,8,3,1,6,5,2,7,1,2,0,1,9,0,9,1,4,5,6,4,8,
	5,6,6,9,2,3,4,6,0,3,4,8,6,1,0,4,5,4,3,2,6,6,4,8,2,1,3,3,9,3,6,0,7,2,6,0,2,4,9,1,4,1,2,7,3,7,2,4,5,8,7,0,0,6,6,0,6,3,1,5,5,8,8,1,
	7,4,8,8,1,5,2,0,9,2,0,9,6,2,8,2,9,2,5,4,0,9,1,7,1,5,3,6,4,3,6,7,8,9,2,5,9,0,3,6,0,0,1,1,3,3,0,5,3,0,5,4,8,8,2,0,4,6,6,5,2,1,3,8,
	4,1,4,6,9,5,1,9,4,1,5,1,1,6,0,9,4,3,3,0,5,7,2,7,0,3,6,5,7,5,9,5,9,1,9,5,3,0,9,2,1,8,6,1,1,7,3,8,1,9,3,2,6,1,1,7,9,3,1,0,5,1,1,8,
	5,4,8,0,7,4,4,6,2,3,7,9,9,6,2,7,4,9,5,6,7,3,5,1,8,8,5,7,5,2,7,2,4,8,9,1,2,2,7,9,3,8,1,8,3,0,1,1,9,4,9,1,2,9,8,3,3,6,7,3,3,6,2,4,
	4,0,6,5,6,6,4,3,0,8,6,0,2,1,3,9,4,9,4,6,3,9,5,2,2,4,7,3,7,1,9,0,7,0,2,1,7,9,8,6,0,9,4,3,7,0,2,7,7,0,5,3,9,2,1,7,1,7,6,2,9,3,1,7,
	6,7,5,2,3,8,4,6,7,4,8,1,8,4,6,7,6,6,9,4,0,5,1,3,2,0,0,0,5,6,8,1,2,7,1,4,5,2,6,3,5,6,0,8,2,7,7,8,5,7,7,1,3,4,2,7,5,7,7,8,9,6,0,9,
	1,7,3,6,3,7,1,7,8,7,2,1,4,6,8,4,4,0,9,0,1,2,2,4,9,5,3,4,3,0,1,4,6,5,4,9,5,8,5,3,7,1,0,5,0,7,9,2,2,7,9,6,8,9,2,5,8,9,2,3,5,4,2,0,
	1,9,9,5,6,1,1,2,1,2,9,0,2,1,9,6,0,8,6,4,0,3,4,4,1,8,1,5,9,8,1,3,6,2,9,7,7,4,7,7,1,3,0,9,9,6,0,5,1,8,7,0,7,2,1,1,3,4,9,9,9,9,9,9,
	8,3,7,2,9,7,8,0,4,9,9,5,1,0,5,9,7,3,1,7,3,2,8,1,6,0,9,6,3,1,8,5,9,5,0,2,4,4,5,9,4,5,5,3,4,6,9,0,8,3,0,2,6,4,2,5,2,2,3,0,8,2,5,3,
	3,4,4,6,8,5,0,3,5,2,6,1,9,3,1,1,8,8,1,7,1,0,1,0,0,0,3,1,3,7,8,3,8,7,5,2,8,8,6,5,8,7,5,3,3,2,0,8,3,8,1,4,2,0,6,1,7,1,7,7,6,6,9,1,
	4,7,3,0,3,5,9,8,2,5,3,4,9,0,4,2,8,7,5,5,4,6,8,7,3,1,1,5,9,5,6,2,8,6,3,8,8,2,3,5,3,7,8,7,5,9,3,7,5,1,9,5,7,7,8,1,8,5,7,7,8,0,5,3,
	2,1,7,1,2,2,6,8,0,6,6,1,3,0,0,1,9,2,7,8,7,6,6,1,1,1,9,5,9,0,9,2,1,6,4,2,0,1,9,8,9,3,8,0,9,5,2,5,7,2,0,1,0,6,5,4,8,5,8,6,3,2,7,8
	};

  #ifdef USE_AVX1024
	/* Placeholder for a 16x16 transpose test, compiled only when USE_AVX1024 is defined.
	No implementation exists yet: the body just invokes ASSERT with a constant-false (0)
	condition and a "not yet supported" message. The 0 return is reached only if ASSERT
	hands control back to the caller - presumably it does not; confirm against the ASSERT
	definition. */
	int	test_simd_transpose_16x16()
	{
		const int retval = 0;	/* Nominal return value for this stub */

		ASSERT(HERE, 0, "function not yet supported!");

		return retval;
	}
  #endif

  #ifdef USE_AVX512
	int	test_simd_transpose_8x8()
	{
		/*...time-related stuff	*/
		double clock1, clock2;
		double tdiff, t0,t1,t2,t3;
		int i,imax = 100000001, row,col, nerr;	// Use 10^8 loop execs in effort to yield timing on order of 1 sec on target CPUs
			// Add 1 to make loop count odd, thus result of (imax) successive transposes equivalent to a single one
		const int dim = 64;	// #elements in our matrix, allocate 2x this to allow for real/imag side-by-side variant
		vec_dbl *mem = 0x0, *data;
		mem = ALLOC_VEC_DBL(mem, 2*dim+4);	// Add 4 pads to allow for alignment on up-to-128-byte boundary
		data = ALIGN_VEC_DBL(mem);	ASSERT(HERE, ((long)data & 0x1f) == 0, "data not 32-byte aligned!");
		// Init the matrix -  Input matrix has rows containing [0-7][8-15]...[56-63]:
		double *dptr = (double *)data;
		for(i = 0; i < dim; i++) { *(dptr+i) = i; }
	//	printf("Input matrix:\n");
		for(i = 0; i < dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
		}
	//	printf("\n");

		// Do timing loop using 2 fundamentally different methods of effecting the transpose, the 2nd of
		// which mimics the data movement surrounding the dyadic-square and carry steps of our FFT-mul:
	  #ifdef USE_IMCI512
		// [1a] Rowwise-load and in-register data shuffles. On KNL: 45 cycles per loop-exec:
		nerr = 0; clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				"movq	%[__data],%%rax \n\t"\
			/* This mov/kmov/knot sequence saves ~4% overall-macro-runtime vs movl/kmov of 5 separate bitstrings into k1-k5 */\
			"movl $0b10101010,%%ebx	\n\t movl $0b11001100,%%ecx	\n\t movl $0b11110000,%%edx	\n\t"\
			"kmov	%%ebx,%%k1		\n\t kmov	%%ecx,%%k3		\n\t kmov	%%edx,%%k5		\n\t"\
			"knot	%%k1 ,%%k2		\n\t knot	%%k3 ,%%k4		\n\t"\
				/* Read in the 8 rows of our input matrix: */\
				"vmovaps	0x000(%%rax),%%zmm0	\n\t"/* zmm0 = 00 01 02 03 04 05 06 07 */\
				"vmovaps	0x040(%%rax),%%zmm1	\n\t"/* zmm1 = 10 11 12 13 14 15 16 17 */\
				"vmovaps	0x080(%%rax),%%zmm2	\n\t"/* zmm2 = 20 21 22 23 24 25 26 27 */\
				"vmovaps	0x0c0(%%rax),%%zmm3	\n\t"/* zmm3 = 30 31 32 33 34 35 36 37 */\
				"vmovaps	0x100(%%rax),%%zmm4	\n\t"/* zmm4 = 40 41 42 43 44 45 46 47 */\
				"vmovaps	0x140(%%rax),%%zmm5	\n\t"/* zmm5 = 50 51 52 53 54 55 56 57 */\
				"vmovaps	0x180(%%rax),%%zmm6	\n\t"/* zmm6 = 60 61 62 63 64 65 66 67 */\
				"vmovaps	0x1c0(%%rax),%%zmm7	\n\t"/* zmm7 = 70 71 72 73 74 75 76 77 */\

				"vblendmpd		%%zmm1%{cdab%},%%zmm0,%%zmm8%{%%k1%}	\n\t"/* zmm8 = {0, 8, 2, 10, 4, 12, 6, 14} */\
				"vblendmpd		%%zmm0%{cdab%},%%zmm1,%%zmm1%{%%k2%}	\n\t"/* zmm1 = {1, 9, 3, 11, 5, 13, 7, 15} */\
				"vblendmpd		%%zmm3%{cdab%},%%zmm2,%%zmm0%{%%k1%}	\n\t"/* zmm0 = {16, 24, 18, 26, 20, 28, 22, 30 */\
				"vblendmpd		%%zmm2%{cdab%},%%zmm3,%%zmm3%{%%k2%}	\n\t"/* zmm3 =  */\
				"vblendmpd		%%zmm5%{cdab%},%%zmm4,%%zmm2%{%%k1%}	\n\t"/* zmm2 =  */\
				"vblendmpd		%%zmm4%{cdab%},%%zmm5,%%zmm5%{%%k2%}	\n\t"/* zmm5 =  */\
				"vblendmpd		%%zmm7%{cdab%},%%zmm6,%%zmm4%{%%k1%}	\n\t"/* zmm4 =  */\
				"vblendmpd		%%zmm6%{cdab%},%%zmm7,%%zmm7%{%%k2%}	\n\t"/* zmm7 =  */\

				"vblendmpd		%%zmm0%{badc%},%%zmm8,%%zmm6%{%%k3%}	\n\t"\
				"vblendmpd		%%zmm8%{badc%},%%zmm0,%%zmm0%{%%k4%}	\n\t"\
				"vblendmpd		%%zmm3%{badc%},%%zmm1,%%zmm8%{%%k3%}	\n\t"\
				"vblendmpd		%%zmm1%{badc%},%%zmm3,%%zmm3%{%%k4%}	\n\t"\
				"vblendmpd		%%zmm4%{badc%},%%zmm2,%%zmm1%{%%k3%}	\n\t"\
				"vblendmpd		%%zmm2%{badc%},%%zmm4,%%zmm4%{%%k4%}	\n\t"\
				"vblendmpd		%%zmm7%{badc%},%%zmm5,%%zmm2%{%%k3%}	\n\t"\
				"vblendmpd		%%zmm5%{badc%},%%zmm7,%%zmm7%{%%k4%}	\n\t"\

				"vpermf32x4	$78, %%zmm1,%%zmm1		\n\t"\
				"vpermf32x4	$78, %%zmm2,%%zmm2		\n\t"\
				"vpermf32x4	$78, %%zmm4,%%zmm4		\n\t"\
				"vpermf32x4	$78, %%zmm7,%%zmm7		\n\t"\

				"vblendmpd		%%zmm1,%%zmm6,%%zmm5%{%%k5%}	\n\t"\
				"vblendmpd		%%zmm6,%%zmm1,%%zmm1%{%%k5%}		\n\t"\
				"vblendmpd		%%zmm2,%%zmm8,%%zmm6%{%%k5%}		\n\t"\
				"vblendmpd		%%zmm8,%%zmm2,%%zmm2%{%%k5%}		\n\t"\
				"vblendmpd		%%zmm4,%%zmm0,%%zmm8%{%%k5%}		\n\t"\
				"vblendmpd		%%zmm0,%%zmm4,%%zmm4%{%%k5%}		\n\t"\
				"vblendmpd		%%zmm7,%%zmm3,%%zmm0%{%%k5%}		\n\t"\
				"vblendmpd		%%zmm3,%%zmm7,%%zmm7%{%%k5%}		\n\t"\

				"vpermf32x4	$78, %%zmm1,%%zmm1		\n\t"\
				"vpermf32x4	$78, %%zmm2,%%zmm2		\n\t"\
				"vpermf32x4	$78, %%zmm4,%%zmm4		\n\t"\
				"vpermf32x4	$78, %%zmm7,%%zmm7		\n\t"\

				"vmovaps	%%zmm5,0x000(%%rax)	\n\t"\
				"vmovaps	%%zmm6,0x040(%%rax)	\n\t"\
				"vmovaps	%%zmm8,0x080(%%rax)	\n\t"\
				"vmovaps	%%zmm0,0x0c0(%%rax)	\n\t"\
				"vmovaps	%%zmm1,0x100(%%rax)	\n\t"\
				"vmovaps	%%zmm2,0x140(%%rax)	\n\t"\
				"vmovaps	%%zmm4,0x180(%%rax)	\n\t"\
				"vmovaps	%%zmm7,0x1c0(%%rax)	\n\t"\
				:				// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8"
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [1a]: Time for %u 8x8 doubles-transposes using in-register shuffles =%s\n",imax, get_time_str(tdiff));
		// Check the result:
	//	printf("Output matrix:\n");
		for(i = 0; i < dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = row + 8*col
			t0 = row; t1 = row+8; t2 = row+16; t3 = row+24;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
		if(nerr) printf("Outputs incorrect! #mismatches = %u\n",nerr);

	  #else	// AVX-512 version:

		// [1a] Rowwise-load and in-register data shuffles. On KNL: 45 cycles per loop-exec:
		nerr = 0; clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				"movq		%[__data],%%rax		\n\t"\
				/* Read in the 8 rows of our input matrix: */\
				"vmovaps		0x000(%%rax),%%zmm0		\n\t"\
				"vmovaps		0x040(%%rax),%%zmm1		\n\t"\
				"vmovaps		0x080(%%rax),%%zmm2		\n\t"\
				"vmovaps		0x0c0(%%rax),%%zmm3		\n\t"\
				"vmovaps		0x100(%%rax),%%zmm4		\n\t"\
				"vmovaps		0x140(%%rax),%%zmm5		\n\t"\
				"vmovaps		0x180(%%rax),%%zmm6		\n\t"\
				"vmovaps		0x1c0(%%rax),%%zmm7		\n\t"\
				/* Transpose uses regs0-7 for data, reg8 for temp: */\
				/* [1] First step is a quartet of [UNPCKLPD,UNPCKHPD] pairs to effect transposed 2x2 submatrices - */\
				/* indices in comments at right are [row,col] pairs, i.e. octal version of linear array indices: */
				"vunpcklpd		%%zmm1,%%zmm0,%%zmm8	\n\t"/* zmm8 = 00 10 02 12 04 14 06 16 */\
				"vunpckhpd		%%zmm1,%%zmm0,%%zmm1	\n\t"/* zmm1 = 01 11 03 13 05 15 07 17 */\
				"vunpcklpd		%%zmm3,%%zmm2,%%zmm0	\n\t"/* zmm0 = 20 30 22 32 24 34 26 36 */\
				"vunpckhpd		%%zmm3,%%zmm2,%%zmm3	\n\t"/* zmm3 = 21 31 23 33 25 35 27 37 */\
				"vunpcklpd		%%zmm5,%%zmm4,%%zmm2	\n\t"/* zmm2 = 40 50 42 52 44 54 46 56 */\
				"vunpckhpd		%%zmm5,%%zmm4,%%zmm5	\n\t"/* zmm5 = 41 51 43 53 45 55 47 57 */\
				"vunpcklpd		%%zmm7,%%zmm6,%%zmm4	\n\t"/* zmm4 = 60 70 62 72 64 74 66 76 */\
				"vunpckhpd		%%zmm7,%%zmm6,%%zmm7	\n\t"/* zmm7 = 61 71 63 73 65 75 67 77 */\
			/**** Getting rid of reg-index-nicifying copies here means Outputs not in 0-7 but in 8,1,0,3,2,5,4,7, with 6 now free ****/\
				/* [2] 1st layer of VSHUFF64x2, 2 outputs each with trailing index pairs [0,4],[1,5],[2,6],[3,7]. */\
				/* Note the imm8 values expressed in terms of 2-bit index subfields again read right-to-left */\
				/* (as for the SHUFPS imm8 values in the AVX 8x8 float code) are 221 = (3,1,3,1) and 136 = (2,0,2,0): */\
				"vshuff64x2	$136,%%zmm0,%%zmm8,%%zmm6	\n\t"/* zmm6 = 00 10 04 14 20 30 24 34 */\
				"vshuff64x2	$221,%%zmm0,%%zmm8,%%zmm0	\n\t"/* zmm0 = 02 12 06 16 22 32 26 36 */\
				"vshuff64x2	$136,%%zmm3,%%zmm1,%%zmm8	\n\t"/* zmm8 = 01 11 05 15 21 31 25 35 */\
				"vshuff64x2	$221,%%zmm3,%%zmm1,%%zmm3	\n\t"/* zmm3 = 03 13 07 17 23 33 27 37 */\
				"vshuff64x2	$136,%%zmm4,%%zmm2,%%zmm1	\n\t"/* zmm1 = 40 50 44 54 60 70 64 74 */\
				"vshuff64x2	$221,%%zmm4,%%zmm2,%%zmm4	\n\t"/* zmm4 = 42 52 46 56 62 72 66 76 */\
				"vshuff64x2	$136,%%zmm7,%%zmm5,%%zmm2	\n\t"/* zmm2 = 41 51 45 55 61 71 65 75 */\
				"vshuff64x2	$221,%%zmm7,%%zmm5,%%zmm7	\n\t"/* zmm7 = 43 53 47 57 63 73 67 77 */\
			/**** Getting rid of reg-index-nicifying copies here means Outputs 8,1,2,5 -> 6,8,1,2, with 5 now free ***/\
				/* [3] Last step in 2nd layer of VSHUFF64x2, now combining reg-pairs sharing same trailing index pairs. */\
				/* Output register indices reflect trailing index of data contained therein: */\
				"vshuff64x2	$136,%%zmm1,%%zmm6,%%zmm5	\n\t"/* zmm5 = 00 10 20 30 40 50 60 70 [row 0 of transpose-matrix] */\
				"vshuff64x2	$221,%%zmm1,%%zmm6,%%zmm1	\n\t"/* zmm1 = 04 14 24 34 44 54 64 74 [row 4 of transpose-matrix] */\
				"vshuff64x2	$136,%%zmm2,%%zmm8,%%zmm6	\n\t"/* zmm6 = 01 11 21 31 41 51 61 71 [row 1 of transpose-matrix] */\
				"vshuff64x2	$221,%%zmm2,%%zmm8,%%zmm2	\n\t"/* zmm2 = 05 15 25 35 45 55 65 75 [row 5 of transpose-matrix] */\
				"vshuff64x2	$136,%%zmm4,%%zmm0,%%zmm8	\n\t"/* zmm8 = 02 12 22 32 42 52 62 72 [row 2 of transpose-matrix] */\
				"vshuff64x2	$221,%%zmm4,%%zmm0,%%zmm4	\n\t"/* zmm4 = 06 16 26 36 46 56 66 76 [row 6 of transpose-matrix] */\
				"vshuff64x2	$136,%%zmm7,%%zmm3,%%zmm0	\n\t"/* zmm0 = 03 13 23 33 43 53 63 73 [row 3 of transpose-matrix] */\
				"vshuff64x2	$221,%%zmm7,%%zmm3,%%zmm7	\n\t"/* zmm7 = 07 17 27 37 47 57 67 77 [row 7 of transpose-matrix] */\
			/**** Getting rid of reg-index-nicifying copies here means Outputs 6,8,0,3 -> 5,6,8,0 with 3 now free ***/\
				/* Write original columns back as rows: */\
				"vmovaps		%%zmm5,0x000(%%rax)		\n\t"\
				"vmovaps		%%zmm6,0x040(%%rax)		\n\t"\
				"vmovaps		%%zmm8,0x080(%%rax)		\n\t"\
				"vmovaps		%%zmm0,0x0c0(%%rax)		\n\t"\
				"vmovaps		%%zmm1,0x100(%%rax)		\n\t"\
				"vmovaps		%%zmm2,0x140(%%rax)		\n\t"\
				"vmovaps		%%zmm4,0x180(%%rax)		\n\t"\
				"vmovaps		%%zmm7,0x1c0(%%rax)		\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [1a]: Time for %u 8x8 doubles-transposes using in-register shuffles =%s\n",imax, get_time_str(tdiff));
		// Check the result:
	//	printf("Output matrix:\n");
		for(i = 0; i < dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = row + 8*col
			t0 = row; t1 = row+8; t2 = row+16; t3 = row+24;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
		if(nerr) printf("Outputs incorrect! #mismatches = %u\n",nerr);

		// [1b] Same as [1a] but with a few reg-copies to make for a nicer indexing pattern. On KNL: 48 cycles per loop-exec:
		for(i = 0; i < dim; i++) { *(dptr+i) = i; }	// Re-init the matrix to be untransposed
		nerr = 0; clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				"movq		%[__data],%%rax		\n\t"\
				/* Read in the 8 rows of our input matrix: */\
				"vmovaps		0x000(%%rax),%%zmm0		\n\t"\
				"vmovaps		0x040(%%rax),%%zmm1		\n\t"\
				"vmovaps		0x080(%%rax),%%zmm2		\n\t"\
				"vmovaps		0x0c0(%%rax),%%zmm3		\n\t"\
				"vmovaps		0x100(%%rax),%%zmm4		\n\t"\
				"vmovaps		0x140(%%rax),%%zmm5		\n\t"\
				"vmovaps		0x180(%%rax),%%zmm6		\n\t"\
				"vmovaps		0x1c0(%%rax),%%zmm7		\n\t"\
				/* [1] First step is a quartet of [UNPCKLPD,UNPCKHPD] pairs to effect transposed 2x2 submatrices - VUNPCK latency 4-7, rthru = 2: */\
				"vunpckhpd		 %%zmm1,%%zmm0,%%zmm8									\n\t"/* zmm0 = 00 10 02 12 04 14 06 16 [after reg-copy on next line] */\
				"vunpcklpd		 %%zmm1,%%zmm0,%%zmm0 	\n\t	vmovaps	%%zmm8,%%zmm1 	\n\t"/* zmm1 = 01 11 03 13 05 15 07 17 */\
				"vunpckhpd		 %%zmm3,%%zmm2,%%zmm8									\n\t"/* zmm2 = 20 30 22 32 24 34 26 36 */\
				"vunpcklpd		 %%zmm3,%%zmm2,%%zmm2 	\n\t	vmovaps	%%zmm8,%%zmm3 	\n\t"/* zmm3 = 21 31 23 33 25 35 27 37 */\
				"vunpckhpd		 %%zmm5,%%zmm4,%%zmm8									\n\t"/* zmm4 = 40 50 42 52 44 54 46 56 */\
				"vunpcklpd		 %%zmm5,%%zmm4,%%zmm4 	\n\t	vmovaps	%%zmm8,%%zmm5	\n\t"/* zmm5 = 41 51 43 53 45 55 47 57 */\
				"vunpckhpd		 %%zmm7,%%zmm6,%%zmm8									\n\t"/* zmm6 = 60 70 62 72 64 74 66 76 */\
				"vunpcklpd		 %%zmm7,%%zmm6,%%zmm6	\n\t	vmovaps	%%zmm8,%%zmm7	\n\t"/* zmm7 = 61 71 63 73 65 75 67 77 */\
				/* [2] 1st layer of VSHUFF64x2, 2 outputs each with trailing index pairs [0,4],[1,5],[2,6],[3,7] - VSHUFF64x2 latency 4-7, rthru = 2: */\
				"vshuff64x2	$136,%%zmm2,%%zmm0,%%zmm8									\n\t"/* zmm0 = 00 10 04 14 20 30 24 34 */\
				"vshuff64x2	$221,%%zmm2,%%zmm0,%%zmm2 	\n\t	vmovaps	%%zmm8,%%zmm0 	\n\t"/* zmm2 = 02 12 06 16 22 32 26 36 */\
				"vshuff64x2	$136,%%zmm3,%%zmm1,%%zmm8									\n\t"/* zmm1 = 01 11 05 15 21 31 25 35 */\
				"vshuff64x2	$221,%%zmm3,%%zmm1,%%zmm3 	\n\t	vmovaps	%%zmm8,%%zmm1 	\n\t"/* zmm3 = 03 13 07 17 23 33 27 37 */\
				"vshuff64x2	$136,%%zmm6,%%zmm4,%%zmm8									\n\t"/* zmm4 = 40 50 44 54 60 70 64 74 */\
				"vshuff64x2	$221,%%zmm6,%%zmm4,%%zmm6	\n\t	vmovaps	%%zmm8,%%zmm4 	\n\t"/* zmm6 = 42 52 46 56 62 72 66 76 */\
				"vshuff64x2	$136,%%zmm7,%%zmm5,%%zmm8									\n\t"/* zmm5 = 41 51 45 55 61 71 65 75 */\
				"vshuff64x2	$221,%%zmm7,%%zmm5,%%zmm7	\n\t	vmovaps	%%zmm8,%%zmm5	\n\t"/* zmm7 = 43 53 47 57 63 73 67 77 */\
				/* [3] Last step in 2nd layer of VSHUFF64x2, now combining reg-pairs sharing same trailing index pairs: */\
				"vshuff64x2	$136,%%zmm4,%%zmm0,%%zmm8									\n\t"/* zmm0 = 00 10 20 30 40 50 60 70 */\
				"vshuff64x2	$221,%%zmm4,%%zmm0,%%zmm4 	\n\t	vmovaps	%%zmm8,%%zmm0 	\n\t"/* zmm4 = 04 14 24 34 44 54 64 74 */\
				"vshuff64x2	$136,%%zmm5,%%zmm1,%%zmm8									\n\t"/* zmm1 = 01 11 21 31 41 51 61 71 */\
				"vshuff64x2	$221,%%zmm5,%%zmm1,%%zmm5	\n\t	vmovaps	%%zmm8,%%zmm1 	\n\t"/* zmm5 = 05 15 25 35 45 55 65 75 */\
				"vshuff64x2	$136,%%zmm6,%%zmm2,%%zmm8									\n\t"/* zmm2 = 02 12 22 32 42 52 62 72 */\
				"vshuff64x2	$221,%%zmm6,%%zmm2,%%zmm6	\n\t	vmovaps	%%zmm8,%%zmm2 	\n\t"/* zmm6 = 06 16 26 36 46 56 66 76 */\
				"vshuff64x2	$136,%%zmm7,%%zmm3,%%zmm8									\n\t"/* zmm3 = 03 13 23 33 43 53 63 73 */\
				"vshuff64x2	$221,%%zmm7,%%zmm3,%%zmm7	\n\t	vmovaps	%%zmm8,%%zmm3 	\n\t"/* zmm7 = 07 17 27 37 47 57 67 77 */\
				/* Write original columns back as rows: */\
				"vmovaps		%%zmm0,0x000(%%rax)		\n\t"\
				"vmovaps		%%zmm1,0x040(%%rax)		\n\t"\
				"vmovaps		%%zmm2,0x080(%%rax)		\n\t"\
				"vmovaps		%%zmm3,0x0c0(%%rax)		\n\t"\
				"vmovaps		%%zmm4,0x100(%%rax)		\n\t"\
				"vmovaps		%%zmm5,0x140(%%rax)		\n\t"\
				"vmovaps		%%zmm6,0x180(%%rax)		\n\t"\
				"vmovaps		%%zmm7,0x1c0(%%rax)		\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [1b]: Time for %u 8x8 doubles-transposes using in-register shuffles =%s\n",imax, get_time_str(tdiff));
		// Check the result:
	//	printf("Output matrix:\n");
		for(i = 0; i < dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = row + 8*col
			t0 = row; t1 = row+8; t2 = row+16; t3 = row+24;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
		if(nerr) printf("Outputs incorrect! #mismatches = %u\n",nerr);

		/* [1c; Apr 2018] Variant which GeorgeW says saves a few cycles on his Skylake-X:
			"The idea is to use vbroadcastf64x4 to do the 256-bit shuffles. This is more
			load uops, but the masking can be done on either port 0 or port 5 which is better
			than the vshuff64x2 it replaces which can only be done on port 5."
		George says this saves a few cycles over the above on Skylake-X, but in my test on KNL it's even slower
		than gather-based variant [2] - seems Agner Fog's 5-cycle latency for vbroadcastf64x4 on KNL is way low.
		KNL: 68 cycles, over 1.5x the 44 cycles for my best 24-shuffle version [1b], and 20% slower than [2]'s 56.
		With shuffle-passes 2 and 3 deleted (i.e. just the load, vbroadcastf64x4 and store steps), get 62 cycles,
		which means vbroadcastf64x4 is roughly as slow as vgatherdpd[!], which differs markedly from the 5-cycle-latency,
		2-per-cycle throughput listed for vbroadcastf64x4-on-KNL in Agner Fog's x86 instruction tables compilation.
		That, coupled with the middle-2-col-pairs-swapped-versus-transpose nature of the output makes it a no-go for me.
		On KNL: 62 cycles per loop-exec:
		*/
		for(i = 0; i < dim; i++) { *(dptr+i) = i; }	// Re-init the matrix to be untransposed
		nerr = 0; clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				/* Init opmasks k1 and k2 [need only low byte of each]: */\
				"movl	$0xf0,%%eax	\n\t	kmovw	%%eax,%%k1	\n\t"\
				"movl	$0x0f,%%eax	\n\t	kmovw	%%eax,%%k2	\n\t"\
				"movq		%[__data],%%rax		\n\t"\
				/* Read in the 8 rows of our input matrix: */\
				"vmovaps		0x000(%%rax),%%zmm2		\n\t"\
				"vmovaps		0x040(%%rax),%%zmm4		\n\t"\
				"vmovaps		0x080(%%rax),%%zmm5		\n\t"\
				"vmovaps		0x0c0(%%rax),%%zmm3		\n\t"\
				"vmovaps		0x100(%%rax),%%zmm6		\n\t"\
				"vmovaps		0x140(%%rax),%%zmm8		\n\t"\
				"vmovaps		0x180(%%rax),%%zmm0		\n\t"\
				"vmovaps		0x1c0(%%rax),%%zmm7		\n\t"\
				/* [1] Interleave lo/hi halves of rows 0-3 with those of rows 4-7, respectively: */\
				"vbroadcastf64x4	0x100(%%rax),%%zmm2%{%%k1%}	\n\t"/* zmm2 = 00 01 02 03 40 41 42 43 */\
				"vbroadcastf64x4	0x140(%%rax),%%zmm4%{%%k1%}	\n\t"/* zmm4 = 10 11 12 13 50 51 52 53 */\
				"vbroadcastf64x4	0x180(%%rax),%%zmm5%{%%k1%}	\n\t"/* zmm5 = 20 21 22 23 60 61 62 63 */\
				"vbroadcastf64x4	0x1c0(%%rax),%%zmm3%{%%k1%}	\n\t"/* zmm3 = 30 31 32 33 70 71 72 73 */\
				"vbroadcastf64x4	0x020(%%rax),%%zmm6%{%%k2%}	\n\t"/* zmm6 = 04 05 06 07 44 45 46 47 */\
				"vbroadcastf64x4	0x060(%%rax),%%zmm8%{%%k2%}	\n\t"/* zmm8 = 14 15 16 17 54 55 56 57 */\
				"vbroadcastf64x4	0x0a0(%%rax),%%zmm0%{%%k2%}	\n\t"/* zmm0 = 24 25 26 27 64 65 66 67 */\
				"vbroadcastf64x4	0x0e0(%%rax),%%zmm7%{%%k2%}	\n\t"/* zmm7 = 34 35 36 37 74 75 76 77 */\
			/* Now a simple quartet of 4x4 transposes on the resulting four 4x4 submatrices suffices to give the
			desired 8x8 transpose, BUT! - the 4x4 AVX transpose code uses a set of vshufpd (just as our step [2] below)
			followed by a step based on vperm2f128, and there is no 512-bit version of the latter instruction. */\
				/* [2] Use 8 VSHUFPD to effect transposes of the eight 2x2 submatrices: */\
				"vshufpd	$0x00,%%zmm4,%%zmm2,%%zmm1	\n\t"/* zmm1 = 00 10 02 12 40 50 42 52 */\
				"vshufpd	$0xff,%%zmm4,%%zmm2,%%zmm4	\n\t"/* zmm4 = 01 11 03 13 41 51 43 53 */\
				"vshufpd	$0x00,%%zmm3,%%zmm5,%%zmm2	\n\t"/* zmm2 = 20 30 22 32 60 70 62 72 */\
				"vshufpd	$0xff,%%zmm3,%%zmm5,%%zmm3	\n\t"/* zmm3 = 21 31 23 33 61 71 63 73 */\
				"vshufpd	$0x00,%%zmm8,%%zmm6,%%zmm5	\n\t"/* zmm5 = 04 14 06 16 44 54 46 56 */\
				"vshufpd	$0xff,%%zmm8,%%zmm6,%%zmm8	\n\t"/* zmm8 = 05 15 07 17 45 55 47 57 */\
				"vshufpd	$0x00,%%zmm7,%%zmm0,%%zmm6	\n\t"/* zmm6 = 24 34 26 36 64 74 66 76 */\
				"vshufpd	$0xff,%%zmm7,%%zmm0,%%zmm7	\n\t"/* zmm7 = 25 35 27 37 65 75 67 77 */\
				/* [3] Last step is layer of VSHUFF64x2, now combining reg-pairs sharing same trailing index pairs. */\
				/* Note the imm8 values expressed in terms of 2-bit index subfields again read right-to-left (as for the SHUFPS imm7 */\
				/* values in the AVX 8x8 float code) are 0x88 = (2,0,2,0), 0xdd = (3,1,3,1), 0x22 = (0,2,0,2) and 0x77 = (1,3,1,3): */\
				/* Output register indices reflect trailing index of data contained therein: */\
													/****** Output col-pairs [4,5],[2,3] swapped! ******/\
				"vshuff64x2	$0x88,%%zmm2,%%zmm1,%%zmm0	\n\t"/* zmm0 = 00 10 40 50 20 30 60 70 [row 0 of transpose-matrix] */\
				"vshuff64x2	$0xdd,%%zmm2,%%zmm1,%%zmm2	\n\t"/* zmm2 = 02 12 42 52 22 32 62 72 [row 2 of transpose-matrix] */\
				"vshuff64x2	$0x88,%%zmm3,%%zmm4,%%zmm1	\n\t"/* zmm1 = 01 11 41 51 21 31 61 71 [row 1 of transpose-matrix] */\
				"vshuff64x2	$0xdd,%%zmm3,%%zmm4,%%zmm3	\n\t"/* zmm3 = 03 13 43 53 23 33 63 73 [row 3 of transpose-matrix] */\
				"vshuff64x2	$0x88,%%zmm6,%%zmm5,%%zmm4	\n\t"/* zmm4 = 04 14 44 54 24 34 64 74 [row 4 of transpose-matrix] */\
				"vshuff64x2	$0xdd,%%zmm6,%%zmm5,%%zmm6	\n\t"/* zmm6 = 06 16 46 56 26 36 66 76 [row 6 of transpose-matrix] */\
				"vshuff64x2	$0x88,%%zmm7,%%zmm8,%%zmm5	\n\t"/* zmm5 = 05 15 45 55 25 35 65 75 [row 5 of transpose-matrix] */\
				"vshuff64x2	$0xdd,%%zmm7,%%zmm8,%%zmm7	\n\t"/* zmm7 = 07 17 47 57 27 37 67 77 [row 7 of transpose-matrix] */\
				/* Write original columns back as rows: */\
				"vmovaps		%%zmm0,0x000(%%rax)		\n\t"\
				"vmovaps		%%zmm1,0x040(%%rax)		\n\t"\
				"vmovaps		%%zmm2,0x080(%%rax)		\n\t"\
				"vmovaps		%%zmm3,0x0c0(%%rax)		\n\t"\
				"vmovaps		%%zmm4,0x100(%%rax)		\n\t"\
				"vmovaps		%%zmm5,0x140(%%rax)		\n\t"\
				"vmovaps		%%zmm6,0x180(%%rax)		\n\t"\
				"vmovaps		%%zmm7,0x1c0(%%rax)		\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		// We know this version does not produce a true transpose, so just check timing:
		printf("Method [1c]: Time for %u MIDDLE-COL-PAIR-SWAPPED 8x8 doubles-transposes using in-register shuffles =%s\n",imax, get_time_str(tdiff));

		// [1d] Skylake-X-oriented variant from George. On KNL: 40 cycles per loop-exec, ~15% faster than [1a]:
		for(i = 0; i < dim; i++) { *(dptr+i) = i; }	// Re-init the matrix to be untransposed
		nerr = 0; clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				"movl	$0b00110011,%%eax	\n\t"/* Constant for vblendmpd instructions goes into mask-reg k1 */\
				"kmovw	%%eax,%%k1			\n\t"\
				/* Init vector index-consts needed by vpermt2pd instructions - if regs were at a premium,
				could also init just prior to [3] and use zmm6,7 to hold index-consts: */\
				"movq	$0x0c040e0608000a02,%%rax	\n\t"/* zmm30 = 8+4 0+4 8+6 0+6 8+0 0+0 8+2 0+2 [msw at left] */\
				"movq	$0x0d050f0709010b03,%%rbx	\n\t"/* zmm31 = 8+5 0+5 8+7 0+7 8+1 0+1 8+3 0+3 */\
					"vmovq		%%rax,%%xmm0 		\n\t"\
					"vmovq		%%rbx,%%xmm1 		\n\t"\
					"vpmovzxbq	%%xmm0,%%zmm30		\n\t"\
					"vpmovzxbq	%%xmm1,%%zmm31		\n\t"\
				"movq		%[__data],%%rax		\n\t"\
				/* Read in the 8 rows of our input matrix: */\
				"vmovaps		0x000(%%rax),%%zmm0		\n\t"\
				"vmovaps		0x040(%%rax),%%zmm1		\n\t"\
				"vmovaps		0x080(%%rax),%%zmm2		\n\t"\
				"vmovaps		0x0c0(%%rax),%%zmm3		\n\t"\
				"vmovaps		0x100(%%rax),%%zmm4		\n\t"\
				"vmovaps		0x140(%%rax),%%zmm5		\n\t"\
				"vmovaps		0x180(%%rax),%%zmm6		\n\t"\
				"vmovaps		0x1c0(%%rax),%%zmm7		\n\t"\
				/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
				"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t"/* 00 01 02 03 40 41 42 43 */\
				"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t"/* 04 05 06 07 44 45 46 47 */\
				"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t"/* 10 11 12 13 50 51 52 53 */\
				"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t"/* 14 15 16 17 54 55 56 57 */\
				"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t"/* 22 23 20 21 62 63 60 61 */\
				"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t"/* 26 27 24 25 66 67 64 65 */\
				"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t"/* 32 33 30 31 72 73 70 71 */\
				"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t"/* 36 37 34 35 76 77 74 75 *//* data in 4-11; 0-3 free */\
				/* [2] Blend in the 2-aparts */\
				"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t"/* 00 01 20 21 40 41 60 61 */\
				"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t"/* 22 23 02 03 62 63 42 43 */\
				"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t"/* 04 05 24 25 44 45 64 65 */\
				"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t"/* 26 27 06 07 66 67 46 47 */\
				"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t"/* 10 11 30 31 50 51 70 71 */\
				"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t"/* 32 33 12 13 72 73 52 53 */\
				"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t"/* 14 15 34 35 54 55 74 75 */\
				"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t"/* 36 37 16 17 76 77 56 57 *//* data in 0-5,8-9; 6-7,10-11 free */\
				/* [3] Shuffle or permute in the 1-aparts */\
				"vshufpd	$0b00000000,%%zmm2,	%%zmm0,%%zmm10 	\n\t"/* 00 10 20 30 40 50 60 70 */\
				"vshufpd	$0b11111111,%%zmm2,	%%zmm0,%%zmm11 	\n\t"/* 01 11 21 31 41 51 61 71 */\
				"vmovapd	%%zmm8,%%zmm2	\n\t"\
				"vpermt2pd				%%zmm9,	%%zmm30,%%zmm2 	\n\t"/* 02 12 22 32 42 52 62 72 */\
				"vpermt2pd				%%zmm9,	%%zmm31,%%zmm8	\n\t"/* 03 13 23 33 43 53 63 73 */\
				"vshufpd	$0b00000000,%%zmm3,	%%zmm1,%%zmm0 	\n\t"/* 04 14 24 34 44 54 64 74 */\
				"vshufpd	$0b11111111,%%zmm3,	%%zmm1,%%zmm1 	\n\t"/* 05 15 25 35 45 55 65 75 */\
				"vmovapd	%%zmm4,%%zmm3	\n\t"\
				"vpermt2pd				%%zmm5,	%%zmm30,%%zmm3 	\n\t"/* 06 16 26 36 46 56 66 76 */\
				"vpermt2pd				%%zmm5,	%%zmm31,%%zmm4	\n\t"/* 07 17 27 37 47 57 67 77 */\
				/* Write original columns back as rows: */\
				"vmovaps		%%zmm10,0x000(%%rax)		\n\t"\
				"vmovaps		%%zmm11,0x040(%%rax)		\n\t"\
				"vmovaps		%%zmm2 ,0x080(%%rax)		\n\t"\
				"vmovaps		%%zmm8 ,0x0c0(%%rax)		\n\t"\
				"vmovaps		%%zmm0 ,0x100(%%rax)		\n\t"\
				"vmovaps		%%zmm1 ,0x140(%%rax)		\n\t"\
				"vmovaps		%%zmm3 ,0x180(%%rax)		\n\t"\
				"vmovaps		%%zmm4, 0x1c0(%%rax)		\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","rbx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11", "xmm30","xmm31"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [1d]: Time for %u 8x8 doubles-transposes using [vshuff64x2,vblendmpd,vshufpd,vpermt2pd] =%s\n",imax, get_time_str(tdiff));
		// Check the result:
	//	printf("Output matrix:\n");
		for(i = 0; i < dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = row + 8*col
			t0 = row; t1 = row+8; t2 = row+16; t3 = row+24;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
		if(nerr) printf("Outputs incorrect! #mismatches = %u\n",nerr);
	  #endif	// endif(USE_IMCI512 || USE_AVX512)

		// [2a] Columnwise-load-and-rowwise-writeback using AVX512 gather-load functionality. On KNL: 56 cycles per loop-exec.
		for(i = 0; i < dim; i++) { *(dptr+i) = i; }	// Re-init the matrix to be untransposed

	  #ifdef USE_IMCI512
	  #elif 0	// Here the code needing fixing-up:
		#error Needs non-IMCI isntructions replaced!
		nerr = 0; clock1 = getRealTime();
		/* Compare latency/thruput/ports for shuffle and gather-based versions, using Agner Fog's KNL tables:
		[1] vunpcklpd, vshuff64x2 both have 4-7 cycle latency, one can start every 2 cycles [3,1 on Skylake-X].
								Thus 24 such in sequence with no wait-stalls ==> ~50 cycles, close to what I measure.
		[2] vgatherdpd has 7-cycle latency, no data re. thruput, thus ~60 cycles per loop, again close to that observed.
		Both [1] and [2] use port 5, but on KNL the 'empty' cycles between shuffle-op issues can be used to issue gathers,
		which is what the side-by-side matrix-pair [2b] variant tests.
		*/
		nerr = 0; clock1 = getRealTime();
		for(i = 0; i < imax; i++) {	// Nov 2016: 4.3 sec for 10^8 loops @1.3GHz ==> ~7 cycles per gather-load
			__asm__ volatile (\
				"movq		%[__data],%%rax		\n\t"\
				/* Auxiliary register data needed for columnwise loads: */\
			"movq	$0x1c1814100c080400,%%rbx	\n\t"/* 64-bit register w/byte offsets 0x[00,04,08,0c,10,14,18,1c], bytes numbered left-to-right */\
				"vmovq		%%rbx,%%xmm8 		\n\t"/* Copy byte pattern to low qword (64 bits) of ymm8 [NB: avx-512 only supports MOVQ to/from 128-bit vector regs] */\
				"vpmovzxbd	%%xmm8,%%ymm8		\n\t"/* vector-index offsets: ymm8 = 0x[00,04,08,0c,10,14,18,1c] in 32-bit form in low 8 dwords */\
				"vpslld	$4,%%ymm8,%%ymm8		\n\t"/* The above bytewise offsets need scale *16 to get the needed ones - would include but
												e.g. 0x1C<<4 overflows 1 byte), but x86 ISA only permits scale factors 1,2,4,8, so <<= 4 here. */\
			/* Mask-reg zmm9 = 11...11 - this is stupidly zeroed each time we do gather-load, so need to reinit: */\
			"movl	$-1,%%ebx	\n\t"/* Init opmask k1 (Only need the low byte) */\
			/* Gather instruction sets mask-reg = 0, so must re-init opmask prior to each invocation */
			"kmov	%%ebx,%%k1	\n\t	vgatherdpd 0x00(%%rax,%%ymm8),%%zmm0%{%%k1%}	\n\t"/* Col 0 */\
			"kmov	%%ebx,%%k1	\n\t	vgatherdpd 0x08(%%rax,%%ymm8),%%zmm1%{%%k1%}	\n\t"/* Col 1 */\
			"kmov	%%ebx,%%k1	\n\t	vgatherdpd 0x10(%%rax,%%ymm8),%%zmm2%{%%k1%}	\n\t"/* Col 2 */\
			"kmov	%%ebx,%%k1	\n\t	vgatherdpd 0x18(%%rax,%%ymm8),%%zmm3%{%%k1%}	\n\t"/* Col 3 */\
			"kmov	%%ebx,%%k1	\n\t	vgatherdpd 0x20(%%rax,%%ymm8),%%zmm4%{%%k1%}	\n\t"/* Col 4 */\
			"kmov	%%ebx,%%k1	\n\t	vgatherdpd 0x28(%%rax,%%ymm8),%%zmm5%{%%k1%}	\n\t"/* Col 5 */\
			"kmov	%%ebx,%%k1	\n\t	vgatherdpd 0x30(%%rax,%%ymm8),%%zmm6%{%%k1%}	\n\t"/* Col 6 */\
			"kmov	%%ebx,%%k1	\n\t	vgatherdpd 0x38(%%rax,%%ymm8),%%zmm7%{%%k1%}	\n\t"/* Col 7 */\
				/* Write original columns back as rows: */\
				"vmovaps	%%zmm0,0x000(%%rax)	\n\t"\
				"vmovaps	%%zmm1,0x040(%%rax)	\n\t"\
				"vmovaps	%%zmm2,0x080(%%rax)	\n\t"\
				"vmovaps	%%zmm3,0x0c0(%%rax)	\n\t"\
				"vmovaps	%%zmm4,0x100(%%rax)	\n\t"\
				"vmovaps	%%zmm5,0x140(%%rax)	\n\t"\
				"vmovaps	%%zmm6,0x180(%%rax)	\n\t"\
				"vmovaps	%%zmm7,0x1c0(%%rax)	\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","rbx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [2a]: Time for %u 8x8 doubles-transposes using gather-loads =%s\n",imax, get_time_str(tdiff));
	//	printf("Output matrix:\n");
		for(i = 0; i < dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = row + 8*col
			t0 = row; t1 = row+8; t2 = row+16; t3 = row+24;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
		if(nerr) printf("Outputs incorrect! #mismatches = %u\n",nerr);

	  #else

		/* Compare latency/thruput/ports for shuffle and gather-based versions, using Agner Fog's KNL tables:
		[1] vunpcklpd, vshuff64x2 both have 4-7 cycle latency, one can start every 2 cycles [3,1 on Skylake-X].
								Thus 24 such in sequence with no wait-stalls ==> ~50 cycles, close to what I measure.
		[2] vgatherdpd has 7-cycle latency, no data re. thruput, thus ~60 cycles per loop, again close to that observed.
		Both [1] and [2] use port 5, but on KNL the 'empty' cycles between shuffle-op issues can be used to issue gathers,
		which is what the side-by-side matrix-pair [2b] variant tests.
		*/
		nerr = 0; clock1 = getRealTime();
		for(i = 0; i < imax; i++) {	// Nov 2016: 4.3 sec for 10^8 loops @1.3GHz ==> ~7 cycles per gather-load
			__asm__ volatile (\
				"movq		%[__data],%%rax		\n\t"\
				/* Auxiliary register data needed for columnwise loads: */\
			"movq	$0x1c1814100c080400,%%rbx	\n\t"/* 64-bit register w/byte offsets 0x[00,04,08,0c,10,14,18,1c], bytes numbered left-to-right */\
				"vmovq		%%rbx,%%xmm8 		\n\t"/* Copy byte pattern to low qword (64 bits) of ymm8 [NB: avx-512 only supports MOVQ to/from 128-bit vector regs] */\
				"vpmovzxbd	%%xmm8,%%ymm8		\n\t"/* vector-index offsets: ymm8 = 0x[00,04,08,0c,10,14,18,1c] in 32-bit form in low 8 dwords */\
				"vpslld	$4,%%ymm8,%%ymm8		\n\t"/* The above bytewise offsets need scale *16 to get the needed ones - would include but
												e.g. 0x1C<<4 overflows 1 byte), but x86 ISA only permits scale factors 1,2,4,8, so <<= 4 here. */\
			/* Mask-reg zmm9 = 11...11 - this is stupidly zeroed each time we do gather-load, so need to reinit: */\
			"movl	$-1,%%ebx	\n\t"/* Init opmask k1 (Only need the low byte) */\
			/* Gather instruction sets mask-reg = 0, so must re-init opmask prior to each invocation */
			"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x00(%%rax,%%ymm8),%%zmm0%{%%k1%}	\n\t"/* Col 0 */\
			"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x08(%%rax,%%ymm8),%%zmm1%{%%k1%}	\n\t"/* Col 1 */\
			"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x10(%%rax,%%ymm8),%%zmm2%{%%k1%}	\n\t"/* Col 2 */\
			"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x18(%%rax,%%ymm8),%%zmm3%{%%k1%}	\n\t"/* Col 3 */\
			"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x20(%%rax,%%ymm8),%%zmm4%{%%k1%}	\n\t"/* Col 4 */\
			"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x28(%%rax,%%ymm8),%%zmm5%{%%k1%}	\n\t"/* Col 5 */\
			"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x30(%%rax,%%ymm8),%%zmm6%{%%k1%}	\n\t"/* Col 6 */\
			"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x38(%%rax,%%ymm8),%%zmm7%{%%k1%}	\n\t"/* Col 7 */\
				/* Write original columns back as rows: */\
				"vmovaps	%%zmm0,0x000(%%rax)	\n\t"\
				"vmovaps	%%zmm1,0x040(%%rax)	\n\t"\
				"vmovaps	%%zmm2,0x080(%%rax)	\n\t"\
				"vmovaps	%%zmm3,0x0c0(%%rax)	\n\t"\
				"vmovaps	%%zmm4,0x100(%%rax)	\n\t"\
				"vmovaps	%%zmm5,0x140(%%rax)	\n\t"\
				"vmovaps	%%zmm6,0x180(%%rax)	\n\t"\
				"vmovaps	%%zmm7,0x1c0(%%rax)	\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","rbx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [2a]: Time for %u 8x8 doubles-transposes using gather-loads =%s\n",imax, get_time_str(tdiff));
	//	printf("Output matrix:\n");
		for(i = 0; i < dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = row + 8*col
			t0 = row; t1 = row+8; t2 = row+16; t3 = row+24;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
		if(nerr) printf("Outputs incorrect! #mismatches = %u\n",nerr);

		// [2b] Hybrid shuffle/gather side-by-side matrix-pair [2b] variant of [2a]. On KNL this is faster
		// than [1a]+[2a] separately, i.e. we do save some cycles by interleaving the 2 types of transposes,
		// but still needs 1.93x the cycles of the best shuffle-based variant, thus no faster than 2 side-by-side
		// shuffle-transposes, *and* the comparison looks likely to come out even more unfavorably on Skylake-X:
		for(i = 0; i < 2*dim; i++) { *(dptr+i) = i; }	// Re-init the matrix to be untransposed
		nerr = 0; clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				"movq		%[__data],%%rax		\n\t"\
				/* Lcol: read in 8 rows of matrix use by shuffle-transpose while setting up for gather-loads in rcol: */\
				"vmovaps		0x200(%%rax),%%zmm0		\n\t	movq	$0x1c1814100c080400,%%rbx	\n\t"\
				"vmovaps		0x240(%%rax),%%zmm1		\n\t	vmovq		%%rbx,%%xmm9 		\n\t"\
				"vmovaps		0x280(%%rax),%%zmm2		\n\t	vpmovzxbd	%%xmm9,%%ymm9		\n\t"/* vpmovzxbd CAN ONLY USE XMM0-15! */\
				"vmovaps		0x2c0(%%rax),%%zmm3		\n\t"\
				"vmovaps		0x300(%%rax),%%zmm4		\n\t"\
				"vmovaps		0x340(%%rax),%%zmm5		\n\t"\
				"vmovaps		0x380(%%rax),%%zmm6		\n\t	vpslld	$4,%%ymm9,%%ymm9		\n\t"\
				"vmovaps		0x3c0(%%rax),%%zmm7		\n\t	movl	$-1,%%ebx				\n\t"\
				/* Transpose uses regs0-7 for data, reg8 for temp: */\
									/* Gather instruction sets mask-reg = 0, so must re-init opmask prior to each invocation */
									"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x00(%%rax,%%ymm9),%%zmm10%{%%k1%}	\n\t"/* Col 0 */\
				"vunpcklpd		%%zmm1,%%zmm0,%%zmm8	\n\t"\
				"vunpckhpd		%%zmm1,%%zmm0,%%zmm1	\n\t"\
				"vunpcklpd		%%zmm3,%%zmm2,%%zmm0	\n\t"\
									"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x08(%%rax,%%ymm9),%%zmm11%{%%k1%}	\n\t"/* Col 1 */\
				"vunpckhpd		%%zmm3,%%zmm2,%%zmm3	\n\t"\
				"vunpcklpd		%%zmm5,%%zmm4,%%zmm2	\n\t"\
				"vunpckhpd		%%zmm5,%%zmm4,%%zmm5	\n\t"\
									"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x10(%%rax,%%ymm9),%%zmm12%{%%k1%}	\n\t"/* Col 2 */\
				"vunpcklpd		%%zmm7,%%zmm6,%%zmm4	\n\t"\
				"vunpckhpd		%%zmm7,%%zmm6,%%zmm7	\n\t"\
				"vshuff64x2	$136,%%zmm0,%%zmm8,%%zmm6	\n\t"\
									"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x18(%%rax,%%ymm9),%%zmm13%{%%k1%}	\n\t"/* Col 3 */\
				"vshuff64x2	$221,%%zmm0,%%zmm8,%%zmm0	\n\t"\
				"vshuff64x2	$136,%%zmm3,%%zmm1,%%zmm8	\n\t"\
				"vshuff64x2	$221,%%zmm3,%%zmm1,%%zmm3	\n\t"\
									"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x20(%%rax,%%ymm9),%%zmm14%{%%k1%}	\n\t"/* Col 4 */\
				"vshuff64x2	$136,%%zmm4,%%zmm2,%%zmm1	\n\t"\
				"vshuff64x2	$221,%%zmm4,%%zmm2,%%zmm4	\n\t"\
				"vshuff64x2	$136,%%zmm7,%%zmm5,%%zmm2	\n\t"\
									"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x28(%%rax,%%ymm9),%%zmm15%{%%k1%}	\n\t"/* Col 5 */\
				"vshuff64x2	$221,%%zmm7,%%zmm5,%%zmm7	\n\t"\
				"vshuff64x2	$136,%%zmm1,%%zmm6,%%zmm5	\n\t"/* [output row 0] */\
				"vshuff64x2	$221,%%zmm1,%%zmm6,%%zmm1	\n\t"/* [output row 4] */\
									"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x30(%%rax,%%ymm9),%%zmm16%{%%k1%}	\n\t"/* Col 6 */\
				"vshuff64x2	$136,%%zmm2,%%zmm8,%%zmm6	\n\t"/* [output row 1] */\
				"vshuff64x2	$221,%%zmm2,%%zmm8,%%zmm2	\n\t"/* [output row 5] */\
				"vshuff64x2	$136,%%zmm4,%%zmm0,%%zmm8	\n\t"/* [output row 2] */\
									"kmovw	%%ebx,%%k1	\n\t	vgatherdpd 0x38(%%rax,%%ymm9),%%zmm17%{%%k1%}	\n\t"/* Col 7 */\
				"vshuff64x2	$221,%%zmm4,%%zmm0,%%zmm4	\n\t"/* [output row 6] */\
				"vshuff64x2	$136,%%zmm7,%%zmm3,%%zmm0	\n\t"/* [output row 3] */\
				"vshuff64x2	$221,%%zmm7,%%zmm3,%%zmm7	\n\t"/* [output row 7] */\
				/* Write original columns back as rows: */\
				"vmovaps		%%zmm5,0x200(%%rax)		\n\t"\
				"vmovaps		%%zmm6,0x240(%%rax)		\n\t"\
				"vmovaps		%%zmm8,0x280(%%rax)		\n\t"\
				"vmovaps		%%zmm0,0x2c0(%%rax)		\n\t"\
				"vmovaps		%%zmm1,0x300(%%rax)		\n\t"\
				"vmovaps		%%zmm2,0x340(%%rax)		\n\t"\
				"vmovaps		%%zmm4,0x380(%%rax)		\n\t"\
				"vmovaps		%%zmm7,0x3c0(%%rax)		\n\t"\
																"vmovaps	%%zmm10,0x000(%%rax)	\n\t"\
																"vmovaps	%%zmm11,0x040(%%rax)	\n\t"\
																"vmovaps	%%zmm12,0x080(%%rax)	\n\t"\
																"vmovaps	%%zmm13,0x0c0(%%rax)	\n\t"\
																"vmovaps	%%zmm14,0x100(%%rax)	\n\t"\
																"vmovaps	%%zmm15,0x140(%%rax)	\n\t"\
																"vmovaps	%%zmm16,0x180(%%rax)	\n\t"\
																"vmovaps	%%zmm17,0x1c0(%%rax)	\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","rbx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm16","xmm17"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [2b]: Time for 2 x %u 8x8 doubles-transposes using gather-loads =%s\n",imax, get_time_str(tdiff));
		// Check the result:
	//	printf("Output matrix 1:\n");
		for(i = 0; i < dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = row + 8*col
			t0 = row; t1 = row+8; t2 = row+16; t3 = row+24;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
	//	printf("Output matrix 2:\n");
		for(i = dim; i < 2*dim; i += 8) {
			row = i>>3;
		//	printf("Row %2u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = (row+56) + 8*col, since here row runs from 8-15 and matrix 2 holds the values 64-127
			t0 = row+56; t1 = row+64; t2 = row+72; t3 = row+80;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
		if(nerr) printf("Outputs incorrect! #mismatches = %u\n",nerr);

		// [2c] Side-by-side matrix-pair variant of [1d]. On KNL ?
		for(i = 0; i < 2*dim; i++) { *(dptr+i) = i; }	// Re-init the matrix to be untransposed
		nerr = 0; clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				"movl	$0b00110011,%%eax	\n\t"/* Constant for vblendmpd instructions goes into mask-reg k1 */\
				"kmovw	%%eax,%%k1			\n\t"\
				/* Init vector index-consts needed by vpermt2pd instructions - if regs were at a premium,
				could also init just prior to [3] and use zmm6,7 to hold index-consts: */\
				"movq	$0x0c040e0608000a02,%%rax	\n\t"/* zmm30 = 8+4 0+4 8+6 0+6 8+0 0+0 8+2 0+2 [msw at left] */\
				"movq	$0x0d050f0709010b03,%%rbx	\n\t"/* zmm31 = 8+5 0+5 8+7 0+7 8+1 0+1 8+3 0+3 */\
					"vmovq		%%rax,%%xmm0 		\n\t"\
					"vmovq		%%rbx,%%xmm1 		\n\t"\
					"vpmovzxbq	%%xmm0,%%zmm30		\n\t"\
					"vpmovzxbq	%%xmm1,%%zmm31		\n\t"\
				"movq		%[__data],%%rax		\n\t"\
				/* Read in the 8 rows of our input matrix: */\
				"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x200(%%rax),%%zmm12		\n\t"\
				"vmovaps		0x040(%%rax),%%zmm1					\n\t	vmovaps		0x240(%%rax),%%zmm13		\n\t"\
				"vmovaps		0x080(%%rax),%%zmm2					\n\t	vmovaps		0x280(%%rax),%%zmm14		\n\t"\
				"vmovaps		0x0c0(%%rax),%%zmm3					\n\t	vmovaps		0x2c0(%%rax),%%zmm15		\n\t"\
				"vmovaps		0x100(%%rax),%%zmm4					\n\t	vmovaps		0x300(%%rax),%%zmm16		\n\t"\
				"vmovaps		0x140(%%rax),%%zmm5					\n\t	vmovaps		0x340(%%rax),%%zmm17		\n\t"\
				"vmovaps		0x180(%%rax),%%zmm6					\n\t	vmovaps		0x380(%%rax),%%zmm18		\n\t"\
				"vmovaps		0x1c0(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
				/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
				"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
				"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
				"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
				"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
				"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
				"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
				"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
				"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
				/* [2] Blend in the 2-aparts */\
				"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
				"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
				"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
				"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
				"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
				"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
				"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
				"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
				/* [3] Shuffle or permute in the 1-aparts */\
				"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
				"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
				"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
				"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
				"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
				"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
				"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
				"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
				"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
				"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
				/* Write original columns back as rows: */\
				"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x200(%%rax)		\n\t"\
				"vmovaps		%%zmm11,0x040(%%rax)				\n\t	vmovaps		%%zmm23,0x240(%%rax)		\n\t"\
				"vmovaps		%%zmm2 ,0x080(%%rax)				\n\t	vmovaps		%%zmm14,0x280(%%rax)		\n\t"\
				"vmovaps		%%zmm8 ,0x0c0(%%rax)				\n\t	vmovaps		%%zmm20,0x2c0(%%rax)		\n\t"\
				"vmovaps		%%zmm0 ,0x100(%%rax)				\n\t	vmovaps		%%zmm12,0x300(%%rax)		\n\t"\
				"vmovaps		%%zmm1 ,0x140(%%rax)				\n\t	vmovaps		%%zmm13,0x340(%%rax)		\n\t"\
				"vmovaps		%%zmm3 ,0x180(%%rax)				\n\t	vmovaps		%%zmm15,0x380(%%rax)		\n\t"\
				"vmovaps		%%zmm4, 0x1c0(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","rbx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm16","xmm17","xmm18","xmm19","xmm20","xmm21","xmm22","xmm23", "xmm30","xmm31"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [2c]: Time for 2 x %u 8x8 doubles-transposes using side-by-side impl of algo [1d] =%s\n",imax, get_time_str(tdiff));
		// Check the result:
	//	printf("Output matrix 1:\n");
		for(i = 0; i < dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = row + 8*col
			t0 = row; t1 = row+8; t2 = row+16; t3 = row+24;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
	//	printf("Output matrix 2:\n");
		for(i = dim; i < 2*dim; i += 8) {
			row = i>>3;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3),*(dptr+i+4),*(dptr+i+5),*(dptr+i+6),*(dptr+i+7));
			// Expected (transposed-matrix) datum = (row+56) + 8*col, since here row runs from 8-15 and matrix 2 holds the values 64-127
			t0 = row+56; t1 = row+64; t2 = row+72; t3 = row+80;
			nerr += (t0 != *(dptr+i+0)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
			t0 += 32; t1 += 32; t2 += 32; t3 += 32;
			nerr += (t0 != *(dptr+i+4)) + (t1 != *(dptr+i+5)) + (t2 != *(dptr+i+6)) + (t3 != *(dptr+i+7));
		}
		if(nerr) printf("Outputs incorrect! #mismatches = %u\n",nerr);
	  #endif	// endif(USE_IMCI512 || USE_AVX512)

		return nerr;
	}
  #endif

  #if defined(USE_AVX) && !defined(USE_IMCI512)
  	//********* Since AVX2 & AVX can run on much of the same hardware, lump them together
					// and differentiate as needed within this outer preprocessor conditional - e.g. the
					// test_simd_transpose_4x4() function uses both, if available, for comparative timings.
	// 4x4 refers to linear memory treated as a 4x4 matrix-of-doubles:
	/*
	Timing-plus-correctness test of several ways of transposing, in place, a 4x4 matrix of doubles
	stored as 16 contiguous doubles starting at a 32-byte-aligned address:
		[1a] Rowwise loads + vshufpd/vperm2f128 in-register shuffles;
		[1b] Rowwise half-row loads + vinsertf128/vshufpd shuffles;
		[2]  Columnwise vgatherdpd loads (only built if both USE_AVX2 and GCC_5PLUS are defined).
	Each method is run imax times on a freshly initialized matrix, its wall-clock time is printed,
	and the resulting matrix is compared against the expected transpose.

	Returns: the number of output elements, summed over all methods run, which differ from their
	expected transposed-matrix values; 0 means every method produced the correct result.
	*/
	int	test_simd_transpose_4x4()
	{
		/*...time-related stuff	*/
		double clock1, clock2;
		double tdiff, t0,t1,t2,t3;
		int i,imax = 100000001, row, nerr = 0;	// Use 10^8 loop execs in effort to yield timing on order of 1 sec on target CPUs
			// Add 1 to make loop count odd, thus result of (imax) successive transposes equivalent to a single one
		const int dim = 16;		// #elements in our matrix
		// The pointer handed to ALLOC_VEC_DBL must hold a defined value: the macro takes the pointer as
		// its first argument (presumably realloc-style - confirm against align.h), and other callers in
		// this file zero-init the pointer first. Passing the variable inside its own initializer, as was
		// previously done here, fed an indeterminate pointer to the allocator.
		vec_dbl *mem = 0x0;
		mem = ALLOC_VEC_DBL(mem, dim+4);	// Add 4 pads to allow for alignment on up-to-128-byte boundary
		ASSERT(HERE, mem != 0x0, "unable to allocate mem in test_simd_transpose_4x4!");
		vec_dbl *data = ALIGN_VEC_DBL(mem);
		ASSERT(HERE, ((long)data & 0x1f) == 0, "data not 32-byte aligned!");
		// Init the matrix -  Input matrix has rows:
		double *dptr = (double *)data;	//  0, 1, 2, 3
		for(i = 0; i < dim; i++) {		//  4, 5, 6, 7
			*(dptr+i) = i;				//  8, 9,10,11
		}								// 12,13,14,15

		// Do timing loop using 2 fundamentally different methods of effecting the transpose,
		// the 1st of which comes in 2 variants dubbed [1a] and [1b]:

		// [1a] Rowwise-load and in-register data shuffles. On KNL: 23 cycles per loop-exec:
		clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				"movq		%[__data],%%rax		\n\t"\
				"vmovaps	     (%%rax),%%ymm2			\n\t"\
				"vmovaps	0x020(%%rax),%%ymm1			\n\t"\
				"vshufpd	$15,%%ymm1,%%ymm2,%%ymm3	\n\t"\
				"vshufpd	$0 ,%%ymm1,%%ymm2,%%ymm2	\n\t"\
				"vmovaps	0x040(%%rax),%%ymm4			\n\t"\
				"vmovaps	0x060(%%rax),%%ymm1			\n\t"\
				"vshufpd	$15,%%ymm1,%%ymm4,%%ymm0	\n\t"\
				"vshufpd	$0 ,%%ymm1,%%ymm4,%%ymm4	\n\t"\
				"vperm2f128 $32,%%ymm0,%%ymm3,%%ymm1	\n\t"/* Row 1 */\
				"vperm2f128 $49,%%ymm0,%%ymm3,%%ymm3	\n\t"/* Row 3 */\
				"vperm2f128 $32,%%ymm4,%%ymm2,%%ymm0	\n\t"/* Row 0 */\
				"vperm2f128 $49,%%ymm4,%%ymm2,%%ymm2	\n\t"/* Row 2 */\
				/* Write original columns back as rows: */\
				"vmovaps	%%ymm0,0x00(%%rax)	\n\t"\
				"vmovaps	%%ymm1,0x20(%%rax)	\n\t"\
				"vmovaps	%%ymm2,0x40(%%rax)	\n\t"\
				"vmovaps	%%ymm3,0x60(%%rax)	\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","xmm0","xmm1","xmm2","xmm3","xmm4"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [1a]: Time for %d 4x4 matrix-of-doubles transposes =%s\n",imax, get_time_str(tdiff));	// imax is a (signed) int, hence %d
		// Check the result:
	//	printf("Output matrix:\n");
		for(i = 0; i < dim; i += 4) {
			row = i>>2;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3));
			t0 = row; t1 = row+4; t2 = row+8; t3 = row+12;	// Expected (transposed-matrix) datum = row + 4*col
			nerr += (t0 != *(dptr+i)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
		}
	//	printf("#mismatches = %u\n",nerr);

		// [1b] Rowwise-load and in-register data shuffles, using a different shuffle sequence. On KNL: 24 cycles per loop-exec:
		for(i = 0; i < dim; i++) { *(dptr+i) = i; }	// Re-init the matrix to be untransposed
		clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				"movq		%[__data],%%rax		\n\t"\
				"vmovaps	    (%%rax),%%xmm0				\n\t"/* r0.lo = 0,1,-,- */\
				"vmovaps	0x20(%%rax),%%xmm4				\n\t"/* r1.lo = 4,5,-,- */\
				"vinsertf128 $1,0x40(%%rax),%%ymm0,%%ymm0	\n\t"/* r0|r2.lo = 0,1,8,9 */\
				"vinsertf128 $1,0x60(%%rax),%%ymm4,%%ymm4	\n\t"/* r1|r3.lo = 4,5,c,d */\
				"vshufpd	$15,%%ymm4,%%ymm0,%%ymm1		\n\t"/* Row 1 = 1,5,9,d */\
				"vshufpd	$0 ,%%ymm4,%%ymm0,%%ymm0		\n\t"/* Row 0 = 0,4,8,c */\
				"vmovaps	0x10(%%rax),%%xmm2				\n\t"/* r0.hi = 2,3,-,- */\
				"vmovaps	0x30(%%rax),%%xmm4				\n\t"/* r1.hi = 6,7,-,- */\
				"vinsertf128 $1,0x50(%%rax),%%ymm2,%%ymm2	\n\t"/* r0|r2.hi = 2,3,a,b */\
				"vinsertf128 $1,0x70(%%rax),%%ymm4,%%ymm4	\n\t"/* r1|r3.hi = 6,7,e,f */\
				"vshufpd	$15,%%ymm4,%%ymm2,%%ymm3		\n\t"/* Row 3 = 3,7,b,f */\
				"vshufpd	$0 ,%%ymm4,%%ymm2,%%ymm2		\n\t"/* Row 2 = 2,6,a,e */\
				/* Write original columns back as rows: */\
				"vmovaps	%%ymm0,0x00(%%rax)	\n\t"\
				"vmovaps	%%ymm1,0x20(%%rax)	\n\t"\
				"vmovaps	%%ymm2,0x40(%%rax)	\n\t"\
				"vmovaps	%%ymm3,0x60(%%rax)	\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","xmm0","xmm1","xmm2","xmm3","xmm4"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [1b]: Time for %d 4x4 matrix-of-doubles transposes =%s\n",imax, get_time_str(tdiff));
		// Check the result:
	//	printf("Output matrix:\n");
		for(i = 0; i < dim; i += 4) {
			row = i>>2;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3));
			t0 = row; t1 = row+4; t2 = row+8; t3 = row+12;	// Expected (transposed-matrix) datum = row + 4*col
			nerr += (t0 != *(dptr+i)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
		}
	//	printf("#mismatches = %u\n",nerr);

		// [2] Columnwise-load-and-rowwise-writeback using AVX2 gather-load functionality. On KNL: 46 cycles per loop-exec:
	  #ifdef USE_AVX2
	   #ifdef GCC_5PLUS	// gcc 4.x may not support the needed AVX2 instructions (while still being fine for the FMA
						// instructions used for the FFT), so require an added compile-time define to enable loop [2]
		for(i = 0; i < dim; i++) { *(dptr+i) = i; }	// Re-init the matrix to be untransposed
		clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			__asm__ volatile (\
				"movq		%[__data],%%rax		\n\t"\
				/* Auxiliary register data needed for columnwise loads: */\
				"movl	$0x60402000,%%ebx		\n\t"/* 32-bit register w/byte offsets [0x00,0x20,0x40,0x60], bytes numbered left-to-right */\
				"vmovd		%%ebx,%%xmm4 		\n\t"/* Copy byte pattern to low dword (32 bits) of ymm4 [NB: avx-512 only supports MOVZX to/from mem-address or 128-bit vector regs] */\
				"vpmovzxbd	%%xmm4,%%ymm4		\n\t"/* vector-index offsets: ymm4 = [0x00,0x20,0x40,0x60] in 32-bit form in low 4 dwords */\
			/* Mask-reg ymm5 = 11...11 - this is stupidly zeroed each time we do gather-load, so need to reinit: */\
			"vpcmpeqd	%%ymm5,%%ymm5,%%ymm5	\n\t	vgatherdpd %%ymm5,0x00(%%rax,%%xmm4),%%ymm0	\n\t"/* Col 0 */\
			"vpcmpeqd	%%ymm5,%%ymm5,%%ymm5	\n\t	vgatherdpd %%ymm5,0x08(%%rax,%%xmm4),%%ymm1	\n\t"/* Col 1 */\
			"vpcmpeqd	%%ymm5,%%ymm5,%%ymm5	\n\t	vgatherdpd %%ymm5,0x10(%%rax,%%xmm4),%%ymm2	\n\t"/* Col 2 */\
			"vpcmpeqd	%%ymm5,%%ymm5,%%ymm5	\n\t	vgatherdpd %%ymm5,0x18(%%rax,%%xmm4),%%ymm3	\n\t"/* Col 3 */\
				/* Write original columns back as rows: */\
				"vmovaps	%%ymm0,0x00(%%rax)	\n\t"\
				"vmovaps	%%ymm1,0x20(%%rax)	\n\t"\
				"vmovaps	%%ymm2,0x40(%%rax)	\n\t"\
				"vmovaps	%%ymm3,0x60(%%rax)	\n\t"\
				:						// outputs: none
				: [__data] "m" (data)	// All inputs from memory addresses here
				: "cc","memory","rax","rbx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5"	// Clobbered registers - use xmm form for compatibility with older versions of clang/gcc
			);
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("Method [2]: Time for %d 4x4 matrix-of-doubles transposes =%s\n",imax, get_time_str(tdiff));
		// Check the result:
	//	printf("Output matrix:\n");
		for(i = 0; i < dim; i += 4) {
			row = i>>2;
		//	printf("Row %u: %3.0f %3.0f %3.0f %3.0f\n",row,*(dptr+i),*(dptr+i+1),*(dptr+i+2),*(dptr+i+3));
			t0 = row; t1 = row+4; t2 = row+8; t3 = row+12;	// Expected (transposed-matrix) datum = row + 4*col
			nerr += (t0 != *(dptr+i)) + (t1 != *(dptr+i+1)) + (t2 != *(dptr+i+2)) + (t3 != *(dptr+i+3));
		}
	//	printf("#mismatches = %u\n",nerr);
	   #endif
	  #endif
		// Release the scratch buffer: must free the raw pointer returned by the allocator, not the
		// aligned alias 'data' derived from it. NOTE(review): assumes ALLOC_VEC_DBL hands back
		// malloc/realloc-family memory - confirm against align.h.
		free((void *)mem); mem = 0x0;
		return nerr;
	}
  #endif	// USE_AVX ?

  #if defined(USE_SSE2) && !defined(USE_IMCI512)
	// Timing loop for 128-bit SIMD radix-4 DFT macro:
	int	test_radix4_dft()
	{
		const char func[] = "test_radix4_dft";
	#ifdef USE_AVX	//No AVX support for macros in this function
		return 0;
	#else
		/*...time-related stuff	*/
		double clock1, clock2;
		double tdiff, t0,t1,t2,t3;
		int i,j,dim,imax = 10000000, nerr = 0;	// Expect radix-4 DFT to need ~1/10th the cycles of radix-32, so use 10^7 loop execs
		int p1,p2,p3,p4;
		static vec_dbl *sc_arr = 0x0, *sc_ptr;
		double *add0,*add1,*add2,*add3;	/* Addresses into array sections */
		// Each row of 16 double data corr. to expected outputs from one of up to two vector-complex 4-DFTs being done:
		const double ref1[] = {-5.,40.,163.,139.,106.,45.,38.,-4.,31.,-36.,-115.,-83.,-120.,-45.,-70.,-48.,
								-70.,15.,230.,158.,28.,55.,-14.,-70.,56., 3.,-116.,-102.,-6.,-61.,-68.,30.};
		const double ref2[] = {22.,20.,20.,18.,-18.,-13.,30.,-1.,-26.,-4.,-18.,-12.,-8.,-146.,-28.,72.,
								13.,15.,31.,16.,-17.,27.,45.,28.,6.,-25.,-24.,15.,-6.,-1.,48.,-57.};
		vec_dbl *c_tmp,*s_tmp, *cc0,*two, *r0,*r1,*r2,*r3;
		// Alloc 8 vector-complex elts (16 vec_dbl) per input/output block rather than 4, so can also test two radix-4 DFTs done side-by-side:
		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x42);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
		sc_ptr = ALIGN_VEC_DBL(sc_arr);
		ASSERT(HERE, ((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
		add0 = sc_ptr;
		add1 = sc_ptr+0x2;
		add2 = sc_ptr+0x4;
		add3 = sc_ptr+0x6;
		r0 = sc_ptr + 0x10;
		r1 = r0 + 0x2;
		r2 = r0 + 0x4;
		r3 = r0 + 0x6;
		cc0 = r0 + 0x10;	// Alloc 8 vector-complex elts rather than 4, so can also test two radix-4 DFTs done side-by-side
		two = r0 + 0x20;	// Similarly alloc 2 sets of 8 vector-complex twiddles
		VEC_DBL_INIT(two,2.0);

		// Do these timing-test DFTs in-place, i.e. p1 = #doubles in a pair of vec_dbl:
		p1 = RE_IM_STRIDE << 1;
		p2 = p1 + p1;
		p3 = p2 + p1;
		p4 = p2 + p2;

		// Twiddles for the purpose of this timing-test can be anything, just set to random digits in [0,9].
		// First twiddle for each of the two 4-DFT datasets = unity, but since macros don't actually do that cmul,
		// only init the non-unity twiddle-triplet needed by each of the up-to-2-independent DFTs we do:
		c_tmp = cc0; s_tmp = c_tmp+1;	/* c0,s0 */
		for(i = 0; i < 6; i+=2, c_tmp+=2, s_tmp+=2) {	// Remaining 3 vector-complex twiddles for each 4-DFT are nontrivial
			VEC_DBL_INIT(c_tmp  , ran[i  ]);	VEC_DBL_INIT(s_tmp  , ran[i+1]);
			VEC_DBL_INIT(c_tmp+6, ran[i+8]);	VEC_DBL_INIT(s_tmp+6, ran[i+9]);
/*
			// Restructure twiddle-muls to use cotangent-scheme:
			ASSERT(HERE, ran[i+1] != 0.0 && ran[i+9] != 0.0,"Need to modify test-twiddles to avoid div-by-0!");
			VEC_DBL_INIT(c_tmp  , ran[i  ]/(double)ran[i+1]);	VEC_DBL_INIT(s_tmp  , ran[i+1]);
			VEC_DBL_INIT(c_tmp+8, ran[i+8]/(double)ran[i+9]);	VEC_DBL_INIT(s_tmp+8, ran[i+9]);
*/
		}
		// Set inputs != 0 to prevent timings being thrown off by any 0-operand arithmetic-shortcuts the CPU may do:
		double *dptr = (double *)r0;
		dim = 8*RE_IM_STRIDE;	// 4 vector-complex data
		// Copy quasirandom digits-of-Pi-data into our vec_dbl inputs:
		for(j = 0; j < dim; j++) { *(add0+j) = ran[j]; }

	// 5 May 2016: 10^7-loop timings, 1-threaded on my 2GHz Core2Duo:
	//																comments
	// no-DFT timing (keeping just 1 address-load of macro)	0.246	49.2 cycles (!! ... that's a lot for loop-control and data-copy)
	// initial timing										0.473	94.6 - 49.2 = 45.4 cycles
	// leaq for add0 + 8*p[1,2,3]							0.462
	// use xmm8,9 to save 6 implied-loads in mulpd			0.454
	// use xmm8,9 to save 2 implied-loads in addpd			0.444
	// replace four add-doublings by 2*x in final butterfly	0.429	36.6 cycles ... Already nearly 20% faster!
	// move loads of xmm8,9 imm before use to ease 2-column	0.452	that is a step back, revert
	// elim 4 redundant loads in p1,3 computation			0.429	no speedup
	// move 1st 2 loads for next combo into end of pvs one	0.440	slower, oddly
	// eliminate 1st twiddle-mul                            0.438	again slower ... now *that* is bizarre
	// Added spill/reload of first output to cut regs 8,9	0.429	nice - now can do 2 such DFTs side-by-side
	// Restructure twiddle-muls to use cotangent-scheme		0.445	slower
	//*** 7 Jul 2017: *** out-of-place DFT allows 1-time init, i.e.
	// no more need to subtract initial-data-copy timing:	0.232	46 cycles - note this is the 'initial timing' 8-register macro above.
	// ARM Neon code timing (1.5 GHz Odroid C2):			0.612	93 cycles, almost exactly 2x the cycle count of SSE2 on Core2
		// Timing loop #1:
		clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			/* 4-DFT-with-3-complex-twiddles macro. SIMD opcount:
			x86_64 SSE2: 41 MEM (19 load[1 via mem-op in addpd], 14 store, 8 reg-copy), 22 ADDPD, 16 MULPD
			ARM v8 Neon: 11 MEM (7 load-pair, 4 store-pair), 16 FADD, 12 FMUL/FMA
			*/
		#ifdef USE_ARM_V8_SIMD
			__asm__ volatile (\
				"ldr	x0,%[__add0]		\n\t"\
				"ldr	w1,%[__p1]			\n\t"\
				"ldr	w2,%[__p2]			\n\t"\
				"ldr	w3,%[__p3]			\n\t"\
				"ldr	x4,%[__cc0]			\n\t"\
				"ldr	x5,%[__r0]			\n\t"\
				"add	x1, x0,x1,lsl #3	\n\t"\
				"add	x2, x0,x2,lsl #3	\n\t"\
				"add	x3, x0,x3,lsl #3	\n\t"\
				/* SSE2_RADIX_04_DIF_3TWIDDLE(r0,c0): */\
				/* Do	the p0,p2 combo: */\
				"ldp	q4,q5,[x2]			\n\t"\
				"ldp	q8,q9,[x4]			\n\t"/* cc0 */\
				"ldp	q0,q1,[x0]			\n\t"\
				"fmul	v6.2d,v4.2d,v8.2d	\n\t"/* twiddle-mul: */\
				"fmul	v7.2d,v5.2d,v8.2d	\n\t"\
				"fmls	v6.2d,v5.2d,v9.2d	\n\t"\
				"fmla	v7.2d,v4.2d,v9.2d	\n\t"\
				"fsub	v2.2d ,v0.2d,v6.2d	\n\t"/* 2 x 2 complex butterfly: */\
				"fsub	v3.2d ,v1.2d,v7.2d	\n\t"\
				"fadd	v10.2d,v0.2d,v6.2d	\n\t"\
				"fadd	v11.2d,v1.2d,v7.2d	\n\t"\
				/* Do	the p1,3 combo: */\
				"ldp	q8,q9,[x4,#0x40]	\n\t"/* cc0+4 */\
				"ldp	q6,q7,[x3]			\n\t"\
				"fmul	v0.2d,v6.2d,v8.2d	\n\t"/* twiddle-mul: */\
				"fmul	v1.2d,v7.2d,v8.2d	\n\t"\
				"fmls	v0.2d,v7.2d,v9.2d	\n\t"\
				"fmla	v1.2d,v6.2d,v9.2d	\n\t"\
				"ldp	q8,q9,[x4,#0x20]	\n\t"/* cc0+2 */\
				"ldp	q6,q7,[x1]			\n\t"\
				"fmul	v4.2d,v6.2d,v8.2d	\n\t"/* twiddle-mul: */\
				"fmul	v5.2d,v7.2d,v8.2d	\n\t"\
				"fmls	v4.2d,v7.2d,v9.2d	\n\t"\
				"fmla	v5.2d,v6.2d,v9.2d	\n\t"\
				"fadd	v6.2d,v4.2d,v0.2d	\n\t"/* 2 x 2 complex butterfly: */\
				"fadd	v7.2d,v5.2d,v1.2d	\n\t"\
				"fsub	v4.2d,v4.2d,v0.2d	\n\t"\
				"fsub	v5.2d,v5.2d,v1.2d	\n\t"\
				/* Finish radix-4 butterfly and store results: */\
				"fsub	v8.2d,v10.2d,v6.2d	\n\t"\
				"fsub	v9.2d,v11.2d,v7.2d	\n\t"\
				"fsub	v1.2d,v3.2d,v4.2d	\n\t"\
				"fsub	v0.2d,v2.2d,v5.2d	\n\t"\
				"fadd	v6.2d,v6.2d,v10.2d	\n\t"\
				"fadd	v7.2d,v7.2d,v11.2d	\n\t"\
				"fadd	v4.2d,v4.2d,v3.2d	\n\t"\
				"fadd	v5.2d,v5.2d,v2.2d	\n\t"\
				"stp	q6,q7,[x5      ]	\n\t"/* out 0 */\
				"stp	q0,q4,[x5,#0x20]	\n\t"/* out 1 */\
				"stp	q8,q9,[x5,#0x40]	\n\t"/* out 2 */\
				"stp	q5,q1,[x5,#0x60]	\n\t"/* out 3 */\
				:					/* outputs: none */\
				: [__add0] "m" (add0)	/* All inputs from memory addresses here */\
				 ,[__p1] "m" (p1)\
				 ,[__p2] "m" (p2)\
				 ,[__p3] "m" (p3)\
				 ,[__two] "m" (two)\
				 ,[__cc0] "m" (cc0)\
				 ,[__r0] "m" (r0)\
				: "cc","memory","x0","x1","x2","x3","x4","x5","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"	/* Clobbered registers */\
			);
		#else
			__asm__ volatile (\
				"movq	%[__cc0],%%rsi 		\n\t"\
				"movq	%[__add0],%%rax		\n\t"\
				/* NB: preshifting the p-offsets to save the *= 8 below saves nothing: */\
				"movslq	%[__p1],%%rbx		\n\t"\
				"movslq	%[__p2],%%rcx		\n\t"\
				"movslq	%[__p3],%%rdx		\n\t"\
				"leaq	(%%rax,%%rbx,8),%%rbx	\n\t"\
				"leaq	(%%rax,%%rcx,8),%%rcx	\n\t"\
				"leaq	(%%rax,%%rdx,8),%%rdx	\n\t"\
				/* SSE2_RADIX_04_DIF_3TWIDDLE(r0,c0): */\
				/* Do	the p0,p2 combo: */\
				"movaps	    (%%rcx),%%xmm4	\n\t"\
				"movaps	0x10(%%rcx),%%xmm5	\n\t"\
				"movaps	    (%%rsi),%%xmm2	\n\t"\
				"movaps	0x10(%%rsi),%%xmm3	\n\t"\
				"movaps	%%xmm4,%%xmm6		\n\t"\
				"movaps	%%xmm5,%%xmm7		\n\t"\
				"movaps	    (%%rax),%%xmm0	\n\t"\
				"movaps	0x10(%%rax),%%xmm1	\n\t"\
				"mulpd	%%xmm2,%%xmm4		\n\t"\
				"mulpd	%%xmm2,%%xmm5		\n\t"\
				"mulpd	%%xmm3,%%xmm6		\n\t"\
				"mulpd	%%xmm3,%%xmm7		\n\t"\
				"movaps	%%xmm0,%%xmm2		\n\t"\
				"movaps	%%xmm1,%%xmm3		\n\t"\
				"movq	%[__r0],%%rdi 		\n\t"\
				"addpd	%%xmm6,%%xmm5		\n\t"\
				"subpd	%%xmm7,%%xmm4		\n\t"\
				"addpd	%%xmm4,%%xmm0		\n\t"\
				"addpd	%%xmm5,%%xmm1		\n\t"\
				"subpd	%%xmm4,%%xmm2		\n\t"\
				"subpd	%%xmm5,%%xmm3		\n\t"\
				"movaps	%%xmm0,0x40(%%rdi)	\n\t"/* Spill 1: free up xmm0,1 */\
				"movaps	%%xmm1,0x50(%%rdi)	\n\t"\
				/* Do	the p1,3 combo: */\
				"movaps	0x40(%%rsi),%%xmm0	\n\t"\
				"movaps	0x50(%%rsi),%%xmm1	\n\t"\
				"movaps	    (%%rdx),%%xmm6	\n\t"\
				"movaps	0x10(%%rdx),%%xmm7	\n\t"\
				"movaps	%%xmm6,%%xmm4		\n\t"\
				"movaps	%%xmm7,%%xmm5		\n\t"\
				"mulpd	%%xmm0,%%xmm4		\n\t"\
				"mulpd	%%xmm0,%%xmm5		\n\t"\
				"mulpd	%%xmm1,%%xmm6		\n\t"\
				"mulpd	%%xmm1,%%xmm7		\n\t"\
				"addpd	%%xmm6,%%xmm5		\n\t"\
				"subpd	%%xmm7,%%xmm4		\n\t"\
				"movaps	%%xmm5,0x10(%%rdi)	\n\t"/* Spill 2*/\
				"movaps	%%xmm4,    (%%rdi)	\n\t"\
				"movaps	0x20(%%rsi),%%xmm0	\n\t"\
				"movaps	0x30(%%rsi),%%xmm1	\n\t"\
				"movaps	    (%%rbx),%%xmm6	\n\t"\
				"movaps	0x10(%%rbx),%%xmm7	\n\t"\
				"movaps	%%xmm6,%%xmm4		\n\t"\
				"movaps	%%xmm7,%%xmm5		\n\t"\
				"mulpd	%%xmm0,%%xmm4		\n\t"\
				"mulpd	%%xmm0,%%xmm5		\n\t"\
				"mulpd	%%xmm1,%%xmm6		\n\t"\
				"mulpd	%%xmm1,%%xmm7		\n\t"\
				"movaps	    (%%rdi),%%xmm0	\n\t"/* Restore 2 */\
				"movaps	0x10(%%rdi),%%xmm1	\n\t"\
				"addpd	%%xmm6,%%xmm5		\n\t"\
				"subpd	%%xmm7,%%xmm4		\n\t"\
				"movaps	%%xmm5,%%xmm7		\n\t"\
				"movaps	%%xmm4,%%xmm6		\n\t"\
				"subpd	%%xmm0,%%xmm4		\n\t"\
				"subpd	%%xmm1,%%xmm5		\n\t"\
				"addpd	%%xmm0,%%xmm6		\n\t"\
				"addpd	%%xmm1,%%xmm7		\n\t"\
				/* Finish radix-4 butterfly and store results: */\
				"movq	%[__two],%%rsi		\n\t"\
				"movaps	0x40(%%rdi),%%xmm0	\n\t"/* Restore 1 */\
				"movaps	0x50(%%rdi),%%xmm1	\n\t"\
				"subpd	%%xmm6,%%xmm0		\n\t"\
				"subpd	%%xmm5,%%xmm2		\n\t"\
				"subpd	%%xmm7,%%xmm1		\n\t"\
				"subpd	%%xmm4,%%xmm3		\n\t"\
				"movaps	%%xmm0,0x40(%%rdi)	\n\t	movaps	(%%rsi),%%xmm0	\n\t"/* 2.0 */\
				"movaps	%%xmm2,0x20(%%rdi)	\n\t"\
				"movaps	%%xmm1,0x50(%%rdi)	\n\t"\
				"movaps	%%xmm3,0x70(%%rdi)	\n\t"\
				"mulpd	%%xmm0,%%xmm6		\n\t"\
				"mulpd	%%xmm0,%%xmm5		\n\t"\
				"mulpd	%%xmm0,%%xmm7		\n\t"\
				"mulpd	%%xmm0,%%xmm4		\n\t"\
				"addpd	0x40(%%rdi),%%xmm6	\n\t"\
				"addpd		%%xmm2 ,%%xmm5	\n\t"\
				"addpd		%%xmm1 ,%%xmm7	\n\t"\
				"addpd		%%xmm3 ,%%xmm4	\n\t"\
				"movaps	%%xmm6,    (%%rdi)	\n\t"\
				"movaps	%%xmm5,0x60(%%rdi)	\n\t"\
				"movaps	%%xmm7,0x10(%%rdi)	\n\t"\
				"movaps	%%xmm4,0x30(%%rdi)	\n\t"\
				:					/* outputs: none */\
				: [__add0] "m" (add0)	/* All inputs from memory addresses here */\
				 ,[__p1] "m" (p1)\
				 ,[__p2] "m" (p2)\
				 ,[__p3] "m" (p3)\
				 ,[__two] "m" (two)\
				 ,[__cc0] "m" (cc0)\
				 ,[__r0] "m" (r0)\
				: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
			);
		#endif	// ARM_V8 or X86_64 SIMD?
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s, loop 1: Time for %u macro calls =%s\n",func, imax, get_time_str(tdiff));
		// Check outputs vs ref-data:
		nerr = 0;
		dptr = (double *)r0;
		for(i = 0; i < 4; i++) {
			j = i<<2;
		//	printf("Out%u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",i,*dptr,*(dptr+1),*(dptr+2),*(dptr+3));
			nerr += (fabs(*(dptr  ) - ref1[j  ]) > 1e-10);
			nerr += (fabs(*(dptr+1) - ref1[j+1]) > 1e-10);
			nerr += (fabs(*(dptr+2) - ref1[j+2]) > 1e-10);
			nerr += (fabs(*(dptr+3) - ref1[j+3]) > 1e-10);
			dptr += 4;
		}
		ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!");

	// Timing loop #2 - two radix-4 DFTs (operating on separate data chunks but sharing twiddles) side-by-side:
		/* 6 May 2016, Core2:
		Baseline single-DFT timing of loop #2 = 0.416 sec with-DFT, 0.252 sec sans-DFT ==> 1-DFT = 0.164 sec ==> 32.8 cycles

		*/
		// 7 May 2016: 10^7-loop timings, 1-threaded on my 2GHz Core2Duo:
		//																comments
		// no-DFT timing (keeping just 1 address-load of macro)	0.486	97.2 cycles ... 2x as many data-inits as loop #1
		// initial timing										0.871	174.2 - 97.2 = 77 cycles, vs 36.8 for single-column ASM
		//											*** 2-column actually slows things down in terms of per-cycle throughput! ***
		// rcol down 4 lines to break same-instruction blocks	0.813	66 cycles, big improvement!
		//18 May: 2-col impl of 2x4-DFT-each-with-3-twiddles	0.791	61 cycles, better, need to get under 60

		// 9 May: GCCified George's dual complex 4-DFT macro (not a drop-in replacement candidate for mine
		// (due to its cotangent-twiddles scheme), munged addresses, pasted underneath my opening address-
		// computation block below, just to get a timing:		0.735	50 cycles, yowza!
		//
		// Copy quasirandom digits-of-Pi-data into our vec_dbl inputs:
		for(j = 0; j < dim+dim; j++) { *(add0+j) = ran[j]; }
		int k = 0x60;
		clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			j = 0x80;	// Set j to the bytewise address offset between the pointers to the first and second DFT's data
						// 0x60 = literal-bytewise address offset between the pointers to the first and second DFT's twiddles:
			SSE2_RADIX_04_DIF_3TWIDDLE_X2(add0,add1,add2,add3,j, two,cc0,k, r0,r1,r2,r3,j)
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s, loop 2: Time for %u macro calls =%s\n",func, imax, get_time_str(tdiff));
		// Check outputs vs ref-data:
		nerr = 0;
		dptr = (double *)r0;
		for(i = 0; i < 8; i++) {
			j = i<<2;
		//	printf("Out%u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",i,*dptr,*(dptr+1),*(dptr+2),*(dptr+3));
			nerr += (*dptr != ref1[j]) + (*(dptr+1) != ref1[j+1]) + (*(dptr+2) != ref1[j+2]) + (*(dptr+3) != ref1[j+3]);
			dptr += 4;
		}
		ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!");

	// Timing loop #3 - single radix-4 DIT DFT:
		dim = 8*RE_IM_STRIDE;	// 4 vector-complex data
		// Copy quasirandom digits-of-Pi-data into our vec_dbl inputs:
		for(j = 0; j < dim; j++) { *(add0+j) = ran[j]; }
		clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			// Single-4-DFT macro uses same arglist as paired ones, but ignores the args in the j,k,j slots:
			SSE2_RADIX_04_DIT_3TWIDDLE_X1(add0,add1,add2,add3,j, two,cc0,k, r0,r1,r2,r3,j)
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s, loop 3: Time for %u macro calls =%s\n",func, imax, get_time_str(tdiff));
		// Check outputs vs ref-data:
		nerr = 0;
		dptr = (double *)r0;
		for(i = 0; i < 4; i++) {
			j = i<<2;
		//	printf("Out%u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",i,*dptr,*(dptr+1),*(dptr+2),*(dptr+3));
			nerr += (*dptr != ref2[j]) + (*(dptr+1) != ref2[j+1]) + (*(dptr+2) != ref2[j+2]) + (*(dptr+3) != ref2[j+3]);
			dptr += 4;
		}
		ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!");

	// Timing loop #4 - two radix-4 DIT DFTs (operating on separate data chunks but sharing twiddles) side-by-side:
		for(j = 0; j < dim+dim; j++) { *(add0+j) = ran[j]; }
		k = 0x60;
		clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			j = 0x80;	// Set j to the bytewise address offset between the pointers to the first and second DFT's data
						// 0x60 = literal-bytewise address offset between the pointers to the first and second DFT's twiddles:
			SSE2_RADIX_04_DIT_3TWIDDLE_X2(add0,add1,add2,add3,j, two,cc0,k, r0,r1,r2,r3,j)
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s, loop 4: Time for %u macro calls =%s\n",func, imax, get_time_str(tdiff));
		// Check outputs vs ref-data:
		nerr = 0;
		dptr = (double *)r0;
		for(i = 0; i < 8; i++) {
			j = i<<2;
		//	printf("Out%u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",i,*dptr,*(dptr+1),*(dptr+2),*(dptr+3));
			nerr += (*dptr != ref2[j]) + (*(dptr+1) != ref2[j+1]) + (*(dptr+2) != ref2[j+2]) + (*(dptr+3) != ref2[j+3]);
			dptr += 4;
		}
		ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!");

		free((void *)sc_arr);	sc_arr=0x0;
		return nerr;
	#endif	// USE_AVX?
	}

	// Timing loop for radix-16 DIF macro:
	int	test_radix16_dft()
	{
		const char func[] = "test_radix16_dft";
		/*...time-related stuff	*/
		double clock1, clock2;
		double *dptr, tdiff, rt,it, dtmp, avg_err;
		double t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22,t23,t24,t25,t26,t27,t28,t29,t30,t31;
		int i,j,j1,j2,k,imax = 10000000, nerr = 0;	// Use 10^7 loop execs
		int p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF;
		const double c = 0.9238795325112867561281831, s = 0.3826834323650897717284599;	/* exp[i*(twopi/16)]*/
	#ifdef USE_AVX2	// FMA-based DFT needs the tangent
	  #ifdef REFACTOR_4DFT_3TWIDDLE
		#error USE_AVX2 and REFACTOR_4DFT_3TWIDDLE are mutually exclusive preprocessor directives!
	  #endif
		const double tan = 0.41421356237309504879;
	#endif
		// DIF[ref1], DIT[ref2] ref-outputs: cols 0,1 are re,im outputs for scalar-mode,
		// cols [0,1],[2,3] are [re0,im0],[re1,im1] for SSE2 mode,
		// cols [0,1],[2,3],[4,5],[6,7] are [re0,im0],[re1,im1],[re2,im2],[re3,im3] for AVX/AVX2 mode:
		const double ref1[] = {	// DIF ref-outputs:
			 71.836132626544, 51.240113957019,  93.778630517480, 76.253829098865,  84.806628663973, 73.237374006630,  75.813281246317, 71.186791613560,
			 -9.999128163205, -5.041057027004, -11.993749676902, -0.051731207851,  -2.997214610252, -5.029166077852, -11.965466770155, -1.019849970447,
			-14.872954615349,-26.053387304330,  -2.971220181207,  0.996282812198,  -6.000050539329, 22.005043558985,  -5.934761044083, -9.023636204473,
			  5.023325346528, -7.980009625544,  20.909799869969, 19.046570941658,  -0.036810883888,  5.988776862464,  10.025385052093, -5.002254733787,
			  9.056346572622, -1.914547592304,   3.981734242952,  8.184718642903,   5.433348885097,  0.012717649256,  11.329786667132, 10.968444860577,
			-15.077818730218,  7.893027085061,  -5.978667607114,  3.812269869915,  -7.427204810541, 23.968911024385, -11.266969635162, -8.942204365409,
			  0.293315070044,  4.170949230343, -12.723161421476,-21.587788299675,  -9.388763668691, -2.159379128165, -21.281854472092, -0.810792653682,
			  1.740668944464,-14.241457760931,   2.800021476805,-14.482647475474, -16.537232610515,  2.000039861260,   1.280448357857, -9.258323929924,
			 -3.152128563857,-16.771900048992,  -5.521042109482,  7.816522698106,   2.280114913431, 14.541287717387, -10.102184012659, 17.040342071843,
			  2.073046947819,  1.969010269254,  -2.360249253366,-17.814992092284,   8.483022773111, -1.532157855444,  -0.178724865422, -2.336467764655,
			 -9.872285434976,  1.404582139871, -13.035858498590, -8.152619897216,   3.973177562804, -9.433750344665,   8.526197273669, 21.162355221723,
			  3.080286869058,-10.650483193072,   5.076556271954, -1.854679849975,  29.478126055897,-11.446050304373, -30.263630386603, 16.060181777754,
			 14.065076567172,  2.036958078982,  -7.839406695376,-20.597474713295,  10.403805832586, 11.790086387134,  -0.622179305579,  1.390534729868,
			 21.045817946931,  5.610880426309,  -7.416729930697,  2.536423857502,   0.585026163834,  4.233047491042, -11.128509999099, -0.591596670215,
			-12.866425691057,  6.044821934737,  12.200367325354, -1.892360702488, -31.099488921485,  4.039179558341,  11.722605670925, 11.014905859604,
			-14.373275692520, 18.282499430601,  -4.907024330304,-16.212323682887,   8.043515193967, 11.784039593614,  16.046576222863,-15.838429842336
		};
		const double ref2[] = {	// DIT ref-outputs
			 72.000000000000, 51.000000000000,  94.000000000000, 76.000000000000,  85.000000000000, 73.000000000000,  76.000000000000, 71.000000000000,
			 -3.186854935858,-14.814979432853,  16.274310380149,  1.335323711456, -13.073174362886, -2.237484334246,  -1.658016565491, -7.214637436104,
			 -6.722101857669,  7.590944457452,  14.087819349234, -5.303699999882,   2.642267168043,  8.119296138409,   8.194965871283,  3.357676558811,
			 -2.410055786718, 17.860460799853,  -3.115089210758,  8.081417051306,  10.217674711889, 18.135933290759, -23.560603557289, 12.034604903514,
			 -4.013801115483, -8.993853490320,  -3.019938212777,-12.995382764313, -17.975435139150, 16.027592818580,   4.984654315396,-10.007658135448,
			  9.628673950784, -2.618390257617,  -4.977626115382, -7.585510986880,   8.576400244341, 14.894373226740, -32.710474648103,  9.946362646980,
			  1.407075652116, -3.103750917409,  -2.611395268707, 10.713143881330,  19.283439756536,  8.662759113515, -19.031759359980, -4.734395634916,
			  1.513899588864,  8.129850867985,  -9.169123259445,-16.471534197115,   0.296631561689, -6.875402885079,   2.295297934556, -0.702929830780,
			-20.015245660006, -4.938617333789,   7.987690523425, -4.024524829342,   9.033705168679, 10.972336621039,  18.045934640017, 14.944706185410,
			 -9.250056079529, 15.676647091397,   3.540556362652,  4.445231564276, -33.237348371394, -7.299683015832,  16.814644365039, 13.500464006934,
			 18.767722258015, 10.342316476739, -24.117427303634, -6.614666472964,  15.378722566760,  3.819731226687, -10.228223459299, -9.324804950323,
			-12.959770458085,-18.146313510762,  -3.761348735601,-13.576800261814, -11.015270817590,  2.384093214004,   0.269727641998,  2.133525968650,
			  4.013763422689,  2.981560528764,   1.022999041684,  4.995345129289,  11.999872932983, -0.055223113445,   4.981539350928, -4.022967274930,
			 -5.159441616098, -2.202809827852, -14.966371172025,-26.129659478906,   1.670348376017,-13.262428826303, -14.344280414512,  7.844841106696,
			 -1.537138437596,-22.891572112364, -15.313846915632,  9.375246960644, -13.224109940089,  7.363998369751,   5.078609933375, 10.751062960496,
			  5.707698152898,-19.823900762950,   7.918716323359, -6.034843794865,   4.573028564873, 10.363166029094,  -3.060761887966,-13.427985421598
		};

		const int stride = 2*RE_IM_STRIDE, dim = stride<<4;
		double c1,c2,c3,c4,c5,c6,c7,c8,c9,cA,cB,cC,cD,cE,cF, s1,s2,s3,s4,s5,s6,s7,s8,s9,sA,sB,sC,sD,sE,sF;
		static double *a,*a_ptr;	// Dimension = number of scalar-doubles in 16 vector-complex in SIMD build mode
		a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE);	if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
		a     = ALIGN_VEC_DBL(a_ptr);
		ASSERT(HERE, ((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!");
	#ifdef USE_SSE2
		const int pfetch_dist = 0;
		int pfetch_addr = 0;	// Don't care about pfetch in this local-mem context, so just set these = 0
		static vec_dbl *sc_arr = 0x0, *sc_ptr;
		double *add0,*add1,*add2;	/* Addresses into array sections */
		vec_dbl *c_tmp,*s_tmp, *i0,*i1,*i2,*i3, *o0,*o1,*o2,*o3;
		static vec_dbl *cc0, *ss0, *isrt2, *two, *r00;
		sc_arr = ALLOC_VEC_DBL(sc_arr, 72);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
		sc_ptr = ALIGN_VEC_DBL(sc_arr);
		ASSERT(HERE, ((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!");
		r00 = sc_ptr + 0x00;	  isrt2 = sc_ptr + 0x20;
									cc0 = sc_ptr + 0x21;
									ss0 = sc_ptr + 0x22;
									two = sc_ptr + 0x43;
		/* These remain fixed: */
		VEC_DBL_INIT(isrt2, ISRT2);
	  #if defined(USE_AVX2) && !defined(REFACTOR_4DFT_3TWIDDLE)
		VEC_DBL_INIT(two  , 1.0);
		// cc0,ss0 inited below for AVX2
	  #else
		VEC_DBL_INIT(two  , 2.0);
		VEC_DBL_INIT(cc0  , c);
		VEC_DBL_INIT(ss0  , s);
	  #endif
	#endif	// USE_SSE2 ?

		// Do these timing-test DFTs in-place, i.e. p1 = #doubles in a pair of vec_dbl:
		p1 = dim>>4;	// Set stride equal to the AVX complex-vec_dbl value, 2*4
		p2 = p1 +p1;
		p3 = p2 +p1;
		p4 = p3 +p1;
		p5 = p4 +p1;
		p6 = p5 +p1;
		p7 = p6 +p1;
		p8 = p7 +p1;
		p9 = p8 +p1;
		pA = p9 +p1;
		pB = pA +p1;
		pC = pB +p1;
		pD = pC +p1;
		pE = pD +p1;
		pF = pE +p1;

		// Twiddles for the purpose of this timing-test are w1-15, with w := exp(2*Pi*I/2^14):
		c1 = 0.9999999264657178511447314807; s1 = 0.0003834951875713955890724616812;
		c2 = 0.9999997058628822191602282177; s2 = 0.0007669903187427045269385683580;
		c3 = 0.9999993381915255477888066109; s3 = 0.001150485337113848457071735047;
		c4 = 0.9999988234517019099290257101; s4 = 0.001533980186284765612303697150;
		c5 = 0.9999981616434870076277347923; s5 = 0.001917474809855419109500620455;
		c6 = 0.9999973527669781720689399696; s6 = 0.002300969151425805244235552264;
		c7 = 0.9999963968222943635594898320; s7 = 0.002684463154595961785455992532;
		c8 = 0.9999952938095761715115801256; s8 = 0.003067956762965976270145365491;
		c9 = 0.9999940437289858144220774704; s9 = 0.003451449920135994297977171937;
		cA = 0.9999926465807071398486621178; sA = 0.003834942569706227825960602960;
		cB = 0.9999911023649456243827897550; sB = 0.004218434655276963463076393843;
		cC = 0.9999894110819283736194723572; sC = 0.004601926120448570764901699143;
		cD = 0.9999875727319041221238780943; sD = 0.004985416908821510528222769585;
		cE = 0.9999855873151432333947502950; sE = 0.005368906963996343085634209014;
		cF = 0.9999834548319376998246454755; sF = 0.005752396229573736600123594041;
		// In the refactor-case we restructure these and overwrite w5-7,9-B,D-F as shown:
	#ifdef REFACTOR_4DFT_3TWIDDLE
		// w1.E^2 = [c1,s1].[1,I]/sqrt2 = [c1-s1,c1+s1]*ISRT2;
		// w2.E^4 = [c2,s2].I = [-s2,c2];
		// w3.E^6 = [c3,s3].[-1,I]/sqrt2 = [-c3-s3,c3-s3]*ISRT2
		c5 =  (c1-s1)*ISRT2;	s5 = (c1+s1)*ISRT2;
		c6 = -s2;				s6 = c2;
		c7 = -(c3+s3)*ISRT2;	s7 = (c3-s3)*ISRT2;

		// w1.E   = [c1,s1].[c,s] = [c1.c-s1.s,c1.s+s1.c];
		// w2.E^2 = [c2,s2].[1,I]/sqrt2 = [c2-s2,c2+s2]*ISRT2;
		// w3.E^3 = [c3,s3].[s,c] = [c3.s-s3.c,c3.c+s3.s]
		c9 = c1*c-s1*s;			s9 = c1*s+s1*c;
		cA = (c2-s2)*ISRT2;		sA = (c2+s2)*ISRT2;
		cB = c3*s-s3*c;			sB = c3*c+s3*s;

		// w1.E^3 = [c1,s1].[s,c] = [c1.s-s1.c,c1.c+s1.s];
		// w2.E^6 = [c2,s2].[-1,I]/sqrt2 = [-c2-s2,c2-s2]*ISRT2;
		// w3.-E  = [c3,s3].[-c,-s] = [-c3.c+s3.s,-c3.s-s3.c]
		cD = c1*s-s1*c;			sD = c1*c+s1*s;
		cE = -(c2+s2)*ISRT2;	sE = (c2-s2)*ISRT2;
		cF = -c3*c+s3*s;		sF = -(c3*s+s3*c);
	/*
		// And now do batch-div to setup for 'cotangent'-CMUL scheme:
		c1 /= s1;
		c2 /= s2;
		c3 /= s3;
		c4 /= s4;
		c5 /= s5;
		c6 /= s6;
		c7 /= s7;
		c8 /= s8;
		c9 /= s9;
		cA /= sA;
		cB /= sB;
		cC /= sC;
		cD /= sD;
		cE /= sE;
		cF /= sF;
	*/
	#endif

	#ifdef USE_SSE2

	  #ifdef USE_AVX2	// AVX2/FMA needs tangent-form twiddles:
		// In AVX2/FMA mode, since we need to replace most of the raw sincos data with derived ones,
		// simply place one copy of each scalar-double in a double-sized slot of the local memory.
		// We will be using AVX2/FMA-based Newtonian iterative inversion on the 16 doubles whose
		// multiplicative inverse is needed (the real part of the basic root of unity c and of the
		// 15 complex twiddles, c1-15), so store those in packed form in 4 AVX-register-sized
		// contiguous memory locations, and the others in a separate chunk of memory. After the
		// vector-iterative inversion we'll need to combine the 2 sets of data and place (in suitable
		// vector-register-sized broadcast form) into their final SIMD-suitable memory slots.

	  clock1 = getRealTime();
	  for(i = 0; i < imax; i++) {	// repeater loop
		add0 = (double *)cc0;	// add0 points to 16 cos-data-to-be-inverted; Need a double-ptr on lhs here
		add1 = add0 + 16;	// add1 points to block of memory temporarily used to store the corresponding sine data
		add2 = add0 + 32;	// add2 points to block of memory temporarily used to store the 11 [0-padded to 12]
							//	cosine data which need to be divided by other cosines (i.e. multiplied by inverses)
		/* The add2-addressed cosine ratios are arranged in 3 YMM-register/memory-sized slots like so;
		  once we have filled 4 YMMs with inverses 1/[c3,c1-15] and used those to get the 16 tangents (1st set = 1/c3
		  and discarded) we will do as described in the right column to set up for the cosine-ratios computation:

			double __c31 = __c3/__c1;
			double __c51 = __c5/__c1;
			double __c62 = __c6/__c2;
			[0 pad]						shuffle YMM with 1/[c3,c1,c2,c3] to get 1/[c1,c1,c2,c3], then *= [c3,c5,c6,0]

			double __c73 = __c7/__c3;
			double __c91 = __c9/__c1;
			double __cA2 = __cA/__c2;
			double __cB3 = __cB/__c3;	initialize YMM with 1/[c3,c1,c2,c3], then *= [c7,c9,cA,cB]

			double __cC4 = __cC/__c4;
			double __cD5 = __cD/__c5;
			double __cE6 = __cE/__c6;
			double __cF7 = __cF/__c7;	Multiply YMM with 1/[c4-7] *= [cC-F]
		*/
		// Since tan0 defined as const, use this pair of double slots to hold 1/c3 (via c3,1 on input, then invert c3 and multiply
		// them together), which extra 1/c3 copy saves some really awkward permuting, at least in terms of the idiotic x86 ISA.
		*add0++ = 0.0;	*add1++ = 1.0;
		*add0++ = c1;	// c1, for inversion
		*add1++ = s1;	// s1  slot will hold __r1 = s1 /c1
		*add0++ = c2;	// c2, for inversion
		*add1++ = s2;	// s2  slot will hold __r2 = s2 /c2
		*(add0-3) = c3;	// c3, for inversion ...
		*add0++   = c3;	// place extra copy in 0-slot as described above - put on separate line to avoid ambiguity of *(add0-3) = *add0++ = ...
		*add1++ = s3;	// s3  slot will hold __r3 = s3 /c3
		*add2++ = c3;	// c3, will get multiplied by 1/c1 to yield __c31
		*add0++ = c4;	// c4, for inversion
		*add1++ = s4;	// s4  slot will hold __r4 = s4 /c4
		*add0++ = c5;	// c5, for inversion
		*add1++ = s5;	// s5  slot will hold __r5 = s5 /c5
		*add2++ = c5;	// c5, will get multiplied by 1/c1 to yield __c51
		*add0++ = c6;	// c6, for inversion
		*add1++ = s6;	// s6  slot will hold __r6 = s6 /c6
		*add2++ = c6;	// c6, will get multiplied by 1/c2 to yield __c62
		*add2++ = 0.0;	// 0-pad will get multiplied by 1/c3 term, remains 0-pad.
		*add0++ = c7;	// c7, for inversion
		*add1++ = s7;	// s7  slot will hold __r7 = s7 /c7
		*add2++ = c7;	// c7, will get multiplied by 1/c3 to yield __c73
		*add0++ = c8;	// c8, for inversion
		*add1++ = s8;	// s8  slot will hold __r8 = s8 /c8
		*add0++ = c9;	// c9, for inversion
		*add1++ = s9;	// s9  slot will hold __r9 = s9 /c9
		*add2++ = c9;	// c9, will get multiplied by 1/c1 to yield __c91
		*add0++ = cA;	// c10, for inversion
		*add1++ = sA;	// s10 slot will hold __rA = s10/c10
		*add2++ = cA;	// c10, will get multiplied by 1/c2 to yield __cA2
		*add0++ = cB;	// c11, for inversion
		*add1++ = sB;	// s11 slot will hold __rB = s11/c11
		*add2++ = cB;	// c11, will get multiplied by 1/c3 to yield __cB3
		*add0++ = cC;	// c12, for inversion
		*add1++ = sC;	// s12 slot will hold __rC = s12/c12
		*add2++ = cC;	// c12, will get multiplied by 1/c4 to yield __cC4
		*add0++ = cD;	// c13, for inversion
		*add1++ = sD;	// s13 slot will hold __rD = s13/c13
		*add2++ = cD;	// c13, will get multiplied by 1/c5 to yield __cD5
		*add0++ = cE;	// c14, for inversion
		*add1++ = sE;	// s14 slot will hold __rE = s14/c14
		*add2++ = cE;	// c14, will get multiplied by 1/c6 to yield __cE6
		*add0++ = cF;	// c15, for inversion
		*add1++ = sF;	// s15 slot will hold __rF = s15/c15
		*add2++ = cF;	// c15, will get multiplied by 1/c7 to yield __cF7
		/*
		At this point, the 11 ymm-sized [32-byte] chunks starting at &cc0 contain the following scalar-double data:

		0:	c3,c1-3
		1:	c4-7
		2:	c8-11
		3:	c12-c15
		4:	1.0,s1-3
		5:	s4-7
		6:	s8-11
		7:	s12-s15
		8:	c3,5,6,[0-pad]
		9:	c7,9-B
		A:	cC-F
		*/
		c_tmp = &c; s_tmp = &tan;	// GCC/Clang don't allow cd_address-taking inlined in arglist of macros, so do it here
		RADIX16_COMPUTE_FMA_SINCOS_DIF(cc0,two, c_tmp,s_tmp);
	  }	// repeater loop
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s: Time for %u FMA DIF tan-twiddles setup calls =%s [tdiff = %20.10e]\n",func, imax, get_time_str(tdiff), tdiff);

	  #elif defined(REFACTOR_4DFT_3TWIDDLE)
		/* Sincos data stored in terms of the following 5 contiguous-data triplets:
			c4,s4, c8,s8, cC,sC
			c1,s1, c2,s2, c3,s3
			c5,s5, c6,s6, c7,s7
			c9,s9, cA,sA, cB,sB
			cD,sD, cE,sE, cF,sF .
		Note that due to my layout of the SSE2_RADIX_04_DIF_3TWIDDLE_X2-macro arglist,
		we need to swap the order of the first 2 sincos-pairs of each triplet:
		*/
		c_tmp = cc0; s_tmp = c_tmp+1;	/* c0,s0 */
		VEC_DBL_INIT(c_tmp, c8);	VEC_DBL_INIT(s_tmp, s8);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c4);	VEC_DBL_INIT(s_tmp, s4);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cC);	VEC_DBL_INIT(s_tmp, sC);	c_tmp+=2; s_tmp+=2;

		VEC_DBL_INIT(c_tmp, c2);	VEC_DBL_INIT(s_tmp, s2);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c1);	VEC_DBL_INIT(s_tmp, s1);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c3);	VEC_DBL_INIT(s_tmp, s3);	c_tmp+=2; s_tmp+=2;

		VEC_DBL_INIT(c_tmp, c6);	VEC_DBL_INIT(s_tmp, s6);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c5);	VEC_DBL_INIT(s_tmp, s5);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c7);	VEC_DBL_INIT(s_tmp, s7);	c_tmp+=2; s_tmp+=2;

		VEC_DBL_INIT(c_tmp, cA);	VEC_DBL_INIT(s_tmp, sA);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c9);	VEC_DBL_INIT(s_tmp, s9);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cB);	VEC_DBL_INIT(s_tmp, sB);	c_tmp+=2; s_tmp+=2;

		VEC_DBL_INIT(c_tmp, cE);	VEC_DBL_INIT(s_tmp, sE);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cD);	VEC_DBL_INIT(s_tmp, sD);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cF);	VEC_DBL_INIT(s_tmp, sF);	c_tmp+=2; s_tmp+=2;

	  #else

		// Sincos data stored in BRed form in SSE2 local-data layout:
		c_tmp = cc0 + 0x02; s_tmp = c_tmp+1;	/* c0,s0 */
		rt = 1.0; it = 0.0;
		VEC_DBL_INIT(c_tmp, rt);	VEC_DBL_INIT(s_tmp, it);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c8);	VEC_DBL_INIT(s_tmp, s8);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c4);	VEC_DBL_INIT(s_tmp, s4);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cC);	VEC_DBL_INIT(s_tmp, sC);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c2);	VEC_DBL_INIT(s_tmp, s2);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cA);	VEC_DBL_INIT(s_tmp, sA);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c6);	VEC_DBL_INIT(s_tmp, s6);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cE);	VEC_DBL_INIT(s_tmp, sE);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c1);	VEC_DBL_INIT(s_tmp, s1);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c9);	VEC_DBL_INIT(s_tmp, s9);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c5);	VEC_DBL_INIT(s_tmp, s5);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cD);	VEC_DBL_INIT(s_tmp, sD);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c3);	VEC_DBL_INIT(s_tmp, s3);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cB);	VEC_DBL_INIT(s_tmp, sB);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, c7);	VEC_DBL_INIT(s_tmp, s7);	c_tmp+=2; s_tmp+=2;
		VEC_DBL_INIT(c_tmp, cF);	VEC_DBL_INIT(s_tmp, sF);	c_tmp+=2; s_tmp+=2;

	  #endif

	#endif

		//******************* Timing loop for Radix-16 DIF transform macro: *******************
		clock1 = getRealTime();
		for(i = 0; i < imax; i++)
		{
			// Copy digits of Pi-data into our vec_dbl inputs:
			for(j1 = 0, j2 = 0; j1 < dim; j1 += stride, j2 += 8)	// j2 is base-index into ran[] input array
			{
		/* The normal index-munging takes way too many cycles in this context, so inline it via 8-way loop unroll:
			#ifdef USE_AVX
				j1 = (j & mask02) + br8[j&7];
			#elif defined(USE_SSE2)
				j1 = (j & mask01) + br4[j&3];
			#else
				j1 = j;
			#endif
				a[j1] = ran[j];
		*/
			#ifdef USE_AVX512	// Set this up so that AVX-512 can use the same ref-data as AVX:
				a[j1   ] = ran[j2  ];	/* Re0 */	a[j1+ 4] = 0;	/* Re4 */
				a[j1+ 1] = ran[j2+2];	/* Re1 */	a[j1+ 5] = 0;	/* Re5 */
				a[j1+ 2] = ran[j2+4];	/* Re2 */	a[j1+ 6] = 0;	/* Re6 */
				a[j1+ 3] = ran[j2+6];	/* Re3 */	a[j1+ 7] = 0;	/* Re7 */
				a[j1+ 8] = ran[j2+1];	/* Im0 */	a[j1+12] = 0;	/* Im4 */
				a[j1+ 9] = ran[j2+3];	/* Im1 */	a[j1+13] = 0;	/* Im5 */
				a[j1+10] = ran[j2+5];	/* Im2 */	a[j1+14] = 0;	/* Im6 */
				a[j1+11] = ran[j2+7];	/* Im3 */	a[j1+15] = 0;	/* Im7 */
			#elif defined(USE_AVX)
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+2];	// Re1
				a[j1+2] = ran[j2+4];	// Re2
				a[j1+3] = ran[j2+6];	// Re3
				a[j1+4] = ran[j2+1];	// Im0
				a[j1+5] = ran[j2+3];	// Im1
				a[j1+6] = ran[j2+5];	// Im2
				a[j1+7] = ran[j2+7];	// Im3
			#elif defined(USE_SSE2)
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+2];	// Re1
				a[j1+2] = ran[j2+1];	// Im0
				a[j1+3] = ran[j2+3];	// Im1
				a[j1+4] = ran[j2+4];	// Re2
				a[j1+5] = ran[j2+6];	// Re3
				a[j1+6] = ran[j2+5];	// Im2
				a[j1+7] = ran[j2+7];	// Im3
			#else
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+1];	// Im0
				a[j1+2] = ran[j2+2];	// Re1
				a[j1+3] = ran[j2+3];	// Im1
				a[j1+4] = ran[j2+4];	// Re2
				a[j1+5] = ran[j2+5];	// Im2
				a[j1+6] = ran[j2+6];	// Re3
				a[j1+7] = ran[j2+7];	// Im3
			#endif
			}

			j1 = 0; j2 = RE_IM_STRIDE;

// 19 May 2016: 10^7-loop timings, 1-threaded on my 2GHz Core2Duo:
//												comments
// no-DFT timing						0.969
// initial timing						2.467	(2.467-0.969)*200 = 300 cycles ... SSE2_RADIX_04_DIF_3TWIDDLE_X2-based (4 calls at 60ish cycles each) should beat that handily
// SSE2_RADIX_04_DIF_3TWIDDLE_X2-based	2.730	ack! [And commenting-out the 4 dft-4 macro calls gives 0.980 sec, insignificantly different from the original no-DFT timing, i.e. the C-code pointer math is not at fault.]
// fuse four 4-dft macros into radix-16	2.224	(2.224-0.969)*200 = 250 cycles ... that's more like it!
		#ifdef USE_SSE2

		  #ifdef USE_AVX2	// AVX2/FMA needs tangent-form twiddles:

			vec_dbl *add = (vec_dbl *)a;
			SSE2_RADIX16_DIF_TWIDDLE_1(add,p1,p2,p3,p4,p8,pC,r00,isrt2,pfetch_addr,pfetch_dist);
#if 0
10^6-timing:	setup	+=DIF	DIF-only
avx2:			.208	.380	.172 [224 cycles]
avx512:			.296	.472	.176 [229 cycles]	further fiddling with the addressing parts of this macro -> 0.460, 16 cycles faster!
													[avx512 Tan-twiddles precomp = 140 cycles]
#endif
		  #elif defined(REFACTOR_4DFT_3TWIDDLE)

		   #if 1

			vec_dbl *add = (vec_dbl *)a;
			SSE2_RADIX16_DIF_TWIDDLE_V2(add,p1,p2,p3,p4,p8,pC,r00,two,cc0,pfetch_addr,pfetch_dist);
/*
#ifdef USE_AVX
	dptr = (double *)r00;
	printf("Intermediates:\n");
	for(i = 0; i < 16; i++, dptr += 8) {
		printf("%2u Re.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f, Im.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f\n",i,*dptr,*(dptr+1),*(dptr+2),*(dptr+3),*(dptr+4),*(dptr+5),*(dptr+6),*(dptr+7));
	}
	exit(0);
#else
	dptr = (double *)r00;
	printf("Intermediates:\n");
	for(i = 0; i < 16; i++, dptr += 4) {
		printf("%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",i,*dptr,*(dptr+1),*(dptr+2),*(dptr+3));
	}
	exit(0);
	#if 0
		SSE2, Intermediates:
		 0 Re.[d0,d1] =  12.996843798630, 24.930865132335, Im.[d0,d1] =   3.041415000035, 24.061237911217
		 1 Re.[d0,d1] =  -1.967770045489, -3.960148397371, Im.[d0,d1] =  -6.012197708235, -5.001442285342
		 2 Re.[d0,d1] =  -0.996872035772, -2.980018327208, Im.[d0,d1] =  -1.023007259458, -6.018361815583
		 3 Re.[d0,d1] =   1.967798282632, -1.990698407756, Im.[d0,d1] =   7.993789967657, -9.041433810292
		 4 Re.[d0,d1] =  17.937037540455, 15.972338992203, Im.[d0,d1] =  20.044361826623, 14.027564530942
		 5 Re.[d0,d1] =   0.026036426331,  0.015342118897, Im.[d0,d1] =  -1.992277237471, -0.012241272931
		 6 Re.[d0,d1] =   4.013790439910, 12.015332706460, Im.[d0,d1] =   2.010786096064, -1.990767874548
		 7 Re.[d0,d1] = -13.976864406696,  3.996986182440, Im.[d0,d1] =  -8.062870685215,  3.975444616537
		 8 Re.[d0,d1] =  15.947773027030, 21.940126671442, Im.[d0,d1] =  19.038248242231, 25.038232997648
		 9 Re.[d0,d1] =   1.022960222909,  2.004558386725, Im.[d0,d1] =  -1.990745621227, 11.007687462509
		10 Re.[d0,d1] =   6.039898671633,  4.022982548356, Im.[d0,d1] =  -9.001451585837,  3.010797836274
		11 Re.[d0,d1] =  -3.010631921572, -7.967667606523, Im.[d0,d1] =   3.953948964833, -7.056718296431
		12 Re.[d0,d1] =  24.987608157941, 30.970725010455, Im.[d0,d1] =   9.067480879677, 13.070512349057
		13 Re.[d0,d1] =  -8.972346109218,  2.036811854087, Im.[d0,d1] =   0.969395724120, -0.007571096299
		14 Re.[d0,d1] =  -2.999955284040, -4.989179812937, Im.[d0,d1] =   0.981587603769, -1.039861018570
		15 Re.[d0,d1] =  -1.015306764683,  3.981642948395, Im.[d0,d1] =   0.981535792434, -0.023080234188
	#endif
#endif
*/
		   #else

			/* Pass 1: */
			j = p2*8;	// Set j to the bytewise address offset between the pointers to the 1st,2nd 4-DFT's inputs
			k = 0x80;	// Set k to the bytewise address offset between the pointers to the 1st,2nd 4-DFT's outputs
			// Pass1-DFTs all use same twiddle-triplet, so lit-byte address offset between ptrs to 1st,2nd DFT's twiddles = 0
			i0 = (vec_dbl *)a; i1 = (vec_dbl *)(a+p4); i2 = (vec_dbl *)(a+p8); i3 = (vec_dbl *)(a+pC);
			o0 = r00; o1 = r00+2; o2 = r00+4; o3 = r00+6;
			c_tmp = cc0;	/* c8,4,C */
			SSE2_RADIX_04_DIF_3TWIDDLE_X2(i0,i1,i2,i3,j, two,c_tmp,   0, o0,o1,o2,o3,k)

			i0 = (vec_dbl *)(a+p1); i1 = (vec_dbl *)(a+p5); i2 = (vec_dbl *)(a+p9); i3 = (vec_dbl *)(a+pD);
			o0 += 16; o1 += 16; o2 += 16; o3 += 16;
			SSE2_RADIX_04_DIF_3TWIDDLE_X2(i0,i1,i2,i3,j, two,c_tmp,   0, o0,o1,o2,o3,k)

			/* Pass 2: */
			j = 0x40;	// Set j to the bytewise address offset between the pointers to the 1st,2nd 4-DFT's inputs
			k = p4*8;	// Set k to the bytewise address offset between the pointers to the 1st,2nd 4-DFT's outputs
			// 0x60 = literal-bytewise address offset between the pointers to the first and second DFT's twiddles:
			i0 = r00; i1 = r00+16; i2 = r00+8; i3 = r00+24;
			o0 = (vec_dbl *)a; o1 = (vec_dbl *)(a+p2); o2 = (vec_dbl *)(a+p1); o3 = (vec_dbl *)(a+p3);
			c_tmp += 6;	/* c2,1,3 */
			SSE2_RADIX_04_DIF_3TWIDDLE_X2(i0,i1,i2,i3,j, two,c_tmp,0x60, o0,o1,o2,o3,k)

			i0 += 2; i1 += 2; i2 += 2; i3 += 2;
			o0 = (vec_dbl *)(a+p8); o1 = (vec_dbl *)(a+pA); o2 = (vec_dbl *)(a+p9); o3 = (vec_dbl *)(a+pB);
			c_tmp += 12;	/* cA,9,B */
			SSE2_RADIX_04_DIF_3TWIDDLE_X2(i0,i1,i2,i3,j, two,c_tmp,0x60, o0,o1,o2,o3,k)
		   #endif

		  #else	// REFACTOR_4DFT_3TWIDDLE = false:

			vec_dbl *add = (vec_dbl *)a;
			SSE2_RADIX16_DIF_TWIDDLE(add,p1,p2,p3,p4,p8,pC,r00,isrt2,pfetch_addr,pfetch_dist);

		  #endif

		#else	// USE_SSE2 = false:

		  #ifdef REFACTOR_4DFT_3TWIDDLE
			/*
			Pass 1:
			y0-3 = radix_4_3twid(x0,4,8,c; w4,8,c)
			y4-7 = radix_4_3twid(x2,6,a,e; w4,8,c)
			y8-b = radix_4_3twid(x1,5,9,d; w4,8,c)
			yc-f = radix_4_3twid(x3,7,b,f; w4,8,c)
			*/
		  	RADIX_04_DIF_3TWIDDLE(a[j1   ],a[j2   ],a[j1+p4],a[j2+p4],a[j1+p8],a[j2+p8],a[j1+pC],a[j2+pC], t0 ,t1 ,t2 ,t3 ,t4 ,t5 ,t6 ,t7 , c4,s4,c8,s8,cC,sC, rt,it);
		  	RADIX_04_DIF_3TWIDDLE(a[j1+p2],a[j2+p2],a[j1+p6],a[j2+p6],a[j1+pA],a[j2+pA],a[j1+pE],a[j2+pE], t8, t9 ,t10,t11,t12,t13,t14,t15, c4,s4,c8,s8,cC,sC, rt,it);
		  	RADIX_04_DIF_3TWIDDLE(a[j1+p1],a[j2+p1],a[j1+p5],a[j2+p5],a[j1+p9],a[j2+p9],a[j1+pD],a[j2+pD], t16,t17,t18,t19,t20,t21,t22,t23, c4,s4,c8,s8,cC,sC, rt,it);
		  	RADIX_04_DIF_3TWIDDLE(a[j1+p3],a[j2+p3],a[j1+p7],a[j2+p7],a[j1+pB],a[j2+pB],a[j1+pF],a[j2+pF], t24,t25,t26,t27,t28,t29,t30,t31, c4,s4,c8,s8,cC,sC, rt,it);

			/*
			Pass 2:
			z0-3 = radix_4_3twid(y0,8,4,c; w1    ,w2      ,w3      )
			z4-7 = radix_4_3twid(y2,a,6,e; w1.E^2,w2.I    ,w3.I.E^2)
			z8-b = radix_4_3twid(y1,9,5,d; w1.E^1,w2.  E^2,w3.  E^3)
			zc-f = radix_4_3twid(y3,b,7,f; w1.E^3,w2.I.E^2,w3.  E  )
			*/
		  	RADIX_04_DIF_3TWIDDLE(t0 ,t1 ,t16,t17,t8 ,t9 ,t24,t25, a[j1   ],a[j2   ],a[j1+p2],a[j2+p2],a[j1+p1],a[j2+p1],a[j1+p3],a[j2+p3], c1,s1, c2,s2, c3,s3, rt,it);
		  	RADIX_04_DIF_3TWIDDLE(t4 ,t5 ,t20,t21,t12,t13,t28,t29, a[j1+p4],a[j2+p4],a[j1+p6],a[j2+p6],a[j1+p5],a[j2+p5],a[j1+p7],a[j2+p7], c5,s5, c6,s6, c7,s7, rt,it);
		  	RADIX_04_DIF_3TWIDDLE(t2 ,t3 ,t18,t19,t10,t11,t26,t27, a[j1+p8],a[j2+p8],a[j1+pA],a[j2+pA],a[j1+p9],a[j2+p9],a[j1+pB],a[j2+pB], c9,s9, cA,sA, cB,sB, rt,it);
		  	RADIX_04_DIF_3TWIDDLE(t6 ,t7 ,t22,t23,t14,t15,t30,t31, a[j1+pC],a[j2+pC],a[j1+pE],a[j2+pE],a[j1+pD],a[j2+pD],a[j1+pF],a[j2+pF], cD,sD, cE,sE, cF,sF, rt,it);

		  #else

			RADIX_16_DIF_TWIDDLE(
				a[j1   ],a[j2   ],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p4+p1],a[j2+p4+p1],a[j1+p4+p2],a[j2+p4+p2],a[j1+p4+p3],a[j2+p4+p3],a[j1+p8],a[j2+p8],a[j1+p8+p1],a[j2+p8+p1],a[j1+p8+p2],a[j2+p8+p2],a[j1+p8+p3],a[j2+p8+p3],a[j1+pC],a[j2+pC],a[j1+pC+p1],a[j2+pC+p1],a[j1+pC+p2],a[j2+pC+p2],a[j1+pC+p3],a[j2+pC+p3],
				c1,s1,c2,s2,c3,s3,c4,s4,c5,s5,c6,s6,c7,s7,c8,s8,c9,s9,cA,sA,cB,sB,cC,sC,cD,sD,cE,sE,cF,sF,
				c,s);

		  #endif
/*
printf("DIF Outputs =\n0  [%16.12f,%16.12f]\n1  [%16.12f,%16.12f]\n2  [%16.12f,%16.12f]\n3  [%16.12f,%16.12f]\n4  [%16.12f,%16.12f]\n5  [%16.12f,%16.12f]\n6  [%16.12f,%16.12f]\n7  [%16.12f,%16.12f]\n8  [%16.12f,%16.12f]\n9  [%16.12f,%16.12f]\n10 [%16.12f,%16.12f]\n11 [%16.12f,%16.12f]\n12 [%16.12f,%16.12f]\n13 [%16.12f,%16.12f]\n14 [%16.12f,%16.12f]\n15 [%16.12f,%16.12f]\n",
a[j1   ],a[j2   ],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p4+p1],a[j2+p4+p1],a[j1+p4+p2],a[j2+p4+p2],a[j1+p4+p3],a[j2+p4+p3],a[j1+p8],a[j2+p8],a[j1+p8+p1],a[j2+p8+p1],a[j1+p8+p2],a[j2+p8+p2],a[j1+p8+p3],a[j2+p8+p3],a[j1+pC],a[j2+pC],a[j1+pC+p1],a[j2+pC+p1],a[j1+pC+p2],a[j2+pC+p2],a[j1+pC+p3],a[j2+pC+p3]);
exit(0);
*/
		#endif
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s: Time for %u DIF macro calls =%s [tdiff = %20.10e]\n",func, imax, get_time_str(tdiff), tdiff);

		// Check outputs vs ref-data:
		nerr = 0;	dtmp = avg_err = 0.0;
		for(j1 = 0, j2 = 0; j1 < dim; j1 += stride, j2 += 8)	// j2 is base-index into ref-array
		{
			j = j1+RE_IM_STRIDE;
		#ifdef USE_AVX	// Since we set up AVX-512 mode to only use nonzero data in lower 4 double-slots of each 8-vector, can use same code here:
		//	printf("Out[%2u] Re.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f, Im.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f\n",j1/p1,a[j1],a[j1+1],a[j1+2],a[j1+3],a[j],a[j+1],a[j+2],a[j+3]);
		//	printf("Ref[%2u] Re.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f, Im.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f\n",j1/p1,ref1[j2],ref1[j2+2],ref1[j2+4],ref1[j2+6],ref1[j2+1],ref1[j2+3],ref1[j2+5],ref1[j2+7]);
			dtmp = fabs(a[j1  ] - ref1[j2  ]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+1] - ref1[j2+2]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d1\n");*/ nerr++; };
			dtmp = fabs(a[j1+2] - ref1[j2+4]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d2\n");*/ nerr++; };
			dtmp = fabs(a[j1+3] - ref1[j2+6]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d3\n");*/ nerr++; };
			dtmp = fabs(a[j   ] - ref1[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; };
			dtmp = fabs(a[j +1] - ref1[j2+3]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d1\n");*/ nerr++; };
			dtmp = fabs(a[j +2] - ref1[j2+5]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d2\n");*/ nerr++; };
			dtmp = fabs(a[j +3] - ref1[j2+7]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d3\n");*/ nerr++; };
		#elif defined(USE_SSE2)
		//	printf("Out%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",j1/p1,a[j1],a[j1+1],a[j1+2],a[j1+3]);
		//	printf("Ref%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",j1/p1,ref1[j2],ref1[j2+2],ref1[j2+1],ref1[j2+3]);
			dtmp = fabs(a[j1  ] - ref1[j2  ]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+1] - ref1[j2+2]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d1\n");*/ nerr++; };
			dtmp = fabs(a[j1+2] - ref1[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+3] - ref1[j2+3]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d1\n");*/ nerr++; };
		#else
		//	printf("Out%2u Re,Im = %16.12f,%16.12f\n",j1/p1,a[j1],a[j1+1]);
			dtmp = fabs(a[j1  ] - ref1[j2  ]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+1] - ref1[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; };
		#endif
		}
printf("DIF: nerr = %u, ",nerr);
		ASSERT(HERE, nerr == 0, "DIF Outputs mismatch ref-data!");
		printf("\tSummed roundoff error = %20.10e]\n",avg_err);

		//******************* Timing loop for Radix-16 DIT transform macro: *******************
	  #ifdef USE_AVX2	// AVX2/FMA needs tangent-form twiddles:
		// In AVX2/FMA mode, since we need to replace most of the raw sincos data with derived ones,
		// simply place one copy of each computed double in a double-sized slot of the local memory.
		// We will be using AVX2/FMA-based Newtonian iterative inversion on the 16 doubles whose
		// multiplicative inverse is needed (the real part of the basic root of unity c and of the
		// 15 complex twiddles, c1-15), so store those in packed form in 4 AVX-register-sized
		// contiguous memory locations, and the others in a separate chunk of memory. After the
		// vector-iterative inversion we'll need to combine the 2 sets of data and place (in quadruplicate)
		// into their final SIMD-suitable memory slots.

	  clock1 = getRealTime();
	  for(i = 0; i < imax; i++) {	// repeater loop
		add0 = (double *)cc0;	// add0 points to 16 cos-data-to-be-inverted; Need a double-ptr on lhs here
		add1 = add0 + 16;		// add1 points to block of memory temporarily used to store the corresponding sine data
		*add0++ = c;	// Since tan0 defined as const, we could init these directly, but init with c0,s0 anyway
		*add1++ = s;	// and use result as a check on the accuracy of the FMA-based Newton iterative inversion.

		*add0++ = c1;	// c1, for inversion
		*add1++ = s1;	// s1  slot will hold __r1 = s1 /c1
		*add0++ = c2;	// c2, for inversion
		*add1++ = s2;	// s2  slot will hold __r2 = s2 /c2
		*add0++ = c3;	// c3, for inversion
		*add1++ = s3;	// s3  slot will hold __r3 = s3 /c3
		*add0++ = c4;	// c4, for inversion
		*add1++ = s4;	// s4  slot will hold __r4 = s4 /c4
		*add0++ = c5;	// c5, for inversion
		*add1++ = s5;	// s5  slot will hold __r5 = s5 /c5
		*add0++ = c6;	// c6, for inversion
		*add1++ = s6;	// s6  slot will hold __r6 = s6 /c6
		*add0++ = c7;	// c7, for inversion
		*add1++ = s7;	// s7  slot will hold __r7 = s7 /c7
		*add0++ = c8;	// c8, for inversion
		*add1++ = s8;	// s8  slot will hold __r8 = s8 /c8
		*add0++ = c9;	// c9, for inversion
		*add1++ = s9;	// s9  slot will hold __r9 = s9 /c9
		*add0++ = cA;	// c10, for inversion
		*add1++ = sA;	// s10 slot will hold __rA = s10/c10
		*add0++ = cB;	// c11, for inversion
		*add1++ = sB;	// s11 slot will hold __rB = s11/c11
		*add0++ = cC;	// c12, for inversion
		*add1++ = sC;	// s12 slot will hold __rC = s12/c12
		*add0++ = cD;	// c13, for inversion
		*add1++ = sD;	// s13 slot will hold __rD = s13/c13
		*add0++ = cE;	// c14, for inversion
		*add1++ = sE;	// s14 slot will hold __rE = s14/c14
		*add0++ = cF;	// c15, for inversion
		*add1++ = sF;	// s15 slot will hold __rF = s15/c15
		/*
		At this point, the 8 ymm-sized [32-byte] chunks starting at &cc0 contain the following scalar-double data:

		0:	c,c1-3		4:	s,s1-3
		1:	c4-7		5:	s4-7
		2:	c8-B		6:	s8-B
		3:	cC-F		7:	sC-F
		*/

		// Now send the cosine terms to the inversion routine, which also does the combine-and-populate-SIMD-slots step.
		RADIX16_COMPUTE_FMA_SINCOS_DIT(cc0,two);
	  }	// repeater loop
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s: Time for %u FMA DIT tan-twiddles setup calls =%s [tdiff = %20.10e]\n",func, imax, get_time_str(tdiff), tdiff);

	  #endif

		clock1 = getRealTime();
		for(i = 0; i < imax; i++)
		{
			// Copy digits of Pi-data into our vec_dbl inputs:
			for(j1 = 0, j2 = 0; j1 < dim; j1 += stride, j2 += 8)	// j2 is base-index into ran[] input array
			{
		/* The normal index-munging takes way too many cycles in this context, so inline it via 8-way loop unroll:
			#ifdef USE_AVX
				j1 = (j & mask02) + br8[j&7];
			#elif defined(USE_SSE2)
				j1 = (j & mask01) + br4[j&3];
			#else
				j1 = j;
			#endif
				a[j1] = ran[j];
		*/
			#ifdef USE_AVX512	// Set this up so that AVX-512 can use the same ref-data as AVX:
				a[j1   ] = ran[j2  ];	/* Re0 */	a[j1+ 4] = 0;	/* Re4 */
				a[j1+ 1] = ran[j2+2];	/* Re1 */	a[j1+ 5] = 0;	/* Re5 */
				a[j1+ 2] = ran[j2+4];	/* Re2 */	a[j1+ 6] = 0;	/* Re6 */
				a[j1+ 3] = ran[j2+6];	/* Re3 */	a[j1+ 7] = 0;	/* Re7 */
				a[j1+ 8] = ran[j2+1];	/* Im0 */	a[j1+12] = 0;	/* Im4 */
				a[j1+ 9] = ran[j2+3];	/* Im1 */	a[j1+13] = 0;	/* Im5 */
				a[j1+10] = ran[j2+5];	/* Im2 */	a[j1+14] = 0;	/* Im6 */
				a[j1+11] = ran[j2+7];	/* Im3 */	a[j1+15] = 0;	/* Im7 */
			#elif defined(USE_AVX)
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+2];	// Re1
				a[j1+2] = ran[j2+4];	// Re2
				a[j1+3] = ran[j2+6];	// Re3
				a[j1+4] = ran[j2+1];	// Im0
				a[j1+5] = ran[j2+3];	// Im1
				a[j1+6] = ran[j2+5];	// Im2
				a[j1+7] = ran[j2+7];	// Im3
			#elif defined(USE_SSE2)
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+2];	// Re1
				a[j1+2] = ran[j2+1];	// Im0
				a[j1+3] = ran[j2+3];	// Im1
				a[j1+4] = ran[j2+4];	// Re2
				a[j1+5] = ran[j2+6];	// Re3
				a[j1+6] = ran[j2+5];	// Im2
				a[j1+7] = ran[j2+7];	// Im3
			#else
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+1];	// Im0
				a[j1+2] = ran[j2+2];	// Re1
				a[j1+3] = ran[j2+3];	// Im1
				a[j1+4] = ran[j2+4];	// Re2
				a[j1+5] = ran[j2+5];	// Im2
				a[j1+6] = ran[j2+6];	// Re3
				a[j1+7] = ran[j2+7];	// Im3
			#endif
			}

			j1 = 0; j2 = RE_IM_STRIDE;

// 23 May 2016: 10^7-loop timings, 1-threaded on my 2GHz Core2Duo:
//												comments
// no-DFT timing						0.982
// 1x 4-DFT 22 addpd 20 mulpd 31 ld/st	1.202	(1.202-0.982)*200 = 44 cycles [24 cycles for opening 16 addpd, 20 for 3 twiddle-cmul (12 mulpd, 6 addpd)]
// 2x 4-DFT, 2-column side-by-side opt	1.312	66 cycles
// current-code timing					2.422	(2.422-0.978)*200 = 288 cycles
// fuse four 4-dft macros into radix-16	2.233	250 cycles ... still 50 cycles to go, but nice speedup nonetheless.
		#ifdef USE_SSE2

		  #ifdef USE_AVX2	// AVX2/FMA needs tangent-form twiddles:

			vec_dbl *add = (vec_dbl *)a;
			SSE2_RADIX16_DIT_TWIDDLE_1(add,p1,p2,p3,p4,p8,pC,r00,cc0,pfetch_addr,pfetch_dist);
#if 0
10^6-timing:	setup	+=DIT	DIT-only
avx2:			.208	.398	.190 [247 cycles]
avx512:			.296	.489	.193 [251 cycles]	further fiddling with the addressing parts of this macro -> 0.476, 17 cycles faster!
													[avx512 Tan-twiddles precomp = 122 cycles]
#endif
		  #elif defined(REFACTOR_4DFT_3TWIDDLE)

		   #if 1

			vec_dbl *add = (vec_dbl *)a;
			SSE2_RADIX16_DIT_TWIDDLE_V2(add,p1,p2,p3,p4,p8,pC,r00,two,cc0,pfetch_addr,pfetch_dist);

		   #else

			/*
			Pass 1:
			y0-3 = radix_4dit_3twid(x0-3; w1    ,w2    ,w3    )
			y4-7 = radix_4dit_3twid(x4-7; w1.E^2,w2.E^4,w3.E^6)
			y8-b = radix_4dit_3twid(x8-b; w1.E^1,w2.E^2,w3.E^3)
			yc-f = radix_4dit_3twid(xc-f; w1.E^3,w2.E^6,w3.E^9)
			*/
			j = p4*8;	// Set j to the bytewise address offset between the pointers to the 1st,2nd 4-DFT's inputs
			k = 0x80;	// Set k to the bytewise address offset between the pointers to the 1st,2nd 4-DFT's outputs
			// 0x60 = literal-bytewise address offset between the pointers to the first and second DFT's twiddles:
			i0 = (vec_dbl *)a; i1 = (vec_dbl *)(a+p1); i2 = (vec_dbl *)(a+p2); i3 = (vec_dbl *)(a+p3);
			o0 = r00; o1 = r00+2; o2 = r00+4; o3 = r00+6;
			c_tmp = cc0+6;	// c2,1,3
			SSE2_RADIX_04_DIT_3TWIDDLE_X2(i0,i1,i2,i3,j, two,c_tmp,0x60, o0,o1,o2,o3,k)

			i0 = (vec_dbl *)(a+p8); i1 = (vec_dbl *)(a+p9); i2 = (vec_dbl *)(a+pA); i3 = (vec_dbl *)(a+pB);
			o0 += 16; o1 += 16; o2 += 16; o3 += 16;
			c_tmp += 12;		// cA,9,B
			SSE2_RADIX_04_DIT_3TWIDDLE_X2(i0,i1,i2,i3,j, two,c_tmp,0x60, o0,o1,o2,o3,k)

			/*
			Pass 2:
			z0,4,8,c = radix_4dit_3twid(y0,4,8,c; w4,8,c)
			z2,6,a,e = radix_4dit_3twid(y2,6,a,e; w4,8,c)
			z1,5,9,d = radix_4dit_3twid(y1,5,9,d; w4,8,c)
			z3,7,b,f = radix_4dit_3twid(y3,7,b,f; w4,8,c)
			*/
			j = 0x40;	// Set j to the bytewise address offset between the pointers to the 1st,2nd 4-DFT's inputs
			k = p2*8;	// Set k to the bytewise address offset between the pointers to the 1st,2nd 4-DFT's outputs
			c_tmp = cc0;	// c8,4,C
			// Pass2-DFTs all use same twiddle-triplet, so lit-byte address offset between ptrs to 1st,2nd DFT's twiddles = 0
			i0 = r00; i1 = r00+8; i2 = r00+16; i3 = r00+24;
			o0 = (vec_dbl *)a; o1 = (vec_dbl *)(a+p4); o2 = (vec_dbl *)(a+p8); o3 = (vec_dbl *)(a+pC);
			SSE2_RADIX_04_DIT_3TWIDDLE_X2(i0,i1,i2,i3,j, two,c_tmp,   0, o0,o1,o2,o3,k)

			i0 += 2; i1 += 2; i2 += 2; i3 += 2;
			o0 = (vec_dbl *)(a+p1); o1 = (vec_dbl *)(a+p5); o2 = (vec_dbl *)(a+p9); o3 = (vec_dbl *)(a+pD);
			SSE2_RADIX_04_DIT_3TWIDDLE_X2(i0,i1,i2,i3,j, two,c_tmp,   0, o0,o1,o2,o3,k)

		   #endif

		  #else

			vec_dbl *add = (vec_dbl *)a;
			SSE2_RADIX16_DIT_TWIDDLE(add,p1,p2,p3,p4,p8,r00,isrt2,pfetch_addr,pfetch_dist);

		  #endif
/*
	dptr = (double *)r00;
	printf("Intermediates:\n");
	for(i = 0; i < 16; i++, dptr += 4) {
		printf("%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",i,*dptr,*(dptr+1),*(dptr+2),*(dptr+3));
	}
	exit(0);
	#if 0
	SSE2, Intermediates:
	 0 Re.[d0,d1] =  13.000000000000, 25.000000000000, Im.[d0,d1] =  10.000000000000, 16.000000000000
	 1 Re.[d0,d1] =  -2.000383348119, -0.002684466313, Im.[d0,d1] =  -0.999232936091, -6.999999485260
	 2 Re.[d0,d1] =   2.998465136951, -6.998463960403, Im.[d0,d1] =  -2.002300382682,  2.005368343957
	 3 Re.[d0,d1] =  -2.003450132394, -2.008052073743, Im.[d0,d1] =  -2.997697043900, -6.997694396666
	 4 Re.[d0,d1] =  13.000000000000, 26.000000000000, Im.[d0,d1] =  13.000000000000, 20.000000000000
	 5 Re.[d0,d1] =  -4.242098031044,  9.903290617327, Im.[d0,d1] =   1.415840490666,  9.895697799992
	 6 Re.[d0,d1] =   3.008436011095,  1.993863489176, Im.[d0,d1] =  10.997695793535, -8.001531627541
	 7 Re.[d0,d1] =  -5.653596441804, -1.409331530533, Im.[d0,d1] =   2.834933380737,  4.244264911271
	 8 Re.[d0,d1] =  26.000000000000, 26.000000000000, Im.[d0,d1] =  12.000000000000, 19.000000000000
	 9 Re.[d0,d1] =   1.622084999221, -1.464427707060, Im.[d0,d1] =  -3.920311244697,  1.689808122478
	10 Re.[d0,d1] = -11.313705171203,  9.195097171880, Im.[d0,d1] =   0.008677504888,  3.528482393280
	11 Re.[d0,d1] =  -4.358901615977,  2.772957578273, Im.[d0,d1] =   7.937252465573,  1.144860807740
	12 Re.[d0,d1] =  20.000000000000, 17.000000000000, Im.[d0,d1] =  16.000000000000, 21.000000000000
	13 Re.[d0,d1] =   1.433541444084,  7.838131936196, Im.[d0,d1] = -11.311275742731, -3.250182725754
	14 Re.[d0,d1] =  -1.415297834511,  9.897322648581, Im.[d0,d1] =  -1.413128458289, -2.836019109578
	15 Re.[d0,d1] =   9.605892403457, -2.470663184755, Im.[d0,d1] =  10.085971997443,  9.689985728963
	#endif
//
	dptr = (double *)a;
	printf("DIT Outputs:\n");
	for(i = 0; i < 16; i++, dptr += 4) {
		printf("%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",i,*dptr,*(dptr+1),*(dptr+2),*(dptr+3));
	}
	exit(0);
*/
		#else

		  #ifdef REFACTOR_4DFT_3TWIDDLE
			/*
			Pass 1:
			y0-3 = radix_4dit_3twid(x0-3; w1    ,w2    ,w3    )
			y4-7 = radix_4dit_3twid(x4-7; w1.E^2,w2.E^4,w3.E^6)
			y8-b = radix_4dit_3twid(x8-b; w1.E^1,w2.E^2,w3.E^3)
			yc-f = radix_4dit_3twid(xc-f; w1.E^3,w2.E^6,w3.E^9)
			*/
		  	RADIX_04_DIT_3TWIDDLE(a[j1   ],a[j2   ],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3], t0 ,t1 ,t2 ,t3 ,t4 ,t5 ,t6 ,t7 , c1,s1, c2,s2, c3,s3, rt,it);
		  	RADIX_04_DIT_3TWIDDLE(a[j1+p4],a[j2+p4],a[j1+p5],a[j2+p5],a[j1+p6],a[j2+p6],a[j1+p7],a[j2+p7], t8, t9 ,t10,t11,t12,t13,t14,t15, c5,s5, c6,s6, c7,s7, rt,it);
		  	RADIX_04_DIT_3TWIDDLE(a[j1+p8],a[j2+p8],a[j1+p9],a[j2+p9],a[j1+pA],a[j2+pA],a[j1+pB],a[j2+pB], t16,t17,t18,t19,t20,t21,t22,t23, c9,s9, cA,sA, cB,sB, rt,it);
		  	RADIX_04_DIT_3TWIDDLE(a[j1+pC],a[j2+pC],a[j1+pD],a[j2+pD],a[j1+pE],a[j2+pE],a[j1+pF],a[j2+pF], t24,t25,t26,t27,t28,t29,t30,t31, cD,sD, cE,sE, cF,sF, rt,it);
/*
printf("DIT Midputs:\n 0 [%16.12f,%16.12f]\n 1 [%16.12f,%16.12f]\n 2 [%16.12f,%16.12f]\n 3 [%16.12f,%16.12f]\n 4 [%16.12f,%16.12f]\n 5 [%16.12f,%16.12f]\n 6 [%16.12f,%16.12f]\n 7 [%16.12f,%16.12f]\n 8 [%16.12f,%16.12f]\n 9 [%16.12f,%16.12f]\n10 [%16.12f,%16.12f]\n11 [%16.12f,%16.12f]\n12 [%16.12f,%16.12f]\n13 [%16.12f,%16.12f]\n14 [%16.12f,%16.12f]\n15 [%16.12f,%16.12f]\n",
t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22,t23,t24,t25,t26,t27,t28,t29,t30,t31);
exit(0);
*/
			/*
			Pass 2:
			z0,4,8,c = radix_4dit_3twid(y0,4,8,c; w4,8,c)
			z2,6,a,e = radix_4dit_3twid(y2,6,a,e; w4,8,c)
			z1,5,9,d = radix_4dit_3twid(y1,5,9,d; w4,8,c)
			z3,7,b,f = radix_4dit_3twid(y3,7,b,f; w4,8,c)
			*/
		  	RADIX_04_DIT_3TWIDDLE(t0 ,t1 ,t8 ,t9 ,t16,t17,t24,t25, a[j1   ],a[j2   ],a[j1+p4],a[j2+p4],a[j1+p8],a[j2+p8],a[j1+pC],a[j2+pC], c4,s4, c8,s8, cC,sC, rt,it);
		  	RADIX_04_DIT_3TWIDDLE(t4 ,t5 ,t12,t13,t20,t21,t28,t29, a[j1+p2],a[j2+p2],a[j1+p6],a[j2+p6],a[j1+pA],a[j2+pA],a[j1+pE],a[j2+pE], c4,s4, c8,s8, cC,sC, rt,it);
		  	RADIX_04_DIT_3TWIDDLE(t2 ,t3 ,t10,t11,t18,t19,t26,t27, a[j1+p1],a[j2+p1],a[j1+p5],a[j2+p5],a[j1+p9],a[j2+p9],a[j1+pD],a[j2+pD], c4,s4, c8,s8, cC,sC, rt,it);
		  	RADIX_04_DIT_3TWIDDLE(t6 ,t7 ,t14,t15,t22,t23,t30,t31, a[j1+p3],a[j2+p3],a[j1+p7],a[j2+p7],a[j1+pB],a[j2+pB],a[j1+pF],a[j2+pF], c4,s4, c8,s8, cC,sC, rt,it);

		  #else

			RADIX_16_DIT_TWIDDLE(
				a[j1   ],a[j2   ],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p4+p1],a[j2+p4+p1],a[j1+p4+p2],a[j2+p4+p2],a[j1+p4+p3],a[j2+p4+p3],a[j1+p8],a[j2+p8],a[j1+p8+p1],a[j2+p8+p1],a[j1+p8+p2],a[j2+p8+p2],a[j1+p8+p3],a[j2+p8+p3],a[j1+pC],a[j2+pC],a[j1+pC+p1],a[j2+pC+p1],a[j1+pC+p2],a[j2+pC+p2],a[j1+pC+p3],a[j2+pC+p3],
				c1,s1,c2,s2,c3,s3,c4,s4,c5,s5,c6,s6,c7,s7,c8,s8,c9,s9,cA,sA,cB,sB,cC,sC,cD,sD,cE,sE,cF,sF,
				c,s);

		  #endif
/*
printf("DIT Outputs:\n 0 [%16.12f,%16.12f]\n 1 [%16.12f,%16.12f]\n 2 [%16.12f,%16.12f]\n 3 [%16.12f,%16.12f]\n 4 [%16.12f,%16.12f]\n 5 [%16.12f,%16.12f]\n 6 [%16.12f,%16.12f]\n 7 [%16.12f,%16.12f]\n 8 [%16.12f,%16.12f]\n 9 [%16.12f,%16.12f]\n10 [%16.12f,%16.12f]\n11 [%16.12f,%16.12f]\n12 [%16.12f,%16.12f]\n13 [%16.12f,%16.12f]\n14 [%16.12f,%16.12f]\n15 [%16.12f,%16.12f]\n",
a[j1   ],a[j2   ],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p4+p1],a[j2+p4+p1],a[j1+p4+p2],a[j2+p4+p2],a[j1+p4+p3],a[j2+p4+p3],a[j1+p8],a[j2+p8],a[j1+p8+p1],a[j2+p8+p1],a[j1+p8+p2],a[j2+p8+p2],a[j1+p8+p3],a[j2+p8+p3],a[j1+pC],a[j2+pC],a[j1+pC+p1],a[j2+pC+p1],a[j1+pC+p2],a[j2+pC+p2],a[j1+pC+p3],a[j2+pC+p3]);
exit(0);
*/
		#endif
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s: Time for %u DIT macro calls =%s [tdiff = %20.10e]\n",func, imax, get_time_str(tdiff), tdiff);
		// Check outputs vs ref-data:
		nerr = 0;
		for(j1 = 0, j2 = 0; j1 < dim; j1 += stride, j2 += 8)	// j2 is base-index into ref-array
		{
			j = j1+RE_IM_STRIDE;
		#ifdef USE_AVX	// Since we set up AVX-512 mode to only use nonzero data in lower 4 double-slots of each 8-vector, can use same code here:
		//	printf("Out[%2u] Re.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f, Im.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f\n",j1/p1,a[j1],a[j1+1],a[j1+2],a[j1+3],a[j],a[j+1],a[j+2],a[j+3]);
		//	printf("Ref[%2u] Re.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f, Im.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f\n",j1/p1,ref2[j2],ref2[j2+2],ref2[j2+4],ref2[j2+6],ref2[j2+1],ref2[j2+3],ref2[j2+5],ref2[j2+7]);
			dtmp = fabs(a[j1  ] - ref2[j2  ]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+1] - ref2[j2+2]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d1\n");*/ nerr++; };
			dtmp = fabs(a[j1+2] - ref2[j2+4]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d2\n");*/ nerr++; };
			dtmp = fabs(a[j1+3] - ref2[j2+6]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d3\n");*/ nerr++; };
			dtmp = fabs(a[j   ] - ref2[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; };
			dtmp = fabs(a[j +1] - ref2[j2+3]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d1\n");*/ nerr++; };
			dtmp = fabs(a[j +2] - ref2[j2+5]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d2\n");*/ nerr++; };
			dtmp = fabs(a[j +3] - ref2[j2+7]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d3\n");*/ nerr++; };
		#elif defined(USE_SSE2)
		//	printf("Out%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",j1/p1,a[j1],a[j1+1],a[j1+2],a[j1+3]);
		//	printf("Ref%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",j1/p1,ref2[j2],ref2[j2+2],ref2[j2+1],ref2[j2+3]);
			dtmp = fabs(a[j1  ] - ref2[j2  ]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+1] - ref2[j2+2]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d1\n");*/ nerr++; };
			dtmp = fabs(a[j1+2] - ref2[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+3] - ref2[j2+3]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d1\n");*/ nerr++; };
		#else
		//	printf("Out%2u Re,Im = %16.12f,%16.12f\n",j1/p1,a[j1],a[j1+1]);
			dtmp = fabs(a[j1  ] - ref2[j2  ]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+1] - ref2[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; };
		#endif
		}
		ASSERT(HERE, nerr == 0, "DIT Outputs mismatch ref-data!");
		printf("\tSummed roundoff error = %20.10e]\n",avg_err);

	#ifdef USE_SSE2
		free((void *)sc_arr);	sc_arr=0x0;
	#endif
		return nerr;
	}
  #endif	// USE_SSE2?

  #ifndef USE_ARM_V8_SIMD
	// Timing loop for radix-32 DIF macro:
	int	test_radix32_dft()
	{
		const char func[] = "test_radix32_dft";
		/*...time-related stuff	*/
		double clock1, clock2;
		double *dptr, tdiff, rt,it, dtmp, avg_err;
		int i,j,j1,j2,k,imax = 1000000, nerr = 0;	// Use 10^6 loop execs
		int p01,p02,p03,p04,p05,p06,p07,p08,p0C,p10,p14,p18,p1C;
		const double c = 0.92387953251128675613, s     = 0.38268343236508977173	/* exp[  i*(twopi/16)]	*/
				,c32_1 = 0.98078528040323044912, s32_1 = 0.19509032201612826784	/* exp(  i*twopi/32), the radix-32 fundamental sincos datum	*/
				,c32_3 = 0.83146961230254523708, s32_3 = 0.55557023301960222473;/* exp(3*i*twopi/32)	*/
		// DIF[ref1], DIT[ref2] ref-outputs: cols 0,1 are re,im outputs for scalar-mode,
		// cols [0,1],[2,3] are [re0,im0],[re1,im1] for SSE2 mode,
		// cols [0,1],[2,3],[4,5],[6,7] are [re0,im0],[re1,im1],[re2,im2],[re3,im3] for AVX/AVX2 mode:
		const double ref1[] = {	// DIF ref-outputs:
			140.024722521050,140.871739019788, 150.122526863264,150.790780470346, 153.197536655879,141.888816564726, 148.200334262373,134.812656830225,
			 -7.022373589968,  3.997069237833,   1.159133789373,-13.938925439525, -13.901657582104,-13.124482281622,  -2.999617826599,  4.019943693420,
			-32.977206808047,-14.181971377571,  -5.941150622203, -3.033534377769, -11.089824908060, 31.985452203788,  -3.669577041080,-34.995027947290,
			  2.947291144157,  5.983799556561,  26.055011959684,  3.038798268224,  -9.000352360218,  3.903863634243,   1.895737733555,  8.877881373357,
			  7.721695233893, 16.796650048533,  -1.649521343162, -0.277518276115,  -2.071817393690, 13.444818786285,  25.317663367727, 37.495197721454,
			-31.977013450316, 25.009310552180, -20.160830489899, -1.920722367161, -11.991767459065, 20.393896379722, -15.580661792276,  4.634000273354,
			 14.178689335366,  2.677210684856,   5.955945647964,-15.836133052721, -11.915297642057, -3.319147296605, -21.843241613928, -8.366802325879,
			-14.001726748089,-28.790061578753,  -3.984933489930, -6.185668330702, -22.209611107602, 29.068526143826, -12.083175812057, -9.916699134160,
			 -6.568428679973,-25.666742139307, -10.520259251446, -4.129936044069, -12.781454011841, -4.869019419839,   3.482712461121, 27.523676200655,
			 -1.057743014962,-14.704293563743,   0.091428302251,-12.571574727013,  -3.287527844215,-14.232470955807,   6.255645869851, -7.339896665050,
			  0.209684129827,  0.547094194837, -10.963984957490, -3.876793343529,  23.218931543837,-18.214148018491,  11.657368908341, 17.991732248441,
			-28.104387550397, -4.400564730738,  25.563818204521, 12.726467336879,  25.580181542195,-14.638111116564, -13.218849203121,  2.040527144615,
			  2.083180593821, 15.885493780831, -23.328762279212,-11.286300882939,   4.860180033335, 12.239324275083,   5.440404097349, -2.575427656550,
			 23.926133714609,  3.108615496842,   1.776500564269,  0.068947971316,  12.647835095707, 29.400844019242,   2.418436230672,-24.083351693267,
			 -0.347455062825,  6.283821567638,  16.843766876447,  4.680856333787, -39.799917123612,  4.754848839740,  21.684071271268, -4.969665935755,
			-21.918629157623, 26.580118490908,  -7.511066523755, -1.759378744471,  -1.946308210422,  5.315482264882,   2.843991214204,-16.363953283434,
			 14.337138897418, -4.962194106347,  17.400826267082, 18.868370862396,  22.650737301208,  5.471644515705,  -3.481341197924, 27.691226024499,
			 21.041509232485,-31.915501765817, -15.142982973770,-20.430421757509,  11.659109059189,-10.116796178117, -11.836298700689, -3.327236349704,
			 37.176413583993, 19.857258124098, -13.987638594313, 17.838096637263,  -5.124495180610, 23.327444083479,   2.804503746718, -0.443969092161,
			  1.406240377551,-26.736449239665, -40.981166734124,  6.201366173317,  -6.432953497570,-15.977458781115,  18.961280110441, 32.499979534635,
			  7.921206529892,-11.357602373281, -34.277534382182,  6.134348984436,  16.337886041884, 12.627179112788,  13.813910679703,  7.030688270899,
			  4.324539783228,-28.533251396752,   4.360438423123,  2.932955337777, -28.430219636280,  5.281214849604,  17.891080818108,  9.968551200424,
			  7.941424352566, -1.696843470649,  15.757691607411,-24.434672430907,   7.914095843634,  9.431160517530,  -1.598367331510, -4.272930898878,
			  2.511150536063, 14.009852378142,  -4.834059506334, -7.550322452784,  37.865137063644,-13.600877868521, -12.482983358963,-12.875845521982,
			 -3.176289269507,-13.256717474827,  -6.547259678267, -1.377382488463, -21.939148433863, 12.042048878725, -14.253417288097, -3.116163694772,
			-13.943561648678,-20.809854886680,  22.715210033873,-17.918211233280,   9.599530649074, 15.040789321374, -22.179669299814,  3.258448604457,
			 17.723097112906, -2.115349487764, -10.249196792606, 10.854902104522,   2.652452128636,  5.342084280612, -15.832686632635, 21.110019793589,
			 12.429420600041,  5.262100716978, -17.218761890202,-23.617070473007,  -0.887637901968, 25.148631396789, -30.012363448624,-40.310433439768,
			 -4.141708826570, 17.834348787749,  11.506602502169, 13.208475215709,   4.843614510496, 26.588636048305,  11.200296759274,  5.815712940036,
			-20.903728146640,-12.233903905367,  11.822574924896,-26.212273886849,  13.968586324442,-16.985128019648, -25.517711247127,-21.063662783561,
			 -4.668605110589,-23.518389239674,  17.853143730060, 10.255539627711,   7.131503684185,-21.262397894420, -32.978064448385, 17.474810119157,
			-31.094680614681, -7.824791900837,  26.314489812508,-29.243065014873,   8.682672815832,  1.643331714301,   9.700588712124, 23.776014448994
		};
		const double ref2[] = {	// DIT ref-outputs
			141.000000000000,140.000000000000, 151.000000000000,150.000000000000, 154.000000000000,141.000000000000, 149.000000000000,134.000000000000,
			 -3.783906043215, -6.736206535300,   4.973271752545, -5.600710881344, -27.516912975494, 10.704249544145,  16.298563940304,-12.532144509101,
			-13.975320751204, 16.069938560493,   3.827475127678,-11.577279846112,  -6.388477515408, 22.564077591341,  -3.986656256409, 36.982179763860,
			  4.156510948913, 41.173817228073, -17.902557040341, 12.221997670440,  17.264655075949, 22.902285126847, -16.713188704199, 17.223442192287,
			 -4.035494871322,-23.135972475146,  -7.957748291754, -5.209624508116, -13.026778182253, 15.312894054796,  -6.331210248048,-11.404515029260,
			 -1.213343589390,  2.270312000675,   2.392595859890,-24.873739661946,  12.007132532990, 23.996630560387, -30.638644917530, -4.507899445454,
			 -4.948713292598,  9.845221290990,  13.661265407876,  5.066750751913,   5.565104645159,-11.248394412931, -12.521767442562,-14.660708821623,
			-11.796906859905, -1.138597253783, -12.919382326987,-22.841118709164, -10.120884281347, -6.529844383861,  -3.726060004042, -9.189821595846,
			-26.975329278755,  8.082797183077,  22.033643988203, 10.932453183120,   9.073588606597, 23.972275440563,  -3.006121794955, -1.990786717330,
			 -9.066596305134, -1.595969993346,  -1.938409248399, -8.058592277841, -26.733887573698, -0.172909814917,   2.941161411616,  3.986398982278,
			 32.576855326640,  4.614723738419, -12.340924255396,-19.211551338795,  35.403183820966, 15.080943005094, -21.406985527751, -9.134686122438,
			 -4.548284998661,-12.297167806507,  -3.624267744685, -8.678742216190, -34.379691370578,  8.355427602184,   3.089654213922, 23.372398473839,
			 -5.905151007980, -1.215478489053,   0.351694332823, 12.776691413054,  21.280023186453, 18.994155125044,   3.489243487555,-20.986842217176,
			  3.602518389431,-14.276719830409, -26.989840512597,-47.671396585122,  -9.847473285215,-22.401510403948, -10.119044829611, 15.796167140639,
			 34.812232991152,-17.729086047369, -25.869666551597, 12.520629440701,  -3.784245396586, -2.029733818180,   2.402928300177, 16.401641652305,
			 18.052147777352,-33.857249786628,   6.801921794274, -6.576455795400,  13.566832313677, 11.095574986667, -14.340101625722,-24.725755028083,
			  2.766779909180,-38.017692314686,  37.011575254755,  1.772934618546,  16.030378227767,  4.901731722027,   3.049030603041,  7.981441748313,
			 -2.730228499662,-22.877430624432,  27.625582010048,  8.102003437602,   1.277400517221,-15.187342092285, -19.625868405112, -1.776741745095,
			  0.525658067551, -0.891291801172,  24.353656294353,  0.820464065546,  11.633979662105, -6.396990493280,  20.190490658467,-30.391285275285,
			 -9.009911878546, -5.397713458813,  11.696339407365,  3.869141878254,   3.252668870016, 13.349874761303, -30.365441147606,  7.032218838475,
			 -3.960443046134,  5.172663690128,   1.790325078625,-20.792517661947, -22.820931786580, 16.882635998245,  16.247377143023, -8.710657249299,
			 20.424243482247, -7.632556998691, -12.288080889385,  9.778300008816,   5.181110844942,  5.760433633129, -34.631930192857, 24.613585612048,
			  7.664220805692,-16.100052979546, -18.783320226262, 16.475099436075,  33.176479850760, 28.370879660324, -25.509413453942,  5.348541054986,
			 14.931180922425, 17.307008785486,  -5.480746581315,-10.068509993107,  10.669638727806, -7.286566226329,   8.364260807000,  7.732785362298,
			-13.165116965253,-17.879588789769,  -6.174617208171,-18.943972717795,   8.981211291404, -2.082749082142,  39.292868290553, 31.639698189163,
			 -9.231165386395, 33.006526880669,   9.123349632636, 16.893393598794, -39.828580130830,-14.182339621860,  30.828764118530, 22.825796980260,
			  5.057098954615, 16.039181372271, -35.856548456771,  6.202346795540,  -4.691311299012, -7.412834751256,   0.892138240784, -9.520577057717,
			-21.518086979503,-23.863875946161,  -4.011715939162,-18.450590208336,  12.326906368091, -3.662946603667,  -2.667379125681,-19.089339157323,
			 13.976462633395,  7.092975107311,   1.677177274040, -2.796344761165,   2.602447851339,-19.120929617852,   6.553117262884, 12.900941352274,
			-13.860571623143,  9.956334468539,  -2.970997394616, -4.569778700177,  13.162621390589, -4.204190717894, -18.569819812753,  0.007457485465,
			-38.057933128419,-27.821062813343,  -4.719711981316,  6.258941911695, -22.560724338994, 16.896478631264,   7.785441577353,  5.052808814922,
			 -6.672156694112, -5.749720392297,   9.001634924866, -5.548569238130,  -4.361598749612,  9.657701141829,   8.205352378866, -2.180603960068
		};

		const int stride = 2*RE_IM_STRIDE, dim = stride<<5, idx[32] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62};
		double cc[32],ss[32];
		static double *a,*a_ptr;	// Dimension = number of scalar-doubles in 16 vector-complex in SIMD build mode
		a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE);	if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
		a     = ALIGN_VEC_DBL(a_ptr);
		ASSERT(HERE, ((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!");
	#ifdef USE_SSE2
		const int pfetch_dist = 0;
		int pfetch_addr = 0;	// Don't care about pfetch in this lcal-mem context, so just set these = 0
		static vec_dbl *sc_arr = 0x0, *sc_ptr;
		double *add0;	/* Addresses into array sections */
		vec_dbl *c_tmp,*s_tmp;
		static vec_dbl *isrt2,*sqrt2, *cc0, *ss0, *cc1, *ss1, *cc3, *ss3, *one,*two, *r00,*r10,*r20,*r30;
		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
		sc_ptr = ALIGN_VEC_DBL(sc_arr);
		ASSERT(HERE, ((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!");
		r00 = sc_ptr;
		r10 = r00 + 0x10;
		r20 = r00 + 0x20;
		r30 = r00 + 0x30;
		isrt2 = r00 + 0x40;
		cc0	  = r00 + 0x41;
		ss0	  = r00 + 0x42;
		cc1	  = r00 + 0x43;
		ss1	  = r00 + 0x44;
		cc3	  = r00 + 0x45;
		ss3	  = r00 + 0x46;
		one   = r00 + 0x87;
		two   = r00 + 0x88;
		sqrt2 = r00 + 0x89;
		/* These remain fixed: */
		VEC_DBL_INIT(isrt2, ISRT2);		VEC_DBL_INIT(sqrt2, SQRT2);
		VEC_DBL_INIT(one  , 1.0  );		VEC_DBL_INIT(two, 2.0  );
		VEC_DBL_INIT(cc0  , c    );		VEC_DBL_INIT(ss0, s    );
		VEC_DBL_INIT(cc1  , c32_1);		VEC_DBL_INIT(ss1, s32_1);
		VEC_DBL_INIT(cc3  , c32_3);		VEC_DBL_INIT(ss3, s32_3);
	#endif	// USE_SSE2 ?

		// Do these timing-test DFTs in-place, i.e. p01 = #doubles in a pair of vec_dbl:
		p01 = RE_IM_STRIDE << 1;
		p02 = p01 +p01;
		p03 = p02 +p01;
		p04 = p03 +p01;
		p05 = p04 +p01;
		p06 = p05 +p01;
		p07 = p06 +p01;
		p08 = p04 +p04;
		p0C = p08 +p04;
		p10 = p0C +p04;
		p14 = p10 +p04;
		p18 = p14 +p04;
		p1C = p18 +p04;

		// Twiddles for the purpose of this timing-test are w1-15, with w := exp(2*Pi*I/2^14):
		cc[0   ] = 1.0                           ; ss[0   ] = 0.0                             ;
		cc[0x01] = 0.9999999264657178511447314807; ss[0x01] = 0.0003834951875713955890724616812;
		cc[0x02] = 0.9999997058628822191602282177; ss[0x02] = 0.0007669903187427045269385683580;
		cc[0x03] = 0.9999993381915255477888066109; ss[0x03] = 0.001150485337113848457071735047;
		cc[0x04] = 0.9999988234517019099290257101; ss[0x04] = 0.001533980186284765612303697150;
		cc[0x05] = 0.9999981616434870076277347923; ss[0x05] = 0.001917474809855419109500620455;
		cc[0x06] = 0.9999973527669781720689399696; ss[0x06] = 0.002300969151425805244235552264;
		cc[0x07] = 0.9999963968222943635594898320; ss[0x07] = 0.002684463154595961785455992532;
		cc[0x08] = 0.9999952938095761715115801256; ss[0x08] = 0.003067956762965976270145365491;
		cc[0x09] = 0.9999940437289858144220774704; ss[0x09] = 0.003451449920135994297977171937;
		cc[0x0A] = 0.9999926465807071398486621178; ss[0x0A] = 0.003834942569706227825960602960;
		cc[0x0B] = 0.9999911023649456243827897550; ss[0x0B] = 0.004218434655276963463076393843;
		cc[0x0C] = 0.9999894110819283736194723572; ss[0x0C] = 0.004601926120448570764901699143;
		cc[0x0D] = 0.9999875727319041221238780943; ss[0x0D] = 0.004985416908821510528222769585;
		cc[0x0E] = 0.9999855873151432333947502950; ss[0x0E] = 0.005368906963996343085634209014;
		cc[0x0F] = 0.9999834548319376998246454755; ss[0x0F] = 0.005752396229573736600123594041;
		cc[0x10] = 0.9999811752826011426569904375; ss[0x10] = 0.006135884649154475359640234589;
		cc[0x11] = 0.9999787486674688119399584425; ss[0x11] = 0.006519372166339468071646855519;
		cc[0x12] = 0.9999761749868975864771644677; ss[0x12] = 0.006902858724729756157652981865;
		cc[0x13] = 0.9999734542412659737751795536; ss[0x13] = 0.007286344267926522047728803984;
		cc[0x14] = 0.9999705864309741099878642477; ss[0x14] = 0.007669828739531097474998305920;
		cc[0x15] = 0.9999675715564437598575211556; ss[0x15] = 0.008053312083144971770110435865;
		cc[0x16] = 0.9999644096181183166528666053; ss[0x16] = 0.008436794242369800155687098562;
		cc[0x17] = 0.9999611006164628021038214358; ss[0x17] = 0.008820275160807412040746750608;
		cc[0x18] = 0.9999576445519638663331209194; ss[0x18] = 0.009203754782059819315102378107;
		cc[0x19] = 0.9999540414251297877847438260; ss[0x19] = 0.009587233049729224643732638124;
		cc[0x1A] = 0.9999502912364904731491606429; ss[0x1A] = 0.009970709907418029761124940991;
		cc[0x1B] = 0.9999463939865974572854009582; ss[0x1B] = 0.01035418529872884376558925796;
		cc[0x1C] = 0.9999423496760239031399400209; ss[0x1C] = 0.01073765916726449141354143107;
		cc[0x1D] = 0.9999381583053646016624044894; ss[0x1D] = 0.01112113145662802141375476658;
		cc[0x1E] = 0.9999338198752359717180973806; ss[0x1E] = 0.01150460211042271472157869220;
		cc[0x1F] = 0.9999293343862760599973422319; ss[0x1F] = 0.01188807107225209283312325799;

	#ifdef USE_SSE2
		/* Sincos data stored in BRed form in SSE2 local-data layout:
		DIF: (cc0,ss0) + 0x[06,26,16,36|0e,2e,1e,3e|0a,2a,1a,3a|12,32,22,42|08,28,18,38|10,30,20,40|0c,2c,1c,3c|14,34,24,44]
			= "  " + 6 + 0x[00,20,10,30|08,28,18,38|04,24,14,34|0c,2c,1c,3c|02,22,12,32|0a,2a,1a,3a|06,26,16,36|0e,2e,1e,3e]
			= "  " + 6 +   [ 0,16, 8,24| 4,20,12,28| 2,18,10,26| 6,22,14,30| 1,17, 9,25| 5,21,13,29| 3,19,11,27| 7,23,15,31], straight bit-reversal.
		These are the address-offsets of roots 0-31 ... flipping this around and asking what is the roots-order in a linear walk thru memory
		(i.e. the index-offsets of roots 0,1,2,3,... in the last [] sequence above) we get the very same thing, i.e. the index-permutation
		is its own inverse; this is a property of bit-reversal reordering.
		*/
		c_tmp = cc0 + 0x06; s_tmp = c_tmp+1;	/* c0,s0 */
		for(i = 0; i < 32; i++, c_tmp+=2, s_tmp+=2) {
			j = reverse(i,5);
			VEC_DBL_INIT(c_tmp, cc[j]);	VEC_DBL_INIT(s_tmp, ss[j]);
		}
	#endif

		//******************* Timing loop for Radix-32 DIF transform macro: *******************
		clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			// Copy digits of Pi-data into our vec_dbl inputs:
			for(j1 = 0, j2 = 0; j1 < dim; j1 += stride, j2 += 8)	// j2 is base-index into ran[] input array
			{
			#ifdef USE_AVX512	// Set this up so that AVX-512 can use the same ref-data as AVX:
				a[j1   ] = ran[j2  ];	/* Re0 */	a[j1+ 4] = 0;	/* Re4 */
				a[j1+ 1] = ran[j2+2];	/* Re1 */	a[j1+ 5] = 0;	/* Re5 */
				a[j1+ 2] = ran[j2+4];	/* Re2 */	a[j1+ 6] = 0;	/* Re6 */
				a[j1+ 3] = ran[j2+6];	/* Re3 */	a[j1+ 7] = 0;	/* Re7 */
				a[j1+ 8] = ran[j2+1];	/* Im0 */	a[j1+12] = 0;	/* Im4 */
				a[j1+ 9] = ran[j2+3];	/* Im1 */	a[j1+13] = 0;	/* Im5 */
				a[j1+10] = ran[j2+5];	/* Im2 */	a[j1+14] = 0;	/* Im6 */
				a[j1+11] = ran[j2+7];	/* Im3 */	a[j1+15] = 0;	/* Im7 */
			#elif defined(USE_AVX)
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+2];	// Re1
				a[j1+2] = ran[j2+4];	// Re2
				a[j1+3] = ran[j2+6];	// Re3
				a[j1+4] = ran[j2+1];	// Im0
				a[j1+5] = ran[j2+3];	// Im1
				a[j1+6] = ran[j2+5];	// Im2
				a[j1+7] = ran[j2+7];	// Im3
			#elif defined(USE_SSE2)
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+2];	// Re1
				a[j1+2] = ran[j2+1];	// Im0
				a[j1+3] = ran[j2+3];	// Im1
				a[j1+4] = ran[j2+4];	// Re2
				a[j1+5] = ran[j2+6];	// Re3
				a[j1+6] = ran[j2+5];	// Im2
				a[j1+7] = ran[j2+7];	// Im3
			#else
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+1];	// Im0
				a[j1+2] = ran[j2+2];	// Re1
				a[j1+3] = ran[j2+3];	// Im1
				a[j1+4] = ran[j2+4];	// Re2
				a[j1+5] = ran[j2+5];	// Im2
				a[j1+6] = ran[j2+6];	// Re3
				a[j1+7] = ran[j2+7];	// Im3
			#endif
			}
			j1 = 0; j2 = RE_IM_STRIDE;
		#ifdef USE_SSE2
			SSE2_RADIX32_DIF_TWIDDLE(a,p01,p02,p03,p04,p08,p0C,p10,p18,r00)
		#else
			RADIX_32_DIF_TWIDDLE_OOP( a,idx, a,idx,	// This DFT is in-place
									 cc[0x10],ss[0x10], cc[0x08],ss[0x08], cc[0x18],ss[0x18]
				, cc[0x04],ss[0x04], cc[0x14],ss[0x14], cc[0x0C],ss[0x0C], cc[0x1C],ss[0x1C]
				, cc[0x02],ss[0x02], cc[0x12],ss[0x12], cc[0x0A],ss[0x0A], cc[0x1A],ss[0x1A]
				, cc[0x06],ss[0x06], cc[0x16],ss[0x16], cc[0x0E],ss[0x0E], cc[0x1E],ss[0x1E]
				, cc[0x01],ss[0x01], cc[0x11],ss[0x11], cc[0x09],ss[0x09], cc[0x19],ss[0x19]
				, cc[0x05],ss[0x05], cc[0x15],ss[0x15], cc[0x0D],ss[0x0D], cc[0x1D],ss[0x1D]
				, cc[0x03],ss[0x03], cc[0x13],ss[0x13], cc[0x0B],ss[0x0B], cc[0x1B],ss[0x1B]
				, cc[0x07],ss[0x07], cc[0x17],ss[0x17], cc[0x0F],ss[0x0F], cc[0x1F],ss[0x1F]
			);
		#endif
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s: Time for %u DIF macro calls =%s [tdiff = %20.10e]\n",func, imax, get_time_str(tdiff), tdiff);

		// Check outputs vs ref-data:
		nerr = 0;	dtmp = avg_err = 0.0;
		for(j1 = 0, j2 = 0; j1 < dim; j1 += stride, j2 += 8)	// j2 is base-index into ref-array
		{
			j = j1+RE_IM_STRIDE;
		#ifdef USE_AVX	// Since we set up AVX-512 mode to only use nonzero data in lower 4 double-slots of each 8-vector, can use same code here:
		//	printf("Out[%2u] Re.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f, Im.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f\n",j1/p01,a[j1],a[j1+1],a[j1+2],a[j1+3],a[j],a[j+1],a[j+2],a[j+3]);
		//	printf("Ref[%2u] Re.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f, Im.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f\n",j1/p01,ref1[j2],ref1[j2+2],ref1[j2+4],ref1[j2+6],ref1[j2+1],ref1[j2+3],ref1[j2+5],ref1[j2+7]);
			dtmp = fabs(a[j1  ] - ref1[j2  ]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+1] - ref1[j2+2]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d1\n");*/ nerr++; };
			dtmp = fabs(a[j1+2] - ref1[j2+4]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d2\n");*/ nerr++; };
			dtmp = fabs(a[j1+3] - ref1[j2+6]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d3\n");*/ nerr++; };
			dtmp = fabs(a[j   ] - ref1[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; };
			dtmp = fabs(a[j +1] - ref1[j2+3]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d1\n");*/ nerr++; };
			dtmp = fabs(a[j +2] - ref1[j2+5]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d2\n");*/ nerr++; };
			dtmp = fabs(a[j +3] - ref1[j2+7]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d3\n");*/ nerr++; };
		#elif defined(USE_SSE2)
		//	printf("Out%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",j1/p01,a[j1],a[j1+1],a[j1+2],a[j1+3]);
		//	printf("Ref%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",j1/p01,ref1[j2],ref1[j2+2],ref1[j2+1],ref1[j2+3]);
			nerr += (fabs(a[j1  ] - ref1[j2  ]) > 1e-10);
			nerr += (fabs(a[j1+1] - ref1[j2+2]) > 1e-10);
			nerr += (fabs(a[j1+2] - ref1[j2+1]) > 1e-10);
			nerr += (fabs(a[j1+3] - ref1[j2+3]) > 1e-10);
		#else
		//	printf("Out%2u Re,Im = %16.12f,%16.12f\n",j1/p01,a[j1],a[j1+1]);
			nerr += (fabs(a[j1  ] - ref1[j2  ]) > 1e-10);
			nerr += (fabs(a[j1+1] - ref1[j2+1]) > 1e-10);
		#endif
		}
		ASSERT(HERE, nerr == 0, "DIF Outputs mismatch ref-data!");
		printf("\tSummed roundoff error = %20.10e]\n",avg_err);
	#if 0
		10^6-timing:	setup	+=DIF	DIF-only
		sse2:			.386	.724	.338 [676 cycles]
		avx2:			.058	.530	.472 [614 cycles]
		avx512:			.100	.611	.511 [664 cycles]	Optimized address-comp and using zmm28-31 for consts ==> 585 cycles; still crappy vs
															tangent+FMA - optimized radix-16 @214 cycles, implies target of 535 cycles for radix-32.
	#endif

		//******************* Timing loop for Radix-32 DIT transform macro: *******************
		/* Sincos data in SSE2 local-data layout are a bit funky for DIT:
		DIT: (cc0,ss0) + 0x[06,26,16,36|0e,2e,1e,3e|08,28,18,38|10,30,20,40|0a,2a,1a,3a|12,32,22,42|0c,2c,1c,3c|14,34,24,44].
			= "  " + 6 + 0x[00,20,10,30|08,28,18,38|02,22,12,32|0a,2a,1a,3a|04,24,14,34|0c,2c,1c,3c|06,26,16,36|0e,2e,1e,3e]
			= "  " + 6 +   [ 0,16, 8,24| 4,20,12,28| 1,17, 9,25| 5,21,13,29| 2,18,10,26| 6,22,14,30| 3,19,11,27| 7,23,15,31], swap quartets 2,3 <-> 4,5 relative to BR!
							 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
			These are the address-offsets of roots 0-31 ... flipping this around and asking what is the roots-order in a linear walk thru memory
			(i.e. the index-offsets of roots 0,1,2,3,... in the last [] sequence above) we get
							0,8,16,24, 4,12,20,28, 2,10,18,26, 6,14,22,30, 1,9,17,25, 5,13,21,29, 3,11,19,27, 7,15,23,31,
			i.e. the index-permutation is *not* its own inverse, due to our no longer having a strict bit-reversal reordering. It is this latter perm that appears in dit_funky_br[] below:
		*/
	#ifdef USE_SSE2
		// This 'funky' perm is just the BR-perm with the middle 2 terms of each quartet swapped, i.e. quartet elements appearing in-order:
		const int dit_funky_br[32] = {0,8,16,24, 4,12,20,28, 2,10,18,26, 6,14,22,30, 1,9,17,25, 5,13,21,29, 3,11,19,27, 7,15,23,31};
		c_tmp = cc0 + 0x06; s_tmp = c_tmp+1;	/* c0,s0 */
		for(i = 0; i < 32; i++, c_tmp+=2, s_tmp+=2) {
			j = dit_funky_br[i];
			VEC_DBL_INIT(c_tmp, cc[j]);	VEC_DBL_INIT(s_tmp, ss[j]);
		}
	#endif
		clock1 = getRealTime();
		for(i = 0; i < imax; i++) {
			// Copy digits of Pi-data into our vec_dbl inputs:
			for(j1 = 0, j2 = 0; j1 < dim; j1 += stride, j2 += 8)	// j2 is base-index into ran[] input array
			{
			#ifdef USE_AVX512	// Set this up so that AVX-512 can use the same ref-data as AVX:
				a[j1   ] = ran[j2  ];	/* Re0 */	a[j1+ 4] = 0;	/* Re4 */
				a[j1+ 1] = ran[j2+2];	/* Re1 */	a[j1+ 5] = 0;	/* Re5 */
				a[j1+ 2] = ran[j2+4];	/* Re2 */	a[j1+ 6] = 0;	/* Re6 */
				a[j1+ 3] = ran[j2+6];	/* Re3 */	a[j1+ 7] = 0;	/* Re7 */
				a[j1+ 8] = ran[j2+1];	/* Im0 */	a[j1+12] = 0;	/* Im4 */
				a[j1+ 9] = ran[j2+3];	/* Im1 */	a[j1+13] = 0;	/* Im5 */
				a[j1+10] = ran[j2+5];	/* Im2 */	a[j1+14] = 0;	/* Im6 */
				a[j1+11] = ran[j2+7];	/* Im3 */	a[j1+15] = 0;	/* Im7 */
			#elif defined(USE_AVX)
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+2];	// Re1
				a[j1+2] = ran[j2+4];	// Re2
				a[j1+3] = ran[j2+6];	// Re3
				a[j1+4] = ran[j2+1];	// Im0
				a[j1+5] = ran[j2+3];	// Im1
				a[j1+6] = ran[j2+5];	// Im2
				a[j1+7] = ran[j2+7];	// Im3
			#elif defined(USE_SSE2)
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+2];	// Re1
				a[j1+2] = ran[j2+1];	// Im0
				a[j1+3] = ran[j2+3];	// Im1
				a[j1+4] = ran[j2+4];	// Re2
				a[j1+5] = ran[j2+6];	// Re3
				a[j1+6] = ran[j2+5];	// Im2
				a[j1+7] = ran[j2+7];	// Im3
			#else
				a[j1  ] = ran[j2  ];	// Re0
				a[j1+1] = ran[j2+1];	// Im0
				a[j1+2] = ran[j2+2];	// Re1
				a[j1+3] = ran[j2+3];	// Im1
				a[j1+4] = ran[j2+4];	// Re2
				a[j1+5] = ran[j2+5];	// Im2
				a[j1+6] = ran[j2+6];	// Re3
				a[j1+7] = ran[j2+7];	// Im3
			#endif
			}
			j1 = 0; j2 = RE_IM_STRIDE;
		#ifdef USE_SSE2
			SSE2_RADIX32_DIT_TWIDDLE(a,p01,p02,p03,p04,p05,p06,p07,p08,p10,r00,isrt2)
		#else
			RADIX_32_DIT_TWIDDLE( a,idx, a,idx,	// This DFT is in-place
									 cc[0x10],ss[0x10], cc[0x08],ss[0x08], cc[0x18],ss[0x18]
				, cc[0x04],ss[0x04], cc[0x14],ss[0x14], cc[0x0C],ss[0x0C], cc[0x1C],ss[0x1C]
				, cc[0x02],ss[0x02], cc[0x12],ss[0x12], cc[0x0A],ss[0x0A], cc[0x1A],ss[0x1A]
				, cc[0x06],ss[0x06], cc[0x16],ss[0x16], cc[0x0E],ss[0x0E], cc[0x1E],ss[0x1E]
				, cc[0x01],ss[0x01], cc[0x11],ss[0x11], cc[0x09],ss[0x09], cc[0x19],ss[0x19]
				, cc[0x05],ss[0x05], cc[0x15],ss[0x15], cc[0x0D],ss[0x0D], cc[0x1D],ss[0x1D]
				, cc[0x03],ss[0x03], cc[0x13],ss[0x13], cc[0x0B],ss[0x0B], cc[0x1B],ss[0x1B]
				, cc[0x07],ss[0x07], cc[0x17],ss[0x17], cc[0x0F],ss[0x0F], cc[0x1F],ss[0x1F]
			);
		#endif
		}
		clock2 = getRealTime();
		tdiff = (double)(clock2 - clock1);
		printf("%s: Time for %u DIT macro calls =%s [tdiff = %20.10e]\n",func, imax, get_time_str(tdiff), tdiff);
		// Check outputs vs ref-data:
		nerr = 0;	dtmp = avg_err = 0.0;
		for(j1 = 0, j2 = 0; j1 < dim; j1 += stride, j2 += 8)	// j2 is base-index into ref-array
		{
			j = j1+RE_IM_STRIDE;
		#ifdef USE_AVX	// Since we set up AVX-512 mode to only use nonzero data in lower 4 double-slots of each 8-vector, can use same code here:
		//	printf("Out[%2u] Re.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f, Im.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f\n",j1/p01,a[j1],a[j1+1],a[j1+2],a[j1+3],a[j],a[j+1],a[j+2],a[j+3]);
		//	printf("Ref[%2u] Re.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f, Im.[d0-3] = %16.12f,%16.12f,%16.12f,%16.12f\n",j1/p01,ref2[j2],ref2[j2+2],ref2[j2+4],ref2[j2+6],ref2[j2+1],ref2[j2+3],ref2[j2+5],ref2[j2+7]);
			dtmp = fabs(a[j1  ] - ref2[j2  ]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d0\n");*/ nerr++; };
			dtmp = fabs(a[j1+1] - ref2[j2+2]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d1\n");*/ nerr++; };
			dtmp = fabs(a[j1+2] - ref2[j2+4]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d2\n");*/ nerr++; };
			dtmp = fabs(a[j1+3] - ref2[j2+6]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Re.d3\n");*/ nerr++; };
			dtmp = fabs(a[j   ] - ref2[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; };
			dtmp = fabs(a[j +1] - ref2[j2+3]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d1\n");*/ nerr++; };
			dtmp = fabs(a[j +2] - ref2[j2+5]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d2\n");*/ nerr++; };
			dtmp = fabs(a[j +3] - ref2[j2+7]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d3\n");*/ nerr++; };
		#elif defined(USE_SSE2)
		//	printf("Out%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",j1/p01,a[j1],a[j1+1],a[j1+2],a[j1+3]);
		//	printf("Ref%2u Re.[d0,d1] = %16.12f,%16.12f, Im.[d0,d1] = %16.12f,%16.12f\n",j1/p01,ref1[j2],ref1[j2+2],ref1[j2+1],ref1[j2+3]);
			nerr += (fabs(a[j1  ] - ref2[j2  ]) > 1e-10);
			nerr += (fabs(a[j1+1] - ref2[j2+2]) > 1e-10);
			nerr += (fabs(a[j1+2] - ref2[j2+1]) > 1e-10);
			nerr += (fabs(a[j1+3] - ref2[j2+3]) > 1e-10);
		#else
		//	printf("Out%2u Re,Im = %16.12f,%16.12f\n",j1/p01,a[j1],a[j1+1]);
			nerr += (fabs(a[j1  ] - ref2[j2  ]) > 1e-10);
			nerr += (fabs(a[j1+1] - ref2[j2+1]) > 1e-10);
		#endif
		}
		ASSERT(HERE, nerr == 0, "DIT Outputs mismatch ref-data!");
		printf("\tSummed roundoff error = %20.10e]\n",avg_err);
	#if 0
		10^6-timing:	setup	+=DIF	DIF-only
		sse2:			.386	.736	.350 [700 cycles]
		avx2:			.052	.536	.484 [629 cycles]
		avx512:			.106	.630	.524 [681 cycles]	Optimized address-comp and using zmm28-31 for consts ==> 540 cycles; can we get under 500?
	#endif

	#ifdef USE_SSE2
		free((void *)sc_arr);	sc_arr=0x0;
	#endif
		return nerr;
	}
  #endif	// ifndef(USE_ARM_V8_SIMD?)

#endif	// TEST_SIMD ?

/********** Testcode and utils for multithreading support *************/

#ifdef MULTITHREAD

  #if 0	// This works on MacOS, but is non-portable:

	/*
	Returns the CPU count reported by sysctl(), trying the HW_AVAILCPU query first and falling back to
	HW_NCPU; if neither query succeeds with a value >= 1, returns 1.
	*/
	int get_num_cores(void)
	{
		/* get the number of CPUs from the system; 'man sysctl' for details */
		int numCPU = 0;	// Under OS X, this needs to be an int (size_t gave garbage results). Zero-init so a failed query can never leave garbage here.
		int mib[4];
		size_t len = sizeof(numCPU);
		/* set the mib for hw.ncpu */
		mib[0] = CTL_HW;
		mib[1] = HW_AVAILCPU;  // alternatively, try HW_NCPU;

		// sysctl() returns nonzero on failure, in which case numCPU must not be trusted:
		if( sysctl(mib, 2, &numCPU, &len, NULL, 0) != 0 || numCPU < 1 )
		{
			mib[1] = HW_NCPU;
			len = sizeof(numCPU);	// len is a value-result arg, so restore the buffer size before re-querying
			if( sysctl(mib, 2, &numCPU, &len, NULL, 0) != 0 || numCPU < 1 )
			{
				numCPU = 1;
			}
		}
		return numCPU;
	}

  #else	// This is alleged to be Win/Linux portable: http://stackoverflow.com/questions/4586405/get-number-of-cpus-in-linux-using-c

	#ifdef OS_TYPE_WINDOWS	// NB: Currently only support || builds under Linux/GCC, but add Win stuff for possible future use

		#include <windows.h>

		#ifndef _SC_NPROCESSORS_ONLN
			// Windows has no sysconf(), so emulate the processor-count queries made by get_num_cores().
			// The GetSystemInfo() call has to live inside a function: a call statement at file scope is not valid C.
			static long win_sysconf_nprocs(void)
			{
				SYSTEM_INFO info;
				GetSystemInfo(&info);
				return (long)info.dwNumberOfProcessors;
			}
			// The selector argument is ignored; both queries map to the same processor count:
			#define sysconf(a) win_sysconf_nprocs()
			#define _SC_NPROCESSORS_ONLN 0
			#ifndef _SC_NPROCESSORS_CONF
				#define _SC_NPROCESSORS_CONF 0
			#endif
		#endif

	#endif

	/*
	Returns the number of processors currently online, as reported by sysconf(_SC_NPROCESSORS_ONLN).
	Prints a diagnostic to stderr and exits with EXIT_FAILURE if either the online or the configured
	processor count cannot be obtained, or if the platform offers no _SC_NPROCESSORS_ONLN query at all.
	*/
	int get_num_cores(void)
	{
		long nprocs = -1;
		long nprocs_max = -1;

	#ifdef _SC_NPROCESSORS_ONLN

		// sysconf() may return -1 without touching errno, so clear errno first; otherwise the
		// strerror() text below could describe some unrelated earlier failure:
		errno = 0;
		nprocs = sysconf(_SC_NPROCESSORS_ONLN);
		if(nprocs < 1) {
			fprintf(stderr, "Could not determine number of CPUs online:\n%s\n", strerror (errno));
			exit (EXIT_FAILURE);
		}
		// The configured count is queried purely as a sanity check; its value is not otherwise used:
		errno = 0;
		nprocs_max = sysconf(_SC_NPROCESSORS_CONF);
		if (nprocs_max < 1) {
			fprintf(stderr, "Could not determine number of CPUs configured:\n%s\n", strerror (errno));
			exit (EXIT_FAILURE);
		}
	//	printf ("%ld of %ld processors online\n",nprocs, nprocs_max);
	//	exit (EXIT_SUCCESS);

	#else

		fprintf(stderr, "Could not determine number of CPUs");
		exit (EXIT_FAILURE);

	#endif

		return (int)nprocs;	// make the long -> int narrowing explicit
	}

  #endif

	// Simple struct to pass multiple args to the loop/join-test thread function:
	// test_pthreads() fills in one of these per worker thread and hands its address to do_loop().
	struct do_loop_test_thread_data{
		int tid;	// Worker index, set by test_pthreads() to the thread's position in [0,nthreads)
		int ibeg;	// Start of this worker's index subrange
		int iend;	// End of this worker's subrange; chunks abut (next worker's ibeg equals this iend), so presumably an exclusive bound - confirm against do_loop()
		int *retval;	// Points at this worker's slot in the caller's retval[] array, which the caller sums after joining
	};

	/*
	Self-test of the pthreads setup. Launches two small helper threads (ex_loop and PrintHello), then splits
	a 10-unit test loop into [nthreads] contiguous index chunks and runs do_loop() on each chunk in its own
	joinable thread, with at most [ncpu] worker threads in flight at a time, summing the per-thread results
	which the workers deposit in retval[].

	nthreads: number of worker threads to create; must be > 0.
	verbose : nonzero enables the too-many-threads warning and the per-thread scheduling/join messages.

	Returns 0 on success. A failed pthread_create/pthread_join prints the return code and calls exit(-1);
	a nonpositive nthreads or an incorrect summed result trips an ASSERT.
	*/
	int test_pthreads(int nthreads, int verbose)
	{
		// Check this first and in all modes (not just verbose): nthreads dimensions the arrays declared just
		// below and is the divisor in the work-splitting arithmetic, so a value < 1 must never get that far:
		ASSERT(HERE, nthreads > 0,"Mlucas.c: nthreads > 0");

		int i,ioffset,tid,nteam,retval[nthreads];
		pthread_t thread[nthreads];
		pthread_attr_t attr;
		int rc;
		void *status;
		int ibig,iinc,isum;	/* ibig = #bigwords in loop-divided-by-threads sequence */
		struct do_loop_test_thread_data tdat[nthreads];
		pthread_t pth = pthread_self();
		pthread_t  p_thread_a, p_thread_b;	/* separate handles for the 2 helper threads, so both can be joined */
		int        a = 1;  /* thread 1 identifying number            */
		int        b = 2;  /* thread 2 identifying number            */
		int ncpu = get_num_cores();
		printf("Mlucas running as system-created pthread %u, threading self-test will use %d user-created pthreads.\n", (unsigned int)(unsigned long)pth, nthreads);
		if(verbose && nthreads > ncpu) {
			printf("WARN: Test using more threads[%d] than there are available CPUs[%d].\n", nthreads, ncpu);
		}
		/* create a pair of threads, each of which will execute a simple timing loop().
		Uncomment the prints in the thread-called function to 'see' the threads executing: */
		rc = pthread_create(&p_thread_a, NULL, ex_loop, (void*)&a);
		if (rc){
			printf("ERROR; return code from pthread_create() is %d\n", rc);
			exit(-1);
		}
		/* Thread which prints a hello message - Note the stdout prints resulting from this
		and the surrounding thread-tests may appear in any order, depending on system scheduling of the respective threads: */
		rc = pthread_create(&p_thread_b, NULL, PrintHello, (void *)&b);
		if (rc){
			printf("ERROR; return code from pthread_create() is %d\n", rc);
			exit(-1);
		}

		/* Initialize and set thread detached attribute */
		pthread_attr_init(&attr);
		pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

		iinc = 10/nthreads;	/* base loop increment; the first [ibig] chunks get one added to this */
		ibig = 10%nthreads;	/* This many of the work chunks will have an extra unit */
		isum = 0;
		/* Populate the thead-specific data structs: */
		for(i = 0; i < nthreads; ++i) {
			tdat[i].tid = i;
			tdat[i].ibeg = isum;
			isum += iinc + (i < ibig);	/* loop increment for current work chunk */
			tdat[i].iend = isum;
			tdat[i].retval = &retval[i];
			if(verbose) printf("INFO: Scheduling thread %d with ibeg = %d, iend = %d\n", i, tdat[i].ibeg, tdat[i].iend);
		}
		/* create nthreads new threads each of which will execute 'do_loop()' over some specified index subrange.
		In order to match the threads executing at any given time to the available CPUs, divide the thread execution
		into 'work shifts', each with [ncpu] threads starting and completing their work before the next shift
		comes online. If ncpu does not divide nthreads the final shift is a short one, with (nthreads % ncpu) threads:
		*/
		isum = 0;
		for(ioffset = 0; ioffset < nthreads; ioffset += ncpu) {
			nteam = nthreads - ioffset;	// #threads in the current shift = min(ncpu, #threads still to be run)
			if(nteam > ncpu) nteam = ncpu;
			for(i = 0; i < nteam; ++i) {
				tid = i+ioffset;
				rc = pthread_create(&thread[tid], &attr, do_loop, (void*)(&tdat[tid]));
				if (rc) {
					printf("ERROR; return code from pthread_create() is %d\n", rc);
					exit(-1);
				}
			}
			/* As each thread finishes, add its result into an accumulator in non-blocking fashion (i.e. no mutexes needed).
			Join exactly the [nteam] threads created just above - never more, since the thread[] and retval[] slots beyond
			those hold no live thread - and do so even for a 1-thread shift: the join is what guarantees the worker has
			written its retval[] slot, and that no worker is still using tdat[]/retval[] once this function returns: */
			for(i = 0; i < nteam; ++i) {
				tid = i+ioffset;
				rc = pthread_join(thread[tid], &status);
				if (rc) {
					printf("ERROR; return code from pthread_join() is %d\n", rc);
					exit(-1);
				}
				if(verbose) printf("Main: completed join with thread %d having a status of %d\n",tid,(int)(long)status);
				isum += retval[tid];
			}
		}
		/* Free attribute and wait for the other threads */
		pthread_attr_destroy(&attr);

		// The 2 helper threads were handed pointers to the locals a and b, so they must be done before we return:
		rc = pthread_join(p_thread_a, NULL);
		if (rc) {
			printf("ERROR; return code from pthread_join() is %d\n", rc);
			exit(-1);
		}
		rc = pthread_join(p_thread_b, NULL);
		if (rc) {
			printf("ERROR; return code from pthread_join() is %d\n", rc);
			exit(-1);
		}

		// 10 sequential iters of test loop yield successive values -1452071552,1390824192,-61247360,-1513318912,1329576832,
		// -122494720,-1574566272,1268329472,-1837420,-1635813632:
		ASSERT(HERE, isum == -1635813632, "retval error!");
		return 0;
	}

	// Small timing-delay test function for the pthread code: burns a fixed number of
	// empty loop iterations (10 outer passes of 500000 spins each), then terminates
	// the calling thread. The data argument is accepted for pthread_create()
	// signature compatibility and is not read.
	void* ex_loop(void* data)
	{
		int pass = 0;	/* outer pass counter */
		int spin;		/* inner busy-wait counter */
		while (pass < 10) {
			spin = 0;
			while (spin < 500000) {	/* delay loop */
				++spin;
			}
			++pass;
		}
		/* terminate the thread */
		pthread_exit(NULL);
	}

	// Hello-world style pthread test function: reads the integer thread id that the
	// argument points to (the value is currently unused, since the greeting printout
	// is disabled) and then terminates the calling thread.
	void *PrintHello(void *threadid)
	{
		const int *id_ptr = (const int*)threadid;
		int tid = *id_ptr;	/* argument must point to a live int */
		(void)tid;
		pthread_exit(NULL);
	}

	/*
	Worker function for the pthread test: for each i in [thread_arg->ibeg, thread_arg->iend)
	runs a 10^8-iteration delay loop which accumulates the sum of j*j, then stores the
	accumulated value through thread_arg->retval and terminates the calling thread.

	The accumulation is done in unsigned arithmetic: j*j exceeds INT_MAX long before the loop
	ends, and signed-int overflow is undefined behavior in C, whereas unsigned arithmetic is
	defined to wrap modulo 2^N. The final conversion back to int reproduces the wrapped
	two's-complement value which the caller's checksum is based on.
	*/
	void*
	do_loop(void*targ)	// Thread-arg pointer *must* be cast to void and specialized inside the function
	{
		struct do_loop_test_thread_data* thread_arg = targ;
		int i;                      /* counter, to print numbers */
		int j;                      /* counter, for delay        */
		unsigned int k = 0;	/* accumulator to keep gcc from optimizing away delay-multiply inside test loop */
		ASSERT(HERE, thread_arg != 0x0, "do_loop test function for pthread-test needs live thread_arg pointer!");

	  #if 0	// BSD thread affinity API barfs in my Mac builds
		cpuset_t *cset;
		pthread_t pth;
		cpuid_t ci;

		cset = cpuset_create();
		if (cset == NULL) {
			ASSERT(HERE, 0, "cpuset_create");
		}
		ci = 0;
		cpuset_set(ci, cset);

		pth = pthread_self();
		error = pthread_setaffinity_np(pth, cpuset_size(cset), cset);
		if (error) {
			ASSERT(HERE, 0, "pthread_setaffinity_np");
		}
		cpuset_destroy(cset);
	  #endif

	//	int me = thread_arg->tid;     /* thread identifying number */
		for (i = thread_arg->ibeg; i < thread_arg->iend; i++)
		{
			for (j=0; j<100000000; j++) {	/* delay loop */
				k += (unsigned int)j*(unsigned int)j;	/* wraps mod 2^N, no signed overflow */
			}
		//	printf("Thread '%d': i = %d, accum = %d\n", me, i, (int)k);
		}
		/* Hand the wrapped accumulator back to the caller as a signed int: */
		*(thread_arg->retval) = (int)k;
		pthread_exit(NULL);
	}

	/********* Thread-affinity utilities: *********/

	// Parse a single core-affinity-triplet substring of the form lo[:hi[:incr]] and set the corresponding
	// bits in the global CORE_SET bitmap:
	//	"lo"         selects core lo only;
	//	"lo:hi"      selects cores lo,lo+1,...,hi (hi inclusive);
	//	"lo:hi:incr" selects cores lo,lo+incr,... not exceeding hi.
	// A zero-length input string is a no-op. Malformed input, a core index outside the CORE_SET bitmap, or
	// a core whose bit is already set all trigger an ASSERT.
	// Returns: #cores specified in the substring.
	uint32 parseAffinityTriplet(char*istr)
	{
		int ncpu = 0, lo = -1,hi = lo,incr = 1, i,bit,word;
		char *char_addr = istr, *endp;
		ASSERT(HERE, char_addr != 0x0, "Null input-string pointer!");
		size_t len = strlen(istr);
		if(len == 0) return 0;	// Allow 0-length input, resulting in no-op
		ASSERT(HERE, len <= STR_MAX_LEN, "Excessive input-substring length!");
		lo = strtoul(char_addr, &endp, 10);	ASSERT(HERE, lo >= 0, "lo-substring not a valid nonnegative number!");
		if(*endp) {
			ASSERT(HERE, *endp == ':', "Non-colon separator in core-affinity-triplet substring!");
			char_addr = endp+1;
			hi = strtoul(char_addr, &endp, 10);
			ASSERT(HERE, hi >= lo, "hi-substring not a valid number >= lo!");
			if(*endp) {
				ASSERT(HERE, *endp == ':', "Non-colon separator in core-affinity-triplet substring!");
				char_addr = endp+1;
				incr = strtoul(char_addr, &endp, 10);
				ASSERT(HERE, incr > 0, "incr-substring not a valid positive number!");
				ASSERT(HERE, *endp == 0x0, "Non-numeric increment substring in core-affinity-triplet substring!");
			} else {
				// If increment (third) argument of triplet omitted, default to incr = 1.
			}
		} else {
			hi = lo;	// If only 'lo' arg of triplet supplied, take hi = lo and incr=1, i.e. add just CPUid = lo to bitmap
		}
		// CPU set encoded by integer-triplet argument corresponds to values of integer loop
		// index i in the C-loop for(i = lo; i <= hi; i += incr), i.e. the upper bound hi is inclusive:
		for(i = lo; i <= hi; i += incr, ncpu++) {
			// The bitmap holds MAX_CORES bits = (MAX_CORES>>6) 64-bit words, so it is the word index which
			// must be bounded by MAX_CORES>>6; comparing it against MAX_CORES itself would allow writes
			// far beyond the end of CORE_SET:
			word = i>>6; bit = i & 63;	ASSERT(HERE, word < (MAX_CORES>>6), "Bitmap word exceeds MAX_CORES!");
			if(CORE_SET[word] & (1ull<<bit)) { sprintf(cbuf, "Core %d multiply specified in affinity-setting!",i);	ASSERT(HERE, 0, cbuf); }
			else { CORE_SET[word] |= 1ull<<bit; }
		}
		return ncpu;
	}

	/******************/
	// Parse a complete core-affinity string, consisting of one or more comma-separated affinity triplets,
	// and rebuild the global CORE_SET bitmap from it: The bitmap is cleared, each triplet is handed to
	// parseAffinityTriplet(), the resulting set of cores is echoed to stdout and NTHREADS is set to the
	// number of cores specified. Exits with an error message if more cores are specified than MAX_THREADS,
	// or if any specified core index is >= MAX_THREADS.
	void parseAffinityString(char*istr)
	{
		uint32 ncores = 0, nbits_set = 0, n_out_of_range = 0, idx;
		char *seg_start = istr, *comma;
		size_t seg_len;
		ASSERT(HERE, seg_start != 0x0, "Null input-string pointer!");
		size_t len = strlen(istr);	// length, not counting the \0 string terminator
		ASSERT(HERE, len > 0, "Zero input-string length!");
		ASSERT(HERE, len <= STR_MAX_LEN, "Excessive input-string length!");

		// Start from an empty core-affinity bitmap:
		for(idx = 0; idx < (MAX_CORES>>6); idx++) {
			CORE_SET[idx] = 0ull;
		}

		// Peel off the comma-terminated triplets one at a time, via a null-terminated copy in cbuf:
		for(comma = strchr(seg_start,','); comma != 0x0; comma = strchr(seg_start,',')) {
			seg_len = (size_t)(comma - seg_start);
			memcpy(cbuf, seg_start, seg_len);
			cbuf[seg_len] = '\0';
			ncores += parseAffinityTriplet(cbuf);
			seg_start = comma+1;
		}
		// What remains is the final (or only) triplet:
		ncores += parseAffinityTriplet(seg_start);

		// Echo the resulting core set, tallying the set bits and any core indices beyond the thread limit:
		printf("Set affinity for the following %u cores: ",ncores);
		for(idx = 0; idx < MAX_CORES; idx++) {
			if(CORE_SET[idx>>6] & (1ull<<(idx & 63))) {
				++nbits_set;
				printf("%u.",idx);
				if(idx >= MAX_THREADS) {	// Keep a count (rather than a flag) so the #offenders can be reported
					++n_out_of_range;
				}
			}
		}
		printf("\n");
		ASSERT(HERE, nbits_set == ncores, "Bitmap #set-bits mismatches #cpu!");

		NTHREADS = ncores;
		// Test the thread count first, since too many threads implies at least one out-of-range core index:
		if(NTHREADS > MAX_THREADS) {
			fprintf(stderr,"ERROR: NTHREADS [ = %d] must not exceed those of available logical cores = 0-%d!\n",NTHREADS,MAX_THREADS-1);
			exit(EXIT_FAILURE);
		}
		// This can be true even if #threads within bounds, e.g. 2 available cores (with indices 0,1) and user specifies -cpu 0,2:
		if(n_out_of_range) {
			fprintf(stderr,"ERROR: %d cores in user-specified core set have index exceeding those of available logical cores = 0-%d!\n",n_out_of_range,MAX_THREADS-1);
			exit(EXIT_FAILURE);
		}
	}

#endif	// MULTITHREAD ?

/***********************/

/* Convert a raw timing difference to seconds. In non-MULTITHREAD builds the input is
   divided by CLOCKS_PER_SEC; in MULTITHREAD builds it is returned unchanged. */
double get_time(double tdiff)
{
	double secs = tdiff;
#ifndef MULTITHREAD	// In || mode the mod_square routines use getRealTime() to accumulate wall-clock time, thus CLOCKS_PER_SEC not needed
	secs /= CLOCKS_PER_SEC;	/* NB: CLOCKS_PER_SEC may be a phony value used to scale clock() ranges */
#endif
	return secs;
}

/* Format a timing difference as an hh:mm:ss.mmm string, one decimal digit per field
   except for the leading tens-of-hours field, which is 2 characters wide. In
   non-MULTITHREAD builds the input is first divided by CLOCKS_PER_SEC.
   Returns a pointer to a function-static buffer which is overwritten by each call. */
char*get_time_str(double tdiff)
{
	static char cbuf[STR_MAX_LEN];
	int whole;		/* integer part of the time in seconds */
	double frac;	/* fractional part of the time in seconds */
#ifndef MULTITHREAD	// In || mode the mod_square routines use getRealTime() to accumulate wall-clock time, thus CLOCKS_PER_SEC not needed
	tdiff /= CLOCKS_PER_SEC;	/* NB: CLOCKS_PER_SEC may be a phony value used to scale clock() ranges */
#endif
	whole = (int)tdiff;
	frac  = tdiff - whole;
	sprintf(cbuf, "%2d%1d:%1d%1d:%1d%1d.%1d%1d%1d",
		whole/36000,           (whole%36000)/3600,	/* hours:   tens, ones */
		(whole%3600)/600,      (whole%600)/60,		/* minutes: tens, ones */
		(whole%60)/10,         whole%10,			/* seconds: tens, ones */
		(int)(10*frac), (int)(100*frac)%10, (int)(1000*frac)%10);	/* 1/10, 1/100, 1/1000 sec */
	return cbuf;
}

// EWM: Jun 2015: This code (and related smaller mods elsewhere) due to Alex Vong,
// as part of his Debian-freeware-packaging-of-Mlucas project:

/* MLUCAS_PATH is the prefix of all files opened by mlucas_fopen(), which
   prepends it verbatim to the file name it is given.
   It must end with a slash, except when it is the empty string.
   For example, "$HOME/.mlucas.d/" is a valid MLUCAS_PATH.

   MLUCAS_PATH is the empty string by default;
   the user can set its default value by defining the cpp macro MLUCAS_DEFAULT_PATH.
   set_mlucas_path() may replace the pointer at run time.

   NB: The initial value points to a string literal, so it must be neither
   written through nor passed to free().  */
#ifdef MLUCAS_DEFAULT_PATH
char *MLUCAS_PATH = MLUCAS_DEFAULT_PATH;
#else
char *MLUCAS_PATH = "";
#endif

/* Set the global variable MLUCAS_PATH according to 1. the environment variable
   MLUCAS_PATH and 2. the default value of the global variable MLUCAS_PATH
   Notice 1 has precedence over 2 since only 1 can be set at run-time

   Both 1 and 2 will be expanded by the shell so that user can set the default
   value of the global variable MLUCAS_PATH to be something
   like "$HOME/.mlucas.d/" (See cpp macro MLUCAS_DEFAULT_PATH for more details)
   which can be expanded at run-time

   The expansion is done by running `printf ""<path>' through popen(), with all
   spaces of <path> double-quoted by quote_spaces(). If the shell outputs
   nothing, MLUCAS_PATH is left as the unexpanded value.

   On success, set_mlucas_path() returns silently, with MLUCAS_PATH pointing to
   a heap buffer which is deliberately never freed (or still pointing to its
   compile-time default)
   On error, set_mlucas_path() prints the cause of error to stderr
   and calls ASSERT(HERE, 0, "Exiting.");

   possible errors:
   unable to allocate buffer
   unable to open pipe
   unable to read the shell output from the pipe
   path is longer than STR_MAX_LEN
   path does not end with a slash  */
void set_mlucas_path(void)
{
	char *env_copy = NULL;     /* heap copy of the environment value, if set */
	char *mlucas_path;         /* first the getenv() result, then the space-quoted path */
	char *cmdstr;              /* shell command handed to popen() */
	char *expanded_str = NULL; /* shell-expanded path read back from the pipe */
	int  tmp;
	FILE *pipe_ptr;
	size_t bufsize;
	size_t explen;
	int has_err = FALSE;

	mlucas_path = getenv("MLUCAS_PATH");
	if (mlucas_path != NULL) {
		/* Take a private copy; only publish it in the global once the
		   allocation has succeeded, so MLUCAS_PATH is never set to NULL */
		bufsize = strlen(mlucas_path) + 1;
		env_copy = (char*)malloc(bufsize);
		if (env_copy == NULL) {
			fprintf(stderr, "ERROR: unable to allocate buffer MLUCAS_PATH in set_mlucas_path()\n");
			has_err = TRUE;
			goto out_err_check;
		}
		strcpy(env_copy, mlucas_path);
		MLUCAS_PATH = env_copy;
	} else {
		bufsize = strlen(MLUCAS_PATH) + 1;
	}
	/* Worst case for quote_spaces(): every character is a space and triples */
	bufsize = (bufsize - 1) * 3 + 1;
	mlucas_path = (char*)malloc(bufsize);
	if (mlucas_path == NULL) {
		fprintf(stderr, "ERROR: unable to allocate buffer mlucas_path in set_mlucas_path()\n");
		has_err = TRUE;
		goto out_err_check;
	}

	quote_spaces(mlucas_path, MLUCAS_PATH);
	/* bufsize already includes the terminator, so this holds prefix + path + \0 */
	cmdstr = (char*)malloc(bufsize + strlen("printf \"\""));
	if (cmdstr == NULL) {
		fprintf(stderr, "ERROR: unable to allocate buffer cmdstr in set_mlucas_path()\n");
		has_err = TRUE;
		goto out_mlucas_path;
	}

	strcpy(cmdstr, "printf \"\"");
	strcat(cmdstr, mlucas_path);
	pipe_ptr = popen(cmdstr, "r");
	if (pipe_ptr == NULL) {
		fprintf(stderr, "ERROR: unable to open pipe pipe_ptr in set_mlucas_path()\n");
		has_err = TRUE;
		goto out_cmdstr;
	}

	tmp = getc(pipe_ptr); /* goto out_pipe if shell output nothing  */
	if (tmp == EOF)
		goto out_pipe;
	ungetc(tmp, pipe_ptr);

	expanded_str = (char*)malloc(STR_MAX_LEN + 1);
	if (expanded_str == NULL) {
		fprintf(stderr, "ERROR: unable to allocate buffer expanded_str in set_mlucas_path()\n");
		has_err = TRUE;
		goto out_pipe;
	}
	/* A failed read leaves the buffer contents indeterminate, so it must not
	   be passed to strlen() below */
	if (fgets(expanded_str, STR_MAX_LEN + 1, pipe_ptr) == NULL) {
		fprintf(stderr, "ERROR: unable to read from pipe pipe_ptr in set_mlucas_path()\n");
		has_err = TRUE;
		goto out_expanded_str;
	}
	/* Anything left in the pipe means the output did not fit in the buffer */
	if (getc(pipe_ptr) != EOF) {
		fprintf(stderr, "ERROR: environment variable MLUCAS_PATH or cpp macro MLUCAS_DEFAULT_PATH is longer than STR_MAX_LEN in set_mlucas_path()\n");
		has_err = TRUE;
		goto out_expanded_str;
	}
	explen = strlen(expanded_str);
	if (explen == 0 || expanded_str[explen - 1] != '/') {
		fprintf(stderr, "ERROR: environment variable MLUCAS_PATH or cpp macro MLUCAS_DEFAULT_PATH does not end with a slash in set_mlucas_path()\n");
		has_err = TRUE;
		goto out_expanded_str;
	}

	/* Success: the global takes ownership of the expanded buffer (never
	   freed), and the now-unreferenced environment copy can be released */
	MLUCAS_PATH = expanded_str;
	expanded_str = NULL;
	free(env_copy);
	out_expanded_str:
	free(expanded_str); /* no-op on the success path  */
	out_pipe:
	pclose(pipe_ptr);
	out_cmdstr:
	free(cmdstr);
	out_mlucas_path:
	free(mlucas_path);
	out_err_check:
	if (has_err)
		ASSERT(HERE, 0, "Exiting.");
}

/* Copy the string src to dest, wrapping every space character in a pair of
   double-quotes; all other characters are copied unchanged.
   If src (including its terminator) occupies b bytes, dest must provide at
   least (b - 1) * 3 + 1 bytes, the worst case being a string of spaces only.
   Returns dest.

   example: `I am happy' will be transformed to `I" "am" "happy'  */
char *quote_spaces(char *dest, char *src)
{
	const char *in;
	char *out = dest;

	for (in = src; *in != '\0'; ++in) {
		if (*in != ' ') {
			*out++ = *in;
			continue;
		}
		/* a space becomes the 3-character sequence `" "'  */
		*out++ = '"';
		*out++ = ' ';
		*out++ = '"';
	}
	*out = '\0';
	return dest;
}

/* Emulate `mkdir -p path'
   Runs `mkdir' via system() on each successive prefix of `path' (every path
   component is shell-quoted by shell_quote(); failures, e.g. because the
   directory already exists, are ignored), then probes the result by opening
   a temporary file inside it for appending and removing that file again.

   `path' must be at most STR_MAX_LEN characters long.
   NOTE(review): the command buffers assume STR_MAX_LEN is large enough (about
   50 or more) for 4 * STR_MAX_LEN to cover a fully quoted path plus the fixed
   command text - confirm against the definition of STR_MAX_LEN.

   Return 0 if the directory `path' exists and is writable
   Return 1 if the directory does not exist or is not writable, or if `path'
   is null, empty or longer than STR_MAX_LEN  */
int mkdir_p(char *path)
{
	char mlucas_path[STR_MAX_LEN + 1];
	char cmdstr[4 * STR_MAX_LEN + 1];
	char tmp[4 * STR_MAX_LEN + 1] = "";
	char *tok;
	FILE *fp;

	/* Reject input which would overflow the local copy below */
	if (path == NULL || path[0] == '\0' || strlen(path) > STR_MAX_LEN)
		return 1;
	strcpy(mlucas_path, path);
	if (mlucas_path[0] == '/')
		strcpy(tmp, "/");

	/* tmp accumulates the shell-quoted path, one component per pass */
	for (tok = strtok(mlucas_path, "/");
	     tok != NULL;
	     tok = strtok(NULL, "/")) {
		shell_quote(cmdstr, tok);
		strcat(tmp, cmdstr);
		strcat(tmp, "/");
		strcpy(cmdstr, "mkdir ");
		strcat(cmdstr, tmp);
		strcat(cmdstr, " 2> /dev/null");
		system(cmdstr);
	}

	/* Let the shell undo the quoting to recover the literal name of the probe
	   file. The quoted path is passed as an argument to an explicit %s format,
	   so that `%' and `\' in the path are not interpreted by printf  */
	strcat(tmp, "_Mlucas_util_c_mkdir_p_tmp");
	strcpy(cmdstr, "printf %s ");
	strcat(cmdstr, tmp);
	fp = popen(cmdstr, "r");
	if (fp == NULL) {
		fprintf(stderr, "ERROR: unable to open pipe fp in mkdir_p()\n");
		ASSERT(HERE, 0, "Exiting.");
	}
	/* Read with the full buffer size, so a path close to STR_MAX_LEN is not
	   truncated by the appended probe-file name */
	if (fgets(tmp, sizeof tmp, fp) == NULL) {
		pclose(fp);
		return 1;
	}
	pclose(fp);

	fp = fopen(tmp, "a");
	if (fp == NULL)
		return 1;
	fclose(fp);

	/* tmp now holds the unquoted file name, which must not be handed back to
	   the shell (spaces or metacharacters would be misparsed), so delete the
	   probe file directly; as with `rm -f', a failure here is ignored  */
	remove(tmp);
	return 0;
}

/* Copy the string src to dest with every character individually quoted for
   the shell: a single-quote is wrapped in double-quotes, any other character
   is wrapped in single-quotes.
   If src (including its terminator) occupies b bytes, dest needs exactly
   (b - 1) * 3 + 1 bytes, since each character becomes three.
   Returns dest.

   example: 'a' 'b' will be transformed to "'"'a'"'"' '"'"'b'"'"  */
char *shell_quote(char *dest, char *src)
{
	const char *in;
	char *out = dest;

	for (in = src; *in != '\0'; ++in) {
		/* pick the quote character which differs from the one being quoted */
		const char wrap = (*in == '\'') ? '"' : '\'';

		*out++ = wrap;
		*out++ = *in;
		*out++ = wrap;
	}
	*out = '\0';
	return dest;
}

/* Append path to global variable MLUCAS_PATH to form mlucas_path,
   which is then passed to fopen()

   Both MLUCAS_PATH and path are expected to be at most STR_MAX_LEN long, but
   since nothing enforces this for path, the combined length is checked
   against the local buffer before anything is copied.

   Returns the stream returned by fopen(), or NULL if path or mode is a null
   pointer or if the combined file name does not fit in the buffer  */
FILE *mlucas_fopen(const char *path, const char *mode)
{
	char mlucas_path[2 * STR_MAX_LEN + 1];
	size_t prefix_len;
	size_t path_len;

	if (path == NULL || mode == NULL)
		return NULL;

	prefix_len = strlen(MLUCAS_PATH);
	path_len = strlen(path);
	/* Need prefix_len + path_len + 1 <= sizeof mlucas_path, written so that
	   the unsigned arithmetic cannot wrap around */
	if (prefix_len >= sizeof mlucas_path
	    || path_len >= sizeof mlucas_path - prefix_len)
		return NULL;

	memcpy(mlucas_path, MLUCAS_PATH, prefix_len);
	memcpy(mlucas_path + prefix_len, path, path_len + 1); /* includes the \0  */
	return fopen(mlucas_path, mode);
}

/*********************/
/* Write the input string to the current-assignment logfile (STATFILE, opened in append mode
   via mlucas_fopen) and/or to stderr, as selected by the echo_to_stderr flag:
	flag:	output to:
	-----	--------------
		0	logfile
		1	logfile, stderr
	 >= 2	stderr
   A null or empty input string triggers an ASSERT. If the logfile cannot be opened, the
   logfile output is silently skipped.
*/
void mlucas_fprint(char*const cstr, uint32 echo_to_stderr)
{
	FILE *logfile;
	ASSERT(HERE, cstr != 0x0 && strlen(cstr) > 0,"Null string-pointer or empty string supplied to mlucas_fprint!");

	// Any nonzero flag value echoes to stderr:
	if(echo_to_stderr != 0) {
		fputs(cstr, stderr);
	}
	// Flag values of 2 and above suppress the logfile output:
	if(echo_to_stderr >= 2) {
		return;
	}
	logfile = mlucas_fopen(STATFILE,"a");
	if(logfile == 0x0) {
		return;
	}
	fputs(cstr, logfile);
	fclose(logfile);
}

/*********************/
// Return numerical value of specified-by-name user option in double-float form, if found in the specified file. The
// targeted option is assumed to be formatted as [optname][ws][=][ws][value], with value representable as an IEEE64 double.
// The file is opened via mlucas_fopen() and scanned line by line; the first line containing optname followed (anywhere
// later in the line) by an '=' determines the result, and no further lines are examined.
// Any failure to obtain a valid value for the option - file cannot be opened, option not found, or the text following
// the '=' does not start with a number - returns a NaN; the caller should check for this using isNaN(result):
double mlucas_getOptVal(const char*fname, char*optname)
{
	char cstr[STR_MAX_LEN], *cptr,*cadd,*endp;
	ASSERT(HERE, fname != 0x0 && strlen(fname) > 0,"Null filename-pointer or empty string supplied to mlucas_getOptVal!");
	FILE *fptr = mlucas_fopen(fname,"r");
	double result = strtod("NaN", 0x0), val;
	if(fptr) {
		while(fgets(cstr, STR_MAX_LEN, fptr)) {
			if((cptr = strstr(cstr,optname)) != 0x0) {
				if((cadd = strstr(cptr + strlen(optname),"=")) != 0x0) {
					// strtod returns 0 and leaves endp at the start of its input if no conversion could be
					// performed; in that case keep the NaN, as promised to the caller:
					val = strtod(cadd+1,&endp);
					if(endp != cadd+1)
						result = val;
					break;	// Use first occurrence of option in file; exit via the fclose below rather than
							// returning from inside the loop, which would leak the open file handle
				}
			}
		}
		fclose(fptr);	fptr = 0x0;
	}
	return result;
}

