$include_dir="/home/hyper-archives/boost-commit/include"; include("$include_dir/msg-header.inc") ?>
From: arseny.kapoulkine_at_[hidden]
Date: 2007-08-02 17:02:50
Author: zeux
Date: 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
New Revision: 38408
URL: http://svn.boost.org/trac/boost/changeset/38408
Log:
FFT improvements & refactoring
Added:
   sandbox/SOC/2007/bigint/fft/primes.h   (contents, props changed)
Removed:
   sandbox/SOC/2007/bigint/fft/main_slow.cpp
   sandbox/SOC/2007/bigint/fft/main_word.cpp
   sandbox/SOC/2007/bigint/fft/main_word_fast.cpp
   sandbox/SOC/2007/bigint/fft/main_word_weird.cpp
   sandbox/SOC/2007/bigint/fft/sieve.cpp
Text files modified: 
   sandbox/SOC/2007/bigint/fft/main.cpp  |   374 +++++++++++++++++++-------------------- 
   sandbox/SOC/2007/bigint/fft/roots.cpp |    31 +++                                     
   2 files changed, 211 insertions(+), 194 deletions(-)
Modified: sandbox/SOC/2007/bigint/fft/main.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main.cpp	(original)
+++ sandbox/SOC/2007/bigint/fft/main.cpp	2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
@@ -1,58 +1,48 @@
 #include <iostream>
-
+#include <fstream>
 #include <windows.h>
 
-const int base = 10;
-typedef short limb_t;
+#include "primes.h"
 
-struct print
+template <typename T> void P(const char* text, const T* data, size_t size)
 {
-	const limb_t* _data;
-	size_t _size;
-
-	print(const limb_t* data, size_t size): _data(data), _size(size)
-	{
-	}
+	std::cout << text;
+	for (size_t i = 0; i < size; ++i) std::cout << " " << std::dec << data[i] << std::dec;
+	std::cout << std::endl;
+}
 
-	std::ostream& operator()(std::ostream& os) const
-	{
-		for (const limb_t* i = _data; i != _data + _size; ++i)
-			os << *i;
-	
-		return os;
-	}
-};
+template <typename T> void S(const char* file, const T* data, size_t size)
+{
+	std::ofstream out(file);
+	for (size_t i = 0; i < size; ++i) out << std::hex << data[i] << std::dec;
+}
 
-template <typename T> void P(const char* text, const T* data, size_t size)
+unsigned int modmul(unsigned int a, unsigned int b, unsigned int prime)
 {
-	std::cout << text << ": ";
-	for (size_t i = 0; i < size; ++i) std::cout << data[i] << " ";
-	std::cout << std::endl;
+	return static_cast<unsigned int>(static_cast<unsigned __int64>(a) * b % prime);
 }
 
-int pow(int base, int power, int prime)
+unsigned int modadd(unsigned int a, unsigned int b, unsigned int prime)
 {
-	int pot = power;
+	return (a + b) % prime;
+}
 
-	// Find largest power of two that is >= rhs
-	while (pot < power && (pot << 1) != 0)
-		pot <<= 1;
+unsigned int modsub(unsigned int a, unsigned int b, unsigned int prime)
+{
+	return (a + prime - b) % prime;
+}
 
-	int res = 1;
+unsigned int log2(unsigned int value)
+{
+	unsigned int r = 0;
 
-	while (pot > 0)
+	while (value > 1)
         {
-		res = (res * res) % prime;
-				
-		if ((power & pot) != 0)
-		{
-			res = (res * base) % prime;
-		}
-  
-		pot >>= 1;
+		++r;
+		value >>= 1;
         }
 
-	return res;
+	return r;
 }
 
 // reverse(x) -> reverse(x+1)
@@ -69,20 +59,6 @@
 }
 
 // swaps values with indices that have reversed bit representation (data[1100b] <-> data[0011b])
-void fft_reorder(unsigned int* data, size_t size)
-{
-	if (size <= 2) return;
-
-	size_t r = 0;
-
-	for (size_t x = 1; x < size; ++x)
-	{
-		r = rev_next(r, size);
-
-		if (r > x) std::swap(data[x], data[r]);
-	}
-}
-
 template <typename T> void fft_copy_reorder(unsigned int* dest, const T* source, size_t size)
 {
         if (size <= 2)
@@ -103,46 +79,40 @@
         }
 }
 
-void fft_recursive(unsigned int* data, size_t size, int prime, int root)
+void fft_recursive(unsigned int* data, size_t size, unsigned int prime, const unsigned int* roots)
 {
         size /= 2;
 
         if (size > 1)
         {
-		int new_root = (root * root) % prime;
-		fft_recursive(data, size, prime, new_root);
-		fft_recursive(data + size, size, prime, new_root);
+		fft_recursive(data, size, prime, roots - 1);
+		fft_recursive(data + size, size, prime, roots - 1);
         }
 
+	unsigned int root = *roots;
         unsigned int r = 1;
 
         for (size_t i = 0; i < size; ++i)
         {
                 unsigned int a = data[i];
-		unsigned int b = (data[i + size] * r) % prime;
+		unsigned int b = modmul(data[i + size], r, prime);
 
-		data[i] = (a + b) % prime;
-		data[i + size] = (a + prime - b) % prime;
+		data[i] = modadd(a, b, prime);
+		data[i + size] = modsub(a, b, prime);
 
-		r = (r * root) % prime;
+		r = modmul(r, root, prime);
         }
 }
 
-void fft_iterative(unsigned int* data, size_t size, int prime, int root)
+void fft_iterative(unsigned int* data, size_t size, unsigned int prime, const unsigned int* roots)
 {
         size_t step = 1;
 
-	int roots[4];
-	roots[3] = root;
-	roots[2] = (roots[3] * roots[3]) % prime;
-	roots[1] = (roots[2] * roots[2]) % prime;
-	roots[0] = (roots[1] * roots[1]) % prime;
-
-	int nstep = 0;
+	unsigned int nstep = 0;
 
         while (step < size)
         {
-		root = roots[nstep++];
+		unsigned int root = roots[++nstep];
 
                 size_t half_step = step;
                 step *= 2;
@@ -154,208 +124,210 @@
                         for (size_t i = j; i < size; i += step)
                         {
                                 unsigned int a = data[i];
-				unsigned int b = (data[i + half_step] * r) % prime;
+				unsigned int b = modmul(data[i + half_step], r, prime);
 
-				data[i] = (a + b) % prime;
-				data[i + half_step] = (a + prime - b) % prime;
+				data[i] = modadd(a, b, prime);
+				data[i + half_step] = modsub(a, b, prime);
                         }
                         
-			r = (r * root) % prime;
+			r = modmul(r, root, prime);
                 }
         }
 }
 
-void fft(unsigned int* dest, const limb_t* source, size_t N, int prime, int root)
+void dft(unsigned int* dest, const unsigned short* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table)
 {
         fft_copy_reorder(dest, source, N);
-	fft_recursive(dest, N, prime, root);
+	fft_recursive(dest, N, prime, root_table + log2N);
 }
 
-void fft_i(unsigned int* dest, const limb_t* source, size_t N, int prime, int root)
+void dft_i(unsigned int* dest, const unsigned short* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table)
 {
         fft_copy_reorder(dest, source, N);
-	fft_iterative(dest, N, prime, root);
+	fft_iterative(dest, N, prime, root_table);
 }
 
-void fft_slow(unsigned int* dest, const limb_t* source, size_t N, int prime, int root)
+void ift(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
 {
-	unsigned int p_j = 1;
-
-	for (int j = 0; j < N; ++j)
-	{
-		unsigned int r = 0;
-
-		unsigned int p_jk = 1;
-
-		for (int k = 0; k < N; ++k)
-		{
-			r = (r + source[k] * p_jk) % prime;
-			p_jk = (p_jk * p_j) % prime;
-		}
-
-		dest[j] = r;
-		
-		p_j = (p_j * root) % prime;
-	}
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root)
-{
-	int rroot = pow(root, prime - 2, prime);
-
         fft_copy_reorder(dest, source, N);
-	fft_recursive(dest, N, prime, rroot);
-
-	unsigned int N_inv = pow(N, prime - 2, prime);
+	fft_recursive(dest, N, prime, root_table + log2N);
 
         for (size_t i = 0; i < N; ++i)
         {
-		dest[i] = dest[i] * N_inv % prime;
+		dest[i] = modmul(dest[i], inv_N, prime);
         }
 }
 
-void ift_i(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root)
+void ift_i(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
 {
-	int rroot = pow(root, prime - 2, prime);
-
         fft_copy_reorder(dest, source, N);
-	fft_iterative(dest, N, prime, rroot);
-
-	unsigned int N_inv = pow(N, prime - 2, prime);
+	fft_iterative(dest, N, prime, root_table);
 
         for (size_t i = 0; i < N; ++i)
         {
-		dest[i] = dest[i] * N_inv % prime;
+		dest[i] = modmul(dest[i], inv_N, prime);
         }
 }
 
-void ift_slow(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root)
+void fft_crt_carry(unsigned short* dest, const unsigned int* conv0, const unsigned int* conv1, size_t N)
 {
-	int rroot = pow(root, prime - 2, prime);
-	int N_inv = pow(N, prime - 2, prime);
+	unsigned __int64 prime = static_cast<unsigned __int64>(fft_primes[0]) * fft_primes[1];
+	unsigned int inv_p0_mod_p1 = fft_inv_p0_mod_p1;
 
-	unsigned int p_j = 1;
+	unsigned __int64 carry = 0;
 
-	for (int j = 0; j < N; ++j)
-	{
-		unsigned int p_jk = 1;
+	for (int i = 0; i < N; ++i)
+	{	
+		// 64 x == 32 c0 mod 32 p0
+		// 64 x == 32 c1 mod 32 p1
 
-		unsigned int r = 0;
+		// x == p0*t + c0
+		// p0*t + c0 == c1  mod p1
+		// t == (c1 - c0) div p0  mod p1
 
-		for (int k = 0; k < N; ++k)
-		{
-			r = (r + source[k] * p_jk) % prime;
+		// t == t0 mod p1
 
-			p_jk = (p_jk * p_j) % prime;
-		}
+		// x == p0 * t0 + c0
+		// t0 == (c1 - c0) div p0  mod p1
 
-		r = (r * N_inv) % prime;
+		unsigned int t0 = modmul(modsub(conv1[i], conv0[i], fft_primes[1]), inv_p0_mod_p1, fft_primes[1]);
+		unsigned __int64 x = static_cast<unsigned __int64>(fft_primes[0]) * t0 + conv0[i];
 
-		dest[j] = r;
-		
-		p_j = (p_j * rroot) % prime;
+		carry += x;
+
+		dest[i] = carry & 0xffff;
+		carry >>= 16;
         }
 }
 
-typedef void (*fftfunc)(unsigned int* dest, const limb_t* source, size_t N, int prime, int root);
-typedef void (*iftfunc)(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root);
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b, fftfunc fft, iftfunc ift)
+// precondition: dest has N elements allocated, a and b have N elements, last N/2 are 0, N is power of two
+// postcondition: dest has product, padded with zeroes
+void mul_fft(unsigned short* dest, const unsigned short* a, const unsigned short* b, size_t N,
+	void (*dft)(unsigned int* dest, const unsigned short* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table),
+	void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
+	)
 {
-	// p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
-	const size_t N = 16;
+	unsigned int log2N = log2(N);
         
-	// N * base^2 == 1600
-	// prime0 * prime1 must be > 1600
-	const int primes[] = {17, 97};
-
-	// primitive root (power N)
-	const int roots[] = {3, 8};
-
-	unsigned int convs[2][N];
+	unsigned int* workspace = new unsigned int[4*N];
+	
+	unsigned int* convs[] = {workspace, workspace + N};
+	unsigned int* fft_a = workspace + 2*N;
+	unsigned int* fft_b = workspace + 3*N;
 
         for (int p = 0; p < 2; ++p)
         {
-		unsigned int fft_a[N], fft_b[N], fft_c[N];
-		fft(fft_a, a, N, primes[p], roots[p]);
-		fft(fft_b, b, N, primes[p], roots[p]);
+		const unsigned int* root_table = fft_primitive_roots[p];
+		unsigned int prime = fft_primes[p];
+
+		dft(fft_a, a, N, log2N, prime, root_table);
+		dft(fft_b, b, N, log2N, prime, root_table);
 
                 for (size_t i = 0; i < N; ++i)
                 {
-			fft_c[i] = (fft_a[i] * fft_b[i]) % primes[p];
+			fft_a[i] = modmul(fft_a[i], fft_b[i], prime);
                 }
+		
+		const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
+		unsigned int inv_N = fft_inv_N[p][log2N];
 
-		ift(convs[p], fft_c, N, primes[p], roots[p]);
+		ift(convs[p], fft_a, N, log2N, prime, inv_root_table, inv_N);
         }
 
-	// CRT:
-	// x % p1 = r1
-	// x % p2 = r2
+	fft_crt_carry(dest, convs[0], convs[1], N);
 
-	// x % p1p2 = r1 * (P/p1)*T1 + r2 * (P/p2)*T2
-	// T1 = (P/p1) ^ (p1 - 2)
-	// T2 = (P/p2) ^ (p2 - 2)
+	delete[] workspace;
+}
 
-	int prime = primes[0] * primes[1];
-	int T[2] = {pow(primes[1], primes[0] - 2, primes[0]), pow(primes[0], primes[1] - 2, primes[1])};
+// precondition: dest has N elements allocated, a has N elements, last N/2 are 0, N is power of two
+// postcondition: dest has product, padded with zeroes
+void sqr_fft(unsigned short* dest, const unsigned short* a, size_t N,
+			 void (*dft)(unsigned int* dest, const unsigned short* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table),
+			 void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
+			 )
+{
+	unsigned int log2N = log2(N);
 
-	unsigned int carry = 0;
+	unsigned int* workspace = new unsigned int[3*N];
+	
+	unsigned int* convs[] = {workspace, workspace + N};
+	unsigned int* fft = workspace + 2*N;
 
-	for (int i = 0; i < N; ++i)
+	for (int p = 0; p < 2; ++p)
         {
-		int conv = (convs[0][i] * T[0] * primes[1] + convs[1][i] * T[1] * primes[0]) % prime;
-		
-		carry += conv;
-		dest[i] = carry % 10;
-		carry /= 10;
+		const unsigned int* root_table = fft_primitive_roots[p];
+		unsigned int prime = fft_primes[p];
+
+		dft(fft, a, N, log2N, prime, root_table);
+
+		for (size_t i = 0; i < N; ++i)
+		{
+			fft[i] = modmul(fft[i], fft[i], prime);
+		}
+
+		const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
+		unsigned int inv_N = fft_inv_N[p][log2N];
+
+		ift(convs[p], fft, N, log2N, prime, inv_root_table, inv_N);
         }
+
+	fft_crt_carry(dest, convs[0], convs[1], N);
+
+	delete[] workspace;
 }
 
-void mul_basecase(limb_t* dest, const limb_t* a, const limb_t* b)
+void mul_basecase(unsigned short* dest, const unsigned short* a, const unsigned short* b, size_t N)
 {
-	typedef int limb2_t;
+	typedef unsigned int limb2_t;
 
-	const size_t N = 16;
+	memset(dest, 0, sizeof(unsigned short) * N);
 
-	memset(dest, 0, sizeof(limb_t) * N);
-
-	limb_t* i = dest;
+	unsigned short* i = dest;
                         
-	for (const limb_t* li = a; li != a + N/2; ++li, ++i)
+	for (const unsigned short* li = a; li != a + N/2; ++li, ++i)
         {
-		limb_t carry = 0;
+		unsigned short carry = 0;
 
-		limb_t* ci = i;
+		unsigned short* ci = i;
 
-		for (const limb_t* ri = b; ri != b + N/2; ++ri)
+		for (const unsigned short* ri = b; ri != b + N/2; ++ri)
                 {
                         limb2_t result = static_cast<limb2_t>(*li) * *ri + *ci + carry;
 
-			*ci++ = static_cast<limb_t>(result % 10);
+			*ci++ = static_cast<unsigned short>(result & 0xffff);
 
-			carry = static_cast<limb_t>(result / 10);
+			carry = static_cast<unsigned short>(result >> 16);
                 }
 
                 while (carry != 0)
                 {
                         limb2_t result = static_cast<limb2_t>(*ci) + carry;
                                         
-			*ci++ = static_cast<limb_t>(result % 10);
+			*ci++ = static_cast<unsigned short>(result & 0xffff);
 
-			carry = static_cast<limb_t>(result / 10);
+			carry = static_cast<unsigned short>(result >> 16);
                 }
         }
 }
 
 int main()
 {
-	limb_t a[16] = {2, 3, 4, 5, 6, 7, 8, 9};
-	limb_t b[16] = {1, 2, 3, 4, 5, 6, 7, 8};
+	size_t bits = (1 << 20) * 16;
+	size_t N = bits / 16;
 
-	limb_t c[16] = {};
+	unsigned short* a = new unsigned short[N];
+	unsigned short* b = new unsigned short[N];
+	unsigned short* c = new unsigned short[N];
 
-	std::cout << "A = "; print(a, 16)(std::cout) << std::endl;
-	std::cout << "B = "; print(b, 16)(std::cout) << std::endl;
+	for (size_t i = 0; i < N/2; ++i)
+	{
+		a[i] = b[i] = 65535;
+	}
+
+	for (size_t i = N/2; i < N; ++i)
+	{
+		a[i] = b[i] = 0;
+	}
         
         int count = 1;
         LARGE_INTEGER start, end, freq;
@@ -364,29 +336,45 @@
 
         QueryPerformanceCounter(&start);
         for (int i = 0; i < count; ++i)
-		mul_fft(c, a, b, fft, ift);
+		mul_fft(c, a, b, N, dft, ift);
         QueryPerformanceCounter(&end);
-	
-	std::cout << "Recursive FFT: C = "; print(c, 16)(std::cout) << " " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+
+	std::cout << "Recursive: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+	S("res_fft_rec.txt", c, N);
 
         QueryPerformanceCounter(&start);
         for (int i = 0; i < count; ++i)
-		mul_fft(c, a, b, fft_i, ift_i);
+		mul_fft(c, a, b, N, dft_i, ift_i);
+	QueryPerformanceCounter(&end);
+
+	std::cout << "Iterative: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+	S("res_fft_iter.txt", c, N);
+
+	QueryPerformanceCounter(&start);
+	for (int i = 0; i < count; ++i)
+		sqr_fft(c, a, N, dft, ift);
         QueryPerformanceCounter(&end);
         
-	std::cout << "Iterative FFT: C = "; print(c, 16)(std::cout) << " " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+	std::cout << "Recursive SQR: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+	S("res_fft_rec_sqr.txt", c, N);
         
         QueryPerformanceCounter(&start);
         for (int i = 0; i < count; ++i)
-		mul_fft(c, a, b, fft_slow, ift_slow);
+		sqr_fft(c, a, N, dft_i, ift_i);
         QueryPerformanceCounter(&end);
         
-	std::cout << "Slow FT: C = "; print(c, 16)(std::cout) << " " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-	
+	std::cout << "Iterative SQR: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+	S("res_fft_iter_sqr.txt", c, N);
+
         QueryPerformanceCounter(&start);
         for (int i = 0; i < count; ++i)
-		mul_basecase(c, a, b);
+		mul_basecase(c, a, b, N);
         QueryPerformanceCounter(&end);
         
-	std::cout << "Basecase: C = "; print(c, 16)(std::cout) << " " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+	std::cout << "Basecase: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+	S("res_basecase.txt", c, N);
+
+	delete[] a;
+	delete[] b;
+	delete[] c;
 }
Deleted: sandbox/SOC/2007/bigint/fft/main_slow.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main_slow.cpp	2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,155 +0,0 @@
-#include <iostream>
-
-const int base = 10;
-typedef short limb_t;
-
-struct print
-{
-	const limb_t* _data;
-	size_t _size;
-
-	print(const limb_t* data, size_t size): _data(data), _size(size)
-	{
-	}
-
-	std::ostream& operator()(std::ostream& os) const
-	{
-		for (const limb_t* i = _data; i != _data + _size; ++i)
-			os << *i;
-	
-		return os;
-	}
-};
-
-int pow(int base, int power, int prime)
-{
-	int res = 1;
-
-	for (int i = 0; i < power; ++i)
-	{
-		res = (res * base) % prime;
-	}
-
-	return res;
-}
-
-void fft(unsigned int* dest, const limb_t* source, size_t N, int prime, int root)
-{
-	for (int j = 0; j < N; ++j)
-	{
-		unsigned int r = 0;
-
-		for (int k = 0; k < N; ++k)
-		{
-			r = (r + ((source[k] * pow(root, j*k, prime)) % prime)) % prime;
-		}
-
-		dest[j] = r;
-	}
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root)
-{
-	int rroot = pow(root, prime - 2, prime);
-
-	for (int j = 0; j < N; ++j)
-	{
-		unsigned int r = 0;
-
-		for (int k = 0; k < N; ++k)
-		{
-			r = (r + ((source[k] * pow(rroot, j*k, prime)) % prime)) % prime;
-		}
-
-		r = (r * pow(N, prime - 2, prime)) % prime;
-
-		dest[j] = r;
-	}
-}
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b)
-{
-	// p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
-	const size_t N = 16;
-	
-	// N * base^2 == 1600
-	// prime0 * prime1 must be > 1600
-	const int primes[] = {17, 97};
-
-	// primitive root (power N)
-	const int roots[] = {3, 8};
-
-	unsigned int convs[2][N];
-
-	for (int p = 0; p < 2; ++p)
-	{
-		unsigned int fft_a[N], fft_b[N], fft_c[N];
-		fft(fft_a, a, N, primes[p], roots[p]);
-		fft(fft_b, b, N, primes[p], roots[p]);
-
-		for (size_t i = 0; i < N; ++i)
-		{
-			fft_c[i] = (fft_a[i] * fft_b[i]) % primes[p];
-		}
-
-		ift(convs[p], fft_c, N, primes[p], roots[p]);
-	}
-
-	unsigned int conv[N];
-
-	// CRT:
-	// x % p1 = r1
-	// x % p2 = r2
-
-	// x % p1p2 = r1 * (P/p1)*T1 + r2 * (P/p2)*T2
-	// T1 = (P/p1) ^ (p1 - 2)
-	// T2 = (P/p2) ^ (p2 - 2)
-
-	int prime = primes[0] * primes[1];
-	int T[2] = {pow(primes[1], primes[0] - 2, primes[0]), pow(primes[0], primes[1] - 2, primes[1])};
-
-	for (int i = 0; i < N; ++i)
-	{
-		conv[i] = (convs[0][i] * T[0] * primes[1] + convs[1][i] * T[1] * primes[0]) % prime;
-	}
-
-	std::cout << std::endl;
-	for (int i = 0; i < N; ++i)
-	{
-		std::cout << convs[0][i] << " ";
-	}
-	std::cout << std::endl;
-	for (int i = 0; i < N; ++i)
-	{
-		std::cout << convs[1][i] << " ";
-	}
-	std::cout << std::endl;
-	for (int i = 0; i < N; ++i)
-	{
-		std::cout << conv[i] << " ";
-	}
-	std::cout << std::endl;
-
-	unsigned int carry = 0;
-
-	for (int i = 0; i < N; ++i)
-	{
-		carry += conv[i];
-		dest[i] = carry % 10;
-		carry /= 10;
-	}
-}
-
-int main()
-{
-	limb_t a[16] = {2, 3, 4, 5, 6, 7, 8, 9};
-	limb_t b[16] = {1, 2, 3, 4, 5, 6, 7, 8};
-
-	limb_t c[16] = {};
-
-	mul_fft(c, a, b);
-
-	std::cout << "A = "; print(a, 16)(std::cout) << std::endl;
-	std::cout << "B = "; print(b, 16)(std::cout) << std::endl;
-	std::cout << "C = "; print(c, 16)(std::cout) << std::endl;
-}
Deleted: sandbox/SOC/2007/bigint/fft/main_word.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main_word.cpp	2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,260 +0,0 @@
-#include <iostream>
-
-#include <windows.h>
-
-#define __int64 long long
-
-const unsigned int base = 65536;
-typedef unsigned short limb_t;
-
-template <typename T> void P(const char* text, const T* data, size_t size)
-{
-	std::cout << text;
-	for (size_t i = 0; i < size; ++i) std::cout << " " << std::dec << data[i] << std::dec;
-	std::cout << std::endl;
-}
-
-unsigned int modmul(unsigned int a, unsigned int b, unsigned int prime)
-{
-	return static_cast<unsigned int>(static_cast<unsigned __int64>(a) * b % prime);
-}
-
-unsigned int modadd(unsigned int a, unsigned int b, unsigned int prime)
-{
-	return (a + b) % prime;
-}
-
-unsigned int modsub(unsigned int a, unsigned int b, unsigned int prime)
-{
-	return (a + prime - b) % prime;
-}
-
-unsigned int pow(unsigned int base, unsigned int power, unsigned int prime)
-{
-	unsigned int pot = 1;
-
-	// Find largest power of two that is >= rhs
-	while (pot < power && (pot << 1) != 0)
-		pot <<= 1;
-
-	unsigned int res = 1;
-
-	while (pot > 0)
-	{
-		res = modmul(res, res, prime);
-				
-		if ((power & pot) != 0)
-		{
-			res = modmul(res, base, prime);
-		}
-  
-		pot >>= 1;
-	}
-
-	return res;
-}
-
-// reverse(x) -> reverse(x+1)
-size_t rev_next(size_t x, size_t n)
-{
-	do
-	{
-		n >>= 1;
-		x ^= n;
-	}
-	while ((x & n) == 0);
-
-	return x;
-}
-
-// swaps values with indices that have reversed bit representation (data[1100b] <-> data[0011b])
-template <typename T> void fft_copy_reorder(unsigned int* dest, const T* source, size_t size)
-{
-	if (size <= 2)
-	{
-		std::copy(source, source + size, dest);
-		return;
-	}
-
-	size_t r = 0;
-
-	dest[0] = source[0];
-
-	for (size_t x = 1; x < size; ++x)
-	{
-		r = rev_next(r, size);
-
-		dest[x] = source[r];
-	}
-}
-
-void fft_recursive(unsigned int* data, size_t size, unsigned int prime, unsigned int root)
-{
-	size /= 2;
-
-	if (size > 1)
-	{
-		unsigned int new_root = modmul(root, root, prime);
-		fft_recursive(data, size, prime, new_root);
-		fft_recursive(data + size, size, prime, new_root);
-	}
-
-	unsigned int r = 1;
-
-	for (size_t i = 0; i < size; ++i)
-	{
-		unsigned int a = data[i];
-		unsigned int b = modmul(data[i + size], r, prime);
-
-		data[i] = modadd(a, b, prime);
-		data[i + size] = modsub(a, b, prime);
-
-		r = modmul(r, root, prime);
-	}
-}
-
-void fft(unsigned int* dest, const limb_t* source, size_t N, unsigned int prime, unsigned int root)
-{
-	fft_copy_reorder(dest, source, N);
-	fft_recursive(dest, N, prime, root);
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, unsigned int prime, unsigned int root)
-{
-	unsigned int rroot = pow(root, prime - 2, prime);
-
-	fft_copy_reorder(dest, source, N);
-	fft_recursive(dest, N, prime, rroot);
-
-	unsigned int N_inv = pow(N, prime - 2, prime);
-
-	for (size_t i = 0; i < N; ++i)
-	{
-		dest[i] = modmul(dest[i], N_inv, prime);
-	}
-}
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b)
-{
-	// p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
-	const size_t N = 16;
-	
-	// N * base^2 == 1600
-	// prime0 * prime1 must be > 1600
-	const unsigned int primes[] = {70254593, 81788929};
-
-	// primitive root (power N)
-	const unsigned int roots[] = {3183438, 22821826};
-
-	unsigned int convs[2][N];
-
-	for (int p = 0; p < 2; ++p)
-	{
-		unsigned int fft_a[N], fft_b[N], fft_c[N];
-		fft(fft_a, a, N, primes[p], roots[p]);
-		fft(fft_b, b, N, primes[p], roots[p]);
-
-		for (size_t i = 0; i < N; ++i)
-		{
-			fft_c[i] = modmul(fft_a[i], fft_b[i], primes[p]);
-		}
-
-		ift(convs[p], fft_c, N, primes[p], roots[p]);
-	}
-
-	unsigned __int64 prime = static_cast<unsigned __int64>(primes[0]) * primes[1];
-	unsigned int inv_p0_mod_p1 =  pow(primes[0], primes[1] - 2, primes[1]);
-
-	unsigned __int64 carry = 0;
-
-	for (int i = 0; i < N; ++i)
-	{	
-		// 64 x == 32 c0 mod 32 p0
-		// 64 x == 32 c1 mod 32 p1
-
-		// x == p0*t + c0
-		// p0*t + c0 == c1  mod p1
-		// t == (c1 - c0) div p0  mod p1
-
-		// t == t0 mod p1
-
-		// x == p0 * t0 + c0
-		// t0 == (c1 - c0) div p0  mod p1
-
-		unsigned int t0 = modmul(modsub(convs[1][i], convs[0][i], primes[1]), inv_p0_mod_p1, primes[1]);
-		unsigned __int64 x = static_cast<unsigned __int64>(primes[0]) * t0 + convs[0][i];
-
-		carry += x;
-
-		dest[i] = carry & 0xffff;
-		carry >>= 16;
-	}
-}
-
-void mul_basecase(limb_t* dest, const limb_t* a, const limb_t* b)
-{
-	typedef unsigned int limb2_t;
-
-	const size_t N = 16;
-
-	memset(dest, 0, sizeof(limb_t) * N);
-
-	limb_t* i = dest;
-			
-	for (const limb_t* li = a; li != a + N/2; ++li, ++i)
-	{
-		limb_t carry = 0;
-
-		limb_t* ci = i;
-
-		for (const limb_t* ri = b; ri != b + N/2; ++ri)
-		{
-			limb2_t result = static_cast<limb2_t>(*li) * *ri + *ci + carry;
-
-			*ci++ = static_cast<limb_t>(result & 0xffff);
-
-			carry = static_cast<limb_t>(result >> 16);
-		}
-
-		while (carry != 0)
-		{
-			limb2_t result = static_cast<limb2_t>(*ci) + carry;
-					
-			*ci++ = static_cast<limb_t>(result & 0xffff);
-
-			carry = static_cast<limb_t>(result >> 16);
-		}
-	}
-}
-
-int main()
-{
-//	limb_t a[16] = {65535, 65533, 65531, 65529, 65527, 65525, 65523, 65521};
-//	limb_t b[16] = {65535, 65534, 65533, 65532, 65531, 65530, 65529, 65528};
-	limb_t a[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-	limb_t b[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-
-	limb_t c[16] = {};
-
-	P("A = ", a, 8);
-	P("B = ", b, 8);
-	
-	int count = 10000;
-	LARGE_INTEGER start, end, freq;
-
-	QueryPerformanceFrequency(&freq);
-
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		mul_fft(c, a, b);
-	QueryPerformanceCounter(&end);
-	
-	P("Fast FT : C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-	
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		mul_basecase(c, a, b);
-	QueryPerformanceCounter(&end);
-	
-	P("Basecase: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-}
Deleted: sandbox/SOC/2007/bigint/fft/main_word_fast.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main_word_fast.cpp	2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,375 +0,0 @@
-#include <iostream>
-#include <windows.h>
-
-#define __int64 long long
-
-#include "primes.h"
-
-const unsigned int base = 65536;
-typedef unsigned short limb_t;
-
-template <typename T> void P(const char* text, const T* data, size_t size)
-{
-	std::cout << text;
-	for (size_t i = 0; i < size; ++i) std::cout << " " << std::dec << data[i] << std::dec;
-	std::cout << std::endl;
-}
-
-unsigned int modmul(unsigned int a, unsigned int b, unsigned int prime)
-{
-	return static_cast<unsigned int>(static_cast<unsigned __int64>(a) * b % prime);
-}
-
-unsigned int modadd(unsigned int a, unsigned int b, unsigned int prime)
-{
-	return (a + b) % prime;
-}
-
-unsigned int modsub(unsigned int a, unsigned int b, unsigned int prime)
-{
-	return (a + prime - b) % prime;
-}
-
-unsigned int log2(unsigned int value)
-{
-	unsigned int r = 0;
-
-	while (value > 1)
-	{
-		++r;
-		value >>= 1;
-	}
-
-	return r;
-}
-
-// reverse(x) -> reverse(x+1)
-size_t rev_next(size_t x, size_t n)
-{
-	do
-	{
-		n >>= 1;
-		x ^= n;
-	}
-	while ((x & n) == 0);
-
-	return x;
-}
-
-// swaps values with indices that have reversed bit representation (data[1100b] <-> data[0011b])
-template <typename T> void fft_copy_reorder(unsigned int* dest, const T* source, size_t size)
-{
-	if (size <= 2)
-	{
-		std::copy(source, source + size, dest);
-		return;
-	}
-
-	size_t r = 0;
-
-	dest[0] = source[0];
-
-	for (size_t x = 1; x < size; ++x)
-	{
-		r = rev_next(r, size);
-
-		dest[x] = source[r];
-	}
-}
-
-void fft_recursive(unsigned int* data, size_t size, unsigned int prime, const unsigned int* roots)
-{
-	size /= 2;
-
-	if (size > 1)
-	{
-		fft_recursive(data, size, prime, roots - 1);
-		fft_recursive(data + size, size, prime, roots - 1);
-	}
-
-	unsigned int root = *roots;
-	unsigned int r = 1;
-
-	for (size_t i = 0; i < size; ++i)
-	{
-		unsigned int a = data[i];
-		unsigned int b = modmul(data[i + size], r, prime);
-
-		data[i] = modadd(a, b, prime);
-		data[i + size] = modsub(a, b, prime);
-
-		r = modmul(r, root, prime);
-	}
-}
-
-void fft_iterative(unsigned int* data, size_t size, unsigned int prime, const unsigned int* roots)
-{
-	size_t step = 1;
-
-	unsigned int nstep = 0;
-
-	while (step < size)
-	{
-		unsigned int root = roots[++nstep];
-
-		size_t half_step = step;
-		step *= 2;
-
-		unsigned int r = 1;
-
-		for (size_t j = 0; j < half_step; ++j)
-		{
-			for (size_t i = j; i < size; i += step)
-			{
-				unsigned int a = data[i];
-				unsigned int b = modmul(data[i + half_step], r, prime);
-
-				data[i] = modadd(a, b, prime);
-				data[i + half_step] = modsub(a, b, prime);
-			}
-			
-			r = modmul(r, root, prime);
-		}
-	}
-}
-
-void dft(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table)
-{
-	fft_copy_reorder(dest, source, N);
-	fft_recursive(dest, N, prime, root_table + log2N);
-}
-
-void dft_i(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table)
-{
-	fft_copy_reorder(dest, source, N);
-	fft_iterative(dest, N, prime, root_table);
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
-{
-	fft_copy_reorder(dest, source, N);
-	fft_recursive(dest, N, prime, root_table + log2N);
-
-	for (size_t i = 0; i < N; ++i)
-	{
-		dest[i] = modmul(dest[i], inv_N, prime);
-	}
-}
-
-void ift_i(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
-{
-	fft_copy_reorder(dest, source, N);
-	fft_iterative(dest, N, prime, root_table);
-
-	for (size_t i = 0; i < N; ++i)
-	{
-		dest[i] = modmul(dest[i], inv_N, prime);
-	}
-}
-
-void fft_crt_carry(limb_t* dest, const unsigned int* conv0, const unsigned int* conv1, size_t N)
-{
-	unsigned __int64 prime = static_cast<unsigned __int64>(fft_primes[0]) * fft_primes[1];
-	unsigned int inv_p0_mod_p1 = fft_inv_p0_mod_p1;
-
-	unsigned __int64 carry = 0;
-
-	for (int i = 0; i < N; ++i)
-	{	
-		// 64 x == 32 c0 mod 32 p0
-		// 64 x == 32 c1 mod 32 p1
-
-		// x == p0*t + c0
-		// p0*t + c0 == c1  mod p1
-		// t == (c1 - c0) div p0  mod p1
-
-		// t == t0 mod p1
-
-		// x == p0 * t0 + c0
-		// t0 == (c1 - c0) div p0  mod p1
-
-		unsigned int t0 = modmul(modsub(conv1[i], conv0[i], fft_primes[1]), inv_p0_mod_p1, fft_primes[1]);
-		unsigned __int64 x = static_cast<unsigned __int64>(fft_primes[0]) * t0 + conv0[i];
-
-		carry += x;
-
-		dest[i] = carry & 0xffff;
-		carry >>= 16;
-	}
-}
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b,
-	void (*dft)(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table),
-	void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
-	)
-{
-	// p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
-	const size_t N = 16;
-
-	unsigned int log2N = log2(N);
-	
-	unsigned int convs[2][N];
-
-	for (int p = 0; p < 2; ++p)
-	{
-		const unsigned int* root_table = fft_primitive_roots[p];
-		unsigned int prime = fft_primes[p];
-
-		unsigned int fft_a[N], fft_b[N], fft_c[N];
-		dft(fft_a, a, N, log2N, prime, root_table);
-		dft(fft_b, b, N, log2N, prime, root_table);
-
-		for (size_t i = 0; i < N; ++i)
-		{
-			fft_c[i] = modmul(fft_a[i], fft_b[i], prime);
-		}
-		
-		const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
-		unsigned int inv_N = fft_inv_N[p][log2N];
-
-		ift(convs[p], fft_c, N, log2N, prime, inv_root_table, inv_N);
-	}
-
-	fft_crt_carry(dest, convs[0], convs[1], N);
-}
-
-void sqr_fft(limb_t* dest, const limb_t* a,
-			 void (*dft)(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table),
-			 void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
-			 )
-{
-	// p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
-	const size_t N = 16;
-
-	unsigned int log2N = log2(N);
-
-	unsigned int convs[2][N];
-
-	for (int p = 0; p < 2; ++p)
-	{
-		const unsigned int* root_table = fft_primitive_roots[p];
-		unsigned int prime = fft_primes[p];
-
-		unsigned int fft_a[N], fft_c[N];
-		dft(fft_a, a, N, log2N, prime, root_table);
-
-		for (size_t i = 0; i < N; ++i)
-		{
-			fft_c[i] = modmul(fft_a[i], fft_a[i], prime);
-		}
-
-		const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
-		unsigned int inv_N = fft_inv_N[p][log2N];
-
-		ift(convs[p], fft_c, N, log2N, prime, inv_root_table, inv_N);
-	}
-
-	fft_crt_carry(dest, convs[0], convs[1], N);
-}
-
-void mul_basecase(limb_t* dest, const limb_t* a, const limb_t* b)
-{
-	typedef unsigned int limb2_t;
-
-	const size_t N = 16;
-
-	memset(dest, 0, sizeof(limb_t) * N);
-
-	limb_t* i = dest;
-			
-	for (const limb_t* li = a; li != a + N/2; ++li, ++i)
-	{
-		limb_t carry = 0;
-
-		limb_t* ci = i;
-
-		for (const limb_t* ri = b; ri != b + N/2; ++ri)
-		{
-			limb2_t result = static_cast<limb2_t>(*li) * *ri + *ci + carry;
-
-			*ci++ = static_cast<limb_t>(result & 0xffff);
-
-			carry = static_cast<limb_t>(result >> 16);
-		}
-
-		while (carry != 0)
-		{
-			limb2_t result = static_cast<limb2_t>(*ci) + carry;
-					
-			*ci++ = static_cast<limb_t>(result & 0xffff);
-
-			carry = static_cast<limb_t>(result >> 16);
-		}
-	}
-}
-
-int main()
-{
-//	limb_t a[16] = {65535, 65533, 65531, 65529, 65527, 65525, 65523, 65521};
-//	limb_t b[16] = {65535, 65534, 65533, 65532, 65531, 65530, 65529, 65528};
-	limb_t a[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-	limb_t b[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-/*
-	limb_t a[256] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-
-	limb_t b[256] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-*/
-	limb_t c[16] = {};
-
-	P("A = ", a, 8);
-	P("B = ", b, 8);
-	
-	int count = 10000;
-	LARGE_INTEGER start, end, freq;
-
-	QueryPerformanceFrequency(&freq);
-
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		mul_fft(c, a, b, dft, ift);
-	QueryPerformanceCounter(&end);
-
-	P("Recursive: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		mul_fft(c, a, b, dft_i, ift_i);
-	QueryPerformanceCounter(&end);
-
-	P("Iterative: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		sqr_fft(c, a, dft, ift);
-	QueryPerformanceCounter(&end);
-	
-	P("Recursive SQR: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-	
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		sqr_fft(c, a, dft_i, ift_i);
-	QueryPerformanceCounter(&end);
-	
-	P("Iterative SQR: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		mul_basecase(c, a, b);
-	QueryPerformanceCounter(&end);
-	
-	P("Basecase : C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-}
Deleted: sandbox/SOC/2007/bigint/fft/main_word_weird.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main_word_weird.cpp	2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,382 +0,0 @@
-#include <iostream>
-#include <windows.h>
-
-#define __int64 long long
-
-#include "primes.h"
-
-const unsigned int base = 65536;
-typedef unsigned short limb_t;
-
-template <typename T> void P(const char* text, const T* data, size_t size)
-{
-	std::cout << text;
-	for (size_t i = 0; i < size; ++i) std::cout << " " << std::dec << data[i] << std::dec;
-	std::cout << std::endl;
-}
-
-static const double fft_primes_d[] = {1.0 / 70254593.0, 1.0 / 81788929.0};
-
-unsigned int modmul(unsigned int a, unsigned int b, unsigned int prime, double prime_d)
-{
-    unsigned int r = a * b;
-    r = r - prime * (unsigned int)(prime_d * (double) a * (double) b);
-
-    return (r >= prime ? r - prime : r);
-}
-
-unsigned int modadd(unsigned int a, unsigned int b, unsigned int prime)
-{
-	return (a + b) % prime;
-}
-
-unsigned int modsub(unsigned int a, unsigned int b, unsigned int prime)
-{
-	return (a + prime - b) % prime;
-}
-
-unsigned int log2(unsigned int value)
-{
-	unsigned int r = 0;
-
-	while (value > 1)
-	{
-		++r;
-		value >>= 1;
-	}
-
-	return r;
-}
-
-// reverse(x) -> reverse(x+1)
-size_t rev_next(size_t x, size_t n)
-{
-	do
-	{
-		n >>= 1;
-		x ^= n;
-	}
-	while ((x & n) == 0);
-
-	return x;
-}
-
-// swaps values with indices that have reversed bit representation (data[1100b] <-> data[0011b])
-template <typename T> void fft_copy_reorder(unsigned int* dest, const T* source, size_t size)
-{
-	if (size <= 2)
-	{
-		std::copy(source, source + size, dest);
-		return;
-	}
-
-	size_t r = 0;
-
-	dest[0] = source[0];
-
-	for (size_t x = 1; x < size; ++x)
-	{
-		r = rev_next(r, size);
-
-		dest[x] = source[r];
-	}
-}
-
-void fft_recursive(unsigned int* data, size_t size, unsigned int prime, double prime_d, const unsigned int* roots)
-{
-	size /= 2;
-
-	if (size > 1)
-	{
-		fft_recursive(data, size, prime, prime_d, roots - 1);
-		fft_recursive(data + size, size, prime, prime_d, roots - 1);
-	}
-
-	unsigned int root = *roots;
-	unsigned int r = 1;
-
-	for (size_t i = 0; i < size; ++i)
-	{
-		unsigned int a = data[i];
-		unsigned int b = modmul(data[i + size], r, prime, prime_d);
-
-		data[i] = modadd(a, b, prime);
-		data[i + size] = modsub(a, b, prime);
-
-		r = modmul(r, root, prime, prime_d);
-	}
-}
-
-void fft_iterative(unsigned int* data, size_t size, unsigned int prime, double prime_d, const unsigned int* roots)
-{
-	size_t step = 1;
-
-	unsigned int nstep = 0;
-
-	while (step < size)
-	{
-		unsigned int root = roots[++nstep];
-
-		size_t half_step = step;
-		step *= 2;
-
-		unsigned int r = 1;
-
-		for (size_t j = 0; j < half_step; ++j)
-		{
-			for (size_t i = j; i < size; i += step)
-			{
-				unsigned int a = data[i];
-				unsigned int b = modmul(data[i + half_step], r, prime, prime_d);
-
-				data[i] = modadd(a, b, prime);
-				data[i + half_step] = modsub(a, b, prime);
-			}
-			
-			r = modmul(r, root, prime, prime_d);
-		}
-	}
-}
-
-void dft(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table)
-{
-	fft_copy_reorder(dest, source, N);
-	fft_recursive(dest, N, prime, prime_d, root_table + log2N);
-}
-
-void dft_i(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table)
-{
-	fft_copy_reorder(dest, source, N);
-	fft_iterative(dest, N, prime, prime_d, root_table);
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table, unsigned int inv_N)
-{
-	fft_copy_reorder(dest, source, N);
-	fft_recursive(dest, N, prime, prime_d, root_table + log2N);
-
-	for (size_t i = 0; i < N; ++i)
-	{
-		dest[i] = modmul(dest[i], inv_N, prime, prime_d);
-	}
-}
-
-void ift_i(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table, unsigned int inv_N)
-{
-	fft_copy_reorder(dest, source, N);
-	fft_iterative(dest, N, prime, prime_d, root_table);
-
-	for (size_t i = 0; i < N; ++i)
-	{
-		dest[i] = modmul(dest[i], inv_N, prime, prime_d);
-	}
-}
-
-void fft_crt_carry(limb_t* dest, const unsigned int* conv0, const unsigned int* conv1, size_t N)
-{
-	unsigned __int64 prime = static_cast<unsigned __int64>(fft_primes[0]) * fft_primes[1];
-	unsigned int inv_p0_mod_p1 = fft_inv_p0_mod_p1;
-
-	unsigned __int64 carry = 0;
-
-	for (int i = 0; i < N; ++i)
-	{	
-		// 64 x == 32 c0 mod 32 p0
-		// 64 x == 32 c1 mod 32 p1
-
-		// x == p0*t + c0
-		// p0*t + c0 == c1  mod p1
-		// t == (c1 - c0) div p0  mod p1
-
-		// t == t0 mod p1
-
-		// x == p0 * t0 + c0
-		// t0 == (c1 - c0) div p0  mod p1
-
-		unsigned int t0 = modmul(modsub(conv1[i], conv0[i], fft_primes[1]), inv_p0_mod_p1, fft_primes[1], fft_primes_d[1]);
-		unsigned __int64 x = static_cast<unsigned __int64>(fft_primes[0]) * t0 + conv0[i];
-
-		carry += x;
-
-		dest[i] = carry & 0xffff;
-		carry >>= 16;
-	}
-}
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b,
-	void (*dft)(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table),
-	void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table, unsigned int inv_N)
-	)
-{
-	// p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
-	const size_t N = 16;
-
-	unsigned int log2N = log2(N);
-	
-	unsigned int convs[2][N];
-
-	for (int p = 0; p < 2; ++p)
-	{
-		const unsigned int* root_table = fft_primitive_roots[p];
-		unsigned int prime = fft_primes[p];
-		double prime_d = fft_primes_d[p];
-
-		unsigned int fft_a[N], fft_b[N], fft_c[N];
-		dft(fft_a, a, N, log2N, prime, prime_d, root_table);
-		dft(fft_b, b, N, log2N, prime, prime_d, root_table);
-
-		for (size_t i = 0; i < N; ++i)
-		{
-			fft_c[i] = modmul(fft_a[i], fft_b[i], prime, prime_d);
-		}
-		
-		const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
-		unsigned int inv_N = fft_inv_N[p][log2N];
-
-		ift(convs[p], fft_c, N, log2N, prime, prime_d, inv_root_table, inv_N);
-	}
-
-	fft_crt_carry(dest, convs[0], convs[1], N);
-}
-
-void sqr_fft(limb_t* dest, const limb_t* a,
-			 void (*dft)(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table),
-			 void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table, unsigned int inv_N)
-			 )
-{
-	// p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
-	const size_t N = 16;
-
-	unsigned int log2N = log2(N);
-
-	unsigned int convs[2][N];
-
-	for (int p = 0; p < 2; ++p)
-	{
-		const unsigned int* root_table = fft_primitive_roots[p];
-		unsigned int prime = fft_primes[p];
-		double prime_d = fft_primes_d[p];
-
-		unsigned int fft_a[N], fft_c[N];
-		dft(fft_a, a, N, log2N, prime, prime_d, root_table);
-
-		for (size_t i = 0; i < N; ++i)
-		{
-			fft_c[i] = modmul(fft_a[i], fft_a[i], prime, prime_d);
-		}
-
-		const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
-		unsigned int inv_N = fft_inv_N[p][log2N];
-
-		ift(convs[p], fft_c, N, log2N, prime, prime_d, inv_root_table, inv_N);
-	}
-
-	fft_crt_carry(dest, convs[0], convs[1], N);
-}
-
-void mul_basecase(limb_t* dest, const limb_t* a, const limb_t* b)
-{
-	typedef unsigned int limb2_t;
-
-	const size_t N = 16;
-
-	memset(dest, 0, sizeof(limb_t) * N);
-
-	limb_t* i = dest;
-			
-	for (const limb_t* li = a; li != a + N/2; ++li, ++i)
-	{
-		limb_t carry = 0;
-
-		limb_t* ci = i;
-
-		for (const limb_t* ri = b; ri != b + N/2; ++ri)
-		{
-			limb2_t result = static_cast<limb2_t>(*li) * *ri + *ci + carry;
-
-			*ci++ = static_cast<limb_t>(result & 0xffff);
-
-			carry = static_cast<limb_t>(result >> 16);
-		}
-
-		while (carry != 0)
-		{
-			limb2_t result = static_cast<limb2_t>(*ci) + carry;
-					
-			*ci++ = static_cast<limb_t>(result & 0xffff);
-
-			carry = static_cast<limb_t>(result >> 16);
-		}
-	}
-}
-
-int main()
-{
-//	limb_t a[16] = {65535, 65533, 65531, 65529, 65527, 65525, 65523, 65521};
-//	limb_t b[16] = {65535, 65534, 65533, 65532, 65531, 65530, 65529, 65528};
-	limb_t a[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-	limb_t b[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-/*
-	limb_t a[256] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-
-	limb_t b[256] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
-					65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-*/
-	limb_t c[16] = {};
-
-	P("A = ", a, 8);
-	P("B = ", b, 8);
-	
-	int count = 100000;
-	LARGE_INTEGER start, end, freq;
-
-	QueryPerformanceFrequency(&freq);
-
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		mul_fft(c, a, b, dft, ift);
-	QueryPerformanceCounter(&end);
-
-	P("Recursive: C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		mul_fft(c, a, b, dft_i, ift_i);
-	QueryPerformanceCounter(&end);
-
-	P("Iterative: C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		sqr_fft(c, a, dft, ift);
-	QueryPerformanceCounter(&end);
-	
-	P("Recursive SQR: C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-	
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		sqr_fft(c, a, dft_i, ift_i);
-	QueryPerformanceCounter(&end);
-	
-	P("Iterative SQR: C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
-	QueryPerformanceCounter(&start);
-	for (int i = 0; i < count; ++i)
-		mul_basecase(c, a, b);
-	QueryPerformanceCounter(&end);
-	
-	P("Basecase : C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-}
Added: sandbox/SOC/2007/bigint/fft/primes.h
==============================================================================
--- (empty file)
+++ sandbox/SOC/2007/bigint/fft/primes.h	2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
@@ -0,0 +1,27 @@
+static const unsigned int fft_primes[] = {70254593, 81788929};
+
+// 70254593 = 67 * 2^20 + 1
+// 81788929 = 39 * 2^21 + 1
+// fft_max_N * base^2 should be less than 70254593 * 81788929 = 5746047918800897 (base = 65536)
+// fft_max_N should be greater or equal than 2^20 = 1048576 due to primes structure
+const unsigned int fft_max_N = 1048576;
+
+static const unsigned int fft_primitive_roots[2][21] =
+{
+	{1, 70254592, 50348549, 19383190, 12538576, 54567172, 35010243, 946420, 69401582, 12802663, 39325177, 61326249, 20368503, 32379324, 58783494, 58098554, 49169846, 5764801, 2401, 49, 7},
+	{1, 81788928, 23980934, 1883838, 32049591, 79935288, 22685020, 73127609, 43778833, 17637033, 63153960, 14279076, 36624597, 52261024, 54282709, 36693892, 78305599, 42953381, 41608963, 418609, 647}
+};
+
+static const unsigned int fft_inv_primitive_roots[2][21] =
+{
+	{1, 70254592, 19906044, 52459594, 3183438, 53518501, 39073710, 19737011, 63685585, 64302287, 30531912, 69788710, 18847954, 64069532, 36127269, 61117601, 3300726, 63813627, 61505687, 63085757, 20072741},
+	{1, 81788928, 57807995, 1977387, 47596280, 37757181, 33726541, 75044061, 38800727, 16012482, 79766222, 64674754, 434079, 26744811, 67309642, 61891499, 67784436, 73579077, 56837587, 80643596, 76858839}
+};
+
+static const unsigned int fft_inv_N[2][21] =
+{
+	{1, 35127297, 52690945, 61472769, 65863681, 68059137, 69156865, 69705729, 69980161, 70117377, 70185985, 70220289, 70237441, 70246017, 70250305, 70252449, 70253521, 70254057, 70254325, 70254459, 70254526},
+	{1, 40894465, 61341697, 71565313, 76677121, 79233025, 80510977, 81149953, 81469441, 81629185, 81709057, 81748993, 81768961, 81778945, 81783937, 81786433, 81787681, 81788305, 81788617, 81788773, 81788851}
+};
+
+const unsigned int fft_inv_p0_mod_p1 = 37176793;
Modified: sandbox/SOC/2007/bigint/fft/roots.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/roots.cpp	(original)
+++ sandbox/SOC/2007/bigint/fft/roots.cpp	2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
@@ -75,11 +75,40 @@
                 powers[i] = power;
         }
 
-	std::cout << std::endl;
+	// convolution's element is N * base^2 at max, where N is size of each vector (provided we do N * N -> 2N multiplication)
+	unsigned __int64 modulus = static_cast<unsigned __int64>(primes[0]) * primes[1];
+
+	modulus /= base;
+	modulus /= base;
+
+	// now modulus is upper bound for N; round it down to 2^k
+
+	unsigned int maxN = 1;
+	while (maxN * 2 <= modulus) maxN *= 2;
+
+	std::cout << "// fft_max_N * base^2 should be less than " << primes[0] << " * " << primes[1] << " = " << static_cast<unsigned __int64>(primes[0]) * primes[1];
+	std::cout << " (base = " << base << ")\n";
 
         unsigned int power_base = std::min(powers[0], powers[1]);
         unsigned int power = 1 << power_base;
 
+	std::cout << "// fft_max_N should be greater or equal than 2^" << power_base << " = " << power << " due to primes structure" << std::endl;
+
+	if (power > maxN)
+	{
+		while (power > maxN)
+		{
+			--power_base;
+			power /= 2;
+		}
+	}
+	else
+	{
+		maxN = power;
+	}
+
+	std::cout << "const unsigned int fft_max_N = " << maxN << ";\n\n";
+
         unsigned int roots[2];
 
         // find primitive roots (for power)
Deleted: sandbox/SOC/2007/bigint/fft/sieve.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/sieve.cpp	2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,42 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <algorithm>
-
-int main()
-{
-	const size_t size = 100000000;
-
-	bool* sieve = new bool[size];
-
-	std::fill(sieve, sieve + size, true);
-
-	sieve[0] = false;
-	sieve[1] = false;
-
-	for (size_t i = 2; i < size; ++i)
-	{
-		if (i % (size / 10000) == 0) {
-			std::cout << (float)(i / (size / 10000)) / 100 << " %" << std::endl;
-		}
-
-		if (sieve[i])
-		{
-			for (size_t k = i + i; k < size; k += i)
-			{
-				sieve[k] = false;
-			}
-		}	
-	}
-
-	std::ofstream out("primes.txt");
-
-	for (size_t i = 0; i < size; ++i)
-	{
-		if (sieve[i] && i % 1048576 == 1)
-		{
-			out << i << " ";
-		}
-	}
-
-	delete[] sieve;
-}