Merge pull request #44 from okuhara/nmakefile

Add NMakefile for Windows self build and fix incompatibilities
abulmo · Dec 13, 2024 · aa75ba9 · aa75ba9
2 parents de4b1e0 + 84680d9
commit aa75ba9
Show file tree

Hide file tree

Showing 16 changed files with 171 additions and 50 deletions.
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -14,17 +14,15 @@ jobs:
     strategy:
       fail-fast: false # We want results from all OSes even if one fails.
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
+        os: [ubuntu-latest, macos-latest]
         include:
           - os: ubuntu-latest
             build_command: make build ARCH=x86-64-v3 COMP=gcc OS=linux
-          - os: windows-latest
-            build_command: make build ARCH=x86-64-v3 COMP=gcc OS=windows
           - os: macos-latest
             build_command: make build ARCH=armv8.5-a COMP=gcc OS=osx
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: build
       run: |-
@@ -36,3 +34,23 @@ jobs:
       with:
         name: artifact_${{ runner.os }}
         path: bin
+
+  windows-build:
+    runs-on: windows-latest
+    timeout-minutes: 10
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: build
+      run: |-
+        if not exist bin mkdir bin
+        cd src
+        call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+        nmake -fnmakefile vc-x64
+      shell: cmd  # the run script above is cmd.exe batch syntax, not POSIX sh (no "mkdir -p")
+
+    - uses: actions/upload-artifact@v4
+      with:
+        name: artifact_${{ runner.os }}
+        path: bin
diff --git a/src/NMakefile b/src/NMakefile
@@ -0,0 +1,70 @@
+#
+# makefile
+#
+# Compilation options for Microsoft Visual C++ for Windows & nmake.
+#
+#  x64-v4       x64 with sse2, avx, sse4.2 & popcount & avx2 & avx512 support
+#  x64-v3       x64 with sse2, avx, sse4.2 & popcount & avx2 support
+#  x64-v2       x64 with sse2, avx, sse4.2 & popcount support
+#  x64          x64 with sse2 support
+#  a64          ARM v8
+
+VC_FLAGS = /std:c17 /DUNICODE /utf-8 /D_CRT_SECURE_NO_DEPRECATE /I"..\include" /O2 /fp:fast /GS- /D NDEBUG /MT
+
+vc-x64-v4:
+# remove /vlen=256 for cl earlier than 14.42
+	cl $(VC_FLAGS) /arch:AVX512 /experimental:c11atomics /GL /vlen=256 /D__POPCNT__ /D__CRC32__ all.c ws2_32.lib /Fe..\bin\wEdax-x64-v4.exe /link /VERSION:4.6
+
+vc-x64-v3:
+	cl $(VC_FLAGS) /arch:AVX2 /experimental:c11atomics /GL /D__POPCNT__ /D__CRC32__ all.c ws2_32.lib /Fe..\bin\wEdax-x64-v3.exe /link /VERSION:4.6
+
+vc-x64-v2:
+	cl $(VC_FLAGS) /experimental:c11atomics /GL /D__SSE2__ /D__POPCNT__ /D__CRC32__ all.c ws2_32.lib /Fe..\bin\wEdax-x64-v2.exe /link /VERSION:4.6
+
+vc-x64:
+	cl $(VC_FLAGS) /experimental:c11atomics /GL /D__SSE2__ all.c ws2_32.lib /Fe..\bin\wEdax-x64.exe /link /VERSION:4.6
+
+vc-a64:
+#	vcvarsamd64_arm64.bat
+	cl $(VC_FLAGS) /experimental:c11atomics /GL /D__ARM_NEON all.c ws2_32.lib /Fe..\bin\wEdax-a64.exe /link /VERSION:4.6
+
+clang-x64-v4:
+	clang-cl -mprefer-vector-width=256 $(VC_FLAGS) /U__STDC_NO_THREADS__ /arch:AVX512 all.c ws2_32.lib /Fe..\bin\wEdax-x64-v4.exe /link /VERSION:4.6
+
+clang-x64-v3:
+	clang-cl $(VC_FLAGS) /U__STDC_NO_THREADS__ /arch:AVX2 all.c ws2_32.lib /Fe..\bin\wEdax-x64-v3.exe /link /VERSION:4.6
+
+clang-x64-v2:
+	clang-cl -march=x86-64-v2 $(VC_FLAGS) /U__STDC_NO_THREADS__ all.c ws2_32.lib /Fe..\bin\wEdax-x64-v2.exe /link /VERSION:4.6
+
+clang-x64:
+	clang-cl $(VC_FLAGS) /U__STDC_NO_THREADS__ all.c ws2_32.lib /Fe..\bin\wEdax-x64.exe /link /VERSION:4.6
+
+clang-a64:
+	clang-cl --target=aarch64-win32-msvc $(VC_FLAGS) /U__STDC_NO_THREADS__ /D__ARM_NEON all.c ws2_32.lib /Fe..\bin\wEdax-a64.exe /link /VERSION:4.6
+
+vc-pgo-x64-v3:
+	set VCPROFILE_PATH=..\src
+	cl $(VC_FLAGS) /arch:AVX2 /experimental:c11atomics /GL /D__POPCNT__ /D__CRC32__ all.c ws2_32.lib /Fe..\bin\wEdax-x64-v3.exe /link /ltcg:pgi /VERSION:4.6
+	cd ..\bin
+	wEdax-x64-v3 -l 60 -solve ..\problem\fforum-20-39.obf
+	wEdax-x64-v3 -l 18 -auto-store on -auto-start on -repeat 2 -auto-quit on -mode 2 -book-file book.pgo
+	del book.pgo book.pgo.store
+	cd ..\src
+	link all.obj ws2_32.lib /out:..\bin\wEdax-x64-v3.exe /ltcg:pgo /VERSION:4.6
+	del *.pgc ..\bin\*.pgd
+
+vc-pgo-x64:	# profile-guided build of wEdax-x64.exe; compile flags mirror the vc-x64 target
+	set VCPROFILE_PATH=..\src
+	cl $(VC_FLAGS) /experimental:c11atomics /GL /D__SSE2__ all.c ws2_32.lib /Fe..\bin\wEdax-x64.exe /link /ltcg:pgi /VERSION:4.6
+	cd ..\bin
+	wEdax-x64 -l 60 -solve ..\problem\fforum-20-39.obf
+	wEdax-x64 -l 18 -auto-store on -auto-start on -repeat 2 -auto-quit on -mode 2 -book-file book.pgo
+	del book.pgo book.pgo.store
+	cd ..\src
+	link all.obj ws2_32.lib /out:..\bin\wEdax-x64.exe /ltcg:pgo /VERSION:4.6
+	del *.pgc ..\bin\*.pgd
+
+clean:	# delete generated files in the current directory matching the patterns below
+	del /f pgopti* *.dyn all.gc* *~ *.p* *.obj
+
diff --git a/src/bit.c b/src/bit.c
@@ -82,7 +82,7 @@ int bit_count_64(const uint64_t b)
 
 		return stdc_count_ones_ul(b);      // C23 version
 
-	#elif defined(_MSC_VER)
+	#elif defined(_MSC_VER) && defined(__POPCNT__)
 
 		return __popcnt64(b);           // Microsoft Visual C/C++ version
 
@@ -120,7 +120,7 @@ int bit_count_32(const uint32_t b)
 
 		return stdc_count_ones_ui(b);      // C23 version
 
-	#elif defined(_MSC_VER)
+	#elif defined(_MSC_VER) && defined(__POPCNT__)
 
 		return __popcnt(b);           // Microsoft Visual C/C++ version
 
@@ -150,14 +150,21 @@ int bit_leading_zeros_64(uint64_t b)
 
 		return stdc_leading_zeros_ul(b);      // C23 version
 
+	#elif defined(_MSC_VER) && defined(__AVX2__)
+
+		return __lzcnt64(b);           // Microsoft Visual C/C++ BMI1 version
+
 	#elif defined(_MSC_VER)
 
-		return __lzcnt64(b);           // Microsoft Visual C/C++ version
+		unsigned long index;
+		if (_BitScanReverse64(&index, b))
+			return 63 - (int) index;
+		return 64;
 
 	#elif defined(__GNUC__)
 
-//		return b ? __builtin_clzl(b) : 64; // GNUC/CLANG version
-		return __builtin_clzl(b); // GNUC/CLANG version
+//		return b ? __builtin_clzll(b) : 64; // GNUC/CLANG version
+		return __builtin_clzll(b); // GNUC/CLANG version
 
 	#else
 
@@ -170,7 +177,7 @@ int bit_leading_zeros_64(uint64_t b)
 	c = b >>  4; if (c != 0) { n = n - 4; b = c; }
 	c = b >>  2; if (c != 0) { n = n - 2; b = c; }
 	c = b >>  1; if (c != 0) return n - 2;
-	return n - x;
+	return n - b;
 
 
 	#endif
@@ -188,9 +195,16 @@ int bit_leading_zeros_32(uint32_t b)
 
 		return stdc_leading_zeros_ui(b);      // C23 version
 
+	#elif defined(_MSC_VER) && defined(__AVX2__)
+
+		return __lzcnt(b);           // Microsoft Visual C/C++ BMI1 version
+
 	#elif defined(_MSC_VER)
 
-		return __lzcnt(b);           // Microsoft Visual C/C++ version
+		unsigned long index;
+		if (_BitScanReverse(&index, b))
+			return 31 - (int) index;
+		return 32;
 
 	#elif defined(__GNUC__)
 
@@ -207,7 +221,7 @@ int bit_leading_zeros_32(uint32_t b)
 	c = b >>  4; if (c != 0) { n = n - 4; b = c; }
 	c = b >>  2; if (c != 0) { n = n - 2; b = c; }
 	c = b >>  1; if (c != 0) return n - 2;
-	return n - x;
+	return n - b;
 
 
 	#endif

diff --git a/src/bit.h b/src/bit.h
@@ -15,6 +15,10 @@
 #include <stdio.h>
 #include <stdint.h>
 
+#ifndef __has_builtin  // Compatibility with non-clang compilers.
+	#define __has_builtin(x) 0
+#endif
+
 struct Random;
 
 /* declaration */

diff --git a/src/crc32c.c b/src/crc32c.c
@@ -11,16 +11,16 @@
 #include "settings.h"
 
 #if !USE_CRC32C || (!defined(__CRC32__) && !defined(__ARM_FEATURE_CRC32))
-	#define SOFT_CRC32C true
+	#define SOFT_CRC32C 1	/* true */
 #else 
-	#define SOFT_CRC32C false
+	#define SOFT_CRC32C 0	/* false */
 #endif
 
 #include <assert.h>
 #ifdef __ARM_FEATURE_CRC32
 	#include <arm_acle.h>
 #elif defined(__CRC32__)
-	#ifdef __MSC_VER
+	#ifdef _MSC_VER
 		#include <intrin.h>
 	#else
 		#include <x86intrin.h>

diff --git a/src/flip_bitscan.c b/src/flip_bitscan.c
@@ -165,9 +165,9 @@ static const uint64_t FLIPPED_5_V[18] = {
  */
 #if __has_builtin(__builtin_subcll)
 static inline uint64_t OutflankToFlipmask(uint64_t outflank) {
-	uint64_t flipmask, cy;
-	flipmask = __builtin_subcl(outflank, 1, 0, &cy);
-	return __builtin_addcl(flipmask, 0, cy, &cy);
+	unsigned long long flipmask, cy;
+	flipmask = __builtin_subcll(outflank, 1, 0, &cy);
+	return __builtin_addcll(flipmask, 0, cy, &cy);
 }
 #elif (defined(_M_X64) && (_MSC_VER >= 1800)) || (defined(__x86_64__) && defined(__GNUC__) && (__GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)))
 static inline uint64_t OutflankToFlipmask(uint64_t outflank) {

diff --git a/src/flip_carry_64.c b/src/flip_carry_64.c
@@ -374,11 +374,11 @@ static const uint64_t  FLIPPED_5_U[137] = {
 /*
  * Set all bits below the sole outflank bit if outfrank != 0
  */
-#if __has_builtin(__builtin_subcl)
+#if __has_builtin(__builtin_subcll)
 static inline uint64_t OutflankToFlipmask(uint64_t outflank) {
-	uint64_t flipmask, cy;
-	flipmask = __builtin_subcl(outflank, 1, 0, &cy);
-	return __builtin_addcl(flipmask, 0, cy, &cy);
+	unsigned long long flipmask, cy;
+	flipmask = __builtin_subcll(outflank, 1, 0, &cy);
+	return __builtin_addcll(flipmask, 0, cy, &cy);
 }
 #elif (defined(_M_X64) && (_MSC_VER >= 1800)) || (defined(__x86_64__) && defined(__GNUC__) && (__GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)))
 static inline uint64_t OutflankToFlipmask(uint64_t outflank) {

diff --git a/src/flip_neon_bitscan.c b/src/flip_neon_bitscan.c
@@ -85,9 +85,9 @@ static const uint64_t FLIPPED_4_H[19] = {	// ...cbahg
  */
 #if __has_builtin(__builtin_subcll)
 static inline uint64_t OutflankToFlipmask(uint64_t outflank) {
-	uint64_t flipmask, cy;
-	flipmask = __builtin_subcl(outflank, 1, 0, &cy);
-	return __builtin_addcl(flipmask, 0, cy, &cy);
+	unsigned long long flipmask, cy;
+	flipmask = __builtin_subcll(outflank, 1, 0, &cy);
+	return __builtin_addcll(flipmask, 0, cy, &cy);
 }
 #else
 	#define OutflankToFlipmask(outflank)	((outflank) - (uint32_t) ((outflank) != 0))

diff --git a/src/flip_sve_lzcnt.c b/src/flip_sve_lzcnt.c
@@ -84,6 +84,10 @@ uint64_t flip(const int pos, const uint64_t P, const uint64_t O)
 
 uint64_t board_flip(const Board *board, const int x)
 {
-	return flip(x, P, O);
+	return flip(x, board->player, board->opponent);
 }
 
+uint64x2_t mm_flip(uint64x2_t OP, int pos)
+{
+	return vdupq_n_u64(flip(pos, vgetq_lane_u64(OP, 0), vgetq_lane_u64(OP, 1)));	// lanes 0 and 1 of OP are passed as flip()'s P and O arguments; result is broadcast to both lanes
+}
diff --git a/src/game.h b/src/game.h
@@ -23,12 +23,12 @@ struct Random;
 typedef struct Game {
 	Board initial_board;
 	struct {
-		uint16_t year;
-		uint8_t month;
-		uint8_t day;
-		uint8_t hour;
-		uint8_t minute;
-		uint8_t second;
+		int16_t year;
+		int8_t month;
+		int8_t day;
+		int8_t hour;
+		int8_t minute;
+		int8_t second;
 	} date;
 	char name[2][32];
 	uint8_t move[60];

diff --git a/src/hash.c b/src/hash.c
@@ -257,18 +257,28 @@ static void data_new(HashData *data, const HashStore *store)
 }
 
 /**
- * @brief Initialize a new hash table item.
+ * @brief Prefetch the hash entry.
  *
- * This implementation tries to be robust against concurrency. Data are first
- * set up in a local thread-safe structure, before being copied into the
- * hashtable entry. Then the hashcode of the entry is xored with the thread
- * safe structure ; so that any corrupted entry won't be readable.
+ * The hash entry may not be in the CPU cache and take long to read, so
+ * prefetch it as soon as the hash code is available.
+ *
+ * @param hashtable Hash table to fetch from.
+ * @param hashcode Hash code.
 */
 void hash_prefetch(HashTable *hashtable, const uint64_t hashcode) {
-	#if defined(__GNUC__)
 	Hash *hash = hashtable->hash + (hashcode & hashtable->hash_mask);
-	__builtin_prefetch(hash);
-	__builtin_prefetch(hash + HASH_N_WAY - 1);
+	#if defined(__GNUC__)
+		__builtin_prefetch(hash);
+		__builtin_prefetch(hash + HASH_N_WAY - 1);
+	#elif defined(__SSE2__)
+		_mm_prefetch((char const *) hash, _MM_HINT_T0);
+		_mm_prefetch((char const *)(hash + HASH_N_WAY - 1), _MM_HINT_T0);
+	#elif defined(__ARM_ACLE)
+		__pld(hash);
+		__pld(hash + HASH_N_WAY - 1);
+	#elif defined(_M_ARM64)
+		__prefetch(hash);
+		__prefetch(hash + HASH_N_WAY - 1);
 	#endif
 }
 

diff --git a/src/search.c b/src/search.c
@@ -105,7 +105,7 @@ const Selectivity selectivity_table [] = {
 
 /** threshold values to try stability cutoff during NWS search */
 // TODO: better values may exist.
-const uint8_t NWS_STABILITY_THRESHOLD[] = { // 99 = unused value...
+const int8_t NWS_STABILITY_THRESHOLD[] = { // 99 = unused value...
 	 99, 99, 99, 99,  6,  8, 10, 12,
 #if USE_SOLID
 	  8, 10, 20, 22, 24, 26, 28, 30,
@@ -122,7 +122,7 @@ const uint8_t NWS_STABILITY_THRESHOLD[] = { // 99 = unused value...
 
 /** threshold values to try stability cutoff during PVS search */
 // TODO: better values may exist.
-const uint8_t PVS_STABILITY_THRESHOLD[] = { // 99 = unused value...
+const int8_t PVS_STABILITY_THRESHOLD[] = { // 99 = unused value...
 	 99, 99, 99, 99, -2,  0,  2,  4,
 	  6,  8, 12, 14, 16, 18, 20, 22,
 	 24, 26, 28, 30, 32, 34, 36, 38,

diff --git a/src/search.h b/src/search.h
@@ -131,8 +131,8 @@ struct Node;
 extern const uint8_t QUADRANT_ID[];
 extern const Selectivity selectivity_table[];
 extern const int NO_SELECTIVITY;
-extern const uint8_t NWS_STABILITY_THRESHOLD[];
-extern const uint8_t PVS_STABILITY_THRESHOLD[];
+extern const int8_t NWS_STABILITY_THRESHOLD[];
+extern const int8_t PVS_STABILITY_THRESHOLD[];
 extern const uint8_t SQUARE_TYPE[];
 
 /* function definition */

diff --git a/src/settings.h b/src/settings.h
@@ -73,7 +73,7 @@
 		#define MOVE_GENERATOR MOVE_GENERATOR_AVX512CD
 	#elif defined __AVX2__
 		#define MOVE_GENERATOR MOVE_GENERATOR_AVX_ACEPCK
-	#elif defined __SSE__
+	#elif defined __SSE2__
 		#define MOVE_GENERATOR MOVE_GENERATOR_CARRY_64
 	#elif defined __ARM_NEON
 		#define MOVE_GENERATOR MOVE_GENERATOR_NEON_BITSCAN