asigalov61 committed on
Commit
b472454
·
verified ·
1 Parent(s): 58c20b4

Upload TCUPY.py

Browse files
Files changed (1) hide show
  1. TCUPY.py +110 -1
TCUPY.py CHANGED
@@ -35,7 +35,7 @@ r'''############################################################################
35
  #
36
  # Critical dependencies
37
  #
38
- # !pip install cupy-cuda12x
39
  # !pip install numpy==1.26.4
40
  #
41
  ################################################################################
@@ -1233,6 +1233,115 @@ def find_matches_fast(src_array, trg_array, seed: int = 0) -> int:
1233
 
1234
  ###################################################################################
1235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1236
  print('Module is loaded!')
1237
  print('Enjoy! :)')
1238
  print('=' * 70)
 
35
  #
36
  # Critical dependencies
37
  #
38
+ # !pip install cupy-cuda13x
39
  # !pip install numpy==1.26.4
40
  #
41
  ################################################################################
 
1233
 
1234
  ###################################################################################
1235
 
1236
def find_repeating_non_overlapping_patterns(arr, min_len):
    """
    Find repeating, non-overlapping patterns of length ``min_len`` or longer.

    Window lengths are scanned from ``len(arr) // 2`` down to ``min_len``.
    For each length, candidate windows are bucketed on the GPU (CuPy) by a
    64-bit polynomial rolling hash, then verified on the CPU by exact byte
    comparison, so hash collisions can never produce a false match.

    Occurrences of each pattern are picked greedily from left to right so
    that they do not overlap one another. Positions covered by an accepted
    pattern are marked as consumed, and later candidates whose start
    position is consumed are skipped.

    Args:
        arr: One-dimensional sequence of integers (converted to int64).
        min_len: Minimum pattern length. Values below 1 are treated as 1,
            because a zero-length pattern is meaningless.

    Returns:
        dict mapping each pattern (a tuple of Python ints) to the number of
        non-overlapping occurrences selected for it (always >= 2).
        Empty when ``arr`` is shorter than ``2 * min_len``.
    """
    n = len(arr)

    # A non-positive min_len would reach L == 0, whose candidate start
    # index n is outside the consumed mask; clamp instead.
    min_len = max(min_len, 1)
    if n < min_len * 2:
        return {}

    arr_cpu = np.asarray(arr, dtype=np.int64)
    max_len = n // 2

    consumed = np.zeros(n, dtype=bool)
    result = {}

    # Polynomial hash modulo 2**64. Built once on the host with exact Python
    # integers and an explicit mask, then moved to the device in a single
    # transfer (a per-element device loop costs one tiny GPU op per item).
    base = 1000000007
    mask64 = (1 << 64) - 1

    prefix = [0] * (n + 1)
    for i, value in enumerate(arr_cpu.tolist()):
        prefix[i + 1] = (prefix[i] * base + value) & mask64

    powers = [1] * (max_len + 1)
    for i in range(1, max_len + 1):
        powers[i] = (powers[i - 1] * base) & mask64

    # Reinterpret the unsigned residues as int64 bit patterns for the GPU.
    pref = cp.asarray(np.array(prefix, dtype=np.uint64).view(np.int64))
    powers_gpu = cp.asarray(np.array(powers, dtype=np.uint64).view(np.int64))

    pad = cp.zeros(1, dtype=cp.bool_)

    for L in range(max_len, min_len - 1, -1):
        # L <= n // 2, so there is always at least one window.
        n_hashes = n - L + 1

        # 1. Hash of every length-L window from the prefix table:
        #    hash(i) = pref[i + L] - pref[i] * base**L   (mod 2**64)
        raw = pref[L:L + n_hashes] - (pref[:n_hashes] * powers_gpu[L])

        # Fold the upper 32 bits into the lower ones. The result may be
        # negative; only equality of hashes matters below.
        hash_l = raw ^ (raw >> 32)

        # 2. Bring equal hashes together with a global sort
        sort_idx = cp.argsort(hash_l)
        sorted_hash = hash_l[sort_idx]

        # A window is a candidate if its hash equals a sorted neighbour's,
        # i.e. the hash occurs two or more times.
        same = sorted_hash[1:] == sorted_hash[:-1]
        in_run = cp.concatenate([pad, same]) | cp.concatenate([same, pad])

        # 3. Move all candidate start indices to the CPU in one transfer
        candidates = cp.asnumpy(sort_idx[in_run])
        if candidates.size == 0:
            continue

        # 4. Group by EXACT window content so collisions are harmless
        groups = {}
        for idx in candidates.tolist():
            key = arr_cpu[idx:idx + L].tobytes()
            groups.setdefault(key, []).append(idx)

        # 5. Greedy left-to-right interval selection per pattern
        for indices in groups.values():
            if len(indices) < 2:
                continue

            # The greedy scan needs ascending starts; do not depend on the
            # order in which the sort returned equal hashes.
            indices.sort()

            # NOTE(review): only the start position is tested against the
            # consumed mask, so a window may extend into a region claimed
            # by a longer pattern - confirm this is the intended policy.
            selected = []
            last_end = -1
            for i in indices:
                if i >= last_end and not consumed[i]:
                    selected.append(i)
                    last_end = i + L

            if len(selected) >= 2:
                first = selected[0]
                # tolist() yields native Python ints for the tuple key
                result[tuple(arr_cpu[first:first + L].tolist())] = len(selected)

                for i in selected:
                    consumed[i:i + L] = True

    return result
1342
+
1343
+ ###################################################################################
1344
+
1345
  print('Module is loaded!')
1346
  print('Enjoy! :)')
1347
  print('=' * 70)