原标题:Fast counting matches between large number of integer arrays

我想知道,是否有高效的算法来统计大量整数数组之间相互匹配的整数个数。下面是我的 Cython 实现:


cimport cython
from libc.stdlib cimport calloc, free

import numpy as np
cimport numpy as np


cdef void count_matches(int[:, ::1] target_arrays, int[::1] ref_array, int[::1] num_matches):
    """Count, per row of ``target_arrays``, how many entries occur in ``ref_array``.

    A flat lookup table indexed by value is built from ``ref_array``; every
    element of every target row is then tested against it.

    Parameters:
        target_arrays: 2-D C-contiguous int buffer, one candidate array per row.
        ref_array: 1-D C-contiguous int buffer of reference values.  It does
            not have to be sorted; negative values are ignored because they
            cannot index the table.
        num_matches: 1-D output buffer; element ``i`` receives the match count
            of row ``i``.  Must hold at least ``target_arrays.shape[0]`` items.
    """
    cdef:
        Py_ssize_t i, j
        Py_ssize_t n = target_arrays.shape[0]
        Py_ssize_t c = target_arrays.shape[1]
        Py_ssize_t nf = ref_array.shape[0]
        Py_ssize_t m = 0
        int * ind
        int k, g

    # Size the table from the real maximum instead of trusting the last
    # element, so unsorted or empty reference arrays cannot overrun it.
    for i in range(nf):
        if ref_array[i] >= m:
            m = <Py_ssize_t> ref_array[i] + 1

    # calloc(0, ...) may legitimately return NULL, so always request >= 1 slot.
    ind = <int *> calloc(m if m > 0 else 1, sizeof(int))
    if ind == NULL:
        # NOTE(review): how this propagates out of a ``cdef void`` function
        # depends on the Cython version in use -- confirm for your toolchain.
        raise MemoryError()

    try:
        # Mark every reference value that can be used as a table index.
        for i in range(nf):
            g = ref_array[i]
            if g >= 0:
                ind[g] = 1

        for i in range(n):
            k = 0
            for j in range(c):
                g = target_arrays[i, j]
                # The range check keeps the raw pointer access inside the table.
                if 0 <= g < m and ind[g] == 1:
                    k += 1
            num_matches[i] = k
    finally:
        # The table is private to this call; release it on every exit path.
        free(ind)


cpdef count_num_matches(int[:, ::1] target_arrays, int[::1] ref_array):
    """Return the per-row match counts of ``target_arrays`` against ``ref_array``.

    Parameters:
        target_arrays: 2-D C-contiguous int buffer, one candidate array per row.
        ref_array: 1-D C-contiguous int buffer of reference values; forwarded
            unchanged to ``count_matches``.

    Returns:
        A NumPy int32 array with one entry per row of ``target_arrays``,
        filled in by ``count_matches``.
    """
    cdef:
        Py_ssize_t n = target_arrays.shape[0]
        # Zero-initialised output buffer, one slot per target row.
        int[::1] num_matches = np.zeros(n, dtype=np.int32)

    count_matches(target_arrays, ref_array, num_matches)

    # Hand the memoryview back to Python as a regular ndarray.
    return np.asarray(num_matches)

这里的想法非常简单。先将要匹配的参考整数数组按升序排序(使用 sort 方法)。然后建立一个指示数组 `ind`,其长度为参考数组中的最大整数(再加 5,以避免索引越界),这利用了数组中的整数都不大这一特点。于是每个整数都被当作一个索引,`ind` 中相应的位置被置为 1。接着遍历每一个 `target_array`,统计其中与参考数组匹配的整数个数。

在匹配过程中,`target_arrays` 中的所有整数都被当作索引;如果 `ind` 中该索引处的值为 `1`,则视为匹配。


# test_main_counts.py
from match_ints import count_num_matches
import numpy as np

def count_num_matches_main():
    """Build random benchmark data and return its per-row match counts.

    Generates one million rows of 40 int32 values in [50, 6000) and 800
    int32 reference values in [100, 2500), then delegates to
    ``count_num_matches``.
    """
    x = np.random.randint(50, 6000, size=(1000000, 40), dtype=np.int32)
    ref_x = np.random.randint(100, 2500, size=800, dtype=np.int32)

    # The accompanying description requires the reference array to be sorted
    # in ascending order before matching, so sort it in place here.
    ref_x.sort()

    return count_num_matches(x, ref_x)

if __name__ == "__main__":
    # Run the benchmark only when this file is executed as a script.
    nums = count_num_matches_main()

The setup file.

from setuptools import setup
from Cython.Build import cythonize
import numpy as np

            "language_level": "3",



在这种情况下,你可以预先处理目标数组,建立一个“倒排索引”(inverted index):对每一个可能的取值,生成一个索引列表,列出包含该值的目标数组的编号:

val   target_arrays_containing_val
1     2,5,100, 999999  
2     7, 13, 3141592
6000  3,111, 222,444,555,888


# For every reference value, credit each target array listed under it.
for ref_value in ref_arr:
    for array_idx in inverted[ref_value]:
        num_matches[array_idx] += 1


正如评论中所建议的那样,先统计每个值的出现次数,再对各个 dict 的键(`dict.keys()`)使用集合运算,找出所有字典共有的键及其最小出现次数。

在构建这些字典时只需计数即可,从而避免排序。此外,借助一个“seen”(已见)集合,可以忽略后续列表中那些在此前列表里未出现过的值:

# This is plain Python - working code as a guideline - you would have to
# translate it to Cython/NumPy yourself.
# Stand-ins for millions of arrays that each hold only a few distinct ints.
l1 = [1,1,1,1,1,1, 2,2,2,2,2,2, 3,3,3,3,3,3, 4,4,4,4,4,4, 99]
l2 = [1,1,1,1,1,   2,2,2,2,2,   3,3,3,3,3,   4,4,4,4,4,   98]
l3 = [1,           2,2,         3,3,3]

dicts = []
seen_keys = set(l1)  # initial keys from one list; it doesn't matter which

# One pass over each of the m lists: O(n) per list, with no sorting.
for values in [l1, l2, l3]:  # m lists result in m dicts
    curr_d = {}

    for i in values:
        if i not in seen_keys:
            continue  # skippable: missing from an earlier list
        # .get() supplies the initial 0; a bare ``+= 1`` on an empty dict
        # raises KeyError.
        curr_d[i] = curr_d.get(i, 0) + 1

    # Keys absent from this list can never be common to all lists, so stop
    # counting them in the lists that follow.
    seen_keys.intersection_update(curr_d)
    dicts.append(curr_d)

# Resulting dict: minimal count of every key that occurs in ALL lists.
# Seed it from the first dict exactly once; re-seeding whenever the running
# result is empty would resurrect keys after an empty intersection.
total_d = dict(dicts[0]) if dicts else {}
for d in dicts[1:]:
    # Keep only keys shared with this dict and lower each count to the minimum.
    total_d = {
        key: min(count, d[key])
        for key, count in total_d.items()
        if key in d
    }

print(total_d)  # minimal shared count per common key


{1: 1, 2: 2, 3: 3}

