For an MCMC implementation, I want to calculate the covariance tensor C in numpy.

The distance between two elements is based on the distance between their indices. For reference, here is the working single-threaded code (with an example distance):

``````import numpy as np

# Grid layout: `ndim` spatial axes with `size` points along each of them.
size = 20
ndim = 2
# Every grid point is paired with every other one, hence 2 * ndim axes.
shape = (size,) * (2 * ndim)

# Covariance tensor, zero-filled until update_tensor() runs.
C = np.zeros(shape)


def dist(x, y):
    """Example metric: Euclidean distance between two index vectors."""
    diff = x - y
    return np.sqrt(np.sum(diff ** 2))

#this runs as a class method, so please forgive my sloppy coding here
def update_tensor(tensor=None, metric=None):
    """Fill a covariance tensor in place, one entry at a time.

    Each entry is addressed by a multi-index of length ``tensor.ndim``.
    The first half of that index and the second half are treated as two
    grid points, and the entry is set to ``metric(first, second)``.

    Args:
        tensor: Array to fill in place. Defaults to the module-level ``C``.
        metric: Callable taking two index vectors and returning a scalar.
            Defaults to the module-level ``dist``.

    Returns:
        None. The tensor is modified in place.
    """
    if tensor is None:
        tensor = C
    if metric is None:
        metric = dist
    # nditer refuses zero-sized operands, and there is nothing to fill anyway.
    if tensor.size == 0:
        return
    # The multi-index has one component per axis; split it down the middle.
    half = tensor.ndim // 2
    with np.nditer(tensor, flags=['multi_index'], op_flags=['readwrite']) as it:
        while not it.finished:
            idx = np.array(it.multi_index)
            # Assign through it[0]; rebinding `it` would throw the iterator
            # away instead of writing the entry.
            it[0] = metric(idx[:half], idx[half:])
            it.iternext()

# Run the single-threaded fill once.
update_tensor()
``````

# Solution Attempt

The issue is that, while applying C to a matrix x is a multithreaded operation:

``````x = np.random.standard_normal((size,)*ndim)
# axes=ndim sums over the last ndim axes of C and the first ndim axes of x,
# i.e. the tensor analogue of a matrix-vector product.
result = np.tensordot(C, x, axes=ndim)
``````

calculating the entries of C is not. My idea was to split C after initialization along its first axis and iterate over the chunks separately:

``````import multiprocessing
def _calc_distances(C):
'Calculate distances of submatrices'
while not it.finished:
idx = np.array(it.multi_index)
it = dist(idx[:idx.shape//2], idx[idx.shape//2:])
it.iternext()
return C

def update_tensor(C):
    """Fill ``C`` by processing slabs along its first axis in parallel.

    ``C`` is cut into contiguous slabs along axis 0, each slab is handed to
    ``_calc_distances`` in a worker process, and the returned slabs are
    concatenated again.

    Workers receive only the sliced array, not its position along axis 0.

    Args:
        C: Tensor to compute; it is used for its shape and as the source of
            the slabs sent to the workers.

    Returns:
        A new array with the same shape as ``C`` holding the stitched
        results. The computed values live in the worker processes, so the
        caller must use the return value rather than rely on ``C`` itself.
    """
    n_rows = C.shape[0]
    # Never use more workers than rows, otherwise some slabs would be empty.
    n_processes = max(1, min(multiprocessing.cpu_count(), n_rows))
    # Integer boundaries that differ by at most one row between slabs.
    bounds = [i * n_rows // n_processes for i in range(n_processes + 1)]
    chunks = [C[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
    with multiprocessing.Pool(n_processes) as p:
        # Map and stitch together.
        return np.concatenate(p.map(_calc_distances, chunks))
``````

But this fails because the indices of the submatrices change.

# Question

Is there a nicer solution to this? How do I fix the index issue? Probably the nicest way would be to just iterate over parts of the array with threads sharing the data of C. Is that possible?

# Q/A

Q: Do you have to use a numpy iterator? A: No, it’s nice, but I can give up on that.

Worked like this. Just going to post the class here.

# Benchmarks

``````CPU: Intel Core [email protected], boosting to ~2.9GHz
Windows 10 64-bit, Python 3.7.4, Numpy 1.17
``````

Pro: less compute time. Con: uses a little more RAM; somewhat complicated code.

``````import multiprocessing
import numpy as np

class CovOp(object):
    """Covariance operator on a regular grid: F[0,1]^ndim -> C[0,1]^ndim.

    The operator is stored as a dense tensor ``C`` with ``2 * ndim`` axes of
    length ``size``. The entry addressed by the grid indices ``i`` (first
    ``ndim`` axes) and ``j`` (last ``ndim`` axes) is ``f(dist(i, j))``.

    Attributes:
        ndim: Number of grid axes.
        size: Number of grid points per axis.
        shape: Shape of ``C`` and ``Inv``, i.e. ``(size,) * ndim * 2``.
        C: The covariance tensor (zeros until ``update_tensor`` has run).
        Inv: The inverse tensor (zeros until ``update_inverse`` has run).
        ro: Length scale used by ``f``; stored as ``ro * size``.
        sigma: Scalar factor applied in ``__call__`` and undone in ``inv``.
        tensor_cached: True once ``C`` has been computed.
        inverse_cached: True once ``Inv`` matches the current ``C``.
    """

    def __init__(self, ndim, size, sigma=1, ro=1):
        self.tensor_cached = False
        self.inverse_cached = False
        self.ndim = ndim
        self.size = size
        self.shape = (size,) * ndim * 2
        self.C = np.zeros(self.shape)
        self.Inv = np.zeros(self.shape)
        # The length scale is expressed in grid points.
        self.ro = ro * size
        self.sigma = sigma

    def f(self, r):
        """Map a value returned by ``dist`` to a covariance value."""
        # Alternative kernel kept for reference:
        # (1 + np.sqrt(3)*r / self.ro) * np.exp(-np.sqrt(3) * r / self.ro)
        return np.exp(-r / self.ro)

    def dist(self, x, y):
        """Return the squared Euclidean distance (no square root is taken)."""
        return np.sum((x - y) ** 2)

    def __call__(self, x):
        """Apply the operator to ``x``, computing ``C`` first if necessary."""
        if not self.tensor_cached:
            # The method has to be called; a bare attribute access is a no-op.
            self.update_tensor()
        if self.ndim == 0:
            return self.sigma * self.C * x
        elif self.ndim == 1:
            return self.sigma * np.dot(self.C, x)
        return self.sigma * np.tensordot(self.C, x, axes=self.ndim)

    def _calc_distances(self, Chunk: tuple):
        """Calculate the kernel values of one slab of the tensor.

        Args:
            Chunk: Pair ``(C, offset)``. ``C`` is a slab cut from the full
                tensor along axis 0 and ``offset`` is the axis-0 index of its
                first row in the full tensor.

        Returns:
            The slab ``C``, filled in place.
        """
        C, offset = Chunk
        # nditer refuses zero-sized operands; an empty slab is already done.
        if C.size == 0:
            return C
        half = C.ndim // 2
        with np.nditer(C, flags=['multi_index'], op_flags=['readwrite']) as it:
            while not it.finished:
                idx = np.array(it.multi_index)
                # Only axis 0 was split, so only that component is shifted.
                # Shifting every component would cancel out in x - y.
                idx[0] += offset
                d = self.dist(idx[:half], idx[half:])
                it[0] = self.f(d)
                it.iternext()
        return C

    def update_tensor(self, n_processes=None):
        """Recompute ``C``, splitting the work along the first axis.

        Args:
            n_processes: Upper bound on the number of worker processes.
                Defaults to ``multiprocessing.cpu_count()``. With a single
                worker the tensor is filled in this process and no pool is
                created.
        """
        if self.C.ndim == 0:
            # Scalar operator: a single entry, both "points" are empty.
            origin = np.zeros(0, dtype=int)
            self.C = np.asarray(self.f(self.dist(origin, origin)), dtype=float)
        else:
            if n_processes is None:
                n_processes = multiprocessing.cpu_count()
            n_rows = self.C.shape[0]
            # More workers than rows would only produce empty slabs.
            n_workers = max(1, min(n_processes, n_rows))
            bounds = [i * n_rows // n_workers for i in range(n_workers + 1)]
            chunks = [(self.C[lo:hi], lo) for lo, hi in zip(bounds, bounds[1:])]
            if n_workers == 1:
                parts = [self._calc_distances(chunk) for chunk in chunks]
            else:
                # The bound method is pickled per task, which ships this
                # instance (tensors included) to the workers.
                with multiprocessing.Pool(n_workers) as p:
                    parts = p.map(self._calc_distances, chunks)
            self.C = np.concatenate(parts)
        self.tensor_cached = True
        # Any previously computed inverse belongs to the old tensor.
        self.inverse_cached = False
        # TODO: missing cholesky decomposition

    def update_inverse(self):
        """Recompute ``Inv`` from the current ``C``."""
        if self.ndim == 1:
            self.Inv = np.linalg.inv(self.C)
        elif self.ndim > 1:
            # ind must equal ndim; the default of 2 only fits ndim == 2.
            self.Inv = np.linalg.tensorinv(self.C, ind=self.ndim)
        else:
            self.Inv = 1 / self.C
        self.inverse_cached = True

    def inv(self, x):
        """Apply the inverse operator to ``x``, computing it if necessary."""
        if not self.inverse_cached:
            if not self.tensor_cached:
                self.update_tensor()
            self.update_inverse()
        if self.ndim == 0:
            return self.Inv * x / self.sigma
        elif self.ndim == 1:
            return np.dot(self.Inv, x) / self.sigma
        # Contract over ndim axes, mirroring __call__.
        return np.tensordot(self.Inv, x, axes=self.ndim) / self.sigma
if __name__=='__main__':

size = 30
ndim = 2
depth = 1

Cov = CovOp(ndim, size, 1, .2)

import time

n_tests = 5
t_start = time.perf_counter()
for i in range(n_tests):
Cov.update_tensor()
t_stop = time.perf_counter()
dt_new = t_stop - t_start

print(
'''Benchmark; NDim: %s, Size: %s NTests: %s
Mean time per test: