"""
leq.py
solve linear equations (NumPy and CuPy)
"""

import numpy as np
import cupy as cp
import time

# parameters
N = 5000  # matrix size
L = 1  # repeat
dkind = 1      # 1=real, 2=complex
precision = 1  # 1=single, 2=double
gpu = 0        # 0=CPU(NumPy), 1=GPU(CuPy)

# dtype
f_dtype = 'f4' if precision == 1 else 'f8'
c_dtype = 'c8' if precision == 1 else 'c16'

# timer
t0 = time.time()

# real
if dkind == 1:
    a = np.random.rand(N, N).astype(f_dtype)
    b = np.random.rand(N).astype(f_dtype)
    x = np.zeros(N, f_dtype)
# complex
elif dkind == 2:
    a = np.random.rand(N, N).astype(f_dtype) \
      + np.random.rand(N, N).astype(f_dtype) * 1j
    b = np.random.rand(N).astype(f_dtype) \
      + np.random.rand(N).astype(f_dtype) * 1j
    x = np.zeros(N, c_dtype)

# alloc device memory
if gpu == 1:
    mempool = cp.get_default_memory_pool()
    # copy to device
    d_a = cp.array(a)
    d_b = cp.array(b)
    d_x = cp.array(x)

# timer
t1 = time.time()

# solve
for _ in range(L):
    if not gpu:
        # CPU (NumPy)
        x = np.linalg.solve(a, b)
    else:
        # GPU (CuPy)
        d_x = cp.linalg.solve(d_a, d_b)

# timer
t2 = time.time()

# free device memory
if gpu == 1:
    # copy to host
    x = cp.asnumpy(d_x)
    # free
    d_a = None
    d_b = None
    d_x = None
    mempool.free_all_blocks()

# check
res = np.dot(a, x) - b
ans = np.linalg.norm(res)

# output
device = 'CPU' if gpu == 0 else 'GPU'
print('(%s) N=%s, %.2f+%.2f[sec], err=%.3e'
    % (device, N, t1 - t0, (t2 - t1) / L, ans))

# free
a = None
b = None
x = None