// INFLUENCE.cpp

/*! \ingroup PMsolver */
#include <iostream>
#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <cassert>
#include <cmath>

#include <armadillo>
#ifdef FSI_MPI
#include <mpi.h>
#endif
#ifdef FSI_OPENMP
#include <omp.h>
#endif

using namespace std;
#ifdef FSI_STATS
#include <chrono>
using namespace chrono;
#endif

#include <fsi_thread.h>

using namespace arma;

// Offsets of the x, y and z components inside a packed coordinate triplet.
#define X 0
#define Y 1
#define Z 2

#pragma GCC diagnostic ignored "-Wconversion"  // armadillo indices unsigned

// Defined in another translation unit; not referenced in the code visible here.
extern condition_variable cv_parent;
// Receive buffers, indexed below as tandem_output[thread-or-rank][influence_index];
// each selected entry is used as a packed array of adjacent A/B pairs.
extern vector<double **> tandem_output;

// Mutex named in the "false sharing critical section" of mex_thread_run().
static mutex false_sharing_mtx;

// Debug trace file shared by the functions in this file; it is only opened
// when FSI_MEX_DEBUG is defined and is nullptr otherwise.
static FILE *fp = nullptr;

/*------------------------------------------------------------------------
 * Calculate the induced velocity by the doublet
 * at any arbitrary point. Originally a MATLAB MEX function.
 * ---------------------------------------------------------------------- */

#if 0
double *AB,
        AB[0] = -fourpi*s*cpz*pow(rad,-1.5);
        AB[1] = -fourpi*s/sqrt(rad);
        AB[0] = 0.0;
        AB[1] = -b*fourpi;
    AB[0] = afourpi;
    AB[1] = -b * fourpi + cpz * afourpi;
#endif

    /*!
     * \brief Perturbation potential induced at one point by one flat
     *        quadrilateral panel (doublet and source contributions).
     *
     * The four corners define an orthonormal panel frame: a chordwise tangent
     * (midpoint of edge 1-2 to midpoint of edge 4-3), a normal (chordwise
     * tangent x edge 1-2) and a spanwise tangent (normal x chordwise tangent).
     * The origin of the frame is the mean of the four corners. Corners and the
     * evaluation point are expressed in that frame and two scalars are written:
     *
     *  - near field: \p A is miu/(4*pi) times the sum of one atan2 term per
     *    edge; \p B is -miu/(4*pi) times the sum of one log term per edge,
     *    plus cpz * A, where cpz is the height of the point above the panel.
     *    When |cpz| < 1e-11 the point is treated as lying in the panel plane
     *    and \p A is set to exactly 0.
     *  - far field (distance to the panel centroid > 500 * FF^2 = 12500 length
     *    units): the panel is replaced by a point element of the same area.
     *
     * Edges shorter than 1e-11 (collapsed, e.g. triangular panels) are skipped.
     * There is no guard against fully degenerate panels: a zero-length tangent
     * or normal produces NaN results.
     *
     * \param A     out: doublet-type coefficient
     * \param B     out: source-type coefficient
     * \param colx  x of the evaluation (collocation) point, global frame
     * \param coly  y of the evaluation point
     * \param colz  z of the evaluation point
     * \param x1    x of corner 1
     * \param y1    y of corner 1
     * \param z1    z of corner 1
     * \param x2    x of corner 2 (edge 1-2 fixes the spanwise sense)
     * \param y2    y of corner 2
     * \param z2    z of corner 2
     * \param x3    x of corner 3 (diagonally opposite corner 1)
     * \param y3    y of corner 3
     * \param z3    z of corner 3
     * \param x4    x of corner 4
     * \param y4    y of corner 4
     * \param z4    z of corner 4
     * \param miu   strength factor; both outputs scale linearly with it
     */
static inline void INFLUENCE(double *A, double *B,
        const double colx, const double coly, const double colz,
        const double x1, const double y1, const double z1,
        const double x2, const double y2, const double z2,
        const double x3, const double y3, const double z3,
        const double x4, const double y4, const double z4,
        const double miu)
{
    const double pi = 3.14159265358979,
                 FF = 5,                // "far field" coefficient
                 FF_sqr = FF*FF,
                 eror = 1.0e-11,        // tolerance below which a length counts as zero
                 fourpi = miu/4.0/pi;   // common factor miu / (4 pi)

    // chordwise tangent
    const double A1 = ((x4+x3)-(x1+x2))/2,
                 A2 = ((y4+y3)-(y1+y2))/2,
                 A3 = ((z4+z3)-(z1+z2))/2,
                 AA = std::sqrt(A1*A1+A2*A2+A3*A3),
                 tx[] = {A1/AA, A2/AA, A3/AA};
    // second in-plane vector: edge 1 -> 2
    const double b1 = x2 - x1,
                 b2 = y2 - y1,
                 b3 = z2 - z1,
                 bb = std::sqrt(b1*b1+b2*b2+b3*b3);
    const double bt[] = {b1/bb, b2/bb, b3/bb};
    // normal vector
    const double v1 = tx[1] * bt[2] - tx[2] * bt[1],
                 v2 = tx[2] * bt[0] - tx[0] * bt[2],
                 v3 = tx[0] * bt[1] - tx[1] * bt[0],
                 vv = std::sqrt(v1*v1+v2*v2+v3*v3),
                 n[] = {v1/vv, v2/vv, v3/vv};
    // tangential vector in spanwise direction
    double ty[3];
    ty[0] = n[1]*tx[2] - n[2]*tx[1];
    ty[1] = n[2]*tx[0] - n[0]*tx[2];
    ty[2] = n[0]*tx[1] - n[1]*tx[0];
    const double tt = std::sqrt(ty[0]*ty[0]+ty[1]*ty[1]+ty[2]*ty[2]);
    ty[0] = ty[0]/tt;
    ty[1] = ty[1]/tt;
    ty[2] = ty[2]/tt;

    // centroid of the quadrilateral = origin of the panel frame
    const double xc=0.25*(x1+x2+x3+x4),
                 yc=0.25*(y1+y2+y3+y4),
                 zc=0.25*(z1+z2+z3+z4);

    // evaluation point in the panel frame
    const double cpx = (colx-xc)*tx[0] + (coly-yc)*tx[1] +(colz-zc)*tx[2],
                 cpy = (colx-xc)*ty[0] + (coly-yc)*ty[1] +(colz-zc)*ty[2],
                 cpz = (colx-xc) *n[0] + (coly-yc) *n[1] +(colz-zc) *n[2];

    // Squared distance from the centroid. cpx/cpy/cpz are already relative to
    // the centroid, so no global coordinate may enter here; otherwise the
    // near/far decision would depend on where the geometry sits in space.
    const double rad = cpx*cpx + cpy*cpy + cpz*cpz,
                 dist = std::sqrt(rad);

    if (dist > 500 * FF_sqr) {
        // Panel area: half the magnitude of the cross product of the diagonals.
        const double g1 = x3-x1, g2 = y3-y1, g3 = z3-z1,
                     k1 = x4-x2, k2 = y4-y2, k3 = z4-z2,
                     c1 = g2*k3 - g3*k2,
                     c2 = g3*k1 - g1*k3,
                     c3 = g1*k2 - g2*k1,
                     s = 0.5*std::sqrt(c1*c1+c2*c2+c3*c3);
        // Point-element limits of the near-field expressions below; the sign of
        // A follows cpz so the value is continuous across the threshold.
        *A = fourpi*s*cpz/(rad*dist);
        *B = -fourpi*s/dist;
        return;
    }

    // Per-corner quantities in the panel frame (the corners' own heights are
    // not needed by the formulas).
    const double px[4] = {x1, x2, x3, x4},
                 py[4] = {y1, y2, y3, y4},
                 pz[4] = {z1, z2, z3, z4};
    double lx[4], ly[4];    // corner coordinates in the panel frame
    double cx[4], cy[4];    // evaluation point relative to each corner
    double e[4], h[4], r[4];
    for (int k = 0; k < 4; ++k) {
        lx[k] = (px[k]-xc)*tx[0] + (py[k]-yc)*tx[1] + (pz[k]-zc)*tx[2];
        ly[k] = (px[k]-xc)*ty[0] + (py[k]-yc)*ty[1] + (pz[k]-zc)*ty[2];
        cx[k] = cpx - lx[k];
        cy[k] = cpy - ly[k];
        e[k] = cx[k]*cx[k]+cpz*cpz;
        r[k] = std::sqrt(e[k] + cy[k]*cy[k]);   // distance point -> corner
        h[k] = cx[k]*cy[k];
    }

    // A point in the panel plane gets no atan2 contribution at all.
    const bool inPlane = std::fabs(cpz) < eror;

    double a = 0, b = 0;
    for (int k = 0; k < 4; ++k) {       // edges 1-2, 2-3, 3-4, 4-1
        const int k1 = (k + 1) % 4;
        const double dx = lx[k1] - lx[k],
                     dy = ly[k1] - ly[k],
                     d = std::sqrt(dx*dx + dy*dy);
        if (!(d >= eror)) continue;     // collapsed edge contributes nothing

        const double rsum = r[k] + r[k1];
        b += (cx[k]*dy - cy[k]*dx)/d*std::log((rsum + d)/(rsum - d));
        if (inPlane) continue;

        const double F = dy*e[k] - dx*h[k],
                     G = dy*e[k1] - dx*h[k1];
        // difference of the two corner arctangents folded into one atan2
        a += std::atan2(cpz*dx*(F*r[k1] - G*r[k]), (cpz*cpz*dx*dx*r[k]*r[k1] + F*G));
    }

    if (inPlane) {
        *A = 0.0;
        *B = -b*fourpi;
        return;
    }
    const double afourpi = a * fourpi;
    *A = afourpi;
    *B = -b * fourpi + cpz * afourpi;
}

/*!
 * \brief Record the inputs for one influence pass, reset the shared work
 *        counter and run the pass.
 *
 * With FSI_OPENMP the work is done here: each OpenMP thread repeatedly claims
 * `workinc` collocation points from the atomic `order` counter, fills its own
 * packed receive buffer through doublet(), and scatters the packed A/B pairs
 * into the output array inside an `omp critical` section. Without FSI_OPENMP
 * the work is delegated to mex_parent().
 *
 * When FSI_MEX_DEBUG is defined a trace file is (re)opened on every call.
 *
 * \param pMiu  strength factor forwarded to INFLUENCE() via the Miu member
 * \param pAB   destination array; A values are written at an index and the
 *              matching B values `mnmn` elements further on
 */
void influence_thread::mex_thread_init(const double pMiu, double *pAB) {

    AB = pAB;
    Miu = pMiu;

#ifdef FSI_MEX_DEBUG
    int id = 0;
    char buf[256];
    snprintf(buf, sizeof buf - 1,
#ifdef _WIN32
    "f:/data/INFthread_T%d.dat",
#else
    "/home/wm/ml/WLM_PM_CODE/data/INFthread_T%d.dat",
#endif
    id);
    fp = fopen(buf, "w");
    if (fp == nullptr) cerr << "Couldn't open " << buf << endl;
    else {
        fprintf(fp, "threaded\n");
        cerr << "Writing threaded INFLUENCE debugging file: " << buf << endl;
    }
#endif
    order = 0;
    threads_finished = 0;
#ifdef ALTERNATE_WORKINC
    workinc = mn / nthreads;
    if (workinc <= 0) workinc = 1;  // always hand out at least one point per claim
#endif
#if defined(FSI_OPENMP)
    TSTART(mex_time, steady_clock::now());  // start mex timing
    omp_set_num_threads(nthreads);
    #pragma omp parallel
    {
        const int thrdid = omp_get_thread_num();
#ifdef FSI_STATS
        int ndoublets = 0;
        const steady_clock::time_point start = steady_clock::now();
#ifdef FSI_OPENMP
        // number of threads requested but not granted by the runtime
        int openmp_denied = 0;
        if (thrdid == 0) {  // only for one thread
            int nthrds = omp_get_num_threads();
            if (nthreads != nthrds) openmp_denied += nthreads - nthrds;
        }
#endif
#endif
        double * const ABrecv = tandem_output[thrdid][influence_index];  // receive buffer allocated per thread at startup
        // total doublets is mn
        assert(ABrecv != nullptr);
        int ord;
        while ((ord = order.fetch_add(workinc)) < mn) {  // while work remains
            double *pABout = ABrecv;
            for (int i = 0, iord = ord; i < workinc && iord < mn; i++, iord++) {
                // decode for row, col assuming column-major ordering (i.e., span-wise-panel-major ordering)
                const int mcnc3 = iord * 3;  // index to start of triplet
                pABout = doublet(pABout,  // packed - iterate over colocation points
                                 C[mcnc3], C[mcnc3 + Y], C[mcnc3 + Z],
                                 B,
                                 nx, ny);
#ifdef FSI_STATS
                ndoublets++;
#endif
            }
            // distribute computations from packed buffer
            const double *pAB = ABrecv;
            {  // start false sharing critical section
                #pragma omp critical
                for (int i = 0, iord = ord; i < workinc && iord < mn; i++, iord++) {
                    // decode for row, col assuming column-major ordering (i.e., span-wise-panel-major ordering)
                    const int imc = iord % nx,  // chord-wise index is low order
                              inc = iord / nx;  // span-wise index is high order
                    int mnrowcol = imc * ny + inc;
                    // unpack and relocate - logic must be consistent with doublet
                    for (int in = 0; in < ny; in++) {
                        pABout = &AB[mnrowcol];
                        mnrowcol += mn;  // preload for next span-wise panel
                        for (int im = 0; im < nx; im++, pABout += nmn) {
                            assert(&pABout[mnmn] < &AB[outsize]);
                            *pABout = *pAB++;  // A
                            pABout[mnmn] = *pAB++;  // B
                        }
                    }
                }
            }  // end false sharing critical section
        }  // while work remains
#ifdef FSI_STATS
        // per-thread statistics
        mex_stat *pmex_stat = (*thread_stat_map[thrdid])[this];
        pmex_stat->call_count++;
        pmex_stat->elapsed += duration_cast<mex_thread_time_units_t>(steady_clock::now() - start);
        pmex_stat->doublets += ndoublets;
#ifdef FSI_OPENMP
        if (openmp_denied != 0) pmex_stat->openmp_denied += openmp_denied;
#endif
#endif
    }  // end parallel
    TSTOP(mex_time, steady_clock::now());  // stop mex timing and add elapsed
#ifdef FSI_MEX_DEBUG
    // This path does not go through mex_thread_run(), the only other place the
    // trace file is closed, so close it here; otherwise every call would leak
    // the stream opened above.
    if (fp != nullptr) {
        fclose(fp);
        fp = nullptr;
    }
#endif
#else  // low-level threading model
    mex_parent();
#endif  // low-level threading model
}

// INIT(B): given a pointer named B, declares three more `const double *`
// cursors next to it: B##i1j at B + 3, B##ij1 at B + nx23 and B##i1j1 at
// B##ij1 + 3. `nx23` must be visible at the expansion site (presumably the
// length in doubles of one row of grid points - TODO confirm).
#define INIT(B) const double \
    *B##i1j = B + (3 * 1), \
    *B##ij1 = B + nx23, \
    *B##i1j1 = B##ij1 + 3

// ITER(B): moves all four cursors on by one triplet (3 doubles). B and B##ij1
// take over the previous values of B##i1j and B##i1j1 instead of being
// incremented themselves.
#define ITER(B) \
    B = B##i1j; \
    B##i1j += 3; \
    B##ij1 = B##i1j1; \
    B##i1j1 += 3

// LASTITER(B): moves all four cursors on by two triplets (6 doubles).
#define LASTITER(B) \
    B += (3 * 2); \
    B##i1j += (3 * 2); \
    B##ij1 += (3 * 2); \
    B##i1j1 += (3 * 2)

// XYZOFFSET(B, offset): expands to the three comma-separated components
// [X], [Y], [Z] of the triplet that cursor B##offset points at, ready to be
// dropped into an argument list.
#define XYZOFFSET(B, offset) \
    B##offset[X], \
    B##offset[Y], \
    B##offset[Z]

inline double *influence_thread::doublet(double *AB,
                                         const double CXmcnc, const double CYmcnc, const double CZmcnc,
                                         const double *pB,
                                         const int m, const int n) {
    double *pAB;

    pAB = AB;
#ifndef NDEBUG
    const double * const pBinit = B;
#endif

    INIT(pB);
    for (int in = 0; in < n; in++) {
        int row = in;  // for debugging only
        for (int im = 0; im < m; im++, pAB += 2, row += n) {
            assert(row < mn);
            assert(pBi1j1 - pBinit < 3 * (m + 2) * (n + 1));
            assert(pAB - AB < mn * mn);

            INFLUENCE(pAB, &pAB[1],
                      CXmcnc, CYmcnc, CZmcnc,
                      pB[X], pB[Y], pB[Z],
                      XYZOFFSET(pB, ij1), //*pBXij1, *pBYij1, *pBZij1,
                      XYZOFFSET(pB, i1j1), //*pBXi1j1, *pBYi1j1, *pBZi1j1,
                      XYZOFFSET(pB, i1j), //*pBXi1j, *pBYi1j, *pBZi1j,
                      Miu);
//#ifndef FSI_MPI
            //cerr << "A[" << pAB - AB << "]=" << *pAB << endl;
//#endif
#ifdef FSI_MEX_DEBUG
            if (fp != nullptr) fprintf(fp, "AB[0]=%f\n%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f \n",
                      *pAB,
                      CXmcnc, CYmcnc, CZmcnc,
                      pB[X], pB[Y], pB[Z],
                      XYZOFFSET(pB, ij1), //*pBXij1, *pBYij1, *pBZij1,
                      XYZOFFSET(pB, i1j1), //*pBXi1j1, *pBYi1j1, *pBZi1j1,
                      XYZOFFSET(pB, i1j), //*pBXi1j, *pBYi1j, *pBZi1j,
                      Miu);
#endif
             ITER(pB);
        }  // for each row
        // skip over last boundary point and wrapping boundary point
        LASTITER(pB);
    }  // for each row,column in pair
    return (pAB);
}

/*!
 * \brief Worker loop: claim batches of collocation points until none remain,
 *        obtain their packed A/B pairs and scatter them into the output array.
 *
 * Each pass claims `workinc` points from the atomic `order` counter. Without
 * FSI_MPI the pairs are computed locally through doublet(); with FSI_MPI the
 * request is sent to \p partner_rank and the packed pairs are received back.
 * The scatter into AB is done under false_sharing_mtx when more than one
 * thread is configured.
 *
 * \param partner_rank  selects this worker's receive buffer in tandem_output;
 *                      under FSI_MPI it is also the rank the work is sent to
 */
void influence_thread::mex_thread_run (const int partner_rank) {

    double * const ABrecv = tandem_output[partner_rank][influence_index];  // receive buffer allocated per thread at startup
    //double * const ABrecv = new double[outsize];
    assert(ABrecv != nullptr);
#ifdef FSI_MPI
    //cerr << "mex_thread_run:influence outsize=" << outsize << endl;
    bool do_time_increment = true;  // the grid is sent with the first message only
#endif
#ifdef FSI_STATS
    int ndoublets = 0;
    int nsyncs = 0;
#endif
    for (;;) {  // until no more data
        const int mcnc = order.fetch_add(workinc);
#ifdef FSI_STATS
        nsyncs++;
#endif
        if (mcnc >= mn) {
#ifdef FSI_MPI
            if (do_time_increment) {  // no work done - still need to increment child's time and send grids
                int status = MPI_Send(B, gridsize, MPI_DOUBLE, partner_rank, time_pulse, MPI_COMM_WORLD);
                assert(status == MPI_SUCCESS);
            }
#endif
#ifdef FSI_STATS
#ifndef FSI_OPENMP
            // statistics
            mex_stat *pmex_stat = (*thread_stat_map[this_thread::get_id()])[this];
            pmex_stat->doublets += ndoublets;
            pmex_stat->nsyncs += nsyncs;
#endif
#endif
            //cerr << "influence for rank " << partner_rank << " ends." << endl;
            // handle synchronization with parent
            if (is_last_thread()) {
                //cerr << "influence LAST THREAD for rank " << partner_rank << endl;
                if (fp != nullptr) {
                    fclose(fp);
                    fp = nullptr;  // never leave a closed stream reachable through fp
                }
            }
            return;  // to wait state
        }
#ifdef FSI_MPI
        // supply order, indicate INFLUENCE (default)
        //cerr << "mex_thread_run:INFLUENCE doing send: rank:" << partner_rank << " do time inc:" << do_time_increment << " mcnc:" << mcnc << endl;
        int status = MPI_Send(B, do_time_increment ? gridsize : 0, MPI_DOUBLE, partner_rank, mcnc, MPI_COMM_WORLD);
        assert(status == MPI_SUCCESS);
        MPI_Status recv_status;
        do_time_increment = false;
        int rstat = 0;
        rstat = MPI_Probe(partner_rank, mcnc, MPI_COMM_WORLD, &recv_status);
        assert(rstat == MPI_SUCCESS);
        int count;
        MPI_Get_count(&recv_status, MPI_DOUBLE, &count);
        assert(count <= outsize);
        //cerr << "mex_thread_run:INFLUENCE ABrecv=" << ABrecv << " issuing receive: rank:" << partner_rank << " receivemcnc:" << mcnc << " outsize=" << outsize << endl;
        try {
            rstat = MPI_Recv(ABrecv, outsize, MPI_DOUBLE, partner_rank, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        catch (exception& e) {
            cerr << "exception caught on MPI_Recv in INFLUENCE "
                "Exception: " << e.what() << '\n';
        }
        if (rstat != MPI_SUCCESS) {
            cerr << "MPI_Recv fails rank=" << partner_rank << " with status:" << rstat << " mcnc=" << mcnc << endl;
        }
        assert(rstat == MPI_SUCCESS);
#else
        {
            double *pABpacked = ABrecv;
            //cerr << "max_thread_run unpacking: rank=" << partner_rank << " mcnc=" << mcnc << endl;
            for (int i = 0, iord = mcnc; i < workinc && iord < mn; i++, iord++) {
                // decode for row, col assuming column-major ordering (i.e., span-wise-panel-major ordering)
                const int mcnc3 = iord * 3;  // index to start of triplet

                //cerr << "iord=" << iord << " imc=" << imc << " inc=" << inc << " mn=" << mn << endl;
                //cerr << "nx=" << nx << " ny=" << ny << endl;

                assert(iord < mn);
                pABpacked = doublet(pABpacked,  // packed - iterate over colocation points
                                    C[mcnc3], C[mcnc3 + Y], C[mcnc3 + Z],
                                    B,
                                    nx, ny);
            }
        }
#endif
        // distribute computations from packed buffer
        const double *pAB = ABrecv;
        //cerr << "max_thread_run unpacking: rank=" << partner_rank << " mcnc=" << mcnc << endl;
        {  // start false sharing critical section
            // The guard has to be a named object that lives until the end of
            // this block. Declared as the body of an `if`, it would be
            // destroyed, and the mutex released, before the loop below starts.
            unique_lock<mutex> lck;
            if (nthreads > 1) lck = unique_lock<mutex>(false_sharing_mtx);
            for (int i = 0, iord = mcnc; i < workinc && iord < mn; i++, iord++) {
                // decode for row, col assuming column-major ordering (i.e., span-wise-panel-major ordering)
                const int imc = iord % nx,  // chord-wise index is low order
                          inc = iord / nx;  // span-wise index is high order
                int mnrowcol = imc * ny + inc;
                // unpack and relocate - logic must be consistent with doublet
                for (int in = 0; in < ny; in++) {
                    double *pABout = &AB[mnrowcol];
                    mnrowcol += mn;  // preload for next span-wise panel
                    for (int im = 0; im < nx; im++, pABout += nmn) {
                        assert(&pABout[mnmn] < &AB[outsize]);
                        *pABout = *pAB++;  // A
                        pABout[mnmn] = *pAB++;  // B
                    }
                }
#ifdef FSI_STATS
                ndoublets++;
#endif
            }
        }  // end false sharing critical section
    }
}

#ifdef FSI_MPI
void influence_thread::compute(const int pmcnc, const double pMiu) {
    Miu = pMiu;
    // reserve space for both A and B
    if (AB == nullptr) {
        AB = new double [outsize]; //% Influence co-efficient matrix of surface doublet distribution.
    }
    double *pAB = AB;
    int mcnc = pmcnc;
    for (int nwork = 0; nwork < workinc && mcnc < mn; nwork++, mcnc++) {
        // decode for row, col assuming column-major ordering (i.e., span-wise-panel-major ordering)
        const int mcnc3 = mcnc * 3;  // index to start of triplet

        assert(pAB - AB < outsize);
        pAB = doublet(pAB,  // packed - iterate over colocation points
                      C[mcnc3], C[mcnc3 + Y], C[mcnc3 + Z],
                      B,  // iterate over all of B
                      nx, ny);
    }
    // writing *A and *B adjacent for m * n * number of work increments actually done
    assert(pAB - AB <= outsize);
    //cerr << "compute INFLUENCE:sending to partner:" << 0 << " size:" << pAB - AB << " mcnc=" << pmcnc << endl;
    int status = MPI_Send(AB, pAB - AB, MPI_DOUBLE, 0, pmcnc, MPI_COMM_WORLD);
    assert(status == MPI_SUCCESS);
}
#endif

/*!
 * \brief Single-threaded driver: evaluate INFLUENCE() for collocation points
 *        against panels and store the results in \p AB.
 *
 * All coordinate arrays are read with a stride of three doubles per point
 * (presumably x/y/z views into interleaved triplet storage - TODO confirm).
 * The panel grid has m + 2 points per row, two more than the collocation grid.
 *
 * With mslice == -1 (the default) every collocation point is paired with
 * every panel; the source marks this mode as deprecated in favour of the
 * threaded version. Otherwise only collocation row \p mslice and panel row
 * \p nslice are processed.
 *
 * NOTE(review): the B value is stored m * n elements after the A value. In the
 * full mode consecutive panels are also m * n elements apart, so the two
 * ranges appear to overlap - confirm the intended layout before relying on B.
 *
 * \param CX,CY,CZ  collocation point coordinates
 * \param BX,BY,BZ  panel corner coordinates
 * \param AB        output array
 * \param Miu       strength factor forwarded to INFLUENCE()
 * \param m         panels per row
 * \param n         number of rows
 * \param id        used only to name the FSI_MEX_DEBUG trace file
 * \param mslice    collocation row to process, or -1 for all
 * \param nslice    panel row to process when \p mslice is not -1
 */
void mexINFLUENCE(const double *CX, const double *CY, const double *CZ,
        const double *BX, const double *BY, const double *BZ,
        double *AB, const double Miu,
        const int m, const int n, const int id,
        const int mslice = -1, const int nslice = -1)
{
#ifdef FSI_MEX_DEBUG
    char buf[256];
    snprintf(buf, sizeof buf - 1,
#ifdef _WIN32
    "f:/data/INF_T%d.dat",
#else
    "/home/wm/ml/WLM_PM_CODE/data/INF_T%d.dat",
#endif
    id);
    fp = fopen(buf, "w");
    if (fp == nullptr) cerr << "Couldn't open " << buf << endl;
    else {
        fprintf(fp, "unprotected\n");
        cerr << "Writing INFLUENCE debugging file: " << buf << endl;
    }
#else
    (void)id;  // only needed for the debug trace file name
#endif

    //mat CX1{DIM2(2*m,n)}; - x,y,z
    //mat BX1{DIM2(2*m+2,n+1)}; -xp,yp,zp

    const int nx2 = m + 2; // BX 2 rows longer than CX - no view-based calculation

    int mupper, mlower;
    int mupper1, mlower1;
    if (mslice == -1) {
        mlower = 0;
        mupper = m;
        mlower1 = 0;
        mupper1 = m;
    } else {  // selected rows for both CX1 and BX1
        mlower = mslice;
        mupper = mlower + 1;
        mlower1 = nslice;
        mupper1 = mlower1 + 1;
    }
    int iicjc = 0;  // running count of collocation points visited
    for (int imc = mlower; imc < mupper; imc++) {
        for (int inc = 0; inc < n; inc++, iicjc++) {
            const int icjc = 3 * (inc * m + imc);
            // iij is a running panel counter that starts at the first panel row
            for (int im = mlower1, iij = im; im < mupper1; im++) {
                for (int in = 0; in < n; in++, iij++) {
                    // offsets of the four corner triplets of panel (im, in)
                    const int ij = 3 * (im + nx2 * in),
                              i1j = ij + 3,
                              ij1 = ij + 3 * nx2,
                              i1j1 = ij1 + 3;
                    int idx;
                    if (mslice == -1) {  // deprecated - use thread version
                        idx = iicjc + n * m * iij;
                    } else {  // slice
                        idx = iicjc + n * in;
                    }
                    INFLUENCE(&AB[idx], &AB[idx + m * n],
                            CX[icjc], CY[icjc], CZ[icjc],
                            BX[ij], BY[ij], BZ[ij],
                            BX[ij1], BY[ij1], BZ[ij1],
                            BX[i1j1], BY[i1j1], BZ[i1j1],
                            BX[i1j], BY[i1j], BZ[i1j],
                            Miu);
#ifdef FSI_MEX_DEBUG
                    //if (fp != nullptr) fprintf(fp, "icjc=%d ij=%d ij1=%d i1j1=%d i1j=%d \n", icjc, ij, ij1, i1j1, i1j);
                    if (fp != nullptr) fprintf(fp, "AB[0]=%f\n%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f \n",
                            AB[idx],
                            CX[icjc], CY[icjc], CZ[icjc],
                            BX[ij], BY[ij], BZ[ij],
                            BX[ij1], BY[ij1], BZ[ij1],
                            BX[i1j1], BY[i1j1], BZ[i1j1],
                            BX[i1j], BY[i1j], BZ[i1j],
                            Miu);
#endif // FSI_MEX_DEBUG
                    // *(A+(ic*ny+jc)+(i*ny+j)*mn)=AB[0];
                }
            }
        }
    }

#ifdef FSI_MEX_DEBUG
    // fopen may have failed above, and fclose on a null stream is undefined
    if (fp != nullptr) {
        fclose(fp);
        fp = nullptr;
    }
#endif
}