// PMsolver.cpp

#ifdef _WIN32
#include <QCoreApplication>
#include <QDebug>
#include <QDir>
#include <QSettings>
#include <QStandardPaths>
#endif

//#include <sys/param.h>
#include <iostream>
#include <sys/types.h>
#include <sstream>
#include <fstream>
#include <cmath>
#include <cstring>
#include <string>
#include <map>
#include <thread>
#include <condition_variable>
#include <mutex>
#include <ctime>
#include <chrono>
#include <ratio>
#include <list>
#include <vector>
#include <string_view>

#ifdef FSI_OPENMP
#include <omp.h>
#endif
#include <armadillo>  // linear algebra library

#include "pugixml.hpp"

#ifdef FSI_MPI
#include <mpi.h>
#endif

// Mathematical constant pi in double precision.
const double pi = 3.14159265358979323846;
// 30 degrees expressed in radians (30 * pi / 180).
const double th = 30.0 * pi / 180.0;
// Index aliases for the x, y and z components of 3-element vectors/arrays
// (used e.g. as tx[x], n[y], tin(z)). Note: these very short names live at file scope.
static const arma::uword x = 0, y = 1, z = 2;

#include <PMsolver.h>  // TODO must be consistent and perhaps shared with GUI
#include <fsi_thread.h>
#include "fsi_output.hpp"

//#pragma GCC diagnostic ignored "-Wunused-variable"
//#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#pragma GCC diagnostic ignored "-Wsign-compare"
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wpadded"  // ignore struct padding warnings

// standardize and simplify matrix object handling
// NOTE: these are textual macros. Several expansions (ENDCOL, ENDN3D, MATDIMCAT, BDIM, ...)
// are not wrapped in parentheses, so parenthesize them at the use site when they appear
// inside a larger arithmetic expression.
#define ZERO(arg) (arg).zeros()
// NOTE(review): ENDROW expands to an unterminated call expression "(var)((var).n_rows - 1";
// presumably the use site supplies the remaining index argument(s) and the closing ')'
// - confirm against call sites before changing.
#define ENDROW(var) (var)((var).n_rows - 1  // note: unusual looks - missing close paren
#define ENDCOL(var) (var).n_cols - 1
#define ENDN3D(var) (var).n_slices - 1
#define NROWS(matrix) (matrix).n_rows
#define NCOLS(matrix) (matrix).n_cols
#define N3D(matrix) (matrix).n_slices
#define MATDIM(m) (m).n_rows, (m).n_cols  // assign same dimensions
#define MATDIMCAT(m,n) (m).n_rows + (n).n_rows, (m).n_cols  // assign dimensions for concatenation
// COMP is a member-access suffix: write it after "object." (it expands to slice(j).unsafe_col(i)).
#define COMP(i, j) slice(j).unsafe_col(i)  // x,y,z components of an arma cube

// The size and dimension macros below read the identifiers m, n and nts from the
// scope in which they are expanded.
#define BSIZE (3 * (2 * m + 2) * (n + 1))  // number of elements in BXYZ
#define WSIZE (3 * nts * (n + 1))  // number of elements WXYZ
#define CSIZE (3 * 2 * m * n)  // number of elements CXYZ

// Comma-separated dimension lists (rows, cols, slices) matching the sizes above.
#define BDIM 3, 2 * m + 2, n + 1
#define WDIM 3, nts, n + 1
#define CDIM 3, 2 * m, n

// Both output macros require a variable named doc in scope; MATRIX_OUTATT also requires t.
#define MATRIX_OUT(x) matrix_out(doc, #x, x)  // same XML name as variable name
#define MATRIX_OUTATT(x) matrix_out(doc, #x, x, t)  // picks up time t

// settings following convention envvars prefaced with FSI_NAME_, default prefaced with default_
// All of these expect key_val_map, doc, argc and argv to be visible where they are expanded,
// plus FSI_NAME_<n> and default_<n> for the setting name <n> that is passed in.
// SETV yields the value, SET declares "const auto <n>", SETS fills an existing string <n>.
#define SETV(n) set_fsi_parm(FSI_NAME_##n, default_##n, key_val_map, doc, argc, argv)
#define SET(n) const auto n = SETV(n)
#define SETS(n) set_fsi_parm(n, FSI_NAME_##n, default_##n, key_val_map, doc, argc, argv)
// ATTR appends a new attribute to the <settings> element and yields it for assignment.
#define ATTR(varname) doc.child("settings").append_attribute(varname)
// ATTRVAL updates attribute att of <settings> if it exists, otherwise appends it.
// NOTE: it expands to several statements and is not wrapped in do { } while (0),
// so do not use it as the body of an unbraced if/else/for.
#define ATTRVAL(patt, att, val) \
        patt = doc.child("settings").attribute(att); \
        if (patt == nullptr) doc.child("settings").append_attribute(att) = val; \
        else patt.set_value(val)

// Relies on the using-directives for std and chrono that follow in this file.
#define SLEEP(secs)     this_thread::sleep_for(seconds(secs))

using namespace std;
using namespace arma;
using namespace chrono;

// Forward declarations of helpers defined outside this part of the file.

// Reads the file at path; main() stores the result as the mesh text
// (presumably the whole file contents - confirm against the definition).
string read_file(string_view path);

// Reads a NASTRAN mesh file; main() uses it to obtain m, n and the boundary points XYZ.
void read_NASTRAN(const string meshfile, cube &XYZ, unsigned int &nnodes, unsigned int &nelems,
                  int &m, int &n);

// Influence kernel operating on raw coordinate arrays; semantics are defined with its implementation.
void mexINFLUENCE(const double *x, const double *y, const double *z,
                  const double *xp, const double *yp, const double *zp,
                  double *AB, const double Miu,
                  const int nx, const int ny, const int t,
                  const int slice1 = -1, const int slice2 = -1);

typedef map<string,string> key_val_map_t;
// Stream overload: main() treats a false return as a parse error.
bool parse_ini(istream &, key_val_map_t &);
// File-name overload: main() treats -1 as "file not found" and 0 as a parse or open error.
int parse_ini(const char *, key_val_map_t &);
// Called by main() as has_opt(argv, argv + argc, "-d") to detect a command-line option.
bool has_opt(char** begin, char** end, const string& option);

// set_fsi_parm overloads: resolve one setting (int, double or string) from the key/value map,
// the XML settings node and the command line, given a default. They are normally invoked
// through the SETV / SET / SETS macros. Exact precedence is defined with the implementation.
int set_fsi_parm(const char *varname,
                 const int default_val,
                 key_val_map_t &key_val_map,
                 pugi::xml_node &doc,
                 int argc, char **argv);
double set_fsi_parm(const char *varname,
                 const double default_val,
                 key_val_map_t &key_val_map,
                 pugi::xml_node &doc,
                 int argc, char **argv);
void set_fsi_parm(string &outstring,
                  const char *varname,
                  const char *default_val,
                  key_val_map_t &key_val_map,
                  pugi::xml_node &doc,
                  int argc, char **argv);

#ifdef Q_OS_WIN32
// Defined elsewhere; main() stores the selected settings path here on win32.
extern string Xsettings_file;
#endif

// Globals below are declared extern and then defined, giving them external linkage
// so other translation units can refer to them.
extern vector<double **> tandem_output;
vector<double **> tandem_output;

extern int min_workinc, max_workinc;
#define MIN_WORKINC 99999999
// min_workinc starts at a large sentinel and max_workinc at 0.
int min_workinc = MIN_WORKINC, max_workinc = 0;

extern condition_variable cv_parent;

// Path of the settings file, filled in by main() via SETS(settings).
static string settings;

// threading
static mutex mtx;  // used in parent thread to await launching of children
extern mutex parent_mtx;
mutex parent_mtx;  // held by mex_parent(); paired with cv_parent
static condition_variable cv_child;  // set by parent thread to awaken children
condition_variable cv_parent;  // set by last child thread to awaken parent

#ifndef FSI_OPENMP
// Task currently published to the child threads: set to the active mex_thread object in
// mex_parent() and reset to nullptr when that task has finished.
static atomic<mex_thread *>pmex = nullptr;
// Incremented once per published task so a child can tell a new task from one it already ran.
// NOTE: plain int, not atomic.
static int pmex_id = 0;  // identifier for synchronization
#endif

// Checked by child threads after waking; when true they return from mex_thread_main().
static bool program_ending = false;

extern int world_rank;
int world_rank = -1;  // default signifies no MPI

#ifdef FSI_STATS
timecheck mex_time;  // manages stop watch

#ifdef FSI_OPENMP
map<const int,        mex_stat_map_t *> thread_stat_map;
#else
map<const thread::id, mex_stat_map_t *> thread_stat_map;
#endif
// mapping: threads[id] -> pmex -> mex_stat
// so each thread has stats per mex routine
#endif

/*!
 * rotate - apply the matrix ROT to every consecutive (x, y, z) triple of a cube
 *
 * The elements of `in` are consumed three at a time in storage order; each triple is
 * multiplied by ROT and the product is stored at the same position in `out`.
 * `out` must hold the same number of elements as `in` (asserted).
 */
inline void rotate(cube &out, const mat &ROT, const cube &in) {
    assert(out.size() == in.size());
    vec::fixed<3> point, rotated;
    cube::const_iterator src = in.begin();
    const cube::const_iterator src_end = in.end();
    cube::iterator dst = out.begin();
    while (src != src_end) {
        // gather one point
        point(x) = src[x];
        point(y) = src[y];
        point(z) = src[z];
        src += 3;
        rotated = ROT * point;
        // scatter the rotated point
        dst[x] = rotated(x);
        dst[y] = rotated(y);
        dst[z] = rotated(z);
        dst += 3;
    }
}

/*!
 * dot_entire - element-wise dot product of two cubes of (x, y, z) triples
 *
 * For every consecutive triple of `in` and the matching triple of `in1`, the dot product
 * is written to the next element of `out` in storage order. `in` and `in1` must have the
 * same number of elements (asserted); `out` receives one value per triple.
 */
inline void dot_entire(mat &out, const cube &in, const cube &in1) {
    assert(in.size() == in1.size());
    cube::const_iterator lhs = in.begin();
    cube::const_iterator rhs = in1.begin();
    const cube::const_iterator lhs_end = in.end();
    cube::iterator dst = out.begin();
    while (lhs != lhs_end) {
        *dst = lhs[x] * rhs[x];
        *dst += lhs[y] * rhs[y];
        *dst += lhs[z] * rhs[z];
        lhs += 3;
        rhs += 3;
        ++dst;
    }
}

/*!
 * dot_entire_xyz - three dot products per point against a common vector field
 *
 * For every consecutive (x, y, z) triple v of `in1`, the matching triples of `ix`, `iy`
 * and `iz` are each dotted with v; the three results are stored, in that order, as the
 * next three elements of `out`. All four input cubes must have the same number of
 * elements (asserted).
 */
inline void dot_entire_xyz(cube &out, const cube &ix, const cube &iy, const cube &iz, const cube &in1) {
    assert(ix.size() == in1.size());
    assert(iy.size() == in1.size());
    assert(iz.size() == in1.size());
    cube::const_iterator rowx = ix.begin();
    cube::const_iterator rowy = iy.begin();
    cube::const_iterator rowz = iz.begin();
    cube::const_iterator v = in1.begin();
    const cube::const_iterator rowx_end = ix.end();
    cube::iterator dst = out.begin();
    while (rowx != rowx_end) {
        // ix . v
        dst[x] = rowx[x] * v[x];
        dst[x] += rowx[y] * v[y];
        dst[x] += rowx[z] * v[z];
        // iy . v
        dst[y] = rowy[x] * v[x];
        dst[y] += rowy[y] * v[y];
        dst[y] += rowy[z] * v[z];
        // iz . v
        dst[z] = rowz[x] * v[x];
        dst[z] += rowz[y] * v[y];
        dst[z] += rowz[z] * v[z];
        // advance every cursor to the next triple
        rowx += 3;
        rowy += 3;
        rowz += 3;
        v += 3;
        dst += 3;
        assert(dst <= out.end());
    }
}

/*!
 * vect computes vectors: normal, chord-wise tangent, span-wise tangent
 *
 * Inputs are the coordinates of four corner points (x1,y1,z1) .. (x4,y4,z4).
 * Outputs are written to three caller-supplied arrays of at least 3 doubles each:
 *   tx - unit vector along ((p4 + p3) - (p1 + p2)) / 2
 *   n  - unit vector along tx x b, where b is the unit vector from p1 to p2
 *   ty - n x tx (not re-normalized)
 *
 * returns area (see the review note at the area calculation below)
 *
 * No guard exists against zero-length vectors: coincident points lead to a division by
 * zero in the normalizations.
 *
 * MATLAB: function [n, tx, ty, s]=VECT(x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4)
 */
double vect(const double x1, const double y1, const double z1,
            const double x2, const double y2, const double z2,
            const double x3, const double y3, const double z3,
            const double x4, const double y4, double const z4,
            double *n, // normal vector
            double *tx, // chord-wise tangential vector
            double *ty // span-wise tangential vector
           ) {
    // calculating chordwise tangent
    //double d4 = sqrt(pow(x1-x4, 2) + pow(y1-y4, 2)+pow(z1-z4, 2));
    // A = midpoint of edge p3-p4 minus midpoint of edge p1-p2; AA is its length
    const double A1 = ((x4 + x3) - (x1 + x2)) / 2,
                 A2 = ((y4 + y3) - (y1 + y2)) / 2,
                 A3 = ((z4 + z3) - (z1 + z2)) / 2,
                 AA = sqrt(A1 * A1 + A2 * A2 + A3 * A3);
    tx[x] = A1 / AA;
    tx[y] = A2 / AA;
    tx[z] = A3 / AA;
    // next vector in this plane: edge from p1 to p2 (b1..b3), bb is its length
    const double b1 = x2 - x1,
                 b2 = y2 - y1,
                 b3 = z2 - z1,
                 bb = sqrt(b1 * b1 + b2 * b2 + b3 * b3);
    // d4 = sqrt(pow(x1-x4, 2) + pow(y1-y4, 2) + pow(z1-z4, 2));
    const double b[] = {b1 / bb, b2 / bb, b3 / bb};
    // normal vector: normalized cross product tx x b
    const double v1 = tx[y] * b[z] - tx[z] * b[y],
                 v2 = b[x] * tx[z] - tx[x] * b[z],
                 v3 = tx[x] * b[y] - tx[y] * b[x],
                 vv = sqrt(v1 * v1 + v2 * v2 + v3 * v3);
    n[x] = v1 / vv;
    n[y] = v2 / vv;
    n[z] = v3 / vv;
    // tangential vector in spanwise direction: n x tx
    ty[x] = n[y] * tx[z] - n[z] * tx[y];
    ty[y] = tx[x] * n[z] - tx[z] * n[x];
    ty[z] = n[x] * tx[y] - n[y] * tx[x];
    // % tt=sqrt(ty(1)^2+ty(2)^2+ty(3)^2);
    // % ty(1)= ty(1)/tt;
    // % ty(2) = ty(2)/tt;
    // % ty(3) = ty(3)/tt;

    // calculation of area
    // e = p3 - p1, f = p2 - p1
    // NOTE(review): f1..f3 are computed by the same expressions as b1..b3 (both are p2 - p1),
    // so s11..s13 form the cross product of a vector with itself, and the returned value
    // reduces to 0.5 * |(p2 - p1) x (p3 - p1)|; p4 does not enter the area.
    // Confirm against the original MATLAB VECT whether a different vector was intended.
    const double e1 = x3 - x1,
                 e2 = y3 - y1,
                 e3 = z3 - z1,
                 f1 = x2 - x1,
                 f2 = y2 - y1,
                 f3 = z2 - z1;
    // normal area: s1 = f x b, s2 = b x e (b here is the un-normalized edge b1..b3)
    const double s11 = f2 * b3 - f3 * b2,
                 s12 = b1 * f3 - f1 * b3,
                 s13 = f1 * b2 - f2 * b1,
                 s21 = b2 * e3 - b3 * e2,
                 s22 = e1 * b3 - b1 * e3,
                 s23 = b1 * e2 - b2 * e1;
    return 0.5 * (sqrt(s11 * s11 + s12 * s12 + s13 * s13) +
                  sqrt(s21 * s21 + s22 * s22 + s23 * s23));
}

/*!
 * lspace(vec, start, end)
 * variation of MATLAB linspace
 *
 * Fills the already-sized vector p with NROWS(p) evenly spaced values from x1 to x2.
 *
 * \param p   vector to fill; its current length determines the number of points
 * \param x1  value stored in the first element
 * \param x2  value stored in the last element (when p has at least two elements)
 *
 * Each interior point is computed directly as x1 + i * step instead of by repeatedly
 * adding the step, so rounding error does not accumulate along the vector, and the last
 * element is set to x2 exactly. An empty vector is left untouched; a one-element vector
 * receives x1, which avoids the division by zero in the step computation.
 */
void lspace(vec &p, const double x1, const double x2) {
    const size_t n = NROWS(p);
    if (n == 0) return;
    p[0] = x1;
    if (n == 1) return;
    const double step = (x2 - x1) / static_cast<double>(n - 1);
    for (size_t i = 1; i + 1 < n; i++) {
        p[i] = x1 + static_cast<double>(i) * step;
    }
    p[n - 1] = x2;  // pin the end point exactly
}

/*!
 * flipdim - copy src into dst with rows (dim == 1) or columns (dim == 2) reversed
 *
 * \param dst    destination; must have the same number of columns as src and enough rows
 * \param src    source matrix
 * \param dim    1 reverses the row order, 2 reverses the column order
 * \param torow  when non-zero, only the first torow rows of src are processed
 *               (and, for dim == 1, reversed within those torow rows); 0 means all rows
 */
void flipdim(mat &dst, const mat &src, const int dim,
             const size_t torow = 0) {

    const size_t ncols = src.n_cols;
    const size_t nrows = (torow != 0) ? torow : src.n_rows;

    assert(dim == 1 || dim == 2);
    assert(torow == 0 || NROWS(dst) >= nrows);
    assert(torow != 0 || NROWS(dst) >= NROWS(src));
    assert(NCOLS(dst) == NCOLS(src));

    switch (dim) {
    case 1:  // flip rows
        for (size_t r = 0; r < nrows; ++r) {
            const size_t mirrored = nrows - r - 1;
            for (size_t c = 0; c < ncols; ++c) {
                dst(mirrored, c) = src(r, c);
            }
        }
        break;
    case 2:  // flip columns
        for (size_t r = 0; r < nrows; ++r) {
            for (size_t c = 0; c < ncols; ++c) {
                dst(r, ncols - c - 1) = src(r, c);
            }
        }
        break;
    default:  // any other value leaves dst untouched
        break;
    }
}

// MACROVAR turns its argument into a string literal; NANerr(var) reports and aborts when
// the matrix-like object var contains a NaN (it expects a time step named t in scope).
#define MACROVAR(param) #param
#define NANerr(var) if (var.has_nan()) NANerror(t, MACROVAR(var))
/*!
 * NANerror - report a NaN in a named variable at time step t on stderr, then terminate
 * the program with exit status 1. Does not return.
 */
void NANerror(const int t, const string &var) {
    std::cerr << "ERROR:Solver computational error: calculated a NaN (not a number) for variable " << var;
    std::cerr << " at time " << t << " ." << std::endl;
    exit(1);
}

/*!
 * \brief mex_thread::mex_parent -  Awakens and synchronizes POSIX child threads
 * for MPI and POSIX thread types
 *
 * Called by main thread for each task once at each time step. Exists only in base class.
 *
 * The mex_thread object must be re-initialized by its `mex_thread_init()` each time step.
 * mex_thread_init() always calls this after its init is complete at time step for MPI and POSIX
 *
 * Sequence implemented below:
 *  1. take parent_mtx and start the mex stop watch,
 *  2. publish this object as the current task (pmex_id is bumped, pmex = this),
 *  3. take mtx, notify every thread waiting on cv_child, release mtx,
 *  4. if has_unfinished_workers() reports outstanding work, block on cv_parent,
 *  5. clear pmex and stop the stop watch.
 *
 * Only compiled when FSI_OPENMP is not defined.
 */
#ifndef FSI_OPENMP
void mex_thread::mex_parent() {

    // held for the whole call; released only while blocked in cv_parent.wait()
    unique_lock<mutex> parent_lck(parent_mtx);

    TSTART(mex_time, steady_clock::now());  // start mex timing

    // new task
    // NOTE: both are written before mtx is taken; pmex is atomic, pmex_id is a plain int
    pmex_id++;  // don't mistake this run for previous run
    pmex = this;  // new task now available

    // notify while holding mtx so that a child which has already tested the task state
    // under mtx is inside cv_child.wait() by the time the notification is issued
    unique_lock<mutex> lck(mtx);
    try {
        cv_child.notify_all();
    }
    catch (exception& e) {
        std::cerr << "exception caught when cv_child.notify_all() in mex_parent() "
                "Exception: " << e.what() << '\n';
    }
    lck.unlock();  // allow remaining children to continue

    // wait for last worker to finish
    // NOTE(review): the condition is tested once and wait() is called without a predicate,
    // so a spurious wake-up would end the wait while workers may still be running.
    // Whether re-testing has_unfinished_workers() in a loop is safe depends on its
    // implementation, which is not visible here.
    if (has_unfinished_workers()) {
        try {
            cv_parent.wait(parent_lck);
        }
        catch (system_error& e) {
            std::cerr << "system_error exception caught when launching cv_parent.wait() "
                    "Exception: " << e.what() << '\n';
        }
    }
    pmex = nullptr;  // task finished - no active task
    TSTOP(mex_time, steady_clock::now());  // stop mex timing and add elapsed
}

/*!
 * \brief mex_thread - main POSIX thread routine for all mex child threads
 *
 * Assumes that pmex will contain the address of the mex object to run
 *   and that the pmex object will already be initialized for the specifics of the run.
 *
 * This is the main entry point called by each POSIX child thread at startup. It
 * - waits for work to do and
 * - calls the mex_thread_run() method from the subclass
 *
 * The loop only ends when program_ending is found set after a wake-up.
 *
 * \param partner_rank  value passed through unchanged to mex_thread_run(); main() starts
 *                      thread i with i + 1 ("rank of MPI tandem partner")
 *
 * Not used with the OpenMP thread model
 */
void mex_thread_main(int partner_rank) {

   // id of the last task this thread ran; compared with pmex_id to detect a new task
   int thread_pmex_id = -1; // prevent dups, won't block 1st time if task ready

    for (;;) {
#ifdef FSI_STATS
        // start of this iteration, including time spent waiting for work
        const steady_clock::time_point start = steady_clock::now();
#endif
        unique_lock<mutex> lck(mtx);
        // don't wait if next task is already available
        //   and don't mistake this task for task already completed
        if (pmex == nullptr || pmex_id == thread_pmex_id)  {
            try {
                // NOTE(review): wait() has no predicate and the condition above is not
                // re-tested afterwards, so a spurious wake-up falls through to the code below.
                cv_child.wait(lck);  // generic child thread waits for work to do
            }
            catch (exception& e) {
                std::cerr << "exception caught when cv_child.wait() in mex_thread_main() "
                        "Exception: " << e.what() << '\n';
            }
        }
        // when told to proceed by main thread, all data must already be loaded for mex run
        lck.unlock(); // allows other threads to continue
        // NOTE(review): program_ending is a plain bool read here without holding a lock
        if (program_ending) return;
        thread_pmex_id = pmex_id;  // remember which task is being run
        assert(pmex != nullptr);
#ifdef FSI_STATS
        // keep the task pointer for the statistics lookup below, since pmex may be cleared
        mex_thread *local_pmex = pmex;
#endif
        // NOTE(review): pmex is loaded again here, separately from the assert above
        (*pmex).mex_thread_run(partner_rank);
        // note: when the last thread is finishing, pmex will be cleared
#ifdef FSI_STATS
        // statistics
        // per-thread, per-task record: count the call and add the elapsed time of this iteration
        mex_stat *pmex_stat = (*thread_stat_map[this_thread::get_id()])[local_pmex];
        pmex_stat->call_count++;
        pmex_stat->elapsed += duration_cast<mex_thread_time_units_t>(steady_clock::now() - start);
#endif
    }  // forever
}
#endif

// INFLUENCE - only sig
// SOURCEVEL - mat sig1(2 * m, n);
// wingwakeinf -   mat MUEA1(2 * m, n); mat MUEW1doubled(nts, n);
// WAKEINFLUENCE - mat MUEW1(nts, n);

#ifdef FSI_MPI
/*!
 * \brief compute_main - main MPI compute logic on remote machine
 * - posts MPI read, decodes response, determining calculation type and invoking compute() method
 * of corresponding type
 * - compute method calculates and sends data to corresponding POSIX thread on host system.
 * - in lock step with mex_thread_run() on host system, MPI rank == POSIX thread index-1
 *
 * Every message is probed from rank 0. The tag is split with MEX_MASK into the work type
 * (mex_type) and the remaining bits (ord). The function returns when ord == do_finalize
 * or when the local time step counter t reaches nts; an unknown work type calls exit(1).
 *
 * NOTE(review): the buffers allocated with new[] (BXYZ, precv) and the lazily created
 * *_thread objects are never released on any return path of this function.
 *
 * \param m           chord-wise panel count; used for buffer sizes and passed on as 2 * m
 * \param n           span-wise panel count; used for buffer sizes and passed on
 * \param nts         number of time steps; also the limit for the local step counter t
 * \param nthreads    passed through to the *_thread constructors
 * \param workinc     passed through to the *_thread constructors
 * \param workfactor  passed through to the sourcevel and wingwakeinf constructors
 */
void compute_main(const int m, const int n, const int nts,
                  const int nthreads, const int workinc, const double workfactor) {
    //std::cerr << "compute_main: m=" << m << " n=" << n << " nts=" << nts << " nthreads=" << nthreads << " workinc=" << workinc << " workfactor=" << workfactor << std::endl;

    // worker objects, one per work type, created on first use
    influence_thread *pinfluence = nullptr;
    sourcevel_thread *psourcevel = nullptr;
    wingwakeinf_thread *pwingwakeinf = nullptr;
    wakeinfluence_thread *pwakeinfluence = nullptr;
    double *BXYZ = nullptr, *CXYZ = nullptr, *WXYZ = nullptr, *precv = nullptr;
    // cube *pXYZ = cube(WXYZ, WDIM, false, false);

    int t = -1;  // local time step counter; incremented each time a grid update arrives
    const int n_elem_all_grids = BSIZE + CSIZE + WSIZE,  // total size of all grids that are broadcast
              max_recv_size = 2 * m * n + nts * n;  // wingwakeinf coefficient arrays will always be the biggest
    // one contiguous allocation holds the three grids back to back: B, then C, then W
    BXYZ = new double[n_elem_all_grids];
    CXYZ = &BXYZ[BSIZE];
    WXYZ = &CXYZ[CSIZE];
    precv = new double[max_recv_size];
    for (;;) {
        // look at the next message from rank 0 without receiving it yet
        MPI_Status recv_status;
        int rstat = MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD, &recv_status);
        if (rstat != MPI_SUCCESS) std::cerr << "MPI_Probe receive stat:" << rstat << std::endl;
        assert (rstat == MPI_SUCCESS);

        //mat MUEA1(2 * m, n);
        //mat MUEW1doubled(nts, n);
        //mat sig = mat (precv, 2 * m, n, false, true);
        //mat sig2 = mat (precv + sig.n_elem, nts, n, false, true);
        // WAKEINFLUENCE - mat MUEW1(nts, n);
        //mat sig3 = mat (precv, nts, n, false, true);

        // the tag carries the work type in the MEX_MASK bits and an order value in the rest
        const unsigned int mex_type = recv_status.MPI_TAG & MEX_MASK,
                           ord = recv_status.MPI_TAG & ~MEX_MASK;
        if (ord == do_finalize) {
            // consume the (zero-length) finalize message before leaving
            rstat = MPI_Recv(precv, 0, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            assert(rstat == MPI_SUCCESS);
            return;  // and finalize and exit
        }
        //std::cerr << "compute_main:received rank:" << rank << " mex type:" << mex_type << " ord:" << ord << std::endl;
        // every type except INFLUENCE delivers its coefficients into precv here;
        // INFLUENCE decides below, from the probed count, which buffer to receive into
        if (mex_type != influence_mask) {
            //std::cerr << "compute_main:calling MPI_Recv rank:" << world_rank << " precv=" << precv << " max_recv_size=" << max_recv_size << " *precv=" << *precv <<std::endl;
            rstat = MPI_Recv(precv, max_recv_size, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            assert(rstat == MPI_SUCCESS);
        }
        int count;
        switch (mex_type) {
        case influence_mask:
            // an INFLUENCE message is either empty or carries all three grids
            MPI_Get_count(&recv_status, MPI_DOUBLE, &count);
            assert(count == 0 || count == n_elem_all_grids);
            if (count > 0) {
                // grid update: marks the start of the next time step
                t++;
                //std::cerr << "compute_main t=" << t << " :calling INFLUENCE MPI_Recv for grids, rank:" << world_rank << " ord=" << ord << std::endl;
                rstat = MPI_Recv(BXYZ, n_elem_all_grids, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                assert(rstat == MPI_SUCCESS);
                if (ord == time_pulse) break;  // only time pulse and receive grid point updates - no reply
            } else {
                //std::cerr << "compute_main:calling INFLUENCE MPI_Recv rank:" << world_rank <<std::endl;
                rstat = MPI_Recv(precv, max_recv_size, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                assert(rstat == MPI_SUCCESS);
            }
            if (t >= nts) {
                //std::cerr << "compute_main: t reaches nts, so returning" << std::endl;
                return;
            }
            if (pinfluence == nullptr) {
                //std::cerr << "compute_main: allocating INFLUENCE object compute for t=" << t << " rank=" << world_rank << std::endl;
                pinfluence = new influence_thread("INFLUENCE", nthreads, 2 * m, n, CXYZ, BXYZ, workinc, 0 /* unused */);
                // maybe mex_compute_init
            }
            //std::cerr << "compute_main: calling INFLUENCE compute for t=" << t << " rank=" << world_rank<<
            //        " tag:" << ord << std::endl;
            pinfluence->compute(ord, 1.0); // just pass 1 coefficient
            //std::cerr << "compute_main: returned from  INFLUENCE compute for t=" << t << " rank=" << world_rank<< std::endl;
            break;
        case sourcevel_mask:
            if (psourcevel == nullptr) {
                psourcevel = new sourcevel_thread("SOURCEVEL", nthreads, nts, 2 * m, n,
                                                  WXYZ, BXYZ, workinc, workfactor);
            }
            // TODO - are all these coefficients always different? If not, we can save some transmission buffering
            //std::cerr << "compute_main: calling SOURCEVEL compute for t=" << t << " rank=" << world_rank <<
            //        " tag:" << recv_status.MPI_TAG << std::endl;
            psourcevel->compute(ord, precv, t);
            break;
        case wingwakeinf_mask:
            if (pwingwakeinf == nullptr) {
                pwingwakeinf = new wingwakeinf_thread ("wingwakeinf", nthreads, nts, 2 * m, n,
                                                       WXYZ, BXYZ, workinc, workfactor);
            }
            //outd("MUEA1check", sig, t);
            //outd("MUEW1doubledcheck", sig2, t);
            //std::cerr << "compute_main: calling wingwakeinf compute for t=" << t << " rank=" << world_rank << std::endl;
            pwingwakeinf->compute(ord, precv, t);
            break;
        case wakeinfluence_mask:
            if (pwakeinfluence == nullptr) {
                pwakeinfluence = new wakeinfluence_thread ("WAKEINFLUENCE", nthreads, nts, 2 * m, n,
                         CXYZ,
                         //&WXYZ(0, 1, 0),
                         // #define WDIM 3, nts, n + 1
                         //&WXYZ[3 * nts],  // TODO check: must skip 1st column


                         // the wake grid is handed over starting 3 doubles past its beginning
                         &WXYZ[3],  // TODO check: must skip 1st column
                         workinc);  // skip 1st row chord-wise
            }
            // TODO check offsets
            //wakeinfluence_thread oWAKEINFLUENCE("WAKEINFLUENCE", nthreads, nts, 2 * m, n,
            //             CXYZ(0).memptr(),
            //             &WXYZ(0)(0, 1, 0), workinc);  // skip 1st row chord-wise
            // and
            // oWAKEINFLUENCE.mex_thread_init(t, &MUEW1(1, 0), C1.memptr());
            // mat MUEW1(nts, n); nts is low order, so (1,0) is offset by nts from start of sig
            //outd("MUEW1check", sig3, t);
            //std::cerr << "compute_main: calling WAKEINFLUENCE compute for t=" << t << std::endl;
            //pwakeinfluence->compute(recv_status.MPI_TAG & ~MEX_MASK, &precv[nts], t);
            pwakeinfluence->compute(ord, precv, t);
            break;
        default:
            // unknown work type: report the raw tag and terminate this process
            std::cerr << "Error in received data - cannot determine work type from " << recv_status.MPI_TAG << std::endl;
            exit(1);
        }
    }  // forever
}
#endif

int main(int argc, char **argv) {

    const steady_clock::time_point start_time = steady_clock::now();

    time_t ctime_start_time;
    time(&ctime_start_time);

    int world_size = 0;
#ifdef FSI_MPI
    {
        int provided;
        int mpi_init_status = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
        if (mpi_init_status != MPI_SUCCESS) {
            std::cerr << "ERROR: MPI_Init_thread fails with status=" << mpi_init_status << std::endl;
            assert(mpi_init_status == MPI_SUCCESS);
            exit(1);
        }
        assert(provided == MPI_THREAD_MULTIPLE);
        if (mpi_init_status == MPI_SUCCESS) {
            // Get the number of processes
            MPI_Comm_size(MPI_COMM_WORLD, &world_size);
            // Get the rank of the process
            MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
            std::cerr << "started for rank " << world_rank << " PID:" << getpid() << std::endl;
            if (MPI_TAG_UB != 0 && MPI_TAG_UB < wakeinfluence_mask) {
                // Get the name of the processor
                char processor_name[MPI_MAX_PROCESSOR_NAME + 1];
                bzero(processor_name, sizeof(processor_name));
                int name_len;
                MPI_Get_processor_name(processor_name, &name_len);
                std::cerr << "PLATFORM ERROR: for processor " << processor_name <<
                        ", maximum tag size (" << MPI_TAG_UB <<
                        ") for this platform is not compatible with this program (" <<
                        argv[0] <<
                        "). Report this to developers along with system architecture and word size." <<
                        std::endl;
            }
        }
    }
#endif

    pugi::xml_document doctop;
    // validators appear to insist on an element acting as an envelope, however useless
    pugi::xml_node doc = doctop.append_child("data");
    assert(doc != nullptr);

    doc.append_child("settings");

    key_val_map_t key_val_map;
    const bool daemon_mode = (argc > 0 && has_opt(argv, argv + argc, "-d"));
    if (daemon_mode) {  // take settings from standard inpout
        std::cerr << "Running in daemon mode." << std::endl;
        if (!parse_ini(cin, key_val_map)) {
            std::cerr << "parse ini error from cin" << std::endl;
            return 1;
        }
    } else {
        //TODO this logic is broken - user should still use settings file if command line
        //  arguments - the existing code should give precedence to command line args
        // parse_ini should be called regardless of whether command line args exist
        if (argc < 2) {  // no command line arguments
            // find user settings
#ifdef Q_OS_WIN32
            // TODO is this just a hack to get through init without a settings file in win32?
            string string_default_settings(QDir::homePath().toStdString() + "/fsisettings.ini");
#else
            string string_default_settings = getenv("HOME");
            string_default_settings += SETTINGS_FILE_PATH SETTINGS_FILE_NAME;
#endif
            const char *default_settings = string_default_settings.c_str();
            // take settings path from command line or environment
            //cerr << "default settings:" << default_settings << std::endl;
            SETS(settings);
#ifndef Q_OS_WIN32
            cerr << "Discovered settings:" << settings << std::endl;
            //TODO still makes no sense for WIN32 platform
            if (default_settings == settings)  // no settings path found anywhere
                // for win32, settings are done through windows registry, so inhibit warning here
                cerr << "Standard settings file " << settings << " will be used." << std::endl;
            else
                cerr << "Settings file " << settings << " selected." << std::endl;
#endif
            // populate our key/value map according to the settings file
            {
                int ret = parse_ini(settings.c_str(), key_val_map);
                if (ret == -1) {
                    if (settings == default_settings) {
                        std::cerr << "No settings file found at " << settings << ": factory default settings will be used." << std::endl;
                    } else {
                        std::cerr << "No settings file found at " << settings << ": solver will terminate." << std::endl;
                        return 1;
                    }
                } else if (ret == 0) {
                        std::cerr << "Parsing or open error in settings file: " << settings << std::endl;
                        return 1;
                }
            }
#ifdef Q_OS_WIN32
            Xsettings_file = settings;  // set globally for win32
#endif
            std::cerr << FSI_NAME_settings " is " << settings << std::endl;
        }
    }

    string datafilepath;
#ifdef _WIN32
    //string string_default_datafilepath = QDir::homePath().toStdString();
    QString qstring_default_datafilepath;
    QStandardPaths::locate(QStandardPaths::AppDataLocation,
                           qstring_default_datafilepath,
                           QStandardPaths::LocateDirectory);
    std::cerr << "home loc: " << QDir::homePath().toStdString() << std::endl;
    {
        auto locs = QStandardPaths::standardLocations(QStandardPaths::AppLocalDataLocation);
        qstring_default_datafilepath = locs[0];
        //qstring_default_datafilepath = QDir::homePath() + "/AppData/Roaming/itcas/";
    }
    string string_default_datafilepath = qstring_default_datafilepath.toStdString();
    string_default_datafilepath += "/itcas/";
#else
    string string_default_datafilepath = getenv("HOME");
    string_default_datafilepath += SETTINGS_FILE_PATH;
#endif
    string_default_datafilepath += DATA_FILE_NAME;
    const char *default_datafilepath = string_default_datafilepath.c_str();
    SETS(datafilepath);  // location of output file
    string NASTRANpath;
    const char *default_NASTRANpath = "";
    SETS(NASTRANpath);  // location of output file
    //const double alfa = 0.0;           // Angle of Attack (historical)
    //const double AOA = alfa * pi / 180.0;    // Angle of Attack radians (historical)
    // input angular mesh adjustments and convert to radians (conversion automated)
    SET(pitch);  // pitch angle
    SET(AOA);  // Angle of Attack
    SET(roll);  // roll angle of the wing
    SET(sweep);  // Sweep angle of the wing
    std::cerr << "AOA=" << AOA << " roll=" << roll << " sweep=" << sweep << std::endl;
    const mat::fixed<3, 3> RotX1 = {{1,          0,           0},
                                    {0,          cos(roll),  -sin(roll)},
                                    {0,          sin(roll),  cos(roll)}},
                           RotY1 = {{cos(pitch), 0,           -sin(pitch)},
                                    {0,          1,           0},
                                    {sin(pitch), 0,           cos(pitch)}},
                           RotZ1 = {{cos(sweep), -sin(sweep), 0},
                                    {sin(sweep), cos(sweep),  0},
                                    {0,          0,           1}};

    // Display each command-line argument.
    if (world_rank <= 0) {
        std::cerr << FSI_NAME_datafilepath " is " << datafilepath << std::endl;
        std::cerr << "Command-line arguments (" << argc << "): ";
        for (int count = 0; count < argc; count++)
             std::cerr << argv[count] << " ";
             //std::cerr << "  argv[" << count << "]   " << argv[count];
        std::cerr << std::endl;
#ifndef _WIN32
        extern void write_test(const string);  // try to create path and test file and exit if failure
        write_test(datafilepath);
#endif
    }
    int m_mesh = SETV(m), n_mesh = SETV(n);  // first assign m,n accordding to regular input
    cube XYZ(3, 2 * m_mesh + 2, n_mesh + 1);  // boundary points
    string mesh_str;
    unsigned int nnodes = 0, nelems = 0;
    if (NASTRANpath.length() > 0) {  // read m, n, boundary points from NASTRAN file
        read_NASTRAN(NASTRANpath, XYZ, nnodes, nelems, m_mesh, n_mesh);
        mesh_str = read_file(NASTRANpath);
        pugi::xml_attribute patt;
        ATTRVAL(patt, "m", m_mesh);  // if attribute already assigned, update, else append
        ATTRVAL(patt, "n", n_mesh);
        doc.child("settings").attribute("n").set_value(n_mesh);
    }
    const int m = m_mesh,  // chord-wise panels
              n = n_mesh;  // span-wise panels
    const uword mx2 = m * 2;
    // defining global constants and wing geometry
    SET(taper);
    //%AR=5;                    // Aspect ratio of the wing
    SET(c);  // Root chord length in m
    SET(rho);  // Air density kg/m^3
    SET(twist);  // wing linear twist
    SET(span);  // Wing span
    SET(T);  // wing thickness(t),
    SET(p);  // maximum thickness from cord(p)
    SET(M);  // mean camber(m)
    SET(nts); // Number of time steps
    //const double Pinf = 101325.0;      // Free stream pressure in pasacal
    SET(Q);  // freestream velocity
    //const double Mach = Q / 343.0;        // freestream mach number
    SET(dxw);  // wake shedding distance
    //const double xf = 0.4 * c;           //  Distance from leading edge to pitching axis in m.
    const double default_ydist = span * 0.2;  //  Distance of wing root from flapping axis. in m.
    SET(ydist);  //  Distance of wing root from flapping axis. in m.
    SET(Omega_rpm);  //  rotation speed in rpm
    const double default_omegax = Omega_rpm / 60.0 * 2.0 * pi;  // rotation speed in rad/s
    SET(omegax);  // rotation speed in rad/s
    SET(tspc);  // Number of time steps per cycle
    const double default_ts = 60.0 / Omega_rpm / tspc;  // Time step
    SET(ts);  // Time step
    //const int ti = 0:ts:(nts-1)*ts;// time vector
    SET(workinc);
    SET(workfactor);
    SET(nblades); // Number of blades

    std::cerr << std::endl << "m=" << m << " n=" << n << " nts=" << nts << std::endl;
    std::cerr << "nastran path=" << NASTRANpath << std::endl;

    // TODO: putting the nastran read after the following statement results in errors in the read
    // number of CPU threads to consume
#ifdef FSI_MPI
    if (world_size <= 1) {
        std::cerr << "ERROR: world size < 2 or solver was not run under MPI" << std::endl;
        assert(world_size > 1);
        exit(1);
    }
    const int nthreads = world_size - 1;
    if (world_rank > 0) {
        compute_main(m, n, nts, nthreads, workinc, workfactor);
        MPI_Finalize();
        return (0);
    }
    ATTR("nthreads") = to_string(nthreads).c_str();  // #threads came from MPI, so file setting manually
#else
#ifdef _WIN32
    // returns 0 when not able to detect number of concurrent threads in hardware
    const int default_nthreads = (thread::hardware_concurrency() == 0 ? 3 : thread::hardware_concurrency() - 1);
#else
    //const int default_nthreads = 13;
    const int default_nthreads = (thread::hardware_concurrency() > 1 ? thread::hardware_concurrency() - 1 : 1);
#endif
    SET(nthreads);
    world_size = nthreads + 1;
#endif

    std::cerr << std::endl << "m=" << m << " n=" << n << " nts=" << nts << " nthreads=" << nthreads << " workinc=" << workinc << " workfactor=" << workfactor << std::endl;

    ATTR("program") = argv[0];

#ifdef FSI_OPENMP
#ifdef FSI_STATS
    for (int ithread = 0; ithread < nthreads; ithread++) {
        thread_stat_map[ithread] = new mex_stat_map_t;  // stats indexed first by thread
    }
#endif
    std::cerr << nthreads << " OpenMP threads will be requested.\n";
#else
    // spawn POSIX threads for MPI and POSIX-only thread model
    list<thread> all_threads;
    for (int i = 0; i < nthreads; ++i) {
        all_threads.push_back(thread(mex_thread_main, i + 1));  // rank of MPI tandem partner
    }
    for (auto &it: all_threads) {
#ifdef FSI_STATS
        thread_stat_map[it.get_id()] = new mex_stat_map_t;  // stats indexed first by thread
        //cout << "thread_stat_map " << it.get_id() << " =" << thread_stat_map[it.get_id()] << std::endl;
#else
        it.detach();  // no need to get thread ID later, so detach now
#endif
    }
#ifdef FSI_MPI
    std::cerr << nthreads << " POSIX threads started to pair with MPI threads.\n";
#else
    std::cerr << nthreads << " POSIX threads started.\n";
#endif
#endif
    std::cerr.precision(8);

    size_t i, j;
    vec ti(nts);
    lspace(ti, 0, ts * (nts - 1));

        //QString checked_val = qsettings.value(checked_var).toString();
    if (NASTRANpath.length() > 0) {  // read boundary points from NASTRAN file
        //read_NASTRAN(NASTRANpath, XYZ);
        // adding the wake panel grid
        //angp = atan2(Zpanelgridu(m+1,1:n+1) - Zpanelgridu(m,1:n+1),Xpanelgridu(m+1,1:n+1) -Xpanelgridu(m,1:n+1));
        //angp(j) = atan2(Zpanelgridu(m, j) - Zpanelgridu(m - 1, j),
        //                Xpanelgridu(m, j) - Xpanelgridu(m - 1, j));
        //cube XYZ(3, 2 * m + 2, n + 1);  // boundary points
        vec angp(n + 1);
        for (j = 0; j < n + 1; j++) {
//#define DEBUG_WAKE
#ifdef DEBUG_WAKE
            std::cerr << "BP: " << XYZ(z, 2 * m, j) << " " << XYZ(z, 2 * m - 1, j) << " " <<
                              XYZ(x, 2 * m, j) << " " << XYZ(x, 2 * m - 1, j) << std::endl;
            std::cerr << "diff: " << XYZ(z, 2 * m, j) - XYZ(z, 2 * m - 1, j) << " " <<
                                XYZ(x, 2 * m, j) - XYZ(x, 2 * m - 1, j) << std::endl;
#endif
            angp(j) = atan2(XYZ(z, 2 * m, j) - XYZ(z, 2 * m - 1, j),
                            XYZ(x, 2 * m, j) - XYZ(x, 2 * m - 1, j));
        }
        XYZ.row(y) += ydist;
#ifdef DEBUG_WAKE
        angp.raw_print(std::cerr, "angp=");
        angp.raw_print(std::cerr, "ypline=");
#endif
        //Xpanelgridu(m+2,1:n+1) =Xpanelgridu(m+1,1:n+1)+dxw*cos(angp(:,1:n+1));
        //Ypanelgridu(m+2,1:n+1) =ypline(1:n+1);
        //Zpanelgridu(m+2,1:n+1) =Zpanelgridu(m+1,1:n+1)+dxw*sin(angp(:,1:n+1));
        vec ypline = linspace<vec>(-pi / 2, pi / 2, n + 1);
        for (i = 0; i < NROWS(ypline); i++) {
            ypline(i) = ydist + (sin(ypline(i)) + 1.0) / 2.0 * span; // Nonlinear span-wise spacing symmetry (- 0 +)
        }
#ifdef DEBUG_WAKE
        std::cerr << "ydist=" << ydist << " span=" << span << std::endl;
#endif
        for (j = 0; j < n + 1; j++) {
            //Xpanelgridu(m + 1, j) = Xpanelgridu(m, j) + dxw * cos(angp(j));
            //Ypanelgridu(m + 1, j) = ypline(j);
            //Zpanelgridu(m + 1, j) = Zpanelgridu(m, j) + dxw * sin(angp(j));
#ifdef DEBUG_WAKE
            std::cerr << "wake XYZ(" << 2 * m + 1 <<  "," << j << ")=" << dxw * cos(angp(j)) <<
                                          " " << ypline(j) << " " << dxw * sin(angp(j)) << std::endl;
#endif
            XYZ(x, 2 * m + 1, j) = XYZ(x, 2 * m, j) + dxw * cos(angp(j));
            XYZ(y, 2 * m + 1, j) = ypline(j);
            XYZ(z, 2 * m + 1, j) = XYZ(z, 2 * m, j) + dxw * sin(angp(j));
        }
        // factor in rotations for AOA, sweep, roll
        for (uword iy = 0; iy < XYZ.n_slices; iy++) {
            for (uword ix = 0; ix < XYZ.n_cols; ix++) {
                XYZ.COMP(ix, iy) = RotX1 * RotY1 * RotZ1 * XYZ.COMP(ix, iy);  // apply rotations
            }
        }
    } else {  // generate boundary points

        vec Xu(m + 1), Zu(m + 1), Xl(m + 1), Zl(m + 1);
        {  // clean up namespace a little
            vec x(m + 1), r(m + 1), beta(m + 1), zc(m + 1),
                zt(NROWS(x));
            lspace(beta, 0, pi);
            for (i = 0; i < m + 1; i++) {
                x(i) = c / 2.0 * (1.0 - cos(beta(i)));  // cosine distribution of chord-wise panels
            }
            for (j = 0; j < NROWS(zc); j++) {
                if ((x(j) / c >= 0) && (x(j) / c < p)) {
                    zc(j) = (M / (p * p)) * ((2 * p * x(j)) - (pow(x(j), 2) / c));
                } else if ((x(j) / c >= p) && (x(j) / c <= 1)) {
                    zc(j) = (M * (c - x(j)) / pow((1 - p), 2)) * (1 - 2 * p + (x(j) / c));
                }
            }
            for (j = 0; j < NROWS(x); j++) {
                r(j) = (j == NROWS(x) - 1 ? 0 : atan2((zc(j + 1) - zc(j)), (x(j + 1) - x(j))));
            }
            for (i = 0; i < NROWS(zt); i++) {
                zt(i) = (T / 0.2) * c *
                    (0.2969 * pow(x(i) / c, 0.5) -
                     0.126 * x(i) / c - 0.3516 * pow(x(i) / c, 2) +
                     0.2843 * pow(x(i) / c, 3) - 0.1036 * pow(x(i) / c, 4));
            }
            for (i = 0; i < NROWS(x); i++) {
                Xu(i) = x (i) - zt(i) * sin(r(i));
                Zu(i) = zc(i) + zt(i) * cos(r(i));
                Xl(i) = x (i) + zt(i) * sin(r(i));
                Zl(i) = zc(i) - zt(i) * cos(r(i));
            }
        }
        mat Xpanelgridu(m + 2, n + 1), Ypanelgridu(m + 2, n + 1), Zpanelgridu(m + 2, n + 1),
            Xpanelgridl(m + 1, n + 1), Ypanelgridl(m + 1, n + 1), Zpanelgridl(m + 1, n + 1),
            Xpanelgridl1(m + 1, n + 1), Zpanelgridl1(m + 1, n + 1),
            Xpanelgridu1(m + 1, n + 1), Zpanelgridu1(m + 1, n + 1);
        ZERO(Xpanelgridu); ZERO(Ypanelgridu); ZERO(Zpanelgridu);
        ZERO(Xpanelgridl); ZERO(Ypanelgridl); ZERO(Zpanelgridl);

        mat Span_dez(m + 1, n + 1);
        vec ypline = linspace<vec>(-pi / 2, pi / 2, n + 1);
        for (i = 0; i < NROWS(ypline); i++) {
            ypline(i) = ydist + (sin(ypline(i)) + 1.0) / 2.0 * span; // Nonlinear span-wise spacing symmetry (- 0 +)
        }
        for (j = 0; j < n + 1; j++) {
            for (i = 0; i < m + 1; i++) {
                Span_dez(i, j) = ypline(j) - ydist;
                // grid for upper surface of the wing
                Ypanelgridu(i, j) = ypline(j);
                Xpanelgridu(i, j) = Xu(i);
                Zpanelgridu(i, j) = Zu(i);
                // grid for lower surface of the wing
                Ypanelgridl(i, j) = ypline(j);
                Xpanelgridl(i, j) = Xl(i);
                Zpanelgridl(i, j) = Zl(i);
            }
        }
        // defining the wing twist, wing taper and wing sweep
        mat L_Twistl(m + 1, n + 1), L_Twistu(m + 1, n + 1);
        for (j = 0; j < n + 1; j++) {
            for (i = 0; i < m + 1; i++) {
                // Apply linear twist (L_Twist)
                L_Twistl(i, j) = twist * (1.0 - Span_dez(i, j) / span);
                Zpanelgridl1(i, j) =
                    Zpanelgridl(i, j) * cos(L_Twistl(i, j)) -
                    Xpanelgridl(i, j) * sin(L_Twistl(i, j));
                Xpanelgridl1(i, j) =
                    Xpanelgridl(i, j) * cos(L_Twistl(i, j)) +
                    Zpanelgridl(i, j) * sin(L_Twistl(i, j));
                Xpanelgridl(i, j) = Xpanelgridl1(i, j);
                Zpanelgridl(i, j) = Zpanelgridl1(i, j);

                L_Twistu(i, j) = twist * (1 - Span_dez(i, j) / span);
                Zpanelgridu1(i, j) =
                    Zpanelgridu(i, j) * cos(L_Twistu(i, j)) -
                    Xpanelgridu(i, j) * sin(L_Twistu(i, j));
                Xpanelgridu1(i, j) =
                    Xpanelgridu(i, j) * cos(L_Twistu(i, j)) +
                    Zpanelgridu(i, j) * sin(L_Twistu(i, j));
                Xpanelgridu(i, j) = Xpanelgridu1(i, j);
                Zpanelgridu(i, j) = Zpanelgridu1(i, j);

                // Apply linear taper (backward taper)
                // Apply linear taper (forward taper)
                Zpanelgridl(i, j) *= 1.0 + (taper - 1.0) / span * (Span_dez(i, j));
                Xpanelgridl(i, j) *= 1.0 + (taper - 1.0) / span * (Span_dez(i, j));

                Zpanelgridu(i, j) *= 1.0 + (taper - 1.0) / span * (Span_dez(i, j));
                Xpanelgridu(i, j) *= 1.0 + (taper - 1.0) / span * (Span_dez(i, j));
                Xpanelgridl(i, j) += Span_dez(i, j) * tan(atan2((1 - taper) * c, span));
                Xpanelgridu(i, j) += Span_dez(i, j) * tan(atan2((1 - taper) * c, span));
                // Apply wing sweep
                Xpanelgridl(i, j) += Span_dez(i, j) * tan(sweep);
                Xpanelgridu(i, j) += Span_dez(i, j) * tan(sweep);
            }
        }
        {
            // adding the wake panel grid
            //angp = atan2(Zpanelgridu(m+1,1:n+1) - Zpanelgridu(m,1:n+1),Xpanelgridu(m+1,1:n+1) -Xpanelgridu(m,1:n+1));
            vec angp(n + 1);
            for (j = 0; j < n + 1; j++) {
#ifdef DEBUG_WAKE
                std::cerr << "BP: " << Zpanelgridu(m, j) << " " << Zpanelgridu(m - 1, j) << " " <<
                                  Xpanelgridu(m, j) << " " << Xpanelgridu(m - 1, j) << std::endl;
                std::cerr << "diff: " << Zpanelgridu(m, j) - Zpanelgridu(m - 1, j) << " " <<
                                    Xpanelgridu(m, j) - Xpanelgridu(m - 1, j) << std::endl;
#endif
                angp(j) = atan2(Zpanelgridu(m, j) - Zpanelgridu(m - 1, j),
                                Xpanelgridu(m, j) - Xpanelgridu(m - 1, j));
            }
#ifdef DEBUG_WAKE
            angp.raw_print(std::cerr, "angp=");
            angp.raw_print(std::cerr, "ypline=");
            std::cerr << "ydist=" << ydist << " span=" << span << std::endl;
#endif
            //Xpanelgridu(m+2,1:n+1) =Xpanelgridu(m+1,1:n+1)+dxw*cos(angp(:,1:n+1));
            //Ypanelgridu(m+2,1:n+1) =ypline(1:n+1);
            //Zpanelgridu(m+2,1:n+1) =Zpanelgridu(m+1,1:n+1)+dxw*sin(angp(:,1:n+1));
            for (j = 0; j < n + 1; j++) {
                std::cerr << "wake XYZ(" << 2 * m + 1 <<  "," << j << ")=" << dxw * cos(angp(j)) <<
                                              " " << ypline(j) << " " << dxw * sin(angp(j)) << std::endl;
                Xpanelgridu(m + 1, j) = Xpanelgridu(m, j) + dxw * cos(angp(j));
                Ypanelgridu(m + 1, j) = ypline(j);
                Zpanelgridu(m + 1, j) = Zpanelgridu(m, j) + dxw * sin(angp(j));
            }
        }
        // concatenating the upper and lower grid into clockwise direction
        // first point starting from lower to upper
        //X=vertcat(flipdim(Xpanelgridl,1),Xpanelgridu(2:end,:));
        XYZ.row(x) = join_vert(flipud(Xpanelgridl), Xpanelgridu.tail_rows(NROWS(Xpanelgridu) - 1));
        XYZ.row(y) = join_vert(flipud(Ypanelgridl), Ypanelgridu.tail_rows(NROWS(Ypanelgridu) - 1));
        XYZ.row(z) = join_vert(flipud(Zpanelgridl), Zpanelgridu.tail_rows(NROWS(Zpanelgridu) - 1));

        vector <string> comment;
        comment.emplace_back("");
        comment.emplace_back("span " + to_string(span));
        comment.emplace_back("c    " + to_string(c));
        if (roll != 0) comment.emplace_back("roll " + to_string(roll * 180.0 / pi) + " deg");
        if (AOA != 0) comment.emplace_back("AOA " + to_string(AOA * 180.0 / pi) + " deg");
        if (sweep != 0) comment.emplace_back("sweep " + to_string(sweep * 180.0 / pi) + " deg");
        if (twist != 0) comment.emplace_back("twist " + to_string(twist * 180.0 / pi) + " deg");
        comment.emplace_back("");
        comment.emplace_back("Note that the panel elements are laid out with points starting at the lower trailing edge");
        comment.emplace_back("  finishing with the upper trailing edge.");
        comment.emplace_back("Note also that the lower and upper trailing edges have the same points respectively across their span.");
        comment.emplace_back("The lower and upper trailing edges must either:");
        comment.emplace_back("  - be the same points, or");
        comment.emplace_back("  - have points with the same coordinates");
        comment.emplace_back("  respectively across the span.");
        if (roll != 0 || AOA != 0 || sweep != 0 || twist != 0)
            std::cerr << "NOTICE: rotations factored into generated mesh:" << std::endl;
        generate_nastran(XYZ, false, ydist, roll, AOA, sweep, span, c, comment);
    }  // end of boundary point generation
    OUTD(XYZ);
    // blade extension
    //const double h = 2.0*span;   // height of the support pole
    //const double rex = T/1.5*c;  // radius of the pole
    //double minzex = Z[0][0];
    //double maxzex = Z[0][0];
    //for (i=1; i < MSIZE(Z, 1); i++) {
    //    minzex = min(minzex, Z[i][0]);
    //    maxzex = max(maxzex, Z[i][0]);
    //}
    //const double zex=(maxzex+minzex)/2;
    // Create constant vectors
    //vec thtex {DIM(2*m+1)};
    //linspace(thtex, 0, 2.0 *pi);
    //vec y11 {DIM(n)};
    //linspace(y11, 0, 0.3*c);
    // TODO pole code here
    // Create grid points of the pole cylinder
    // center of the pole is ((c+0.5),0,0)
    //  Create grid points of the pole cylinder
    // plotting the wing surface
    // ...plot...
    cube uvww1(WDIM);

    cube uvwwakeS(3, nts, n + 1); ZERO(uvwwakeS);  // former uwakeS...
    cube uvww(3, nts, n + 1); ZERO(uvww);

    field<cube> BXYZ(nblades),
                WXYZ(nblades),
                CXYZ(nblades),
                panelgrid(nblades);
    for (uword iblade = 0; iblade < nblades; iblade++) {
        double *pcube = new double [BSIZE + CSIZE + WSIZE];
        BXYZ(iblade) = cube (pcube, BDIM, false, false); ZERO(BXYZ(iblade));
        pcube += BSIZE;
        CXYZ(iblade) = cube (pcube, CDIM, false, false); ZERO(CXYZ(iblade));
        pcube += CSIZE;
        WXYZ(iblade) = cube (pcube, WDIM, false, false); ZERO(WXYZ(iblade));
        panelgrid(iblade) = cube (3, 2 * m + 2, n + 1);
        // TODO: 'strict' should be 'true', but doesn't work
    }
    // blade 1 references to reduce arma field usage
    cube &BP1 = BXYZ(0);  // boundary points blade 1
    const cube &WP1 = WXYZ(0),  // wake points blade 1
               &CP1 = CXYZ(0);  // colocation points blade 1

    const uword endrowBXYZ = 2 * m + 1;
    //const uword endrowBXYZ = NCOLS(BXYZ(0).slice(0).col(0).n_rows) - 1;  // ending column for BXYZ - aka BXYZ(0).slice(0).n_cols - 1
    //const uword endrowBXYZ = NCOLS;  // ending column for BXYZ - aka BXYZ(0).slice(0).n_cols - 1

#if 1
    BP1 = XYZ;  // no initial rotation
#else
    {
        // orientation of the blades
        //const double RotY=[][3] = {{cos(pi/2.0), 0, sin(pi/2)}, {0, 1, 0}, {-sin(pi/2), 0, cos(pi/2)}};
        const mat RotY = {
            {cos(pi / 2.0), 0, sin(pi / 2.0)},
            {0, 1, 0},
            {-sin(pi / 2.0), 0, cos(pi / 2.0)}};

        // note: arma doesn't transpose colvec; i.e., colvec === vec
        colvec::fixed<3> tXYZ;
        for (j = 0; j < n + 1; j++) {
            for (i = 0; i < 2 * m + 2; i++) {
                //const colvec XYZ = {X(i, j), Y(i, j), Z(i, j)};
                // rotate first blade into position
                tXYZ = RotY * XYZ.COMP(i, j);
                BP1(x, i, j) = tXYZ(x);
                BP1(y, i, j) = tXYZ(y);
                BP1(z, i, j) = tXYZ(z) + c / 2.0; // note: pole effect removed -(2*zex)
                // NOTE: removing (pole) zex for comparison BZ(i,j) = tXYZ(3)+c/2-2*zex;
            }
        }
    }
#endif
    outdb("BX", x, BP1);
    outdb("BY", y, BP1);
    outdb("BZ", z, BP1);
    /*
    //TODO: pole-related?
    for (i=0; i<2*m+1; i++) {
        for (j=0; j<n+1; j++) {
            fcv(0) = xm[i][j];
            fcv(1) = ym[i][j];
            fcv(2) = zm[i][j];
            fcvt = fcv.transpose();
            tXYZ = RotY * fcvt;
            xm[i][j] = tXYZ(0);
            ym[i][j] = tXYZ(1);
            zm[i][j] = tXYZ(2) + c/2.0-2.0*zex;
        }
    }
    */
    // generate collocation points
    // calculation of collocation points and tangent and normal
    mat xl(m + 1, n + 1), yl(m + 1, n + 1), zl(m + 1, n + 1),
        xu(m + 1, n + 1), yu(m + 1, n + 1), zu(m + 1, n + 1);
    ZERO(xl); ZERO(yl); ZERO(zl);
    ZERO(xu); ZERO(yu); ZERO(zu);
    // xl(1:m+1,1:n+1)=flipdim(BX(1:m+1,1:n+1),1);
    //BDIM 3, 2 * m + 2, n + 1
    flipdim(xl, BP1.row(x), 1, m + 1);
    flipdim(yl, BP1.row(y), 1, m + 1);
    flipdim(zl, BP1.row(z), 1, m + 1);
    for (j = 0; j < n + 1; j++) {
        for (i = 0; i < m + 1; i++) {
            xu(i, j) = BP1(x, m + i, j);
            yu(i, j) = BP1(y, m + i, j);
            zu(i, j) = BP1(z, m + i, j);
        }
    }
    // generate collocation points
    // --------------------------------------------------------------------------
    // lower collocation points
    mat COLOCXl(m, n), COLOCYl(m, n), COLOCZl(m, n);
    // upper collocation points
    mat COLOCXu(m, n), COLOCYu(m, n), COLOCZu(m, n);

    for (j = 0; j < n; j++) {
        for (i = 0; i < m; i++) {
            // calculation of the lower surface collocation points
            COLOCXl(i, j) = (xl(i, j) + xl(i + 1, j) + xl(i, j + 1) + xl(i + 1, j + 1)) / 4;
            //%COLOCYl(i,j)=yp(1,j);
            COLOCYl(i, j) = (yl(i, j) + yl(i + 1, j) + yl(i, j + 1) + yl(i + 1, j + 1)) / 4;
            COLOCZl(i, j) = (zl(i, j) + zl(i + 1, j) + zl(i, j + 1) + zl(i + 1, j + 1)) / 4;
            // calculation of the upper surface collocation points
            COLOCXu(i, j) = (xu(i, j) + xu(i + 1, j) + xu(i, j + 1) + xu(i + 1, j + 1)) / 4;
            ////%COLOCYu(i,j)=yp(1,j);
            COLOCYu(i, j) = (yu(i, j) + yu(i + 1, j) + yu(i, j + 1) + yu(i + 1, j + 1)) / 4;
            COLOCZu(i, j) = (zu(i, j) + zu(i + 1, j) + zu(i, j + 1) + zu(i + 1, j + 1)) / 4;
        }
    }
    // Assembly of the collocation point matrix clockwise starting from lower surface
    const mat COLOCXL = flipud(COLOCXl),
              COLOCYL = flipud(COLOCYl),
              COLOCZL = flipud(COLOCZl);

    cube COLOC(3, 2 * m, n);
    COLOC.row(x) = join_vert(COLOCXL, COLOCXu);
    COLOC.row(y) = join_vert(COLOCYL, COLOCYu);
    COLOC.row(z) = join_vert(COLOCZL, COLOCZu);

    // calculation of tangent and normal
    //%%%%%%% Calculation of Normal and tangent %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    // -------------------------------------------------------------------------
    // initialization of normal matrix
    // %--------------------------------------------------------------------------
    // initialization of lowerhalf normal matrix
    mat nrxl(m, n), nryl(m, n), nrzl(m, n),  // initialization of x-direction lowerhalf tangent matrix
        txxrl(m, n), txyrl(m, n), txzrl(m, n),  // initialization of y-direction lowerhalf tangent matrix
        tyxrl(m, n), tyyrl(m, n), tyzrl(m, n);
    ZERO(nrxl); ZERO(nryl); ZERO(nrzl);
    ZERO(txxrl); ZERO(txyrl); ZERO(txzrl);
    ZERO(tyxrl); ZERO(tyyrl); ZERO(tyzrl);
    // %--------------------------------------------------------------------------
    // initialization of upperhalf normal matrix
    mat nrxu(m, n), nryu(m, n), nrzu(m, n),  // initialization of x-direction upperhalf tangent matrix
        txxru(m, n), txyru(m, n), txzru(m, n),  // initialization of y-direction upperhalf tangent matrix
        tyxru(m, n), tyyru(m, n), tyzru(m, n);
    ZERO(nrxu); ZERO(nryu); ZERO(nrzu);
    ZERO(txxru); ZERO(txyru); ZERO(txzru);
    ZERO(tyxru); ZERO(tyyru); ZERO(tyzru);

    mat Sl(m, n), Su(m, n);

    for (j = 0; j < n; j++) {
        for (i = 0; i < m; i++) {
            double NL[3];
            double txl[3];
            double tyl[3];
            double NU[3];
            double txu[3];
            double tyu[3];
//[NL,txl,tyl,sl]= VECT(xl(i,j),yl(i,j),zl(i,j),xl(i,j+1),yl(i,j+1),zl(i,j+1),xl(i+1,j+1),yl(i+1,j+1),zl(i+1,j+1),xl(i+1,j),yl(i+1,j),zl(i+1,j));
//[NU,txu,tyu,su]= VECT(xu(i,j),yu(i,j),zu(i,j),xu(i,j+1),yu(i,j+1),zu(i,j+1),xu(i+1,j+1),yu(i+1,j+1),zu(i+1,j+1),xu(i+1,j),yu(i+1,j),zu(i+1,j));

            Sl(i, j) =
                vect(
                    xl(i, j),         yl(i, j),         zl(i, j),
                    xl(i, j + 1),     yl(i, j + 1),     zl(i, j + 1),
                    xl(i + 1, j + 1), yl(i + 1, j + 1), zl(i + 1, j + 1),
                    xl(i + 1, j),     yl(i + 1, j),     zl(i + 1, j),
                    NL, txl, tyl);
            Su(i, j) =
                vect(
                    xu(i, j),         yu(i, j),         zu(i, j),
                    xu(i, j + 1),     yu(i, j + 1),     zu(i, j + 1),
                    xu(i + 1, j + 1), yu(i + 1, j + 1), zu(i + 1, j + 1),
                    xu(i + 1, j),     yu(i + 1, j),     zu(i + 1, j),
                    NU, txu, tyu);
            // lowerhalf normal matrix
            nrxl(i, j) = -NL[x];
            nryl(i, j) = -NL[y];
            nrzl(i, j) = -NL[z];
            // upperhalf normal matrix
            nrxu(i, j) = NU[x];
            nryu(i, j) = NU[y];
            nrzu(i, j) = NU[z];
            // x-direction lowerhalf tangent matrix
            txxrl(i, j) = txl[x];
            txyrl(i, j) = txl[y];
            txzrl(i, j) = txl[z];
            // x-direction upperhalf tangent matrix
            txxru(i, j) = txu[x];
            txyru(i, j) = txu[y];
            txzru(i, j) = txu[z];
            // y-direction lowerhalf tangent matrix
            tyxrl(i, j) = tyl[x];
            tyyrl(i, j) = tyl[y];
            tyzrl(i, j) = tyl[z];
            // y-direction upperhalf tangent matrix
            tyxru(i, j) = tyu[x];
            tyyru(i, j) = tyu[y];
            tyzru(i, j) = tyu[z];
        }
    }

    // concatenation of the normal tangent vector matrix
    // concatenation of the x-direction tangent vector matrix
    // concatenation of the y-direction tangent vector matrix
    const mat nrx = join_vert(flipud(nrxl), nrxu),
              nry = join_vert(flipud(nryl), nryu),
              nrz = join_vert(flipud(nrzl), nrzu),
              txxr = join_vert(flipud(txxrl), txxru),
              txyr = join_vert(flipud(txyrl), txyru),
              txzr = join_vert(flipud(txzrl), txzru),
              tyxr = join_vert(flipud(tyxrl), tyxru),
              tyyr = join_vert(flipud(tyyrl), tyyru),
              tyzr = join_vert(flipud(tyzrl), tyzru);

    //nrx11  = vertcat(-nrxl,nrxu);

    // concatenation of the panel area matrix
    const mat area = join_vert(flipud(Sl), Su);

    // rotation and orientation of surface panels
    cube ROTX(3, 3, nblades);
    if (true || NASTRANpath.length() > 0) {  // read boundary points from NASTRAN file
        for (int iblade = 0; iblade < nblades; iblade++) {
            const double coe = 2 * iblade * pi / nblades;
            ROTX.slice(iblade) = {{1, 0, 0},
                                  {0, cos(coe), -sin(coe)},
                                  {0, sin(coe), cos(coe)}};
            // TODO? if (nblades == 3) ROTX.slice(iblade) += -pi
        }
    } else {
    //RotX1 = [1 0  0 ; 0  cos(-th)   -sin(-th) ;    0 sin(-th)  cos(-th)];
        ROTX.slice(0) = {{1, 0, 0},
                         {0, cos(-th), -sin(-th)},
                         {0, sin(-th), cos(-th)}};
        //RotX2 = [1 0  0 ; 0  cos(3*th)  -sin(3*th) ;   0 sin(3*th) cos(3*th)];
        ROTX.slice(1) = {{1, 0, 0},
                         {0, cos(3*th), -sin(3*th)},
                         {0, sin(3*th), cos(3*th)}};
        //RotX3 = [1 0  0 ; 0  cos(7*th)  -sin(7*th) ;   0 sin(7*th) cos(7*th)];
        ROTX.slice(2) = {{1, 0,  0},
                         {0, cos(7*th), -sin(7*th)},
                         {0, sin(7*th), cos(7*th)}};
    }
    //RotY1 = [cos(pitch) 0  -sin(pitch); 0  1  0; sin(pitch)  0  cos(pitch)];
    // blade 1
    for (uword iblade = 0; iblade < nblades; iblade++) {
        for (j = 0; j < n + 1; j++) {
            for (i = 0; i < 2 * m + 2; i++) {
                //T1=RotX1*RotY1*[BX(i,j)   BY(i,j)  BZ(i,j)]';
                //T2=RotX2*RotY1*[BX(i,j)   BY(i,j)  BZ(i,j)]';
                //T3=RotX3*RotY1*[BX(i,j)   BY(i,j)  BZ(i,j)]';
                panelgrid(iblade).COMP(i, j) = ROTX.slice(iblade) * /* RotZ1 * RotY1 * RotX1 * */ BP1.COMP(i, j);
            }
        }
    }
    outdb("Xpanelgrid1", x, panelgrid(0));
    outdb("Ypanelgrid1", y, panelgrid(0));
    outdb("Zpanelgrid1", z, panelgrid(0));

    /*  // TODO:grid points of pole cylinder?
    mat xm1(2 * m + 1, n + 1);
    mat ym1(2 * m + 1, n + 1);
    mat zm1(2 * m + 1, n + 1);
    for (i =0; i<2*m+2; i++) {
        for (j=0;j<n+1;j++) {
            // T1=RotX1*RotY1*[xm(i,j)   ym(i,j)  zm(i,j)]';
            // T2=RotX2*RotY1*[xm(i,j)   ym(i,j)  zm(i,j)]';
            // T3=RotX3*RotY1*[xm(i,j)   ym(i,j)  zm(i,j)]';
            fcv(0) = xm[i][j];
            fcv(1) = ym[i][j];
            fcv(2) = zm[i][j];
            fcvt = fcv.transpose();
            T1 = RotX1 * RotY1 * fcvt;
            xm1[i][j]=T1(0);
            ym1[i][j]=T1(1);
            zm1[i][j]=T1(2);
        }
    }
    */
    // orientation of collocation point
    // blade 1
    cube COLOC1(3, 2 * m, n); ZERO(COLOC1);
    //% blade 2
    mat COLOCX2(2 * m, n), COLOCY2(2 * m, n), COLOCZ2(2 * m, n);
    ZERO(COLOCX2); ZERO(COLOCY2); ZERO(COLOCZ2);
    //COLOCX2=zeros(m,n);
    //COLOCY2=zeros(m,n);
    //COLOCZ2=zeros(m,n);
    //% blade 3
    //COLOCX3=zeros(m,n);
    //COLOCY3=zeros(m,n);
    //COLOCZ3=zeros(m,n);
    // orientation of normal, tangent vectors
    cube nr1(3, 2 * m, n),
         txr1(3, 2 * m, n),
         tyr1(3, 2 * m, n);
    ZERO(nr1); ZERO(txr1); ZERO(tyr1);
    for (j = 0; j < n; j++) {
        for (i = 0; i < 2 * m; i++) {
            //CT1= RotX1*RotY1*[COLOCX(i,j)   COLOCY(i,j)   COLOCZ(i,j)]';
            //CT2= RotX2*RotY1*[COLOCX(i,j)   COLOCY(i,j)   COLOCZ(i,j)]';
            //CT3= RotX3*RotY1*[COLOCX(i,j)   COLOCY(i,j)   COLOCZ(i,j)]';
            //blade n
            COLOC1.COMP(i, j) = ROTX.slice(0) *
#ifdef DOCOLOCROT  // TODO maybe do this only for HAWT
                    RotZ1 * RotY1 * RotX1 *
#endif
                    COLOC.COMP(i, j);
            const colvec nr = {nrx(i, j), nry(i, j), nrz(i, j)},
                         tx = {txxr(i, j), txyr(i, j), txzr(i, j)},
                         ty = {tyxr(i, j), tyyr(i, j), tyzr(i, j)};
            nr1.COMP(i, j) = ROTX.slice(0) *
#ifdef DOCOLOCROT
                    RotZ1 * RotY1 * RotX1 *
#endif
                    nr;
            txr1.COMP(i, j) = ROTX.slice(0) *
#ifdef DOCOLOCROT
                    RotZ1 * RotY1 * RotX1 *
#endif
                    tx;
            tyr1.COMP(i, j) = ROTX.slice(0) *
#ifdef DOCOLOCROT
                    RotZ1 * RotY1 * RotX1 *
#endif
                    ty;
        }
    }
    outdb("COLOCX1", x, COLOC1);

    // orientation of normal vectors
    // % blade 1
    //% blade 2
    //nrx2  = zeros(2*m,n);
    //nry2  = zeros(2*m,n);
    //nrz2  = zeros(2*m,n);
    //% blade 3
    //nrx3  = zeros(2*m,n);
    //nry3  = zeros(2*m,n);
    //nrz3  = zeros(2*m,n);
            //NT1= RotX1*RotY1*[ nrx(i,j)    nry(i,j)    nrz(i,j)]';
            //NT2= RotX2*RotY1*[ nrx(i,j)    nry(i,j)    nrz(i,j)]';
            //NT3= RotX3*RotY1*[ nrx(i,j)    nry(i,j)    nrz(i,j)]';

            // blade 1
            //% blade 2
            //nrx2(i,j)  = NT2(1);
            //nry2(i,j)  = NT2(2);
            //nrz2(i,j)  = NT2(3);
            //% blade 3
            //nrx3(i,j)  = NT3(1);
            //nry3(i,j)  = NT3(2);
            //nrz3(i,j)  = NT3(3);
    // output structured grid representing foil
    structured_grid_out(doc, "COLOC", COLOC.row(x), COLOC.row(y), COLOC.row(z));
    structured_grid_out(doc, "COLOCl", COLOCXl, COLOCYl, COLOCZl);
    structured_grid_out(doc, "COLOCu", COLOCXu, COLOCYu, COLOCZu);
    outdb("COLOCX", x, COLOC);
    outdb("COLOCY", y, COLOC);
    outdb("COLOCZ", z, COLOC);

    structured_grid_out(doc, "B", BP1.row(x), BP1.row(y), BP1.row(z));
    // output normals, tangents
    MATRIX_OUT(txxru);
    MATRIX_OUT(txyru);
    MATRIX_OUT(txzru);

    MATRIX_OUT(tyxru);
    MATRIX_OUT(tyyru);
    MATRIX_OUT(tyzru);

    MATRIX_OUT(txxrl);
    MATRIX_OUT(txyrl);
    MATRIX_OUT(txzrl);

    MATRIX_OUT(tyxrl);
    MATRIX_OUT(tyyrl);
    MATRIX_OUT(tyzrl);

    MATRIX_OUT(nrxu);
    MATRIX_OUT(nryu);
    MATRIX_OUT(nrzu);

    MATRIX_OUT(nrxl);
    MATRIX_OUT(nryl);
    MATRIX_OUT(nrzl);

    // calculation of aerodynamics co-efficient
    // calculation of aerodynamics
    // velinduced_AK=zeros(1,1);
    // totalvelinduced=zeros(1,1);
    // totalvelinduced_BK=zeros(1,1);
    // velinduced_BK=zeros(1,1);
    // wakesource=zeros(1,1);
    // wakedublet=zeros(1,1);
    // kk=zeros(1,1);
    //wakevelinduced=zeros(1,1);
    cube AB1(2 * m * n, 2 * m * n, 2), //% Influence co-efficient matrix of surface doublet distribution. t=1
         AB2(2 * m * n, 2 * m * n, 2); //% Influence co-efficient matrix of surface source distribution. t=1
    ZERO(AB1); ZERO(AB2);

    mat C1(2 * m * n, nts * n); ZERO(C1);

    colvec GAMMA1(2 * m * n),  // RHS vector-column vector
           sigma1(2 * m * n),  // RHS vector-column vector
           RHS1(2 * m * n);

    ZERO(GAMMA1); ZERO(sigma1);

    cube delF(2 * m, n, nts),
         F_normal(2 * m, n, nts),
         F_tangential(2 * m, n, nts),
         delTorque(2 * m, n, nts);
    ZERO(delF); ZERO(F_normal); ZERO(F_tangential); ZERO(delTorque);

    vec Total_torque(nts); ZERO(Total_torque);

    cube NX(3, 2 * m, n),
         TX(3, 2 * m, n),
         TY(3, 2 * m, n);

    //TODO: time dimension is not used, so maybe omit
    //cube MUEA1(2 * m, n, nts); ZERO(MUEA1);
    mat MUEW1(nts, n); ZERO(MUEW1);
    colvec muew1(nts * n); ZERO(muew1);

    cube Ml(m, n, nts), Mu(m, n, nts),
         Miu(2 * m, n, nts); ZERO(Miu);

    //mat MUEA1(2 * m, n);
    //mat MUEW1doubled(nts, n);
    double *pmat = new double [nts * n + 2 * m * n];
    mat MUEA1 = mat (pmat, 2 * m, n, false, true); ZERO(MUEA1);
    pmat += MUEA1.n_elem;
    mat MUEW1doubled = mat (pmat, nts, n, false, true); ZERO(MUEW1doubled);

    double *pcube = new double [3 * 2 * m * n * nts];
    field <cube> Qlmn(nts);
    for (uword it = 0; it < nts; it++, pcube += 3 * 2 * m * n) {
        Qlmn(it) = cube(pcube, 3, 2 * m, n, false, true); ZERO(Qlmn(it));
    }

    mat Vt(2 * m, n);
    ZERO(Vt);

    cube Cp(2 * m, n, nts); ZERO(Cp);

    cube delmiul(m, n, nts), delmiuu(m, n, nts);

    mat Vxyz(3, n + 1);
    mat sig1(2 * m, n); ZERO(sig1);

    //gama=-omegax*ti; %  rotation of the blade.
    vec gama(nts);
    for (i = 0; i < nts; i++) {
        gama(i) = -omegax * ti(i);  // omegax is the rotation speed in rad/s
    }
    OUTD(gama);

    influence_thread oINFLUENCE("INFLUENCE", nthreads, 2 * m, n,
                         CP1.memptr(), BP1.memptr(), workinc, BSIZE + CSIZE + WSIZE);
    sourcevel_thread oSOURCEVEL("SOURCEVEL", nthreads, nts, 2 * m, n,
                         WP1.memptr(), BP1.memptr(), workinc, workfactor);
    wingwakeinf_thread owingwakeinf("wingwakeinf", nthreads, nts, 2 * m, n,
                         WP1.memptr(), BP1.memptr(), workinc, workfactor);
    wakeinfluence_thread oWAKEINFLUENCE("WAKEINFLUENCE", nthreads, nts, 2 * m, n,
                         CP1.memptr(), &WP1(0, 1, 0), workinc);  // skip 1st row chord-wise
    if (world_rank <= 0) {  // MPI root or no MPI
        // allocate per-thread output buffers for communicating with MPI children
        //   or to prevent false sharing between local child threads
        for (int ith = 0; ith < world_size; ith++) {
             double **pout = new double * [4];
             tandem_output.push_back(pout);
             *pout++ = oINFLUENCE.alloc_output();
             *pout++ = oSOURCEVEL.alloc_output();
             *pout++ = owingwakeinf.alloc_output();
             *pout = oWAKEINFLUENCE.alloc_output();
        }
    }

#ifdef FSI_STATS
    // create lookup table to find the stats associated with each thread and mex routine
#ifdef FSI_OPENMP
    // stats indexed by thread index, mex routine
    for (int ithread = 0; ithread < nthreads; ithread++) {
        (*thread_stat_map[ithread])[&oINFLUENCE] = new mex_stat;
        (*thread_stat_map[ithread])[&oSOURCEVEL] = new mex_stat;
        (*thread_stat_map[ithread])[&owingwakeinf] = new mex_stat;
        (*thread_stat_map[ithread])[&oWAKEINFLUENCE] = new mex_stat;
    }
#else
    // stats indexed by thread ID, mex routine
    for (auto &it: all_threads) {
        (*thread_stat_map[it.get_id()])[&oINFLUENCE] = new mex_stat;
        (*thread_stat_map[it.get_id()])[&oSOURCEVEL] = new mex_stat;
        (*thread_stat_map[it.get_id()])[&owingwakeinf] = new mex_stat;
        (*thread_stat_map[it.get_id()])[&oWAKEINFLUENCE] = new mex_stat;
    }
#endif
#endif

    mat::fixed<3, 3> RotX;
    mat qll(m, n), qlu(m, n);
    mat ql(2 * m, n);  // Induced velocity in local x-direction tangential direction (along chord)
    mat delxl(m, n), delxu(m, n);
    for (int t = 0; t < nts; t++) { // time loop
        std::cerr << " * t=" << t;
        if ((t + 1) % 5 == 0) std::cerr << std::endl;

        // RotX = [1 0 0 ; 0 cos(gama(t)) sin(gama(t)) ; 0 -sin(gama(t)) cos(gama(t))];
        RotX = {{1, 0, 0},
                {0, cos(gama(t)), sin(gama(t))},
                {0, -sin(gama(t)), cos(gama(t))}};
        // %%%%%%%%% Boundary points %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        for (uword iblade = 0; iblade < nblades; iblade++) {
            rotate(BXYZ(iblade), RotX, panelgrid(iblade));
        }

        outdb("BX1", x, BXYZ(0), t); outdb("BY1", y, BXYZ(0), t); outdb("BZ1", z, BXYZ(0), t);
        //outdb("BX2", x, BXYZ(1), t); outdb("BY2", y, BXYZ(1), t); outdb("BZ2", z, BXYZ(1), t);
        //outdb("BX3", x, BXYZ(2), t); outdb("BY3", y, BXYZ(2), t); outdb("BZ3", z, BXYZ(2), t);

        //%%%%%%%%%%% collocation point%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        //%%%%%%%%%%%% normal vector %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        //%%%%%%%%%%% tangential vector x-direction %%%%%%%%%%%%%%
        //%%%%%%%%%%% tangential vector y-direction %%%%%%%%%%%%%%
        //tempc1 = RotX*[COLOCX1(i,j) COLOCY1(i,j) COLOCZ1(i,j)]';
        //temptx1 = RotX*[txxr1(i,j) txyr1(i,j) txzr1(i,j)]';
        //tempty1 = RotX*[tyxr1(i,j) tyyr1(i,j) tyzr1(i,j)]';
        //tempn1 = RotX*[nrx1(i,j) nry1(i,j) nrz1(i,j)]';
        rotate(CXYZ(0), RotX, COLOC1);
        rotate(NX, RotX, nr1);
        rotate(TX, RotX, txr1);
        rotate(TY, RotX, tyr1);

        outdb("CX1", x, CXYZ(0), t);

        //% complete wake at the end of the time step.
        //% Shed latest vortex
        //% blade 1
        //WX1(1,:) = BX1(end,:);
        for (uword iblade = 0; iblade < nblades; iblade++) {
            for (j = 0; j < n + 1; j++) {  // set wake 0 for all panels
                WXYZ(iblade).COMP(0, j) = BXYZ(iblade).COMP(endrowBXYZ, j);
            }
        }
        outdb("WX1", x, WP1, t);
        outdb("WY1", y, WP1, t);
        outdb("WZ1", z, WP1, t);
        //if (t == 2) {std::cerr << "check WX1, W_cube_T" << t+1 << " solver returning " << std::endl;  for (auto &it: all_threads) it.detach(); return 0;}

        // output structured grids representing the blades and wakes at the current time step
        for (uword iblade = 0; iblade < nblades; iblade++) {
            structured_grid_out(doc, "B", iblade, BXYZ(iblade), t, NCOLS(BP1) - 1, true);
        }
        for (uword iblade = 0; iblade < nblades; iblade++) {
            structured_grid_out(doc, "W", iblade, WXYZ(iblade), t);
        }

        // plotting the wake panel
        // surf(BX1(1:2*m+1,:),BY1(1:2*m+1,:),BZ1(1:2*m+1,:),'EdgeColor','none')
        // mesh(WX1(1:t,:),WY1(1:t,:),WZ1(1:t,:),'FaceColor','none')
        // ...
        //%% calculation of the wake influence
        //%   Calculation of external velocities
        cube UVW(3, mx2, n);
        {
            const double QsinAOA = Q * sin(AOA),
                         QcosAOA = Q * cos(AOA),
                         singamat = sin(gama(t)),
                         cosgamat = cos(gama(t));
            double Vbar;
            cube::iterator pUVW = UVW.begin();
            for (j = 0; j < n; j++) {
                // The relative wind due to the rotation lies in the y-z plane.
                Vbar = omegax * (sqrt(pow(COLOC(y, 0, j), 2) +
                                      pow(COLOC(z, 0, j), 2)));
                for (i = 0; i < mx2; i++) {
                    // wind turbine code:
                    // rotradius(i,j)=sqrt((COLOCY(1,j)).^2+COLOCZ(1,j).^2);
                    // Vbar(i,j)=omegax*rotradius(i,j);
                    // U(i,j)= Q;
                    // V(i,j)= -Vbar(i,j)*sin(gama(t));
                    // W(i,j)= -Vbar(i,j)*cos(gama(t));
                    *pUVW++ = QcosAOA;  // x
                    *pUVW++ = -Vbar * singamat;  // y
                    *pUVW++ = QsinAOA - Vbar * cosgamat;  // z
                }
            }
        }
        {
            colvec tsigma1(size(sigma1));
            dot_entire(tsigma1, NX, UVW);  // won't work as is, since indices are reversed
            int ik = 0;
            // reverse panel order
            for (i = 0; i < mx2; i++) {
                for (j = 0; j < n; j++) {
                    sigma1(ik++) = tsigma1((j * mx2) + i);
                }
            }
        }
        OUTDCOLVEC(sigma1);
        if (t == 0) {
            //mexINFLUENCE(CX1,CY1,CZ1,BX1(1:end-1,:),BY1(1:end-1,:),BZ1(1:end-1,:),A1,B1,1.0
            int id = 0;  // for debugging only - distinguishing between runs
            try {
                oINFLUENCE.mex_thread_init(1.0, AB1.memptr());
            }
            catch (exception& e) {
                std::cerr << "exception caught in first run of INFLUENCE "
                        "Exception: " << e.what() << '\n';
            }
            outd("A1_first", AB1.slice(0), t);
            outd("B1_first", AB1.slice(1), t);
            //std::cerr << "solver returinng " << std::endl;  for (auto &it: all_threads) it.detach(); return 0;
            //it(0);  // R,G,B
            //A1=A1*(1.1015+(T-0.1215)+sweep/55);
            // use lambda function to multiply each value of an arma matrix by a constant
            AB1.slice(0).for_each([&](mat::elem_type &val) {
                val *= 1.1015 + T - 0.1215 + sweep / 55.0;
            });

            mat ab1(2 * m * n, 2 * m * n), // was 2)
                ab11(2 * m * n, 2 * m * n); // was 2);
            ZERO(ab1); ZERO(ab11);
            //mexINFLUENCE(CX1(1,:),CY1(1,:),CZ1(1,:),
            //             BX1(end-1:end,:),BY1(end-1:end,:),BZ1(end-1:end,:),a1,b1,1.0);
            //  CX1.head_rows(1), CY1.head_rows(1), CZ1.head_rows(1),
            //  BX1.tail_rows(2), BY1.tail_rows(2), BZ1.tail_rows(2),
            // these partials are still broken in threaded versions, but whole arrays work
            mexINFLUENCE(
                &CP1(x, 0, 0), &CP1(y, 0, 0), &CP1(z, 0, 0),
                &BP1(x, 0, 0), &BP1(y, 0, 0), &BP1(z, 0, 0),
                ab1.memptr(),
                1.0,
                2 * m, n, t + id++,
                0, 2 * m);
            outdcolvec("a1",  ab1.col(0), 0);
            //mexINFLUENCE(CX1(end,:),CY1(end,:),CZ1(end,:),
            //             BX1(end-1:end,:),BY1(end-1:end,:),BZ1(end-1:end,:),a11,b11,1.0);
            //  CX1.tail_rows(1), CY1.tail_rows(1), CZ1.tail_rows(1),
            //  BX1.tail_rows(2), BY1.tail_rows(2), BZ1.tail_rows(2),
            mexINFLUENCE(
                &CP1(x, 0, 0), &CP1(y, 0, 0), &CP1(z, 0, 0),
                &BP1(x, 0, 0), &BP1(y, 0, 0), &BP1(z, 0, 0),
                ab11.memptr(),
                1.0,
                2 * m, n, t + id++,
                2 * m - 1, 2 * m);  // ignore last 2 rows
            outdcolvec("a11",  ab11.col(0), 0);
            // A1(:,1)  = A1(:,1)  -a1/T/100/pi;
            // A1(:,end)= A1(:,end)+a11/T/100/pi;
            // why is a11 a matrix, but only 1 in its 2nd dimension?
            // mat a11(2*m*n,1);
            // mat A1(2*m*n,2*m*n); //% Influence co-efficient matrix of surface doublet distribution.
            for (uword ix = 0; ix < NROWS(AB1); ix++) {
                AB1(ix, 0, 0) -= ab1(ix, 0) / T / 100.0 / pi;
                AB1(ix, ENDCOL(AB1), 0) += ab11(ix, 0) / T / 100.0 / pi;
            }
            //outd("A1_2", AB1, t);
        }  // t == 0 calculation of influence
        else {  // t > 0
            // mexINFLUENCE(CX1,CY1,CZ1,
            //              BX1(1:end-1,:),BY1(1:end-1,:),BZ1(1:end-1,:),A2,B2,1.0);
            try {
                oINFLUENCE.mex_thread_init(1.0, AB2.memptr());
            }
            catch (exception& e) {
                std::cerr << "exception caught in B2 run of INFLUENCE at time step " << t + 1 <<
                        " Exception: " << e.what() << '\n';
            }
            outd("A2_first", AB2.slice(0), t);
            outd("B2_first", AB2.slice(1), t);

            //A2=A2*(1.1015+(T-0.1215)+sweep/55);
            // use lambda function to multiply each value of an arma matrix by a constant
            AB2.slice(0).for_each([&](mat::elem_type &val) {
                val *= 1.1015 + T - 0.1215 + sweep / 55.0;
            });

            outd("A2_second", AB2.slice(0), t);
            outd("B2_second", AB2.slice(1), t);
        }

        int k = 0; // Initialization of collocation points counter
        for (int a = 0; a < mx2; a++) {
            for (int b = 0; b < n; b++, k++) {
                int l = 0; // Initialization of surface panel elements counter
                for (i = 0; i < mx2; i++) {
                    for (j = 0; j < n; j++, l++) {
                        if (k == l) {
                            if (t == 0) AB1(k, l, 0) = -0.5;  // used only on first time step
                            else AB2(k, l, 0) = -0.5;
                        }
                    }
                }
            }
        }
        outd("A", t == 0 ? AB1.slice(0) : AB2.slice(0), t);
        outd("B", t == 0 ? AB1.slice(1) : AB2.slice(1), t);
        //std::cerr << "INFLUENCE compare A1_second with A_T1: solver returing " << std::endl;  for (auto &it: all_threads) it.detach(); return 0;

        for (int it = 0, kk = 0; it < t; it++) {
            for (j = 0; j < n; j++) {
                muew1(kk++) = MUEW1(it, j);  // flatten and reorder by span-wise panel over time
            }
        }
        outd("MUEW1a", MUEW1, t);
        OUTDCOLVEC(muew1);
        if (t > 0) {
            ZERO(C1);
            //mexWAKEINFLUENCE(CX1(:,:),CY1(:,:),CZ1(:,:),
            //                 WX1(2:t,:),WY1(2:t,:),WZ1(2:t,:),
            //                 MUEW1(2:t,:),C1);
            // NOTE: TODO this subroutine is designed to offset to t=1(zero-based)
            // as opposed to the original
            // TODO: cannot test this until wingwakeinf is coded and tested
            if (t > 2) // no influence in the first two time steps
            try {
                oWAKEINFLUENCE.mex_thread_init(t, &MUEW1(1, 0), C1.memptr());
            }
            catch (exception& e) {
                std::cerr << "exception caught WAKEINFLUENCE "
                        "Exception: " << e.what() << '\n';
            }
            OUTT(C1);
            //if (t==4) return 0;
        }
        // Calculation of RHS vector and perturbation potential strength
        // not used RHS=-B*sigma1;                    % calculation of RHS vector
        // calculating RHS vector
        // RHS1           = -(B*sigma1+C1*muew1);
        RHS1 = -((t == 0 ? AB1.slice(1) : AB2.slice(1)) * sigma1 + C1 * muew1);  // i.e., B * sigma1 + C1 * muew1
        OUTDCOLVEC(RHS1);

        // GAMMA1(:,t) = (A)\RHS1;
#ifdef ARMA_USE_LAPACK
        GAMMA1 = solve((t == 0 ? AB1.slice(0) : AB2.slice(0)), RHS1);  // left division A\RHS1
#else
        GAMMA1 = inv((t == 0 ? AB1.slice(0) : AB2.slice(0))) * RHS1;  // left division A\RHS1
#endif
        outdcolvec("GAMMA1_at", GAMMA1, t);

        //sig1(:,:,t)    = reshape(sigma1,n,2*m)';
        //MUEA1(:,:,t)   = reshape(GAMMA1(:,t),n,2*m)';
        //for i=1:2*m
        //  for j=1:n
        //      Miu(i,j,t)= MUEA1(i,j,t);
        //  end
        //end
        MUEA1 = reshape(GAMMA1, MUEA1.n_cols, MUEA1.n_rows).t();
        sig1 = reshape(sigma1, sig1.n_cols, sig1.n_rows).t();
        Miu.slice(t) = MUEA1;

        //TODO: strange discrepancy at sig[6,4] - note also discrepancy in dimensions
        OUTDATT(sig1);
        outdatt("MiuA", Miu, t);
        //Miu(3,1:n,t)    = Miu(4,1:n,t)    + Miu(4,1:n,t)*0.01;
        //Miu(2,1:n,t)    = Miu(3,1:n,t)    + Miu(3,1:n,t)*0.01;
        //Miu(2*m-2,1:n,t)= Miu(2*m-3,1:n,t)+ Miu(2*m-3,1:n,t)*0.01;
        //Miu(2*m-1,1:n,t)= Miu(2*m-2,1:n,t)+ Miu(2*m-2,1:n,t)*0.01;
        //Miu(1,1:n,t)    = Miu(2,1:n,t)    + Miu(2,1:n,t)*0.01;
        //Miu(2*m,1:n,t)  = Miu(2*m-1,1:n,t)+ Miu(2*m-1,1:n,t)*0.01;
        int jt = Miu.n_rows * Miu.n_cols * t;  // index over Miu columns at time
        for (j = 0; j < n; j++, jt += Miu.n_rows) {  // jump to next column
            Miu(2 + jt) = Miu(3 + jt) + Miu(3 + jt) * 0.01;
            Miu(1 + jt) = Miu(2 + jt) + Miu(2 + jt) * 0.01;
            Miu(mx2 - 3 + jt) = Miu(mx2 - 4 + jt) + Miu(mx2 - 4 + jt) * 0.01;
            Miu(mx2 - 2 + jt) = Miu(mx2 - 3 + jt) + Miu(mx2 - 3 + jt) * 0.01;
            Miu(jt) = Miu(1 + jt) + Miu(1 + jt) * 0.01;
            Miu(mx2 - 1 + jt) = Miu(mx2 - 2 + jt) + Miu(mx2 - 2 + jt) * 0.01;
        }

        OUTDATT(Miu);

        //MUEW1(2:t+1,:) = MUEW1(1:t,:);
        //MUEW1(1,:)     = abs(Miu(end,:,t))-abs(Miu(1,:,t));
        for (j = 0; j < n; j++) {
            for (int it = t; it > 0; it--) {
                MUEW1(it, j) = MUEW1(it - 1, j);
            }
            MUEW1(0, j) = abs(ENDROW(Miu), j, t)) - abs(Miu(0, j, t));
        }

        //MUEA2(:,:,t)   = MUEA1(:,:,t);
        //MUEA3(:,:,t)   = MUEA1(:,:,t);
        //MUEW2(1:t,:)   = MUEW1(1:t,:);
        //MUEW3(1:t,:)   = MUEW1(1:t,:);

        // rearranging the perturbation potential strength on lower and upper surface
        // Starting from the leading edge and marching to the trailing edge
        // Ml(1:m,1:n,t)= flipdim(Miu(1:m,1:n,t),1);
        // Mu(1:m,1:n,t)= Miu(m+1:2*m,1:n,t);
        for (j = 0; j < n; j++) {
            for (i = 0; i < m; i++) {
                Ml(m - i - 1, j, t) = Miu(i, j, t);
                Mu(i, j, t) = Miu(m + i, j, t);
            }
        }
        OUTDATT(Ml); OUTDATT(Mu);

        // TODO lots of things I think I don't need
        //
        assert(delxl.size() == qll.size());
        assert(delmiul.size() == Mu.size());
        for (uword ijt = delxl.n_rows * delxl.n_cols * t,
                   ij = 0,
                   j = 0; j < n; j++) {
            for (uword i = 0; i < m; i++, ijt++, ij++) {
                //delxl(i,j)=sqrt(
                //    ( xl(i+1,j)- xl(i,j)).^2 +
                //    ( zl(i+1,j)- zl(i,j)).^2+
                //    ( yl(i+1,j)- yl(i,j)).^2);
                //distance between collocation points in x-direction (lower surface)
                //delxu(i,j)=sqrt(
                //    ( xu(i+1,j)- xu(i,j)).^2 +
                //    ( zu(i+1,j)- zu(i,j)).^2.+
                //    ( yu(i+1,j)- yu(i,j)).^2);
                //distance between collocation points in x-direction (upper surface)
                //delmiul(i,j,t)=(Ml(i,j,t) -Ml(i-1,j,t));
                //difference of perturbation potential between 2 collocation points (lower surface)
                //delmiuu(i,j,t)=(Mu(i,j,t) -Mu(i-1,j,t));
                //difference of perturbation potential between 2 collocation points (upper surface)
                //mat xl(m + 1, n + 1), yl(m + 1, n + 1), zl(m + 1, n + 1),
                //delmiul(i,j,t)=(Ml(i+1,j,t) -Ml(i,j,t));
                //delmiuu(i,j,t)=(Mu(i+1,j,t) -Mu(i,j,t));
                delxl(ij) = sqrt(pow(xl(i + 1, j) - xl(i, j), 2) +
                                 pow(zl(i + 1, j) - zl(i, j), 2) +
                                 pow(yl(i + 1, j) - yl(i, j), 2));
                delxu(ij) = sqrt(pow(xu(i + 1, j) - xu(i, j), 2) +
                                 pow(zu(i + 1, j) - zu(i, j), 2) +
                                 pow(yu(i + 1, j) - yu(i, j), 2));
                if (i == m - 1) {
                    delmiul(ijt) = Ml(ijt) - Ml(ijt - 1);
                    delmiuu(ijt) = Mu(ijt) - Mu(ijt - 1);
                } else {
                    delmiul(ijt) = Ml(ijt + 1) - Ml(ijt);
                    delmiuu(ijt) = Mu(ijt + 1) - Mu(ijt);
                }
                // Induced tangential velocity in x-direction (lower surface)
                // qll(i,j,t)=delmiul(i,j,t)./delxl(i,j);
                // Induced tangential velocity in x-direction (upper surface)
                // qlu(i,j,t)=delmiuu(i,j,t)./delxu(i,j);
                qll(ij) = delmiul(ijt) / delxl(ij);
                qlu(ij) = delmiuu(ijt) / delxu(ij);
            }
            OUTT(delxl); OUTT(delxu);
            OUTDATT(delmiul); OUTDATT(delmiuu);
            OUTDATT(qll);
            //if (t==4) return 0;
        }
        // ql(:,:,t)=vertcat(flipdim(qll(:,:,t),1),qlu(:,:,t));
        // Initialization of local velocity matrix
        ql = join_vert(flipud(qll), qlu);
        OUTDATT(ql);

        // calculate free stream velocity components in local coordinate system - all panels
        dot_entire_xyz(Qlmn(t), TX, TY, NX, UVW);
        {
            mat::const_iterator pQlmn = Qlmn(t).begin(),
                                pMiu = Miu.slice(t).begin(),
                                pql = ql.begin(),
                                psig1 = sig1.begin();
            mat::iterator       pVt = Vt.begin();
            cube::const_iterator pXYZ = XYZ.begin();  // TODO will work if you increment 2*3 on i inc and 3 on j
            const int XYZ_next = XYZ.n_rows * XYZ.n_cols;  // next span-wise panel
            for (j = 0; j < n; j++, pXYZ += (2 * 3)) {
                for (i = 0; i < mx2; i++, ++pMiu, pXYZ += 3) {
                    //qn(i,j,t) = sig1(i,j,t); % Induced local tangential velocity in y-direction
                    // calculation free stream velocity component in local co-ordinate
                    //
                    // Tangential component (x-direction) of free stream velocity
                    //Ql(i,j,t) = dot([TXX1(i,j) TXY1(i,j) TXZ1(i,j)],  [U(i,j) V(i,j)  W(i,j) ]);
                    //Qm(i,j,t) = dot([TYX1(i,j) TYY1(i,j) TYZ1(i,j)],  [U(i,j) V(i,j)  W(i,j) ]);
                    //Qn(i,j,t) = dot([NX1(i,j)  NY1(i,j)  NZ1(i,j)],   [U(i,j) V(i,j)  W(i,j) ]);
                    // ported to C++ with dot_entire_xyz above
                    //% calculation y-direction local induced velocity
                    //%------------------------------------------------------------------
                    // if (j==n)
                    //    delcy(i,j)= sqrt(
                    //    (Y(i,j+1)-Y(i,j)).^2 +
                    //    (Z(i,j+1)-Z(i,j)).^2 +
                    //    (X(i,j+1)-X(i,j)).^2);
                    //    qm(i,j,t) = (Miu(i,j,t) -Miu(i,j-1,t))./delcy(i,j) ;
                    // else
                    //    delcy(i,j)= sqrt(
                    //    (Y(i,j+1)-Y(i,j)).^2 +
                    //    (Z(i,j+1)-Z(i,j)).^2+
                    //    (X(i,j+1)-X(i,j)).^2 );
                    //    qm(i,j,t) = (Miu(i,j+1,t) -Miu(i,j,t))./delcy(i,j);
                    // end
                    // TODO: why are XYZ out of order above? Typo?
                    // calculation y-direction local induced velocity
                    assert(&pXYZ[z + XYZ_next] - XYZ.memptr() < XYZ.size());
                    const double delcy = sqrt(pow(pXYZ[x + XYZ_next] - pXYZ[x], 2) +
                                              pow(pXYZ[y + XYZ_next] - pXYZ[y], 2) +
                                              pow(pXYZ[z + XYZ_next] - pXYZ[z], 2));
                    // Induced velocity in local y-direction tangential direction (along span)
                    assert(pMiu - Miu.memptr() + (j == n - 1 ? 0 : Miu.n_rows) < Miu.size());
                    const double qm = (j == n - 1 ?
                                       *pMiu - pMiu[-Miu.n_rows] :
                                       pMiu[Miu.n_rows] - *pMiu) /
                                      delcy;
                    //Vx(i,j,t)=  Ql(i,j,t) - ql(i,j,t); % Total local velocity in x-direction
                    //Vy(i,j,t)=  Qm(i,j,t) - qm(i,j,t); % Total local velocity in y-direction
                    //Vz(i,j,t)=  Qn(i,j,t) - qn(i,j,t); % Total local velocity in normal-direction
                    //%------------------------------------------------------------------
                    //% calculation of Cp
                    //Vt(i,j,t) = sqrt(
                    //    Vx(i,j,t).^2 +
                    //    Vy(i,j,t).^2+
                    //    Vz(i,j,t).^2);  % sum of total local velocity on the surface
                    const double Vx = *pQlmn++ - *pql++,    // Total local velocity in x-direction
                                 Vy = *pQlmn++ - qm,        // Total local velocity in y-direction
                                 Vz = *pQlmn++ - *psig1++;  // Total local velocity in normal direction
                    *pVt++ = sqrt(pow(Vx, 2) +
                                  pow(Vy, 2) +
                                  pow(Vz, 2));  // sum of total local velocity on the surface
                }
            }
        }

        //OUTDATT(Vx); OUTDATT(Vy); OUTDATT(Vz);  no need to array these
        //OUTDATT(Ql); OUTDATT(Qm); OUTDATT(Qn);
        outdb("Ql_at", x, Qlmn(t), t);
        outdb("Qm_at", y, Qlmn(t), t);
        outdb("Qn_at", z, Qlmn(t), t);

        //if (t==4) return 0;

        //Vt(2*m,:,t)= -Vt(1,:,t);
        for (j = 0; j < NCOLS(Vt); j++) {
            ENDROW(Vt), j) = -Vt(0, j);
        }
        MATRIX_OUTATT(Vt);  // output total velocity at current time to data file
        OUTDATT(Vt);

        assert(Cp.size() == delF.size());
        {
            mat::const_iterator pVt = Vt.begin(),
                                parea = area.begin();
            for (uword ijt = Cp.n_rows * Cp.n_cols * t,
                       ij = 0,
                       j = 0; j < n; j++) {
                for (i = 0; i < mx2; i++, ijt++, ij += 3) {
                    //Vref(i,j) = -sqrt(
                    //    U(i,j).^2+
                    //    V(i,j).^2+
                    //    W(i,j).^2);
                    const double Vref = -sqrt(pow(UVW(ij + x), 2) +
                                              pow(UVW(ij + y), 2) +
                                              pow(UVW(ij + z), 2));
                    //% pressure co-efficient in local co-ordinate system
                    //Cp(i,j,t) = 1-
                    //    (Vt(i,j,t)./Vref(i,j)).^2-
                    //    2*Miu(i,j,t)/
                    //    ts/
                    //    Vref(i,j).^2;
                    //Cp(i,j,t) = 1-
                    //    (Vt(i,j,t)./Vref(i,j)).^2-
                    //    2*(Miu(i,j,t)-Miu(i,j,t-1))/
                    //    ts/
                    //    Vref(i,j).^2;
                    Cp(ijt) = 1.0 -
                              pow(*pVt++ / Vref, 2) -
                              2.0 *
                              (t == 0 ?
                               Miu(ijt) :
                               Miu(ijt) - Miu(i, j, t - 1)) /
                              ts /
                              pow(Vref, 2);

                    //% calculation of the aerodynamics forces
                    //delF(i,j,t) = -(Cp(i,j,t).*
                    //                0.5*
                    //                rho*
                    //                Vref(i,j).^2).
                    //              *area(i,j);
                    //F_normal(i,j,t)     = delF(i,j,t).*nrx1(i,j);
                    //F_tangential(i,j,t) = delF(i,j,t).*nrz1(i,j);
                    //delTorque(i,j,t)    = F_tangential(i,j,t).*(COLOCY(i,j));
                    //Total_torque(1,t)   = sum(sum(delTorque(:,:,t)));
                    delF(ijt) = -(Cp(ijt) *
                                  0.5 *
                                  rho *
                                  pow(Vref, 2)) *
                                *parea++;
                    F_normal(ijt)     = delF(ijt) * nr1(x + ij);
                    F_tangential(ijt) = delF(ijt) * nr1(z + ij);
                    delTorque(ijt)    = F_tangential(ijt) * COLOC(y + ij);
                    Total_torque(t) += delTorque(ijt);
                }
            }
        }
        OUTDATT(Cp);
        OUTDATT(delF);
        OUTDATT(F_tangential);
        OUTDATT(delTorque);
        OUTT(Total_torque);
        //if (t==4) return 0;

        //NOTE: uwake11 uwake12 uwake13 aren't maintained here
        //TODO looks as if Vxx and Ql are modified above and used below
        if (t == 0) {
            // not sure why this was originally in a double loop, since it only runs once for t=1
            const double Qts = Q * ts;
            for (uword iblade = 0; iblade < nblades; iblade++) {
                for (uword in = 0; in < n + 1; in++) {  // span-wise
                    WXYZ(iblade)(x, 1, in) = WXYZ(iblade)(x, 0, in) + Qts;
                    WXYZ(iblade)(y, 1, in) = WXYZ(iblade)(y, 0, in);
                    WXYZ(iblade)(z, 1, in) = WXYZ(iblade)(z, 0, in);
                }
            }
            continue;  // to next time step
        }
        // wake roll-up calculation
        ZERO(uvww1);
        ZERO(Vxyz);
        ZERO(uvww);
        ZERO(uvwwakeS);
        // NOTE that Ql expands time-wise in MATLAB, but not in C++
        // Vxx(1,end)=Ql(end,end,end);
        // Vyy(1,end)=Qm(end,end,end);
        // Vzz(1,end)=Qn(end,end,end);
        //Vxyz(x, ENDCOL(Vxyz)) = Ql(Ql.n_rows - 1, ENDCOL(Ql), t - 1);
        {
            const cube &Qlmnm1 = Qlmn(t - 1);  // from previous time step
            Vxyz.col(ENDCOL(Vxyz)) = Qlmnm1.COMP(ENDCOL(Qlmnm1), ENDN3D(Qlmnm1));
            // Vxx(1,1:n)=Ql(end,:,end);
            // Vyy(1,1:n)=Qm(end,:,end);
            // Vzz(1,1:n)=Qn(end,:,end);
            for (j = 0; j < n; j++) {
                Vxyz.col(j) = Qlmnm1.COMP(ENDCOL(Qlmnm1), j);
            }
        }
        outdb("Vxx", x, Vxyz, t);
        outdb("Vyy", y, Vxyz, t);
        outdb("Vzz", z, Vxyz, t);

        outd("MUEA1_at", MUEA1, t);
        //outd("MUEA1attime", MUEA1, t);  // dumped when assigned
        OUTT(MUEW1);

        MUEW1doubled = MUEW1 * 2;
        //OUTT(MUEW1doubled);

//#define FSI_OPENMP2
#ifdef FSI_OPENMP2
    omp_set_num_threads(2);
    #pragma omp parallel
    {
        if (omp_get_thread_num() == 0)
#endif
        try {
            owingwakeinf.mex_thread_init(t,
                               MUEA1.memptr(),
                               MUEW1doubled.memptr(),
                               uvww.memptr());
        }
        catch (exception& e) {
            std::cerr << "exception caught in first run of wingwakeinf "
                    "Exception: " << e.what() << '\n';
        }
        //std::cerr << "wingwakeinf compare uw_cube with uwx: solver returing " << std::endl;  for (auto &it: all_threads) it.detach(); return 0;

        //std::cerr << "wingwakeinf unthreaded: solver returning " << std::endl;  for (auto &it: all_threads) it.detach(); return 0;
        //mexWingWakeInteraction(
        //  BX1(1:end-1,:),BY1(1:end-1,:),BZ1(1:end-1,:),
        //  WX1,WY1,WZ1,
        //  2*MUEW1,MUEA1,
        //  uwx,vwy,wwz,
        //  t);
#ifdef FSI_OPENMP2
    else
#endif
        try {
            oSOURCEVEL.mex_thread_init(t, sig1.memptr(), uvwwakeS.memptr());
        }
        catch (exception& e) {
            std::cerr << "exception caught in first run of SOURCEVEL "
                    "Exception: " << e.what() << '\n';
        }
#ifdef FSI_OPENMP2
    }  // end of parallel section
#endif
        outdb("uwx", x, uvww, t);
        outdb("vwy", y, uvww, t);
        outdb("wwz", z, uvww, t);
        outdb("uwakeS", x, uvwwakeS, t);
        outdb("vwakeS", y, uvwwakeS, t);
        outdb("wwakeS", z, uvwwakeS, t);
#ifndef NDEBUG
        NANerr(uvww);
        NANerr(uvwwakeS);
#endif

        //std::cerr << "SOURCEVEL: uwake_cube vs uwakeS: solver returing " << std::endl;  for (auto &it: all_threads) it.detach(); return 0;
        //return 0;
        //if (t==4) return 0;

        //mexSOURCEVEL(
        //      BX1(1:end-1,:),BY1(1:end-1,:),BZ1(1:end-1,:),
        //      WX1,WY1,WZ1,
        //      sig1,
        //      uwakeS,vwakeS,wwakeS,
        //      t);

        // the latest wake velocity in x-y-z-direction
        //uwx(1,:)     = Vxx*ts/5;
        //vwy(1,:)     = Vyy*ts/5;
        //wwz(1,:)     = Vzz*ts/5;
        for (j = 0; j < n + 1; j++) {
            uvww.COMP(0, j) = Vxyz.unsafe_col(j) * ts / 5.0;
        }
        // total wake induced velocity
        // uwx1=uwx+uwakeS+uwake11+uwake12+uwake13;
        // vwy1=vwy+vwakeS+vwake11+vwake12+vwake13;
        // wwz1=wwz+wwakeS+wwake11+wwake12+wwake13;
        uvww1 = uvww + uvwwakeS;

        matrix_out_t(doc, "uwx1", uvww1.row(0), t);  // induced wake velocity for blade 1

        outdb("uwx1", x, uvww1, t);
        outdb("vwy1", y, uvww1, t);
        outdb("wwz1", z, uvww1, t);

        if (t + 1 >= nts)
            break; // calculated wake-induced velocity for last time step, so done

        //WX1(2:t+1,:)=WX1(1:t,:)+(Q+uwx1(1:t,:))*ts;
        //WY1(2:t+1,:)=WY1(1:t,:)+(vwy1(1:t,:))*ts;
        //WZ1(2:t+1,:)=WZ1(1:t,:)+(wwz1(1:t,:))*ts;
        //WX2(2:t+1,:)=WX2(1:t,:)+(Q+uwx2(1:t,:))*ts;
        //WY2(2:t+1,:)=WY2(1:t,:)+(vwy2(1:t,:))*ts;
        //WZ2(2:t+1,:)=WZ2(1:t,:)+(wwz2(1:t,:))*ts;
        //WX3(2:t+1,:)=WX3(1:t,:)+(Q+uwx3(1:t,:))*ts;
        //WY3(2:t+1,:)=WY3(1:t,:)+(vwy3(1:t,:))*ts;
        //WZ3(2:t+1,:)=WZ3(1:t,:)+(wwz3(1:t,:))*ts;

        //WARNING: this assigns to t+1 (the next time step), but variable only goes to nts!!!
        assert(t + 1 < nts);
        for (uword iblade = 0; iblade < nblades; iblade++) {
            cube::iterator pWXYZblade = WXYZ(iblade).begin();
            for (uword in = 0; in < n + 1; in++, pWXYZblade += (nts * 3)) {
                int itinm1 = 3 * (in * nts + t);  // index to current time step
                cube::iterator pWXYZ = pWXYZblade + 3 * (t + 1);
                for (uword it = t + 1; it > 0; it--, itinm1 -= 3, pWXYZ -= 5) {
                    *pWXYZ = pWXYZ[-3] + (Q + uvww1(itinm1 + x)) * ts;
                    ++pWXYZ;
                    *pWXYZ = pWXYZ[-3] + uvww1(itinm1 + y) * ts;
                    ++pWXYZ;
                    *pWXYZ = pWXYZ[-3] + uvww1(itinm1 + z) * ts;
                    //WXYZ(iblade)(x, it, in) = WXYZ(iblade)(x, it - 1, in) + (Q + uvww1(x, it - 1, in)) * ts;
                    //WXYZ(iblade)(y, it, in) = WXYZ(iblade)(y, it - 1, in) + uvww1(y, it - 1, in) * ts;
                    //WXYZ(iblade)(z, it, in) = WXYZ(iblade)(z, it - 1, in) + uvww1(z, it - 1, in) * ts;
                }
            }
        }
    } // time loop
    std::cerr << std::endl;
    matrix_out(doc, "Vt", Vt);  // Vt for last time step
    matrix_out(doc, "Cp", Cp, nts - 1);  // Cp for last time step only
    matrix_out(doc, "delF", delF, nts - 1);  // delF for last time step only
    matrix_out(doc, "F_normal", F_normal, nts - 1);  // F_normal for last time step only
    matrix_out(doc, "F_tangential", F_tangential, nts - 1);  // F_tangential for last time step only
    matrix_out(doc, "delTorque", delTorque, nts - 1);  // delTorque for last time step only
    array_out(doc, "Total_torque", Total_torque);
    if (mesh_str.length() > 0) {
        pugi::xml_node node = doc.append_child("mesh");
        node.append_child(pugi::node_pcdata).set_value(mesh_str.c_str());
    }
    OUTD(Total_torque);

    // pole bxm1=zeros(2*m+1,n+1);
    // pole bym1=zeros(2*m+1,n+1);
    // pole bzm1=zeros(2*m+1,n+1);

    // pole bxm2=zeros(2*m+1,n+1);
    // pole bym2=zeros(2*m+1,n+1);
    // pole bzm2=zeros(2*m+1,n+1);

    // pole bxm3=zeros(2*m+1,n+1);
    // pole bym3=zeros(2*m+1,n+1);
    // pole bzm3=zeros(2*m+1,n+1);

    pugi::xml_node perfnode = doc.append_child("performance");
    assert(perfnode != nullptr);

    const steady_clock::time_point end_time = steady_clock::now();
    perfnode.append_attribute("elapsed") =
            duration_cast<mex_thread_time_units_t>(end_time - start_time).count();
#ifdef FSI_STATS
    perfnode.append_attribute("thread_elapsed_ms") = mex_time.elapsed.count();
#endif
    time_t ctime_end_time;
    time(&ctime_end_time);
    perfnode.append_attribute("start_time") = ctime_start_time;
    perfnode.append_attribute("end_time") = ctime_end_time;
    if (max_workinc != 0) {
        perfnode.append_attribute("min_workinc") = min_workinc;
        perfnode.append_attribute("max_workinc") = max_workinc;
    }
    perfnode.append_attribute("thread_type") =
#ifdef FSI_OPENMP
            "openmp";
#else
#ifdef FSI_MPI
            "mpi";
#else
            "posix";
#endif
#endif
    if (NASTRANpath.length() > 0) {  // currently counted for NASTRAN-only
        perfnode.append_attribute("nnodes") = nnodes;
        perfnode.append_attribute("nelems") = nelems;
    }
    std::cerr << "Solver ending: " << ctime(&ctime_end_time) <<
            " duration " << ctime_end_time - ctime_start_time << " seconds" << std::endl;
    program_ending = true;
    {
        unique_lock<mutex> lck(mtx);
        cv_child.notify_all();
    }

#ifdef FSI_STATS
    assert(!mex_time.is_running);

    const mex_thread_time_units_t total_run_time = duration_cast<mex_thread_time_units_t>(end_time - start_time);
    std::cerr << "   (mex:" << mex_time.elapsed.count()/1000000 <<
            " non-mex:" << (total_run_time.count() - mex_time.elapsed.count())/1000000 << ")" <<
            std::endl;
    if (max_workinc != 0) std::cerr << "Adjusted work increment range: " << min_workinc <<
                                  ":" << max_workinc << std::endl;
    // dump raw data to be interpreted later
    // by thread, by mex: name, call_count, elapsed
    int ithread = 1;  // one-based label for XML node names (and MPI destination); the OpenMP stat-map index is zero-based
    // total mex stats across threads
    map<const mex_thread * const, mex_stat *> mex_totals_map;
#ifdef FSI_OPENMP
    for (int it = 0; it < nthreads; it++)
#else
    for (auto &it: all_threads)
#endif
    {
        const string threadname("thread_" + to_string(ithread));  // one-based in XML stats
        pugi::xml_node threadnode = perfnode.append_child(threadname.c_str());
        assert(threadnode != nullptr);
#ifdef FSI_OPENMP
        for (auto &imex : *thread_stat_map[it])  // zero-based for OpenMP
#else
        for (auto &imex : *thread_stat_map[it.get_id()])
#endif
        {
            const mex_thread * const pm = imex.first;
            mex_stat *pstat = imex.second;
            if (mex_totals_map.find(pm) == mex_totals_map.end())
                mex_totals_map[pm] = new mex_stat;
            mex_totals_map[pm]->elapsed += pstat->elapsed;
            mex_totals_map[pm]->call_count += pstat->call_count;
            mex_totals_map[pm]->doublets += pstat->doublets;
            mex_totals_map[pm]->nsyncs += pstat->nsyncs;
            mex_totals_map[pm]->openmp_denied += pstat->openmp_denied;
            pugi::xml_node mexnode = threadnode.append_child(pm->mexname);
            assert(mexnode != nullptr);
            mexnode.append_attribute("elapsed_mus") = pstat->elapsed.count();
            mexnode.append_attribute("count") = pstat->call_count;
            mexnode.append_attribute("doublets") = pstat->doublets;
            mexnode.append_attribute("nsyncs") = pstat->nsyncs;
#ifdef FSI_OPENMP
            mexnode.append_attribute("openmp_denied") = pstat->openmp_denied;
#endif
        }
#ifndef FSI_OPENMP
        it.detach();  // no longer need thread id - must detach (or join) to return cleanly
#endif
#ifdef FSI_MPI
        int status = MPI_Send(nullptr, 0, MPI_DOUBLE, ithread, do_finalize, MPI_COMM_WORLD);
        assert(status == MPI_SUCCESS);
#endif
        ithread++;
    }
    pugi::xml_node total_node = perfnode.append_child("totals");
    assert(total_node != nullptr);
    for (auto &mex: mex_totals_map) {
        pugi::xml_node mexnode = total_node.append_child(mex.first->mexname);
        assert(mexnode != nullptr);
        mexnode.append_attribute("elapsed_mus") = mex.second->elapsed.count();
        mexnode.append_attribute("count") = mex.second->call_count;
        mexnode.append_attribute("doublets") = mex.second->doublets;
        mexnode.append_attribute("nsyncs") = mex.second->nsyncs;
        mexnode.append_attribute("openmp_denied") = mex.second->openmp_denied;
    }
#endif

    if (daemon_mode) {
        doctop.save(cout, INDENT);
    } else {
        if (doctop.save_file(datafilepath.c_str(), INDENT)) {
            std::cerr << "Saved solution to " << datafilepath << std::endl;
        } else {
            std::cerr << "ERROR: could not save " << datafilepath << std::endl;
        }
    }
#ifdef FSI_MPI
    MPI_Finalize();
#endif
    //exit(2);   // dump profile data
    return 0;
}