/* $Id: onesided.c 1397 2006-12-13 16:29:40Z olau $
 *
 * Copyright (c) 2006 Oliver Lau <ola@ctmagazin.de>
 * Copyright (c) 2006 Heise Zeitschriften Verlag
 * Alle Rechte vorbehalten. All rights reserved.
 *
 */

#ifdef WIN32
#include <windows.h>
#endif

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include <gd.h>
#include <time.h>
#include "globaldefs.h"
#include "bzr.h"
#include "helper.h"

// Rank of the main process: it owns the complete world, distributes and
// gathers the sections, writes the images and prints the progress output
#define ROOT   (0)

// Message tag used for distributing the world and for gathering the results
#define TAG_DIST_GATHER  (31337)

// Default total number of iterations and default number of iterations
// computed between two gather steps
#define DEFAULT_T          (100)
#define DEFAULT_TICK         (1)

// Width and height of the world
#define WIDTH             (1280/2)
#define HEIGHT             (960/2)

// Maximum value of a cell
#define MAX_CELL_VALUE     (255)

// Influence of excited cells in the neighborhood
// on the state of a cell
#define DEFAULT_K1           (3)

// Influence of active cells in the neighborhood
// on the state of a cell
#define DEFAULT_K2           (3)

// Propagation speed of the excitation wave
#define DEFAULT_G           (39)

// Constants for the Moore neighborhood: number of neighbors and the
// index of each neighbor direction
#define MAX_NEIGHBORS        (8)
enum { N = 0, S, E, W, NW, SE, NE, SW };

// NOTE: both macros evaluate their arguments more than once, so do not
// pass expressions with side effects
#define MAX(a, b) ((a) > (b)? (a) : (b))
#define MIN(a, b) ((a) < (b)? (a) : (b))


/*
 * Describes the halo transfer to one neighboring process, i.e. the
 * arguments of one MPI_Put() call. All offsets are counted in ints
 * from the start of a section buffer including its halo.
 */
typedef struct _direction_t {
    int target_rank;          // rank of the neighbor that receives the data
    int target_offset;        // displacement inside the neighbor's window
    MPI_Datatype target_type; // layout of the data inside the neighbor's window
    int orig_offset;          // offset of the data inside the local section buffer
    MPI_Datatype orig_type;   // layout of the data inside the local section buffer
} direction_t;


int main(int argc, char *argv[]) {
    int myrank, size;
    int width; // Breite des Weltausschnitts
    int height; // Hoehe des Weltausschnitts
    int dims[2] = { 0, 0 }; // Anzahl Spalten/Zeilen je Dimension
    int periods[2] = { 1, 1 }; // Welt ist in beiden Dimensionen periodisch
    int cart_coords[2], coords[2];
    int *sec0, *sec0_o, *sec1, *sec1_o;
    int *tmpworld, *world = NULL;
    direction_t direction[8];
    MPI_Status stat;
    MPI_Request req;
    MPI_Comm comm;
    MPI_Win win0, win1;
    MPI_Datatype row_type;
    MPI_Datatype column_type;
    MPI_Datatype submatrix_type;
    MPI_Datatype section_type;

    int x, y;
    int xy[2], rank;
    int i;
    bool flip = FALSE;

    // Konstanten fuer Belousov-Zhabotinsky-Reaktion (BZR)
    int k1 = DEFAULT_K1;
    int k2 = DEFAULT_K2;
    int g  = DEFAULT_G;
    int n  = 255;
  
     // Ausmae der Welt
    int globalwidth = WIDTH;
    int globalheight = HEIGHT;

    int colors[MAX_CELL_VALUE+1];
    FILE *pngout;
    char pngname[100];
    int pngsize;
    BYTE *pngbuf = 0;
    gdImagePtr im = 0;

    int t = 0;
    double t0, dt;
    double t_sum = 0.0;
    double t_max = -1e38;
    double t_min = 1e38;

    int tmax = DEFAULT_T;
    int ttick = DEFAULT_TICK;
    bool gen_png = TRUE;

    if (argc > 1)
        tmax = atoi(argv[1]);
    if (argc > 2)
        ttick = atoi(argv[2]);
    if (argc > 3)
        gen_png = (atoi(argv[3]) != 0)? TRUE : FALSE;

    // MPI-Umgebung initialisieren
    MPI_Init(&argc, &argv);

    // Anzahl der Prozesse im Kommunikator ermitteln
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Anzahl der Abschnitte berechnen, in die die Welt in
    // horizontaler und vertikaler Richtung zerlegt werden soll
    MPI_Dims_create(size, 2, dims);

    // Einen neuen Kommunikator mit der 2D-Welt verknuepfen
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm);

    // eigene ID innerhalb der zweidimensionalen Welt bestimmen
    MPI_Comm_rank(comm, &myrank);

    // Breite und Hoehe der Welt muessen durch die Anzahl 
    // der Abschnitte in jeder Dimension teilbar sein, damit
    // die Abschnitte nahtlos aneinander liegen
    globalwidth  -= globalwidth  % dims[0];
    globalheight -= globalheight % dims[1];

    // Die Breite eines Abschnitts ergibt sich aus der Breite 
    // bzw. Hoehe der Welt geteilt durch die Anzahl der Abschnitte
    // in der jeweiligen Dimension
    width  = globalwidth  / dims[0];
    height = globalheight / dims[1];

    // Horizontales Halo
    MPI_Type_vector(1, width, width, MPI_INT, &row_type);
    MPI_Type_commit(&row_type);

    // Vertikales Halo
    MPI_Type_vector(height, 1, width+2, MPI_INT, &column_type);
    MPI_Type_commit(&column_type);

    // Rechteckiger Ausschnitt aus der Welt
    MPI_Type_vector(height, width, globalwidth, MPI_INT, &submatrix_type);
    MPI_Type_commit(&submatrix_type);

    // Rechteckiger Ausschnitt der Welt
    MPI_Type_vector(height, width, width+2, MPI_INT, &section_type);
    MPI_Type_commit(&section_type);

    // Speicher fuer eigenen Ausschnitt inklusive Halo belegen
    MPI_Alloc_mem((width+2)*(height+2)*sizeof(*sec0), MPI_INFO_NULL, &sec0_o);
    sec0 = sec0_o + 1 + (width + 2);
    MPI_Win_create(sec0_o, // Start des Speicherfensters 
                   (width+2)*(height+2)*sizeof(*sec0_o), // Groesse des Fensters ...
                   sizeof(*sec0_o), // ... in Einheiten zu sizeof(int) Bytes 
                   MPI_INFO_NULL, comm, &win0);

    MPI_Alloc_mem((width+2)*(height+2)*sizeof(*sec1), MPI_INFO_NULL, &sec1_o);
    sec1 = sec1_o + 1 + (width + 2);
    MPI_Win_create(sec1_o, // Start des Speicherfensters 
                   (width+2)*(height+2)*sizeof(*sec1_o), // Groesse des Fensters ...
                   sizeof(*sec1_o), // ... in Einheiten zu sizeof(int) Bytes 
                   MPI_INFO_NULL, comm, &win1);

    if (myrank == ROOT) {
        printf("Erzeugen der 2D-Welt mit %d Abschnitten: dims[]=(%d, %d)\n",
               size, dims[0], dims[1]);
        MPI_Alloc_mem(globalwidth * globalheight * sizeof(*world), MPI_INFO_NULL, &world);
        // Welt mit Zufallszahlen initialisieren
        srand((unsigned int) time(0));
        for (y = 0; y < globalheight; ++y)
            for ( x = 0; x < globalwidth; ++x)
                world[x + y * globalwidth] = rand() % (MAX_CELL_VALUE+1);
        im = gdImageCreate(globalwidth, globalheight);
        for (i = 0; i < MAX_CELL_VALUE+1; ++i) 
            colors[i] = gdImageColorAllocate(im, i, i, 255 - i);
        printf("Berechnen von %d Iterationen ..\n", tmax);
    }

    MPI_Cart_shift(comm, 0, +1, &direction[W].target_rank, &direction[E].target_rank);
    direction[W].orig_type     = column_type;
    direction[W].target_type   = column_type;
    direction[W].orig_offset   = width + 2 + 1;
    direction[W].target_offset = width + 2 + width + 1;
    direction[E].orig_type     = column_type;
    direction[E].target_type   = column_type;
    direction[E].orig_offset   = width + 2 + width;
    direction[E].target_offset = width + 2;

    MPI_Cart_shift(comm, 1, +1, &direction[N].target_rank, &direction[S].target_rank);
    direction[N].orig_type     = row_type;
    direction[N].target_type   = row_type;
    direction[N].orig_offset   = width + 2 + 1;
    direction[N].target_offset = (height + 1) * (width + 2) + 1; 
    direction[S].orig_type     = row_type;
    direction[S].target_type   = row_type;
    direction[S].orig_offset   = (height) * (width + 2) + 1;
    direction[S].target_offset = 1;

    // Die kartesischen Koordinaten des zugeteilten Abschnitts ermitteln
    MPI_Cart_coords(comm, myrank, 2, cart_coords);

    coords[0] = cart_coords[0] - 1;
    coords[1] = cart_coords[1] - 1;
    MPI_Cart_rank(comm, coords, &direction[NW].target_rank);
    direction[NW].orig_type     = MPI_INT;
    direction[NW].target_type   = MPI_INT;
    direction[NW].orig_offset   = width + 2 + 1;
    direction[NW].target_offset = (height + 1) * (width + 2) + width + 1;

    coords[0] = cart_coords[0] + 1;
    coords[1] = cart_coords[1] - 1;
    MPI_Cart_rank(comm, coords, &direction[NE].target_rank);
    direction[NE].orig_type     = MPI_INT;
    direction[NE].target_type   = MPI_INT;
    direction[NE].orig_offset   = width + 2 + width;
    direction[NE].target_offset = (height + 1) * (width + 2);

    coords[0] = cart_coords[0] - 1;
    coords[1] = cart_coords[1] + 1;
    MPI_Cart_rank(comm, coords, &direction[SW].target_rank);
    direction[SW].orig_type     = MPI_INT;
    direction[SW].target_type   = MPI_INT;
    direction[SW].orig_offset   = height * (width + 2) + 1;
    direction[SW].target_offset = width + 1;

    coords[0] = cart_coords[0] + 1;
    coords[1] = cart_coords[1] + 1;
    MPI_Cart_rank(comm, coords, &direction[SE].target_rank);
    direction[SE].orig_type     = MPI_INT;
    direction[SE].target_type   = MPI_INT;
    direction[SE].orig_offset   = height * (width + 2) + width;
    direction[SE].target_offset = 0;

#ifdef DEBUG
    printf("Job %d arbeitet an Block [ %d, %d ]\n",
           myrank, cart_coords[0], cart_coords[1]);
    fflush(stdout);
#endif

    // Welt abschnittsweise an Knoten verteilen
    MPI_Irecv(sec0, 1, section_type, ROOT, TAG_DIST_GATHER, comm, &req);
    if (myrank == ROOT) {
        for (xy[1] = 0; xy[1] < dims[1]; ++xy[1]) {
            for (xy[0] = 0; xy[0] < dims[0]; ++xy[0]) {
                MPI_Cart_rank(comm, xy, &rank);
                MPI_Send(world + xy[0] * width + xy[1] * height * globalwidth,
                         1, submatrix_type, rank, TAG_DIST_GATHER,
                         comm);
            }
        }
    }
    // auf Ende von MPI_Irecv() warten
    MPI_Wait(&req, &stat);

    do {
        if (myrank == ROOT) {
            printf("\r%7d ", t);
            fflush(stdout);
            if (gen_png) {
                sprintf(pngname, "bzr-%06d.png", t);
                pngout = fopen(pngname, "wb+");
                if (pngout == 0)
                    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
                for (x = 0; x < globalwidth; ++x)
                    for (y = 0; y < globalheight; ++y)
                        gdImageSetPixel(im, x, y, colors[world[x + y * globalwidth]]);
                pngbuf = (BYTE *) gdImagePngPtr(im, &pngsize);
                fwrite(pngbuf, sizeof(*pngbuf), pngsize, pngout);
                fclose(pngout);
                gdFree((void *) pngbuf);
            }
        }

        t0 = MPI_Wtime();
        for (dt = 0; dt < ttick; ++dt) {
            MPI_Win_fence(MPI_MODE_NOPRECEDE, (flip)? win1 : win0);
            for (i = 0; i < MAX_NEIGHBORS; ++i) {
                MPI_Put(sec0_o + direction[i].orig_offset,
                        1,
                        direction[i].orig_type, 
                        direction[i].target_rank,
                        direction[i].target_offset,
                        1,
                        direction[i].target_type,
                        (flip)? win1 : win0);
            }
            MPI_Win_fence(MPI_MODE_NOSTORE |  MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, (flip)? win1 : win0);
            BZR_iterate(sec0, sec1, width, height, k1, k2, g, n);

            // Matrizen tauschen
            tmpworld = sec0;
            sec0 = sec1;
            sec1 = tmpworld;
            tmpworld = sec0_o;
            sec0_o = sec1_o;
            sec1_o = tmpworld;
            flip = !flip;
        }
        t += ttick;

        // Berechnung fertig, Ergebnis an Hauptprozess schicken
        MPI_Isend(sec0, 1, section_type, ROOT, TAG_DIST_GATHER, comm, &req);
        
        // Der Hauptprozess sammelt die Ergebnisse ein ...
        if (myrank == ROOT) {
            int xy[2], rank;
            for (xy[1] = 0; xy[1] < dims[1]; ++xy[1]) {
                for (xy[0] = 0; xy[0] < dims[0]; ++xy[0]) {
                    MPI_Cart_rank(comm, xy, &rank);
                    MPI_Recv(world + xy[0] * width + xy[1] * globalwidth * height,
                             1, submatrix_type, rank, TAG_DIST_GATHER,
                             comm, &stat);
                }
            }
        }

        // Warten, bis Hauptprozess die Daten empfangen hat
        MPI_Wait(&req, &stat);

        t0 = MPI_Wtime() - t0;
        t_max = MAX(t_max, t0);
        t_min = MIN(t_min, t0);
        t_sum += t0;
        if (myrank == ROOT)
            printf("  %lg ms", 1000 * t0);
    }
    while (t < tmax);

    if (myrank == ROOT) {
        gdImageDestroy(im);
        MPI_Free_mem(world);
        printf("\nmin./mittl./max. Zeit je %d Iteration%s:"
               " %lg / %lg / %lg ms",
               ttick, (ttick == 1)? "" : "en",
               1000 * t_min,
               1000 * t_sum / tmax * ttick,
               1000 * t_max);
        printf("\nFertig.\n");
    }

    /* Aufraeumen */
    MPI_Free_mem(&sec0_o);
    MPI_Free_mem(&sec1_o);
    MPI_Type_free(&submatrix_type);
    MPI_Type_free(&section_type);
    MPI_Type_free(&column_type);
    MPI_Type_free(&row_type);
    MPI_Comm_free(&comm);
    MPI_Win_free(&win0);
    MPI_Win_free(&win1);

    MPI_Finalize();
    return EXIT_SUCCESS;
}
