/* Last edited on 2005-06-05 14:09:00 by stolfi */ /* Tests SMP/NUMA architecture performance */ #ifdef __alpha_osf #define tt_version "dec" #define NEEDS_PTHR_EXC_H 1 #endif #ifdef __alpha_linux #define tt_version "dlx" #define NEEDS_LLD_FOR_INT64 1 #define NEEDS_LD_FOR_SIZEOF 1 #define NEEDS_STRTOLL_FOR_INT64 1 #define INCLUDE_TMS_CUTIME 1 #define NEEDS_PTHREAD_ATTR 1 #define USE_GETRUSAGE 1 #endif #ifdef __xeon_linux #define tt_version "xeo" #define NEEDS_LLD_FOR_INT64 1 #define NEEDS_STRTOLL_FOR_INT64 1 #define INCLUDE_TMS_CUTIME 1 #define NEEDS_PTHREAD_ATTR 1 #define USE_GETRUSAGE 1 #endif #ifdef __intel_linux #define tt_version "ilx" #define NEEDS_LLD_FOR_INT64 1 #define NEEDS_STRTOLL_FOR_INT64 1 #define INCLUDE_TMS_CUTIME 1 #define NEEDS_PTHREAD_ATTR 1 #define USE_GETRUSAGE 1 #endif #ifdef __sgi #define tt_version "sgi" #define NEEDS_REENTRANT 1 #define NEEDS_LLD_FOR_INT64 1 #define NEEDS_STRTOLL_FOR_INT64 1 #define NEEDS_PTHREAD_ATTR 1 #endif #ifdef __sun #define tt_version "sun" #define NEEDS_REENTRANT 1 #define NEEDS_TREADS_H 1 #define NEEDS_LLD_FOR_INT64 1 #define NEEDS_STRTOLL_FOR_INT64 1 #define NEEDS_SETCONCURRENCY 1 #define NEEDS_PTHREAD_ATTR 1 #endif #ifdef NEEDS_REENTRANT #define _REENTRANT 1 #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef NEEDS_PTHR_EXC_H #include #endif #ifdef NEEDS_TREADS_H #include #endif #include #include #include typedef long long int64; typedef int bool_t; /* Data shared by all threads: */ typedef struct { /* Read-only data: */ int rows; /* Array rows. */ int rstep; /* Increment between rows. */ int cols; /* Array columns. */ int cstep; /* Increment between columns. */ int pas; /* Passes per iteration. */ bool_t noisy; /* TRUE to print trail of sync operations. */ /* Main array: */ double *a; /* Elements are "a[0..rows*cols-1]". */ /* Pivot queue: */ int *q; /* The pivot queue "q[0..mq-1]", cyclic. */ int mq; /* Number of slots in queue vector (constant). */ int nq; /* Number of available pivots. */ int fq; /* First available pivot is "q[fq]", last is "q[((fq+nq)-1)%mq]". */ /* Iteration control: */ int nit; /* Iterations remaining. */ int64 nop; /* Number of operations actually performed */ /* Synchronization tools: */ pthread_cond_t qc; /* Condition: "nq" incremented or "nit" decremented. */ pthread_mutex_t mu; /* Protects "nq", "fq", "*q", "nit", "nop", and "stderr"; */ } TData; #define assert(test, msg) \ { if (!(test)) programerror((msg), __FILE__, __LINE__); } #define check_return(test, msg) \ { if (!(test)) returnerror((msg), __FILE__, __LINE__); } /* Prototypes: */ int main(int argc, char **argv); void parse_args( int argc, char **argv, int *wset, int *mbytes, int *mops, int *trn, int *nth ); void print_args(int argc, char **argv); void init_shared_data(TData *gp, int wset, int mbytes, int mops, bool_t trn); void init_array(int rows, int rstep, int cols, int cstep, double *a); void print_result(TData *gp); void masticate_array(TData *gp, int nth, double *etimep, double *ptimep); void *worker(void *vp); void prepare_threads(int nth); pthread_t *fork_threads(TData *gp, int nth); void join_threads(TData *gp, int nth, pthread_t *t); void grab_pivots(TData *gp, int *pp, int *qp, int delta, bool_t noisy); void release_pivots(TData *gp, int p, int q, bool_t noisy); void update_nop(TData *gp, int64 nop); void check_int_sizes(); void print_resource_limits(); int64 parse_int_arg(char *str, char *name, int64 min, int64 max); void arg_error (char *msg); void programerror (char *msg, char *file, unsigned int line); void returnerror (char *msg, char *file, unsigned int line); char *txtcat (char *a, char *b); char *today(void); void read_clock(double *etime, double *ptime); void start_clock(double *etime, double *ptime); void stop_clock(double *etime, double *ptime); /* Main thread: */ int main(int argc, char **argv) { int wset, mbytes; int mops; bool_t trn; int nth; static TData g; double etime, ptime; fprintf(stderr, "--- %s ----------------------------------------------------------\n", tt_version ); fprintf(stderr, "%s\n", today()); check_int_sizes(); print_resource_limits(); parse_args(argc, argv, &wset, &mbytes, &mops, &trn, &nth); fprintf(stderr, "arguments:\n"); fprintf(stderr, " wset = %d mbytes = %d\n", wset, mbytes); fprintf(stderr, " mops = %d trn = %d nth = %d\n", mops, trn, nth); init_shared_data(&g, wset, mbytes, mops, trn); masticate_array(&g, nth, &etime, &ptime); print_args(argc, argv); fprintf(stderr, #ifdef NEEDS_LF_FOR_DOUBLE " %4.0lf Mops %4.0lf MB", #else " %4.0f Mops %4.0f MB", #endif ((double)g.nop)/1.0e6, ((double)g.rows)*((double)g.cols)*sizeof(double)/1.0e6 ); fprintf(stderr, #ifdef NEEDS_LF_FOR_DOUBLE " %6.1lf elapsed %6.1lf totcpu\n", #else " %6.1f elapsed %6.1f totcpu\n", #endif etime, ptime ); print_result(&g); fprintf(stderr, "%s\n", today()); fprintf(stderr, "------------------------------------------------------------------\n"); return(0); } void print_result(TData *gp) { int rows = gp->rows; int rstep = gp->rstep; int cols = gp->cols; int cstep = gp->cstep; int r = 1; double *ap = &(gp->a[r*rstep]); double s = 0.0; int c; affirm(r < rows, "bad row"); for (c=0; ccols; int rstep = gp->rstep; int cstep = gp->cstep; int pas = gp->pas; bool_t noisy = gp->noisy; double *a = gp->a; int p, q, c, m; double *ap, *aq; int64 nop = 0; int delta = 0; while(1) { grab_pivots(gp, &p, &q, delta, noisy); if ((p < 0) || (q < 0)) { update_nop(gp, nop); return(NULL); } for (m=0; mrows = rows; gp->cols = cols; asz = nel*sizeof(double); fprintf(stderr, #ifdef NEEDS_LLD_FOR_INT64 "rows = %d cols = %d elems = %lld bytes = %lld\n", #else "rows = %d cols = %d elems = %ld bytes = %ld\n", #endif rows, cols, nel, asz ); gp->a = (double*) malloc(asz); check_return(gp->a != NULL, "malloc of \"a\" failed"); /* Set indexing parameters: */ if (trn) { rstep = 1; cstep = rows; } else { rstep = cols; cstep = 1; } gp->rstep = rstep; gp->cstep = cstep; /* Initialize array: */ init_array(rows, rstep, cols, cstep, gp->a); /* Allocate and initialize queue: */ { int r; gp->mq = rows; gp->q = (int*) malloc(gp->mq * sizeof(int)); check_return(gp->q != NULL, "malloc of \"q\" failed"); for(r=0; rq[r] = r; } gp->nq = rows; gp->fq = 0; } /* Initialize iteration count and passes per iteration: */ { int64 ops = ((int64)mops)*1000000; int64 pas; int64 nit; /* Ideally, we should have about one iteration per row. */ /* Since each iteration does "pas*cols" operations, we should have: */ pas = (ops + nel/2) / nel; /* However, we need at least MIN_PASSES passes over each working set: */ if (pas < MIN_PASSES) { pas = MIN_PASSES; } /* Also, each iteration must do at least MIN_ITER_OPS ops: */ if (pas*cols < MIN_ITER_OPS) { pas = (MIN_ITER_OPS + cols - 1)/cols; } /* Recompute the number of iterations to give the desired operation count: */ nit = (ops + (pas*cols)/2) / (pas*cols); /* We need at least MIN_ITERATIONS iterations to properly exercise all threads: */ if (nit < MIN_ITERATIONS) { nit = MIN_ITERATIONS; } gp->nit = nit; gp->pas = pas; gp->noisy = 1; fprintf(stderr, "iterations = %d passes/iteration = %d\n", gp->nit, gp->pas); fprintf(stderr, #ifdef NEEDS_LLD_FOR_INT64 "ops per iteration = %lld total ops = %lld\n", #else "ops per iteration = %ld total ops = %ld\n", #endif pas*cols, pas*nit*cols ); gp->nop = 0; } /* Initialize mutex and condition: */ pthread_cond_init(&(gp->qc), NULL); pthread_mutex_init(&(gp->mu), NULL); } void init_array(int rows, int rstep, int cols, int cstep, double *a) { int64 nel = ((int64)rows)*((int64)cols); int64 i; double etime = 0.0; double ptime = 0.0; double phi; double s; int r, c; double *ap; fprintf(stderr, "initializing array..."); start_clock(&etime, &ptime); phi = (sqrt(5.0) - 1.0)/2.0; s = phi; ap = a; for(i=0; i= 1.0) { s -= 1.0; } (*ap) = s; s += phi; ap++; } stop_clock(&etime, &ptime); fprintf(stderr, " %.3f elapsed, %.3f totcpu\n", etime, ptime); /* Normalize rows: */ fprintf(stderr, "normalizing rows..."); start_clock(&etime, &ptime); for(r=0; rnoisy) { fprintf(stderr, "\n"); } for(i=0; inoisy) { pthread_mutex_lock(&(gp->mu)); fprintf(stderr, "thread %d created\n", i); pthread_mutex_unlock(&(gp->mu)); } } return(t); } void join_threads(TData *gp, int nth, pthread_t *t) { int i, s; void *tr; for(i=0; inoisy) { fprintf(stderr, "\n"); } } void grab_pivots(TData *gp, int *pp, int *qp, int delta, bool_t noisy) /* If "gp->nit" is positive, decrements it, then removes two pivots from the queue, returns them in "*pp" and "*qp". If "gp->nit" is zero, returns "-1,-1" instead. */ { int mq = gp->mq; int k; int *sp; pthread_mutex_lock(&(gp->mu)); while((gp->nit > 0) && (gp->nq < 2)) { pthread_cond_wait(&(gp->qc), &(gp->mu)); } if (gp->nit <= 0) { (*pp) = -1; (*qp) = -1; if (noisy) { putc_unlocked('!', stderr); } } else { if (noisy) { putc_unlocked('/', stderr); } gp->nit--; /* Remove front element: */ (*pp) = (gp->q)[gp->fq]; gp->fq = (gp->fq + 1) % mq; gp->nq--; /* Remove element "delta" among those that remain: */ k = delta % gp->nq; sp = &((gp->q)[(gp->fq + k) % mq]); (*qp) = (*sp); if (k != 0) { (*sp) = (gp->q)[gp->fq]; } gp->fq = (gp->fq + 1) % mq; gp->nq--; pthread_cond_broadcast(&(gp->qc)); } /* if (noisy) { fprintf(stderr, "(%d,%d)", (*pp), (*qp)); } */ pthread_mutex_unlock(&(gp->mu)); } void release_pivots(TData *gp, int p, int q, bool_t noisy) /* Returns the pivots "p" and "q" to the queue. */ { int mq = gp->mq; pthread_mutex_lock(&(gp->mu)); if (noisy) { putc_unlocked('\\', stderr); } (gp->q)[(gp->fq + gp->nq - 1) % mq] = p; gp->nq++; (gp->q)[(gp->fq + gp->nq - 1) % mq] = p; gp->nq++; pthread_cond_broadcast(&(gp->qc)); pthread_mutex_unlock(&(gp->mu)); } void update_nop(TData *gp, int64 nop) { pthread_mutex_lock(&(gp->mu)); gp->nop += nop; pthread_mutex_unlock(&(gp->mu)); } void check_int_sizes() { /* Check integer data type sizes */ #ifdef NEEDS_LD_FOR_SIZEOF fprintf(stderr, "int = %ld ", sizeof(int)); fprintf(stderr, "long = %ld ", sizeof(long)); fprintf(stderr, "long long = %ld\n", sizeof(long long)); #else fprintf(stderr, "int = %d ", sizeof(int)); fprintf(stderr, "long = %d ", sizeof(long)); fprintf(stderr, "long long = %d\n", sizeof(long long)); #endif affirm(sizeof(int) >= 4, "int too small"); affirm(sizeof(int64) >= 8, "int64 too small"); } void print_resource_limits() { struct rlimit r; fprintf(stderr, "resource limits\n"); #ifdef RLIM_INFINITY fprintf(stderr, " RLIM_INFINITY = %lu\n", RLIM_INFINITY); #endif #ifdef RLIM_SAVED_MAX fprintf(stderr, " RLIM_SAVED_MAX = %lu\n", RLIM_SAVED_MAX); #endif #ifdef RLIM_SAVED_CUR fprintf(stderr, " RLIM_SAVED_CUR = %lu\n", RLIM_SAVED_CUR); #endif getrlimit(RLIMIT_AS, &r); fprintf(stderr, " maximum address space size (RLIMIT_AS): soft = %lu hard = %lu\n", r.rlim_cur, r.rlim_max ); getrlimit(RLIMIT_DATA, &r); fprintf(stderr, " maximum data area size (RLIMIT_DATA): soft = %lu hard = %lu\n", r.rlim_cur, r.rlim_max ); #ifdef RLIMIT_RSS getrlimit(RLIMIT_RSS, &r); fprintf(stderr, " maximum resident set size (RLIMIT_RSS): soft = %lu hard = %lu\n", r.rlim_cur, r.rlim_max ); #endif getrlimit(RLIMIT_STACK, &r); fprintf(stderr, " maximum stack size (RLIMIT_STACK): soft = %lu hard = %lu\n", r.rlim_cur, r.rlim_max ); } void parse_args( int argc, char **argv, int *wset, int *mbytes, int *mops, bool_t *trn, int *nth ) { /* Parse arguments */ if(argc != 6) { arg_error("wrong number of arguments"); } (*wset) = parse_int_arg(argv[1], "working set size", 1, 1000000); (*mbytes) = parse_int_arg(argv[2], "total mem size", 1, 1000000); (*mops) = parse_int_arg(argv[3], "megaops", 1, 10000); (*trn) = parse_int_arg(argv[4], "transposed", 0, 1); (*nth) = parse_int_arg(argv[5], "threads", 1, 64); } void print_args(int argc, char **argv) { int i; fprintf(stderr, "%s", argv[0]); for(i=1; i max) || (*bad != '\000') || (errno == ERANGE)) { arg_error(txtcat("bad ", name)); } return (r); } void arg_error (char *msg) { fprintf (stderr, "*** %s\n", msg); exit(1); } void programerror (char *msg, char *file, unsigned int line) { fprintf (stderr, "*** %s:%u: %s\n", file, line, msg); exit(1); } void returnerror (char *msg, char *file, unsigned int line) { fprintf (stderr, "*** %s:%u: %s\n", file, line, msg); perror ("*** errno"); exit(1); } char *txtcat (char *a, char *b) { char *r = malloc(strlen(a)+strlen(b)+1); affirm (r != NULL, "memory exhausted"); strcpy(r, a); strcat(r, b); return(r); } void read_clock(double *etime, double *ptime) { struct tms times_buf; #ifdef USE_GETRUSAGE struct rusage rus_me, rus_ch; int retcode; #endif clock_t et; double tck; et = times(×_buf); #ifdef USE_GETRUSAGE retcode = getrusage(RUSAGE_SELF, &rus_me); retcode = getrusage(RUSAGE_CHILDREN, &rus_ch); #endif tck = (double)sysconf(_SC_CLK_TCK); affirm(tck != 0, "bad tck!"); (*etime) = ((double) et)/tck; #ifdef USE_GETRUSAGE (*ptime) = ((double) (rus_me.ru_utime.tv_sec + rus_ch.ru_utime.tv_sec)) + ((double) (rus_me.ru_utime.tv_usec + rus_ch.ru_utime.tv_usec))/1.0e6; #else #ifdef INCLUDE_TMS_CUTIME (*ptime) = ((double) (times_buf.tms_utime + times_buf.tms_cutime))/tck; #else (*ptime) = ((double) times_buf.tms_utime)/tck; #endif #endif } void start_clock(double *etime, double *ptime) { double eck, pck; read_clock(&eck, &pck); (*etime) -= eck; (*ptime) -= pck; } void stop_clock(double *etime, double *ptime) { double eck, pck; read_clock(&eck, &pck); (*etime) += eck; (*ptime) += pck; } char *today(void) { time_t today_secs = time(NULL); struct tm today; char *buf = (char *) malloc(20); today = *localtime(&today_secs); sprintf(buf, "%02d-%02d-%02d %02d:%02d:%02d", today.tm_year % 100, today.tm_mon, today.tm_mday, today.tm_hour, today.tm_min, today.tm_sec ); return(buf); }