From 57f0f512b273f60d52568b8c6b77e17f5636edc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Fabian=20Silva=20Delgado?= Date: Wed, 5 Aug 2015 17:04:01 -0300 Subject: Initial import --- tools/perf/bench/Build | 11 + tools/perf/bench/bench.h | 48 + tools/perf/bench/futex-hash.c | 215 ++++ tools/perf/bench/futex-requeue.c | 212 ++++ tools/perf/bench/futex-wake.c | 198 +++ tools/perf/bench/futex.h | 84 ++ tools/perf/bench/mem-memcpy-arch.h | 12 + tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 12 + tools/perf/bench/mem-memcpy-x86-64-asm.S | 10 + tools/perf/bench/mem-memcpy.c | 434 +++++++ tools/perf/bench/mem-memset-arch.h | 12 + tools/perf/bench/mem-memset-x86-64-asm-def.h | 12 + tools/perf/bench/mem-memset-x86-64-asm.S | 11 + tools/perf/bench/numa.c | 1752 ++++++++++++++++++++++++++ tools/perf/bench/sched-messaging.c | 331 +++++ tools/perf/bench/sched-pipe.c | 184 +++ 16 files changed, 3538 insertions(+) create mode 100644 tools/perf/bench/Build create mode 100644 tools/perf/bench/bench.h create mode 100644 tools/perf/bench/futex-hash.c create mode 100644 tools/perf/bench/futex-requeue.c create mode 100644 tools/perf/bench/futex-wake.c create mode 100644 tools/perf/bench/futex.h create mode 100644 tools/perf/bench/mem-memcpy-arch.h create mode 100644 tools/perf/bench/mem-memcpy-x86-64-asm-def.h create mode 100644 tools/perf/bench/mem-memcpy-x86-64-asm.S create mode 100644 tools/perf/bench/mem-memcpy.c create mode 100644 tools/perf/bench/mem-memset-arch.h create mode 100644 tools/perf/bench/mem-memset-x86-64-asm-def.h create mode 100644 tools/perf/bench/mem-memset-x86-64-asm.S create mode 100644 tools/perf/bench/numa.c create mode 100644 tools/perf/bench/sched-messaging.c create mode 100644 tools/perf/bench/sched-pipe.c (limited to 'tools/perf/bench') diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build new file mode 100644 index 000000000..5ce98023d --- /dev/null +++ b/tools/perf/bench/Build @@ -0,0 +1,11 @@ +perf-y += sched-messaging.o +perf-y += sched-pipe.o +perf-y += mem-memcpy.o +perf-y += futex-hash.o +perf-y += futex-wake.o +perf-y += futex-requeue.o + +perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o +perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o + +perf-$(CONFIG_NUMA) += numa.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h new file mode 100644 index 000000000..3c4dd44d4 --- /dev/null +++ b/tools/perf/bench/bench.h @@ -0,0 +1,48 @@ +#ifndef BENCH_H +#define BENCH_H + +/* + * The madvise transparent hugepage constants were added in glibc + * 2.13. For compatibility with older versions of glibc, define these + * tokens if they are not already defined. + * + * PA-RISC uses different madvise values from other architectures and + * needs to be special-cased. + */ +#ifdef __hppa__ +# ifndef MADV_HUGEPAGE +# define MADV_HUGEPAGE 67 +# endif +# ifndef MADV_NOHUGEPAGE +# define MADV_NOHUGEPAGE 68 +# endif +#else +# ifndef MADV_HUGEPAGE +# define MADV_HUGEPAGE 14 +# endif +# ifndef MADV_NOHUGEPAGE +# define MADV_NOHUGEPAGE 15 +# endif +#endif + +extern int bench_numa(int argc, const char **argv, const char *prefix); +extern int bench_sched_messaging(int argc, const char **argv, const char *prefix); +extern int bench_sched_pipe(int argc, const char **argv, const char *prefix); +extern int bench_mem_memcpy(int argc, const char **argv, + const char *prefix __maybe_unused); +extern int bench_mem_memset(int argc, const char **argv, const char *prefix); +extern int bench_futex_hash(int argc, const char **argv, const char *prefix); +extern int bench_futex_wake(int argc, const char **argv, const char *prefix); +extern int bench_futex_requeue(int argc, const char **argv, const char *prefix); + +#define BENCH_FORMAT_DEFAULT_STR "default" +#define BENCH_FORMAT_DEFAULT 0 +#define BENCH_FORMAT_SIMPLE_STR "simple" +#define BENCH_FORMAT_SIMPLE 1 + +#define BENCH_FORMAT_UNKNOWN -1 + +extern int bench_format; +extern unsigned int bench_repeat; + +#endif diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c new file mode 100644 index 000000000..fc9bebd2c --- /dev/null +++ b/tools/perf/bench/futex-hash.c @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2013 Davidlohr Bueso + * + * futex-hash: Stress the hell out of the Linux kernel futex uaddr hashing. + * + * This program is particularly useful for measuring the kernel's futex hash + * table/function implementation. In order for it to make sense, use with as + * many threads and futexes as possible. + */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/stat.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "bench.h" +#include "futex.h" + +#include +#include +#include +#include + +static unsigned int nthreads = 0; +static unsigned int nsecs = 10; +/* amount of futexes per thread */ +static unsigned int nfutexes = 1024; +static bool fshared = false, done = false, silent = false; +static int futex_flag = 0; + +struct timeval start, end, runtime; +static pthread_mutex_t thread_lock; +static unsigned int threads_starting; +static struct stats throughput_stats; +static pthread_cond_t thread_parent, thread_worker; + +struct worker { + int tid; + u_int32_t *futex; + pthread_t thread; + unsigned long ops; +}; + +static const struct option options[] = { + OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), + OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"), + OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per threads"), + OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_END() +}; + +static const char * const bench_futex_hash_usage[] = { + "perf bench futex hash ", + NULL +}; + +static void *workerfn(void *arg) +{ + int ret; + unsigned int i; + struct worker *w = (struct worker *) arg; + + pthread_mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + pthread_cond_signal(&thread_parent); + pthread_cond_wait(&thread_worker, &thread_lock); + pthread_mutex_unlock(&thread_lock); + + do { + for (i = 0; i < nfutexes; i++, w->ops++) { + /* + * We want the futex calls to fail in order to stress + * the hashing of uaddr and not measure other steps, + * such as internal waitqueue handling, thus enlarging + * the critical region protected by hb->lock. + */ + ret = futex_wait(&w->futex[i], 1234, NULL, futex_flag); + if (!silent && + (!ret || errno != EAGAIN || errno != EWOULDBLOCK)) + warn("Non-expected futex return call"); + } + } while (!done); + + return NULL; +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + /* inform all threads that we're done for the day */ + done = true; + gettimeofday(&end, NULL); + timersub(&end, &start, &runtime); +} + +static void print_summary(void) +{ + unsigned long avg = avg_stats(&throughput_stats); + double stddev = stddev_stats(&throughput_stats); + + printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", + !silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), + (int) runtime.tv_sec); +} + +int bench_futex_hash(int argc, const char **argv, + const char *prefix __maybe_unused) +{ + int ret = 0; + cpu_set_t cpu; + struct sigaction act; + unsigned int i, ncpus; + pthread_attr_t thread_attr; + struct worker *worker = NULL; + + argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0); + if (argc) { + usage_with_options(bench_futex_hash_usage, options); + exit(EXIT_FAILURE); + } + + ncpus = sysconf(_SC_NPROCESSORS_ONLN); + + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + if (!nthreads) /* default to the number of CPUs */ + nthreads = ncpus; + + worker = calloc(nthreads, sizeof(*worker)); + if (!worker) + goto errmem; + + if (!fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n", + getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs); + + init_stats(&throughput_stats); + pthread_mutex_init(&thread_lock, NULL); + pthread_cond_init(&thread_parent, NULL); + pthread_cond_init(&thread_worker, NULL); + + threads_starting = nthreads; + pthread_attr_init(&thread_attr); + gettimeofday(&start, NULL); + for (i = 0; i < nthreads; i++) { + worker[i].tid = i; + worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex)); + if (!worker[i].futex) + goto errmem; + + CPU_ZERO(&cpu); + CPU_SET(i % ncpus, &cpu); + + ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu); + if (ret) + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + + ret = pthread_create(&worker[i].thread, &thread_attr, workerfn, + (void *)(struct worker *) &worker[i]); + if (ret) + err(EXIT_FAILURE, "pthread_create"); + + } + pthread_attr_destroy(&thread_attr); + + pthread_mutex_lock(&thread_lock); + while (threads_starting) + pthread_cond_wait(&thread_parent, &thread_lock); + pthread_cond_broadcast(&thread_worker); + pthread_mutex_unlock(&thread_lock); + + sleep(nsecs); + toggle_done(0, NULL, NULL); + + for (i = 0; i < nthreads; i++) { + ret = pthread_join(worker[i].thread, NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + + /* cleanup & report results */ + pthread_cond_destroy(&thread_parent); + pthread_cond_destroy(&thread_worker); + pthread_mutex_destroy(&thread_lock); + + for (i = 0; i < nthreads; i++) { + unsigned long t = worker[i].ops/runtime.tv_sec; + update_stats(&throughput_stats, t); + if (!silent) { + if (nfutexes == 1) + printf("[thread %2d] futex: %p [ %ld ops/sec ]\n", + worker[i].tid, &worker[i].futex[0], t); + else + printf("[thread %2d] futexes: %p ... %p [ %ld ops/sec ]\n", + worker[i].tid, &worker[i].futex[0], + &worker[i].futex[nfutexes-1], t); + } + + free(worker[i].futex); + } + + print_summary(); + + free(worker); + return ret; +errmem: + err(EXIT_FAILURE, "calloc"); +} diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c new file mode 100644 index 000000000..ad0d9b534 --- /dev/null +++ b/tools/perf/bench/futex-requeue.c @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2013 Davidlohr Bueso + * + * futex-requeue: Block a bunch of threads on futex1 and requeue them + * on futex2, N at a time. + * + * This program is particularly useful to measure the latency of nthread + * requeues without waking up any tasks -- thus mimicking a regular futex_wait. + */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/stat.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "bench.h" +#include "futex.h" + +#include +#include +#include +#include + +static u_int32_t futex1 = 0, futex2 = 0; + +/* + * How many tasks to requeue at a time. + * Default to 1 in order to make the kernel work more. + */ +static unsigned int nrequeue = 1; + +static pthread_t *worker; +static bool done = false, silent = false, fshared = false; +static pthread_mutex_t thread_lock; +static pthread_cond_t thread_parent, thread_worker; +static struct stats requeuetime_stats, requeued_stats; +static unsigned int ncpus, threads_starting, nthreads = 0; +static int futex_flag = 0; + +static const struct option options[] = { + OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), + OPT_UINTEGER('q', "nrequeue", &nrequeue, "Specify amount of threads to requeue at once"), + OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_END() +}; + +static const char * const bench_futex_requeue_usage[] = { + "perf bench futex requeue ", + NULL +}; + +static void print_summary(void) +{ + double requeuetime_avg = avg_stats(&requeuetime_stats); + double requeuetime_stddev = stddev_stats(&requeuetime_stats); + unsigned int requeued_avg = avg_stats(&requeued_stats); + + printf("Requeued %d of %d threads in %.4f ms (+-%.2f%%)\n", + requeued_avg, + nthreads, + requeuetime_avg/1e3, + rel_stddev_stats(requeuetime_stddev, requeuetime_avg)); +} + +static void *workerfn(void *arg __maybe_unused) +{ + pthread_mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + pthread_cond_signal(&thread_parent); + pthread_cond_wait(&thread_worker, &thread_lock); + pthread_mutex_unlock(&thread_lock); + + futex_wait(&futex1, 0, NULL, futex_flag); + return NULL; +} + +static void block_threads(pthread_t *w, + pthread_attr_t thread_attr) +{ + cpu_set_t cpu; + unsigned int i; + + threads_starting = nthreads; + + /* create and block all threads */ + for (i = 0; i < nthreads; i++) { + CPU_ZERO(&cpu); + CPU_SET(i % ncpus, &cpu); + + if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu)) + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + + if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) + err(EXIT_FAILURE, "pthread_create"); + } +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + done = true; +} + +int bench_futex_requeue(int argc, const char **argv, + const char *prefix __maybe_unused) +{ + int ret = 0; + unsigned int i, j; + struct sigaction act; + pthread_attr_t thread_attr; + + argc = parse_options(argc, argv, options, bench_futex_requeue_usage, 0); + if (argc) + goto err; + + ncpus = sysconf(_SC_NPROCESSORS_ONLN); + + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + if (!nthreads) + nthreads = ncpus; + + worker = calloc(nthreads, sizeof(*worker)); + if (!worker) + err(EXIT_FAILURE, "calloc"); + + if (!fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + if (nrequeue > nthreads) + nrequeue = nthreads; + + printf("Run summary [PID %d]: Requeuing %d threads (from [%s] %p to %p), " + "%d at a time.\n\n", getpid(), nthreads, + fshared ? "shared":"private", &futex1, &futex2, nrequeue); + + init_stats(&requeued_stats); + init_stats(&requeuetime_stats); + pthread_attr_init(&thread_attr); + pthread_mutex_init(&thread_lock, NULL); + pthread_cond_init(&thread_parent, NULL); + pthread_cond_init(&thread_worker, NULL); + + for (j = 0; j < bench_repeat && !done; j++) { + unsigned int nrequeued = 0; + struct timeval start, end, runtime; + + /* create, launch & block all threads */ + block_threads(worker, thread_attr); + + /* make sure all threads are already blocked */ + pthread_mutex_lock(&thread_lock); + while (threads_starting) + pthread_cond_wait(&thread_parent, &thread_lock); + pthread_cond_broadcast(&thread_worker); + pthread_mutex_unlock(&thread_lock); + + usleep(100000); + + /* Ok, all threads are patiently blocked, start requeueing */ + gettimeofday(&start, NULL); + while (nrequeued < nthreads) { + /* + * Do not wakeup any tasks blocked on futex1, allowing + * us to really measure futex_wait functionality. + */ + nrequeued += futex_cmp_requeue(&futex1, 0, &futex2, 0, + nrequeue, futex_flag); + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &runtime); + + update_stats(&requeued_stats, nrequeued); + update_stats(&requeuetime_stats, runtime.tv_usec); + + if (!silent) { + printf("[Run %d]: Requeued %d of %d threads in %.4f ms\n", + j + 1, nrequeued, nthreads, runtime.tv_usec/1e3); + } + + /* everybody should be blocked on futex2, wake'em up */ + nrequeued = futex_wake(&futex2, nrequeued, futex_flag); + if (nthreads != nrequeued) + warnx("couldn't wakeup all tasks (%d/%d)", nrequeued, nthreads); + + for (i = 0; i < nthreads; i++) { + ret = pthread_join(worker[i], NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + } + + /* cleanup & report results */ + pthread_cond_destroy(&thread_parent); + pthread_cond_destroy(&thread_worker); + pthread_mutex_destroy(&thread_lock); + pthread_attr_destroy(&thread_attr); + + print_summary(); + + free(worker); + return ret; +err: + usage_with_options(bench_futex_requeue_usage, options); + exit(EXIT_FAILURE); +} diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c new file mode 100644 index 000000000..929f762be --- /dev/null +++ b/tools/perf/bench/futex-wake.c @@ -0,0 +1,198 @@ +/* + * Copyright (C) 2013 Davidlohr Bueso + * + * futex-wake: Block a bunch of threads on a futex and wake'em up, N at a time. + * + * This program is particularly useful to measure the latency of nthread wakeups + * in non-error situations: all waiters are queued and all wake calls wakeup + * one or more tasks, and thus the waitqueue is never empty. + */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/stat.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "bench.h" +#include "futex.h" + +#include +#include +#include +#include + +/* all threads will block on the same futex */ +static u_int32_t futex1 = 0; + +/* + * How many wakeups to do at a time. + * Default to 1 in order to make the kernel work more. + */ +static unsigned int nwakes = 1; + +pthread_t *worker; +static bool done = false, silent = false, fshared = false; +static pthread_mutex_t thread_lock; +static pthread_cond_t thread_parent, thread_worker; +static struct stats waketime_stats, wakeup_stats; +static unsigned int ncpus, threads_starting, nthreads = 0; +static int futex_flag = 0; + +static const struct option options[] = { + OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), + OPT_UINTEGER('w', "nwakes", &nwakes, "Specify amount of threads to wake at once"), + OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_END() +}; + +static const char * const bench_futex_wake_usage[] = { + "perf bench futex wake ", + NULL +}; + +static void *workerfn(void *arg __maybe_unused) +{ + pthread_mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + pthread_cond_signal(&thread_parent); + pthread_cond_wait(&thread_worker, &thread_lock); + pthread_mutex_unlock(&thread_lock); + + futex_wait(&futex1, 0, NULL, futex_flag); + return NULL; +} + +static void print_summary(void) +{ + double waketime_avg = avg_stats(&waketime_stats); + double waketime_stddev = stddev_stats(&waketime_stats); + unsigned int wakeup_avg = avg_stats(&wakeup_stats); + + printf("Wokeup %d of %d threads in %.4f ms (+-%.2f%%)\n", + wakeup_avg, + nthreads, + waketime_avg/1e3, + rel_stddev_stats(waketime_stddev, waketime_avg)); +} + +static void block_threads(pthread_t *w, + pthread_attr_t thread_attr) +{ + cpu_set_t cpu; + unsigned int i; + + threads_starting = nthreads; + + /* create and block all threads */ + for (i = 0; i < nthreads; i++) { + CPU_ZERO(&cpu); + CPU_SET(i % ncpus, &cpu); + + if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu)) + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + + if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) + err(EXIT_FAILURE, "pthread_create"); + } +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + done = true; +} + +int bench_futex_wake(int argc, const char **argv, + const char *prefix __maybe_unused) +{ + int ret = 0; + unsigned int i, j; + struct sigaction act; + pthread_attr_t thread_attr; + + argc = parse_options(argc, argv, options, bench_futex_wake_usage, 0); + if (argc) { + usage_with_options(bench_futex_wake_usage, options); + exit(EXIT_FAILURE); + } + + ncpus = sysconf(_SC_NPROCESSORS_ONLN); + + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + if (!nthreads) + nthreads = ncpus; + + worker = calloc(nthreads, sizeof(*worker)); + if (!worker) + err(EXIT_FAILURE, "calloc"); + + if (!fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + printf("Run summary [PID %d]: blocking on %d threads (at [%s] futex %p), " + "waking up %d at a time.\n\n", + getpid(), nthreads, fshared ? "shared":"private", &futex1, nwakes); + + init_stats(&wakeup_stats); + init_stats(&waketime_stats); + pthread_attr_init(&thread_attr); + pthread_mutex_init(&thread_lock, NULL); + pthread_cond_init(&thread_parent, NULL); + pthread_cond_init(&thread_worker, NULL); + + for (j = 0; j < bench_repeat && !done; j++) { + unsigned int nwoken = 0; + struct timeval start, end, runtime; + + /* create, launch & block all threads */ + block_threads(worker, thread_attr); + + /* make sure all threads are already blocked */ + pthread_mutex_lock(&thread_lock); + while (threads_starting) + pthread_cond_wait(&thread_parent, &thread_lock); + pthread_cond_broadcast(&thread_worker); + pthread_mutex_unlock(&thread_lock); + + usleep(100000); + + /* Ok, all threads are patiently blocked, start waking folks up */ + gettimeofday(&start, NULL); + while (nwoken != nthreads) + nwoken += futex_wake(&futex1, nwakes, futex_flag); + gettimeofday(&end, NULL); + timersub(&end, &start, &runtime); + + update_stats(&wakeup_stats, nwoken); + update_stats(&waketime_stats, runtime.tv_usec); + + if (!silent) { + printf("[Run %d]: Wokeup %d of %d threads in %.4f ms\n", + j + 1, nwoken, nthreads, runtime.tv_usec/1e3); + } + + for (i = 0; i < nthreads; i++) { + ret = pthread_join(worker[i], NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + + } + + /* cleanup & report results */ + pthread_cond_destroy(&thread_parent); + pthread_cond_destroy(&thread_worker); + pthread_mutex_destroy(&thread_lock); + pthread_attr_destroy(&thread_attr); + + print_summary(); + + free(worker); + return ret; +} diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h new file mode 100644 index 000000000..7ed22ff1e --- /dev/null +++ b/tools/perf/bench/futex.h @@ -0,0 +1,84 @@ +/* + * Glibc independent futex library for testing kernel functionality. + * Shamelessly stolen from Darren Hart + * http://git.kernel.org/cgit/linux/kernel/git/dvhart/futextest.git/ + */ + +#ifndef _FUTEX_H +#define _FUTEX_H + +#include +#include +#include +#include + +/** + * futex() - SYS_futex syscall wrapper + * @uaddr: address of first futex + * @op: futex op code + * @val: typically expected value of uaddr, but varies by op + * @timeout: typically an absolute struct timespec (except where noted + * otherwise). Overloaded by some ops + * @uaddr2: address of second futex for some ops\ + * @val3: varies by op + * @opflags: flags to be bitwise OR'd with op, such as FUTEX_PRIVATE_FLAG + * + * futex() is used by all the following futex op wrappers. It can also be + * used for misuse and abuse testing. Generally, the specific op wrappers + * should be used instead. It is a macro instead of an static inline function as + * some of the types over overloaded (timeout is used for nr_requeue for + * example). + * + * These argument descriptions are the defaults for all + * like-named arguments in the following wrappers except where noted below. + */ +#define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \ + syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3) + +/** + * futex_wait() - block on uaddr with optional timeout + * @timeout: relative timeout + */ +static inline int +futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags) +{ + return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); +} + +/** + * futex_wake() - wake one or more tasks blocked on uaddr + * @nr_wake: wake up to this many tasks + */ +static inline int +futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) +{ + return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags); +} + +/** +* futex_cmp_requeue() - requeue tasks from uaddr to uaddr2 +* @nr_wake: wake up to this many tasks +* @nr_requeue: requeue up to this many tasks +*/ +static inline int +futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wake, + int nr_requeue, int opflags) +{ + return futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2, + val, opflags); +} + +#ifndef HAVE_PTHREAD_ATTR_SETAFFINITY_NP +#include +static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr, + size_t cpusetsize, + cpu_set_t *cpuset) +{ + attr = attr; + cpusetsize = cpusetsize; + cpuset = cpuset; + return 0; +} +#endif + +#endif /* _FUTEX_H */ diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h new file mode 100644 index 000000000..57b4ed871 --- /dev/null +++ b/tools/perf/bench/mem-memcpy-arch.h @@ -0,0 +1,12 @@ + +#ifdef HAVE_ARCH_X86_64_SUPPORT + +#define MEMCPY_FN(fn, name, desc) \ + extern void *fn(void *, const void *, size_t); + +#include "mem-memcpy-x86-64-asm-def.h" + +#undef MEMCPY_FN + +#endif + diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h new file mode 100644 index 000000000..8c0c1a277 --- /dev/null +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -0,0 +1,12 @@ + +MEMCPY_FN(memcpy_orig, + "x86-64-unrolled", + "unrolled memcpy() in arch/x86/lib/memcpy_64.S") + +MEMCPY_FN(__memcpy, + "x86-64-movsq", + "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") + +MEMCPY_FN(memcpy_erms, + "x86-64-movsb", + "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S new file mode 100644 index 000000000..e4c2c3014 --- /dev/null +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -0,0 +1,10 @@ +#define memcpy MEMCPY /* don't hide glibc's memcpy() */ +#define altinstr_replacement text +#define globl p2align 4; .globl +#include "../../../arch/x86/lib/memcpy_64.S" +/* + * We need to provide note.GNU-stack section, saying that we want + * NOT executable stack. Otherwise the final linking will assume that + * the ELF stack should not be restricted at all and set it RWX. + */ +.section .note.GNU-stack,"",@progbits diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c new file mode 100644 index 000000000..d3dfb7936 --- /dev/null +++ b/tools/perf/bench/mem-memcpy.c @@ -0,0 +1,434 @@ +/* + * mem-memcpy.c + * + * memcpy: Simple memory copy in various ways + * + * Written by Hitoshi Mitake + */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "../util/cloexec.h" +#include "bench.h" +#include "mem-memcpy-arch.h" +#include "mem-memset-arch.h" + +#include +#include +#include +#include +#include + +#define K 1024 + +static const char *length_str = "1MB"; +static const char *routine = "default"; +static int iterations = 1; +static bool use_cycle; +static int cycle_fd; +static bool only_prefault; +static bool no_prefault; + +static const struct option options[] = { + OPT_STRING('l', "length", &length_str, "1MB", + "Specify length of memory to copy. " + "Available units: B, KB, MB, GB and TB (upper and lower)"), + OPT_STRING('r', "routine", &routine, "default", + "Specify routine to copy, \"all\" runs all available routines"), + OPT_INTEGER('i', "iterations", &iterations, + "repeat memcpy() invocation this number of times"), + OPT_BOOLEAN('c', "cycle", &use_cycle, + "Use cycles event instead of gettimeofday() for measuring"), + OPT_BOOLEAN('o', "only-prefault", &only_prefault, + "Show only the result with page faults before memcpy()"), + OPT_BOOLEAN('n', "no-prefault", &no_prefault, + "Show only the result without page faults before memcpy()"), + OPT_END() +}; + +typedef void *(*memcpy_t)(void *, const void *, size_t); +typedef void *(*memset_t)(void *, int, size_t); + +struct routine { + const char *name; + const char *desc; + union { + memcpy_t memcpy; + memset_t memset; + } fn; +}; + +struct routine memcpy_routines[] = { + { .name = "default", + .desc = "Default memcpy() provided by glibc", + .fn.memcpy = memcpy }, +#ifdef HAVE_ARCH_X86_64_SUPPORT + +#define MEMCPY_FN(_fn, _name, _desc) {.name = _name, .desc = _desc, .fn.memcpy = _fn}, +#include "mem-memcpy-x86-64-asm-def.h" +#undef MEMCPY_FN + +#endif + + { NULL, + NULL, + {NULL} } +}; + +static const char * const bench_mem_memcpy_usage[] = { + "perf bench mem memcpy ", + NULL +}; + +static struct perf_event_attr cycle_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES +}; + +static void init_cycle(void) +{ + cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, + perf_event_open_cloexec_flag()); + + if (cycle_fd < 0 && errno == ENOSYS) + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); + else + BUG_ON(cycle_fd < 0); +} + +static u64 get_cycle(void) +{ + int ret; + u64 clk; + + ret = read(cycle_fd, &clk, sizeof(u64)); + BUG_ON(ret != sizeof(u64)); + + return clk; +} + +static double timeval2double(struct timeval *ts) +{ + return (double)ts->tv_sec + + (double)ts->tv_usec / (double)1000000; +} + +#define pf (no_prefault ? 0 : 1) + +#define print_bps(x) do { \ + if (x < K) \ + printf(" %14lf B/Sec", x); \ + else if (x < K * K) \ + printf(" %14lfd KB/Sec", x / K); \ + else if (x < K * K * K) \ + printf(" %14lf MB/Sec", x / K / K); \ + else \ + printf(" %14lf GB/Sec", x / K / K / K); \ + } while (0) + +struct bench_mem_info { + const struct routine *routines; + u64 (*do_cycle)(const struct routine *r, size_t len, bool prefault); + double (*do_gettimeofday)(const struct routine *r, size_t len, bool prefault); + const char *const *usage; +}; + +static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen) +{ + const struct routine *r = &info->routines[r_idx]; + double result_bps[2]; + u64 result_cycle[2]; + + result_cycle[0] = result_cycle[1] = 0ULL; + result_bps[0] = result_bps[1] = 0.0; + + printf("Routine %s (%s)\n", r->name, r->desc); + + if (bench_format == BENCH_FORMAT_DEFAULT) + printf("# Copying %s Bytes ...\n\n", length_str); + + if (!only_prefault && !no_prefault) { + /* show both of results */ + if (use_cycle) { + result_cycle[0] = info->do_cycle(r, len, false); + result_cycle[1] = info->do_cycle(r, len, true); + } else { + result_bps[0] = info->do_gettimeofday(r, len, false); + result_bps[1] = info->do_gettimeofday(r, len, true); + } + } else { + if (use_cycle) + result_cycle[pf] = info->do_cycle(r, len, only_prefault); + else + result_bps[pf] = info->do_gettimeofday(r, len, only_prefault); + } + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + if (!only_prefault && !no_prefault) { + if (use_cycle) { + printf(" %14lf Cycle/Byte\n", + (double)result_cycle[0] + / totallen); + printf(" %14lf Cycle/Byte (with prefault)\n", + (double)result_cycle[1] + / totallen); + } else { + print_bps(result_bps[0]); + printf("\n"); + print_bps(result_bps[1]); + printf(" (with prefault)\n"); + } + } else { + if (use_cycle) { + printf(" %14lf Cycle/Byte", + (double)result_cycle[pf] + / totallen); + } else + print_bps(result_bps[pf]); + + printf("%s\n", only_prefault ? " (with prefault)" : ""); + } + break; + case BENCH_FORMAT_SIMPLE: + if (!only_prefault && !no_prefault) { + if (use_cycle) { + printf("%lf %lf\n", + (double)result_cycle[0] / totallen, + (double)result_cycle[1] / totallen); + } else { + printf("%lf %lf\n", + result_bps[0], result_bps[1]); + } + } else { + if (use_cycle) { + printf("%lf\n", (double)result_cycle[pf] + / totallen); + } else + printf("%lf\n", result_bps[pf]); + } + break; + default: + /* reaching this means there's some disaster: */ + die("unknown format: %d\n", bench_format); + break; + } +} + +static int bench_mem_common(int argc, const char **argv, + const char *prefix __maybe_unused, + struct bench_mem_info *info) +{ + int i; + size_t len; + double totallen; + + argc = parse_options(argc, argv, options, + info->usage, 0); + + if (no_prefault && only_prefault) { + fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); + return 1; + } + + if (use_cycle) + init_cycle(); + + len = (size_t)perf_atoll((char *)length_str); + totallen = (double)len * iterations; + + if ((s64)len <= 0) { + fprintf(stderr, "Invalid length:%s\n", length_str); + return 1; + } + + /* same to without specifying either of prefault and no-prefault */ + if (only_prefault && no_prefault) + only_prefault = no_prefault = false; + + if (!strncmp(routine, "all", 3)) { + for (i = 0; info->routines[i].name; i++) + __bench_mem_routine(info, i, len, totallen); + return 0; + } + + for (i = 0; info->routines[i].name; i++) { + if (!strcmp(info->routines[i].name, routine)) + break; + } + if (!info->routines[i].name) { + printf("Unknown routine:%s\n", routine); + printf("Available routines...\n"); + for (i = 0; info->routines[i].name; i++) { + printf("\t%s ... %s\n", + info->routines[i].name, info->routines[i].desc); + } + return 1; + } + + __bench_mem_routine(info, i, len, totallen); + + return 0; +} + +static void memcpy_alloc_mem(void **dst, void **src, size_t length) +{ + *dst = zalloc(length); + if (!*dst) + die("memory allocation failed - maybe length is too large?\n"); + + *src = zalloc(length); + if (!*src) + die("memory allocation failed - maybe length is too large?\n"); + /* Make sure to always replace the zero pages even if MMAP_THRESH is crossed */ + memset(*src, 0, length); +} + +static u64 do_memcpy_cycle(const struct routine *r, size_t len, bool prefault) +{ + u64 cycle_start = 0ULL, cycle_end = 0ULL; + void *src = NULL, *dst = NULL; + memcpy_t fn = r->fn.memcpy; + int i; + + memcpy_alloc_mem(&dst, &src, len); + + if (prefault) + fn(dst, src, len); + + cycle_start = get_cycle(); + for (i = 0; i < iterations; ++i) + fn(dst, src, len); + cycle_end = get_cycle(); + + free(src); + free(dst); + return cycle_end - cycle_start; +} + +static double do_memcpy_gettimeofday(const struct routine *r, size_t len, + bool prefault) +{ + struct timeval tv_start, tv_end, tv_diff; + memcpy_t fn = r->fn.memcpy; + void *src = NULL, *dst = NULL; + int i; + + memcpy_alloc_mem(&dst, &src, len); + + if (prefault) + fn(dst, src, len); + + BUG_ON(gettimeofday(&tv_start, NULL)); + for (i = 0; i < iterations; ++i) + fn(dst, src, len); + BUG_ON(gettimeofday(&tv_end, NULL)); + + timersub(&tv_end, &tv_start, &tv_diff); + + free(src); + free(dst); + return (double)(((double)len * iterations) / timeval2double(&tv_diff)); +} + +int bench_mem_memcpy(int argc, const char **argv, + const char *prefix __maybe_unused) +{ + struct bench_mem_info info = { + .routines = memcpy_routines, + .do_cycle = do_memcpy_cycle, + .do_gettimeofday = do_memcpy_gettimeofday, + .usage = bench_mem_memcpy_usage, + }; + + return bench_mem_common(argc, argv, prefix, &info); +} + +static void memset_alloc_mem(void **dst, size_t length) +{ + *dst = zalloc(length); + if (!*dst) + die("memory allocation failed - maybe length is too large?\n"); +} + +static u64 do_memset_cycle(const struct routine *r, size_t len, bool prefault) +{ + u64 cycle_start = 0ULL, cycle_end = 0ULL; + memset_t fn = r->fn.memset; + void *dst = NULL; + int i; + + memset_alloc_mem(&dst, len); + + if (prefault) + fn(dst, -1, len); + + cycle_start = get_cycle(); + for (i = 0; i < iterations; ++i) + fn(dst, i, len); + cycle_end = get_cycle(); + + free(dst); + return cycle_end - cycle_start; +} + +static double do_memset_gettimeofday(const struct routine *r, size_t len, + bool prefault) +{ + struct timeval tv_start, tv_end, tv_diff; + memset_t fn = r->fn.memset; + void *dst = NULL; + int i; + + memset_alloc_mem(&dst, len); + + if (prefault) + fn(dst, -1, len); + + BUG_ON(gettimeofday(&tv_start, NULL)); + for (i = 0; i < iterations; ++i) + fn(dst, i, len); + BUG_ON(gettimeofday(&tv_end, NULL)); + + timersub(&tv_end, &tv_start, &tv_diff); + + free(dst); + return (double)(((double)len * iterations) / timeval2double(&tv_diff)); +} + +static const char * const bench_mem_memset_usage[] = { + "perf bench mem memset ", + NULL +}; + +static const struct routine memset_routines[] = { + { .name ="default", + .desc = "Default memset() provided by glibc", + .fn.memset = memset }, +#ifdef HAVE_ARCH_X86_64_SUPPORT + +#define MEMSET_FN(_fn, _name, _desc) { .name = _name, .desc = _desc, .fn.memset = _fn }, +#include "mem-memset-x86-64-asm-def.h" +#undef MEMSET_FN + +#endif + + { .name = NULL, + .desc = NULL, + .fn.memset = NULL } +}; + +int bench_mem_memset(int argc, const char **argv, + const char *prefix __maybe_unused) +{ + struct bench_mem_info info = { + .routines = memset_routines, + .do_cycle = do_memset_cycle, + .do_gettimeofday = do_memset_gettimeofday, + .usage = bench_mem_memset_usage, + }; + + return bench_mem_common(argc, argv, prefix, &info); +} diff --git a/tools/perf/bench/mem-memset-arch.h b/tools/perf/bench/mem-memset-arch.h new file mode 100644 index 000000000..633800cb0 --- /dev/null +++ b/tools/perf/bench/mem-memset-arch.h @@ -0,0 +1,12 @@ + +#ifdef HAVE_ARCH_X86_64_SUPPORT + +#define MEMSET_FN(fn, name, desc) \ + extern void *fn(void *, int, size_t); + +#include "mem-memset-x86-64-asm-def.h" + +#undef MEMSET_FN + +#endif + diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h new file mode 100644 index 000000000..f02d02877 --- /dev/null +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h @@ -0,0 +1,12 @@ + +MEMSET_FN(memset_orig, + "x86-64-unrolled", + "unrolled memset() in arch/x86/lib/memset_64.S") + +MEMSET_FN(__memset, + "x86-64-stosq", + "movsq-based memset() in arch/x86/lib/memset_64.S") + +MEMSET_FN(memset_erms, + "x86-64-stosb", + "movsb-based memset() in arch/x86/lib/memset_64.S") diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S new file mode 100644 index 000000000..de278784c --- /dev/null +++ b/tools/perf/bench/mem-memset-x86-64-asm.S @@ -0,0 +1,11 @@ +#define memset MEMSET /* don't hide glibc's memset() */ +#define altinstr_replacement text +#define globl p2align 4; .globl +#include "../../../arch/x86/lib/memset_64.S" + +/* + * We need to provide note.GNU-stack section, saying that we want + * NOT executable stack. Otherwise the final linking will assume that + * the ELF stack should not be restricted at all and set it RWX. + */ +.section .note.GNU-stack,"",@progbits diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c new file mode 100644 index 000000000..ba5efa471 --- /dev/null +++ b/tools/perf/bench/numa.c @@ -0,0 +1,1752 @@ +/* + * numa.c + * + * numa: Simulate NUMA-sensitive workload and measure their NUMA performance + */ + +#include "../perf.h" +#include "../builtin.h" +#include "../util/util.h" +#include "../util/parse-options.h" + +#include "bench.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Regular printout to the terminal, supressed if -q is specified: + */ +#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0) + +/* + * Debug printf: + */ +#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0) + +struct thread_data { + int curr_cpu; + cpu_set_t bind_cpumask; + int bind_node; + u8 *process_data; + int process_nr; + int thread_nr; + int task_nr; + unsigned int loops_done; + u64 val; + u64 runtime_ns; + pthread_mutex_t *process_lock; +}; + +/* Parameters set by options: */ + +struct params { + /* Startup synchronization: */ + bool serialize_startup; + + /* Task hierarchy: */ + int nr_proc; + int nr_threads; + + /* Working set sizes: */ + const char *mb_global_str; + const char *mb_proc_str; + const char *mb_proc_locked_str; + const char *mb_thread_str; + + double mb_global; + double mb_proc; + double mb_proc_locked; + double mb_thread; + + /* Access patterns to the working set: */ + bool data_reads; + bool data_writes; + bool data_backwards; + bool data_zero_memset; + bool data_rand_walk; + u32 nr_loops; + u32 nr_secs; + u32 sleep_usecs; + + /* Working set initialization: */ + bool init_zero; + bool init_random; + bool init_cpu0; + + /* Misc options: */ + int show_details; + int run_all; + int thp; + + long bytes_global; + long bytes_process; + long bytes_process_locked; + long bytes_thread; + + int nr_tasks; + bool show_quiet; + + bool show_convergence; + bool measure_convergence; + + int perturb_secs; + int nr_cpus; + int nr_nodes; + + /* Affinity options -C and -N: */ + char *cpu_list_str; + char *node_list_str; +}; + + +/* Global, read-writable area, accessible to all processes and threads: */ + +struct global_info { + u8 *data; + + pthread_mutex_t startup_mutex; + int nr_tasks_started; + + pthread_mutex_t startup_done_mutex; + + pthread_mutex_t start_work_mutex; + int nr_tasks_working; + + pthread_mutex_t stop_work_mutex; + u64 bytes_done; + + struct thread_data *threads; + + /* Convergence latency measurement: */ + bool all_converged; + bool stop_work; + + int print_once; + + struct params p; +}; + +static struct global_info *g = NULL; + +static int parse_cpus_opt(const struct option *opt, const char *arg, int unset); +static int parse_nodes_opt(const struct option *opt, const char *arg, int unset); + +struct params p0; + +static const struct option options[] = { + OPT_INTEGER('p', "nr_proc" , &p0.nr_proc, "number of processes"), + OPT_INTEGER('t', "nr_threads" , &p0.nr_threads, "number of threads per process"), + + OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"), + OPT_STRING('P', "mb_proc" , &p0.mb_proc_str, "MB", "process memory (MBs)"), + OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"), + OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"), + + OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run"), + OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run"), + OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"), + + OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"), + OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"), + OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"), + OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"), + OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"), + + + OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"), + OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"), + OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"), + OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"), + + OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"), + OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"), + OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"), + OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"), + OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"), + OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"), + OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"), + + /* Special option string parsing callbacks: */ + OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]", + "bind the first N tasks to these specific cpus (the rest is unbound)", + parse_cpus_opt), + OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]", + "bind the first N tasks to these specific memory nodes (the rest is unbound)", + parse_nodes_opt), + OPT_END() +}; + +static const char * const bench_numa_usage[] = { + "perf bench numa ", + NULL +}; + +static const char * const numa_usage[] = { + "perf bench numa mem []", + NULL +}; + +static cpu_set_t bind_to_cpu(int target_cpu) +{ + cpu_set_t orig_mask, mask; + int ret; + + ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); + BUG_ON(ret); + + CPU_ZERO(&mask); + + if (target_cpu == -1) { + int cpu; + + for (cpu = 0; cpu < g->p.nr_cpus; cpu++) + CPU_SET(cpu, &mask); + } else { + BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus); + CPU_SET(target_cpu, &mask); + } + + ret = sched_setaffinity(0, sizeof(mask), &mask); + BUG_ON(ret); + + return orig_mask; +} + +static cpu_set_t bind_to_node(int target_node) +{ + int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes; + cpu_set_t orig_mask, mask; + int cpu; + int ret; + + BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus); + BUG_ON(!cpus_per_node); + + ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); + BUG_ON(ret); + + CPU_ZERO(&mask); + + if (target_node == -1) { + for (cpu = 0; cpu < g->p.nr_cpus; cpu++) + CPU_SET(cpu, &mask); + } else { + int cpu_start = (target_node + 0) * cpus_per_node; + int cpu_stop = (target_node + 1) * cpus_per_node; + + BUG_ON(cpu_stop > g->p.nr_cpus); + + for (cpu = cpu_start; cpu < cpu_stop; cpu++) + CPU_SET(cpu, &mask); + } + + ret = sched_setaffinity(0, sizeof(mask), &mask); + BUG_ON(ret); + + return orig_mask; +} + +static void bind_to_cpumask(cpu_set_t mask) +{ + int ret; + + ret = sched_setaffinity(0, sizeof(mask), &mask); + BUG_ON(ret); +} + +static void mempol_restore(void) +{ + int ret; + + ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1); + + BUG_ON(ret); +} + +static void bind_to_memnode(int node) +{ + unsigned long nodemask; + int ret; + + if (node == -1) + return; + + BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)); + nodemask = 1L << node; + + ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); + dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret); + + BUG_ON(ret); +} + +#define HPSIZE (2*1024*1024) + +#define set_taskname(fmt...) \ +do { \ + char name[20]; \ + \ + snprintf(name, 20, fmt); \ + prctl(PR_SET_NAME, name); \ +} while (0) + +static u8 *alloc_data(ssize_t bytes0, int map_flags, + int init_zero, int init_cpu0, int thp, int init_random) +{ + cpu_set_t orig_mask; + ssize_t bytes; + u8 *buf; + int ret; + + if (!bytes0) + return NULL; + + /* Allocate and initialize all memory on CPU#0: */ + if (init_cpu0) { + orig_mask = bind_to_node(0); + bind_to_memnode(0); + } + + bytes = bytes0 + HPSIZE; + + buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0); + BUG_ON(buf == (void *)-1); + + if (map_flags == MAP_PRIVATE) { + if (thp > 0) { + ret = madvise(buf, bytes, MADV_HUGEPAGE); + if (ret && !g->print_once) { + g->print_once = 1; + printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n"); + } + } + if (thp < 0) { + ret = madvise(buf, bytes, MADV_NOHUGEPAGE); + if (ret && !g->print_once) { + g->print_once = 1; + printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n"); + } + } + } + + if (init_zero) { + bzero(buf, bytes); + } else { + /* Initialize random contents, different in each word: */ + if (init_random) { + u64 *wbuf = (void *)buf; + long off = rand(); + long i; + + for (i = 0; i < bytes/8; i++) + wbuf[i] = i + off; + } + } + + /* Align to 2MB boundary: */ + buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1)); + + /* Restore affinity: */ + if (init_cpu0) { + bind_to_cpumask(orig_mask); + mempol_restore(); + } + + return buf; +} + +static void free_data(void *data, ssize_t bytes) +{ + int ret; + + if (!data) + return; + + ret = munmap(data, bytes); + BUG_ON(ret); +} + +/* + * Create a shared memory buffer that can be shared between processes, zeroed: + */ +static void * zalloc_shared_data(ssize_t bytes) +{ + return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random); +} + +/* + * Create a shared memory buffer that can be shared between processes: + */ +static void * setup_shared_data(ssize_t bytes) +{ + return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); +} + +/* + * Allocate process-local memory - this will either be shared between + * threads of this process, or only be accessed by this thread: + */ +static void * setup_private_data(ssize_t bytes) +{ + return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); +} + +/* + * Return a process-shared (global) mutex: + */ +static void init_global_mutex(pthread_mutex_t *mutex) +{ + pthread_mutexattr_t attr; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(mutex, &attr); +} + +static int parse_cpu_list(const char *arg) +{ + p0.cpu_list_str = strdup(arg); + + dprintf("got CPU list: {%s}\n", p0.cpu_list_str); + + return 0; +} + +static int parse_setup_cpu_list(void) +{ + struct thread_data *td; + char *str0, *str; + int t; + + if (!g->p.cpu_list_str) + return 0; + + dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); + + str0 = str = strdup(g->p.cpu_list_str); + t = 0; + + BUG_ON(!str); + + tprintf("# binding tasks to CPUs:\n"); + tprintf("# "); + + while (true) { + int bind_cpu, bind_cpu_0, bind_cpu_1; + char *tok, *tok_end, *tok_step, *tok_len, *tok_mul; + int bind_len; + int step; + int mul; + + tok = strsep(&str, ","); + if (!tok) + break; + + tok_end = strstr(tok, "-"); + + dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end); + if (!tok_end) { + /* Single CPU specified: */ + bind_cpu_0 = bind_cpu_1 = atol(tok); + } else { + /* CPU range specified (for example: "5-11"): */ + bind_cpu_0 = atol(tok); + bind_cpu_1 = atol(tok_end + 1); + } + + step = 1; + tok_step = strstr(tok, "#"); + if (tok_step) { + step = atol(tok_step + 1); + BUG_ON(step <= 0 || step >= g->p.nr_cpus); + } + + /* + * Mask length. + * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4', + * where the _4 means the next 4 CPUs are allowed. + */ + bind_len = 1; + tok_len = strstr(tok, "_"); + if (tok_len) { + bind_len = atol(tok_len + 1); + BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus); + } + + /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ + mul = 1; + tok_mul = strstr(tok, "x"); + if (tok_mul) { + mul = atol(tok_mul + 1); + BUG_ON(mul <= 0); + } + + dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul); + + if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) { + printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus); + return -1; + } + + BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0); + BUG_ON(bind_cpu_0 > bind_cpu_1); + + for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) { + int i; + + for (i = 0; i < mul; i++) { + int cpu; + + if (t >= g->p.nr_tasks) { + printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu); + goto out; + } + td = g->threads + t; + + if (t) + tprintf(","); + if (bind_len > 1) { + tprintf("%2d/%d", bind_cpu, bind_len); + } else { + tprintf("%2d", bind_cpu); + } + + CPU_ZERO(&td->bind_cpumask); + for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) { + BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus); + CPU_SET(cpu, &td->bind_cpumask); + } + t++; + } + } + } +out: + + tprintf("\n"); + + if (t < g->p.nr_tasks) + printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t); + + free(str0); + return 0; +} + +static int parse_cpus_opt(const struct option *opt __maybe_unused, + const char *arg, int unset __maybe_unused) +{ + if (!arg) + return -1; + + return parse_cpu_list(arg); +} + +static int parse_node_list(const char *arg) +{ + p0.node_list_str = strdup(arg); + + dprintf("got NODE list: {%s}\n", p0.node_list_str); + + return 0; +} + +static int parse_setup_node_list(void) +{ + struct thread_data *td; + char *str0, *str; + int t; + + if (!g->p.node_list_str) + return 0; + + dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); + + str0 = str = strdup(g->p.node_list_str); + t = 0; + + BUG_ON(!str); + + tprintf("# binding tasks to NODEs:\n"); + tprintf("# "); + + while (true) { + int bind_node, bind_node_0, bind_node_1; + char *tok, *tok_end, *tok_step, *tok_mul; + int step; + int mul; + + tok = strsep(&str, ","); + if (!tok) + break; + + tok_end = strstr(tok, "-"); + + dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end); + if (!tok_end) { + /* Single NODE specified: */ + bind_node_0 = bind_node_1 = atol(tok); + } else { + /* NODE range specified (for example: "5-11"): */ + bind_node_0 = atol(tok); + bind_node_1 = atol(tok_end + 1); + } + + step = 1; + tok_step = strstr(tok, "#"); + if (tok_step) { + step = atol(tok_step + 1); + BUG_ON(step <= 0 || step >= g->p.nr_nodes); + } + + /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ + mul = 1; + tok_mul = strstr(tok, "x"); + if (tok_mul) { + mul = atol(tok_mul + 1); + BUG_ON(mul <= 0); + } + + dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step); + + if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) { + printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes); + return -1; + } + + BUG_ON(bind_node_0 < 0 || bind_node_1 < 0); + BUG_ON(bind_node_0 > bind_node_1); + + for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) { + int i; + + for (i = 0; i < mul; i++) { + if (t >= g->p.nr_tasks) { + printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node); + goto out; + } + td = g->threads + t; + + if (!t) + tprintf(" %2d", bind_node); + else + tprintf(",%2d", bind_node); + + td->bind_node = bind_node; + t++; + } + } + } +out: + + tprintf("\n"); + + if (t < g->p.nr_tasks) + printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t); + + free(str0); + return 0; +} + +static int parse_nodes_opt(const struct option *opt __maybe_unused, + const char *arg, int unset __maybe_unused) +{ + if (!arg) + return -1; + + return parse_node_list(arg); + + return 0; +} + +#define BIT(x) (1ul << x) + +static inline uint32_t lfsr_32(uint32_t lfsr) +{ + const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31); + return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps); +} + +/* + * Make sure there's real data dependency to RAM (when read + * accesses are enabled), so the compiler, the CPU and the + * kernel (KSM, zero page, etc.) cannot optimize away RAM + * accesses: + */ +static inline u64 access_data(u64 *data __attribute__((unused)), u64 val) +{ + if (g->p.data_reads) + val += *data; + if (g->p.data_writes) + *data = val + 1; + return val; +} + +/* + * The worker process does two types of work, a forwards going + * loop and a backwards going loop. + * + * We do this so that on multiprocessor systems we do not create + * a 'train' of processing, with highly synchronized processes, + * skewing the whole benchmark. + */ +static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val) +{ + long words = bytes/sizeof(u64); + u64 *data = (void *)__data; + long chunk_0, chunk_1; + u64 *d0, *d, *d1; + long off; + long i; + + BUG_ON(!data && words); + BUG_ON(data && !words); + + if (!data) + return val; + + /* Very simple memset() work variant: */ + if (g->p.data_zero_memset && !g->p.data_rand_walk) { + bzero(data, bytes); + return val; + } + + /* Spread out by PID/TID nr and by loop nr: */ + chunk_0 = words/nr_max; + chunk_1 = words/g->p.nr_loops; + off = nr*chunk_0 + loop*chunk_1; + + while (off >= words) + off -= words; + + if (g->p.data_rand_walk) { + u32 lfsr = nr + loop + val; + int j; + + for (i = 0; i < words/1024; i++) { + long start, end; + + lfsr = lfsr_32(lfsr); + + start = lfsr % words; + end = min(start + 1024, words-1); + + if (g->p.data_zero_memset) { + bzero(data + start, (end-start) * sizeof(u64)); + } else { + for (j = start; j < end; j++) + val = access_data(data + j, val); + } + } + } else if (!g->p.data_backwards || (nr + loop) & 1) { + + d0 = data + off; + d = data + off + 1; + d1 = data + words; + + /* Process data forwards: */ + for (;;) { + if (unlikely(d >= d1)) + d = data; + if (unlikely(d == d0)) + break; + + val = access_data(d, val); + + d++; + } + } else { + /* Process data backwards: */ + + d0 = data + off; + d = data + off - 1; + d1 = data + words; + + /* Process data forwards: */ + for (;;) { + if (unlikely(d < data)) + d = data + words-1; + if (unlikely(d == d0)) + break; + + val = access_data(d, val); + + d--; + } + } + + return val; +} + +static void update_curr_cpu(int task_nr, unsigned long bytes_worked) +{ + unsigned int cpu; + + cpu = sched_getcpu(); + + g->threads[task_nr].curr_cpu = cpu; + prctl(0, bytes_worked); +} + +#define MAX_NR_NODES 64 + +/* + * Count the number of nodes a process's threads + * are spread out on. + * + * A count of 1 means that the process is compressed + * to a single node. A count of g->p.nr_nodes means it's + * spread out on the whole system. + */ +static int count_process_nodes(int process_nr) +{ + char node_present[MAX_NR_NODES] = { 0, }; + int nodes; + int n, t; + + for (t = 0; t < g->p.nr_threads; t++) { + struct thread_data *td; + int task_nr; + int node; + + task_nr = process_nr*g->p.nr_threads + t; + td = g->threads + task_nr; + + node = numa_node_of_cpu(td->curr_cpu); + if (node < 0) /* curr_cpu was likely still -1 */ + return 0; + + node_present[node] = 1; + } + + nodes = 0; + + for (n = 0; n < MAX_NR_NODES; n++) + nodes += node_present[n]; + + return nodes; +} + +/* + * Count the number of distinct process-threads a node contains. + * + * A count of 1 means that the node contains only a single + * process. If all nodes on the system contain at most one + * process then we are well-converged. + */ +static int count_node_processes(int node) +{ + int processes = 0; + int t, p; + + for (p = 0; p < g->p.nr_proc; p++) { + for (t = 0; t < g->p.nr_threads; t++) { + struct thread_data *td; + int task_nr; + int n; + + task_nr = p*g->p.nr_threads + t; + td = g->threads + task_nr; + + n = numa_node_of_cpu(td->curr_cpu); + if (n == node) { + processes++; + break; + } + } + } + + return processes; +} + +static void calc_convergence_compression(int *strong) +{ + unsigned int nodes_min, nodes_max; + int p; + + nodes_min = -1; + nodes_max = 0; + + for (p = 0; p < g->p.nr_proc; p++) { + unsigned int nodes = count_process_nodes(p); + + if (!nodes) { + *strong = 0; + return; + } + + nodes_min = min(nodes, nodes_min); + nodes_max = max(nodes, nodes_max); + } + + /* Strong convergence: all threads compress on a single node: */ + if (nodes_min == 1 && nodes_max == 1) { + *strong = 1; + } else { + *strong = 0; + tprintf(" {%d-%d}", nodes_min, nodes_max); + } +} + +static void calc_convergence(double runtime_ns_max, double *convergence) +{ + unsigned int loops_done_min, loops_done_max; + int process_groups; + int nodes[MAX_NR_NODES]; + int distance; + int nr_min; + int nr_max; + int strong; + int sum; + int nr; + int node; + int cpu; + int t; + + if (!g->p.show_convergence && !g->p.measure_convergence) + return; + + for (node = 0; node < g->p.nr_nodes; node++) + nodes[node] = 0; + + loops_done_min = -1; + loops_done_max = 0; + + for (t = 0; t < g->p.nr_tasks; t++) { + struct thread_data *td = g->threads + t; + unsigned int loops_done; + + cpu = td->curr_cpu; + + /* Not all threads have written it yet: */ + if (cpu < 0) + continue; + + node = numa_node_of_cpu(cpu); + + nodes[node]++; + + loops_done = td->loops_done; + loops_done_min = min(loops_done, loops_done_min); + loops_done_max = max(loops_done, loops_done_max); + } + + nr_max = 0; + nr_min = g->p.nr_tasks; + sum = 0; + + for (node = 0; node < g->p.nr_nodes; node++) { + nr = nodes[node]; + nr_min = min(nr, nr_min); + nr_max = max(nr, nr_max); + sum += nr; + } + BUG_ON(nr_min > nr_max); + + BUG_ON(sum > g->p.nr_tasks); + + if (0 && (sum < g->p.nr_tasks)) + return; + + /* + * Count the number of distinct process groups present + * on nodes - when we are converged this will decrease + * to g->p.nr_proc: + */ + process_groups = 0; + + for (node = 0; node < g->p.nr_nodes; node++) { + int processes = count_node_processes(node); + + nr = nodes[node]; + tprintf(" %2d/%-2d", nr, processes); + + process_groups += processes; + } + + distance = nr_max - nr_min; + + tprintf(" [%2d/%-2d]", distance, process_groups); + + tprintf(" l:%3d-%-3d (%3d)", + loops_done_min, loops_done_max, loops_done_max-loops_done_min); + + if (loops_done_min && loops_done_max) { + double skew = 1.0 - (double)loops_done_min/loops_done_max; + + tprintf(" [%4.1f%%]", skew * 100.0); + } + + calc_convergence_compression(&strong); + + if (strong && process_groups == g->p.nr_proc) { + if (!*convergence) { + *convergence = runtime_ns_max; + tprintf(" (%6.1fs converged)\n", *convergence/1e9); + if (g->p.measure_convergence) { + g->all_converged = true; + g->stop_work = true; + } + } + } else { + if (*convergence) { + tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9); + *convergence = 0; + } + tprintf("\n"); + } +} + +static void show_summary(double runtime_ns_max, int l, double *convergence) +{ + tprintf("\r # %5.1f%% [%.1f mins]", + (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0); + + calc_convergence(runtime_ns_max, convergence); + + if (g->p.show_details >= 0) + fflush(stdout); +} + +static void *worker_thread(void *__tdata) +{ + struct thread_data *td = __tdata; + struct timeval start0, start, stop, diff; + int process_nr = td->process_nr; + int thread_nr = td->thread_nr; + unsigned long last_perturbance; + int task_nr = td->task_nr; + int details = g->p.show_details; + int first_task, last_task; + double convergence = 0; + u64 val = td->val; + double runtime_ns_max; + u8 *global_data; + u8 *process_data; + u8 *thread_data; + u64 bytes_done; + long work_done; + u32 l; + + bind_to_cpumask(td->bind_cpumask); + bind_to_memnode(td->bind_node); + + set_taskname("thread %d/%d", process_nr, thread_nr); + + global_data = g->data; + process_data = td->process_data; + thread_data = setup_private_data(g->p.bytes_thread); + + bytes_done = 0; + + last_task = 0; + if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1) + last_task = 1; + + first_task = 0; + if (process_nr == 0 && thread_nr == 0) + first_task = 1; + + if (details >= 2) { + printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n", + process_nr, thread_nr, global_data, process_data, thread_data); + } + + if (g->p.serialize_startup) { + pthread_mutex_lock(&g->startup_mutex); + g->nr_tasks_started++; + pthread_mutex_unlock(&g->startup_mutex); + + /* Here we will wait for the main process to start us all at once: */ + pthread_mutex_lock(&g->start_work_mutex); + g->nr_tasks_working++; + + /* Last one wake the main process: */ + if (g->nr_tasks_working == g->p.nr_tasks) + pthread_mutex_unlock(&g->startup_done_mutex); + + pthread_mutex_unlock(&g->start_work_mutex); + } + + gettimeofday(&start0, NULL); + + start = stop = start0; + last_perturbance = start.tv_sec; + + for (l = 0; l < g->p.nr_loops; l++) { + start = stop; + + if (g->stop_work) + break; + + val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val); + val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val); + val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val); + + if (g->p.sleep_usecs) { + pthread_mutex_lock(td->process_lock); + usleep(g->p.sleep_usecs); + pthread_mutex_unlock(td->process_lock); + } + /* + * Amount of work to be done under a process-global lock: + */ + if (g->p.bytes_process_locked) { + pthread_mutex_lock(td->process_lock); + val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val); + pthread_mutex_unlock(td->process_lock); + } + + work_done = g->p.bytes_global + g->p.bytes_process + + g->p.bytes_process_locked + g->p.bytes_thread; + + update_curr_cpu(task_nr, work_done); + bytes_done += work_done; + + if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs) + continue; + + td->loops_done = l; + + gettimeofday(&stop, NULL); + + /* Check whether our max runtime timed out: */ + if (g->p.nr_secs) { + timersub(&stop, &start0, &diff); + if ((u32)diff.tv_sec >= g->p.nr_secs) { + g->stop_work = true; + break; + } + } + + /* Update the summary at most once per second: */ + if (start.tv_sec == stop.tv_sec) + continue; + + /* + * Perturb the first task's equilibrium every g->p.perturb_secs seconds, + * by migrating to CPU#0: + */ + if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) { + cpu_set_t orig_mask; + int target_cpu; + int this_cpu; + + last_perturbance = stop.tv_sec; + + /* + * Depending on where we are running, move into + * the other half of the system, to create some + * real disturbance: + */ + this_cpu = g->threads[task_nr].curr_cpu; + if (this_cpu < g->p.nr_cpus/2) + target_cpu = g->p.nr_cpus-1; + else + target_cpu = 0; + + orig_mask = bind_to_cpu(target_cpu); + + /* Here we are running on the target CPU already */ + if (details >= 1) + printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu); + + bind_to_cpumask(orig_mask); + } + + if (details >= 3) { + timersub(&stop, &start, &diff); + runtime_ns_max = diff.tv_sec * 1000000000; + runtime_ns_max += diff.tv_usec * 1000; + + if (details >= 0) { + printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n", + process_nr, thread_nr, runtime_ns_max / bytes_done, val); + } + fflush(stdout); + } + if (!last_task) + continue; + + timersub(&stop, &start0, &diff); + runtime_ns_max = diff.tv_sec * 1000000000ULL; + runtime_ns_max += diff.tv_usec * 1000ULL; + + show_summary(runtime_ns_max, l, &convergence); + } + + gettimeofday(&stop, NULL); + timersub(&stop, &start0, &diff); + td->runtime_ns = diff.tv_sec * 1000000000ULL; + td->runtime_ns += diff.tv_usec * 1000ULL; + + free_data(thread_data, g->p.bytes_thread); + + pthread_mutex_lock(&g->stop_work_mutex); + g->bytes_done += bytes_done; + pthread_mutex_unlock(&g->stop_work_mutex); + + return NULL; +} + +/* + * A worker process starts a couple of threads: + */ +static void worker_process(int process_nr) +{ + pthread_mutex_t process_lock; + struct thread_data *td; + pthread_t *pthreads; + u8 *process_data; + int task_nr; + int ret; + int t; + + pthread_mutex_init(&process_lock, NULL); + set_taskname("process %d", process_nr); + + /* + * Pick up the memory policy and the CPU binding of our first thread, + * so that we initialize memory accordingly: + */ + task_nr = process_nr*g->p.nr_threads; + td = g->threads + task_nr; + + bind_to_memnode(td->bind_node); + bind_to_cpumask(td->bind_cpumask); + + pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t)); + process_data = setup_private_data(g->p.bytes_process); + + if (g->p.show_details >= 3) { + printf(" # process %2d global mem: %p, process mem: %p\n", + process_nr, g->data, process_data); + } + + for (t = 0; t < g->p.nr_threads; t++) { + task_nr = process_nr*g->p.nr_threads + t; + td = g->threads + task_nr; + + td->process_data = process_data; + td->process_nr = process_nr; + td->thread_nr = t; + td->task_nr = task_nr; + td->val = rand(); + td->curr_cpu = -1; + td->process_lock = &process_lock; + + ret = pthread_create(pthreads + t, NULL, worker_thread, td); + BUG_ON(ret); + } + + for (t = 0; t < g->p.nr_threads; t++) { + ret = pthread_join(pthreads[t], NULL); + BUG_ON(ret); + } + + free_data(process_data, g->p.bytes_process); + free(pthreads); +} + +static void print_summary(void) +{ + if (g->p.show_details < 0) + return; + + printf("\n ###\n"); + printf(" # %d %s will execute (on %d nodes, %d CPUs):\n", + g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", g->p.nr_nodes, g->p.nr_cpus); + printf(" # %5dx %5ldMB global shared mem operations\n", + g->p.nr_loops, g->p.bytes_global/1024/1024); + printf(" # %5dx %5ldMB process shared mem operations\n", + g->p.nr_loops, g->p.bytes_process/1024/1024); + printf(" # %5dx %5ldMB thread local mem operations\n", + g->p.nr_loops, g->p.bytes_thread/1024/1024); + + printf(" ###\n"); + + printf("\n ###\n"); fflush(stdout); +} + +static void init_thread_data(void) +{ + ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; + int t; + + g->threads = zalloc_shared_data(size); + + for (t = 0; t < g->p.nr_tasks; t++) { + struct thread_data *td = g->threads + t; + int cpu; + + /* Allow all nodes by default: */ + td->bind_node = -1; + + /* Allow all CPUs by default: */ + CPU_ZERO(&td->bind_cpumask); + for (cpu = 0; cpu < g->p.nr_cpus; cpu++) + CPU_SET(cpu, &td->bind_cpumask); + } +} + +static void deinit_thread_data(void) +{ + ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; + + free_data(g->threads, size); +} + +static int init(void) +{ + g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0); + + /* Copy over options: */ + g->p = p0; + + g->p.nr_cpus = numa_num_configured_cpus(); + + g->p.nr_nodes = numa_max_node() + 1; + + /* char array in count_process_nodes(): */ + BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0); + + if (g->p.show_quiet && !g->p.show_details) + g->p.show_details = -1; + + /* Some memory should be specified: */ + if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str) + return -1; + + if (g->p.mb_global_str) { + g->p.mb_global = atof(g->p.mb_global_str); + BUG_ON(g->p.mb_global < 0); + } + + if (g->p.mb_proc_str) { + g->p.mb_proc = atof(g->p.mb_proc_str); + BUG_ON(g->p.mb_proc < 0); + } + + if (g->p.mb_proc_locked_str) { + g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str); + BUG_ON(g->p.mb_proc_locked < 0); + BUG_ON(g->p.mb_proc_locked > g->p.mb_proc); + } + + if (g->p.mb_thread_str) { + g->p.mb_thread = atof(g->p.mb_thread_str); + BUG_ON(g->p.mb_thread < 0); + } + + BUG_ON(g->p.nr_threads <= 0); + BUG_ON(g->p.nr_proc <= 0); + + g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads; + + g->p.bytes_global = g->p.mb_global *1024L*1024L; + g->p.bytes_process = g->p.mb_proc *1024L*1024L; + g->p.bytes_process_locked = g->p.mb_proc_locked *1024L*1024L; + g->p.bytes_thread = g->p.mb_thread *1024L*1024L; + + g->data = setup_shared_data(g->p.bytes_global); + + /* Startup serialization: */ + init_global_mutex(&g->start_work_mutex); + init_global_mutex(&g->startup_mutex); + init_global_mutex(&g->startup_done_mutex); + init_global_mutex(&g->stop_work_mutex); + + init_thread_data(); + + tprintf("#\n"); + if (parse_setup_cpu_list() || parse_setup_node_list()) + return -1; + tprintf("#\n"); + + print_summary(); + + return 0; +} + +static void deinit(void) +{ + free_data(g->data, g->p.bytes_global); + g->data = NULL; + + deinit_thread_data(); + + free_data(g, sizeof(*g)); + g = NULL; +} + +/* + * Print a short or long result, depending on the verbosity setting: + */ +static void print_res(const char *name, double val, + const char *txt_unit, const char *txt_short, const char *txt_long) +{ + if (!name) + name = "main,"; + + if (!g->p.show_quiet) + printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short); + else + printf(" %14.3f %s\n", val, txt_long); +} + +static int __bench_numa(const char *name) +{ + struct timeval start, stop, diff; + u64 runtime_ns_min, runtime_ns_sum; + pid_t *pids, pid, wpid; + double delta_runtime; + double runtime_avg; + double runtime_sec_max; + double runtime_sec_min; + int wait_stat; + double bytes; + int i, t; + + if (init()) + return -1; + + pids = zalloc(g->p.nr_proc * sizeof(*pids)); + pid = -1; + + /* All threads try to acquire it, this way we can wait for them to start up: */ + pthread_mutex_lock(&g->start_work_mutex); + + if (g->p.serialize_startup) { + tprintf(" #\n"); + tprintf(" # Startup synchronization: ..."); fflush(stdout); + } + + gettimeofday(&start, NULL); + + for (i = 0; i < g->p.nr_proc; i++) { + pid = fork(); + dprintf(" # process %2d: PID %d\n", i, pid); + + BUG_ON(pid < 0); + if (!pid) { + /* Child process: */ + worker_process(i); + + exit(0); + } + pids[i] = pid; + + } + /* Wait for all the threads to start up: */ + while (g->nr_tasks_started != g->p.nr_tasks) + usleep(1000); + + BUG_ON(g->nr_tasks_started != g->p.nr_tasks); + + if (g->p.serialize_startup) { + double startup_sec; + + pthread_mutex_lock(&g->startup_done_mutex); + + /* This will start all threads: */ + pthread_mutex_unlock(&g->start_work_mutex); + + /* This mutex is locked - the last started thread will wake us: */ + pthread_mutex_lock(&g->startup_done_mutex); + + gettimeofday(&stop, NULL); + + timersub(&stop, &start, &diff); + + startup_sec = diff.tv_sec * 1000000000.0; + startup_sec += diff.tv_usec * 1000.0; + startup_sec /= 1e9; + + tprintf(" threads initialized in %.6f seconds.\n", startup_sec); + tprintf(" #\n"); + + start = stop; + pthread_mutex_unlock(&g->startup_done_mutex); + } else { + gettimeofday(&start, NULL); + } + + /* Parent process: */ + + + for (i = 0; i < g->p.nr_proc; i++) { + wpid = waitpid(pids[i], &wait_stat, 0); + BUG_ON(wpid < 0); + BUG_ON(!WIFEXITED(wait_stat)); + + } + + runtime_ns_sum = 0; + runtime_ns_min = -1LL; + + for (t = 0; t < g->p.nr_tasks; t++) { + u64 thread_runtime_ns = g->threads[t].runtime_ns; + + runtime_ns_sum += thread_runtime_ns; + runtime_ns_min = min(thread_runtime_ns, runtime_ns_min); + } + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + BUG_ON(bench_format != BENCH_FORMAT_DEFAULT); + + tprintf("\n ###\n"); + tprintf("\n"); + + runtime_sec_max = diff.tv_sec * 1000000000.0; + runtime_sec_max += diff.tv_usec * 1000.0; + runtime_sec_max /= 1e9; + + runtime_sec_min = runtime_ns_min/1e9; + + bytes = g->bytes_done; + runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9; + + if (g->p.measure_convergence) { + print_res(name, runtime_sec_max, + "secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge"); + } + + print_res(name, runtime_sec_max, + "secs,", "runtime-max/thread", "secs slowest (max) thread-runtime"); + + print_res(name, runtime_sec_min, + "secs,", "runtime-min/thread", "secs fastest (min) thread-runtime"); + + print_res(name, runtime_avg, + "secs,", "runtime-avg/thread", "secs average thread-runtime"); + + delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0; + print_res(name, delta_runtime / runtime_sec_max * 100.0, + "%,", "spread-runtime/thread", "% difference between max/avg runtime"); + + print_res(name, bytes / g->p.nr_tasks / 1e9, + "GB,", "data/thread", "GB data processed, per thread"); + + print_res(name, bytes / 1e9, + "GB,", "data-total", "GB data processed, total"); + + print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks), + "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime"); + + print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max, + "GB/sec,", "thread-speed", "GB/sec/thread speed"); + + print_res(name, bytes / runtime_sec_max / 1e9, + "GB/sec,", "total-speed", "GB/sec total speed"); + + free(pids); + + deinit(); + + return 0; +} + +#define MAX_ARGS 50 + +static int command_size(const char **argv) +{ + int size = 0; + + while (*argv) { + size++; + argv++; + } + + BUG_ON(size >= MAX_ARGS); + + return size; +} + +static void init_params(struct params *p, const char *name, int argc, const char **argv) +{ + int i; + + printf("\n # Running %s \"perf bench numa", name); + + for (i = 0; i < argc; i++) + printf(" %s", argv[i]); + + printf("\"\n"); + + memset(p, 0, sizeof(*p)); + + /* Initialize nonzero defaults: */ + + p->serialize_startup = 1; + p->data_reads = true; + p->data_writes = true; + p->data_backwards = true; + p->data_rand_walk = true; + p->nr_loops = -1; + p->init_random = true; + p->mb_global_str = "1"; + p->nr_proc = 1; + p->nr_threads = 1; + p->nr_secs = 5; + p->run_all = argc == 1; +} + +static int run_bench_numa(const char *name, const char **argv) +{ + int argc = command_size(argv); + + init_params(&p0, name, argc, argv); + argc = parse_options(argc, argv, options, bench_numa_usage, 0); + if (argc) + goto err; + + if (__bench_numa(name)) + goto err; + + return 0; + +err: + return -1; +} + +#define OPT_BW_RAM "-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk" +#define OPT_BW_RAM_NOTHP OPT_BW_RAM, "--thp", "-1" + +#define OPT_CONV "-s", "100", "-zZ0qcm", "--thp", " 1" +#define OPT_CONV_NOTHP OPT_CONV, "--thp", "-1" + +#define OPT_BW "-s", "20", "-zZ0q", "--thp", " 1" +#define OPT_BW_NOTHP OPT_BW, "--thp", "-1" + +/* + * The built-in test-suite executed by "perf bench numa -a". + * + * (A minimum of 4 nodes and 16 GB of RAM is recommended.) + */ +static const char *tests[][MAX_ARGS] = { + /* Basic single-stream NUMA bandwidth measurements: */ + { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", + "-C" , "0", "-M", "0", OPT_BW_RAM }, + { "RAM-bw-local-NOTHP,", + "mem", "-p", "1", "-t", "1", "-P", "1024", + "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP }, + { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", + "-C" , "0", "-M", "1", OPT_BW_RAM }, + + /* 2-stream NUMA bandwidth measurements: */ + { "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024", + "-C", "0,2", "-M", "0x2", OPT_BW_RAM }, + { "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024", + "-C", "0,2", "-M", "1x2", OPT_BW_RAM }, + + /* Cross-stream NUMA bandwidth measurement: */ + { "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024", + "-C", "0,8", "-M", "1,0", OPT_BW_RAM }, + + /* Convergence latency measurements: */ + { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV }, + { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV }, + { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV }, + { " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, + { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, + { " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV }, + { " 4x4-convergence-NOTHP,", + "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP }, + { " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV }, + { " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV }, + { " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV }, + { " 8x4-convergence-NOTHP,", + "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP }, + { " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV }, + { " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV }, + { " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV }, + { "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV }, + { "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV }, + + /* Various NUMA process/thread layout bandwidth measurements: */ + { " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW }, + { " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW }, + { " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW }, + { " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW }, + { " 8x1-bw-process-NOTHP,", + "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP }, + { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW }, + + { " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, + { " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, + { "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, + { "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, + + { " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, + { " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, + { " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, + { " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW }, + { " 4x8-bw-thread-NOTHP,", + "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP }, + { " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, + { " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, + + { "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, + { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, + + { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW }, + { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP }, + { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW }, + { "numa01-bw-thread-NOTHP,", + "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP }, +}; + +static int bench_all(void) +{ + int nr = ARRAY_SIZE(tests); + int ret; + int i; + + ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'"); + BUG_ON(ret < 0); + + for (i = 0; i < nr; i++) { + run_bench_numa(tests[i][0], tests[i] + 1); + } + + printf("\n"); + + return 0; +} + +int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused) +{ + init_params(&p0, "main,", argc, argv); + argc = parse_options(argc, argv, options, bench_numa_usage, 0); + if (argc) + goto err; + + if (p0.run_all) + return bench_all(); + + if (__bench_numa(NULL)) + goto err; + + return 0; + +err: + usage_with_options(numa_usage, options); + return -1; +} diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c new file mode 100644 index 000000000..d7f281c28 --- /dev/null +++ b/tools/perf/bench/sched-messaging.c @@ -0,0 +1,331 @@ +/* + * + * sched-messaging.c + * + * messaging: Benchmark for scheduler and IPC mechanisms + * + * Based on hackbench by Rusty Russell + * Ported to perf by Hitoshi Mitake + * + */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/parse-options.h" +#include "../builtin.h" +#include "bench.h" + +/* Test groups of 20 processes spraying to 20 receivers */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DATASIZE 100 + +static bool use_pipes = false; +static unsigned int loops = 100; +static bool thread_mode = false; +static unsigned int num_groups = 10; + +struct sender_context { + unsigned int num_fds; + int ready_out; + int wakefd; + int out_fds[0]; +}; + +struct receiver_context { + unsigned int num_packets; + int in_fds[2]; + int ready_out; + int wakefd; +}; + +static void fdpair(int fds[2]) +{ + if (use_pipes) { + if (pipe(fds) == 0) + return; + } else { + if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == 0) + return; + } + + err(EXIT_FAILURE, use_pipes ? "pipe()" : "socketpair()"); +} + +/* Block until we're ready to go */ +static void ready(int ready_out, int wakefd) +{ + char dummy; + struct pollfd pollfd = { .fd = wakefd, .events = POLLIN }; + + /* Tell them we're ready. */ + if (write(ready_out, &dummy, 1) != 1) + err(EXIT_FAILURE, "CLIENT: ready write"); + + /* Wait for "GO" signal */ + if (poll(&pollfd, 1, -1) != 1) + err(EXIT_FAILURE, "poll"); +} + +/* Sender sprays loops messages down each file descriptor */ +static void *sender(struct sender_context *ctx) +{ + char data[DATASIZE]; + unsigned int i, j; + + ready(ctx->ready_out, ctx->wakefd); + + /* Now pump to every receiver. */ + for (i = 0; i < loops; i++) { + for (j = 0; j < ctx->num_fds; j++) { + int ret, done = 0; + +again: + ret = write(ctx->out_fds[j], data + done, + sizeof(data)-done); + if (ret < 0) + err(EXIT_FAILURE, "SENDER: write"); + done += ret; + if (done < DATASIZE) + goto again; + } + } + + return NULL; +} + + +/* One receiver per fd */ +static void *receiver(struct receiver_context* ctx) +{ + unsigned int i; + + if (!thread_mode) + close(ctx->in_fds[1]); + + /* Wait for start... */ + ready(ctx->ready_out, ctx->wakefd); + + /* Receive them all */ + for (i = 0; i < ctx->num_packets; i++) { + char data[DATASIZE]; + int ret, done = 0; + +again: + ret = read(ctx->in_fds[0], data + done, DATASIZE - done); + if (ret < 0) + err(EXIT_FAILURE, "SERVER: read"); + done += ret; + if (done < DATASIZE) + goto again; + } + + return NULL; +} + +static pthread_t create_worker(void *ctx, void *(*func)(void *)) +{ + pthread_attr_t attr; + pthread_t childid; + int ret; + + if (!thread_mode) { + /* process mode */ + /* Fork the receiver. */ + switch (fork()) { + case -1: + err(EXIT_FAILURE, "fork()"); + break; + case 0: + (*func) (ctx); + exit(0); + break; + default: + break; + } + + return (pthread_t)0; + } + + if (pthread_attr_init(&attr) != 0) + err(EXIT_FAILURE, "pthread_attr_init:"); + +#ifndef __ia64__ + if (pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN) != 0) + err(EXIT_FAILURE, "pthread_attr_setstacksize"); +#endif + + ret = pthread_create(&childid, &attr, func, ctx); + if (ret != 0) + err(EXIT_FAILURE, "pthread_create failed"); + + return childid; +} + +static void reap_worker(pthread_t id) +{ + int proc_status; + void *thread_status; + + if (!thread_mode) { + /* process mode */ + wait(&proc_status); + if (!WIFEXITED(proc_status)) + exit(1); + } else { + pthread_join(id, &thread_status); + } +} + +/* One group of senders and receivers */ +static unsigned int group(pthread_t *pth, + unsigned int num_fds, + int ready_out, + int wakefd) +{ + unsigned int i; + struct sender_context *snd_ctx = malloc(sizeof(struct sender_context) + + num_fds * sizeof(int)); + + if (!snd_ctx) + err(EXIT_FAILURE, "malloc()"); + + for (i = 0; i < num_fds; i++) { + int fds[2]; + struct receiver_context *ctx = malloc(sizeof(*ctx)); + + if (!ctx) + err(EXIT_FAILURE, "malloc()"); + + + /* Create the pipe between client and server */ + fdpair(fds); + + ctx->num_packets = num_fds * loops; + ctx->in_fds[0] = fds[0]; + ctx->in_fds[1] = fds[1]; + ctx->ready_out = ready_out; + ctx->wakefd = wakefd; + + pth[i] = create_worker(ctx, (void *)receiver); + + snd_ctx->out_fds[i] = fds[1]; + if (!thread_mode) + close(fds[0]); + } + + /* Now we have all the fds, fork the senders */ + for (i = 0; i < num_fds; i++) { + snd_ctx->ready_out = ready_out; + snd_ctx->wakefd = wakefd; + snd_ctx->num_fds = num_fds; + + pth[num_fds+i] = create_worker(snd_ctx, (void *)sender); + } + + /* Close the fds we have left */ + if (!thread_mode) + for (i = 0; i < num_fds; i++) + close(snd_ctx->out_fds[i]); + + /* Return number of children to reap */ + return num_fds * 2; +} + +static const struct option options[] = { + OPT_BOOLEAN('p', "pipe", &use_pipes, + "Use pipe() instead of socketpair()"), + OPT_BOOLEAN('t', "thread", &thread_mode, + "Be multi thread instead of multi process"), + OPT_UINTEGER('g', "group", &num_groups, "Specify number of groups"), + OPT_UINTEGER('l', "loop", &loops, "Specify number of loops"), + OPT_END() +}; + +static const char * const bench_sched_message_usage[] = { + "perf bench sched messaging ", + NULL +}; + +int bench_sched_messaging(int argc, const char **argv, + const char *prefix __maybe_unused) +{ + unsigned int i, total_children; + struct timeval start, stop, diff; + unsigned int num_fds = 20; + int readyfds[2], wakefds[2]; + char dummy; + pthread_t *pth_tab; + + argc = parse_options(argc, argv, options, + bench_sched_message_usage, 0); + + pth_tab = malloc(num_fds * 2 * num_groups * sizeof(pthread_t)); + if (!pth_tab) + err(EXIT_FAILURE, "main:malloc()"); + + fdpair(readyfds); + fdpair(wakefds); + + total_children = 0; + for (i = 0; i < num_groups; i++) + total_children += group(pth_tab+total_children, num_fds, + readyfds[1], wakefds[0]); + + /* Wait for everyone to be ready */ + for (i = 0; i < total_children; i++) + if (read(readyfds[0], &dummy, 1) != 1) + err(EXIT_FAILURE, "Reading for readyfds"); + + gettimeofday(&start, NULL); + + /* Kick them off */ + if (write(wakefds[1], &dummy, 1) != 1) + err(EXIT_FAILURE, "Writing to start them"); + + /* Reap them all */ + for (i = 0; i < total_children; i++) + reap_worker(pth_tab[i]); + + gettimeofday(&stop, NULL); + + timersub(&stop, &start, &diff); + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# %d sender and receiver %s per group\n", + num_fds, thread_mode ? "threads" : "processes"); + printf("# %d groups == %d %s run\n\n", + num_groups, num_groups * 2 * num_fds, + thread_mode ? "threads" : "processes"); + printf(" %14s: %lu.%03lu [sec]\n", "Total time", + diff.tv_sec, + (unsigned long) (diff.tv_usec/1000)); + break; + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", diff.tv_sec, + (unsigned long) (diff.tv_usec/1000)); + break; + default: + /* reaching here is something disaster */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + break; + } + + free(pth_tab); + + return 0; +} diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c new file mode 100644 index 000000000..005cc2837 --- /dev/null +++ b/tools/perf/bench/sched-pipe.c @@ -0,0 +1,184 @@ +/* + * + * sched-pipe.c + * + * pipe: Benchmark for pipe() + * + * Based on pipe-test-1m.c by Ingo Molnar + * http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c + * Ported to perf by Hitoshi Mitake + */ +#include "../perf.h" +#include "../util/util.h" +#include "../util/parse-options.h" +#include "../builtin.h" +#include "bench.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct thread_data { + int nr; + int pipe_read; + int pipe_write; + pthread_t pthread; +}; + +#define LOOPS_DEFAULT 1000000 +static int loops = LOOPS_DEFAULT; + +/* Use processes by default: */ +static bool threaded; + +static const struct option options[] = { + OPT_INTEGER('l', "loop", &loops, "Specify number of loops"), + OPT_BOOLEAN('T', "threaded", &threaded, "Specify threads/process based task setup"), + OPT_END() +}; + +static const char * const bench_sched_pipe_usage[] = { + "perf bench sched pipe ", + NULL +}; + +static void *worker_thread(void *__tdata) +{ + struct thread_data *td = __tdata; + int m = 0, i; + int ret; + + for (i = 0; i < loops; i++) { + if (!td->nr) { + ret = read(td->pipe_read, &m, sizeof(int)); + BUG_ON(ret != sizeof(int)); + ret = write(td->pipe_write, &m, sizeof(int)); + BUG_ON(ret != sizeof(int)); + } else { + ret = write(td->pipe_write, &m, sizeof(int)); + BUG_ON(ret != sizeof(int)); + ret = read(td->pipe_read, &m, sizeof(int)); + BUG_ON(ret != sizeof(int)); + } + } + + return NULL; +} + +int bench_sched_pipe(int argc, const char **argv, const char *prefix __maybe_unused) +{ + struct thread_data threads[2], *td; + int pipe_1[2], pipe_2[2]; + struct timeval start, stop, diff; + unsigned long long result_usec = 0; + int nr_threads = 2; + int t; + + /* + * why does "ret" exist? + * discarding returned value of read(), write() + * causes error in building environment for perf + */ + int __maybe_unused ret, wait_stat; + pid_t pid, retpid __maybe_unused; + + argc = parse_options(argc, argv, options, bench_sched_pipe_usage, 0); + + BUG_ON(pipe(pipe_1)); + BUG_ON(pipe(pipe_2)); + + gettimeofday(&start, NULL); + + for (t = 0; t < nr_threads; t++) { + td = threads + t; + + td->nr = t; + + if (t == 0) { + td->pipe_read = pipe_1[0]; + td->pipe_write = pipe_2[1]; + } else { + td->pipe_write = pipe_1[1]; + td->pipe_read = pipe_2[0]; + } + } + + + if (threaded) { + + for (t = 0; t < nr_threads; t++) { + td = threads + t; + + ret = pthread_create(&td->pthread, NULL, worker_thread, td); + BUG_ON(ret); + } + + for (t = 0; t < nr_threads; t++) { + td = threads + t; + + ret = pthread_join(td->pthread, NULL); + BUG_ON(ret); + } + + } else { + pid = fork(); + assert(pid >= 0); + + if (!pid) { + worker_thread(threads + 0); + exit(0); + } else { + worker_thread(threads + 1); + } + + retpid = waitpid(pid, &wait_stat, 0); + assert((retpid == pid) && WIFEXITED(wait_stat)); + } + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Executed %d pipe operations between two %s\n\n", + loops, threaded ? "threads" : "processes"); + + result_usec = diff.tv_sec * 1000000; + result_usec += diff.tv_usec; + + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + diff.tv_sec, + (unsigned long) (diff.tv_usec/1000)); + + printf(" %14lf usecs/op\n", + (double)result_usec / (double)loops); + printf(" %14d ops/sec\n", + (int)((double)loops / + ((double)result_usec / (double)1000000))); + break; + + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", + diff.tv_sec, + (unsigned long) (diff.tv_usec / 1000)); + break; + + default: + /* reaching here is something disaster */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + break; + } + + return 0; +} -- cgit v1.2.3