Diffstat:
 man/man2/membarrier.2 | 460 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 460 insertions(+), 0 deletions(-)
diff --git a/man/man2/membarrier.2 b/man/man2/membarrier.2
new file mode 100644
index 000000000..cd8029dd9
--- /dev/null
+++ b/man/man2/membarrier.2
@@ -0,0 +1,460 @@
+'\" t
+.\" Copyright 2015-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH membarrier 2 (date) "Linux man-pages (unreleased)"
+.SH NAME
+membarrier \- issue memory barriers on a set of threads
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.P
+.BR "#include <linux/membarrier.h>" \
+" /* Definition of " MEMBARRIER_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.P
+.BI "int syscall(SYS_membarrier, int " cmd ", unsigned int " flags \
+", int " cpu_id );
+.fi
+.P
+.IR Note :
+glibc provides no wrapper for
+.BR membarrier (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR membarrier ()
+system call helps reduce the overhead of the memory barrier
+instructions required to order memory accesses on multi-core systems.
+However, this system call is heavier than a memory barrier, so using it
+effectively is
+.I not
+as simple as replacing memory barriers with this
+system call, but requires an understanding of the details below.
+.P
+When using memory barriers, bear in mind that a memory barrier
+must always either be matched with its memory barrier counterparts,
+or the architecture's memory model must not require the matching
+barriers.
+.P
+There are cases where one side of the matching barriers (which we will
+refer to as "fast side") is executed much more often than the other
+(which we will refer to as "slow side").
+This is a prime target for the use of
+.BR membarrier ().
+The key idea is to replace, for these matching
+barriers, the fast-side memory barriers by simple compiler barriers,
+for example:
+.P
+.in +4n
+.EX
+asm volatile ("" : : : "memory")
+.EE
+.in
+.P
+and replace the slow-side memory barriers by calls to
+.BR membarrier ().
+.P
+This will add overhead to the slow side, and remove overhead from the
+fast side, thus resulting in an overall performance increase as long as
+the slow side is infrequent enough that the overhead of the
+.BR membarrier ()
+calls does not outweigh the performance gain on the fast side
+(see EXAMPLES below).
+.P
+The
+.I cmd
+argument is one of the following:
+.TP
+.BR MEMBARRIER_CMD_QUERY " (since Linux 4.3)"
+Query the set of supported commands.
+The return value of the call is a bit mask of supported
+commands.
+.BR MEMBARRIER_CMD_QUERY ,
+which has the value 0,
+is not itself included in this bit mask.
+This command is always supported (on kernels where
+.BR membarrier ()
+is provided).
+.TP
+.BR MEMBARRIER_CMD_GLOBAL " (since Linux 4.16)"
+Ensure that all threads from all processes on the system pass through a
+state where all memory accesses to user-space addresses match program
+order between entry to and return from the
+.BR membarrier ()
+system call.
+All threads on the system are targeted by this command.
+.TP
+.BR MEMBARRIER_CMD_GLOBAL_EXPEDITED " (since Linux 4.16)"
+Execute a memory barrier on all running threads of all processes that
+previously registered with
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED .
+.IP
+Upon return from the system call, the calling thread has a guarantee that all
+running threads have passed through a state where all memory accesses to
+user-space addresses match program order between entry to and return
+from the system call (non-running threads are de facto in such a state).
+This guarantee is provided only for the threads of processes that
+previously registered with
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED .
+.IP
+Given that registration is about the intent to receive the barriers, it
+is valid to invoke
+.B MEMBARRIER_CMD_GLOBAL_EXPEDITED
+from a process that has not employed
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED .
+.IP
+The "expedited" commands complete faster than the non-expedited ones;
+they never block, but have the downside of causing extra overhead.
+.TP
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED " (since Linux 4.16)"
+Register the process's intent to receive
+.B MEMBARRIER_CMD_GLOBAL_EXPEDITED
+memory barriers.
+.TP
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED " (since Linux 4.14)"
+Execute a memory barrier on each running thread belonging to the same
+process as the calling thread.
+.IP
+Upon return from the system call, the calling
+thread has a guarantee that all its running thread siblings have passed
+through a state where all memory accesses to user-space addresses match
+program order between entry to and return from the system call
+(non-running threads are de facto in such a state).
+This guarantee is provided only for threads in
+the same process as the calling thread.
+.IP
+The "expedited" commands complete faster than the non-expedited ones;
+they never block, but have the downside of causing extra overhead.
+.IP
+A process must register its intent to use the private
+expedited command prior to using it.
+.TP
+.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED " (since Linux 4.14)"
+Register the process's intent to use
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED .
+.TP
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE " (since Linux 4.16)"
+In addition to providing the memory ordering guarantees described in
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED ,
+upon return from system call the calling thread has a guarantee that all its
+running thread siblings have executed a core serializing instruction.
+This guarantee is provided only for threads in
+the same process as the calling thread.
+.IP
+The "expedited" commands complete faster than the non-expedited ones,
+they never block, but have the downside of causing extra overhead.
+.IP
+A process must register its intent to use the private expedited sync
+core command prior to using it.
+.TP
+.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE " (since Linux 4.16)"
+Register the process's intent to use
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE .
+.TP
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ " (since Linux 5.10)"
+Ensure that, upon return from the system call, all running thread
+siblings of the calling thread have had any currently running rseq
+critical sections restarted, if the
+.I flags
+parameter is 0; if the
+.I flags
+parameter is
+.BR MEMBARRIER_CMD_FLAG_CPU ,
+then this operation is performed only on the CPU indicated by
+.IR cpu_id .
+This guarantee is provided only for threads in
+the same process as the calling thread.
+.IP
+RSEQ membarrier is only available in the "private expedited" form.
+.IP
+A process must register its intent to use the private expedited rseq
+command prior to using it.
+.TP
+.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ " (since Linux 5.10)"
+Register the process's intent to use
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ .
+.TP
+.BR MEMBARRIER_CMD_SHARED " (since Linux 4.3)"
+This is an alias for
+.B MEMBARRIER_CMD_GLOBAL
+that exists for header backward compatibility.
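+.P
+For example, a process that intends to use the private expedited
+command registers once at startup and then issues the barrier from
+its slow path; the global expedited command follows the same
+register-then-use pattern.
+A minimal sketch using
+.BR syscall (2),
+with error checking omitted for brevity:
+.P
+.in +4n
+.EX
+/* Once, at process startup: declare the intent to use the
+   private expedited command. */
+syscall(SYS_membarrier, MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
+        0, 0);
+\&
+/* Later, on the slow path: execute a memory barrier on all
+   running threads of this process. */
+syscall(SYS_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
+.EE
+.in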
+.P
+The
+.I flags
+argument must be specified as 0 unless the command is
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ ,
+in which case
+.I flags
+can be either 0 or
+.BR MEMBARRIER_CMD_FLAG_CPU .
+.P
+The
+.I cpu_id
+argument is ignored unless
+.I flags
+is
+.BR MEMBARRIER_CMD_FLAG_CPU ,
+in which case it must specify the CPU targeted by this membarrier
+command.
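+.P
+For example, restarting any rseq critical section running on a single
+CPU might look as follows (a sketch; it assumes the process has
+already registered with
+.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ ,
+and the CPU number 3 is arbitrary):
+.P
+.in +4n
+.EX
+/* Restart any rseq critical section currently running on CPU 3. */
+syscall(SYS_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+        MEMBARRIER_CMD_FLAG_CPU, 3);
+.EE
+.in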
+.P
+All memory accesses performed in program order from each targeted thread
+are guaranteed to be ordered with respect to
+.BR membarrier ().
+.P
+If we use the notation
+.I barrier()
+to represent a compiler barrier forcing memory
+accesses to be performed in program order across the barrier, and
+.I smp_mb()
+to represent explicit memory barriers forcing full memory
+ordering across the barrier, we have the following ordering table for
+each pairing of
+.IR barrier() ,
+.BR membarrier (),
+and
+.IR smp_mb() .
+The pair ordering is detailed as (O: ordered, X: not ordered):
+.P
+.RS
+.TS
+tab(|);
+l c c c.
+\&|barrier()|smp_mb()|membarrier()
+barrier()|X|X|O
+smp_mb()|X|O|O
+membarrier()|O|O|O
+.TE
+.RE
+.SH RETURN VALUE
+On success, the
+.B MEMBARRIER_CMD_QUERY
+operation returns a bit mask of supported commands, and the
+.BR MEMBARRIER_CMD_GLOBAL ,
+.BR MEMBARRIER_CMD_GLOBAL_EXPEDITED ,
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED ,
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED ,
+.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED ,
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE ,
+and
+.B MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
+operations return zero.
+On error, \-1 is returned,
+and
+.I errno
+is set to indicate the error.
+.P
+For a given command, with
+.I flags
+set to 0, this system call is
+guaranteed to always return the same value until reboot.
+Further calls with the same arguments will lead to the same result.
+Therefore, with
+.I flags
+set to 0, error handling is required only for the first call to
+.BR membarrier ().
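+.P
+Support can therefore be queried once and the resulting command mask
+cached, as in the following minimal sketch:
+.P
+.in +4n
+.EX
+/* The result of this query is stable until reboot, so it can be
+   obtained once and reused. */
+int cmds = syscall(SYS_membarrier, MEMBARRIER_CMD_QUERY, 0, 0);
+\&
+/* A negative value means membarrier() itself is unavailable. */
+int global_ok = cmds >= 0 && (cmds & MEMBARRIER_CMD_GLOBAL);
+.EE
+.in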
+.SH ERRORS
+.TP
+.B EINVAL
+.I cmd
+is invalid, or
+.I flags
+is nonzero, or the
+.B MEMBARRIER_CMD_GLOBAL
+command is disabled because the
+.I nohz_full
+CPU parameter has been set, or the
+.B MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE
+and
+.B MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
+commands are not implemented by the architecture.
+.TP
+.B ENOSYS
+The
+.BR membarrier ()
+system call is not implemented by this kernel.
+.TP
+.B EPERM
+The current process was not registered prior to using private expedited
+commands.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 4.3.
+.P
+Before Linux 5.10, the prototype was:
+.P
+.in +4n
+.EX
+.BI "int membarrier(int " cmd ", int " flags );
+.EE
+.in
+.SH NOTES
+A memory barrier instruction is part of the instruction set of
+architectures with weakly ordered memory models.
+It orders memory
+accesses prior to the barrier and after the barrier with respect to
+matching barriers on other cores.
+For instance, a load fence can order
+loads prior to and following that fence with respect to stores ordered
+by store fences.
+.P
+Program order is the order in which instructions are ordered in the
+program assembly code.
+.P
+Examples where
+.BR membarrier ()
+can be useful include implementations
+of Read-Copy-Update libraries and garbage collectors.
+.SH EXAMPLES
+Assuming a multithreaded application where "fast_path()" is executed
+very frequently, and where "slow_path()" is executed infrequently, the
+following code (x86) can be transformed using
+.BR membarrier ():
+.P
+.in +4n
+.\" SRC BEGIN (membarrier.c)
+.EX
+#include <stdlib.h>
+\&
+static volatile int a, b;
+\&
+static void
+fast_path(int *read_b)
+{
+ a = 1;
+ asm volatile ("mfence" : : : "memory");
+ *read_b = b;
+}
+\&
+static void
+slow_path(int *read_a)
+{
+ b = 1;
+ asm volatile ("mfence" : : : "memory");
+ *read_a = a;
+}
+\&
+int
+main(void)
+{
+ int read_a, read_b;
+\&
+ /*
+ * Real applications would call fast_path() and slow_path()
+ * from different threads. Call those from main() to keep
+ * this example short.
+ */
+\&
+ slow_path(&read_a);
+ fast_path(&read_b);
+\&
+ /*
+ * read_b == 0 implies read_a == 1 and
+ * read_a == 0 implies read_b == 1.
+ */
+\&
+ if (read_b == 0 && read_a == 0)
+ abort();
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.in
+.P
+The code above transformed to use
+.BR membarrier ()
+becomes:
+.P
+.in +4n
+.EX
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/membarrier.h>
+\&
+static volatile int a, b;
+\&
+static int
+membarrier(int cmd, unsigned int flags, int cpu_id)
+{
+ return syscall(__NR_membarrier, cmd, flags, cpu_id);
+}
+\&
+static int
+init_membarrier(void)
+{
+ int ret;
+\&
+ /* Check that membarrier() is supported. */
+\&
+ ret = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
+ if (ret < 0) {
+ perror("membarrier");
+ return \-1;
+ }
+\&
+ if (!(ret & MEMBARRIER_CMD_GLOBAL)) {
+ fprintf(stderr,
+ "membarrier does not support MEMBARRIER_CMD_GLOBAL\en");
+ return \-1;
+ }
+\&
+ return 0;
+}
+\&
+static void
+fast_path(int *read_b)
+{
+ a = 1;
+ asm volatile ("" : : : "memory");
+ *read_b = b;
+}
+\&
+static void
+slow_path(int *read_a)
+{
+ b = 1;
+ membarrier(MEMBARRIER_CMD_GLOBAL, 0, 0);
+ *read_a = a;
+}
+\&
+int
+main(void)
+{
+ int read_a, read_b;
+\&
+ if (init_membarrier())
+ exit(EXIT_FAILURE);
+\&
+ /*
+ * Real applications would call fast_path() and slow_path()
+ * from different threads. Call those from main() to keep
+ * this example short.
+ */
+\&
+ slow_path(&read_a);
+ fast_path(&read_b);
+\&
+ /*
+ * read_b == 0 implies read_a == 1 and
+ * read_a == 0 implies read_b == 1.
+ */
+\&
+ if (read_b == 0 && read_a == 0)
+ abort();
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.in
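+.P
+Where the barrier needs to cover only the threads of the calling
+process, the private expedited command can be used instead.
+A sketch of the parts of the program above that would change
+(registration, as described in DESCRIPTION, must come first, and
+support should be checked via the
+.B MEMBARRIER_CMD_QUERY
+bit mask):
+.P
+.in +4n
+.EX
+/* In init_membarrier(): also register the intent to use the
+   private expedited command (available since Linux 4.14). */
+if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0) < 0) {
+    perror("membarrier");
+    return \-1;
+}
+\&
+/* In slow_path(): issue the barrier only to the threads of this
+   process. */
+membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
+.EE
+.in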
+.\" .SH SEE ALSO
+.\" FIXME See if the following syscalls make it into Linux 4.15 or later
+.\" .BR cpu_opv (2),
+.\" .BR rseq (2)