summaryrefslogtreecommitdiffstats
path: root/man2/seccomp_unotify.2
blob: 1de5601fef4c99d9009899b5d0a29cc54d7ff634 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
.\" Copyright (C) 2020 Michael Kerrisk <mtk.manpages@gmail.com>
.\"
.\" SPDX-License-Identifier: Linux-man-pages-copyleft
.\"
.TH seccomp_unotify 2 (date) "Linux man-pages (unreleased)"
.SH NAME
seccomp_unotify \- Seccomp user-space notification mechanism
.SH LIBRARY
Standard C library
.RI ( libc ", " \-lc )
.SH SYNOPSIS
.nf
.B #include <linux/seccomp.h>
.B #include <linux/filter.h>
.B #include <linux/audit.h>
.P
.BI "int seccomp(unsigned int " operation ", unsigned int " flags \
", void *" args );
.P
.B #include <sys/ioctl.h>
.P
.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_RECV,"
.BI "          struct seccomp_notif *" req );
.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_SEND,"
.BI "          struct seccomp_notif_resp *" resp );
.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ID_VALID, __u64 *" id );
.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ADDFD,"
.BI "          struct seccomp_notif_addfd *" addfd );
.fi
.SH DESCRIPTION
This page describes the user-space notification mechanism provided by the
Secure Computing (seccomp) facility.
As well as the use of the
.B SECCOMP_FILTER_FLAG_NEW_LISTENER
flag, the
.B SECCOMP_RET_USER_NOTIF
action value, and the
.B SECCOMP_GET_NOTIF_SIZES
operation described in
.MR seccomp 2 ,
this mechanism involves the use of a number of related
.MR ioctl 2
operations (described below).
.\"
.SS Overview
In conventional usage of a seccomp filter,
the decision about how to treat a system call is made by the filter itself.
By contrast, the user-space notification mechanism allows
the seccomp filter to delegate
the handling of the system call to another user-space process.
Note that this mechanism is explicitly
.B not
intended as a method implementing security policy; see NOTES.
.P
In the discussion that follows,
the thread(s) on which the seccomp filter is installed is (are)
referred to as the
.IR target ,
and the process that is notified by the user-space notification
mechanism is referred to as the
.IR supervisor .
.P
A suitably privileged supervisor can use the user-space notification
mechanism to perform actions on behalf of the target.
The advantage of the user-space notification mechanism is that
the supervisor will
usually be able to retrieve information about the target and the
performed system call that the seccomp filter itself cannot.
(A seccomp filter is limited in the information it can obtain and
the actions that it can perform because it
is running on a virtual machine inside the kernel.)
.P
An overview of the steps performed by the target and the supervisor
is as follows:
.\"-------------------------------------
.IP (1) 5
The target establishes a seccomp filter in the usual manner,
but with two differences:
.RS
.IP \[bu] 3
The
.MR seccomp 2
.I flags
argument includes the flag
.BR SECCOMP_FILTER_FLAG_NEW_LISTENER .
Consequently, the return value of the (successful)
.MR seccomp 2
call is a new "listening"
file descriptor that can be used to receive notifications.
Only one "listening" seccomp filter can be installed for a thread.
.\" FIXME
.\" Is the last sentence above correct?
.\"
.\" Kees Cook (25 Oct 2020) notes:
.\"
.\" I like this limitation, but I expect that it'll need to change in the
.\" future. Even with LSMs, we see the need for arbitrary stacking, and the
.\" idea of there being only 1 supervisor will eventually break down. Right
.\" now there is only 1 because only container managers are using this
.\" feature. But if some daemon starts using it to isolate some thread,
.\" suddenly it might break if a container manager is trying to listen to it
.\" too, etc. I expect it won't be needed soon, but I do think it'll change.
.\"
.IP \[bu]
In cases where it is appropriate, the seccomp filter returns the action value
.BR SECCOMP_RET_USER_NOTIF .
This return value will trigger a notification event.
.RE
.\"-------------------------------------
.IP (2)
In order that the supervisor can obtain notifications
using the listening file descriptor,
(a duplicate of) that file descriptor must be passed from
the target to the supervisor.
One way in which this could be done is by passing the file descriptor
over a UNIX domain socket connection between the target and the supervisor
(using the
.B SCM_RIGHTS
ancillary message type described in
.MR unix 7 ).
Another way to do this is through the use of
.MR pidfd_getfd 2 .
.\" Jann Horn:
.\"     Instead of using unix domain sockets to send the fd to the
.\"     parent, I think you could also use clone3() with
.\"     flags==CLONE_FILES|SIGCHLD, dup2() the seccomp fd to an fd
.\"     that was reserved in the parent, call unshare(CLONE_FILES)
.\"     in the child after setting up the seccomp fd, and wake
.\"     up the parent with something like pthread_cond_signal()?
.\"     I'm not sure whether that'd look better or worse in the
.\"     end though, so maybe just ignore this comment.
.\"-------------------------------------
.IP (3)
The supervisor will receive notification events
on the listening file descriptor.
These events are returned as structures of type
.IR seccomp_notif .
Because this structure and its size may evolve over kernel versions,
the supervisor must first determine the size of this structure
using the
.MR seccomp 2
.B SECCOMP_GET_NOTIF_SIZES
operation, which returns a structure of type
.IR seccomp_notif_sizes .
The supervisor allocates a buffer of size
.I seccomp_notif_sizes.seccomp_notif
bytes to receive notification events.
In addition,the supervisor allocates another buffer of size
.I seccomp_notif_sizes.seccomp_notif_resp
bytes for the response (a
.I struct seccomp_notif_resp
structure)
that it will provide to the kernel (and thus the target).
.\"-------------------------------------
.IP (4)
The target then performs its workload,
which includes system calls that will be controlled by the seccomp filter.
Whenever one of these system calls causes the filter to return the
.B SECCOMP_RET_USER_NOTIF
action value, the kernel does
.I not
(yet) execute the system call;
instead, execution of the target is temporarily blocked inside
the kernel (in a sleep state that is interruptible by signals)
and a notification event is generated on the listening file descriptor.
.\"-------------------------------------
.IP (5)
The supervisor can now repeatedly monitor the
listening file descriptor for
.BR SECCOMP_RET_USER_NOTIF -triggered
events.
To do this, the supervisor uses the
.B SECCOMP_IOCTL_NOTIF_RECV
.MR ioctl 2
operation to read information about a notification event;
this operation blocks until an event is available.
The operation returns a
.I seccomp_notif
structure containing information about the system call
that is being attempted by the target.
(As described in NOTES,
the file descriptor can also be monitored with
.MR select 2 ,
.MR poll 2 ,
or
.MR epoll 7 .)
.\" FIXME
.\" Christian Brauner:
.\"
.\" Do we support O_NONBLOCK with SECCOMP_IOCTL_NOTIF_RECV and if
.\" not should we?
.\"
.\" Michael Kerrisk:
.\"
.\" A quick test suggests that O_NONBLOCK has no effect on the blocking
.\" behavior of SECCOMP_IOCTL_NOTIF_RECV.
.
.\"-------------------------------------
.IP (6)
The
.I seccomp_notif
structure returned by the
.B SECCOMP_IOCTL_NOTIF_RECV
operation includes the same information (a
.I seccomp_data
structure) that was passed to the seccomp filter.
This information allows the supervisor to discover the system call number and
the arguments for the target's system call.
In addition, the notification event contains the ID of the thread
that triggered the notification and a unique cookie value that
is used in subsequent
.B SECCOMP_IOCTL_NOTIF_ID_VALID
and
.B SECCOMP_IOCTL_NOTIF_SEND
operations.
.IP
The information in the notification can be used to discover the
values of pointer arguments for the target's system call.
(This is something that can't be done from within a seccomp filter.)
One way in which the supervisor can do this is to open the corresponding
.IR /proc/ tid /mem
file (see
.MR proc 5 )
and read bytes from the location that corresponds to one of
the pointer arguments whose value is supplied in the notification event.
.\" Tycho Andersen mentioned that there are alternatives to /proc/PID/mem,
.\" such as ptrace() and /proc/PID/map_files
(The supervisor must be careful to avoid
a race condition that can occur when doing this;
see the description of the
.B SECCOMP_IOCTL_NOTIF_ID_VALID
.MR ioctl 2
operation below.)
In addition,
the supervisor can access other system information that is visible
in user space but which is not accessible from a seccomp filter.
.\"-------------------------------------
.IP (7)
Having obtained information as per the previous step,
the supervisor may then choose to perform an action in response
to the target's system call
(which, as noted above, is not executed when the seccomp filter returns the
.B SECCOMP_RET_USER_NOTIF
action value).
.IP
One example use case here relates to containers.
The target may be located inside a container where
it does not have sufficient capabilities to mount a filesystem
in the container's mount namespace.
However, the supervisor may be a more privileged process that
does have sufficient capabilities to perform the mount operation.
.\"-------------------------------------
.IP (8)
The supervisor then sends a response to the notification.
The information in this response is used by the kernel to construct
a return value for the target's system call and provide
a value that will be assigned to the
.I errno
variable of the target.
.IP
The response is sent using the
.B SECCOMP_IOCTL_NOTIF_SEND
.MR ioctl 2
operation, which is used to transmit a
.I seccomp_notif_resp
structure to the kernel.
This structure includes a cookie value that the supervisor obtained in the
.I seccomp_notif
structure returned by the
.B SECCOMP_IOCTL_NOTIF_RECV
operation.
This cookie value allows the kernel to associate the response with the
target.
This structure must include the cookie value that the supervisor
obtained in the
.I seccomp_notif
structure returned by the
.B SECCOMP_IOCTL_NOTIF_RECV
operation;
the cookie allows the kernel to associate the response with the target.
.\"-------------------------------------
.IP (9)
Once the notification has been sent,
the system call in the target thread unblocks,
returning the information that was provided by the supervisor
in the notification response.
.\"-------------------------------------
.P
As a variation on the last two steps,
the supervisor can send a response that tells the kernel that it
should execute the target thread's system call; see the discussion of
.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
below.
.\"
.SH IOCTL OPERATIONS
The following
.MR ioctl 2
operations are supported by the seccomp user-space
notification file descriptor.
For each of these operations, the first (file descriptor) argument of
.MR ioctl 2
is the listening file descriptor returned by a call to
.MR seccomp 2
with the
.B SECCOMP_FILTER_FLAG_NEW_LISTENER
flag.
.\"
.SS SECCOMP_IOCTL_NOTIF_RECV
The
.B SECCOMP_IOCTL_NOTIF_RECV
operation (available since Linux 5.0) is used to obtain a user-space
notification event.
If no such event is currently pending,
the operation blocks until an event occurs.
The third
.MR ioctl 2
argument is a pointer to a structure of the following form
which contains information about the event.
This structure must be zeroed out before the call.
.P
.in +4n
.EX
struct seccomp_notif {
    __u64  id;              /* Cookie */
    __u32  pid;             /* TID of target thread */
    __u32  flags;           /* Currently unused (0) */
    struct seccomp_data data;   /* See seccomp(2) */
};
.EE
.in
.P
The fields in this structure are as follows:
.TP
.I id
This is a cookie for the notification.
Each such cookie is guaranteed to be unique for the corresponding
seccomp filter.
.RS
.IP \[bu] 3
The cookie can be used with the
.B SECCOMP_IOCTL_NOTIF_ID_VALID
.MR ioctl 2
operation described below.
.IP \[bu]
When returning a notification response to the kernel,
the supervisor must include the cookie value in the
.I seccomp_notif_resp
structure that is specified as the argument of the
.B SECCOMP_IOCTL_NOTIF_SEND
operation.
.RE
.TP
.I pid
This is the thread ID of the target thread that triggered
the notification event.
.TP
.I flags
This is a bit mask of flags providing further information on the event.
In the current implementation, this field is always zero.
.TP
.I data
This is a
.I seccomp_data
structure containing information about the system call that
triggered the notification.
This is the same structure that is passed to the seccomp filter.
See
.MR seccomp 2
for details of this structure.
.P
On success, this operation returns 0; on failure, \-1 is returned, and
.I errno
is set to indicate the cause of the error.
This operation can fail with the following errors:
.TP
.BR EINVAL " (since Linux 5.5)"
.\" commit 2882d53c9c6f3b8311d225062522f03772cf0179
The
.I seccomp_notif
structure that was passed to the call contained nonzero fields.
.TP
.B ENOENT
The target thread was killed by a signal as the notification information
was being generated,
or the target's (blocked) system call was interrupted by a signal handler.
.\" FIXME
.\" From my experiments,
.\" it appears that if a SECCOMP_IOCTL_NOTIF_RECV is done after
.\" the target thread terminates, then the ioctl() simply
.\" blocks (rather than returning an error to indicate that the
.\" target no longer exists).
.\"
.\" I found that surprising, and it required some contortions in
.\" the example program.  It was not possible to code my SIGCHLD
.\" handler (which reaps the zombie when the worker/target
.\" terminates) to simply set a flag checked in the main
.\" handleNotifications() loop, since this created an
.\" unavoidable race where the child might terminate just after
.\" I had checked the flag, but before I blocked (forever!) in the
.\" SECCOMP_IOCTL_NOTIF_RECV operation. Instead, I had to code
.\" the signal handler to simply call _exit(2) in order to
.\" terminate the parent process (the supervisor).
.\"
.\" Is this expected behavior? It seems to me rather
.\" desirable that SECCOMP_IOCTL_NOTIF_RECV should give an error
.\" if the target has terminated.
.\"
.\" Jann posted a patch to rectify this, but there was no response
.\" (Lore link: https://bit.ly/3jvUBxk) to his question about fixing
.\" this issue. (I've tried building with the patch, but encountered
.\" an issue with the target process entering D state after a signal.)
.\"
.\" For now, this behavior is documented in BUGS.
.\"
.\" Kees Cook commented: Let's change [this] ASAP!
.\"
.SS SECCOMP_IOCTL_NOTIF_ID_VALID
The
.B SECCOMP_IOCTL_NOTIF_ID_VALID
operation (available since Linux 5.0) is used to check that a notification ID
returned by an earlier
.B SECCOMP_IOCTL_NOTIF_RECV
operation is still valid
(i.e., that the target still exists and its system call
is still blocked waiting for a response).
.P
The third
.MR ioctl 2
argument is a pointer to the cookie
.RI ( id )
returned by the
.B SECCOMP_IOCTL_NOTIF_RECV
operation.
.P
This operation is necessary to avoid race conditions that can occur when the
.I pid
returned by the
.B SECCOMP_IOCTL_NOTIF_RECV
operation terminates, and that process ID is reused by another process.
An example of this kind of race is the following
.IP (1) 5
A notification is generated on the listening file descriptor.
The returned
.I seccomp_notif
contains the TID of the target thread (in the
.I pid
field of the structure).
.IP (2)
The target terminates.
.IP (3)
Another thread or process is created on the system that by chance reuses the
TID that was freed when the target terminated.
.IP (4)
The supervisor
.MR open 2 s
the
.IR /proc/ tid /mem
file for the TID obtained in step 1, with the intention of (say)
inspecting the memory location(s) that containing the argument(s) of
the system call that triggered the notification in step 1.
.P
In the above scenario, the risk is that the supervisor may try
to access the memory of a process other than the target.
This race can be avoided by following the call to
.MR open 2
with a
.B SECCOMP_IOCTL_NOTIF_ID_VALID
operation to verify that the process that generated the notification
is still alive.
(Note that if the target terminates after the latter step,
a subsequent
.MR read 2
from the file descriptor may return 0, indicating end of file.)
.\" Jann Horn:
.\"     the PID can be reused, but the /proc/$pid directory is
.\"     internally not associated with the numeric PID, but,
.\"     conceptually speaking, with a specific incarnation of the
.\"     PID, or something like that.  (Actually, it is associated
.\"     with the "struct pid", which is not reused, instead of the
.\"     numeric PID.
.P
See NOTES for a discussion of other cases where
.B SECCOMP_IOCTL_NOTIF_ID_VALID
checks must be performed.
.P
On success (i.e., the notification ID is still valid),
this operation returns 0.
On failure (i.e., the notification ID is no longer valid),
\-1 is returned, and
.I errno
is set to
.BR ENOENT .
.\"
.SS SECCOMP_IOCTL_NOTIF_SEND
The
.B SECCOMP_IOCTL_NOTIF_SEND
operation (available since Linux 5.0)
is used to send a notification response back to the kernel.
The third
.MR ioctl 2
argument of this structure is a pointer to a structure of the following form:
.P
.in +4n
.EX
struct seccomp_notif_resp {
    __u64 id;           /* Cookie value */
    __s64 val;          /* Success return value */
    __s32 error;        /* 0 (success) or negative error number */
    __u32 flags;        /* See below */
};
.EE
.in
.P
The fields of this structure are as follows:
.TP
.I id
This is the cookie value that was obtained using the
.B SECCOMP_IOCTL_NOTIF_RECV
operation.
This cookie value allows the kernel to correctly associate this response
with the system call that triggered the user-space notification.
.TP
.I val
This is the value that will be used for a spoofed
success return for the target's system call; see below.
.TP
.I error
This is the value that will be used as the error number
.RI ( errno )
for a spoofed error return for the target's system call; see below.
.TP
.I flags
This is a bit mask that includes zero or more of the following flags:
.RS
.TP
.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE " (since Linux 5.5)"
Tell the kernel to execute the target's system call.
.\" commit fb3c5386b382d4097476ce9647260fc89b34afdb
.RE
.P
Two kinds of response are possible:
.IP \[bu] 3
A response to the kernel telling it to execute the
target's system call.
In this case, the
.I flags
field includes
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
and the
.I error
and
.I val
fields must be zero.
.IP
This kind of response can be useful in cases where the supervisor needs
to do deeper analysis of the target's system call than is possible
from a seccomp filter (e.g., examining the values of pointer arguments),
and, having decided that the system call does not require emulation
by the supervisor, the supervisor wants the system call to
be executed normally in the target.
.IP
The
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
flag should be used with caution; see NOTES.
.IP \[bu]
A spoofed return value for the target's system call.
In this case, the kernel does not execute the target's system call,
instead causing the system call to return a spoofed value as specified by
fields of the
.I seccomp_notif_resp
structure.
The supervisor should set the fields of this structure as follows:
.RS
.IP \[bu] 3
.I flags
does not contain
.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE .
.IP \[bu]
.I error
is set either to 0 for a spoofed "success" return or to a negative
error number for a spoofed "failure" return.
In the former case, the kernel causes the target's system call
to return the value specified in the
.I val
field.
In the latter case, the kernel causes the target's system call
to return \-1, and
.I errno
is assigned the negated
.I error
value.
.IP \[bu]
.I val
is set to a value that will be used as the return value for a spoofed
"success" return for the target's system call.
The value in this field is ignored if the
.I error
field contains a nonzero value.
.\" FIXME
.\" Kees Cook suggested:
.\"
.\" Strictly speaking, this is architecture specific, but
.\" all architectures do it this way. Should seccomp enforce
.\" val == 0 when err != 0 ?
.\"
.\" Christian Brauner
.\"
.\" Feels like it should, at least for the SEND ioctl where we already
.\" verify that val and err are both 0 when CONTINUE is specified (as you
.\" pointed out correctly above).
.RE
.P
On success, this operation returns 0; on failure, \-1 is returned, and
.I errno
is set to indicate the cause of the error.
This operation can fail with the following errors:
.TP
.B EINPROGRESS
A response to this notification has already been sent.
.TP
.B EINVAL
An invalid value was specified in the
.I flags field.
.TP
.B
.B EINVAL
The
.I flags
field contained
.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
and the
.I error
or
.I val
field was not zero.
.TP
.B ENOENT
The blocked system call in the target
has been interrupted by a signal handler
or the target has terminated.
.\" Jann Horn notes:
.\"     you could also get this [ENOENT] if a response has already
.\"     been sent, instead of EINPROGRESS - the only difference is
.\"     whether the target thread has picked up the response yet
.\"
.SS SECCOMP_IOCTL_NOTIF_ADDFD
The
.B SECCOMP_IOCTL_NOTIF_ADDFD
operation (available since Linux 5.9)
allows the supervisor to install a file descriptor
into the target's file descriptor table.
Much like the use of
.B SCM_RIGHTS
messages described in
.MR unix 7 ,
this operation is semantically equivalent to duplicating
a file descriptor from the supervisor's file descriptor table
into the target's file descriptor table.
.P
The
.B SECCOMP_IOCTL_NOTIF_ADDFD
operation permits the supervisor to emulate a target system call (such as
.MR socket 2
or
.MR openat 2 )
that generates a file descriptor.
The supervisor can perform the system call that generates
the file descriptor (and associated open file description)
and then use this operation to allocate
a file descriptor that refers to the same open file description in the target.
(For an explanation of open file descriptions, see
.MR open 2 .)
.P
Once this operation has been performed,
the supervisor can close its copy of the file descriptor.
.P
In the target,
the received file descriptor is subject to the same
Linux Security Module (LSM) checks as are applied to a file descriptor
that is received in an
.B SCM_RIGHTS
ancillary message.
If the file descriptor refers to a socket,
it inherits the cgroup version 1 network controller settings
.RI ( classid
and
.IR netprioidx )
of the target.
.P
The third
.MR ioctl 2
argument is a pointer to a structure of the following form:
.P
.in +4n
.EX
struct seccomp_notif_addfd {
    __u64 id;           /* Cookie value */
    __u32 flags;        /* Flags */
    __u32 srcfd;        /* Local file descriptor number */
    __u32 newfd;        /* 0 or desired file descriptor
                           number in target */
    __u32 newfd_flags;  /* Flags to set on target file
                           descriptor */
};
.EE
.in
.P
The fields in this structure are as follows:
.TP
.I id
This field should be set to the notification ID
(cookie value) that was obtained via
.BR SECCOMP_IOCTL_NOTIF_RECV .
.TP
.I flags
This field is a bit mask of flags that modify the behavior of the operation.
Currently, only one flag is supported:
.RS
.TP
.B SECCOMP_ADDFD_FLAG_SETFD
When allocating the file descriptor in the target,
use the file descriptor number specified in the
.I newfd
field.
.TP
.BR SECCOMP_ADDFD_FLAG_SEND " (since Linux 5.14)"
.\" commit 0ae71c7720e3ae3aabd2e8a072d27f7bd173d25c
Perform the equivalent of
.B SECCOMP_IOCTL_NOTIF_ADDFD
plus
.B SECCOMP_IOCTL_NOTIF_SEND
as an atomic operation.
On successful invocation, the target process's
.I errno
will be 0
and the return value will be the file descriptor number
that was allocated in the target.
If allocating the file descriptor in the target fails,
the target's system call continues to be blocked
until a successful response is sent.
.RE
.TP
.I srcfd
This field should be set to the number of the file descriptor
in the supervisor that is to be duplicated.
.TP
.I newfd
This field determines which file descriptor number is allocated in the target.
If the
.B SECCOMP_ADDFD_FLAG_SETFD
flag is set,
then this field specifies which file descriptor number should be allocated.
If this file descriptor number is already open in the target,
it is atomically closed and reused.
If the descriptor duplication fails due to an LSM check, or if
.I srcfd
is not a valid file descriptor,
the file descriptor
.I newfd
will not be closed in the target process.
.IP
If the
.B SECCOMP_ADDFD_FLAG_SETFD
flag it not set, then this field must be 0,
and the kernel allocates the lowest unused file descriptor number
in the target.
.TP
.I newfd_flags
This field is a bit mask specifying flags that should be set on
the file descriptor that is received in the target process.
Currently, only the following flag is implemented:
.RS
.TP
.B O_CLOEXEC
Set the close-on-exec flag on the received file descriptor.
.RE
.P
On success, this
.MR ioctl 2
call returns the number of the file descriptor that was allocated
in the target.
Assuming that the emulated system call is one that returns
a file descriptor as its function result (e.g.,
.MR socket 2 ),
this value can be used as the return value
.RI ( resp.val )
that is supplied in the response that is subsequently sent with the
.B SECCOMP_IOCTL_NOTIF_SEND
operation.
.P
On error, \-1 is returned and
.I errno
is set to indicate the cause of the error.
.P
This operation can fail with the following errors:
.TP
.B EBADF
Allocating the file descriptor in the target would cause the target's
.B RLIMIT_NOFILE
limit to be exceeded (see
.MR getrlimit 2 ).
.TP
.B EBUSY
If the flag
.B SECCOMP_IOCTL_NOTIF_SEND
is used, this means the operation can't proceed until other
.B SECCOMP_IOCTL_NOTIF_ADDFD
requests are processed.
.TP
.B EINPROGRESS
The user-space notification specified in the
.I id
field exists but has not yet been fetched (by a
.BR SECCOMP_IOCTL_NOTIF_RECV )
or has already been responded to (by a
.BR SECCOMP_IOCTL_NOTIF_SEND ).
.TP
.B EINVAL
An invalid flag was specified in the
.I flags
or
.I newfd_flags
field, or the
.I newfd
field is nonzero and the
.B SECCOMP_ADDFD_FLAG_SETFD
flag was not specified in the
.I flags
field.
.TP
.B EMFILE
The file descriptor number specified in
.I newfd
exceeds the limit specified in
.IR /proc/sys/fs/nr_open .
.TP
.B ENOENT
The blocked system call in the target
has been interrupted by a signal handler
or the target has terminated.
.P
Here is some sample code (with error handling omitted) that uses the
.B SECCOMP_ADDFD_FLAG_SETFD
operation (here, to emulate a call to
.MR openat 2 ):
.P
.EX
.in +4n
int fd, removeFd;
\&
fd = openat(req\->data.args[0], path, req\->data.args[2],
                req\->data.args[3]);
\&
struct seccomp_notif_addfd addfd;
addfd.id = req\->id; /* Cookie from SECCOMP_IOCTL_NOTIF_RECV */
addfd.srcfd = fd;
addfd.newfd = 0;
addfd.flags = 0;
addfd.newfd_flags = O_CLOEXEC;
\&
targetFd = ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
\&
close(fd);          /* No longer needed in supervisor */
\&
struct seccomp_notif_resp *resp;
    /* Code to allocate 'resp' omitted */
resp\->id = req\->id;
resp\->error = 0;        /* "Success" */
resp\->val = targetFd;
resp\->flags = 0;
ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp);
.in
.EE
.SH NOTES
One example use case for the user-space notification
mechanism is to allow a container manager
(a process which is typically running with more privilege than
the processes inside the container)
to mount block devices or create device nodes for the container.
The mount use case provides an example of where the
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
.MR ioctl 2
operation is useful.
Upon receiving a notification for the
.MR mount 2
system call, the container manager (the "supervisor") can distinguish
a request to mount a block filesystem
(which would not be possible for a "target" process inside the container)
and mount that file system.
If, on the other hand, the container manager detects that the operation
could be performed by the process inside the container
(e.g., a mount of a
.MR tmpfs 5
filesystem), it can notify the kernel that the target process's
.MR mount 2
system call can continue.
.\"
.SS select()/poll()/epoll semantics
The file descriptor returned when
.MR seccomp 2
is employed with the
.B SECCOMP_FILTER_FLAG_NEW_LISTENER
flag can be monitored using
.MR poll 2 ,
.MR epoll 7 ,
and
.MR select 2 .
These interfaces indicate that the file descriptor is ready as follows:
.IP \[bu] 3
When a notification is pending,
these interfaces indicate that the file descriptor is readable.
Following such an indication, a subsequent
.B SECCOMP_IOCTL_NOTIF_RECV
.MR ioctl 2
will not block, returning either information about a notification
or else failing with the error
.B EINTR
if the target has been killed by a signal or its system call
has been interrupted by a signal handler.
.IP \[bu]
After the notification has been received (i.e., by the
.B SECCOMP_IOCTL_NOTIF_RECV
.MR ioctl 2
operation), these interfaces indicate that the file descriptor is writable,
meaning that a notification response can be sent using the
.B SECCOMP_IOCTL_NOTIF_SEND
.MR ioctl 2
operation.
.IP \[bu]
After the last thread using the filter has terminated and been reaped using
.MR waitpid 2
(or similar),
the file descriptor indicates an end-of-file condition (readable in
.MR select 2 ;
.BR POLLHUP / EPOLLHUP
in
.MR poll 2 /
.MR epoll_wait 2 ).
.SS Design goals; use of SECCOMP_USER_NOTIF_FLAG_CONTINUE
The intent of the user-space notification feature is
to allow system calls to be performed on behalf of the target.
The target's system call should either be handled by the supervisor or
allowed to continue normally in the kernel (where standard security
policies will be applied).
.P
.BR "Note well" :
this mechanism must not be used to make security policy decisions
about the system call,
which would be inherently race-prone for reasons described next.
.P
The
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
flag must be used with caution.
If set by the supervisor, the target's system call will continue.
However, there is a time-of-check, time-of-use race here,
since an attacker could exploit the interval of time where the target is
blocked waiting on the "continue" response to do things such as
rewriting the system call arguments.
.P
Note furthermore that a user-space notifier can be bypassed if
the existing filters allow the use of
.MR seccomp 2
or
.MR prctl 2
to install a filter that returns an action value with a higher precedence than
.B SECCOMP_RET_USER_NOTIF
(see
.MR seccomp 2 ).
.P
It should thus be absolutely clear that the
seccomp user-space notification mechanism
.B can not
be used to implement a security policy!
It should only ever be used in scenarios where a more privileged process
supervises the system calls of a lesser privileged target to
get around kernel-enforced security restrictions when
the supervisor deems this safe.
In other words,
in order to continue a system call, the supervisor should be sure that
another security mechanism or the kernel itself will sufficiently block
the system call if its arguments are rewritten to something unsafe.
.\"
.SS Caveats regarding the use of \fI/proc/\fPtid\fI/mem\fP
The discussion above noted the need to use the
.B SECCOMP_IOCTL_NOTIF_ID_VALID
.MR ioctl 2
when opening the
.IR /proc/ tid /mem
file of the target
to avoid the possibility of accessing the memory of the wrong process
in the event that the target terminates and its ID
is recycled by another (unrelated) thread.
However, the use of this
.MR ioctl 2
operation is also necessary in other situations,
as explained in the following paragraphs.
.P
Consider the following scenario, where the supervisor
tries to read the pathname argument of a target's blocked
.MR mount 2
system call:
.IP (1) 5
From one of its functions
.RI ( func() ),
the target calls
.MR mount 2 ,
which triggers a user-space notification and causes the target to block.
.IP (2)
The supervisor receives the notification, opens
.IR /proc/ tid /mem ,
and (successfully) performs the
.B SECCOMP_IOCTL_NOTIF_ID_VALID
check.
.IP (3)
The target receives a signal, which causes the
.MR mount 2
to abort.
.IP (4)
The signal handler executes in the target, and returns.
.IP (5)
Upon return from the handler, the execution of
.I func()
resumes, and it returns (and perhaps other functions are called,
overwriting the memory that had been used for the stack frame of
.IR func() ).
.IP (6)
Using the address provided in the notification information,
the supervisor reads from the target's memory location that used to
contain the pathname.
.IP (7)
The supervisor now calls
.MR mount 2
with some arbitrary bytes obtained in the previous step.
.P
The conclusion from the above scenario is this:
since the target's blocked system call may be interrupted by a signal handler,
the supervisor must be written to expect that the
target may abandon its system call at
.B any
time;
in such an event, any information that the supervisor obtained from
the target's memory must be considered invalid.
.P
To prevent such scenarios,
every read from the target's memory must be separated from use of
the bytes so obtained by a
.B SECCOMP_IOCTL_NOTIF_ID_VALID
check.
In the above example, the check would be placed between the two final steps.
An example of such a check is shown in EXAMPLES.
.P
Following on from the above, it should be clear that
a write by the supervisor into the target's memory can
.B never
be considered safe.
.\"
.SS Caveats regarding blocking system calls
Suppose that the target performs a blocking system call (e.g.,
.MR accept 2 )
that the supervisor should handle.
The supervisor might then in turn execute the same blocking system call.
.P
In this scenario,
it is important to note that if the target's system call is now
interrupted by a signal, the supervisor is
.I not
informed of this.
If the supervisor does not take suitable steps to
actively discover that the target's system call has been canceled,
various difficulties can occur.
Taking the example of
.MR accept 2 ,
the supervisor might remain blocked in its
.MR accept 2
holding a port number that the target
(which, after the interruption by the signal handler,
perhaps closed  its listening socket) might expect to be able to reuse in a
.MR bind 2
call.
.P
Therefore, when the supervisor wishes to emulate a blocking system call,
it must do so in such a way that it gets informed if the target's
system call is interrupted by a signal handler.
For example, if the supervisor itself executes the same
blocking system call, then it could employ a separate thread
that uses the
.B SECCOMP_IOCTL_NOTIF_ID_VALID
operation to check if the target is still blocked in its system call.
Alternatively, in the
.MR accept 2
example, the supervisor might use
.MR poll 2
to monitor both the notification file descriptor
(so as to discover when the target's
.MR accept 2
call has been interrupted) and the listening file descriptor
(so as to know when a connection is available).
.P
If the target's system call is interrupted,
the supervisor must take care to release resources (e.g., file descriptors)
that it acquired on behalf of the target.
.\"
.SS Interaction with SA_RESTART signal handlers
Consider the following scenario:
.IP (1) 5
The target process has used
.MR sigaction 2
to install a signal handler with the
.B SA_RESTART
flag.
.IP (2)
The target has made a system call that triggered a seccomp
user-space notification and the target is currently blocked
until the supervisor sends a notification response.
.IP (3)
A signal is delivered to the target and the signal handler is executed.
.IP (4)
When (if) the supervisor attempts to send a notification response, the
.B SECCOMP_IOCTL_NOTIF_SEND
.MR ioctl 2 )
operation will fail with the
.B ENOENT
error.
.P
In this scenario, the kernel will restart the target's system call.
Consequently, the supervisor will receive another user-space notification.
Thus, depending on how many times the blocked system call
is interrupted by a signal handler,
the supervisor may receive multiple notifications for
the same instance of a system call in the target.
.P
One oddity is that system call restarting as described in this scenario
will occur even for the blocking system calls listed in
.MR signal 7
that would
.B never
normally be restarted by the
.B SA_RESTART
flag.
.\" FIXME
.\" About the above, Kees Cook commented:
.\"
.\" Does this need fixing? I imagine the correct behavior for this case
.\" would be a response to _SEND of EINPROGRESS and the target would see
.\" EINTR normally?
.\"
.\" I mean, it's not like seccomp doesn't already expose weirdness with
.\" syscall restarts. Not even arm64 compat agrees[3] with arm32 in this
.\" regard. :(
.
.\" FIXME
.\" Michael Kerrisk:
.\" I wonder about the effect of this oddity for system calls that
.\" are normally nonrestartable because they have timeouts. My
.\" understanding is that the kernel doesn't restart those system
.\" calls because it's impossible for the kernel to restart the call
.\" with the right timeout value. I wonder what happens when those
.\" system calls are restarted in the scenario we're discussing.)
.P
Furthermore, if the supervisor response is a file descriptor
added with
.BR SECCOMP_IOCTL_NOTIF_ADDFD ,
then the flag
.B SECCOMP_ADDFD_FLAG_SEND
can be used to atomically add the file descriptor and return that value,
making sure no file descriptors are inadvertently leaked into the target.
.SH BUGS
If a
.B SECCOMP_IOCTL_NOTIF_RECV
.MR ioctl 2
operation
.\" or a poll/epoll/select
is performed after the target terminates, then the
.MR ioctl 2
call simply blocks (rather than returning an error to indicate that the
target no longer exists).
.\" FIXME
.\" Comment from Kees Cook:
.\"
.\" I want this fixed. It caused me no end of pain when building the
.\" selftests, and ended up spawning my implementing a global test timeout
.\" in kselftest. :P Before the usage counter refactor, there was no sane
.\" way to deal with this, but now I think we're close.
.\"
.SH EXAMPLES
The (somewhat contrived) program shown below demonstrates the use of
the interfaces described in this page.
The program creates a child process that serves as the "target" process.
The child process installs a seccomp filter that returns the
.B SECCOMP_RET_USER_NOTIF
action value if a call is made to
.MR mkdir 2 .
The child process then calls
.MR mkdir 2
once for each of the supplied command-line arguments,
and reports the result returned by the call.
After processing all arguments, the child process terminates.
.P
The parent process acts as the supervisor, listening for the notifications
that are generated when the target process calls
.MR mkdir 2 .
When such a notification occurs,
the supervisor examines the memory of the target process (using
.IR /proc/ pid /mem )
to discover the pathname argument that was supplied to the
.MR mkdir 2
call, and performs one of the following actions:
.IP \[bu] 3
If the pathname begins with the prefix "/tmp/",
then the supervisor attempts to create the specified directory,
and then spoofs a return for the target process based on the return
value of the supervisor's
.MR mkdir 2
call.
In the event that that call succeeds,
the spoofed success return value is the length of the pathname.
.IP \[bu]
If the pathname begins with "./" (i.e., it is a relative pathname),
the supervisor sends a
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
response to the kernel to say that the kernel should execute
the target process's
.MR mkdir 2
call.
.IP \[bu]
If the pathname begins with some other prefix,
the supervisor spoofs an error return for the target process,
so that the target process's
.MR mkdir 2
call appears to fail with the error
.B EOPNOTSUPP
("Operation not supported").
Additionally, if the specified pathname is exactly "/bye",
then the supervisor terminates.
.P
This program can be used to demonstrate various aspects of the
behavior of the seccomp user-space notification mechanism.
To help aid such demonstrations,
the program logs various messages to show the operation
of the target process (lines prefixed "T:") and the supervisor
(indented lines prefixed "S:").
.P
In the following example, the target attempts to create the directory
.IR /tmp/x .
Upon receiving the notification, the supervisor creates the directory on the
target's behalf,
and spoofs a success return to be received by the target process's
.MR mkdir 2
call.
.P
.in +4n
.EX
$ \fB./seccomp_unotify /tmp/x\fP
T: PID = 23168
\&
T: about to mkdir("/tmp/x")
        S: got notification (ID 0x17445c4a0f4e0e3c) for PID 23168
        S: executing: mkdir("/tmp/x", 0700)
        S: success! spoofed return = 6
        S: sending response (flags = 0; val = 6; error = 0)
T: SUCCESS: mkdir(2) returned 6
\&
T: terminating
        S: target has terminated; bye
.EE
.in
.P
In the above output, note that the spoofed return value seen by the target
process is 6 (the length of the pathname
.IR /tmp/x ),
whereas a normal
.MR mkdir 2
call returns 0 on success.
.P
In the next example, the target attempts to create a directory using the
relative pathname
.IR ./sub .
Since this pathname starts with "./",
the supervisor sends a
.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
response to the kernel,
and the kernel then (successfully) executes the target process's
.MR mkdir 2
call.
.P
.in +4n
.EX
$ \fB./seccomp_unotify ./sub\fP
T: PID = 23204
\&
T: about to mkdir("./sub")
        S: got notification (ID 0xddb16abe25b4c12) for PID 23204
        S: target can execute system call
        S: sending response (flags = 0x1; val = 0; error = 0)
T: SUCCESS: mkdir(2) returned 0
\&
T: terminating
        S: target has terminated; bye
.EE
.in
.P
If the target process attempts to create a directory with
a pathname that doesn't start with "." and doesn't begin with the prefix
"/tmp/", then the supervisor spoofs an error return
.RB ( EOPNOTSUPP ,
"Operation not  supported")
for the target's
.MR mkdir 2
call (which is not executed):
.P
.in +4n
.EX
$ \fB./seccomp_unotify /xxx\fP
T: PID = 23178
\&
T: about to mkdir("/xxx")
        S: got notification (ID 0xe7dc095d1c524e80) for PID 23178
        S: spoofing error response (Operation not supported)
        S: sending response (flags = 0; val = 0; error = \-95)
T: ERROR: mkdir(2): Operation not supported
\&
T: terminating
        S: target has terminated; bye
.EE
.in
.P
In the next example,
the target process attempts to create a directory with the pathname
.BR /tmp/nosuchdir/b .
Upon receiving the notification,
the supervisor attempts to create that directory, but the
.MR mkdir 2
call fails because the directory
.B /tmp/nosuchdir
does not exist.
Consequently, the supervisor spoofs an error return that passes the error
that it received back to the target process's
.MR mkdir 2
call.
.P
.in +4n
.EX
$ \fB./seccomp_unotify /tmp/nosuchdir/b\fP
T: PID = 23199
\&
T: about to mkdir("/tmp/nosuchdir/b")
        S: got notification (ID 0x8744454293506046) for PID 23199
        S: executing: mkdir("/tmp/nosuchdir/b", 0700)
        S: failure! (errno = 2; No such file or directory)
        S: sending response (flags = 0; val = 0; error = \-2)
T: ERROR: mkdir(2): No such file or directory
\&
T: terminating
        S: target has terminated; bye
.EE
.in
.P
If the supervisor receives a notification and sees that the
argument of the target's
.MR mkdir 2
is the string "/bye", then (as well as spoofing an
.B EOPNOTSUPP
error), the supervisor terminates.
If the target process subsequently executes another
.MR mkdir 2
that triggers its seccomp filter to return the
.B SECCOMP_RET_USER_NOTIF
action value, then the kernel causes the target process's system call to
fail with the error
.B ENOSYS
("Function not implemented").
This is demonstrated by the following example:
.P
.in +4n
.EX
$ \fB./seccomp_unotify /bye /tmp/y\fP
T: PID = 23185
\&
T: about to mkdir("/bye")
        S: got notification (ID 0xa81236b1d2f7b0f4) for PID 23185
        S: spoofing error response (Operation not supported)
        S: sending response (flags = 0; val = 0; error = \-95)
        S: terminating **********
T: ERROR: mkdir(2): Operation not supported
\&
T: about to mkdir("/tmp/y")
T: ERROR: mkdir(2): Function not implemented
\&
T: terminating
.EE
.in
.\"
.SS Program source
.\" SRC BEGIN (seccomp_unotify.c)
.EX
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <signal.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>
\&
#define ARRAY_SIZE(arr)  (sizeof(arr) / sizeof((arr)[0]))
\&
/* Send the file descriptor \[aq]fd\[aq] over the connected UNIX domain socket
   \[aq]sockfd\[aq]. Returns 0 on success, or \-1 on error. */
\&
static int
sendfd(int sockfd, int fd)
{
    int             data;
    struct iovec    iov;
    struct msghdr   msgh;
    struct cmsghdr  *cmsgp;
\&
    /* Allocate a char array of suitable size to hold the ancillary data.
       However, since this buffer is in reality a \[aq]struct cmsghdr\[aq], use a
       union to ensure that it is suitably aligned. */
    union {
        char   buf[CMSG_SPACE(sizeof(int))];
                        /* Space large enough to hold an \[aq]int\[aq] */
        struct cmsghdr align;
    } controlMsg;
\&
    /* The \[aq]msg_name\[aq] field can be used to specify the address of the
       destination socket when sending a datagram. However, we do not
       need to use this field because \[aq]sockfd\[aq] is a connected socket. */
\&
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
\&
    /* On Linux, we must transmit at least one byte of real data in
       order to send ancillary data. We transmit an arbitrary integer
       whose value is ignored by recvfd(). */
\&
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    iov.iov_base = &data;
    iov.iov_len = sizeof(int);
    data = 12345;
\&
    /* Set \[aq]msghdr\[aq] fields that describe ancillary data */
\&
    msgh.msg_control = controlMsg.buf;
    msgh.msg_controllen = sizeof(controlMsg.buf);
\&
    /* Set up ancillary data describing file descriptor to send */
\&
    cmsgp = CMSG_FIRSTHDR(&msgh);
    cmsgp\->cmsg_level = SOL_SOCKET;
    cmsgp\->cmsg_type = SCM_RIGHTS;
    cmsgp\->cmsg_len = CMSG_LEN(sizeof(int));
    memcpy(CMSG_DATA(cmsgp), &fd, sizeof(int));
\&
    /* Send real plus ancillary data */
\&
    if (sendmsg(sockfd, &msgh, 0) == \-1)
        return \-1;
\&
    return 0;
}
\&
/* Receive a file descriptor on a connected UNIX domain socket. Returns
   the received file descriptor on success, or \-1 on error. */
\&
static int
recvfd(int sockfd)
{
    int            data, fd;
    ssize_t        nr;
    struct iovec   iov;
    struct msghdr  msgh;
\&
    /* Allocate a char buffer for the ancillary data. See the comments
       in sendfd() */
    union {
        char   buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;
    } controlMsg;
    struct cmsghdr *cmsgp;
\&
    /* The \[aq]msg_name\[aq] field can be used to obtain the address of the
       sending socket. However, we do not need this information. */
\&
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
\&
    /* Specify buffer for receiving real data */
\&
    msgh.msg_iov = &iov;
    msgh.msg_iovlen = 1;
    iov.iov_base = &data;       /* Real data is an \[aq]int\[aq] */
    iov.iov_len = sizeof(int);
\&
    /* Set \[aq]msghdr\[aq] fields that describe ancillary data */
\&
    msgh.msg_control = controlMsg.buf;
    msgh.msg_controllen = sizeof(controlMsg.buf);
\&
    /* Receive real plus ancillary data; real data is ignored */
\&
    nr = recvmsg(sockfd, &msgh, 0);
    if (nr == \-1)
        return \-1;
\&
    cmsgp = CMSG_FIRSTHDR(&msgh);
\&
    /* Check the validity of the \[aq]cmsghdr\[aq] */
\&
    if (cmsgp == NULL
        || cmsgp\->cmsg_len != CMSG_LEN(sizeof(int))
        || cmsgp\->cmsg_level != SOL_SOCKET
        || cmsgp\->cmsg_type != SCM_RIGHTS)
    {
        errno = EINVAL;
        return \-1;
    }
\&
    /* Return the received file descriptor to our caller */
\&
    memcpy(&fd, CMSG_DATA(cmsgp), sizeof(int));
    return fd;
}
\&
static void
sigchldHandler(int sig)
{
    char msg[] = "\etS: target has terminated; bye\en";
\&
    write(STDOUT_FILENO, msg, sizeof(msg) \- 1);
    _exit(EXIT_SUCCESS);
}
\&
static int
seccomp(unsigned int operation, unsigned int flags, void *args)
{
    return syscall(SYS_seccomp, operation, flags, args);
}
\&
/* The following is the x86\-64\-specific BPF boilerplate code for checking
   that the BPF program is running on the right architecture + ABI. At
   completion of these instructions, the accumulator contains the system
   call number. */
\&
/* For the x32 ABI, all system call numbers have bit 30 set */
\&
#define X32_SYSCALL_BIT         0x40000000
\&
#define X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR \e
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
                 (offsetof(struct seccomp_data, arch))), \e
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 2), \e
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
                 (offsetof(struct seccomp_data, nr))), \e
        BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1), \e
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)
\&
/* installNotifyFilter() installs a seccomp filter that generates
   user\-space notifications (SECCOMP_RET_USER_NOTIF) when the process
   calls mkdir(2); the filter allows all other system calls.
\&
   The function return value is a file descriptor from which the
   user\-space notifications can be fetched. */
\&
static int
installNotifyFilter(void)
{
    int notifyFd;
\&
    struct sock_filter filter[] = {
        X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR,
\&
        /* mkdir() triggers notification to user\-space supervisor */
\&
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_mkdir, 0, 1),
        BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
\&
        /* Every other system call is allowed */
\&
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    };
\&
    struct sock_fprog prog = {
        .len = ARRAY_SIZE(filter),
        .filter = filter,
    };
\&
    /* Install the filter with the SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
       as a result, seccomp() returns a notification file descriptor. */
\&
    notifyFd = seccomp(SECCOMP_SET_MODE_FILTER,
                       SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
    if (notifyFd == \-1)
        err(EXIT_FAILURE, "seccomp\-install\-notify\-filter");
\&
    return notifyFd;
}
\&
/* Close a pair of sockets created by socketpair() */
\&
static void
closeSocketPair(int sockPair[2])
{
    if (close(sockPair[0]) == \-1)
        err(EXIT_FAILURE, "closeSocketPair\-close\-0");
    if (close(sockPair[1]) == \-1)
        err(EXIT_FAILURE, "closeSocketPair\-close\-1");
}
\&
/* Implementation of the target process; create a child process that:
\&
   (1) installs a seccomp filter with the
       SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
   (2) writes the seccomp notification file descriptor returned from
       the previous step onto the UNIX domain socket, \[aq]sockPair[0]\[aq];
   (3) calls mkdir(2) for each element of \[aq]argv\[aq].
\&
   The function return value in the parent is the PID of the child
   process; the child does not return from this function. */
\&
static pid_t
targetProcess(int sockPair[2], char *argv[])
{
    int    notifyFd, s;
    pid_t  targetPid;
\&
    targetPid = fork();
\&
    if (targetPid == \-1)
        err(EXIT_FAILURE, "fork");
\&
    if (targetPid > 0)          /* In parent, return PID of child */
        return targetPid;
\&
    /* Child falls through to here */
\&
    printf("T: PID = %ld\en", (long) getpid());
\&
    /* Install seccomp filter(s) */
\&
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
        err(EXIT_FAILURE, "prctl");
\&
    notifyFd = installNotifyFilter();
\&
    /* Pass the notification file descriptor to the tracing process over
       a UNIX domain socket */
\&
    if (sendfd(sockPair[0], notifyFd) == \-1)
        err(EXIT_FAILURE, "sendfd");
\&
    /* Notification and socket FDs are no longer needed in target */
\&
    if (close(notifyFd) == \-1)
        err(EXIT_FAILURE, "close\-target\-notify\-fd");
\&
    closeSocketPair(sockPair);
\&
    /* Perform a mkdir() call for each of the command\-line arguments */
\&
    for (char **ap = argv; *ap != NULL; ap++) {
        printf("\enT: about to mkdir(\e"%s\e")\en", *ap);
\&
        s = mkdir(*ap, 0700);
        if (s == \-1)
            perror("T: ERROR: mkdir(2)");
        else
            printf("T: SUCCESS: mkdir(2) returned %d\en", s);
    }
\&
    printf("\enT: terminating\en");
    exit(EXIT_SUCCESS);
}
\&
/* Check that the notification ID provided by a SECCOMP_IOCTL_NOTIF_RECV
   operation is still valid. It will no longer be valid if the target
   process has terminated or is no longer blocked in the system call that
   generated the notification (because it was interrupted by a signal).
\&
   This operation can be used when doing such things as accessing
   /proc/PID files in the target process in order to avoid TOCTOU race
   conditions where the PID that is returned by SECCOMP_IOCTL_NOTIF_RECV
   terminates and is reused by another process. */
\&
static bool
cookieIsValid(int notifyFd, uint64_t id)
{
    return ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id) == 0;
}
\&
/* Access the memory of the target process in order to fetch the
   pathname referred to by the system call argument \[aq]argNum\[aq] in
   \[aq]req\->data.args[]\[aq].  The pathname is returned in \[aq]path\[aq],
   a buffer of \[aq]len\[aq] bytes allocated by the caller.
\&
   Returns true if the pathname is successfully fetched, and false
   otherwise. For possible causes of failure, see the comments below. */
\&
static bool
getTargetPathname(struct seccomp_notif *req, int notifyFd,
                  int argNum, char *path, size_t len)
{
    int      procMemFd;
    char     procMemPath[PATH_MAX];
    ssize_t  nread;
\&
    snprintf(procMemPath, sizeof(procMemPath), "/proc/%d/mem", req\->pid);
\&
    procMemFd = open(procMemPath, O_RDONLY | O_CLOEXEC);
    if (procMemFd == \-1)
        return false;
\&
    /* Check that the process whose info we are accessing is still alive
       and blocked in the system call that caused the notification.
       If the SECCOMP_IOCTL_NOTIF_ID_VALID operation (performed in
       cookieIsValid()) succeeded, we know that the /proc/PID/mem file
       descriptor that we opened corresponded to the process for which we
       received a notification. If that process subsequently terminates,
       then read() on that file descriptor will return 0 (EOF). */
\&
    if (!cookieIsValid(notifyFd, req\->id)) {
        close(procMemFd);
        return false;
    }
\&
    /* Read bytes at the location containing the pathname argument */
\&
    nread = pread(procMemFd, path, len, req\->data.args[argNum]);
\&
    close(procMemFd);
\&
    if (nread <= 0)
        return false;
\&
    /* Once again check that the notification ID is still valid. The
       case we are particularly concerned about here is that just
       before we fetched the pathname, the target\[aq]s blocked system
       call was interrupted by a signal handler, and after the handler
       returned, the target carried on execution (past the interrupted
       system call). In that case, we have no guarantees about what we
       are reading, since the target\[aq]s memory may have been arbitrarily
       changed by subsequent operations. */
\&
    if (!cookieIsValid(notifyFd, req\->id)) {
        perror("\etS: notification ID check failed!!!");
        return false;
    }
\&
    /* Even if the target\[aq]s system call was not interrupted by a signal,
       we have no guarantees about what was in the memory of the target
       process. (The memory may have been modified by another thread, or
       even by an external attacking process.) We therefore treat the
       buffer returned by pread() as untrusted input. The buffer should
       contain a terminating null byte; if not, then we will trigger an
       error for the target process. */
\&
    if (strnlen(path, nread) < nread)
        return true;
\&
    return false;
}
\&
/* Allocate buffers for the seccomp user\-space notification request and
   response structures. It is the caller\[aq]s responsibility to free the
   buffers returned via \[aq]req\[aq] and \[aq]resp\[aq]. */
\&
static void
allocSeccompNotifBuffers(struct seccomp_notif **req,
                         struct seccomp_notif_resp **resp,
                         struct seccomp_notif_sizes *sizes)
{
    size_t  resp_size;
\&
    /* Discover the sizes of the structures that are used to receive
       notifications and send notification responses, and allocate
       buffers of those sizes. */
\&
    if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, sizes) == \-1)
        err(EXIT_FAILURE, "seccomp\-SECCOMP_GET_NOTIF_SIZES");
\&
    *req = malloc(sizes\->seccomp_notif);
    if (*req == NULL)
        err(EXIT_FAILURE, "malloc\-seccomp_notif");
\&
    /* When allocating the response buffer, we must allow for the fact
       that the user\-space binary may have been built with user\-space
       headers where \[aq]struct seccomp_notif_resp\[aq] is bigger than the
       response buffer expected by the (older) kernel. Therefore, we
       allocate a buffer that is the maximum of the two sizes. This
       ensures that if the supervisor places bytes into the response
       structure that are past the response size that the kernel expects,
       then the supervisor is not touching an invalid memory location. */
\&
    resp_size = sizes\->seccomp_notif_resp;
    if (sizeof(struct seccomp_notif_resp) > resp_size)
        resp_size = sizeof(struct seccomp_notif_resp);
\&
    *resp = malloc(resp_size);
    if (*resp == NULL)
        err(EXIT_FAILURE, "malloc\-seccomp_notif_resp");
\&
}
\&
/* Handle notifications that arrive via the SECCOMP_RET_USER_NOTIF file
   descriptor, \[aq]notifyFd\[aq]. */
\&
static void
handleNotifications(int notifyFd)
{
    bool                        pathOK;
    char                        path[PATH_MAX];
    struct seccomp_notif        *req;
    struct seccomp_notif_resp   *resp;
    struct seccomp_notif_sizes  sizes;
\&
    allocSeccompNotifBuffers(&req, &resp, &sizes);
\&
    /* Loop handling notifications */
\&
    for (;;) {
\&
        /* Wait for next notification, returning info in \[aq]*req\[aq] */
\&
        memset(req, 0, sizes.seccomp_notif);
        if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_RECV, req) == \-1) {
            if (errno == EINTR)
                continue;
            err(EXIT_FAILURE, "\etS: ioctl\-SECCOMP_IOCTL_NOTIF_RECV");
        }
\&
        printf("\etS: got notification (ID %#llx) for PID %d\en",
               req\->id, req\->pid);
\&
        /* The only system call that can generate a notification event
           is mkdir(2). Nevertheless, we check that the notified system
           call is indeed mkdir() as kind of future\-proofing of this
           code in case the seccomp filter is later modified to
           generate notifications for other system calls. */
\&
        if (req\->data.nr != SYS_mkdir) {
            printf("\etS: notification contained unexpected "
                   "system call number; bye!!!\en");
            exit(EXIT_FAILURE);
        }
\&
        pathOK = getTargetPathname(req, notifyFd, 0, path, sizeof(path));
\&
        /* Prepopulate some fields of the response */
\&
        resp\->id = req\->id;     /* Response includes notification ID */
        resp\->flags = 0;
        resp\->val = 0;
\&
        /* If getTargetPathname() failed, trigger an EINVAL error
           response (sending this response may yield an error if the
           failure occurred because the notification ID was no longer
           valid); if the directory is in /tmp, then create it on behalf
           of the supervisor; if the pathname starts with \[aq].\[aq], tell the
           kernel to let the target process execute the mkdir();
           otherwise, give an error for a directory pathname in any other
           location. */
\&
        if (!pathOK) {
            resp\->error = \-EINVAL;
            printf("\etS: spoofing error for invalid pathname (%s)\en",
                   strerror(\-resp\->error));
        } else if (strncmp(path, "/tmp/", strlen("/tmp/")) == 0) {
            printf("\etS: executing: mkdir(\e"%s\e", %#llo)\en",
                   path, req\->data.args[1]);
\&
            if (mkdir(path, req\->data.args[1]) == 0) {
                resp\->error = 0;            /* "Success" */
                resp\->val = strlen(path);   /* Used as return value of
                                               mkdir() in target */
                printf("\etS: success! spoofed return = %lld\en",
                       resp\->val);
            } else {
\&
                /* If mkdir() failed in the supervisor, pass the error
                   back to the target */
\&
                resp\->error = \-errno;
                printf("\etS: failure! (errno = %d; %s)\en", errno,
                       strerror(errno));
            }
        } else if (strncmp(path, "./", strlen("./")) == 0) {
            resp\->error = resp\->val = 0;
            resp\->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
            printf("\etS: target can execute system call\en");
        } else {
            resp\->error = \-EOPNOTSUPP;
            printf("\etS: spoofing error response (%s)\en",
                   strerror(\-resp\->error));
        }
\&
        /* Send a response to the notification */
\&
        printf("\etS: sending response "
               "(flags = %#x; val = %lld; error = %d)\en",
               resp\->flags, resp\->val, resp\->error);
\&
        if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp) == \-1) {
            if (errno == ENOENT)
                printf("\etS: response failed with ENOENT; "
                       "perhaps target process\[aq]s syscall was "
                       "interrupted by a signal?\en");
            else
                perror("ioctl\-SECCOMP_IOCTL_NOTIF_SEND");
        }
\&
        /* If the pathname is just "/bye", then the supervisor breaks out
           of the loop and terminates. This allows us to see what happens
           if the target process makes further calls to mkdir(2). */
\&
        if (strcmp(path, "/bye") == 0)
            break;
    }
\&
    free(req);
    free(resp);
    printf("\etS: terminating **********\en");
    exit(EXIT_FAILURE);
}
\&
/* Implementation of the supervisor process:
\&
   (1) obtains the notification file descriptor from \[aq]sockPair[1]\[aq]
   (2) handles notifications that arrive on that file descriptor. */
\&
static void
supervisor(int sockPair[2])
{
    int notifyFd;
\&
    notifyFd = recvfd(sockPair[1]);
\&
    if (notifyFd == \-1)
        err(EXIT_FAILURE, "recvfd");
\&
    closeSocketPair(sockPair);  /* We no longer need the socket pair */
\&
    handleNotifications(notifyFd);
}
\&
int
main(int argc, char *argv[])
{
    int               sockPair[2];
    struct sigaction  sa;
\&
    setbuf(stdout, NULL);
\&
    if (argc < 2) {
        fprintf(stderr, "At least one pathname argument is required\en");
        exit(EXIT_FAILURE);
    }
\&
    /* Create a UNIX domain socket that is used to pass the seccomp
       notification file descriptor from the target process to the
       supervisor process. */
\&
    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockPair) == \-1)
        err(EXIT_FAILURE, "socketpair");
\&
    /* Create a child process\-\-the "target"\-\-that installs seccomp
       filtering. The target process writes the seccomp notification
       file descriptor onto \[aq]sockPair[0]\[aq] and then calls mkdir(2) for
       each directory in the command\-line arguments. */
\&
    (void) targetProcess(sockPair, &argv[optind]);
\&
    /* Catch SIGCHLD when the target terminates, so that the
       supervisor can also terminate. */
\&
    sa.sa_handler = sigchldHandler;
    sa.sa_flags = 0;
    sigemptyset(&sa.sa_mask);
    if (sigaction(SIGCHLD, &sa, NULL) == \-1)
        err(EXIT_FAILURE, "sigaction");
\&
    supervisor(sockPair);
\&
    exit(EXIT_SUCCESS);
}
.EE
.\" SRC END
.SH SEE ALSO
.MR ioctl 2 ,
.MR pidfd_getfd 2 ,
.MR pidfd_open 2 ,
.MR seccomp 2
.P
A further example program can be found in the kernel source file
.IR samples/seccomp/user-trap.c .